In the following, we will fit random forest models to the first New York data set using the tidymodels framework. The goal is to predict, for each species in the data set, the temporal change between the two atlas replications, measured as Jaccard dissimilarity and as the log ratio of AOO (area of occupancy) values of the species' distribution. We use ranger random forest models with Bayesian hyperparameter tuning. The models are evaluated using 10-fold cross-validation repeated three times. We will also explore variable importance and partial dependencies, as well as interactions between predictors.
Code
# Clear the global environment and trigger garbage collection
# (printed table below shows memory use after collection)
rm(list = ls())
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 595704 31.9 1358800 72.6 686422 36.7
Vcells 1091028 8.4 8388608 64.0 1877384 14.4
Code
# Install the tidymodels package if it is not already installed
# (original comment mentioned the wrong package and the call installed
# unconditionally on every render; guard with requireNamespace())
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels", repos = "https://cloud.r-project.org/")
}
Paket 'tidymodels' erfolgreich ausgepackt und MD5 Summen abgeglichen
Die heruntergeladenen Binärpakete sind in
C:\Users\wolke\AppData\Local\Temp\RtmpCmQG1d\downloaded_packages
Code
# Load all required packages, silencing their startup messages
suppressPackageStartupMessages({
  library(here)
  library(dplyr)
  library(ranger)
  library(tidymodels)
  library(caret)
  library(skimr)
  library(DALEXtra)
  library(hstats)
  library(rsample)
  library(tune)
  library(parsnip)
  library(recipes)
  library(workflows)
  library(yardstick)
  library(dials)
  library(forcats)
})

# Resolve function-name conflicts in favor of tidymodels
tidymodels_prefer()
set.seed(123)
tictoc::tic()
set.seed(123)

# Model specification: ranger random forest (regression) with
# mtry, trees, and min_n all marked for tuning
mod_spec <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()
) %>%
  set_engine(
    "ranger",
    importance = "permutation",
    respect.unordered.factors = TRUE
  ) %>%
  set_mode("regression")

# Search ranges for the tuned hyperparameters
rf_params <- parameters(
  mtry(range = c(2L, 10L)),
  min_n(range = c(5L, 15L)),
  trees(range = c(1000L, 5000L))
)

# Set up cross-validation: 10 folds, repeated three times
# NOTE(review): assumes `train` was created in an earlier chunk
set.seed(123)
folds <- vfold_cv(train, v = 10, repeats = 3)

# Keep the data splits together for later use in predictions
list_split_data <- list(
  split_info = split_info,
  train = train,
  test = test,
  folds = folds
)
Code
# Loop through response variables: for each response, build a recipe,
# tune a random forest with Bayesian optimization, fit the final model,
# predict on the test set, and build a DALEX explainer with xAI outputs.
#
# NOTE(review): assumes `responses`, `sp_id`, `train`, `test`,
# `split_info`, `folds`, `mod_spec`, and `rf_params` exist from earlier
# chunks, and that `train` contains columns `verbatimIdentification`
# and `scientificName`.

# Preallocate the results container (the original indexed into an
# object that was never initialized in this chunk).
results_response <- vector("list", length(responses))

for (resp_i in seq_along(responses)) {

  # Model recipe: current response against all remaining predictors.
  # Building the formula programmatically replaces the three duplicated
  # if/else-if branches of the original (one per response name).
  mod_rec <- recipe(
    as.formula(paste(responses[resp_i], "~ .")),
    data = train
  ) %>%
    # species identifier must not act as a predictor
    update_role(
      all_of(sp_id),
      new_role = "speciesID",
      old_role = "predictor"
    ) %>%
    # the other response columns must not act as predictors either
    update_role(
      all_of(responses[-resp_i]),
      old_role = "predictor",
      new_role = "alt_resp"
    )

  # Create workflow with recipe and shared model specification
  set.seed(123)
  mod_wf <- workflow() %>%
    add_recipe(mod_rec) %>%
    add_model(mod_spec)

  #----------------------------------------------------------#
  # Bayesian hyperparameter tuning ----
  #----------------------------------------------------------#
  tictoc::tic()
  set.seed(123)
  tuned_bayes <- tune_bayes(
    mod_wf,
    resamples = folds,
    param_info = rf_params,
    initial = 5,
    iter = 50,
    # verbose = T in the original; TRUE is the robust spelling
    control = control_bayes(verbose = TRUE, no_improve = 10, seed = 123)
  )
  tictoc::toc() # 1233.37 sec elapsed

  # Best hyperparameter combination by RMSE
  best <- tuned_bayes %>% select_best(metric = "rmse")

  #----------------------------------------------------------#
  # Final model fit ----
  #----------------------------------------------------------#
  final_wf <- mod_wf %>% finalize_workflow(best)

  # Evaluate the finalized workflow directly on the testing data
  final_fit <- final_wf %>% last_fit(split_info)

  #----------------------------------------------------------#
  # Predictions on test data ----
  #----------------------------------------------------------#
  p <- predict(
    final_fit %>% tune::extract_workflow(),
    new_data = test
  ) %>%
    bind_cols(
      # all_of() wrapper fixes the tidyselect deprecation warning the
      # original emitted for the bare external vector `responses[resp_i]`
      test %>%
        select(
          all_of(responses[resp_i]),
          verbatimIdentification,
          scientificName
        )
    ) %>%
    setNames(
      c(".pred", responses[resp_i], "verbatimIdentification", "scientificName")
    ) %>%
    mutate(resid = .[[responses[resp_i]]] - .pred)

  #----------------------------------------------------------#
  # Explainable AI (xAI) ----
  #----------------------------------------------------------#
  fit <- final_fit %>% extract_fit_parsnip()

  explainer <- DALEXtra::explain_tidymodels(
    label = responses[resp_i],
    fit,
    # predictors only: drop all responses and the identifier columns
    data = train %>%
      select(!any_of(c(responses, "verbatimIdentification", "scientificName"))),
    y = train %>% pull(responses[resp_i])
  )

  # Collect all per-response artefacts
  results_response[[resp_i]] <- list(
    tuned_bayes = tuned_bayes,
    best = best,
    final_fit = final_fit,
    predictions = p,
    explainer = explainer,
    pd = model_profile(
      explainer,
      type = "partial",
      N = NULL,
      variables = names(explainer$data)
    ),
    md = model_diagnostics(explainer),
    mp = model_performance(explainer),
    vi = model_parts(explainer)
  )
}
tictoc::toc()

saveRDS(
  results_response,
  here::here("Demo_NewYork/Data/output/B_01_NY_model_results.rds")
)
# Read the saved modeling results back in
model_results <- readRDS(
  here::here("Demo_NewYork/Data/output/B_01_NY_model_results.rds")
)
importances <- readRDS(
  here::here("Demo_NewYork/Data/output/B_01_NY_variable_importance.rds")
)
interactions <- readRDS(
  here::here("Demo_NewYork/Data/output/B_01_NY_interactions.rds")
)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(this_variable)
# Now:
data %>% select(all_of(this_variable))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Code
# Row-bind the per-variable partial-dependence results into one data
# frame per list element, separately for numeric and categorical PDPs.
num <- lapply(pdp_list_num, bind_rows)
# NOTE(review): `cat` masks base::cat() for the rest of the session
cat <- lapply(pdp_list_cat, bind_rows)