B_03_Phylogenetic_autocorrelation.R

B - 03: Phylogenetic autocorrelation

Code
# set libraries
package_list <- 
  c("dplyr","ggplot2", 
    "phylosignal", "ape", "phylobase",
    "tidymodels", "ranger", 
    "purrr", "tibble", 
    "ggplot2", 
    "here", "skimr", 
    "readr")

# load libraries from vector:
x <- 
  lapply(package_list, require, character = TRUE)

rm(list = ls())

Set paths for data

Code
# Data paths:
tax_path <-
  here("Data/input/Tax_lookup.csv")

dta_path <- 
  here("Data/output/1_all_predictors_merged.rds")

tree_path <- 
  here("Data/input/MRC_consensus_BirdTree.tre")

ranger_res_pooled_path <-
  here("Data/output/B_models/B_01_list_all_results_rf.rds")

ranger_res_regional_path <- 
  here("Data/output/B_models/B_02_rf_res_atlas_split.rds")

Load the data

Code
#----------------------------------------------------------#
# Load data  -----
#----------------------------------------------------------#

dta <- 
  readRDS(dta_path) %>%
  distinct(scientificName, verbatimIdentification, 
           datasetID, 
           Jaccard_dissim, log_R2_1, log_R2_1_per_year )

Tax <-
  read_csv(tax_path, 
           col_select = 
             c(verbatimIdentification, scientificName, BirdTree)
           ) %>%
  full_join(dta) %>%
  filter(!c(is.na(datasetID))) %>%
  mutate(BirdTree = gsub(" ", "_", BirdTree))

tree <-
  ape::read.tree(tree_path)

# BirdTree in data per atlas
sp_in_dta <-
  Tax %>%
  group_by(datasetID) %>%
  group_split()

res_full <- 
  readRDS(ranger_res_pooled_path)
res_split <- 
  readRDS(ranger_res_regional_path)

function: raw data

Code
# ──────────────────────────────────────────────────────────────#
# I. Phylogenetic autocorrelation in the raw data
# --------------------------------------------------------------#

# test subset:
# dat <- sp_in_dta[[1]] %>% head()
# tree <- tree
# region <- "czechia"
# response <- "Jaccard_dissim"


## Helper function for raw data ##
phylo_corr_df <- function(dat, tree, region) {
  
  ## 1 ─ mean value of each trait per tip
  dd_new <- dat %>%
    filter(!is.na(BirdTree)) %>%
    group_by(BirdTree) %>%
    summarise(across(
      c(Jaccard_dissim, log_R2_1, log_R2_1_per_year),
      ~ mean(.x, na.rm = TRUE)
    )) %>%
    column_to_rownames("BirdTree") %>%
    na.omit()
  
  ## 2 ─ prune tree to those tips
  tre_pruned <- tree %>%
    drop.tip(setdiff(tree$tip.label, rownames(dd_new))) %>%
    ladderize()
  
  ## 3 ─ build phylo4d object once
  p4d <- phylo4d(tre_pruned, dd_new)
  
  ## 4 ─ helper to reshape one correlogram
  corr_to_tbl <- function(response) {
    phyloCorrelogram(p4d, trait = response, ci.bs = 100)$res %>%
      as_tibble() %>%
      mutate(
        dist      = .[[1]], # mid‑point of the distance class
        I         = .[[4]], # Moran’s I 
        lower_CI  = .[[2]], # lower CI
        upper_CI  = .[[3]], # upper CI
        model     = "none",
        type      = "raw data",
        response  = response,
        region    = region,
        N         = nrow(dd_new),
        .keep = "used"
      )
  }
  
  ## 5 ─ run for the three traits & bind rows 
  map_dfr(
    #c("Jaccard_dissim"),
    c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year"),
    corr_to_tbl
  )
}
Code
plot_and_save <- function(dat,
                          responses   = c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year"),
                          regions     = c("5", "6", "13", "26"),
                          tree,
                          ci.bs       = 100,
                          out_dir     = here("Figures/B_models/phylo_autocorr/raw_data")) {

  if (dir.exists(out_dir) == FALSE) dir.create(out_dir)
  for (reg in regions) {
    
    # some species have the same BirdTree-name. 
    # We need to aggregate those together to match the phylogeny.
    dat_reg <- dat %>% 
      filter(datasetID == reg, !is.na(BirdTree)) %>%
      group_by(BirdTree) %>% 
      summarise(across(all_of(responses), \(x) mean(x, na.rm = TRUE))) %>% 
      column_to_rownames("BirdTree") %>% 
      na.omit()
    
      # 2. prune tree to match remaining tips
      tre_pruned <- tree %>% 
        drop.tip(setdiff(tree$tip.label, rownames(dat_reg))) %>% 
        ladderize()

      # 3. build p4d object & correlogram
      p4d  <- phylo4d(tre_pruned, dat_reg)
      
    for (trait in responses) {

      print(paste(reg, trait, "...started..."))
      
      tictoc::tic()
      pcor <- phyloCorrelogram(p4d, trait = trait, ci.bs = ci.bs) # takes between 78-140 sec
      tictoc::toc()
      
      if (reg == 5) region_name <- "Czechia" else
        if (reg == 6) region_name <- "New_York" else
          if (reg == 13) region_name <- "Japan" else
            if (reg == 26) region_name <- "Europe"
      
      plot(pcor, main = sprintf("%s – %s", region_name, trait))
      
      # 4. plot & save
      svg(file.path(out_dir, sprintf("%s_%s_raw_data.svg", region_name, trait)),
          width = 6,
          height = 4)
      plot(pcor, main = sprintf("%s – %s", region_name, trait),
           ylab = "Moran's I")
      dev.off()
      
      print(paste0("... plot saved to: ", out_dir, "/", region_name, "_", trait,"_raw_data.svg"))
    }
  }
  invisible(TRUE)
}


plot_and_save(
  dat         = Tax, 
  tree        = tree,
  responses   = c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year"),
  regions     = c("5", "6", "13", "26"),
  ci.bs       = 100,
  out_dir     = here("Figures/B_models/phylo_autocorr/raw_data")
)
[1] "5 Jaccard_dissim ...started..."
85 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Czechia_Jaccard_dissim_raw_data.svg"
[1] "5 log_R2_1 ...started..."
90.58 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Czechia_log_R2_1_raw_data.svg"
[1] "5 log_R2_1_per_year ...started..."
94.19 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Czechia_log_R2_1_per_year_raw_data.svg"
[1] "6 Jaccard_dissim ...started..."
110.48 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/New_York_Jaccard_dissim_raw_data.svg"
[1] "6 log_R2_1 ...started..."
92.73 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/New_York_log_R2_1_raw_data.svg"
[1] "6 log_R2_1_per_year ...started..."
100.35 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/New_York_log_R2_1_per_year_raw_data.svg"
[1] "13 Jaccard_dissim ...started..."
88.39 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Japan_Jaccard_dissim_raw_data.svg"
[1] "13 log_R2_1 ...started..."
88.33 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Japan_log_R2_1_raw_data.svg"
[1] "13 log_R2_1_per_year ...started..."
91.41 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Japan_log_R2_1_per_year_raw_data.svg"
[1] "26 Jaccard_dissim ...started..."
550.31 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Europe_Jaccard_dissim_raw_data.svg"
[1] "26 log_R2_1 ...started..."
597.23 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Europe_log_R2_1_raw_data.svg"
[1] "26 log_R2_1_per_year ...started..."
645.01 sec elapsed

[1] "... plot saved to: C:/Users/wolke/OneDrive - CZU v Praze/Frieda_PhD_files/02_StaticPatterns/Git/Figures/B_models/phylo_autocorr/raw_data/Europe_log_R2_1_per_year_raw_data.svg"

I. Calculate phylo autocorr for responses in the raw data

Code
# ---------------------------------------------------- #
# Run the function for raw data phylo autocorr 
# ---------------------------------------------------- #

# Across regions:
region_names <- c("Czechia", "New York", "Japan", "Europe")

all_corr <- map2_dfr(
  sp_in_dta,                    # list of four data frames
  region_names,                 # parallel vector of region labels
  ~ phylo_corr_df(.x, tree, .y) # .x = data, .y = region
)

saveRDS(all_corr, here("Data/output/temp/B_03_phylo_autocorr_raw_data.rds"))

Check the results:

Code
# Check results
head(all_corr)
# A tibble: 6 × 9
   dist     I lower_CI upper_CI model type     response       region      N
  <dbl> <dbl>    <dbl>    <dbl> <chr> <chr>    <chr>          <chr>   <int>
1  0    0.270    0.154    0.404 none  raw data Jaccard_dissim Czechia   208
2  1.82 0.269    0.148    0.413 none  raw data Jaccard_dissim Czechia   208
3  3.64 0.269    0.152    0.402 none  raw data Jaccard_dissim Czechia   208
4  5.45 0.268    0.149    0.401 none  raw data Jaccard_dissim Czechia   208
5  7.27 0.267    0.146    0.403 none  raw data Jaccard_dissim Czechia   208
6  9.09 0.267    0.157    0.404 none  raw data Jaccard_dissim Czechia   208

Plot results

Code
# Plot results
ggplot(all_corr %>% filter(response != "log_R2_1_per_year"),
       aes(dist, I, colour = response, fill = response)) +
  geom_hline(yintercept = 0)+
  geom_ribbon(aes(ymin = lower_CI, ymax = upper_CI),
              alpha = 0.25, colour = NA) +
  geom_line(linewidth = 1.1) +
  facet_wrap(~ region) +
  labs(x = "Phylogenetic distance", y = "Moran’s I") +
  theme_classic()

function: model residuals

Code
# ──────────────────────────────────────────────────────────────#
# II. Phylogenetic autocorrelation in the model residuals
# --------------------------------------------------------------#

## Helper function for residuals ##

resid_correlogram_df <- function(pred_df,
                                 tree,
                                 Tax,
                                 response = c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year"), 
                                 model   = c("pooled", "regional"),
                                 region  = c("all", "cz", "ny", "jp", "eu")) {
  
  model  <- match.arg(model)
  region <- match.arg(region)
  
  ## 1 ─ mean residual per tip -------------------------------------------
  tip_resid <- pred_df %>%
    select(verbatimIdentification, resid) %>%
    left_join(
      Tax %>% select(verbatimIdentification, BirdTree),
      by = "verbatimIdentification"
    ) %>%
    distinct() %>%
    filter(!is.na(BirdTree)) %>%
    group_by(BirdTree) %>%
    summarise(resid = mean(resid, na.rm = TRUE), .groups = "drop") %>%
    column_to_rownames("BirdTree")
  
  ## 2 ─ prune the tree ---------------------------------------------------
  tre_pruned <- tree %>%
    drop.tip(setdiff(tree$tip.label, rownames(tip_resid))) %>%
    ladderize()
  
  ## 3 ─ correlogram ------------------------------------------------------
  cor_obj <- phylo4d(tre_pruned, tip_resid) %>%
    phyloCorrelogram(trait = "resid", ci.bs = 100)
  
  ## 4 ─ tidy output ------------------------------------------------------
  as_tibble(cor_obj$res) %>%
    mutate(
      dist      = .[[1]], # mid‑point of the distance class
      I         = .[[4]], # Moran’s I 
      lower_CI  = .[[2]], # lower CI
      upper_CI  = .[[3]], # upper CI
      model     = model,
      type      = "residuals",
      response  = response,
      region    = region,
      N         = nrow(tip_resid),
      .keep = "used"
    )
}

II. Calculate phylo autocorr in the model residuals

in the pooled model

Code
# ---------------------------------------------------- #
# Run the function for model residuals phylo autocorr 
# ---------------------------------------------------- #

# a) Pooled model
responses <- c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year")

corr_pooled <- purrr::imap_dfr(
  res_full$predictions,                           # list length = 3
  ~ resid_correlogram_df(
    pred_df  = .x,
    tree     = tree,
    Tax   = Tax,
    response = responses[.y],                   # tag by position
    model    = "pooled",                        
    region = "all"
  )
)

in the regional models

Code
# b) models split by region
regions   <- c("cz", "ny", "jp", "eu")           
responses <- c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year")

corr_regional <- purrr::imap_dfr(res_split, \(region_list, i_reg) {
  region <- regions[i_reg]
  
  purrr::imap_dfr(region_list, \(resp_list, i_resp) {
    resid_correlogram_df(
      pred_df  = resp_list$predictions,
      tree     = tree,
      Tax   = Tax,
      response = responses[i_resp],
      model    = "regional",
      region   = region
    )
  })
})

Merge both results into one data frame

Code
# Merge split and pooled results
corr_residuals_all <- bind_rows(corr_pooled, corr_regional) 
saveRDS(corr_residuals_all, here("Data/output/temp/B_03_phylo_autocorr_model_residuals.rds"))

Check the results

Code
# Check results
head(corr_residuals_all)
# A tibble: 6 × 9
   dist     I lower_CI upper_CI model  type      response       region     N
  <dbl> <dbl>    <dbl>    <dbl> <chr>  <chr>     <chr>          <chr>  <int>
1  0    0.127  0.00613    0.296 pooled residuals Jaccard_dissim all      191
2  1.82 0.125  0.00618    0.286 pooled residuals Jaccard_dissim all      191
3  3.64 0.124  0.00796    0.304 pooled residuals Jaccard_dissim all      191
4  5.45 0.122  0.0133     0.293 pooled residuals Jaccard_dissim all      191
5  7.27 0.121  0.00731    0.278 pooled residuals Jaccard_dissim all      191
6  9.09 0.119  0.00754    0.278 pooled residuals Jaccard_dissim all      191

Plot results

Code
# Plot results
corr_residuals_all %>% 
  filter(response != "log_R2_1_per_year") %>%
  ggplot(aes(dist, I, colour = response, fill = response)) +
  geom_ribbon(aes(ymin = lower_CI, 
                  ymax = upper_CI),
              alpha = 0.25, 
              colour = NA) +
  geom_line(linewidth = 1.1) +
  facet_wrap(~ region + model, scales = "free_y") +
  labs(x = "Phylogenetic distance", y = "Moran’s I of residuals") +
  theme_classic()