A_04_Calculate_atlas_variables.R

A - 04 Calculate atlas variables

Source 00_Configuration.R to load libraries

Code
library(skimr)
library(kableExtra)
source(here::here("Code/00_Configuration.R"))
x <- lapply(package_list, require, character.only = TRUE)
rm(x)

Enable s2 spherical geometry processing for the sf package (s2 is used for geographic coordinates; projected data are handled by GEOS)

Code
sf_use_s2(TRUE)

Load data

Read spatial dataset

Code
data_sf <- 
  readRDS(here("Data/output/1_data_sf.rds"))

Calculate grid information

Custom function to get total and sampled area/cells from datasets

Code
calculate_grid_info <- function(data_sf) {
  grid_info <-
    full_join(
      # Total number of cells per dataset and scale
      data_sf %>%
        st_drop_geometry() %>%
        distinct(datasetID, scalingID, siteID, cell_sampling_repeats, croppedArea, time_span) %>%
        group_by(datasetID, scalingID) %>%
        summarise(
          Total_Ncells = n_distinct(siteID)),
      # Cells sampled in both sampling periods and their summed cropped area
      data_sf %>%
        st_drop_geometry() %>%
        distinct(datasetID, scalingID, siteID, cell_sampling_repeats, croppedArea, time_span) %>%
        filter(cell_sampling_repeats == 2) %>%
        group_by(datasetID, scalingID, time_span) %>%
        summarise(
          Total_Ncells_samp = n_distinct(siteID),
          Total_area_samp = sum(croppedArea, na.rm = TRUE))
    )
  return(grid_info)
}

Apply function to compute grid information

Code
atlas_areas <-
  calculate_grid_info(data_sf)

Load filtered data

Read processed species data & merge with atlas_areas

Code
pres_dat_final <- readRDS(here("Data/output/1_data_filtered.rds")) %>%
  left_join(atlas_areas) %>%
  mutate(scalingID = factor(scalingID, levels = vars$desired_levels)) %>%
  filter(!is.na(verbatimIdentification)) # Remove unsampled cells

Overview:

Code
atlas_areas %>%
  kable()
datasetID scalingID Total_Ncells time_span Total_Ncells_samp Total_area_samp
5 1 671 2 628 78308.81
5 1 671 4 628 78308.81
5 2 185 2 176 78751.93
5 2 185 4 176 78751.93
5 4 55 2 54 78871.95
5 4 55 4 54 78871.95
5 8 18 2 18 78873.98
5 8 18 4 18 78873.98
5 16 6 2 6 78873.98
5 16 6 4 6 78873.98
5 32 2 2 2 78873.98
5 32 2 4 2 78873.98
5 64 1 2 1 78873.98
5 64 1 4 1 78873.98
6 1 5335 5 5319 126878.54
6 2 1398 5 1396 127118.11
6 4 386 5 384 127118.11
6 8 113 5 112 127121.79
6 16 37 5 37 127146.80
6 32 13 5 13 127146.80
6 64 5 5 5 127146.80
6 128 2 5 2 127146.80
13 1 1309 4 1184 367713.86
13 1 1309 5 1184 367713.86
13 2 429 4 376 370862.75
13 2 429 5 376 370862.75
13 4 150 4 124 371375.95
13 4 150 5 124 371375.95
13 8 56 4 43 371668.23
13 8 56 5 43 371668.23
13 16 21 4 17 371860.74
13 16 21 5 17 371860.74
13 32 9 4 7 371879.89
13 32 9 5 7 371879.89
13 64 4 4 3 371879.89
13 64 4 5 3 371879.89
13 128 1 4 1 371982.46
13 128 1 5 1 371982.46
26 1 5080 4 2821 5909157.77
26 1 5080 23 2821 5909157.77
26 2 1394 4 810 6099573.67
26 2 1394 23 810 6099573.67
26 4 379 4 231 6303088.59
26 4 379 23 231 6303088.59
26 8 166 4 106 6552320.49
26 8 166 23 106 6552320.49
26 16 41 4 30 7447947.85
26 16 41 23 30 7447947.85
26 32 13 4 10 9854014.22
26 32 13 23 10 9854014.22
26 64 6 4 5 10619391.39
26 64 6 23 5 10619391.39
26 128 1 4 1 11171485.87
26 128 1 23 1 11171485.87

Summary statistics:

Code
atlas_areas %>%
  skim()
Data summary
Name                    Piped data
Number of rows          54
Number of columns       6
Column type frequency:
  numeric               5
Group variables         datasetID

Variable type: numeric

skim_variable datasetID n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
scalingID 5 0 1 18.14 22.04 1.00 2.50 8.00 28.00 64.00 ▇▂▂▁▂
scalingID 6 0 1 31.88 44.27 1.00 3.50 12.00 40.00 128.00 ▇▂▂▁▂
scalingID 13 0 1 31.88 42.77 1.00 3.50 12.00 40.00 128.00 ▇▂▂▁▂
scalingID 26 0 1 31.88 42.77 1.00 3.50 12.00 40.00 128.00 ▇▂▂▁▂
Total_Ncells 5 0 1 134.00 236.05 1.00 3.00 18.00 152.50 671.00 ▇▂▁▁▂
Total_Ncells 6 0 1 911.12 1849.28 2.00 11.00 75.00 639.00 5335.00 ▇▁▁▁▁
Total_Ncells 13 0 1 247.38 437.29 1.00 7.75 38.50 219.75 1309.00 ▇▁▁▁▁
Total_Ncells 26 0 1 885.00 1699.29 1.00 11.25 103.50 632.75 5080.00 ▇▁▁▁▁
time_span 5 0 1 3.00 1.04 2.00 2.00 3.00 4.00 4.00 ▇▁▁▁▇
time_span 6 0 1 5.00 0.00 5.00 5.00 5.00 5.00 5.00 ▁▁▇▁▁
time_span 13 0 1 4.50 0.52 4.00 4.00 4.50 5.00 5.00 ▇▁▁▁▇
time_span 26 0 1 13.50 9.81 4.00 4.00 13.50 23.00 23.00 ▇▁▁▁▇
Total_Ncells_samp 5 0 1 126.43 220.75 1.00 3.00 18.00 145.50 628.00 ▇▂▁▁▂
Total_Ncells_samp 6 0 1 908.50 1843.88 2.00 11.00 74.50 637.00 5319.00 ▇▁▁▁▁
Total_Ncells_samp 13 0 1 219.38 395.95 1.00 6.00 30.00 187.00 1184.00 ▇▁▁▁▁
Total_Ncells_samp 26 0 1 501.75 942.74 1.00 8.75 68.00 375.75 2821.00 ▇▁▁▁▁
Total_area_samp 5 0 1 78775.51 202.47 78308.81 78781.93 78873.98 78873.98 78873.98 ▂▁▁▂▇
Total_area_samp 6 0 1 127102.97 91.70 126878.54 127118.11 127134.29 127146.80 127146.80 ▁▁▁▁▇
Total_area_samp 13 0 1 371152.97 1389.14 367713.86 371247.65 371764.49 371879.89 371982.46 ▁▁▁▁▇
Total_area_samp 26 0 1 7994622.48 2117229.59 5909157.77 6252209.86 7000134.17 10045358.51 11171485.87 ▇▂▁▂▃

Save results

Code
write.csv(atlas_areas,
          paste0(vars$Documentation, "atlas_areas_METADATA.csv"))

Calculate Jaccard index

Custom function to calculate the Jaccard index across sites

Code
jaccard <- function(set1, set2) {
  a <- length(base::intersect(set1, set2)) # sites occupied in both periods
  b <- length(setdiff(set2, set1))         # sites occupied only in set2
  c <- length(setdiff(set1, set2))         # sites occupied only in set1
  n_union <- length(base::union(set1, set2))
  return(list(jaccard_index = a / (a + b + c), a = a, b = b, c = c, n_union = n_union))
}
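
For orientation, a minimal toy example (hypothetical site IDs, not taken from the atlas data): a counts sites occupied in both periods, b sites occupied only in the second, c sites occupied only in the first, and the Jaccard index is a / (a + b + c).

Code
# Hypothetical site IDs for one species in two sampling periods
period1_sites <- c(1, 2, 3, 4)
period2_sites <- c(3, 4, 5)

jaccard(period1_sites, period2_sites)
# a = 2, b = 1, c = 2, n_union = 5; jaccard_index = 2 / (2 + 1 + 2) = 0.4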

Calculate Jaccard for all datasets

Code
Jaccard_df <- pres_dat_final %>%
  filter(scalingID == 1) %>%
  group_by(datasetID, verbatimIdentification) %>%
  mutate(
    jaccard_results = list(jaccard(
      filter(pick(everything()), samplingPeriodID == 1)$siteID,
      filter(pick(everything()), samplingPeriodID == 2)$siteID
    ))
  ) %>%
  unnest_wider(jaccard_results) %>%
  group_by(verbatimIdentification, datasetID) %>%
  mutate(d = Total_Ncells_samp - n_union) %>%
  rename(Jaccard_sim = jaccard_index) %>%
  mutate(Jaccard_dissim = 1 - Jaccard_sim) %>%
  ungroup() %>%
  distinct(datasetID, verbatimIdentification, Jaccard_dissim, Jaccard_sim, a, b, c, d, Total_Ncells_samp)

Save Jaccard results

Code
write.csv(Jaccard_df, here::here("Data/output/results/A_Jaccard_df.csv"))

Merge:

Code
pres_dat_final_v2 <- pres_dat_final %>%
  left_join(Jaccard_df)

head(pres_dat_final_v2)
  datasetID scalingID siteID footprintSRS verbatimFootprintSRS     area
1         5         1      2    epsg:4326            epsg:5514 130.0204
2         5         1      2    epsg:4326            epsg:5514 130.0204
3         5         1      2    epsg:4326            epsg:5514 130.0204
4         5         1      2    epsg:4326            epsg:5514 130.0204
5         5         1      2    epsg:4326            epsg:5514 130.0204
6         5         1      2    epsg:4326            epsg:5514 130.0204
  croppedArea areaUnit croppedAreaPercent centroidDecimalLongitude
1    41.36332      km2          0.3181295                 14.41488
2    41.36332      km2          0.3181295                 14.41488
3    41.36332      km2          0.3181295                 14.41488
4    41.36332      km2          0.3181295                 14.41488
5    41.36332      km2          0.3181295                 14.41488
6    41.36332      km2          0.3181295                 14.41488
  centroidDecimalLatitude croppedDecimalLongitude croppedDecimalLatitude
1                51.04962                 14.4096               51.01669
2                51.04962                 14.4096               51.01669
3                51.04962                 14.4096               51.01669
4                51.04962                 14.4096               51.01669
5                51.04962                 14.4096               51.01669
6                51.04962                 14.4096               51.01669
  northSouthLength eastWestLength maxLength croppedNorthSouthLength
1         11.12541           11.7  16.13557                5.178504
2         11.12541           11.7  16.13557                5.178504
3         11.12541           11.7  16.13557                5.178504
4         11.12541           11.7  16.13557                5.178504
5         11.12541           11.7  16.13557                5.178504
6         11.12541           11.7  16.13557                5.178504
  croppedEastWestLength croppedMaxLength lengthUnit startYear endYear
1              11.69983         12.78905         km      2001    2003
2              11.69983         12.78905         km      2001    2003
3              11.69983         12.78905         km      2001    2003
4              11.69983         12.78905         km      1985    1989
5              11.69983         12.78905         km      1985    1989
6              11.69983         12.78905         km      1985    1989
   verbatimIdentification          scientificName samplingPeriodID time_span
1 Nucifraga caryocatactes Nucifraga caryocatactes                2         2
2      Anas platyrhynchos      Anas platyrhynchos                2         2
3         Aythya fuligula         Aythya fuligula                2         2
4           Ciconia nigra           Ciconia nigra                1         4
5     Carduelis carduelis     Carduelis carduelis                1         4
6         Passer montanus         Passer montanus                1         4
  cell_sampling_repeats cells_keep sp_remove_expert introduced
1                     2          1                0          0
2                     2          1                0          0
3                     2          1                0          0
4                     2          1                0          0
5                     2          1                0          0
6                     2          1                0          0
  sp_sampling_repeats species_keep Total_Ncells Total_Ncells_samp
1                   2            1          671               628
2                   2            1          671               628
3                   2            1          671               628
4                   2            1          671               628
5                   2            1          671               628
6                   2            1          671               628
  Total_area_samp Jaccard_dissim Jaccard_sim   a   b  c   d
1        78308.81     0.39066339   0.6093366 248  91 68 221
2        78308.81     0.05806452   0.9419355 584  25 11   8
3        78308.81     0.26171875   0.7382812 378  71 63 116
4        78308.81     0.36900369   0.6309963 342 152 48  86
5        78308.81     0.02388535   0.9761146 613  10  5   0
6        78308.81     0.06310680   0.9368932 579  13 26  10

Calculate area of occupancy (AOO)

Code
occ_data_final <-
  pres_dat_final_v2 %>%
  ungroup() %>%
  distinct(datasetID, samplingPeriodID, scalingID,
           verbatimIdentification, siteID, time_span,
           .keep_all = TRUE) %>%
  group_by(datasetID, samplingPeriodID, scalingID, verbatimIdentification) %>%
  dplyr::summarise(
    mean_area = mean(croppedArea, na.rm = TRUE),
    AOO = sum(croppedArea, na.rm = TRUE),
    occ_Ncells = n_distinct(siteID)) %>%
  left_join(pres_dat_final_v2 %>%
              mutate(scalingID = as.factor(as.character(scalingID)))) %>%
  dplyr::mutate(
    rel_AOO = AOO / Total_area_samp,
    rel_occ_Ncells = occ_Ncells / Total_Ncells_samp) %>% # Prevalence
  distinct()
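
As a quick worked example of these quantities (made-up numbers, not from the atlas data): a species occupying 3 of 10 sampled cells of 100 km² each, in an atlas with Total_area_samp = 1000 km², has AOO = 3 × 100 = 300 km², rel_AOO = 300 / 1000 = 0.3, and rel_occ_Ncells (prevalence) = 3 / 10 = 0.3.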

Save AOO data to .rds

Code
saveRDS(occ_data_final, here::here("Data", "output", "temp", "A_04_occ_data_final.rds"))

Create species-level data

Code
species_data <-
  occ_data_final %>%
  dplyr::select(
    -siteID, -cell_sampling_repeats, -croppedArea, -sp_sampling_repeats
  ) %>%
  distinct(datasetID, samplingPeriodID, scalingID, verbatimIdentification,
           .keep_all = TRUE)

Calculate the occupancy-area relationship (OAR)

Custom function to calculate the OAR and fractal dimension

Code
calculate_OAR <- function(species_data) {
  datasets <- unique(species_data$datasetID)
  sampling_periods <- unique(species_data$samplingPeriodID)

  sp_dta <-
    species_data %>%
    select(datasetID, samplingPeriodID, scalingID,
           verbatimIdentification,
           rel_AOO, rel_occ_Ncells, mean_area, AOO)

  # Expand the grid for all combinations of datasetID and samplingPeriodID
  species_data_new <-
    expand_grid(
      datasetID = datasets,
      samplingPeriodID = sampling_periods) %>%
    inner_join(sp_dta, by = c("datasetID", "samplingPeriodID")) %>%
    group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
    # Count the available scales per species; species with fewer than
    # two scales cannot support an OAR fit and are flagged for exclusion
    summarise(
      available_scales = n(),
      mean_relAOO = mean(rel_AOO, na.rm = TRUE),
      exclude_sp_OAR = if_else(available_scales < 2, 1, 0),
      .groups = "drop"
    ) %>%
    # Remove those where the range is saturated at the smallest resolution
    mutate(
      exclude_sp_OAR = if_else(available_scales == 0, 1, exclude_sp_OAR),
      mean_relAOO = if_else(available_scales == 0, 1, mean_relAOO)
    ) %>%
    full_join(sp_dta, by = c("datasetID", "samplingPeriodID", "verbatimIdentification")) %>%
    # Keep species-scale combinations where relative occupancy is not saturated (< 1)
    filter(
      exclude_sp_OAR == 0,
      rel_occ_Ncells < 1
    ) %>%
    distinct() %>%
    filter_at(vars(scalingID, AOO, mean_area), any_vars(!is.na(.))) %>%
    ungroup()

  # Fit log-log models of AOO against grain area per species and sampling period
  species_data_new_v2 <-
    species_data_new %>%
    group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
    nest(data = c(scalingID, AOO, mean_area, rel_AOO, rel_occ_Ncells)) %>%
    mutate(
      coefficients = map(
        data,
        ~ .x %>%
          filter(
            !is.na(AOO) & AOO > 0,
            !is.na(mean_area) & mean_area > 0
          ) %>%
          lm(log(AOO) ~ log(mean_area), data = .) %>%
          coef() %>%
          {
            tibble(
              m_AOO_a = .[2],          # slope of the OAR
              b_AOO_a = .[1],          # intercept
              D_AOO_a = -2 * .[2] + 2  # fractal dimension, D = 2 - 2 * slope
            )
          }
      )
    ) %>%
    unnest(coefficients) %>%
    ungroup() %>%
    # Final cleanup of results
    select(datasetID, samplingPeriodID, verbatimIdentification, m_AOO_a, b_AOO_a, D_AOO_a) %>%
    distinct() %>%
    # Merge with the original dataset
    full_join(species_data) %>%
    filter(scalingID == 1) %>%
    select(
      datasetID, verbatimIdentification, samplingPeriodID, Total_area_samp,
      Total_Ncells, Total_Ncells_samp, AOO, occ_Ncells, rel_occ_Ncells,
      rel_AOO, Jaccard_dissim, m_AOO_a, b_AOO_a, D_AOO_a
    )

  return(species_data_new_v2)
}
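
To make the coefficient step above concrete, here is a minimal sketch with made-up AOO values for one species across four grains (not fitted to the atlas data): the slope m of log(AOO) on log(mean grain area) gives the fractal dimension as D_AOO_a = 2 − 2m, exactly as computed inside calculate_OAR.

Code
# Illustrative only: made-up AOO values for one species across four grain sizes
toy <- data.frame(
  mean_area = c(130, 520, 2080, 8320),     # mean cell area per grain (km2)
  AOO       = c(5200, 9900, 18500, 33000)  # summed occupied area per grain (km2)
)
fit <- lm(log(AOO) ~ log(mean_area), data = toy)
m <- unname(coef(fit)[2])                  # OAR slope
c(m_AOO_a = m, D_AOO_a = -2 * m + 2)       # fractal dimension D = 2 - 2m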

Apply the OAR function:

Code
species_data_new <- calculate_OAR(species_data) %>%
  distinct(datasetID, samplingPeriodID, verbatimIdentification, .keep_all = TRUE) %>%
  mutate(
    # Species without an OAR fit (range saturated across scales) default to D = 2
    D_AOO_a = case_when(is.na(D_AOO_a) ~ 2,
                        .default = D_AOO_a)) %>%
  left_join(species_data %>% filter(scalingID == 1)) %>%
  # Reduce columns
  select(
    datasetID, verbatimIdentification, samplingPeriodID,
    Total_area_samp, Total_Ncells, Total_Ncells_samp,
    AOO, occ_Ncells, rel_occ_Ncells, rel_AOO,
    Jaccard_dissim, a, b, c, d, D_AOO_a, time_span
  ) %>%
  distinct(datasetID, samplingPeriodID, verbatimIdentification,
           .keep_all = TRUE)

Calculate the log ratio of AOO

Code
time_periods <- vars$time_periods

Custom function to transform the data to wide format

Code
transform_to_wide <- function(species_data_new, time_periods = c(1, 2)) {
  wide_dfs <- list()

  # Build one wide data frame per time period and store it in the list
  for (i in seq_along(time_periods)) {
    wide_dfs[[i]] <- species_data_new %>%
      distinct(datasetID, samplingPeriodID, verbatimIdentification, AOO) %>%
      group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
      filter(samplingPeriodID == time_periods[i]) %>%
      setNames(paste0("samplingPeriodID", i, "_", names(.))) %>%
      ungroup() %>%
      select(-c(paste0("samplingPeriodID", i, "_samplingPeriodID"))) %>%
      dplyr::rename(
        verbatimIdentification = paste0("samplingPeriodID", i, "_verbatimIdentification"),
        datasetID = paste0("samplingPeriodID", i, "_datasetID")
      )
  }
  # Merge the wide data frames sequentially

  sp_dat_wide <- reduce(wide_dfs, full_join, by = c("verbatimIdentification", "datasetID"))

  cat("NA counts in wide data after processing:\n")
  print(colSums(is.na(sp_dat_wide)))

  cat("Preview of wide data:\n")
  print(head(sp_dat_wide))

  return(sp_dat_wide)
}

Define the number of years between the start of the first and the end of the second sampling period for each dataset

Code
time_between_samples <- data.frame(
  datasetID = c(5, 6, 13, 26),
  startYear1 = c(1985, 1980, 1974, 1972),
  endYear2 = c(2003, 2005, 2002, 2017)
) %>% mutate(
  n_years = endYear2 - startYear1 + 1)
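
As a check on the definition: for datasetID 5, n_years = 2003 − 1985 + 1 = 19.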

Apply function:

Code
sp_dat_wide <-
  transform_to_wide(species_data_new, time_periods) %>%
  na.omit() # drop species lost or gained completely
NA counts in wide data after processing:
             datasetID verbatimIdentification  samplingPeriodID1_AOO 
                     0                      0                      0 
 samplingPeriodID2_AOO 
                     0 
Preview of wide data:
# A tibble: 6 × 4
  datasetID verbatimIdentification   samplingPeriodID1_AOO samplingPeriodID2_AOO
      <int> <chr>                                    <dbl>                 <dbl>
1         5 Accipiter gentilis                      72264.                70596.
2         5 Accipiter nisus                         71868.                74755.
3         5 Acrocephalus arundinace…                30767.                32280.
4         5 Acrocephalus palustris                  67931.                72820.
5         5 Acrocephalus schoenobae…                35662.                43319.
6         5 Acrocephalus scirpaceus                 55801.                57993.

Calculate log ratio of AOO

log(S_t2 / S_t1) / (t2 − t1 + 1), i.e. the log ratio of AOO between the two sampling periods, divided by the number of years between them

Code
logRatio <-
  sp_dat_wide %>%
  left_join(time_between_samples) %>%
  group_by(datasetID, verbatimIdentification) %>%
  mutate(
    log_R2_1 = log(samplingPeriodID2_AOO / samplingPeriodID1_AOO),
    log_R2_1_per_year = log_R2_1 / n_years) %>%
  select(-samplingPeriodID1_AOO, -samplingPeriodID2_AOO)
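
For illustration (made-up numbers, not from the atlas data): a species whose AOO grows from 100 km² in the first period to 150 km² in the second, in a dataset with n_years = 19, has log_R2_1 = log(150/100) ≈ 0.405 and log_R2_1_per_year ≈ 0.405 / 19 ≈ 0.021; negative values indicate range contraction.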

Plots: histograms

  1. Log ratio per year
Code
ggplot(logRatio, aes(x = log_R2_1_per_year, fill = as.factor(datasetID))) +
  geom_histogram() +
  facet_wrap(~as.factor(datasetID))+
  labs(title = "Histogram of log ratio of AOO per year",
       x = "log(St2 /St1 )/(t2 − t1 + 1)",
       y = "Frequency")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  2. Log ratio without year adjustment
Code
ggplot(logRatio, aes(x = log_R2_1, fill = as.factor(datasetID))) +
  geom_histogram() +
  facet_wrap(~as.factor(datasetID))+
  labs(title = "Histogram of log ratio of AOO per time period",
       x = "log(St2 /St1 )",
       y = "Frequency")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
big_table <-
  full_join(species_data_new, logRatio) %>%
  distinct(datasetID, samplingPeriodID, verbatimIdentification,
           .keep_all = TRUE) %>%
  mutate_if(is.numeric, round, 3)
Joining with `by = join_by(datasetID, verbatimIdentification)`

Save results to .rds

Code
saveRDS(species_data_new, here("Data/output/temp/A_04_species_data.rds"))
saveRDS(big_table, here("Data/output/A_predictors/Big_table.rds"))