Code
library(skimr)
library(kableExtra)
source(here::here("Code/00_Configuration.R"))
x <- lapply(package_list, require, character = TRUE)
rm(x)Source 00_Configuration.R to load libraries
library(skimr)
library(kableExtra)
source(here::here("Code/00_Configuration.R"))
x <- lapply(package_list, require, character = TRUE)
rm(x)Enable s2 processing for sf package with projected data
sf_use_s2(TRUE)Read spatial dataset
data_sf <-
readRDS(here("Data/output/1_data_sf.rds"))Custom function to get total and sampled area/cells from datasets
calculate_grid_info <- function(data_sf) {
grid_info <-
full_join(
data_sf %>%
st_drop_geometry() %>%
distinct(datasetID, scalingID, siteID, cell_sampling_repeats, croppedArea, time_span) %>%
group_by(datasetID, scalingID) %>%
summarise(
Total_Ncells = n_distinct(siteID)),
data_sf %>%
st_drop_geometry() %>%
distinct(datasetID, scalingID, siteID, cell_sampling_repeats, croppedArea, time_span) %>%
filter(cell_sampling_repeats == 2) %>%
group_by(datasetID, scalingID, time_span) %>%
summarise(
Total_Ncells_samp = n_distinct(siteID),
Total_area_samp = sum(croppedArea, na.rm = TRUE))
)
return(grid_info)
}Apply function to compute grid information
atlas_areas <-
calculate_grid_info(data_sf)Read processed species data & merge with atlas_areas
pres_dat_final <- readRDS(here("Data/output/1_data_filtered.rds")) %>%
left_join(atlas_areas) %>%
mutate(scalingID = factor(scalingID, levels = vars$desired_levels)) %>%
filter(!is.na(verbatimIdentification)) # Remove unsampled cellsOverview:
atlas_areas %>%
kable()| datasetID | scalingID | Total_Ncells | time_span | Total_Ncells_samp | Total_area_samp |
|---|---|---|---|---|---|
| 5 | 1 | 671 | 2 | 628 | 78308.81 |
| 5 | 1 | 671 | 4 | 628 | 78308.81 |
| 5 | 2 | 185 | 2 | 176 | 78751.93 |
| 5 | 2 | 185 | 4 | 176 | 78751.93 |
| 5 | 4 | 55 | 2 | 54 | 78871.95 |
| 5 | 4 | 55 | 4 | 54 | 78871.95 |
| 5 | 8 | 18 | 2 | 18 | 78873.98 |
| 5 | 8 | 18 | 4 | 18 | 78873.98 |
| 5 | 16 | 6 | 2 | 6 | 78873.98 |
| 5 | 16 | 6 | 4 | 6 | 78873.98 |
| 5 | 32 | 2 | 2 | 2 | 78873.98 |
| 5 | 32 | 2 | 4 | 2 | 78873.98 |
| 5 | 64 | 1 | 2 | 1 | 78873.98 |
| 5 | 64 | 1 | 4 | 1 | 78873.98 |
| 6 | 1 | 5335 | 5 | 5319 | 126878.54 |
| 6 | 2 | 1398 | 5 | 1396 | 127118.11 |
| 6 | 4 | 386 | 5 | 384 | 127118.11 |
| 6 | 8 | 113 | 5 | 112 | 127121.79 |
| 6 | 16 | 37 | 5 | 37 | 127146.80 |
| 6 | 32 | 13 | 5 | 13 | 127146.80 |
| 6 | 64 | 5 | 5 | 5 | 127146.80 |
| 6 | 128 | 2 | 5 | 2 | 127146.80 |
| 13 | 1 | 1309 | 4 | 1184 | 367713.86 |
| 13 | 1 | 1309 | 5 | 1184 | 367713.86 |
| 13 | 2 | 429 | 4 | 376 | 370862.75 |
| 13 | 2 | 429 | 5 | 376 | 370862.75 |
| 13 | 4 | 150 | 4 | 124 | 371375.95 |
| 13 | 4 | 150 | 5 | 124 | 371375.95 |
| 13 | 8 | 56 | 4 | 43 | 371668.23 |
| 13 | 8 | 56 | 5 | 43 | 371668.23 |
| 13 | 16 | 21 | 4 | 17 | 371860.74 |
| 13 | 16 | 21 | 5 | 17 | 371860.74 |
| 13 | 32 | 9 | 4 | 7 | 371879.89 |
| 13 | 32 | 9 | 5 | 7 | 371879.89 |
| 13 | 64 | 4 | 4 | 3 | 371879.89 |
| 13 | 64 | 4 | 5 | 3 | 371879.89 |
| 13 | 128 | 1 | 4 | 1 | 371982.46 |
| 13 | 128 | 1 | 5 | 1 | 371982.46 |
| 26 | 1 | 5080 | 4 | 2821 | 5909157.77 |
| 26 | 1 | 5080 | 23 | 2821 | 5909157.77 |
| 26 | 2 | 1394 | 4 | 810 | 6099573.67 |
| 26 | 2 | 1394 | 23 | 810 | 6099573.67 |
| 26 | 4 | 379 | 4 | 231 | 6303088.59 |
| 26 | 4 | 379 | 23 | 231 | 6303088.59 |
| 26 | 8 | 166 | 4 | 106 | 6552320.49 |
| 26 | 8 | 166 | 23 | 106 | 6552320.49 |
| 26 | 16 | 41 | 4 | 30 | 7447947.85 |
| 26 | 16 | 41 | 23 | 30 | 7447947.85 |
| 26 | 32 | 13 | 4 | 10 | 9854014.22 |
| 26 | 32 | 13 | 23 | 10 | 9854014.22 |
| 26 | 64 | 6 | 4 | 5 | 10619391.39 |
| 26 | 64 | 6 | 23 | 5 | 10619391.39 |
| 26 | 128 | 1 | 4 | 1 | 11171485.87 |
| 26 | 128 | 1 | 23 | 1 | 11171485.87 |
Summary statistics:
atlas_areas %>%
skim()| Name | Piped data |
| Number of rows | 54 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | datasetID |
Variable type: numeric
| skim_variable | datasetID | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| scalingID | 5 | 0 | 1 | 18.14 | 22.04 | 1.00 | 2.50 | 8.00 | 28.00 | 64.00 | ▇▂▂▁▂ |
| scalingID | 6 | 0 | 1 | 31.88 | 44.27 | 1.00 | 3.50 | 12.00 | 40.00 | 128.00 | ▇▂▂▁▂ |
| scalingID | 13 | 0 | 1 | 31.88 | 42.77 | 1.00 | 3.50 | 12.00 | 40.00 | 128.00 | ▇▂▂▁▂ |
| scalingID | 26 | 0 | 1 | 31.88 | 42.77 | 1.00 | 3.50 | 12.00 | 40.00 | 128.00 | ▇▂▂▁▂ |
| Total_Ncells | 5 | 0 | 1 | 134.00 | 236.05 | 1.00 | 3.00 | 18.00 | 152.50 | 671.00 | ▇▂▁▁▂ |
| Total_Ncells | 6 | 0 | 1 | 911.12 | 1849.28 | 2.00 | 11.00 | 75.00 | 639.00 | 5335.00 | ▇▁▁▁▁ |
| Total_Ncells | 13 | 0 | 1 | 247.38 | 437.29 | 1.00 | 7.75 | 38.50 | 219.75 | 1309.00 | ▇▁▁▁▁ |
| Total_Ncells | 26 | 0 | 1 | 885.00 | 1699.29 | 1.00 | 11.25 | 103.50 | 632.75 | 5080.00 | ▇▁▁▁▁ |
| time_span | 5 | 0 | 1 | 3.00 | 1.04 | 2.00 | 2.00 | 3.00 | 4.00 | 4.00 | ▇▁▁▁▇ |
| time_span | 6 | 0 | 1 | 5.00 | 0.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | ▁▁▇▁▁ |
| time_span | 13 | 0 | 1 | 4.50 | 0.52 | 4.00 | 4.00 | 4.50 | 5.00 | 5.00 | ▇▁▁▁▇ |
| time_span | 26 | 0 | 1 | 13.50 | 9.81 | 4.00 | 4.00 | 13.50 | 23.00 | 23.00 | ▇▁▁▁▇ |
| Total_Ncells_samp | 5 | 0 | 1 | 126.43 | 220.75 | 1.00 | 3.00 | 18.00 | 145.50 | 628.00 | ▇▂▁▁▂ |
| Total_Ncells_samp | 6 | 0 | 1 | 908.50 | 1843.88 | 2.00 | 11.00 | 74.50 | 637.00 | 5319.00 | ▇▁▁▁▁ |
| Total_Ncells_samp | 13 | 0 | 1 | 219.38 | 395.95 | 1.00 | 6.00 | 30.00 | 187.00 | 1184.00 | ▇▁▁▁▁ |
| Total_Ncells_samp | 26 | 0 | 1 | 501.75 | 942.74 | 1.00 | 8.75 | 68.00 | 375.75 | 2821.00 | ▇▁▁▁▁ |
| Total_area_samp | 5 | 0 | 1 | 78775.51 | 202.47 | 78308.81 | 78781.93 | 78873.98 | 78873.98 | 78873.98 | ▂▁▁▂▇ |
| Total_area_samp | 6 | 0 | 1 | 127102.97 | 91.70 | 126878.54 | 127118.11 | 127134.29 | 127146.80 | 127146.80 | ▁▁▁▁▇ |
| Total_area_samp | 13 | 0 | 1 | 371152.97 | 1389.14 | 367713.86 | 371247.65 | 371764.49 | 371879.89 | 371982.46 | ▁▁▁▁▇ |
| Total_area_samp | 26 | 0 | 1 | 7994622.48 | 2117229.59 | 5909157.77 | 6252209.86 | 7000134.17 | 10045358.51 | 11171485.87 | ▇▂▁▂▃ |
write.csv(atlas_areas,
paste0(vars$Documentation, "atlas_areas_METADATA.csv"))Custom function to calculate jaccard across sites
jaccard <- function(set1, set2) {
a <- length(base::intersect(set1, set2))
b <- length(setdiff(set2, set1))
c <- length(setdiff(set1, set2))
n_union <- length(base::union(set1,set2))
return(list(jaccard_index = a / (a + b + c), a = a, b = b, c = c, n_union = n_union))
}Calculate jaccard for all datasets
Jaccard_df <- pres_dat_final %>%
filter(scalingID == 1) %>%
group_by(datasetID, verbatimIdentification) %>%
mutate(
jaccard_results = list(jaccard(
filter(pick(everything()), samplingPeriodID == 1)$siteID,
filter(pick(everything()), samplingPeriodID == 2)$siteID
))
) %>%
unnest_wider(jaccard_results) %>%
group_by(verbatimIdentification, datasetID) %>%
mutate(d = Total_Ncells_samp - n_union) %>%
rename(Jaccard_sim = jaccard_index) %>%
mutate(Jaccard_dissim = 1 - Jaccard_sim) %>%
ungroup() %>%
distinct(datasetID, verbatimIdentification, Jaccard_dissim, Jaccard_sim, a, b, c, d, Total_Ncells_samp)Save jaccard results
write.csv(Jaccard_df, here::here("Data/output/results/A_Jaccard_df.csv"))Merge:
pres_dat_final_v2 <- pres_dat_final %>%
left_join(Jaccard_df)
head(pres_dat_final_v2) datasetID scalingID siteID footprintSRS verbatimFootprintSRS area
1 5 1 2 epsg:4326 epsg:5514 130.0204
2 5 1 2 epsg:4326 epsg:5514 130.0204
3 5 1 2 epsg:4326 epsg:5514 130.0204
4 5 1 2 epsg:4326 epsg:5514 130.0204
5 5 1 2 epsg:4326 epsg:5514 130.0204
6 5 1 2 epsg:4326 epsg:5514 130.0204
croppedArea areaUnit croppedAreaPercent centroidDecimalLongitude
1 41.36332 km2 0.3181295 14.41488
2 41.36332 km2 0.3181295 14.41488
3 41.36332 km2 0.3181295 14.41488
4 41.36332 km2 0.3181295 14.41488
5 41.36332 km2 0.3181295 14.41488
6 41.36332 km2 0.3181295 14.41488
centroidDecimalLatitude croppedDecimalLongitude croppedDecimalLatitude
1 51.04962 14.4096 51.01669
2 51.04962 14.4096 51.01669
3 51.04962 14.4096 51.01669
4 51.04962 14.4096 51.01669
5 51.04962 14.4096 51.01669
6 51.04962 14.4096 51.01669
northSouthLength eastWestLength maxLength croppedNorthSouthLength
1 11.12541 11.7 16.13557 5.178504
2 11.12541 11.7 16.13557 5.178504
3 11.12541 11.7 16.13557 5.178504
4 11.12541 11.7 16.13557 5.178504
5 11.12541 11.7 16.13557 5.178504
6 11.12541 11.7 16.13557 5.178504
croppedEastWestLength croppedMaxLength lengthUnit startYear endYear
1 11.69983 12.78905 km 2001 2003
2 11.69983 12.78905 km 2001 2003
3 11.69983 12.78905 km 2001 2003
4 11.69983 12.78905 km 1985 1989
5 11.69983 12.78905 km 1985 1989
6 11.69983 12.78905 km 1985 1989
verbatimIdentification scientificName samplingPeriodID time_span
1 Nucifraga caryocatactes Nucifraga caryocatactes 2 2
2 Anas platyrhynchos Anas platyrhynchos 2 2
3 Aythya fuligula Aythya fuligula 2 2
4 Ciconia nigra Ciconia nigra 1 4
5 Carduelis carduelis Carduelis carduelis 1 4
6 Passer montanus Passer montanus 1 4
cell_sampling_repeats cells_keep sp_remove_expert introduced
1 2 1 0 0
2 2 1 0 0
3 2 1 0 0
4 2 1 0 0
5 2 1 0 0
6 2 1 0 0
sp_sampling_repeats species_keep Total_Ncells Total_Ncells_samp
1 2 1 671 628
2 2 1 671 628
3 2 1 671 628
4 2 1 671 628
5 2 1 671 628
6 2 1 671 628
Total_area_samp Jaccard_dissim Jaccard_sim a b c d
1 78308.81 0.39066339 0.6093366 248 91 68 221
2 78308.81 0.05806452 0.9419355 584 25 11 8
3 78308.81 0.26171875 0.7382812 378 71 63 116
4 78308.81 0.36900369 0.6309963 342 152 48 86
5 78308.81 0.02388535 0.9761146 613 10 5 0
6 78308.81 0.06310680 0.9368932 579 13 26 10
occ_data_final <-
pres_dat_final_v2 %>%
ungroup() %>%
distinct(datasetID, samplingPeriodID, scalingID,
verbatimIdentification, siteID, time_span,
.keep_all = TRUE) %>%
group_by(datasetID, samplingPeriodID, scalingID, verbatimIdentification) %>%
dplyr::summarise(
mean_area = mean(croppedArea, na.rm = TRUE),
AOO = sum(croppedArea, na.rm = TRUE),
occ_Ncells = n_distinct(siteID)) %>%
left_join(pres_dat_final_v2 %>%
mutate(scalingID = as.factor(as.character(scalingID)))) %>%
dplyr::mutate(
rel_AOO = AOO / Total_area_samp,
rel_occ_Ncells = occ_Ncells / Total_Ncells_samp) %>% # Prevalence
distinct()Save AOO data to .rds
saveRDS(occ_data_final, here::here("Data", "output", "temp", "A_04_occ_data_final.rds"))species_data <-
occ_data_final %>%
dplyr::select(
-siteID, -cell_sampling_repeats, -croppedArea, -sp_sampling_repeats
) %>%
distinct(datasetID, samplingPeriodID, scalingID, verbatimIdentification,
.keep_all = TRUE)Custom function to calculate OAR and Fractal dimension
calculate_OAR <- function(species_data) {
datasets <-
unique(species_data$datasetID)
sampling_periods <-
unique(species_data$samplingPeriodID)
sp_dta <-
species_data %>%
select(datasetID, samplingPeriodID, scalingID,
verbatimIdentification,
rel_AOO, rel_occ_Ncells, mean_area, AOO)
# Expand the grid for all combinations of datasetID and samplingPeriodID
species_data_new <-
expand_grid(
datasetID = datasets,
samplingPeriodID = sampling_periods) %>%
inner_join(sp_dta, by = c("datasetID", "samplingPeriodID")) %>%
group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
#Get available scales where relative occupancy is not saturated (< 1)
summarise(
available_scales = n(),
mean_relAOO = mean(rel_AOO, na.rm = TRUE),
exclude_sp_OAR = if_else(available_scales < 2, 1, 0),
.groups = "drop"
) %>%
# remove those where the range is saturated at the smallest resolution
mutate(
exclude_sp_OAR = if_else(available_scales == 0, 1, exclude_sp_OAR),
mean_relAOO = if_else(available_scales == 0, 1, mean_relAOO)
) %>%
full_join(sp_dta, by = c("datasetID", "samplingPeriodID", "verbatimIdentification")) %>%
filter(
exclude_sp_OAR == 0,
rel_occ_Ncells < 1
) %>%
distinct() %>%
filter_at(vars(scalingID, AOO, mean_area), any_vars(!is.na(.))) %>%
ungroup()
# Fit models
species_data_new_v2 <-
species_data_new %>%
group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
nest(data = c(scalingID, AOO, mean_area, rel_AOO, rel_occ_Ncells)) %>%
mutate(
coefficients = map(
data,
~ .x %>%
filter(
!is.na(AOO) & AOO > 0,
!is.na(mean_area) & mean_area > 0
) %>%
lm(log(AOO) ~ log(mean_area), data = .) %>%
coef() %>%
{
tibble(
m_AOO_a = .[2],
b_AOO_a = .[1],
D_AOO_a = -2 * .[2] + 2
)
}
)
) %>%
unnest(coefficients) %>%
ungroup() %>%
#Final cleanup of results
select(datasetID, samplingPeriodID, verbatimIdentification, m_AOO_a, b_AOO_a, D_AOO_a) %>%
distinct() %>%
# Merge with the original dataset
full_join(species_data) %>%
filter(scalingID == 1) %>%
select(
datasetID, verbatimIdentification, samplingPeriodID, Total_area_samp,
Total_Ncells, Total_Ncells_samp, AOO, occ_Ncells, rel_occ_Ncells,
rel_AOO, Jaccard_dissim, m_AOO_a, b_AOO_a, D_AOO_a
)
return(species_data_new_v2)
}Apply OAR-function:
species_data_new <- calculate_OAR(species_data) %>%
distinct(datasetID, samplingPeriodID, verbatimIdentification, .keep_all = TRUE) %>%
mutate(
D_AOO_a = case_when(is.na(D_AOO_a) ~ 2,
.default = D_AOO_a)) %>%
left_join(species_data %>% filter(scalingID == 1)) %>%
# reduce columns
select(
datasetID, verbatimIdentification, samplingPeriodID,
Total_area_samp, Total_Ncells, Total_Ncells_samp,
AOO, occ_Ncells, rel_occ_Ncells, rel_AOO,
Jaccard_dissim, a,b,c,d, D_AOO_a, time_span
) %>%
distinct(datasetID, samplingPeriodID, verbatimIdentification,
.keep_all = TRUE)time_periods <- vars$time_periodsCustom function to transform data to wide-format
transform_to_wide <- function(species_data_new, time_periods = c(1, 2)) {
wide_dfs <- list()
#Create a list to store wide data for each time period
for (i in seq_along(time_periods)) {
wide_dfs[[i]] <- species_data_new %>%
distinct(datasetID, samplingPeriodID, verbatimIdentification, AOO) %>%
group_by(datasetID, samplingPeriodID, verbatimIdentification) %>%
filter(samplingPeriodID == time_periods[i]) %>%
setNames(paste0("samplingPeriodID", i, "_", names(.))) %>%
ungroup() %>%
select(-c(paste0("samplingPeriodID", i, "_samplingPeriodID"))) %>%
dplyr::rename(
verbatimIdentification = paste0("samplingPeriodID", i, "_verbatimIdentification"),
datasetID = paste0("samplingPeriodID", i, "_datasetID")
)
}
# Merge the wide data frames sequentially
sp_dat_wide <- reduce(wide_dfs, full_join, by = c("verbatimIdentification", "datasetID"))
cat("NA counts in wide data after processing:\n")
print(colSums(is.na(sp_dat_wide)))
cat("Preview of wide data:\n")
print(head(sp_dat_wide))
return(sp_dat_wide)
}time_between_samples <- data.frame(
datasetID = c(5,6,13,26),
startYear1 = c(1985, 1980, 1974, 1972),
endYear2 = c(2003, 2005, 2002, 2017)
) %>% mutate(
n_years = endYear2-startYear1+1)Apply function:
sp_dat_wide <-
transform_to_wide(species_data_new, time_periods) %>%
na.omit() # drop species lost or gained completelyNA counts in wide data after processing:
datasetID verbatimIdentification samplingPeriodID1_AOO
0 0 0
samplingPeriodID2_AOO
0
Preview of wide data:
# A tibble: 6 × 4
datasetID verbatimIdentification samplingPeriodID1_AOO samplingPeriodID2_AOO
<int> <chr> <dbl> <dbl>
1 5 Accipiter gentilis 72264. 70596.
2 5 Accipiter nisus 71868. 74755.
3 5 Acrocephalus arundinace… 30767. 32280.
4 5 Acrocephalus palustris 67931. 72820.
5 5 Acrocephalus schoenobae… 35662. 43319.
6 5 Acrocephalus scirpaceus 55801. 57993.
log[(St2 /St1 )/(t2 − t1 + 1)]
logRatio <-
sp_dat_wide %>%
left_join(time_between_samples) %>%
group_by(datasetID, verbatimIdentification) %>%
mutate(
log_R2_1 = log(samplingPeriodID2_AOO / samplingPeriodID1_AOO)) %>%
mutate(
log_R2_1_per_year = log_R2_1/ n_years) %>%
select(-samplingPeriodID1_AOO, -samplingPeriodID2_AOO)ggplot(logRatio, aes(x = log_R2_1_per_year, fill = as.factor(datasetID))) +
geom_histogram() +
facet_wrap(~as.factor(datasetID))+
labs(title = "Histogram of log ratio of AOO per year",
x = "log(St2 /St1 )/(t2 − t1 + 1)",
y = "Frequency")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(logRatio, aes(x = log_R2_1, fill = as.factor(datasetID))) +
geom_histogram() +
facet_wrap(~as.factor(datasetID))+
labs(title = "Histogram of log ratio of AOO per time period",
x = "log(St2 /St1 )",
y = "Frequency")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
big_table <-
full_join(species_data_new, logRatio) %>%
distinct(datasetID, samplingPeriodID, verbatimIdentification,
.keep_all = T) %>%
mutate_if(is.numeric, round, 3)Joining with `by = join_by(datasetID, verbatimIdentification)`
saveRDS(species_data_new, here("Data/output/temp/A_04_species_data.rds"))
saveRDS(big_table, here("Data/output/A_predictors/Big_table.rds"))