Demonstration: New York

Data preparation (by Gabriel Ortega Solís)

Since most data we used here come with restricted rights of sharing, we set up a workflow for the New York Breeding Bird atlas, which is freely available from the website: https://www.dec.ny.gov/animals/7312.html

We processed the original New York BBA data into a common format with the other atlases that we used. The structure is therefore a bit different from the original data. These steps are not necessary to repeat the analysis but it shows how the data was formated after downloading it. Multiple steps in between were done manually by hand to standardize all BBAs to the same format - these cannot be reproduced with the code chunk below. The code below is just for documentation. We provide .rds files with the prepared (standardized) data used for our study below.

Code

# libraries for preparation only:

pacman::p_load(
  sf, terra, data.table, tidyverse, tidyterra, tidytable, knitr,
  tictoc, RPostgres, DBI, dbplyr, parallel,
  geodata
)


# Variables
seldata <- "data/4processedAtlases/Birds_Atlas_New_York"
datasetID <- 6
licenseID <- 1
verbatimFootprintSRS <- "epsg:26918"

# Data processing
dataset <- readRDS(paste0(seldata, "_beast_data.rds")) %>% as.data.table()

spnamesinfo <- read.csv(paste0(seldata, "_data_INFO_spnames.csv")) %>%
  as.data.table() %>%
  mutate(cellID = str_split(cellID, pattern = ",")) %>%
  unnest(cellID) %>%
  select(cellID, name_in_data, original_verbatim_name) %>%
  mutate(cellID = cellID) %>%
  unique()

grid <- st_layers(paste0(seldata, "_grid.gpkg"))$name %>%
  sapply(., USE.NAMES = T, simplify = F, function(x) {
    st_read(paste0(seldata, "_grid.gpkg"), layer = x)
  })

corrs <- readRDS(paste0(seldata, "_cells_corr.rds")) %>%
  as.data.table() %>%
  pivot_longer(-cellID, names_to = "cell_grouping", values_to = "cell_label") %>%
  mutate(cell_grouping = as.numeric(cell_grouping))

full_data <- left_join(dataset, corrs, by = c("cell_grouping", "cell_label"), relationship = "many-to-many") %>%
  left_join(., filter(spnamesinfo, !is.na(cellID)), by = c("cellID" = "cellID", "verbatim_name" = "name_in_data")) %>%
  mutate(verbatim_name = ifelse(is.na(original_verbatim_name), verbatim_name, original_verbatim_name)) %>%
  select(-original_verbatim_name) %>%
  unique() %>%
  mutate(
    datasetID = datasetID,
    licenseID = licenseID,
    verbatimIdentificationID = dense_rank(verbatim_name),
    samplingPeriod = dense_rank(start_year),
    effort = ifelse(effort == 0, NA, effort),
    samp_effort_type = ifelse(is.na(samp_effort_type), NA, 4)
  ) %>%
  unique()

gc()


data_table <- data.table(
  datasetID = datasetID,
  datasetName = "New York State Breeding Bird Atlas",
  datasetPublisher = "New York State Department of Environmental Conservation, New York Natural Heritage Program",
  datasetPublisherContact = "nybbba3@gmail.com | julie.hart@dec.ny.gov",
  licenseID = licenseID,
  rightsHolder = "New York State Department of Environmental Conservation",
  bibliographicCitation = "BBA1 data: New York State Breeding Bird Atlas [Internet]. 1980 - 1985. Release 1.0. Albany (New York): New York State Department of Environmental Conservation. updated 2007 Jun 6; cited 2024 Apr 09. Available from: https://www.dec.ny.gov/animals/7312.html. | BBA2 data: New York State Breeding Bird Atlas 2000 [Internet]. 2000 - 2005. Release 1.0. Albany (New York): New York State Department of Environmental Conservation. updated 2007 Jun 11; cited 2024 Apr 09. Available from: https://www.dec.ny.gov/animals/7312.html.",
  citationIdentifier = "",
  provider = "New York State Department of Environmental Conservation",
  shareable = "NO",
  coauthorshipRequired = "NO",
  coauthors = "",
  coauthorshipSuggested = "Carmen Soria - carmendianasoria@gmail.com | Kateřina Tschernosterová - tschernosterova@fzp.czu.cz | Friederike Wölke - friederike.woelke@gmail.com | Gabriel Ortega - g.ortega.solis@gmail.com",
  isSamplingEffortReported = "YES",
  isOccurrenceProbabilityAvailable = "NO"
)


data_table <- data.table(
  siteID = full_data$cell_label,
  scalingID = full_data$cell_grouping,
  datasetID = full_data$datasetID,
  area = full_data$area,
  croppedArea = full_data$area_cropped,
  areaUnit = "km2",
  maxLength = NA,
  northSouthLength = NA,
  eastWestLength = NA,
  lengthUnit = "km",
  centroidDecimalLongitude = full_data$cell_long,
  centroidDecimalLatitude = full_data$cell_lat,
  samplingRepetitions = full_data$repeated
) %>% unique()


data_table <- grid %>%
  dplyr::bind_rows() %>%
  ungroup() %>%
  rename(
    siteID = cell_label,
    scalingID = cell_grouping,
    geometry = geom
  ) %>%
  mutate(
    siteID = as.integer(siteID),
    scalingID = as.integer(scalingID),
    datasetID = as.integer(datasetID),
    footprintSRS = "epsg:4326",
    verbatimFootprintSRS = verbatimFootprintSRS
  ) %>%
  select(
    ., siteID, scalingID, datasetID,
    footprintSRS, verbatimFootprintSRS, geometry
  )

data_table$geometry <- st_cast(data_table$geometry, "MULTIPOLYGON")


data_table <- filter(full_data, cell_grouping == 1) %>%
  mutate(
    verbatimIdentificationID = verbatimIdentificationID,
    verbatimSiteID = cellID,
    datasetID = datasetID,
    samplingPeriodID = samplingPeriod,
    recordFilter = NA
  ) %>%
  select(
    verbatimIdentificationID,
    verbatimSiteID,
    datasetID,
    samplingPeriodID,
    recordFilter
  ) %>%
  unique()

Getting started

Code

suppressPackageStartupMessages({
library(dplyr)
library(sf)
library(here)
#install.packages("skimr", repos = "https://cloud.r-project.org/")
library(skimr)
})

Warning: package 'dplyr' was built under R version 4.4.2

Warning: package 'sf' was built under R version 4.4.3

Warning: package 'skimr' was built under R version 4.4.2

Code

dta <- 
  readRDS(here::here("Data/input/data.rds")) %>% 
  filter(datasetID == 6)
grids <- 
  readRDS(here::here("Data/input/grid.rds")) %>% 
  filter(datasetID == 6)
data_sf <- 
  readRDS(here::here("Data/input/data_sf.rds")) %>% 
  filter(datasetID == 6)


if (!dir.exists(here::here("Demo_NewYork/Data/input"))) {
  dir.create(here::here("Demo_NewYork/Data/input"))
  }

saveRDS(dta, here::here("Demo_NewYork/Data/input/raw_data_ny.rds"))


saveRDS(grids, here::here("Demo_NewYork/Data/input/grid_ny.rds"))
saveRDS(data_sf, here::here("Demo_NewYork/Data/input/sf_data_ny.rds"))

Data filtering

Code

# 1. cells sampled twice
cells <-
  data_sf %>%
  st_drop_geometry() %>%
  group_by(datasetID, scalingID, siteID, samplingPeriodID) %>%
  mutate(cell_sampled = if_else(is.na(verbatimIdentification), 0, 1)) %>%
  ungroup() %>%
  distinct(datasetID, scalingID, siteID, samplingPeriodID, cell_sampled, croppedArea, time_span) %>%
  group_by(datasetID, scalingID, siteID, croppedArea) %>%
  dplyr::summarise(
    cell_sampling_repeats = n_distinct(samplingPeriodID, na.rm = TRUE),
    .groups = "keep") %>%
  mutate(cells_keep =
           case_when(
             cell_sampling_repeats == 2 & !is.na(croppedArea) ~ 1,
             cell_sampling_repeats %in% c(0,1) | is.na(croppedArea) ~ 0,
            .default = NA)) %>%
  unique()


# Checks:
colSums(is.na(cells)) # no NAs

            datasetID             scalingID                siteID 
                    0                     0                     0 
          croppedArea cell_sampling_repeats            cells_keep 
                    1                     0                     0

Code

glimpse(cells)

Rows: 7,289
Columns: 6
Groups: datasetID, scalingID, siteID, croppedArea [7,289]
$ datasetID             <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
$ scalingID             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ siteID                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ croppedArea           <dbl> 20.623832, 24.927742, 20.822468, 24.824370, 17.9…
$ cell_sampling_repeats <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ cells_keep            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …

Code

species_df <-
  data_sf %>%
  st_drop_geometry() %>%
  filter(scalingID == 1) %>%
  distinct(datasetID, verbatimIdentification, scientificName, samplingPeriodID, time_span) %>%
  filter(!is.na(verbatimIdentification)) # NAs from cells that were not sampled twice (here: only site 5316)

glimpse(species_df)

Rows: 495
Columns: 5
$ datasetID              <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
$ verbatimIdentification <chr> "Picoides pubescens", "Empidonax virescens", "G…
$ scientificName         <chr> "Dryobates pubescens", "Empidonax virescens", "…
$ samplingPeriodID       <dbl> 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2,…
$ time_span              <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…

Introduced species

This part requires access to the BirdLife International range maps available here: https://datazone.birdlife.org/contact-us/request-our-data

Due to licensing restrictions we cannot share this part, but instead we share a list of species that are introduced in New York.

Code

# temp <-  
#   readRDS(here::here("Data", "output", "1_data_sf.rds")) %>% 
#   st_drop_geometry() %>%
#   filter(datasetID == 6)
# 
# # write to file
# temp %>% 
#   distinct(verbatimIdentification, introduced) %>% 
#   write.csv2(here::here("Demo_NewYork/Data/input/introduced_species_ny.csv"))


# read back in
introduced <- 
  read.csv2(here::here("Demo_NewYork/Data/input/introduced_species_ny.csv"))

# cheks: 7 introduced species
table(introduced$introduced)


  0   1 
249   7

Code

# here is the list of species removed:
introduced %>% 
  filter(introduced == 1) %>% 
  pull(verbatimIdentification)

[1] "Passer domesticus"   "Sturnus vulgaris"    "Phasianus colchicus"
[4] "Cygnus olor"         "Perdix perdix"       "Myiopsitta monachus"
[7] "Nandayus nenday"

Code

species_df2 <- 
  species_df %>% 
  left_join(introduced[2:3])

Joining with `by = join_by(verbatimIdentification)`

Code

glimpse(species_df2)

Rows: 495
Columns: 6
$ datasetID              <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
$ verbatimIdentification <chr> "Picoides pubescens", "Empidonax virescens", "G…
$ scientificName         <chr> "Dryobates pubescens", "Empidonax virescens", "…
$ samplingPeriodID       <dbl> 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2,…
$ time_span              <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
$ introduced             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

Species lost or gained completely

Code

common_sp <- 
  data_sf %>%
  st_drop_geometry() %>%
  filter(scalingID == 1) %>%
  left_join(species_df2) %>%
  left_join(cells %>% filter(scalingID == 1)) %>%
  na.omit() %>%
  filter(cells_keep == 1 & introduced == 0) %>%
  group_by(verbatimIdentification) %>%
  dplyr::summarise(sp_sampling_repeats = n_distinct(samplingPeriodID),
                   .groups = "drop")

Joining with `by = join_by(datasetID, verbatimIdentification, scientificName,
samplingPeriodID, time_span)`
Joining with `by = join_by(datasetID, scalingID, siteID, croppedArea)`

Code

# checks: removes 16 species that were only sampled once
table(common_sp$sp_sampling_repeats)


  1   2 
 16 233

Code

# here is the list of species removed:
common_sp %>%
  filter(sp_sampling_repeats == 1) %>% 
  pull(verbatimIdentification)

 [1] "Ammodramus nelsoni"               "Aythya marila"                   
 [3] "Bucephala albeola"                "Chlidonias leucopterus"          
 [5] "Coragyps atratus"                 "Cygnus buccinator"               
 [7] "Euphagus cyanocephalus"           "Falco columbarius"               
 [9] "Grus canadensis"                  "Pelecanus erythrorhynchos"       
[11] "Phalaropus tricolor"              "Plegadis chihi"                  
[13] "Somateria mollissima"             "Streptopelia decaocto"           
[15] "Tyrannus verticalis"              "Vermivora pinus x V. chrysoptera"

Decide which species to keep

Code

species_df3 <- 
  species_df2 %>%
  left_join(common_sp) %>%
    mutate(
    species_keep =
      case_when(
        sp_sampling_repeats == 2 & introduced == 0 ~ 1,
        TRUE ~ 0))

Joining with `by = join_by(verbatimIdentification)`

Code

# checks: we remove 29 species and keep 466 total in both sampling periods.
table(species_df3$species_keep)


  0   1 
 29 466

Apply data filters

Code

data_sf2 <-
  data_sf %>%
  left_join(cells) %>%
  left_join(species_df3)

Joining with `by = join_by(datasetID, scalingID, siteID, croppedArea)`
Joining with `by = join_by(datasetID, verbatimIdentification, scientificName,
samplingPeriodID, time_span)`

Code

# keep only species and cells sampled twice
data_filt <-
  data_sf2 %>%
  st_drop_geometry() %>%
  filter(!is.na(verbatimIdentification) & species_keep == 1 & cells_keep == 1)

# Checks:
data_filt %>%
  group_by(samplingPeriodID) %>%
  skimr::skim()

Data summary
Name	Piped data
Number of rows	1120865
Number of columns	31
_______________________
Column type frequency:
character	6
numeric	24
________________________
Group variables	samplingPeriodID

Variable type: character

skim_variable	samplingPeriodID	complete_rate	min	max	n_unique
footprintSRS	1	1	9	9	1
footprintSRS	2	1	9	9	1
verbatimFootprintSRS	1	1	10	10	1
verbatimFootprintSRS	2	1	10	10	1
areaUnit	1	1	3	3	1
areaUnit	2	1	3	3	1
lengthUnit	1	1	2	2	1
lengthUnit	2	1	2	2	1
verbatimIdentification	1	1	9	32	233
verbatimIdentification	2	1	9	32	233
scientificName	1	1	9	39	233
scientificName	2	1	9	39	233

Variable type: numeric

skim_variable	samplingPeriodID	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
datasetID	1	1	6.00	0.00	6.00	6.00	6.00	6.00	6.00	▁▁▇▁▁
datasetID	2	1	6.00	0.00	6.00	6.00	6.00	6.00	6.00	▁▁▇▁▁
scalingID	1	1	2.13	4.91	1.00	1.00	1.00	2.00	128.00	▇▁▁▁▁
scalingID	2	1	2.09	4.76	1.00	1.00	1.00	2.00	128.00	▇▁▁▁▁
siteID	1	1	2150.86	1484.84	1.00	872.00	1921.00	3225.00	5335.00	▇▇▆▃▃
siteID	2	1	2128.88	1473.71	1.00	859.00	1905.00	3175.00	5335.00	▇▇▆▃▃
area	1	1	291.59	3264.13	24.93	25.01	25.02	100.05	133087.29	▇▁▁▁▁
area	2	1	282.42	3188.68	24.93	25.01	25.02	100.05	133087.29	▇▁▁▁▁
croppedArea	1	1	278.72	3120.13	0.01	25.01	25.02	100.05	127077.41	▇▁▁▁▁
croppedArea	2	1	270.15	3048.35	0.01	25.01	25.02	100.04	127077.41	▇▁▁▁▁
croppedAreaPercent	1	1	0.96	0.16	0.00	1.00	1.00	1.00	1.00	▁▁▁▁▇
croppedAreaPercent	2	1	0.96	0.15	0.00	1.00	1.00	1.00	1.00	▁▁▁▁▇
centroidDecimalLongitude	1	1	-75.49	1.71	-79.76	-76.63	-75.12	-74.12	-71.87	▂▃▆▇▁
centroidDecimalLongitude	2	1	-75.55	1.72	-79.76	-76.69	-75.21	-74.15	-71.87	▂▃▆▇▁
centroidDecimalLatitude	1	1	42.88	0.99	40.49	42.22	42.80	43.51	45.03	▂▅▇▅▃
centroidDecimalLatitude	2	1	42.89	0.97	40.49	42.24	42.81	43.51	45.03	▂▅▇▅▃
croppedDecimalLongitude	1	1	-75.49	1.71	-79.74	-76.63	-75.12	-74.12	-71.88	▂▃▅▇▁
croppedDecimalLongitude	2	1	-75.55	1.72	-79.74	-76.69	-75.21	-74.15	-71.88	▂▃▆▇▁
croppedDecimalLatitude	1	1	42.88	0.99	40.51	42.23	42.80	43.51	45.01	▂▅▇▅▃
croppedDecimalLatitude	2	1	42.89	0.97	40.51	42.24	42.81	43.51	45.01	▂▅▇▅▃
northSouthLength	1	1	9.78	17.62	5.00	5.05	5.12	10.11	509.30	▇▁▁▁▁
northSouthLength	2	1	9.67	17.28	5.00	5.05	5.12	10.11	509.30	▇▁▁▁▁
eastWestLength	1	1	10.03	20.55	5.01	5.05	5.12	10.12	654.17	▇▁▁▁▁
eastWestLength	2	1	9.90	20.15	5.01	5.05	5.12	10.12	654.17	▇▁▁▁▁
maxLength	1	1	13.77	24.32	7.06	7.07	7.07	14.15	659.64	▇▁▁▁▁
maxLength	2	1	13.60	23.88	7.06	7.07	7.07	14.15	659.64	▇▁▁▁▁
croppedNorthSouthLength	1	1	9.63	17.40	0.18	5.05	5.10	10.10	502.08	▇▁▁▁▁
croppedNorthSouthLength	2	1	9.52	17.07	0.18	5.05	5.10	10.09	502.08	▇▁▁▁▁
croppedEastWestLength	1	1	9.92	20.45	0.06	5.05	5.11	10.11	650.95	▇▁▁▁▁
croppedEastWestLength	2	1	9.78	20.05	0.06	5.05	5.11	10.10	650.95	▇▁▁▁▁
croppedMaxLength	1	1	13.61	24.16	0.32	7.07	7.07	14.15	655.19	▇▁▁▁▁
croppedMaxLength	2	1	13.44	23.72	0.32	7.07	7.07	14.15	655.19	▇▁▁▁▁
startYear	1	1	1980.00	0.00	1980.00	1980.00	1980.00	1980.00	1980.00	▁▁▇▁▁
startYear	2	1	2000.00	0.00	2000.00	2000.00	2000.00	2000.00	2000.00	▁▁▇▁▁
endYear	1	1	1985.00	0.00	1985.00	1985.00	1985.00	1985.00	1985.00	▁▁▇▁▁
endYear	2	1	2005.00	0.00	2005.00	2005.00	2005.00	2005.00	2005.00	▁▁▇▁▁
time_span	1	1	5.00	0.00	5.00	5.00	5.00	5.00	5.00	▁▁▇▁▁
time_span	2	1	5.00	0.00	5.00	5.00	5.00	5.00	5.00	▁▁▇▁▁
cell_sampling_repeats	1	1	2.00	0.00	2.00	2.00	2.00	2.00	2.00	▁▁▇▁▁
cell_sampling_repeats	2	1	2.00	0.00	2.00	2.00	2.00	2.00	2.00	▁▁▇▁▁
cells_keep	1	1	1.00	0.00	1.00	1.00	1.00	1.00	1.00	▁▁▇▁▁
cells_keep	2	1	1.00	0.00	1.00	1.00	1.00	1.00	1.00	▁▁▇▁▁
introduced	1	1	0.00	0.00	0.00	0.00	0.00	0.00	0.00	▁▁▇▁▁
introduced	2	1	0.00	0.00	0.00	0.00	0.00	0.00	0.00	▁▁▇▁▁
sp_sampling_repeats	1	1	2.00	0.00	2.00	2.00	2.00	2.00	2.00	▁▁▇▁▁
sp_sampling_repeats	2	1	2.00	0.00	2.00	2.00	2.00	2.00	2.00	▁▁▇▁▁
species_keep	1	1	1.00	0.00	1.00	1.00	1.00	1.00	1.00	▁▁▇▁▁
species_keep	2	1	1.00	0.00	1.00	1.00	1.00	1.00	1.00	▁▁▇▁▁

Code

data_filt_reduced <- 
  data_filt %>% 
  select(scalingID, siteID, samplingPeriodID, 
         verbatimIdentification, scientificName, 
         verbatimFootprintSRS, footprintSRS, croppedArea, areaUnit, 
         centroidDecimalLongitude, centroidDecimalLatitude, 
         startYear, endYear)

# checks: looks good.
glimpse(data_filt_reduced)

Rows: 1,120,865
Columns: 13
$ scalingID                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ siteID                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ samplingPeriodID         <dbl> 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, …
$ verbatimIdentification   <chr> "Picoides pubescens", "Empidonax virescens", …
$ scientificName           <chr> "Dryobates pubescens", "Empidonax virescens",…
$ verbatimFootprintSRS     <chr> "epsg:26918", "epsg:26918", "epsg:26918", "ep…
$ footprintSRS             <chr> "epsg:4326", "epsg:4326", "epsg:4326", "epsg:…
$ croppedArea              <dbl> 20.62383, 20.62383, 20.62383, 20.62383, 20.62…
$ areaUnit                 <chr> "km2", "km2", "km2", "km2", "km2", "km2", "km…
$ centroidDecimalLongitude <dbl> -79.74312, -79.74312, -79.74312, -79.74312, -…
$ centroidDecimalLatitude  <dbl> 42.06346, 42.06346, 42.06346, 42.06346, 42.06…
$ startYear                <int> 2000, 2000, 1980, 2000, 1980, 2000, 2000, 198…
$ endYear                  <int> 2005, 2005, 1985, 2005, 1985, 2005, 2005, 198…

Taxon matching

Code

tax_lookup <-
  read.csv(here::here("Data/input/Tax_lookup.csv"))[2:7] %>%
  filter(verbatimIdentification %in% data_filt_reduced$verbatimIdentification)

write.csv2(tax_lookup, 
           here::here("Demo_NewYork/Data/input/Tax_lookup_ny.csv"))

Write to file

Code

if (!dir.exists(here::here("Demo_NewYork/Data/output"))) {
  dir.create(here::here("Demo_NewYork/Data/output"))
}

saveRDS(data_sf2, 
        here::here("Demo_NewYork/Data/output/1_data_sf_ny.rds"))
saveRDS(data_filt_reduced, 
        here::here("Demo_NewYork/Data/output/1_data_filtered_ny.rds"))