---
title: "A_13_Data_viz"
format:
  html:
    self-contained: true
    embed-resources: true
    toc: true # optional: adds a table of contents
    theme: cosmo # optional: Bootstrap theme
    code-fold: show # optional: collapsible code blocks
    code-tools: true # optional: adds a code-tools menu (show/hide code, view source)
    toc-depth: 4
editor:
  markdown:
    wrap: 72
---
# A - 13: Data visualization
```{r}
#| echo: false
#| message: false
#| warning: false
#| results: "hide"
#| output: false
# Start from an empty workspace: drop every object visible in the current
# environment, then ask R to run a garbage collection.
# NOTE(review): only relevant when chunks are run interactively; confirm it is
# still wanted, since it also wipes whatever else lives in the session.
rm(list = ls())
gc()
```
## Libraries
```{r}
#| message: false
#| warning: false
#| results: "hide"
#| output: false
# Attach every package used in this report, silencing their startup banners.
suppressPackageStartupMessages({
  library(here)

  # Project configuration script; presumably defines `package_list`
  # (it is used on the next line and not created anywhere before it).
  source(here::here("Code/00_Configuration.R"))

  # Attach the shared project packages. library() stops with an error when a
  # package is missing, whereas require() would only return FALSE and let the
  # report fail later at the first use of the package. `character.only` is
  # spelled out in full instead of relying on partial argument matching.
  invisible(lapply(package_list, library, character.only = TRUE))

  # Packages needed specifically for this report
  library(corrr)
  library(fastDummies)
  library(inspectdf)
  library(caret)
  library(summarytools)
})
```
## Read data
```{r}
#| message: false
#| warning: false
#| error: false
# Load the merged predictor table and keep the rows whose samplingPeriodID is 1.
dta <-
  here::here("Data/output/1_all_predictors_merged.rds") %>%
  readRDS() %>%
  filter(samplingPeriodID == 1)
```
## Set variable vectors for hypotheses
```{r}
# Columns identifying a species
sp_id <- c("verbatimIdentification", "scientificName")

# H1: species traits
H1 <- c(
  "Mass",
  "GlobRangeSize_km2",
  "Migration",
  "Habitat_5",
  "Generalism",
  "Threatened",
  "pd"
)

# H2: range geometry
H2 <- c(
  "D_AOO_a",
  "mean_lnLac",
  "AOO",
  "joincount_delta",
  "circNorm",
  "minDist_toBorder_centr"
)

# H3: atlas (dataset) identity
H3 <- c("datasetID")

# All predictors, in hypothesis order
predictors <- c(H1, H2, H3)

# Response variables
responses <- c("Jaccard_dissim", "log_R2_1", "log_R2_1_per_year")
```
## Reduce data to model variables
```{r}
# Keep only the identifier, response and predictor columns (in that order)
# and drop any grouping structure carried over from the input table.
dta_new <- ungroup(
  select(dta, all_of(c(sp_id, responses, H3, H1, H2)))
)
```
# Correlations
We dummy-code the categorical variables so that they can be included in the correlation matrix.
```{r}
# Predictor-only table used for the correlation matrix: the atlas ID, the
# species identifiers and the responses are dropped, the categorical
# predictors become factors with an explicit level order, and rows with any
# missing value are removed.
dummy_reduced <-
  dta_new %>%
  # Reuse the vectors defined above instead of repeating the column names
  # (H3 = datasetID, sp_id = species identifiers, responses = outcomes).
  select(-all_of(c(H3, sp_id, responses))) %>%
  mutate(
    # as.character() + na_if() turns the literal level "NA" into a real NA;
    # factor() then fixes the level order of each variable.
    Migration = factor(
      na_if(as.character(Migration), "NA"),
      levels = c("1", "2", "3")
    ),
    Threatened = factor(
      na_if(as.character(Threatened), "NA"),
      levels = c("1", "0")
    ),
    Habitat_5 = factor(
      na_if(as.character(Habitat_5), "NA"),
      levels = c("closed", "freshwater", "open", "human", "marine")
    ),
    Generalism = factor(
      na_if(as.character(Generalism), "NA"),
      levels = c("1", "0")
    )
  ) %>%
  # Listwise deletion: keep complete rows only
  na.omit()
# Expand every factor into one 0/1 indicator column per level and drop the
# original factor columns.
# All levels are kept (remove_first_dummy = FALSE) so that each level gets its
# own row/column in the correlation matrix. Note that this does NOT avoid
# multicollinearity: the indicators of one factor sum to 1. Use
# remove_first_dummy = TRUE if these columns are ever fed into a model.
dta_dummies <- fastDummies::dummy_cols(
  dummy_reduced,
  remove_first_dummy = FALSE,
  remove_selected_columns = TRUE
)
# Pairwise correlations between all (dummy-coded) predictors, with the
# variables reordered by corrr::rearrange().
# The former `tidyr::replace_na(list(r = 1))` step was dropped: correlate()
# returns a wide table (one column per variable), which has no column named
# `r`, so that call never replaced anything. The diagonal therefore stays at
# correlate()'s default, exactly as before.
cor_matrix <-
  corrr::correlate(
    dta_dummies,
    use = "pairwise.complete.obs",
    quiet = TRUE
  ) %>%
  corrr::rearrange()
```
### a) tile-chart
```{r}
# Plot the correlation matrix computed above
corrr::rplot(cor_matrix)
```
# Inspect data
## a) Pairs-plot of all variables
```{r}
#| fig-width: 25
#| fig-height: 25
#| message: false
#| error: false
#| warning: false
# Pairs plot of all model variables (species identifiers excluded),
# coloured by datasetID.
GGally::ggpairs(
  data = as.data.frame(
    select(dta_new, -verbatimIdentification, -scientificName)
  ),
  mapping = ggplot2::aes(colour = datasetID),
  progress = FALSE
)
```
## b) Inspect variable distributions
i) variables & types
```{r}
#| fig-width: 8
#| fig-height: 8
# Overview of the column types in the model table
show_plot(inspect_types(dta_new))
```
ii) numeric variables (univariate plots)
```{r}
#| fig-width: 12
#| fig-height: 12
# Univariate summaries of the numeric columns
show_plot(inspect_num(dta_new))
```
iii) categorical variable imbalance (most frequent level)
```{r}
#| fig-width: 6
#| fig-height: 4
# Imbalance of the categorical columns (species identifiers excluded)
dta_new %>%
  select(-verbatimIdentification, -scientificName) %>%
  inspect_imb() %>%
  show_plot()
```
iv) categorical variables: frequency of levels
```{r}
# Level frequencies of the categorical columns (species identifiers excluded)
dta_new %>%
  select(-verbatimIdentification, -scientificName) %>%
  inspect_cat() %>%
  show_plot()
```
```{r}
# Row counts per Threatened level, stacked by datasetID
ggplot(data = dta_new, mapping = aes(x = Threatened, fill = datasetID)) +
  geom_bar()

# Row counts per Habitat_5 level, stacked by datasetID
ggplot(data = dta_new, mapping = aes(x = Habitat_5, fill = datasetID)) +
  geom_bar()
```
v) strongest correlations between variables
```{r}
#| fig-width: 12
#| fig-height: 18
# Correlations between columns as reported by inspectdf
show_plot(inspect_cor(dta_new))
```
## c) Cross-tabulations
Categorical Variables
```{r}
# summarytools::freq(dta_new)

# Habitat class against threat status
summarytools::ctable(x = dta_new$Habitat_5, y = dta_new$Threatened)

# Atlas (datasetID) against threat status
summarytools::ctable(x = dta_new$datasetID, y = dta_new$Threatened)

# Descriptive statistics for the whole model table
summarytools::descr(dta_new)
```
Numerical variables:
```{r}
# Per-column summary of the whole model table
summarytools::dfSummary(dta_new, plain.ascii = FALSE)

# Descriptive statistics computed separately for each datasetID.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
summarytools::stby(
  data = dta_new,
  INDICES = dta_new$datasetID,
  FUN = summarytools::descr,
  stats = "common",
  transpose = TRUE
)
```
# Feature plots
## all atlases together
```{r}
# Use lattice's white-background theme for the feature plots below
lattice::trellis.par.set(theme = lattice::col.whitebg(), warn = FALSE)
```
### Jaccard
H1 ~ Jaccard
```{r}
#| fig-width: 10
#| fig-height: 10
# Scatterplot matrix of the H1 (trait) predictors together with Jaccard
# dissimilarity, all atlases pooled and points grouped by datasetID.
# The grouping argument is passed on to lattice under its full name `groups`
# instead of relying on partial matching of `group`.
featurePlot(
  x = dta_new %>% select(datasetID, all_of(H1)),
  y = dta_new$Jaccard_dissim,
  groups = dta_new$datasetID,
  plot = "pairs",
  pch = 16,
  alpha = 0.3,
  cex = 0.5,
  xlab = "Scatterplot Matrix of traits (H1) - Jaccard 1",
  auto.key = list(columns = 4),
  par.settings = list(fontsize = list(text = 6))
)
```
H2 ~ Jaccard
```{r}
#| fig-width: 12
#| fig-height: 12
# Scatterplot matrix of the H2 (range geometry) predictors together with
# Jaccard dissimilarity, all atlases pooled and points grouped by datasetID.
# The grouping argument is passed on to lattice under its full name `groups`
# instead of relying on partial matching of `group`.
featurePlot(
  x = dta_new %>% select(datasetID, all_of(H2)),
  y = dta_new$Jaccard_dissim,
  groups = dta_new$datasetID,
  plot = "pairs",
  pch = 16,
  alpha = 0.3,
  cex = 0.5,
  xlab = "Scatterplot Matrix of range geometry (H2) - Jaccard 1",
  auto.key = list(columns = 4),
  par.settings = list(fontsize = list(text = 6))
)
```
### log Ratio
H1 ~ log ratio
```{r}
#| fig-width: 10
#| fig-height: 10
# Scatterplot matrix of the H1 (trait) predictors together with the log ratio
# (log_R2_1), all atlases pooled and points grouped by datasetID.
# The grouping argument is passed on to lattice under its full name `groups`
# instead of relying on partial matching of `group`.
featurePlot(
  x = dta_new %>% select(datasetID, all_of(H1)),
  y = dta_new$log_R2_1,
  groups = dta_new$datasetID,
  plot = "pairs",
  pch = 16,
  alpha = 0.3,
  cex = 0.5,
  xlab = "Scatterplot Matrix of traits (H1) - Log Ratio",
  auto.key = list(columns = 4),
  par.settings = list(fontsize = list(text = 6))
)
```
H2 ~ log ratio
```{r}
#| fig-width: 12
#| fig-height: 12
# Scatterplot matrix of the H2 (range geometry) predictors together with the
# log ratio (log_R2_1), all atlases pooled and points grouped by datasetID.
# The grouping argument is passed on to lattice under its full name `groups`
# instead of relying on partial matching of `group`.
featurePlot(
  x = dta_new %>% select(datasetID, all_of(H2)),
  y = dta_new$log_R2_1,
  groups = dta_new$datasetID,
  plot = "pairs",
  pch = 16,
  alpha = 0.3,
  cex = 0.5,
  xlab = "Scatterplot Matrix of range geometry (H2) - log Ratio",
  auto.key = list(columns = 4),
  par.settings = list(fontsize = list(text = 6))
)
```
## For each atlas separately:
H2 ~ Jaccard
```{r}
#| fig-width: 10
#| fig-height: 10
# Scatterplot matrix of the H2 (range geometry) predictors together with
# Jaccard dissimilarity for one atlas.
#   atlas_id:   value of `datasetID` that selects the atlas
#   atlas_name: atlas label used as prefix of the plot title
# Returns the plot object; calling the helper at top level displays it.
# One helper replaces four copy-pasted calls, which also fixes the Czechia
# title that used to read "traits (H1)" although H2 is plotted.
plot_h2_jaccard_for_atlas <- function(atlas_id, atlas_name) {
  atlas_dta <- dta_new %>% filter(datasetID == atlas_id)

  featurePlot(
    x = atlas_dta %>% select(all_of(H2)),
    y = atlas_dta %>% select(Jaccard_dissim),
    plot = "pairs",
    pch = 16,
    alpha = 0.3,
    cex = 0.5,
    xlab = paste0(
      atlas_name,
      ": Scatterplot Matrix of range geometry (H2) - Jaccard 1"
    ),
    auto.key = list(columns = 4),
    par.settings = list(fontsize = list(text = 6))
  )
}

plot_h2_jaccard_for_atlas(5, "Czechia")
plot_h2_jaccard_for_atlas(6, "New York")
plot_h2_jaccard_for_atlas(13, "Japan")
plot_h2_jaccard_for_atlas(26, "Europe")
```
H2 ~ log ratio
```{r}
#| fig-width: 12
#| fig-height: 12
# Scatterplot matrix of the H2 (range geometry) predictors together with the
# log ratio (log_R2_1) for one atlas.
#   atlas_id:   value of `datasetID` that selects the atlas
#   atlas_name: atlas label used as prefix of the plot title
# Returns the plot object; calling the helper at top level displays it.
# One helper replaces four copy-pasted calls. The titles used to say
# "Jaccard 1" (and "traits (H1)" for Czechia) although these plots show the
# log ratio against the H2 predictors; they now name what is plotted.
plot_h2_log_ratio_for_atlas <- function(atlas_id, atlas_name) {
  atlas_dta <- dta_new %>% filter(datasetID == atlas_id)

  featurePlot(
    x = atlas_dta %>% select(all_of(H2)),
    y = atlas_dta %>% select(log_R2_1),
    plot = "pairs",
    pch = 16,
    alpha = 0.3,
    cex = 0.5,
    xlab = paste0(
      atlas_name,
      ": Scatterplot Matrix of range geometry (H2) - log Ratio"
    ),
    auto.key = list(columns = 4),
    par.settings = list(fontsize = list(text = 6))
  )
}

plot_h2_log_ratio_for_atlas(5, "Czechia")
plot_h2_log_ratio_for_atlas(6, "New York")
plot_h2_log_ratio_for_atlas(13, "Japan")
plot_h2_log_ratio_for_atlas(26, "Europe")
```