Last updated: 2019-07-04

This reproducible R Markdown analysis was created with workflowr (version 1.3.0).

Here, we calculate some of the initial (pre-model) results from the infant cohort and exposure characteristics: demographics, initial survival curves, and exposure assessment.

# Whenever `filter` is ambiguous between attached packages, use dplyr's.
conflict_prefer("filter", "dplyr")

# Register the default theme for every plot below. Without theme_set() the
# theme object is built and then discarded, so it would never be applied.
theme_set(
  theme_bw() +
    theme(
      panel.grid.minor = element_blank(),
      legend.position = "top"
    )
)


# Read the two exposure tables used throughout this analysis.
exposure_data = read_csv("data/exposure_data.csv")
exposure_data_long = read_csv("data/exposure_data_long.csv")

# Tile plot filled by transmission outcome; it is used here to obtain a legend
# carrying the shared `infection_labels` colours, breaks and labels.
leg_plot = ggplot(
  data = exposure_data_long,
  aes(x = idpar, y = count, fill = factor(obs_infected))
) +
  geom_tile() +
  scale_fill_manual(
    "",
    values = infection_labels$colours,
    breaks = infection_labels$breaks,
    labels = infection_labels$labels
  )

# Build the plot with a top legend and keep only the "guide-box" grob so the
# legend can be placed on its own in a composite figure.
trans_legend = (leg_plot + theme(legend.position = "top")) %>%
  ggplot_build() %>%
  ggplot_gtable() %>%
  gtable::gtable_filter("guide-box")

# Format the minimum and maximum of `x` as "min - max".
#   x      numeric vector
#   digits number of decimal places passed to round()
# Returns a single character string.
range_str = function(x, digits = 3) {
  bounds = round(range(x), digits)
  paste(bounds, collapse = " - ")
}

# Format the 25th and 75th percentiles of `x` as "Q1 - Q3".
#   x      numeric vector
#   digits number of decimal places passed to round()
# Returns a single character string.
IQR_range_str = function(x, digits = 3) {
  quartiles = round(quantile(x, c(0.25, 0.75)), digits)
  paste(quartiles, collapse = " - ")
}


Infant ages

# Enrollment age summary: reduce to one row per FamilyID / enrollment_age pair,
# then report the count, median, quartiles and range in a single-row table.
exposure_data %>%
  select(FamilyID, enrollment_age) %>%
  distinct() %>%
  summarize(
    N = n(),
    enroll_median_age_days = median(enrollment_age),
    IQR = paste(quantile(enrollment_age, c(0.25, 0.75)), collapse = ", "),
    range_days = paste(range(enrollment_age), collapse = ", ")
  ) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
N enroll_median_age_days IQR range_days
32 2 1, 3 0, 9


# Number of distinct FamilyID values in each maternal HIV status group.
exposure_data %>%
  select(FamilyID, momhiv) %>%
  distinct() %>%
  group_by(momhiv) %>%
  summarize(N = n()) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
momhiv N
neg 15
pos 17

Survival analysis

# Per-virus infection counts.
# Step 1: collapse to one row per virus / FamilyID, flagging whether the infant
#         ever had `infectious_1wk` or `infected` set (max over its rows).
# Step 2: count infants and sum the two flags within each virus.
exposure_data %>%
  group_by(virus, FamilyID) %>%
  summarize(
    obs_infected = max(infectious_1wk),
    is_infected = max(infected)
  ) %>%
  group_by(virus) %>%
  summarize(
    total_infants = n_distinct(FamilyID),
    total_infected = sum(is_infected),
    total_outcome = sum(obs_infected)
  ) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
virus total_infants total_infected total_outcome
CMV 30 20 16
HHV-6 31 24 23
# Survival input: one row per FamilyID / virus (momhiv and final_infant_wk are
# carried along as grouping keys) with an event indicator that is the maximum
# of `infected` over that infant's rows.
surv_data = exposure_data %>%
  group_by(FamilyID, virus, momhiv, final_infant_wk) %>%
  summarize(
    infected = max(infected)
  )

# Fit, separately for each virus:
#   surv_mod     - pooled survfit() curve
#   surv_mod_hiv - survfit() curves stratified by maternal HIV status
#   logrank      - exact log-rank p-value comparing the momhiv strata
surv_fit = surv_data %>%
  group_by(virus) %>%
  nest() %>%
  mutate(
    surv_mod = map(data, ~survfit(Surv(final_infant_wk, infected) ~ 1, data = .)),
    surv_mod_hiv = map(data, ~survfit(Surv(final_infant_wk, infected) ~ momhiv, data = .)),
    logrank = map_dbl(
      data,
      ~coin::pvalue(coin::logrank_test(
        Surv(final_infant_wk, infected) ~ factor(momhiv),
        data = ., distribution = "exact"
      ))
    )
  ) %>%
  ungroup() %>%
  # The nested data must be dropped: surv_fit is later passed to pmap_df() with
  # a callback that accepts only virus, surv_mod, surv_mod_hiv and logrank.
  select(-data)

# Table of the per-virus log-rank p-values (maternal HIV+ vs HIV-).
surv_fit %>%
  select(virus, logrank) %>%
  rename(`Mother HIV Log-rank` = logrank) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
virus Mother HIV Log-rank
CMV 0.9708864
HHV-6 0.3649318
# Tidy every fitted curve into one long data frame for plotting.
# For each row of surv_fit (one per virus):
#   - tidy the pooled fit and label its stratum "Pooled";
#   - append the tidied momhiv-stratified fit;
#   - derive `momhiv` by stripping the "momhiv=" prefix from `strata`;
#   - append a row just before time 0 with estimate 1 for every stratum so
#     each step curve starts at 1.
# `logrank` is accepted only because pmap passes every column as an argument.
surv_res = pmap_df(surv_fit, function(virus, surv_mod, surv_mod_hiv, logrank) {
  curve_start = crossing(
    virus = virus, time = -1e-12, estimate = 1,
    momhiv = c("Pooled", "neg", "pos")
  )

  broom::tidy(surv_mod) %>%
    mutate(strata = "Pooled") %>%
    bind_rows(broom::tidy(surv_mod_hiv)) %>%
    mutate(
      virus = virus,
      momhiv = str_remove_all(strata, "momhiv=")
    ) %>%
    bind_rows(curve_start)
})
# Step curves of the survfit() estimates (proportion uninfected) over infant
# age in weeks, one panel per virus, coloured by maternal HIV stratum.
surv_res %>%
  arrange(virus, momhiv, time) %>%
  ggplot(aes(time, estimate, colour = momhiv)) +
  geom_step() +
  # Shape is keyed on whether any observation was censored at that time:
  # TRUE -> shape 3. NOTE(review): shape -1 for FALSE presumably draws
  # nothing - confirm.
  geom_point(aes(shape = n.censor > 0)) +
  # guide = "none" replaces the logical `F`, which ggplot2 has deprecated.
  scale_shape_manual(guide = "none", values = c(-1, 3)) +
  scale_x_continuous("Weeks after infant birth", breaks = 0:10 * 10) +
  scale_y_continuous("Proportion uninfected", expand = c(0.01, 0)) +
  # Dashed reference line at week 52.
  geom_vline(xintercept = 52, colour = "black", linetype = "dashed") +
  scale_color_discrete(
    "",
    breaks = c("neg", "pos", "Pooled"),
    labels = c("Mother HIV-", "Mother HIV+", "Pooled")
  ) +
  # Log-rank p-value printed in the top-right corner of each virus panel.
  geom_text(
    data = surv_fit,
    aes(label = str_c("p = ", round(logrank, 2))),
    x = Inf, y = Inf, colour = "black", vjust = 1.2, hjust = 1.2
  ) +
  facet_wrap(~virus) +
  theme(legend.position = "top")

Exposure Analysis


# For each virus and exposure source (idpar): number of rows, number that were
# observed (not interpolated), and the interpolated count with its percentage
# as formatted by stat_paste().
exposure_data_long %>%
  group_by(virus, idpar) %>%
  summarize(
    total = n(),
    total_observed = total - sum(interpolated),
    total_interpolate = stat_paste(sum(interpolated), 100 * mean(interpolated), digits = 1)
  ) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
virus idpar total total_observed total_interpolate
CMV HH 819 647 172.0 (21.0)
CMV M 819 767 52.0 (6.3)
CMV S 819 647 172.0 (21.0)
HHV-6 HH 684 544 140.0 (20.5)
HHV-6 M 684 646 38.0 (5.6)
HHV-6 S 684 544 140.0 (20.5)
# Per-infant exposure summaries built from observed (non-interpolated) rows.
# For each virus / FamilyID / obs_infected / idpar combination:
#   total_pos, pct_pos    - number and percent of rows with count > 0
#   N                     - number of rows
#   mean, median, maximum - summaries of `count`
exposure_data_summary = exposure_data_long %>%
  mutate(
    pos_count = count > 0
  ) %>%
  subset(!interpolated) %>%
  group_by(virus, FamilyID, obs_infected, idpar) %>%
  mutate(
    total_pos = sum(pos_count),
    pct_pos = 100 * mean(pos_count)
  ) %>%
  # Carry the per-group constants through the summarise step by adding them
  # to the grouping keys.
  group_by(pct_pos, total_pos, add = TRUE) %>%
  summarise_at(vars(count), list(~n(), mean = mean, median = median, maximum = max)) %>%
  rename(N = n)

# Axis metadata for the exposure figure: one row per statistic with shared
# axis limits (upper limit rounded up), a plotmath label and a panel index.
plot_labels = exposure_data_summary %>%
  gather(stat, estimate, mean, maximum, pct_pos) %>%
  group_by(stat) %>%
  summarize(min_lim = min(estimate), max_lim = ceiling(max(estimate))) %>%
  left_join(
    tibble(
      stat = c("pct_pos", "mean", "maximum"),
      out_lab = c("Percent~Positive", "Mean~Log[10]~VL", "Max.~Log[10]~VL")
    ),
    # Explicit key: avoids the "Joining, by = ..." message and accidental
    # matches on other shared column names.
    by = "stat"
  ) %>%
  # The factor level order fixes the panel order used when plotting.
  mutate(stat = factor(stat, levels = c("pct_pos", "mean", "maximum"))) %>%
  arrange(stat) %>%
  ungroup() %>%
  mutate(letter_code = seq_len(n()))
# Build one figure row per exposure statistic (levels of plot_labels$stat).
# Each row combines:
#   pl1 - scatter of mother (M) vs secondary children (S) values with the
#         identity line, coloured by transmission outcome;
#   pl2 - boxplots of the statistic by exposure source and outcome.
# Returns a list of cowplot grids, one per statistic.
pl_exposure = map(levels(plot_labels$stat), function(s) {

  tmp_theme = theme(
    legend.position = "none",
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 9)
  )

  # Label and shared axis limits for this statistic.
  pl_lab = subset(plot_labels, stat == s)
  out_label = pl_lab$out_lab[1]
  lower_limit = pl_lab$min_lim
  upper_limit = pl_lab$max_lim

  pl1 = exposure_data_summary %>%
    gather(stat, estimate, mean, maximum, pct_pos) %>%
    select(-median, -total_pos, -N) %>%
    # One column per exposure source so M and S can be plotted against
    # each other.
    spread(idpar, estimate) %>%
    filter(stat == s) %>%
    ggplot(aes(x = S, y = M, colour = factor(obs_infected))) +
    geom_abline() +
    # Points are drawn after the identity line so they sit on top of it.
    geom_point() +
    scale_colour_manual(values = c("#4393C3", "#D6604D")) +
    scale_y_continuous(
      parse(text = paste0("Mother~", out_label)),
      limits = c(lower_limit, upper_limit)
    ) +
    scale_x_continuous(
      parse(text = paste0("Secondary~Children~", out_label)),
      limits = c(lower_limit, upper_limit)
    ) +
    facet_wrap(~virus) +
    tmp_theme

  pl2 = exposure_data_summary %>%
    ungroup() %>%
    mutate(
      obs_infected = factor(obs_infected),
      idpar = fct_recode(
        fct_rev(idpar),
        "Secondary\nChildren" = "S", "Mother" = "M", "Household\nSum" = "HH"
      )
    ) %>%
    # `s` names the column holding the statistic for this row of panels.
    ggplot(aes_string(x = "idpar", y = s, colour = "obs_infected")) +
    geom_boxplot() +
    scale_colour_manual(values = c("#4393C3", "#D6604D")) +
    geom_point(position = position_dodge(width = 0.75)) +
    scale_y_continuous(parse(text = out_label), limits = c(lower_limit, upper_limit)) +
    xlab(parse(text = c(""))) +
    facet_wrap(~virus) +
    tmp_theme + theme(axis.text.x = element_text(size = 7))

  # Panel letters: left column uses letter_code, right column letter_code + 3.
  plot_grid(
    pl1, pl2,
    nrow = 1,
    labels = LETTERS[c(pl_lab$letter_code, pl_lab$letter_code + 3)]
  )
})

# Final exposure figure: the three statistic rows stacked vertically, with the
# shared transmission legend underneath (11:1 height ratio).
plot_grid(
  plot_grid(plotlist = pl_exposure, nrow = 3),
  trans_legend,
  nrow = 2,
  rel_heights = c(11, 1)
)

# Correlation between mother (M) and secondary-children (S) exposure, computed
# separately for each virus and each summary statistic.
exposure_data_summary %>%
  subset(idpar != "HH") %>%
  group_by(virus, FamilyID, idpar) %>%
  gather(outcome, value, pct_pos, mean, maximum) %>%
  select(virus, FamilyID, idpar, outcome, value) %>%
  # One row per infant with M and S side by side.
  spread(idpar, value) %>%
  ungroup() %>%
  group_by(virus, outcome) %>%
  nest() %>%
  mutate(
    # NOTE(review): the original call was
    #   map(data, ~cor.test(.x$M, .x$S), method = "Spearmen")
    # which hands `method` to map(), not cor.test(); the formula lambda
    # ignores it, so the default Pearson test was what actually ran (the
    # reported confidence intervals are consistent with that). Pearson is
    # made explicit here to keep the results unchanged. If Spearman was
    # intended, use method = "spearman" and note that conf.low / conf.high
    # are then not available for the Estimate column below.
    cor_test = map(data, ~cor.test(.x$M, .x$S, method = "pearson")),
    cor_res = map(cor_test, broom::tidy)
  ) %>%
  unnest(cor_res) %>%
  mutate(
    Endpoint = factor(outcome, levels = c("pct_pos", "mean", "maximum"),
                      labels = c("Pct. Positive", "Mean VL", "Maximum VL")),
    Estimate = stat_paste(estimate, conf.low, conf.high, digits = 2)
  ) %>%
  select(virus, Endpoint, Estimate, p.value) %>%
  arrange(virus, Endpoint) %>%
  kable(digits = 3) %>%
  kable_styling(full_width = FALSE) %>%
  # Merge repeated virus labels into a single cell per virus.
  collapse_rows(columns = 1)
virus Endpoint Estimate p.value
CMV Pct. Positive -0.04 (-0.40, 0.32) 0.818
Mean VL 0.06 (-0.31, 0.41) 0.752
Maximum VL 0.06 (-0.31, 0.41) 0.768
HHV-6 Pct. Positive 0.30 (-0.06, 0.59) 0.095
Mean VL 0.33 (-0.03, 0.61) 0.074
Maximum VL 0.29 (-0.07, 0.59) 0.111
# Exact Wilcoxon test of each exposure statistic between outcome groups:
# one p-value per virus / statistic / exposure source.
exposure_data_tests =
  exposure_data_summary %>%
  gather(stat, estimate, mean, maximum, pct_pos) %>%
  group_by(virus, stat, idpar) %>%
  mutate(obs_infected = factor(obs_infected)) %>%
  summarize(
    wilcox_pvalue = coin::pvalue(
      coin::wilcox_test(estimate ~ obs_infected, distribution = "exact")
    )
  )

# Median of each statistic by outcome group, spread into one column per
# outcome, with their difference and the matching Wilcoxon p-value.
overall_summary = exposure_data_summary %>%
  gather(stat, estimate, mean, maximum, pct_pos) %>%
  group_by(virus, stat, idpar, obs_infected) %>%
  summarize(
    median = median(estimate)
  ) %>%
  mutate(
    obs_infected = recode_factor(
      obs_infected,
      `1` = "Transmission", `0` = "No transmission",
      .ordered = TRUE
    )
  ) %>%
  spread(obs_infected, median) %>%
  mutate(Difference = `Transmission` - `No transmission`) %>%
  left_join(exposure_data_tests, by = c("virus", "stat", "idpar"))

# Render the table, merging repeated virus / stat labels.
overall_summary %>%
  kable(digits = 3) %>%
  kable_styling(full_width = FALSE) %>%
  collapse_rows(columns = 1:2)
virus stat idpar Transmission No transmission Difference wilcox_pvalue
CMV maximum HH 4.432 4.344 0.089 0.400
M 1.109 2.486 -1.377 0.653
S 4.422 4.344 0.079 0.377
mean HH 3.471 3.052 0.419 0.179
M 0.131 0.055 0.076 0.684
S 3.460 3.051 0.408 0.179
pct_pos HH 100.000 95.302 4.698 0.249
M 5.405 2.183 3.222 0.622
S 100.000 94.052 5.948 0.230
HHV-6 maximum HH 4.628 3.903 0.725 0.016
M 3.336 2.997 0.339 0.147
S 4.628 3.683 0.945 0.020
mean HH 4.116 3.406 0.710 0.006
M 2.018 1.049 0.968 0.102
S 4.076 3.072 1.004 0.007
pct_pos HH 100.000 98.913 1.087 0.008
M 76.000 40.064 35.936 0.105
S 100.000 94.501 5.499 0.003

Household composition

Household sum composition was determined and reported by taking the mean proportion over measurements within a household. The summary across the households uses median and IQR to match the box plot statistics.

# Share of the household sum (HH) attributable to secondary children (S) and
# to the mother (M), as a percentage on the back-transformed (10^x) scale,
# then averaged (mean and median) within each virus / FamilyID / obs_infected.
hh_summary = exposure_data %>%
  # Rows with HH <= 0 are excluded, so no special case for HH == 0 is needed
  # when computing the percentages.
  filter(HH > 0) %>%
  mutate(
    S_pctHH = 100 * (10^S / 10^HH),
    M_pctHH = 100 * (10^M / 10^HH)
  ) %>%
  group_by(virus, FamilyID, obs_infected) %>%
  # A named list replaces the deprecated funs(); output columns are still
  # named <variable>_<function>, e.g. S_pctHH_mean.
  summarise_at(vars(S_pctHH, M_pctHH), list(mean = mean, median = median))
# Cross-household summary (mean, median, IQR, range) of the per-household mean
# composition percentages, one row per virus and household member (S or M).
hh_summary %>%
  ungroup() %>%
  select(-obs_infected) %>%
  gather(stat, est, -virus, -FamilyID) %>%
  group_by(virus, stat) %>%
  # A named list replaces the deprecated funs(); IQR and range are
  # pre-formatted strings from the helper functions.
  summarize_if(
    is.double,
    list(mean = mean, median = median, IQR = IQR_range_str, range = range_str)
  ) %>%
  mutate_if(is.double, round, digits = 2) %>%
  # Keep only the rows derived from the per-household means.
  filter(str_detect(stat, "mean")) %>%
  mutate(
    # First character of e.g. "S_pctHH_mean" identifies the member (S / M).
    stat = substr(stat, 1, 1)
  ) %>%
  rename(`Household member` = stat) %>%
  # Title-case every column name except IQR.
  rename_at(vars(-IQR), str_to_title) %>%
  kable(caption = "Percent household composition") %>%
  kable_styling(full_width = FALSE)
Percent household composition
Virus Household Member Mean Median IQR Range
CMV M 7.27 0.32 0.094 - 5.426 0.004 - 66.421
CMV S 92.73 99.68 94.574 - 99.906 33.579 - 99.996
HHV-6 M 15.42 8.76 1.16 - 19.477 0.01 - 99.782
HHV-6 S 84.58 91.24 80.523 - 98.84 0.218 - 99.99
# Boxplot, by virus, of each household's mean percentage of shedding that is
# attributable to secondary children; points are coloured by outcome.
hh_summary %>%
  ggplot(aes(x = virus, y = S_pctHH_mean)) +
  geom_boxplot() +
  geom_point(aes(colour = factor(obs_infected))) +
  scale_colour_manual(
    "",
    values = infection_labels$colours,
    breaks = infection_labels$breaks,
    labels = infection_labels$labels
  ) +
  xlab("") +
  ylab("Percent of household shedding attributable to secondary children")

Interpolated exposure

In the sensitivity analysis, we’d like to assess the interpolation. All of the summary statistical analysis is limited to the observed exposures. Assuming the interpolation was unbiased, the precision in tests could still be artificially inflated without some correction. Because linear interpolation is used, the mean and maximum exposure estimates should be largely unaffected. However, the percent positive estimate may not be precise without a larger sample size.

# Same per-infant exposure summaries as the main analysis, but computed over
# ALL rows (interpolated rows are not removed), plus the percentage of rows
# that were interpolated for each virus / FamilyID / obs_infected / idpar.
exposure_interpolated_summary = exposure_data_long %>%
  mutate(
    pos_count = count > 0
  ) %>%
  group_by(virus, FamilyID, obs_infected, idpar) %>%
  mutate(
    interpolated_pct = 100 * mean(interpolated),
    total_pos = sum(pos_count),
    pct_pos = 100 * mean(pos_count)
  ) %>%
  # Carry the per-group constants through the summarise step.
  group_by(pct_pos, total_pos, interpolated_pct, add = TRUE) %>%
  # A named list replaces the deprecated funs(); with a single input column
  # the outputs are named N, mean, median and maximum.
  summarise_at(
    vars(count),
    list(N = ~n(), mean = mean, median = median, maximum = max)
  )

# Distribution of the per-infant percentage of interpolated rows, by exposure
# source.
# NOTE(review): the original chunk ended in a dangling `+`, which would try to
# add the next pipeline to this plot and fail. It has been removed. If a final
# layer was lost (e.g. a facet by virus), re-add it here.
exposure_interpolated_summary %>%
  ungroup() %>%
  mutate(
    idpar = fct_recode(
      fct_rev(idpar),
      "Secondary\nChildren" = "S", "Mother" = "M", "Household\nSum" = "HH"
    )
  ) %>%
  ggplot(aes(x = idpar, y = interpolated_pct)) +
  geom_boxplot() +
  geom_point() +
  xlab("") +
  ylab("Percent of weekly exposures interpolated")

# Each exposure statistic plotted against the percentage of interpolated rows,
# for mother and secondary children only. Rows of panels are statistics;
# columns are virus x exposure source.
exposure_interpolated_summary %>%
  ungroup() %>%
  subset(idpar != "HH") %>%
  gather(stat, estimate, mean, maximum, pct_pos) %>%
  mutate(
    idpar = fct_recode(
      fct_rev(idpar),
      "Secondary\nChildren" = "S", "Mother" = "M"
    ),
    stat = fct_recode(
      fct_rev(stat),
      "% positive (weekly)" = "pct_pos", "Mean VL" = "mean", "Maximum VL" = "maximum"
    )
  ) %>%
  ggplot(aes(x = interpolated_pct, y = estimate, colour = factor(obs_infected))) +
  geom_point() +
  ylab("") +
  xlab("Percent of weekly exposures interpolated") +
  # Strip labels are switched to the left so they serve as the y-axis titles.
  facet_grid(stat ~ virus + idpar, scales = "free_y", switch = "y") +
  scale_colour_manual(
    "",
    values = infection_labels$colours,
    breaks = infection_labels$breaks,
    labels = infection_labels$labels
  ) +
  theme(strip.placement = "outside")

