Chapter 18 Multivariate distributions

18.1 Overview

18.1.1 Variable correlation

Compute correlations between the independent variables or their suggested transformations.

# Candidate predictors: the union of all variable groups, duplicates removed.
variables <- unique(
  c(
    bact_transformed$demog_vars,
    bact_transformed$pivotal_vars,
    bact_transformed$vip_vars,
    bact_transformed$leuko_related_vars,
    bact_transformed$leuko_ratio_vars,
    bact_transformed$kidney_related_vars,
    bact_transformed$acute_related_vars,
    bact_transformed$remaining_vars
  )
)

# Pearson correlation matrix, computed from pairwise complete observations.
corrp <- cor(
  dplyr::select(c_bact, all_of(variables)),
  use = "pairwise.complete.obs",
  method = "pearson"
)

# Spearman (rank) correlation matrix on the same columns.
corrs <- cor(
  dplyr::select(c_bact, all_of(variables)),
  use = "pairwise.complete.obs",
  method = "spearman"
)

# Pearson minus Spearman, element by element; used to check for outliers.
corrd <- corrp - corrs

Next, we depict the Pearson correlation coefficients in a square heat map:

# Heat map of the Pearson correlation matrix; small axis labels, rotated by 90 degrees.
ggcorrplot(
  corr = corrp,
  tl.cex = 5,
  tl.srt = 90
)

Explore whether there are clusters of variables. Such clusters may suggest groups of variables for which only a summary or a single representative needs to be considered in modeling:

# Variable clustering on the selected columns, passed to varclus() as a matrix.
vc_bact <- Hmisc::varclus(
  x = as.matrix(c_bact[, variables])
)
# Draw the clustering result with reduced label size.
plot(vc_bact, cex = 0.7)

Some of the clusters that pop up here are related to the width/volume of blood cells (MPV, PDW) and to red blood cells (RBC, HGB, HCT; MCV, MCH). Others reflect further ‘known’ associations, such as that between KREA and eGFR (which follows from the construction of eGFR), between ASAT and ALAT, between AMY and PAMY, or between TP and ALB.

In the following scatterplots we have a look at those associations:

# PDW against MPV: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = MPV, y = PDW)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1102 rows containing non-finite values (stat_smooth).
## Warning: Removed 1102 rows containing missing values (geom_point).

# HGB against RBC: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = RBC, y = HGB)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 461 rows containing non-finite values (stat_smooth).
## Warning: Removed 461 rows containing missing values (geom_point).

# HCT against RBC: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = RBC, y = HCT)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 461 rows containing non-finite values (stat_smooth).
## Removed 461 rows containing missing values (geom_point).

# HCT against HGB: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = HGB, y = HCT)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

# MCH against MCV: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = MCV, y = MCH)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Removed 42 rows containing missing values (geom_point).

# eGFR against transformed KREA: semi-transparent points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = t_KREA, y = eGFR)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 159 rows containing non-finite values (stat_smooth).
## Warning: Removed 159 rows containing missing values (geom_point).

# Transformed ALAT against transformed ASAT: points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = t_ASAT, y = t_ALAT)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1195 rows containing non-finite values (stat_smooth).
## Warning: Removed 1195 rows containing missing values (geom_point).

# Transformed PAMY against transformed AMY: points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = t_AMY, y = t_PAMY)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7182 rows containing non-finite values (stat_smooth).
## Warning: Removed 7182 rows containing missing values (geom_point).

# Transformed NEU against transformed WBC: points plus a smoothed trend.
ggplot(data = c_bact, mapping = aes(x = t_WBC, y = t_NEU)) +
  geom_point(shape = 20, alpha = alpha_value) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 728 rows containing non-finite values (stat_smooth).
## Warning: Removed 728 rows containing missing values (geom_point).

Create scatterplots for pairs of variables with a large difference between the Pearson and Spearman correlations (absolute difference above 0.1; this could be an indication of nonlinear association):

# Scatterplot (with smoothed trend) for every pair of variables whose Pearson
# and Spearman coefficients differ by more than 0.1 in absolute value.
# Only the upper triangle of `corrd` is visited, so each pair is drawn once.
# seq_len() yields an empty sequence when there are fewer than two variables,
# whereas 1:(length(variables) - 1) would count downwards (1, 0).
for (j in seq_len(max(length(variables) - 1L, 0L))) {
  for (jj in (j + 1):length(variables)) {
    # A coefficient difference may be NA (cor() returns NA when it cannot be
    # computed for a pair); isTRUE() skips such pairs instead of letting
    # if() stop with "missing value where TRUE/FALSE needed".
    if (isTRUE(abs(corrd[j, jj]) > 0.1)) {
      print(
        ggplot(
          data = c_bact,
          mapping = aes(x = .data[[variables[j]]], y = .data[[variables[jj]]])
        ) +
          geom_point(alpha = alpha_value, shape = 20) +
          geom_smooth()
      )
    }
  }
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 174 rows containing non-finite values (stat_smooth).
## Warning: Removed 174 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1279 rows containing non-finite values (stat_smooth).
## Warning: Removed 1279 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Warning: Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1580 rows containing non-finite values (stat_smooth).
## Warning: Removed 1580 rows containing missing values (geom_point).

18.1.2 Distribution of age by sex

Figure 7.4: Distribution of age by sex

18.1.3 Distribution of leukocytes by age, coloured by sex

# Labelled factor version of `sex`: code 1 -> "male", code 2 -> "female".
c_bact$gender <- factor(c_bact$sex, levels = c(1, 2), labels = c("male", "female"))

# Transformed WBC against age, coloured by gender, with one smoothed trend per
# gender. The data set is passed once via `data`; previously it was also piped
# in, and that extra copy was silently absorbed by ggplot()'s `...`.
ggplot(data = c_bact, mapping = aes(x = Alter, y = t_WBC, color = gender)) +
  geom_point(shape = 20) +
  geom_smooth()
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 462 rows containing non-finite values (stat_smooth).
## Warning: Removed 462 rows containing missing values (geom_point).

18.1.4 Plot all variables vs. WBC in age/sex groups

# Three age groups split at 50 and 65 years, spanning the observed age range.
# cut() builds right-closed intervals, so without include.lowest = TRUE the
# patients at the minimum age fall outside the first interval and become NA
# (and are then dropped by every later filter on Agegroup).
c_bact$Agegroup <- factor(
  cut(
    c_bact$Alter,
    breaks = c(min(c_bact$Alter), 50, 65, max(c_bact$Alter)),
    include.lowest = TRUE
  )
)
# Cross-tabulate gender by age group to see the size of each facet cell.
table(c_bact$gender, c_bact$Agegroup)
##         
##          (16,50] (50,65] (65,101]
##   male      2900    2671     2962
##   female    2460    1579     2114
# For each variable from the fourth entry of `variables` onwards: scatterplot
# against transformed WBC with smoothed trend and rug, faceted by gender (rows)
# and age group (columns).
# seq_along()[-seq_len(3)] is empty when there are fewer than four variables,
# whereas 4:length(variables) would count downwards.
for (j in seq_along(variables)[-seq_len(3)]) {
  p1 <-
    c_bact %>%
    filter(!is.na(Agegroup)) %>%
    # The piped, filtered data is the plot data. The stray second `c_bact`
    # argument was swallowed by ggplot()'s `...` and has been removed.
    ggplot(mapping = aes(x = t_WBC, y = .data[[variables[j]]])) +
    geom_point(alpha = alpha_value, shape = 20) +
    geom_smooth() +
    geom_rug(alpha = alpha_value) +
    facet_grid(gender ~ Agegroup)
  print(p1)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 634 rows containing non-finite values (stat_smooth).
## Warning: Removed 634 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 620 rows containing non-finite values (stat_smooth).
## Warning: Removed 620 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 728 rows containing non-finite values (stat_smooth).
## Warning: Removed 728 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 462 rows containing non-finite values (stat_smooth).
## Warning: Removed 462 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 552 rows containing non-finite values (stat_smooth).
## Warning: Removed 552 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 557 rows containing non-finite values (stat_smooth).
## Warning: Removed 557 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 666 rows containing non-finite values (stat_smooth).
## Warning: Removed 666 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 648 rows containing non-finite values (stat_smooth).
## Warning: Removed 648 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Warning: Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).
## Removed 732 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2428 rows containing non-finite values (stat_smooth).
## Warning: Removed 2428 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 620 rows containing non-finite values (stat_smooth).
## Warning: Removed 620 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 635 rows containing non-finite values (stat_smooth).
## Warning: Removed 635 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2990 rows containing non-finite values (stat_smooth).
## Warning: Removed 2990 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 614 rows containing non-finite values (stat_smooth).
## Warning: Removed 614 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1610 rows containing non-finite values (stat_smooth).
## Warning: Removed 1610 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1446 rows containing non-finite values (stat_smooth).
## Warning: Removed 1446 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1718 rows containing non-finite values (stat_smooth).
## Warning: Removed 1718 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).
## Warning: Removed 463 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 462 rows containing non-finite values (stat_smooth).
## Warning: Removed 462 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).
## Warning: Removed 463 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).
## Removed 463 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).
## Removed 463 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 476 rows containing non-finite values (stat_smooth).
## Warning: Removed 476 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1104 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2887 rows containing non-finite values (stat_smooth).
## Warning: Removed 2887 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2955 rows containing non-finite values (stat_smooth).
## Warning: Removed 2955 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1724 rows containing non-finite values (stat_smooth).
## Warning: Removed 1724 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1733 rows containing non-finite values (stat_smooth).
## Warning: Removed 1733 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1696 rows containing non-finite values (stat_smooth).
## Warning: Removed 1696 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2313 rows containing non-finite values (stat_smooth).
## Warning: Removed 2313 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3476 rows containing non-finite values (stat_smooth).
## Warning: Removed 3476 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1892 rows containing non-finite values (stat_smooth).
## Warning: Removed 1892 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2008 rows containing non-finite values (stat_smooth).
## Warning: Removed 2008 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2108 rows containing non-finite values (stat_smooth).
## Warning: Removed 2108 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4323 rows containing non-finite values (stat_smooth).
## Warning: Removed 4323 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7410 rows containing non-finite values (stat_smooth).
## Warning: Removed 7410 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4108 rows containing non-finite values (stat_smooth).
## Warning: Removed 4108 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2865 rows containing non-finite values (stat_smooth).
## Warning: Removed 2865 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1851 rows containing non-finite values (stat_smooth).
## Warning: Removed 1851 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2149 rows containing non-finite values (stat_smooth).
## Warning: Removed 2149 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2514 rows containing non-finite values (stat_smooth).
## Warning: Removed 2514 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4572 rows containing non-finite values (stat_smooth).
## Warning: Removed 4572 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5440 rows containing non-finite values (stat_smooth).
## Warning: Removed 5440 rows containing missing values (geom_point).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5417 rows containing non-finite values (stat_smooth).
## Warning: Removed 5417 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1104 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).
## Warning: Removed 463 rows containing missing values (geom_point).

#+
#  theme_minimal()

18.1.5 Plot all variables vs. WBC in age/sex groups: smoothed curves only

# Same panels as the faceted scatterplots (each variable from the fourth entry
# of `variables` onwards against transformed WBC, by gender and age group),
# but showing only the smoothed trend.
# seq_along()[-seq_len(3)] is empty when there are fewer than four variables,
# whereas 4:length(variables) would count downwards.
for (j in seq_along(variables)[-seq_len(3)]) {
  p1 <-
    c_bact %>%
    filter(!is.na(Agegroup)) %>%
    # The piped, filtered data is the plot data. The stray second `c_bact`
    # argument was swallowed by ggplot()'s `...` and has been removed.
    ggplot(mapping = aes(x = t_WBC, y = .data[[variables[j]]])) +
    # geom_point(alpha = alpha_value) +
    geom_smooth() +
    # geom_rug() +
    facet_grid(gender ~ Agegroup)
  print(p1)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 634 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 620 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 728 rows containing non-finite values (stat_smooth).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 462 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 552 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 557 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 666 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 648 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 732 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2428 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 620 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 635 rows containing non-finite values (stat_smooth).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2990 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 614 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1610 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1446 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1718 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 462 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 476 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1104 rows containing non-finite values (stat_smooth).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2887 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2955 rows containing non-finite values (stat_smooth).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1724 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1733 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1696 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2313 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3476 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1892 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2008 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2108 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4323 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7410 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4108 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2865 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1851 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2149 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2514 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4572 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5440 rows containing non-finite values (stat_smooth).

## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5417 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1104 rows containing non-finite values (stat_smooth).

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 463 rows containing non-finite values (stat_smooth).

#+
#  theme_minimal()

18.2 Variable redundancy

18.2.1 Redundancy among very important predictors

First we start with a redundancy analysis with the predictors deemed important by preceding studies.

# One-sided formula: Alter plus every distinct pivotal and VIP variable.
formula <- as.formula(
  paste(
    "~Alter+",
    paste(
      unique(
        c(
          bact_transformed$pivotal_vars,
          bact_transformed$vip_vars
        )
      ),
      collapse = "+"
    )
  )
)

# Redundancy analysis on the variables named in the formula.
Hmisc::redun(formula = formula, data = c_bact)
## 
## Redundancy Analysis
## 
## Hmisc::redun(formula = formula, data = c_bact)
## 
## n: 13793     p: 6    nk: 3 
## 
## Number of NAs:    898 
## Frequencies of Missing Values Due to Each Variable
##  Alter  t_WBC  t_BUN t_KREA  t_NEU    PLT 
##      0    462    172    159    728     42 
## 
## 
## Transformation of target variables forced to be linear
## 
## R-squared cutoff: 0.9    Type: ordinary 
## 
## R^2 with which each variable can be predicted from all other variables:
## 
##  Alter  t_WBC  t_BUN t_KREA  t_NEU    PLT 
##  0.196  0.934  0.610  0.574  0.933  0.161 
## 
## Rendundant variables:
## 
## t_WBC
## 
## Predicted from variables:
## 
## Alter t_BUN t_KREA t_NEU PLT 
## 
##   Variable Deleted   R^2 R^2 after later deletions
## 1            t_WBC 0.934

This analysis suggests that WBC is redundant once NEU is in the predictor set. We investigate this further by looking only at leukocyte-related variables.

18.2.3 Redundancy among all potential predictors

Now we perform a full redundancy analysis, but omitting WBC, the leukocyte ratio variables, and BUN and KREA (for their use in constructing BUN_KREA and eGFR).

# One-sided formula: the five leukocyte terms written out literally, followed
# by every distinct demographic, kidney-derived (t_BUN_KREA, eGFR, K),
# acute-related and remaining variable.
formula <- as.formula(
  paste(
    "~I(t_EOS)+I(t_BASO)+t_LYM+MONO+t_NEU+",
    paste(
      unique(
        c(
          bact_transformed$demog_vars,
          c("t_BUN_KREA", "eGFR", "K"),
          bact_transformed$acute_related_vars,
          bact_transformed$remaining_vars
        )
      ),
      collapse = "+"
    )
  )
)

# Redundancy analysis on the variables named in the formula.
Hmisc::redun(formula = formula, data = c_bact)
## 
## Redundancy Analysis
## 
## Hmisc::redun(formula = formula, data = c_bact)
## 
## n: 3980  p: 44   nk: 3 
## 
## Number of NAs:    10711 
## Frequencies of Missing Values Due to Each Variable
##   I(t_EOS)  I(t_BASO)      t_LYM       MONO      t_NEU      Alter        sex 
##        135        146        262        246        728          0          0 
## t_BUN_KREA       eGFR          K        FIB        CRP     t_ASAT     t_ALAT 
##        174        159       2008       2567        155       1154        987 
##      t_GGT        MCV        HGB        HCT        MCH       MCHC        RDW 
##       1262         42         41         42         42         42         56 
##        MPV         NT     t_APTT        NA.         CA     t_PHOS         MG 
##        702       2467       2549       1282       1276       1242       1869 
##         HS     t_GBIL         TP        ALB      t_AMY     t_PAMY      t_LIP 
##       3061       1441       1583       1676       3913       7114       3699 
##        CHE       t_AP      t_LDH       t_CK      t_GLU     t_TRIG       CHOL 
##       2447       1400       1714       2080       4192       5061       5045 
##        PDW        RBC 
##       1102        461 
## 
## 
## Transformation of target variables forced to be linear
## 
## R-squared cutoff: 0.9    Type: ordinary 
## 
## R^2 with which each variable can be predicted from all other variables:
## 
##   I(t_EOS)  I(t_BASO)      t_LYM       MONO      t_NEU      Alter        sex 
##      0.254      0.244      0.391      0.426      0.425      0.612      0.321 
## t_BUN_KREA       eGFR          K        FIB        CRP     t_ASAT     t_ALAT 
##      0.376      0.637      0.300      0.670      0.660      0.874      0.762 
##      t_GGT        MCV        HGB        HCT        MCH       MCHC        RDW 
##      0.643      0.993      0.996      0.996      0.995      0.984      0.530 
##        MPV         NT     t_APTT        NA.         CA     t_PHOS         MG 
##      0.905      0.340      0.257      0.335      0.530      0.319      0.244 
##         HS     t_GBIL         TP        ALB      t_AMY     t_PAMY      t_LIP 
##      0.482      0.393      0.787      0.855      0.667      0.722      0.590 
##        CHE       t_AP      t_LDH       t_CK      t_GLU     t_TRIG       CHOL 
##      0.658      0.615      0.588      0.585      0.263      0.385      0.498 
##        PDW        RBC 
##      0.899      0.977 
## 
## Rendundant variables:
## 
## HCT MCH HGB MPV
## 
## Predicted from variables:
## 
## I(t_EOS) I(t_BASO) t_LYM MONO t_NEU Alter sex t_BUN_KREA eGFR K FIB CRP t_ASAT t_ALAT t_GGT MCV MCHC RDW NT t_APTT NA. CA t_PHOS MG HS t_GBIL TP ALB t_AMY t_PAMY t_LIP CHE t_AP t_LDH t_CK t_GLU t_TRIG CHOL PDW RBC 
## 
##   Variable Deleted   R^2 R^2 after later deletions
## 1              HCT 0.996         0.996 0.975 0.975
## 2              MCH 0.994               0.994 0.994
## 3              HGB 0.975                     0.975
## 4              MPV 0.904

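This analysis suggests that HCT, MCH, HGB and MPV may be redundant given the other variables: each of them can be predicted from the remaining variables with an R² of at least 0.9 (the cutoff used here). Note that BUN, KREA and WBC had already been omitted from this redundancy analysis.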

18.3 Section session info

## R version 4.1.3 (2022-03-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Austria.1252  LC_CTYPE=English_Austria.1252   
## [3] LC_MONETARY=English_Austria.1252 LC_NUMERIC=C                    
## [5] LC_TIME=English_Austria.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mice_3.14.0      ggcorrplot_0.1.3 gtsummary_1.5.2  Hmisc_4.6-0     
##  [5] Formula_1.2-4    survival_3.2-13  lattice_0.20-45  plotly_4.10.0   
##  [9] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.8      purrr_0.3.4     
## [13] readr_2.1.2      tidyr_1.2.0      tibble_3.1.6     ggplot2_3.3.5   
## [17] tidyverse_1.3.1  here_1.0.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155        fs_1.5.2            lubridate_1.8.0    
##  [4] RColorBrewer_1.1-2  httr_1.4.2          rprojroot_2.0.2    
##  [7] tools_4.1.3         backports_1.4.1     bslib_0.3.1        
## [10] utf8_1.2.2          R6_2.5.1            rpart_4.1.16       
## [13] mgcv_1.8-39         DBI_1.1.2           lazyeval_0.2.2     
## [16] colorspace_2.0-3    nnet_7.3-17         withr_2.5.0        
## [19] tidyselect_1.1.2    gridExtra_2.3       compiler_4.1.3     
## [22] cli_3.2.0           rvest_1.0.2         gt_0.4.0           
## [25] htmlTable_2.4.0     xml2_1.3.3          labeling_0.4.2     
## [28] bookdown_0.25       sass_0.4.1          checkmate_2.0.0    
## [31] scales_1.1.1        digest_0.6.29       foreign_0.8-82     
## [34] rmarkdown_2.13      base64enc_0.1-3     jpeg_0.1-9         
## [37] pkgconfig_2.0.3     htmltools_0.5.2     highr_0.9          
## [40] dbplyr_2.1.1        fastmap_1.1.0       htmlwidgets_1.5.4  
## [43] rlang_1.0.2         readxl_1.3.1        rstudioapi_0.13    
## [46] farver_2.1.0        jquerylib_0.1.4     generics_0.1.2     
## [49] jsonlite_1.8.0      crosstalk_1.2.0     magrittr_2.0.2     
## [52] Matrix_1.4-0        Rcpp_1.0.8.3        munsell_0.5.0      
## [55] fansi_1.0.3         lifecycle_1.0.1     stringi_1.7.6      
## [58] yaml_2.3.5          plyr_1.8.7          grid_4.1.3         
## [61] crayon_1.5.1        haven_2.4.3         splines_4.1.3      
## [64] hms_1.1.1           knitr_1.38          pillar_1.7.0       
## [67] reshape2_1.4.4      reprex_2.0.1        glue_1.6.2         
## [70] evaluate_0.15       latticeExtra_0.6-29 broom.helpers_1.6.0
## [73] data.table_1.14.2   modelr_0.1.8        vctrs_0.3.8        
## [76] png_0.1-7           tzdb_0.2.0          cellranger_1.1.0   
## [79] gtable_0.3.0        assertthat_0.2.1    xfun_0.30          
## [82] broom_0.7.12        viridisLite_0.4.0   cluster_2.1.2      
## [85] ellipsis_0.3.2