list.of.packages <- c("tidyverse","bestNormalize")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages))
install.packages(new.packages)
lapply(list.of.packages , require, character.only = T)## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.030 6.750 8.740 8.954 10.910 14.990
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE")
#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)bt_pheno %>%
select(SEX) %>%
table() %>%
as.data.frame() %>%
na.omit() %>%
mutate(Prop = (Freq/sum(Freq))*100) -> sex # Sex
sexcolnames(sex) <- c("cat", "freq", "prop")
par(mar=c(4,4,3,4))
barplot(
sex$prop,
ylim=c(0, 60),
col=c(2,4)
)
text(
x=c(0.7,1.9),
y=30,
labels=c("Female", "Male")
)
mtext(
text = "Proportion (%)",
side = 2,
line = 3
)
mtext(
text = "Sex",
side = 1,
line = 1
)# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/uL)")
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/uL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")
# removing outliers
abline(v=8, lwd=1, lty=2)
bt_pheno$RBC <- as.numeric(ifelse(bt_pheno$RBC > 8, "NA", bt_pheno$RBC))
plot(density(na.omit(bt_pheno$RBC)), main="RBC (>8) outlier pruned")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
BNphen ## orderNorm Transformation with 953 nonmissing obs and ties
## - 42 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1.0 2.4 2.7 3.1 5.0
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC")
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")
# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])
rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rbc_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.400 2.700 2.795 3.100 5.000 25
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SHb, main="Hb (g/dL)", xlab="Steady state Hb")
plot(density(na.omit(bt_pheno$SHb)), main="Steady state Hb (g/dL)")
polygon(density(na.omit(bt_pheno$SHb)), col = "coral")
# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$SHb)
BNphen ## orderNorm Transformation with 974 nonmissing obs and ties
## - 63 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 5.3 7.4 8.0 8.7 11.8
bt_pheno$nSHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSHb)), main="Normalized SHb")
polygon(density(na.omit(bt_pheno$nSHb)), col = "lightblue")
# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SHb"))])
hb_plot <- qplot(AGE, SHb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.30 7.40 8.00 8.08 8.70 11.80 4
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb")
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")
# removing outliers
abline(v=30, lwd=1, lty=2)
bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 30, "NA", bt_pheno$Hb))
plot(density(na.omit(bt_pheno$Hb)), main="Hb (>30) outlier pruned")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
BNphen ## orderNorm Transformation with 959 nonmissing obs and ties
## - 69 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 4.4 7.4 8.0 8.8 24.0
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb")
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")
# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])
hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.40 7.40 8.00 8.18 8.80 24.00 19
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF, main="HbF", xlab="HbF")
plot(density(na.omit(bt_pheno$HbF)), main="HbF")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")
# removing outliers
abline(v=c(0,70), lwd=1, lty=2)
bt_pheno$HbF <- as.numeric(
ifelse(bt_pheno$HbF > 70, "NA",
ifelse(bt_pheno$HbF == 0, "NA", bt_pheno$HbF)
)
)
plot(density(na.omit(bt_pheno$HbF)), main="HbF (>70) outlier pruned")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$HbF)
BNphen ## orderNorm Transformation with 931 nonmissing obs and ties
## - 47 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1 6 11 17 69
bt_pheno$nHbF <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHbF)), main="Normalized HbF")
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")
# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF <- (bt_pheno$HbF)^(1/3)
plot(density(na.omit(bt_pheno$nHbF)), main="Cubic root transformed HbF")
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")
# fit smooth spline for Age against Hb
HbF_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF"))])
HbF_plot <- qplot(AGE, HbF, data = HbF_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 6.00 11.00 12.82 17.00 69.00 47
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF2, main="HbF (Hopkins)", xlab="HbF")
plot(density(na.omit(bt_pheno$HbF2)), main="HbF (Hopkins)")
polygon(density(na.omit(bt_pheno$HbF2)), col = "coral")
# removing outliers
#abline(v=50, lwd=1, lty=2)
#bt_pheno$HbF2 <- as.numeric(ifelse(bt_pheno$HbF2 > 50, "NA", bt_pheno$HbF2))
#plot(density(na.omit(bt_pheno$HbF2)), main="HbF (>50) outlier pruned")
#polygon(density(na.omit(bt_pheno$HbF2)), col = "coral")
BNphen <- bestNormalize(bt_pheno$HbF2)
BNphen ## Best Normalizing transformation with 552 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.244
## - Box-Cox: 1.0992
## - Center+scale: 2.1832
## - Double Reversed Log_b(x+a): 3.7168
## - Exp(x): 61.2147
## - Log_b(x+a): 1.3413
## - orderNorm (ORQ): 1.2394
## - sqrt(x + a): 1.2356
## - Yeo-Johnson: 1.1608
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized Box Cox Transformation with 552 nonmissing obs.:
## Estimated statistics:
## - lambda = 0.2890797
## - mean (before standardization) = 2.730489
## - sd (before standardization) = 1.161546
bt_pheno$nHbF2 <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHbF2)), main="Normalized HbF2")
polygon(density(na.omit(bt_pheno$nHbF2)), col = "lightblue")
# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF2 <- (bt_pheno$HbF2)^(1/3)
plot(density(na.omit(bt_pheno$nHbF2)), main="Cubic root transformed HbF")
polygon(density(na.omit(bt_pheno$nHbF2)), col = "lightblue")
# fit smooth spline for Age against Hb
HbF2_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF2"))])
HbF2_plot <- qplot(AGE, HbF2, data = HbF2_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF2_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.600 4.600 7.600 8.605 11.500 30.900 426
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$FCELLS, main="FCELLS", xlab="FCELLS")
plot(density(na.omit(bt_pheno$FCELLS)), main="FCELLS")
polygon(density(na.omit(bt_pheno$FCELLS)), col = "coral")
# removing outliers
abline(v=0, lwd=1, lty=2)
bt_pheno$FCELLS <- as.numeric(ifelse(bt_pheno$FCELLS == 0, "NA", bt_pheno$FCELLS))
plot(density(na.omit(bt_pheno$FCELLS)), main="FCELLS (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$FCELLS)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$FCELLS)
BNphen ## orderNorm Transformation with 802 nonmissing obs and ties
## - 578 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1.00 27.00 40.40 55.47 96.09
bt_pheno$nFCELLS <- BNphen$x.t
plot(density(na.omit(bt_pheno$nFCELLS)), main="Normalized FCELLS")
polygon(density(na.omit(bt_pheno$nFCELLS)), col = "lightblue")
# fit smooth spline for Age against Hb
FCELLS_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","FCELLS"))])
FCELLS_plot <- qplot(AGE, FCELLS, data = FCELLS_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
FCELLS_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 27.00 40.40 41.88 55.47 96.09 176
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCV, main="MCV (fL)")
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")
# removing outliers
# abline(v=55, lwd=1, lty=2)
# bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV < 55, "NA", bt_pheno$MCV))
# plot(density(na.omit(bt_pheno$MCV)), main="MCV (>55) outlier pruned")
# polygon(density(na.omit(bt_pheno$MCV)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
BNphen ## orderNorm Transformation with 969 nonmissing obs and ties
## - 292 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 8.2 78.4 83.4 88.0 104.0
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")
# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])
mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 8.20 78.40 83.40 82.76 88.00 104.00 9
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT")
plot(density(na.omit(bt_pheno$HCT)), main="HCT")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")
# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HTC <- as.numeric(ifelse(bt_pheno$HTC > 43, "NA", bt_pheno$HTC))
#plot(density(na.omit(bt_pheno$HTC)), main="HTC (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HTC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)## Warning in bestNormalize::orderNorm(bt_pheno$HCT): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 971 nonmissing obs and ties
## - 24 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 12 21 23 25 35
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT")
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")
# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])
htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.0 21.0 23.0 23.2 25.0 35.0 7
#Mean Corpuscular Hemoglobin Concentration
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)")
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")
# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
ifelse(bt_pheno$MCHC < 20, "NA",
bt_pheno$MCHC
#ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
)
)## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (<20) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)## Warning in bestNormalize::orderNorm(bt_pheno$MCHC): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 913 nonmissing obs and ties
## - 49 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 21 34 35 36 40
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC")
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")
# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])
mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 34.00 35.00 34.54 36.00 40.00 65
#The Mean Corpuscular Hemoglobin
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)")
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")
# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$MCH <- as.numeric(
# ifelse(bt_pheno$MCH < 10, "NA",
# bt_pheno$MCH
# #ifelse(bt_pheno$MCH < 10, "NA", bt_pheno$MCH)
# )
#)
#plot(density(na.omit(bt_pheno$MCH)), main="MCH (<10) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCH)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
BNphen ## orderNorm Transformation with 954 nonmissing obs and ties
## - 148 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 18.000 26.900 29.100 30.875 37.000
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH")
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")
# fit smooth spline for Age against Hb
tcmh_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])
tcmh_plot <- qplot(AGE, MCH, data = tcmh_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
tcmh_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.00 26.90 29.10 28.84 30.88 37.00 24
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (counts/uL)")
plot(density(na.omit(bt_pheno$WBC)), main="WBC (counts/uL)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")
# removing
abline(v=60000, lwd=1, lty=2)
bt_pheno$WBC <- as.numeric(
ifelse(bt_pheno$WBC > 60000, "NA",
bt_pheno$WBC
#ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
)
)## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$WBC)), main="WBC (>60000) outlier pruned")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")
BNphen <- bestNormalize(bt_pheno$WBC)
BNphen ## Best Normalizing transformation with 960 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 2.2165
## - Box-Cox: 1.6553
## - Center+scale: 1.7284
## - Double Reversed Log_b(x+a): 2.7775
## - Log_b(x+a): 2.2165
## - orderNorm (ORQ): 1.3205
## - sqrt(x + a): 1.567
## - Yeo-Johnson: 1.6553
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 960 nonmissing obs and ties
## - 303 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1055 10000 12000 14700 35000
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC")
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")
# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])
wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1055 10000 12000 12514 14700 35000 18
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL, main="NEUTROPHIL (g/L)")
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (g/L)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")
# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$NEUTROPHIL <- as.numeric(
# ifelse(bt_pheno$NEUTROPHIL > 20, "NA",
# bt_pheno$NEUTROPHIL
# #ifelse(bt_pheno$NEUTROPHIL < 10, "NA", bt_pheno$NEUTROPHIL)
# )
#)
#plot(density(na.omit(bt_pheno$NEUTROPHILE)), main="NEUTROPHIL (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$NEUTROPHILE)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL)## Warning in bestNormalize::orderNorm(bt_pheno$NEUTROPHIL): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 914 nonmissing obs and ties
## - 86 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1 37 46 56 92
bt_pheno$nNEUTROPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL)), main="Normalized NEUTROPHIL")
polygon(density(na.omit(bt_pheno$nNEUTROPHIL)), col = "lightblue")
# fit smooth spline for Age against NEUTROPHILE
neu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL"))])
neu_plot <- qplot(AGE, NEUTROPHIL, data = neu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neu_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 37.00 46.00 45.39 56.00 92.00 64
Greater than 10 BANDS is associated with poor disease outcome, septic shock, increased 30day mortality, bacteremia, etc. https://www.sciencedirect.com/science/article/pii/S0735675720310767
Binarize such that <=10 is control and >10 is case
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BANDS, main="BANDS (% neutrophil)")
plot(density(na.omit(bt_pheno$BANDS)), main="BANDS (% neutrophil)")
polygon(density(na.omit(bt_pheno$BANDS)), col = "coral")
BNphen <- bestNormalize(bt_pheno$BANDS)
BNphen ## Best Normalizing transformation with 725 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 51.2912
## - Center+scale: 54.6382
## - Double Reversed Log_b(x+a): 57.2668
## - Exp(x): 28.2585
## - Log_b(x+a): 53.331
## - orderNorm (ORQ): 51.7743
## - sqrt(x + a): 51.0067
## - Yeo-Johnson: 53.992
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized exp(x) Transformation with 725 nonmissing obs.:
## Relevant statistics:
## - mean (before standardization) = 1.003578e+40
## - sd (before standardization) = 1.909436e+41
bt_pheno$nBANDS <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBANDS)), main="Normalized BANDS")
polygon(density(na.omit(bt_pheno$nBANDS)), col = "lightblue")
# PLINK CASE-CONTROL
bt_pheno$bBANDS <- as.numeric(
ifelse(bt_pheno$BANDS <= 10, 1,
#bt_pheno$BASOPHILE
ifelse(bt_pheno$BANDS > 10, 2, "NA")
)
)
# SAIGE CASE-CONTROL
bt_pheno$cBANDS <- as.numeric(
ifelse(bt_pheno$BANDS <= 10, 0,
#bt_pheno$BASOPHILE
ifelse(bt_pheno$BANDS > 10, 1, "NA")
)
)
hist(bt_pheno$cBANDS, main="binarized")
# fit smooth spline for Age against BANDS
eu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BANDS"))])
eu_plot <- qplot(AGE, BANDS, data = eu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eu_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 1.179 0.000 98.000 253
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (K/cu mm)")
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (K/cu mm)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)## Warning in bestNormalize::orderNorm(bt_pheno$PLATELET): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 952 nonmissing obs and ties
## - 438 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1000 364750 443500 526250 997000
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET")
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")
# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])
pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1000 364750 443500 450453 526250 997000 26
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MPV, main="MPV")
plot(density(na.omit(bt_pheno$MPV)), main="MPV")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral")
# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MPV <- as.numeric(
ifelse(bt_pheno$MPV > 20, "NA",
bt_pheno$MPV
#ifelse(bt_pheno$MPV < 10, "NA", bt_pheno$MPV)
)
)## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MPV)), main="MPV (>20) outlier pruned")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MPV)## Warning in bestNormalize::orderNorm(bt_pheno$MPV): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 783 nonmissing obs and ties
## - 67 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 5.0 8.0 9.0 9.7 13.7
bt_pheno$nMPV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMPV)), main="Normalized MPV")
polygon(density(na.omit(bt_pheno$nMPV)), col = "lightblue")
# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MPV"))])
pla_plot <- qplot(AGE, MPV, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.000 8.000 9.000 8.876 9.700 13.700 195
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTE, main="RETICULOCYTE")
plot(density(na.omit(bt_pheno$RETICULOCYTE)), main="RETICULOCYTE")
polygon(density(na.omit(bt_pheno$RETICULOCYTE)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTE)## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTE): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 957 nonmissing obs and ties
## - 250 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.50 8.53 11.51 15.20 40.00
bt_pheno$nRETICULOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTE)), main="Normalized RETICULOCYTE")
polygon(density(na.omit(bt_pheno$nRETICULOCYTE)), col = "lightblue")
# fit smooth spline for Age against PLATELET
ret_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTE"))])
ret_plot <- qplot(AGE, RETICULOCYTE, data = ret_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
ret_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.50 8.53 11.51 12.20 15.20 40.00 21
# RETICULOCYTES /uL
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICS, main="RETICS")
plot(density(na.omit(bt_pheno$RETICS)), main="RETICS")
polygon(density(na.omit(bt_pheno$RETICS)), col = "coral")
# removing outliers
#abline(v=1000000, lwd=1, lty=2)
#bt_pheno$RETICS <- as.numeric(
# ifelse(bt_pheno$RETICS > 1000000, "NA",
# bt_pheno$RETICS
# #ifelse(bt_pheno$RETICS < 10, "NA", bt_pheno$RETICS)
# )
#)
#plot(density(na.omit(bt_pheno$RETICS)), main="RETICS (>1000000) outlier pruned")
#polygon(density(na.omit(bt_pheno$RETICS)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RETICS)
BNphen ## orderNorm Transformation with 676 nonmissing obs and ties
## - 475 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.000 10.495 17.900 29.625 82.800
bt_pheno$nRETICS <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICS)), main="Normalized RETICS")
polygon(density(na.omit(bt_pheno$nRETICS)), col = "lightblue")
# fit smooth spline for Age against RETICS
rets_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICS"))])
rets_plot <- qplot(AGE, RETICS, data = rets_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rets_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 10.49 17.90 21.14 29.62 82.80 302
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SBP, main="SBP")
plot(density(na.omit(bt_pheno$SBP)), main="SBP")
polygon(density(na.omit(bt_pheno$SBP)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$SBP)
BNphen ## orderNorm Transformation with 971 nonmissing obs and ties
## - 70 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 66 100 107 115 151
bt_pheno$nSBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSBP)), main="Normalized SBP")
polygon(density(na.omit(bt_pheno$nSBP)), col = "lightblue")
# fit smooth spline for Age against RETICS
sbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SBP"))])
sbp_plot <- qplot(AGE, SBP, data = sbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
sbp_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 66 100 107 108 115 151 7
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$DBP, main="DBP")
plot(density(na.omit(bt_pheno$DBP)), main="DBP")
polygon(density(na.omit(bt_pheno$DBP)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$DBP)
BNphen ## orderNorm Transformation with 970 nonmissing obs and ties
## - 51 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 34 55 60 65 106
bt_pheno$nDBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nDBP)), main="Normalized DBP")
polygon(density(na.omit(bt_pheno$nDBP)), col = "lightblue")
# fit smooth spline for Age against DBP
dbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","DBP"))])
dbp_plot <- qplot(AGE, DBP, data = dbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
dbp_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 34.00 55.00 60.00 60.23 65.00 106.00 8
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI")
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")
# removing outliers
abline(v=50, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
ifelse(bt_pheno$BMI > 50, "NA",
bt_pheno$BMI
#ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
)
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>50) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen ## orderNorm Transformation with 969 nonmissing obs and ties
## - 388 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 6.90 14.95 16.01 17.34 39.02
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI")
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")
# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])
bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 34.00 55.00 60.00 60.23 65.00 106.00 8