Describe phenotypes and
normalize if needed
RBC
# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/uL)")
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/uL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
## Warning in bestNormalize::orderNorm(bt_pheno$RBC): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1210 nonmissing obs and ties
## - 299 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.840 2.262 2.690 3.110 6.480
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC")
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")
# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])
rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.840 2.263 2.690 2.730 3.110 6.480 23




Hb
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb")
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")
# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
## Warning in bestNormalize::orderNorm(bt_pheno$Hb): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1212 nonmissing obs and ties
## - 92 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 2.9 6.7 7.6 8.5 14.5
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb")
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")
# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])
hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.900 6.700 7.600 7.678 8.500 14.500 21




HbF
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF, main="HbF (%)", xlab="HbF")
plot(density(na.omit(bt_pheno$HbF)), main="HbF")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")
# removing outliers
abline(v=0, lwd=1, lty=2)
bt_pheno$HbF <- as.numeric(ifelse(bt_pheno$HbF == 0, "NA", bt_pheno$HbF))
plot(density(na.omit(bt_pheno$HbF)), main="HbF (0) outlier pruned")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$HbF)
BNphen
## orderNorm Transformation with 1106 nonmissing obs and ties
## - 261 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.800 4.825 8.800 14.000 40.300
bt_pheno$onHbF <- BNphen$x.t
plot(density(na.omit(bt_pheno$onHbF)), main="Normalized HbF")
polygon(density(na.omit(bt_pheno$onHbF)), col = "lightblue")
# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF <- (bt_pheno$HbF)^(1/3)
plot(density(na.omit(bt_pheno$nHbF)), main="Cubic root transformed HbF")
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")
# fit smooth spline for Age against Hb
HbF_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF"))])
HbF_plot <- qplot(AGE, HbF, data = HbF_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.800 4.825 8.800 10.221 14.000 40.300 127






MCV
par(mar = c(4, 4, 2, 2))
hist(pheno$MCV, main="MCV (fL)")
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")
# removing outliers
#abline(v=120, lwd=1, lty=2)
#bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV > 120, "NA", bt_pheno$MCV))
#plot(density(na.omit(bt_pheno$MCV)), main="MCV (>120) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCV)), col = "coral")
BNphen <- bestNormalize(bt_pheno$MCV)
BNphen
## Best Normalizing transformation with 1212 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.7603
## - Box-Cox: 1.5548
## - Center+scale: 1.4957
## - Double Reversed Log_b(x+a): 1.8836
## - Exp(x): 137.5572
## - Log_b(x+a): 1.7603
## - orderNorm (ORQ): 1.6575
## - sqrt(x + a): 1.6329
## - Yeo-Johnson: 1.5548
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## center_scale(x) Transformation with 1212 nonmissing obs.
## Estimated statistics:
## - mean (before standardization) = 87.24282
## - sd (before standardization) = 10.16145
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
## Warning in bestNormalize::orderNorm(bt_pheno$MCV): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1212 nonmissing obs and ties
## - 72 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 51 81 88 94 133
bt_pheno$onMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$onMCV)), main="Normalized (orderNorm) MCV")
polygon(density(na.omit(bt_pheno$onMCV)), col = "lightblue")
# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])
mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 51.00 81.00 88.00 87.24 94.00 133.00 21





HCT
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT")
plot(density(na.omit(bt_pheno$HCT)), main="HCT")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")
# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HCT <- as.numeric(ifelse(bt_pheno$HCT > 43, "NA", bt_pheno$HCT))
#plot(density(na.omit(bt_pheno$HCT)), main="HCT (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HCT)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
## Warning in bestNormalize::orderNorm(bt_pheno$HCT): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 742 nonmissing obs and ties
## - 197 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 9.2 20.8 23.5 26.5 46.4
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT")
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")
# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])
htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 9.2 20.8 23.5 23.7 26.5 46.4 491




MCHC
#Mean Corpuscular Hemoglobin Concentration
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)")
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")
# removing
abline(v=c(10, 60), lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
ifelse(bt_pheno$MCHC > 60, "NA",
#bt_pheno$MCHC
ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (>60 and <10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
## Warning in bestNormalize::orderNorm(bt_pheno$MCHC): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1178 nonmissing obs and ties
## - 111 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 11.9 26.0 31.0 33.0 54.2
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC")
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")
# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])
mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.90 26.00 31.00 29.69 33.00 54.20 55





MCH
#The Mean Corpuscular Hemoglobin
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)")
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")
# removing outliers
abline(v=10, lwd=1, lty=2)
bt_pheno$MCH <- as.numeric(
ifelse(bt_pheno$MCH < 10, "NA",
bt_pheno$MCH
#ifelse(bt_pheno$MCH < 10, "NA", bt_pheno$MCH)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCH)), main="MCH (<10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
## Warning in bestNormalize::orderNorm(bt_pheno$MCH): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 741 nonmissing obs and ties
## - 36 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 13 26 29 31 44
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH")
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")
# fit smooth spline for Age against Hb
MCH_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])
MCH_plot <- qplot(AGE, MCH, data = MCH_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
MCH_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 26.00 29.00 28.53 31.00 44.00 492





WBC
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (x10^9 cells/L)")
plot(density(na.omit(bt_pheno$WBC)), main="WBC (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")
# removing
abline(v=60, lwd=1, lty=2)
bt_pheno$WBC <- as.numeric(
ifelse(bt_pheno$WBC > 60, "NA",
bt_pheno$WBC
#ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$WBC)), main="WBC (>60) outlier pruned")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$WBC)
## Warning in bestNormalize::orderNorm(bt_pheno$WBC): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1210 nonmissing obs and ties
## - 277 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.40 8.80 11.25 14.50 49.80
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC")
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")
# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])
wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.40 8.80 11.25 12.09 14.50 49.80 23





LYMPHOCYTE
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE, main="LYMPHOCYTE (x10^9 cells/L)")
plot(density(na.omit(bt_pheno$LYMPHOCYTE)), main="LYMPHOCYTE (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1211 nonmissing obs and ties
## - 418 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.100 3.600 4.800 6.405 22.600
bt_pheno$nLYMPHOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE)), main="Normalized LYMPHOCYTE")
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE)), col = "lightblue")
# fit smooth spline for Age against Hb
lym_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE"))])
lym_plot <- qplot(AGE, LYMPHOCYTE, data = lym_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lym_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$LYMPHOCYTE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.100 3.600 4.800 5.331 6.405 22.600 22




MONOCYTE
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE, main="MONOCYTE (x10^9 cells/L)")
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")
# removing
abline(v=0, lwd=1, lty=2)
bt_pheno$MONOCYTE <- as.numeric(
ifelse(bt_pheno$MONOCYTE == 0, "NA",
bt_pheno$MONOCYTE
#ifelse(bt_pheno$MONOCYTE == 0, "NA", bt_pheno$MONOCYTE)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$MONOCYTE): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1207 nonmissing obs and ties
## - 233 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.10 0.70 1.00 1.45 9.40
bt_pheno$nMONOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE)), main="Normalized MONOCYTE")
polygon(density(na.omit(bt_pheno$nMONOCYTE)), col = "lightblue")
# fit smooth spline for Age against Hb
mon_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE"))])
mon_plot <- qplot(AGE, MONOCYTE, data = mon_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mon_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MONOCYTE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.10 0.70 1.00 1.21 1.45 9.40 26





NEUTROPHIL
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL, main="NEUTROPHIL (g/L)")
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (g/L)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")
# removing
abline(v=0, lwd=1, lty=2)
bt_pheno$NEUTROPHIL <- as.numeric(
ifelse(bt_pheno$NEUTROPHIL == 0, "NA",
bt_pheno$NEUTROPHIL
#ifelse(bt_pheno$NEUTROPHIL < 10, "NA", bt_pheno$NEUTROPHIL)
)
)
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL)
## Warning in bestNormalize::orderNorm(bt_pheno$NEUTROPHIL): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 743 nonmissing obs and ties
## - 139 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.10 2.80 4.20 6.05 66.00
bt_pheno$nNEUTROPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL)), main="Normalized NEUTROPHIL")
polygon(density(na.omit(bt_pheno$nNEUTROPHIL)), col = "lightblue")
# fit smooth spline for Age against NEUTROPHIL
neu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL"))])
neu_plot <- qplot(AGE, NEUTROPHIL, data = neu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$NEUTROPHIL)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.100 2.800 4.200 5.082 6.050 66.000 490





EOSINOPHIL
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$EOSINOPHIL, main="EOSINOPHIL (x10^9 cells/L)")
plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")
# removing
# abline(v=2, lwd=1, lty=2)
# bt_pheno$EOSINOPHIL <- as.numeric(
# ifelse(bt_pheno$EOSINOPHIL > 2, "NA",
# bt_pheno$EOSINOPHIL
# #ifelse(bt_pheno$EOSINOPHIL < 10, "NA", bt_pheno$EOSINOPHIL)
# )
# )
#
# plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (>2) outlier pruned")
# polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")
BNphen <- bestNormalize(bt_pheno$EOSINOPHIL)
BNphen
## Best Normalizing transformation with 740 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 7.9709
## - Center+scale: 10.2261
## - Double Reversed Log_b(x+a): 11.7165
## - Exp(x): 35.7939
## - Log_b(x+a): 9.2617
## - orderNorm (ORQ): 8.1158
## - sqrt(x + a): 7.7237
## - Yeo-Johnson: 8.1742
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized sqrt(x + a) Transformation with 740 nonmissing obs.:
## Relevant statistics:
## - a = 0
## - mean (before standardization) = 0.5289364
## - sd (before standardization) = 0.3187159
bt_pheno$nEOSINOPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nEOSINOPHIL)), main="Normalized EOSINOPHIL")
polygon(density(na.omit(bt_pheno$nEOSINOPHIL)), col = "lightblue")
# fit smooth spline for Age against EOSINOPHIL
eu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","EOSINOPHIL"))])
eu_plot <- qplot(AGE, EOSINOPHIL, data = eu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$EOSINOPHIL)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.1000 0.2000 0.3812 0.4000 5.4000 493




BASOPHIL
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BASOPHIL, main="BASOPHIL")
plot(density(na.omit(bt_pheno$BASOPHIL)), main="BASOPHIL")
polygon(density(na.omit(bt_pheno$BASOPHIL)), col = "coral")
# removing
abline(v=1, lwd=1, lty=2)
bt_pheno$BASOPHIL <- as.numeric(
ifelse(bt_pheno$BASOPHIL > 1, "NA",
bt_pheno$BASOPHIL
#ifelse(bt_pheno$BASOPHIL < 10, "NA", bt_pheno$BASOPHIL)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$BASOPHIL)), main="BASOPHIL (>1) outlier pruned")
polygon(density(na.omit(bt_pheno$BASOPHIL)), col = "coral")
BNphen <- bestNormalize(bt_pheno$BASOPHIL)
BNphen
## Best Normalizing transformation with 734 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 20.4253
## - Center+scale: 20.3933
## - Double Reversed Log_b(x+a): 20.28
## - Exp(x): 20.3063
## - Log_b(x+a): 21.5986
## - orderNorm (ORQ): 20.6695
## - sqrt(x + a): 20.2993
## - Yeo-Johnson: 20.4633
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized double reversed Log_b(x + a) Transformation with 734 nonmissing obs.:
## Relevant statistics:
## - a =
## - b = 10
## - max(x) = 1.1 ; min(x) = -0.1
## - mean (before standardization) = 0.1035505
## - sd (before standardization) = 0.100856
bt_pheno$nBASOPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBASOPHIL)), main="Normalized BASOPHIL")
polygon(density(na.omit(bt_pheno$nBASOPHIL)), col = "lightblue")
# binarize basophil after removing outliers (1-control, 2-case)
# manual (10.1016/j.ajhg.2021.08.007)
bt_pheno$bBASOPHIL <- as.numeric(
ifelse(bt_pheno$BASOPHIL < 0.05, 1,
#bt_pheno$BASOPHIL
ifelse(bt_pheno$BASOPHIL >= 0.05, 2, "NA")
)
)
bt_pheno$cBASOPHIL <- as.numeric(
ifelse(bt_pheno$BASOPHIL < 0.05, 0,
#bt_pheno$BASOPHIL
ifelse(bt_pheno$BASOPHIL >= 0.05, 1, "NA")
)
)
hist(bt_pheno$bBASOPHIL, main="binarized")
# fit smooth spline for Age against BASOPHIL
bas_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BASOPHIL"))])
bas_plot <- qplot(AGE, BASOPHIL, data = bas_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bas_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$BASOPHIL)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.1000 0.1000 0.1368 0.2000 1.0000 499






PLATELET
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (x10^9/L)")
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (x10^9/L)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
## Warning in bestNormalize::orderNorm(bt_pheno$PLATELET): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 1210 nonmissing obs and ties
## - 518 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 22.00 312.00 401.50 499.75 1401.00
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET")
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")
# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])
pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$PLATELET)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 22.0 312.0 401.5 416.1 499.8 1401.0 23




RETICULOCYTE
# RETICULOCYTE PER 1000 RBCs
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTE, main="RETICULOCYTE")
plot(density(na.omit(bt_pheno$RETICULOCYTE)), main="RETICULOCYTE")
polygon(density(na.omit(bt_pheno$RETICULOCYTE)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTE): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 639 nonmissing obs and ties
## - 217 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 8.0 65.0 91.0 131.5 411.0
bt_pheno$nRETICULOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTE)), main="Normalized RETICULOCYTE")
polygon(density(na.omit(bt_pheno$nRETICULOCYTE)), col = "lightblue")
# fit smooth spline for Age against PLATELET
ret_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTE"))])
ret_plot <- qplot(AGE, RETICULOCYTE, data = ret_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
ret_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICULOCYTE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 8.0 65.0 91.0 105.2 131.5 411.0 594




RETICULOCYTES
# RETICULOCYTES /uL
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTES, main="RETICULOCYTES /uL")
plot(density(na.omit(bt_pheno$RETICULOCYTES)), main="RETICULOCYTES /uL")
polygon(density(na.omit(bt_pheno$RETICULOCYTES)), col = "coral")
# removing outliers
abline(v=1000000, lwd=1, lty=2)
bt_pheno$RETICULOCYTES <- as.numeric(
ifelse(bt_pheno$RETICULOCYTES > 1000000, "NA",
bt_pheno$RETICULOCYTES
#ifelse(bt_pheno$RETICULOCYTES < 10, "NA", bt_pheno$RETICULOCYTES)
)
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$RETICULOCYTES)), main="RETICULOCYTES (>1000000) outlier pruned")
polygon(density(na.omit(bt_pheno$RETICULOCYTES)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTES)
## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTES): Ties in data, Normal distribution not guaranteed
## orderNorm Transformation with 637 nonmissing obs and ties
## - 620 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 1363 182820 243540 317790 764460
bt_pheno$nRETICULOCYTES <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTES)), main="Normalized RETICULOCYTES")
polygon(density(na.omit(bt_pheno$nRETICULOCYTES)), col = "lightblue")
# fit smooth spline for Age against RETICULOCYTES
rets_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTES"))])
rets_plot <- qplot(AGE, RETICULOCYTES, data = rets_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rets_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICULOCYTES)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1363 182820 243540 265553 317790 764460 596





SBP
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SBP, main="SBP")
plot(density(na.omit(bt_pheno$SBP)), main="SBP")
polygon(density(na.omit(bt_pheno$SBP)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$SBP)
BNphen
## orderNorm Transformation with 645 nonmissing obs and ties
## - 77 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 67 99 110 117 165
bt_pheno$nSBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSBP)), main="Normalized SBP")
polygon(density(na.omit(bt_pheno$nSBP)), col = "lightblue")
# fit smooth spline for Age against RETICS
sbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SBP"))])
sbp_plot <- qplot(AGE, SBP, data = sbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
sbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 67.0 99.0 110.0 108.5 117.0 165.0 588




DBP
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$DBP, main="DBP")
plot(density(na.omit(bt_pheno$DBP)), main="DBP")
polygon(density(na.omit(bt_pheno$DBP)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$DBP)
BNphen
## orderNorm Transformation with 645 nonmissing obs and ties
## - 67 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 13 59 65 72 107
bt_pheno$nDBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nDBP)), main="Normalized DBP")
polygon(density(na.omit(bt_pheno$nDBP)), col = "lightblue")
# fit smooth spline for Age against DBP
dbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","DBP"))])
dbp_plot <- qplot(AGE, DBP, data = dbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
dbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 59.00 65.00 65.81 72.00 107.00 588




BMI
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI")
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")
# removing outliers
abline(v=40, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
ifelse(bt_pheno$BMI > 40, "NA",
bt_pheno$BMI
#ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
)
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>40) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")
BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen
## orderNorm Transformation with 743 nonmissing obs and ties
## - 599 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 0.002 15.913 17.928 20.422 35.938
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI")
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")
# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])
bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 59.00 65.00 65.81 72.00 107.00 588




