Back

1 Load package

list.of.packages <- c("tidyverse","bestNormalize")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) 
  install.packages(new.packages)
lapply(list.of.packages , require, character.only = T)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
#if(!lapply(pkgs, require, character.only = T)) {
#  install.packages(p)
#  require(bestNormalize)
#}

2 Load phenotype data and extract blood traits with relevant additional columns

https://www.hematology.org/education/trainees/fellows/hematopoiesis/2021/hemoglobin-electrophoresis-in-sickle-cell-disease

pheno <- read.table(params$data, sep="\t", h=T)

bt_pheno = pheno

3 Demographic data

3.1 AGE

summary(bt_pheno$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.030   6.750   8.740   8.954  10.910  14.990
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE")

#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)

3.2 SEX

bt_pheno %>% 
   select(SEX) %>% 
   table() %>% 
   as.data.frame() %>% 
   na.omit() %>%
   mutate(Prop = (Freq/sum(Freq))*100) -> sex       # Sex

sex
colnames(sex) <- c("cat", "freq", "prop")
par(mar=c(4,4,3,4))
barplot(
   sex$prop, 
   ylim=c(0, 60), 
   col=c(2,4)
)
text(
   x=c(0.7,1.9), 
   y=30, 
   labels=c("Female", "Male")
)
mtext(
   text = "Proportion (%)",
   side = 2,
   line = 3
)
mtext(
   text = "Sex",
   side = 1,
   line = 1
)

4 Describe phenotypes and normalize if needed

4.1 RBC

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/uL)") 
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/uL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

# removing outliers
abline(v=8, lwd=1, lty=2)
bt_pheno$RBC <- as.numeric(ifelse(bt_pheno$RBC > 8, "NA", bt_pheno$RBC))
plot(density(na.omit(bt_pheno$RBC)), main="RBC (>8) outlier pruned")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
BNphen 
## orderNorm Transformation with 953 nonmissing obs and ties
##  - 42 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  1.0  2.4  2.7  3.1  5.0
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC") 
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")

# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])

rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.400   2.700   2.795   3.100   5.000      25

4.2 SHb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SHb, main="Hb (g/dL)", xlab="Steady state Hb") 
plot(density(na.omit(bt_pheno$SHb)), main="Steady state Hb (g/dL)")
polygon(density(na.omit(bt_pheno$SHb)), col = "coral")

# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$SHb)
BNphen 
## orderNorm Transformation with 974 nonmissing obs and ties
##  - 63 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  5.3  7.4  8.0  8.7 11.8
bt_pheno$nSHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSHb)), main="Normalized SHb") 
polygon(density(na.omit(bt_pheno$nSHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SHb"))])

hb_plot <- qplot(AGE, SHb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$SHb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.30    7.40    8.00    8.08    8.70   11.80       4

4.3 Hb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb") 
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

# removing outliers
abline(v=30, lwd=1, lty=2)
bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 30, "NA", bt_pheno$Hb))
plot(density(na.omit(bt_pheno$Hb)), main="Hb (>30) outlier pruned")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
BNphen 
## orderNorm Transformation with 959 nonmissing obs and ties
##  - 69 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  4.4  7.4  8.0  8.8 24.0
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb") 
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])

hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$Hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.40    7.40    8.00    8.18    8.80   24.00      19

4.4 HbF

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF, main="HbF", xlab="HbF") 
plot(density(na.omit(bt_pheno$HbF)), main="HbF")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

# removing outliers
abline(v=c(0,70), lwd=1, lty=2)
bt_pheno$HbF <- as.numeric(
  ifelse(bt_pheno$HbF > 70, "NA", 
    ifelse(bt_pheno$HbF == 0, "NA", bt_pheno$HbF)
  )
)
plot(density(na.omit(bt_pheno$HbF)), main="HbF (>70) outlier pruned")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HbF)
BNphen 
## orderNorm Transformation with 931 nonmissing obs and ties
##  - 47 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##    1    6   11   17   69
bt_pheno$nHbF <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHbF)), main="Normalized HbF") 
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")

# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF <- (bt_pheno$HbF)^(1/3)
plot(density(na.omit(bt_pheno$nHbF)), main="Cubic root transformed HbF") 
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")

# fit smooth spline for Age against Hb
HbF_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF"))])

HbF_plot <- qplot(AGE, HbF, data = HbF_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HbF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    6.00   11.00   12.82   17.00   69.00      47

4.5 HbF (Hopkins)

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF2, main="HbF (Hopkins)", xlab="HbF") 
plot(density(na.omit(bt_pheno$HbF2)), main="HbF (Hopkins)")
polygon(density(na.omit(bt_pheno$HbF2)), col = "coral")

# removing outliers
#abline(v=50, lwd=1, lty=2)
#bt_pheno$HbF2 <- as.numeric(ifelse(bt_pheno$HbF2 > 50, "NA", bt_pheno$HbF2))
#plot(density(na.omit(bt_pheno$HbF2)), main="HbF (>50) outlier pruned")
#polygon(density(na.omit(bt_pheno$HbF2)), col = "coral")

BNphen <- bestNormalize(bt_pheno$HbF2)
BNphen 
## Best Normalizing transformation with 552 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.244
##  - Box-Cox: 1.0992
##  - Center+scale: 2.1832
##  - Double Reversed Log_b(x+a): 3.7168
##  - Exp(x): 61.2147
##  - Log_b(x+a): 1.3413
##  - orderNorm (ORQ): 1.2394
##  - sqrt(x + a): 1.2356
##  - Yeo-Johnson: 1.1608
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized Box Cox Transformation with 552 nonmissing obs.:
##  Estimated statistics:
##  - lambda = 0.2890797 
##  - mean (before standardization) = 2.730489 
##  - sd (before standardization) = 1.161546
bt_pheno$nHbF2 <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHbF2)), main="Normalized HbF2") 
polygon(density(na.omit(bt_pheno$nHbF2)), col = "lightblue")

# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF2 <- (bt_pheno$HbF2)^(1/3)
plot(density(na.omit(bt_pheno$nHbF2)), main="Cubic root transformed HbF") 
polygon(density(na.omit(bt_pheno$nHbF2)), col = "lightblue")

# fit smooth spline for Age against Hb
HbF2_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF2"))])

HbF2_plot <- qplot(AGE, HbF2, data = HbF2_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF2_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HbF2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.600   4.600   7.600   8.605  11.500  30.900     426

4.6 Fcells

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$FCELLS, main="FCELLS", xlab="FCELLS") 
plot(density(na.omit(bt_pheno$FCELLS)), main="FCELLS")
polygon(density(na.omit(bt_pheno$FCELLS)), col = "coral")

# removing outliers
abline(v=0, lwd=1, lty=2)
bt_pheno$FCELLS <- as.numeric(ifelse(bt_pheno$FCELLS == 0, "NA", bt_pheno$FCELLS))
plot(density(na.omit(bt_pheno$FCELLS)), main="FCELLS (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$FCELLS)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$FCELLS)
BNphen 
## orderNorm Transformation with 802 nonmissing obs and ties
##  - 578 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  1.00 27.00 40.40 55.47 96.09
bt_pheno$nFCELLS <- BNphen$x.t
plot(density(na.omit(bt_pheno$nFCELLS)), main="Normalized FCELLS") 
polygon(density(na.omit(bt_pheno$nFCELLS)), col = "lightblue")

# fit smooth spline for Age against Hb
FCELLS_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","FCELLS"))])

FCELLS_plot <- qplot(AGE, FCELLS, data = FCELLS_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
FCELLS_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$FCELLS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   27.00   40.40   41.88   55.47   96.09     176

4.7 MCV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCV, main="MCV (fL)") 
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

# removing outliers
# abline(v=55, lwd=1, lty=2)
# bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV < 55, "NA", bt_pheno$MCV))
# plot(density(na.omit(bt_pheno$MCV)), main="MCV (>55) outlier pruned")
# polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
BNphen 
## orderNorm Transformation with 969 nonmissing obs and ties
##  - 292 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##   8.2  78.4  83.4  88.0 104.0
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")

# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])

mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    8.20   78.40   83.40   82.76   88.00  104.00       9

4.8 HCT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT") 
plot(density(na.omit(bt_pheno$HCT)), main="HCT")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HTC <- as.numeric(ifelse(bt_pheno$HTC > 43, "NA", bt_pheno$HTC))
#plot(density(na.omit(bt_pheno$HTC)), main="HTC (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HTC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
## Warning in bestNormalize::orderNorm(bt_pheno$HCT): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 971 nonmissing obs and ties
##  - 24 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   12   21   23   25   35
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT") 
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")

# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])

htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HCT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    12.0    21.0    23.0    23.2    25.0    35.0       7

4.9 MCHC

#Mean Corpuscular Hemoglobin Concentration

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)") 
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
  ifelse(bt_pheno$MCHC < 20, "NA", 
    bt_pheno$MCHC
    #ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (<20) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
## Warning in bestNormalize::orderNorm(bt_pheno$MCHC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 913 nonmissing obs and ties
##  - 49 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   21   34   35   36   40
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC") 
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")

# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])

mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCHC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   34.00   35.00   34.54   36.00   40.00      65

4.10 MCH

#The Mean Corpuscular Hemoglobin

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)") 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$MCH <- as.numeric(
#  ifelse(bt_pheno$MCH < 10, "NA", 
#    bt_pheno$MCH
#    #ifelse(bt_pheno$MCH < 10, "NA", bt_pheno$MCH)
#  )
#)
#plot(density(na.omit(bt_pheno$MCH)), main="MCH (<10) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
BNphen 
## orderNorm Transformation with 954 nonmissing obs and ties
##  - 148 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
## 18.000 26.900 29.100 30.875 37.000
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH") 
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")

# fit smooth spline for Age against Hb
tcmh_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])

tcmh_plot <- qplot(AGE, MCH, data = tcmh_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
tcmh_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   26.90   29.10   28.84   30.88   37.00      24

4.11 WBC

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (counts/uL)") 
plot(density(na.omit(bt_pheno$WBC)), main="WBC (counts/uL)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

# removing
abline(v=60000, lwd=1, lty=2)
bt_pheno$WBC <- as.numeric(
  ifelse(bt_pheno$WBC > 60000, "NA", 
    bt_pheno$WBC
    #ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$WBC)), main="WBC (>60000) outlier pruned")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

BNphen <- bestNormalize(bt_pheno$WBC)
BNphen 
## Best Normalizing transformation with 960 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 2.2165
##  - Box-Cox: 1.6553
##  - Center+scale: 1.7284
##  - Double Reversed Log_b(x+a): 2.7775
##  - Log_b(x+a): 2.2165
##  - orderNorm (ORQ): 1.3205
##  - sqrt(x + a): 1.567
##  - Yeo-Johnson: 1.6553
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 960 nonmissing obs and ties
##  - 303 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  1055 10000 12000 14700 35000
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC") 
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")

# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])

wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$WBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1055   10000   12000   12514   14700   35000      18

4.12 NEUTROPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL, main="NEUTROPHIL (g/L)") 
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (g/L)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$NEUTROPHIL <- as.numeric(
#  ifelse(bt_pheno$NEUTROPHIL > 20, "NA", 
#    bt_pheno$NEUTROPHIL
#    #ifelse(bt_pheno$NEUTROPHIL < 10, "NA", bt_pheno$NEUTROPHIL)
#  )
#)
#plot(density(na.omit(bt_pheno$NEUTROPHILE)), main="NEUTROPHIL (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$NEUTROPHILE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL)
## Warning in bestNormalize::orderNorm(bt_pheno$NEUTROPHIL): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 914 nonmissing obs and ties
##  - 86 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##    1   37   46   56   92
bt_pheno$nNEUTROPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL)), main="Normalized NEUTROPHIL") 
polygon(density(na.omit(bt_pheno$nNEUTROPHIL)), col = "lightblue")

# fit smooth spline for Age against NEUTROPHILE
neu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL"))])

neu_plot <- qplot(AGE, NEUTROPHIL, data = neu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$NEUTROPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   37.00   46.00   45.39   56.00   92.00      64

4.13 BANDS (Immature neutrophils)

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BANDS, main="BANDS (% neutrophil)") 
plot(density(na.omit(bt_pheno$BANDS)), main="BANDS (% neutrophil)")
polygon(density(na.omit(bt_pheno$BANDS)), col = "coral")

BNphen <- bestNormalize(bt_pheno$BANDS)
BNphen 
## Best Normalizing transformation with 725 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 51.2912
##  - Center+scale: 54.6382
##  - Double Reversed Log_b(x+a): 57.2668
##  - Exp(x): 28.2585
##  - Log_b(x+a): 53.331
##  - orderNorm (ORQ): 51.7743
##  - sqrt(x + a): 51.0067
##  - Yeo-Johnson: 53.992
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized exp(x) Transformation with 725 nonmissing obs.:
##  Relevant statistics:
##  - mean (before standardization) = 1.003578e+40 
##  - sd (before standardization) = 1.909436e+41
bt_pheno$nBANDS <- BNphen$x.t

plot(density(na.omit(bt_pheno$nBANDS)), main="Normalized BANDS") 
polygon(density(na.omit(bt_pheno$nBANDS)), col = "lightblue")

# PLINK CASE-CONTROL
bt_pheno$bBANDS <- as.numeric(
  ifelse(bt_pheno$BANDS <= 10, 1, 
    #bt_pheno$BASOPHILE
    ifelse(bt_pheno$BANDS > 10, 2, "NA")
  )
)

# SAIGE CASE-CONTROL
bt_pheno$cBANDS <- as.numeric(
  ifelse(bt_pheno$BANDS <= 10, 0, 
    #bt_pheno$BASOPHILE
    ifelse(bt_pheno$BANDS > 10, 1, "NA")
  )
)

hist(bt_pheno$cBANDS, main="binarized")

# fit smooth spline for Age against BANDS
eu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BANDS"))])

eu_plot <- qplot(AGE, BANDS, data = eu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$BANDS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   1.179   0.000  98.000     253

4.14 PLATELET

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (K/cu mm)") 
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (K/cu mm)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
## Warning in bestNormalize::orderNorm(bt_pheno$PLATELET): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 952 nonmissing obs and ties
##  - 438 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##   1000 364750 443500 526250 997000
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET") 
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])

pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$PLATELET)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1000  364750  443500  450453  526250  997000      26

4.15 MPV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MPV, main="MPV") 
plot(density(na.omit(bt_pheno$MPV)), main="MPV")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral") 

# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MPV <- as.numeric(
  ifelse(bt_pheno$MPV > 20, "NA", 
    bt_pheno$MPV
    #ifelse(bt_pheno$MPV < 10, "NA", bt_pheno$MPV)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MPV)), main="MPV (>20) outlier pruned")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MPV)
## Warning in bestNormalize::orderNorm(bt_pheno$MPV): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 783 nonmissing obs and ties
##  - 67 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  5.0  8.0  9.0  9.7 13.7
bt_pheno$nMPV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMPV)), main="Normalized MPV") 
polygon(density(na.omit(bt_pheno$nMPV)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MPV"))])

pla_plot <- qplot(AGE, MPV, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MPV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   5.000   8.000   9.000   8.876   9.700  13.700     195

4.16 RETICULOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTE, main="RETICULOCYTE") 
plot(density(na.omit(bt_pheno$RETICULOCYTE)), main="RETICULOCYTE")
polygon(density(na.omit(bt_pheno$RETICULOCYTE)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 957 nonmissing obs and ties
##  - 250 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.50  8.53 11.51 15.20 40.00
bt_pheno$nRETICULOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTE)), main="Normalized RETICULOCYTE") 
polygon(density(na.omit(bt_pheno$nRETICULOCYTE)), col = "lightblue")

# fit smooth spline for Age against PLATELET
ret_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTE"))])

ret_plot <- qplot(AGE, RETICULOCYTE, data = ret_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
ret_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICULOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.50    8.53   11.51   12.20   15.20   40.00      21

4.17 RETICS (% HbF Reticulocytes measured at Hopkins)

# RETICULOCYTES /uL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICS, main="RETICS") 
plot(density(na.omit(bt_pheno$RETICS)), main="RETICS")
polygon(density(na.omit(bt_pheno$RETICS)), col = "coral")

# removing outliers
#abline(v=1000000, lwd=1, lty=2)
#bt_pheno$RETICS <- as.numeric(
#  ifelse(bt_pheno$RETICS > 1000000, "NA", 
#    bt_pheno$RETICS
#    #ifelse(bt_pheno$RETICS < 10, "NA", bt_pheno$RETICS)
#  )
#)
#plot(density(na.omit(bt_pheno$RETICS)), main="RETICS (>1000000) outlier pruned")
#polygon(density(na.omit(bt_pheno$RETICS)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RETICS)
BNphen 
## orderNorm Transformation with 676 nonmissing obs and ties
##  - 475 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  0.000 10.495 17.900 29.625 82.800
bt_pheno$nRETICS <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICS)), main="Normalized RETICS") 
polygon(density(na.omit(bt_pheno$nRETICS)), col = "lightblue")

# fit smooth spline for Age against RETICS
rets_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICS"))])

rets_plot <- qplot(AGE, RETICS, data = rets_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rets_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   10.49   17.90   21.14   29.62   82.80     302

4.18 SBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SBP, main="SBP") 
plot(density(na.omit(bt_pheno$SBP)), main="SBP")
polygon(density(na.omit(bt_pheno$SBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$SBP)
BNphen 
## orderNorm Transformation with 971 nonmissing obs and ties
##  - 70 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   66  100  107  115  151
bt_pheno$nSBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSBP)), main="Normalized SBP") 
polygon(density(na.omit(bt_pheno$nSBP)), col = "lightblue")

# fit smooth spline for Age against RETICS
sbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SBP"))])

sbp_plot <- qplot(AGE, SBP, data = sbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
sbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$SBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      66     100     107     108     115     151       7

4.19 DBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$DBP, main="DBP") 
plot(density(na.omit(bt_pheno$DBP)), main="DBP")
polygon(density(na.omit(bt_pheno$DBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$DBP)
BNphen 
## orderNorm Transformation with 970 nonmissing obs and ties
##  - 51 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   34   55   60   65  106
bt_pheno$nDBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nDBP)), main="Normalized DBP") 
polygon(density(na.omit(bt_pheno$nDBP)), col = "lightblue")

# fit smooth spline for Age against DBP
dbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","DBP"))])

dbp_plot <- qplot(AGE, DBP, data = dbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
dbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34.00   55.00   60.00   60.23   65.00  106.00       8

4.20 BMI

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI") 
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

# removing outliers
abline(v=50, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
  ifelse(bt_pheno$BMI > 50, "NA", 
    bt_pheno$BMI
    #ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
  )
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>50) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen 
## orderNorm Transformation with 969 nonmissing obs and ties
##  - 388 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  6.90 14.95 16.01 17.34 39.02
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI") 
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")

# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])

bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34.00   55.00   60.00   60.23   65.00  106.00       8

5 Save new pheno to file

#head(bt_pheno[, -c(3:4)])

outf <- paste0(
  params$data,
  "_new_pheno.tsv"
)

write.table(
  bt_pheno,
  outf,
  col.names=T,
  row.names=F,
  quote=F,
  sep="\t"
)