Back

1 Load package

list.of.packages <- c("tidyverse","bestNormalize")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) 
  install.packages(new.packages)
lapply(list.of.packages , require, character.only = T)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
#if(!lapply(pkgs, require, character.only = T)) {
#  install.packages(p)
#  require(bestNormalize)
#}

2 Load phenotype data and extract blood traits with relevant additional columns

pheno <- read.table(params$data, sep="\t", h=T)

keep_colmns <- c(
  "FID","IID","SAMPLEID","GENEMAP_CODE",
  "AGE","SEX", "GENDER","HEIGHT","WEIGHT","BMI","SBP","DBP",
  "STATUS","HbA","HbA2","HbF","nHbF","HbS","HbC",
  "RBC","Hb","MCV","HCT","MCHC","MCH","WBC",
  "LYMPHOCYTE","MONOCYTE","NEUTROPHIL",
  "EOSINOPHIL","BASOPHIL","PLATELET",
  "RETICULOCYTE","RETICULOCYTES"
)

bt_pheno <- pheno[, which(names(pheno) %in% keep_colmns)]

bt_pheno$BMI <- (bt_pheno$WEIGHT)/(bt_pheno$HEIGHT)^2

3 Demographic data

3.1 AGE

summary(bt_pheno$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.67   10.00   16.00   17.60   23.00   66.00      10
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE")

#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)

3.2 SEX

bt_pheno %>% 
   select(SEX) %>% 
   table() %>% 
   as.data.frame() %>% 
   na.omit() %>%
   mutate(Prop = (Freq/sum(Freq))*100) -> sex       # Sex

sex
colnames(sex) <- c("cat", "freqRBC                : num  2.71 4.33 3 2.85 2.43 3.71 4.02 2.11 4.27 3.13 ...
##  $ Hb                 : num  8.9 11 9.2 7.4 7.4 10.8 10.7 6 11.1 10.6 ...
##  $ HCT                : num  25.8 33 27.4 22.2 22 31.5 31.9 17.6 31 30 ...
##  $ MCV                : num  95.3 76.1 91.4 78 90.5 51.9 79.3 83.1 72.4 96 ...
##  $ MCH                : num  32.8 25.4 30.7 25.9 36.5 29.1 24.7 28.3 25.9 34 ...
##  $ MCHC               : num  34.5 33.4 33.6 33.2 33.7 NA 33.7 34 35.8 35.4 ...
##  $ RDW                : num  18.1 14.1 14.6 20.1 19.9 NA 14.2 29.9 17.2 14.6 ...
##  $ WBC                : num  8.65 6.38 9.15 16.08 8.57 ...
##  $ LYMPHOCYTE_PERCENT : num  51.3 59.9 45 NA 44.3 NA 15.7 56.4 43.7 51.2 ...
##  $ MONOCYTE_PERCENT   : num  6.2 4.5 9.7 2.8 1.2 NA NA 7 6 7.3 ...
##  $ GRANULOCYTE_PERCENT: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LYMPHOCYTE         : num  4.44 3.82 4.11 NA 3.78 NA 3.31 9.46 3.06 3.86 ...
##  $ MONOCYTE           : num  0.54 0.29 0.89 0.45 0.82 NA 0.4 1.17 0.42 0.54 ...
##  $ GRANULOCYTE        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PLATELET           : num  413 411 323 673 347 NA 387 355 274 374 ...
##  $ MPV", "prop")
par(mar=c(4,4,3,4))
barplot(
   sex$prop, 
   ylim=c(0, 60), 
   col=c(2,4)
)
text(
   x=c(0.7,1.9), 
   y=30, 
   labels=c("Female", "Male")
)
mtext(
   text = "Proportion (%)",
   side = 2,
   line = 3
)
mtext(
   text = "Sex",
   side = 1,
   line = 1
)

4 Describe phenotypes and normalize if needed

4.1 RBC

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/uL)") 
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/uL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral") 
BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
## Warning in bestNormalize::orderNorm(bt_pheno$RBC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1210 nonmissing obs and ties
##  - 299 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
## 0.840 2.262 2.690 3.110 6.480
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC") 
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")

# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])

rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
rbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.840   2.263   2.690   2.730   3.110   6.480      23

4.2 Hb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb") 
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
## Warning in bestNormalize::orderNorm(bt_pheno$Hb): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1212 nonmissing obs and ties
##  - 92 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  2.9  6.7  7.6  8.5 14.5
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb") 
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])

hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$Hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.900   6.700   7.600   7.678   8.500  14.500      21

4.3 HbF

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF, main="HbF (%)", xlab="HbF") 
plot(density(na.omit(bt_pheno$HbF)), main="HbF")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

# removing outliers
abline(v=0, lwd=1, lty=2)
bt_pheno$HbF <- as.numeric(ifelse(bt_pheno$HbF == 0, "NA", bt_pheno$HbF))
plot(density(na.omit(bt_pheno$HbF)), main="HbF (0) outlier pruned")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HbF)
BNphen 
## orderNorm Transformation with 1106 nonmissing obs and ties
##  - 261 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  0.800  4.825  8.800 14.000 40.300
bt_pheno$onHbF <- BNphen$x.t
plot(density(na.omit(bt_pheno$onHbF)), main="Normalized HbF") 
polygon(density(na.omit(bt_pheno$onHbF)), col = "lightblue")

# CUBIC ROOT TRANSFORMATION
bt_pheno$nHbF <- (bt_pheno$HbF)^(1/3)
plot(density(na.omit(bt_pheno$nHbF)), main="Cubic root transformed HbF") 
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")

# fit smooth spline for Age against Hb
HbF_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF"))])

HbF_plot <- qplot(AGE, HbF, data = HbF_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HbF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.800   4.825   8.800  10.221  14.000  40.300     127

4.4 MCV

par(mar = c(4, 4, 2, 2))
hist(pheno$MCV, main="MCV (fL)") 
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

# removing outliers
#abline(v=120, lwd=1, lty=2)
#bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV > 120, "NA", bt_pheno$MCV))
#plot(density(na.omit(bt_pheno$MCV)), main="MCV (>120) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

BNphen <- bestNormalize(bt_pheno$MCV)
BNphen 
## Best Normalizing transformation with 1212 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.7603
##  - Box-Cox: 1.5548
##  - Center+scale: 1.4957
##  - Double Reversed Log_b(x+a): 1.8836
##  - Exp(x): 137.5572
##  - Log_b(x+a): 1.7603
##  - orderNorm (ORQ): 1.6575
##  - sqrt(x + a): 1.6329
##  - Yeo-Johnson: 1.5548
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## center_scale(x) Transformation with 1212 nonmissing obs.
##  Estimated statistics:
##  - mean (before standardization) = 87.24282 
##  - sd (before standardization) = 10.16145
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
## Warning in bestNormalize::orderNorm(bt_pheno$MCV): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1212 nonmissing obs and ties
##  - 72 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   51   81   88   94  133
bt_pheno$onMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$onMCV)), main="Normalized (orderNorm) MCV")
polygon(density(na.omit(bt_pheno$onMCV)), col = "lightblue")

# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])

mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   51.00   81.00   88.00   87.24   94.00  133.00      21

4.5 HCT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT") 
plot(density(na.omit(bt_pheno$HCT)), main="HCT")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HCT <- as.numeric(ifelse(bt_pheno$HCT > 43, "NA", bt_pheno$HCT))
#plot(density(na.omit(bt_pheno$HCT)), main="HCT (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
## Warning in bestNormalize::orderNorm(bt_pheno$HCT): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 742 nonmissing obs and ties
##  - 197 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  9.2 20.8 23.5 26.5 46.4
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT") 
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")

# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])

htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HCT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     9.2    20.8    23.5    23.7    26.5    46.4     491

4.6 MCHC

#Mean Corpuscular Hemoglobin Concentration

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)") 
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

# removing
abline(v=c(10, 60), lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
  ifelse(bt_pheno$MCHC > 60, "NA", 
    #bt_pheno$MCHC
    ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (>60 and <10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
## Warning in bestNormalize::orderNorm(bt_pheno$MCHC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1178 nonmissing obs and ties
##  - 111 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 11.9 26.0 31.0 33.0 54.2
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC") 
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")

# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])

mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCHC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   11.90   26.00   31.00   29.69   33.00   54.20      55

4.7 MCH

#The Mean Corpuscular Hemoglobin

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)") 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

# removing outliers
abline(v=10, lwd=1, lty=2)
bt_pheno$MCH <- as.numeric(
  ifelse(bt_pheno$MCH < 10, "NA", 
    bt_pheno$MCH
    #ifelse(bt_pheno$MCH < 10, "NA", bt_pheno$MCH)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCH)), main="MCH (<10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
## Warning in bestNormalize::orderNorm(bt_pheno$MCH): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 741 nonmissing obs and ties
##  - 36 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   13   26   29   31   44
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH") 
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")

# fit smooth spline for Age against Hb
MCH_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])

MCH_plot <- qplot(AGE, MCH, data = MCH_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
MCH_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   26.00   29.00   28.53   31.00   44.00     492

4.8 WBC

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (x10^9 cells/L)") 
plot(density(na.omit(bt_pheno$WBC)), main="WBC (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

# removing
abline(v=60, lwd=1, lty=2)
bt_pheno$WBC <- as.numeric(
  ifelse(bt_pheno$WBC > 60, "NA", 
    bt_pheno$WBC
    #ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$WBC)), main="WBC (>60) outlier pruned")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$WBC)
## Warning in bestNormalize::orderNorm(bt_pheno$WBC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1210 nonmissing obs and ties
##  - 277 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.40  8.80 11.25 14.50 49.80
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC") 
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")

# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])

wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$WBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.40    8.80   11.25   12.09   14.50   49.80      23

4.9 LYMPHOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE, main="LYMPHOCYTE (x10^9 cells/L)") 
plot(density(na.omit(bt_pheno$LYMPHOCYTE)), main="LYMPHOCYTE (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1211 nonmissing obs and ties
##  - 418 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  0.100  3.600  4.800  6.405 22.600
bt_pheno$nLYMPHOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE)), main="Normalized LYMPHOCYTE") 
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
lym_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE"))])

lym_plot <- qplot(AGE, LYMPHOCYTE, data = lym_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lym_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$LYMPHOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.100   3.600   4.800   5.331   6.405  22.600      22

4.10 MONOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE, main="MONOCYTE (x10^9 cells/L)") 
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

# removing
abline(v=0, lwd=1, lty=2)
bt_pheno$MONOCYTE <- as.numeric(
  ifelse(bt_pheno$MONOCYTE == 0, "NA", 
    bt_pheno$MONOCYTE
    #ifelse(bt_pheno$MONOCYTE == 0, "NA", bt_pheno$MONOCYTE)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$MONOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1207 nonmissing obs and ties
##  - 233 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 0.10 0.70 1.00 1.45 9.40
bt_pheno$nMONOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE)), main="Normalized MONOCYTE") 
polygon(density(na.omit(bt_pheno$nMONOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
mon_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE"))])

mon_plot <- qplot(AGE, MONOCYTE, data = mon_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mon_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MONOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.10    0.70    1.00    1.21    1.45    9.40      26

4.11 NEUTROPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL, main="NEUTROPHIL (g/L)") 
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (g/L)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")

# removing
abline(v=0, lwd=1, lty=2)
bt_pheno$NEUTROPHIL <- as.numeric(
  ifelse(bt_pheno$NEUTROPHIL == 0, "NA", 
    bt_pheno$NEUTROPHIL
    #ifelse(bt_pheno$NEUTROPHIL < 10, "NA", bt_pheno$NEUTROPHIL)
  )
)

plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (=0) outlier pruned")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL)
## Warning in bestNormalize::orderNorm(bt_pheno$NEUTROPHIL): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 743 nonmissing obs and ties
##  - 139 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.10  2.80  4.20  6.05 66.00
bt_pheno$nNEUTROPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL)), main="Normalized NEUTROPHIL") 
polygon(density(na.omit(bt_pheno$nNEUTROPHIL)), col = "lightblue")

# fit smooth spline for Age against NEUTROPHIL
neu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL"))])

neu_plot <- qplot(AGE, NEUTROPHIL, data = neu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$NEUTROPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.100   2.800   4.200   5.082   6.050  66.000     490

4.12 EOSINOPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$EOSINOPHIL, main="EOSINOPHIL (x10^9 cells/L)") 
plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (x10^9 cells/L)")
polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")

# removing
# abline(v=2, lwd=1, lty=2)
# bt_pheno$EOSINOPHIL <- as.numeric(
#   ifelse(bt_pheno$EOSINOPHIL > 2, "NA", 
#     bt_pheno$EOSINOPHIL
#     #ifelse(bt_pheno$EOSINOPHIL < 10, "NA", bt_pheno$EOSINOPHIL)
#   )
# )
# 
# plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (>2) outlier pruned")
# polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")

BNphen <- bestNormalize(bt_pheno$EOSINOPHIL)
BNphen 
## Best Normalizing transformation with 740 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 7.9709
##  - Center+scale: 10.2261
##  - Double Reversed Log_b(x+a): 11.7165
##  - Exp(x): 35.7939
##  - Log_b(x+a): 9.2617
##  - orderNorm (ORQ): 8.1158
##  - sqrt(x + a): 7.7237
##  - Yeo-Johnson: 8.1742
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized sqrt(x + a) Transformation with 740 nonmissing obs.:
##  Relevant statistics:
##  - a = 0 
##  - mean (before standardization) = 0.5289364 
##  - sd (before standardization) = 0.3187159
bt_pheno$nEOSINOPHIL <- BNphen$x.t

plot(density(na.omit(bt_pheno$nEOSINOPHIL)), main="Normalized EOSINOPHIL") 
polygon(density(na.omit(bt_pheno$nEOSINOPHIL)), col = "lightblue")

# fit smooth spline for Age against EOSINOPHIL
eu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","EOSINOPHIL"))])

eu_plot <- qplot(AGE, EOSINOPHIL, data = eu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$EOSINOPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.1000  0.2000  0.3812  0.4000  5.4000     493

4.13 BASOPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BASOPHIL, main="BASOPHIL") 
plot(density(na.omit(bt_pheno$BASOPHIL)), main="BASOPHIL")
polygon(density(na.omit(bt_pheno$BASOPHIL)), col = "coral")

# removing
abline(v=1, lwd=1, lty=2)
bt_pheno$BASOPHIL <- as.numeric(
  ifelse(bt_pheno$BASOPHIL > 1, "NA", 
    bt_pheno$BASOPHIL
    #ifelse(bt_pheno$BASOPHIL < 10, "NA", bt_pheno$BASOPHIL)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$BASOPHIL)), main="BASOPHIL (>1) outlier pruned")
polygon(density(na.omit(bt_pheno$BASOPHIL)), col = "coral")

BNphen <- bestNormalize(bt_pheno$BASOPHIL)
BNphen 
## Best Normalizing transformation with 734 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 20.4253
##  - Center+scale: 20.3933
##  - Double Reversed Log_b(x+a): 20.28
##  - Exp(x): 20.3063
##  - Log_b(x+a): 21.5986
##  - orderNorm (ORQ): 20.6695
##  - sqrt(x + a): 20.2993
##  - Yeo-Johnson: 20.4633
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized double reversed Log_b(x + a) Transformation with 734 nonmissing obs.:
##  Relevant statistics:
##  - a = 
##  - b = 10 
##  - max(x) = 1.1 ; min(x) = -0.1 
##  - mean (before standardization) = 0.1035505 
##  - sd (before standardization) = 0.100856
bt_pheno$nBASOPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBASOPHIL)), main="Normalized BASOPHIL") 
polygon(density(na.omit(bt_pheno$nBASOPHIL)), col = "lightblue")

# binarize basophil after removing outliers (1-control, 2-case)
# manual (10.1016/j.ajhg.2021.08.007)
bt_pheno$bBASOPHIL <- as.numeric(
  ifelse(bt_pheno$BASOPHIL < 0.05, 1, 
    #bt_pheno$BASOPHIL
    ifelse(bt_pheno$BASOPHIL >= 0.05, 2, "NA")
  )
)

bt_pheno$cBASOPHIL <- as.numeric(
  ifelse(bt_pheno$BASOPHIL < 0.05, 0, 
    #bt_pheno$BASOPHIL
    ifelse(bt_pheno$BASOPHIL >= 0.05, 1, "NA")
  )
)

hist(bt_pheno$bBASOPHIL, main="binarized")

# fit smooth spline for Age against BASOPHIL
bas_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BASOPHIL"))])

bas_plot <- qplot(AGE, BASOPHIL, data = bas_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bas_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$BASOPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.1000  0.1000  0.1368  0.2000  1.0000     499

4.14 PLATELET

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (x10^9/L)") 
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (x10^9/L)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral") 
BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
## Warning in bestNormalize::orderNorm(bt_pheno$PLATELET): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1210 nonmissing obs and ties
##  - 518 unique values 
##  - Original quantiles:
##      0%     25%     50%     75%    100% 
##   22.00  312.00  401.50  499.75 1401.00
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET") 
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])

pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$PLATELET)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    22.0   312.0   401.5   416.1   499.8  1401.0      23

4.15 RETICULOCYTE

# RETICULOCYTE PER 1000 RBCs

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTE, main="RETICULOCYTE") 
plot(density(na.omit(bt_pheno$RETICULOCYTE)), main="RETICULOCYTE")
polygon(density(na.omit(bt_pheno$RETICULOCYTE)), col = "coral") 
BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 639 nonmissing obs and ties
##  - 217 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##   8.0  65.0  91.0 131.5 411.0
bt_pheno$nRETICULOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTE)), main="Normalized RETICULOCYTE") 
polygon(density(na.omit(bt_pheno$nRETICULOCYTE)), col = "lightblue")

# fit smooth spline for Age against PLATELET
ret_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTE"))])

ret_plot <- qplot(AGE, RETICULOCYTE, data = ret_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
ret_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICULOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     8.0    65.0    91.0   105.2   131.5   411.0     594

4.16 RETICULOCYTES

# RETICULOCYTES /uL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RETICULOCYTES, main="RETICULOCYTES /uL") 
plot(density(na.omit(bt_pheno$RETICULOCYTES)), main="RETICULOCYTES /uL")
polygon(density(na.omit(bt_pheno$RETICULOCYTES)), col = "coral")

# removing outliers
abline(v=1000000, lwd=1, lty=2)
bt_pheno$RETICULOCYTES <- as.numeric(
  ifelse(bt_pheno$RETICULOCYTES > 1000000, "NA", 
    bt_pheno$RETICULOCYTES
    #ifelse(bt_pheno$RETICULOCYTES < 10, "NA", bt_pheno$RETICULOCYTES)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$RETICULOCYTES)), main="RETICULOCYTES (>1000000) outlier pruned")
polygon(density(na.omit(bt_pheno$RETICULOCYTES)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RETICULOCYTES)
## Warning in bestNormalize::orderNorm(bt_pheno$RETICULOCYTES): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 637 nonmissing obs and ties
##  - 620 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##   1363 182820 243540 317790 764460
bt_pheno$nRETICULOCYTES <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRETICULOCYTES)), main="Normalized RETICULOCYTES") 
polygon(density(na.omit(bt_pheno$nRETICULOCYTES)), col = "lightblue")

# fit smooth spline for Age against RETICULOCYTES
rets_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RETICULOCYTES"))])

rets_plot <- qplot(AGE, RETICULOCYTES, data = rets_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rets_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RETICULOCYTES)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1363  182820  243540  265553  317790  764460     596

4.17 SBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SBP, main="SBP") 
plot(density(na.omit(bt_pheno$SBP)), main="SBP")
polygon(density(na.omit(bt_pheno$SBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$SBP)
BNphen 
## orderNorm Transformation with 645 nonmissing obs and ties
##  - 77 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   67   99  110  117  165
bt_pheno$nSBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSBP)), main="Normalized SBP") 
polygon(density(na.omit(bt_pheno$nSBP)), col = "lightblue")

# fit smooth spline for Age against RETICS
sbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SBP"))])

sbp_plot <- qplot(AGE, SBP, data = sbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
sbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$SBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    67.0    99.0   110.0   108.5   117.0   165.0     588

4.18 DBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$DBP, main="DBP") 
plot(density(na.omit(bt_pheno$DBP)), main="DBP")
polygon(density(na.omit(bt_pheno$DBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$DBP)
BNphen 
## orderNorm Transformation with 645 nonmissing obs and ties
##  - 67 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   13   59   65   72  107
bt_pheno$nDBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nDBP)), main="Normalized DBP") 
polygon(density(na.omit(bt_pheno$nDBP)), col = "lightblue")

# fit smooth spline for Age against DBP
dbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","DBP"))])

dbp_plot <- qplot(AGE, DBP, data = dbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
dbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   59.00   65.00   65.81   72.00  107.00     588

4.19 BMI

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI") 
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

# removing outliers
abline(v=40, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
  ifelse(bt_pheno$BMI > 40, "NA", 
    bt_pheno$BMI
    #ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
  )
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>40) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen 
## orderNorm Transformation with 743 nonmissing obs and ties
##  - 599 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  0.002 15.913 17.928 20.422 35.938
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI") 
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")

# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])

bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   59.00   65.00   65.81   72.00  107.00     588

5 Save new pheno to file

#head(bt_pheno[, -c(3:4)])

outf <- paste0(
  params$data,
  "_new_pheno.tsv"
)

write.table(
  bt_pheno,
  outf,
  col.names=T,
  row.names=F,
  quote=F,
  sep="\t"
)