Back

1 Load package

list.of.packages <- c("tidyverse","bestNormalize", "readxl")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) 
  install.packages(new.packages)
lapply(list.of.packages , require, character.only = T)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
#if(!lapply(pkgs, require, character.only = T)) {
#  install.packages(p)
#  require(bestNormalize)
#}

2 Load phenotype data and extract blood traits with relevant additional columns

https://www.hematology.org/education/trainees/fellows/hematopoiesis/2021/hemoglobin-electrophoresis-in-sickle-cell-disease

pheno <- read_excel(params$data, sheet = "clean_metadata")

names(pheno)
##  [1] "FID"                "IID"                "GID"               
##  [4] "SID"                "GENDER"             "GENDER2"           
##  [7] "SEX"                "AGE1"               "AGE_UNITS"         
## [10] "AGE"                "DISTRICT"           "WEIGHT"            
## [13] "HEIGHT"             "BMI"                "HU_STATUS"         
## [16] "SBP"                "DBP"                "UREA"              
## [19] "CREATININE"         "uACR"               "BASOPHIL"          
## [22] "BASOPHIL_PERCENT"   "EOSINOPHIL"         "EOSINOPHIL_PERCENT"
## [25] "HCT"                "Hb"                 "LDH"               
## [28] "LYMPHOCYTE"         "LYMPHOCYTE_PERCENT" "MCH"               
## [31] "MCHC"               "MCV"                "MPV"               
## [34] "MONOCYTE"           "MONOCYTE_PERCENT"   "NEUTROPHIL"        
## [37] "NEUTROPHIL_PERCENT" "PLATELET"           "RBC"               
## [40] "RDW"                "RDWSD"              "WBC"               
## [43] "HbA"                "HbA2"               "HbF"               
## [46] "HbS"                "SICKLECELLTEST"     "CURATED_STATUS2"   
## [49] "CURATED_STATUS"
#table(pheno$DISTRICT)
keep_colmns <- c( 
  "FID","IID","GID","AGE","GENDER","SEX","HEIGHT","WEIGHT","BMI",
  "CURATED_STATUS","HU_STATUS","SBP","DBP",
  "BASOPHIL","BASOPHIL_PERCENT","EOSINOPHIL","EOSINOPHIL_PERCENT","HCT",
  "Hb","LYMPHOCYTE","LYMPHOCYTE_PERCENT","MCH","MCHC","MCV","MPV","MONOCYTE",
  "MONOCYTE_PERCENT","NEUTROPHIL","NEUTROPHIL_PERCENT","PLATELET","RBC","RDW","RDWSD",
  "WBC","HbA","HbA2","HbF","HbS"
)

bt_pheno <- pheno[, which(names(pheno) %in% keep_colmns)]

colmns_to_update <- c( 
  "AGE","SEX",
  "HEIGHT","WEIGHT","BMI","SBP","DBP",
  "BASOPHIL","BASOPHIL_PERCENT","EOSINOPHIL","EOSINOPHIL_PERCENT","LYMPHOCYTE","LYMPHOCYTE_PERCENT",
  "MONOCYTE","MONOCYTE_PERCENT","NEUTROPHIL","NEUTROPHIL_PERCENT","WBC",
  "MCH","MCHC","MCV","RBC","RDW","RDWSD",
  "PLATELET","MPV",
  "HCT","Hb","HbA","HbA2","HbF","HbS"
)

set_var_type <- function(x) {
  as.numeric(x)
}

bt_pheno[, c(colmns_to_update)] <- lapply(bt_pheno[, c(colmns_to_update)] , set_var_type)

#bt_pheno$BMI <- as.numeric((bt_pheno$WEIGHT)/(bt_pheno$HEIGHT)^2)

str(bt_pheno)
## tibble [952 × 38] (S3: tbl_df/tbl/data.frame)
##  $ FID               : chr [1:952] "207477820002_R03C01" "207477820010_R03C01" "207477820029_R04C01" "207477820029_R02C01" ...
##  $ IID               : chr [1:952] "207477820002_R03C01" "207477820010_R03C01" "207477820029_R04C01" "207477820029_R02C01" ...
##  $ GID               : chr [1:952] "SMLW000077" "SMLW000093" "SMLW000126" "SMLW000124" ...
##  $ GENDER            : chr [1:952] "Female" "Male" "Male" "Female" ...
##  $ SEX               : num [1:952] 2 1 1 2 1 2 1 1 1 1 ...
##  $ AGE               : num [1:952] 16 3 10 9 13 5 4 16 2.75 5 ...
##  $ WEIGHT            : num [1:952] 41.1 12.9 23.9 25.4 36.9 ...
##  $ HEIGHT            : num [1:952] 149 94.4 128 127 149 108 112 158 90 115 ...
##  $ BMI               : num [1:952] 18.5 14.5 14.6 15.7 16.6 ...
##  $ HU_STATUS         : chr [1:952] "Yes" "Yes" "Yes" "Yes" ...
##  $ SBP               : num [1:952] 123 88 90 100 95 90 90 113 101 NA ...
##  $ DBP               : num [1:952] 96 42 57 53 62 48 54 73 58 NA ...
##  $ BASOPHIL          : num [1:952] 0.02 0.01 0.04 0.02 0.06 0.04 0.05 0.11 0.06 NA ...
##  $ BASOPHIL_PERCENT  : num [1:952] 0.3 0.2 0.4 0.3 0.5 0.3 0.4 0.8 0.4 NA ...
##  $ EOSINOPHIL        : num [1:952] 0.15 0.14 0.15 0.12 0.1 0.26 0.65 0.42 0.4 NA ...
##  $ EOSINOPHIL_PERCENT: num [1:952] 1.9 2.7 1.2 2 0.8 2 4.9 3 2.5 NA ...
##  $ HCT               : num [1:952] 33.7 37.1 19.9 32.9 20.5 24.3 19.5 22.4 25.8 NA ...
##  $ Hb                : num [1:952] 11.3 11.9 6.5 11 6.8 7.9 6.7 7.4 8.4 NA ...
##  $ LYMPHOCYTE        : num [1:952] 2.49 3.25 4.75 2.61 2.91 6.04 5.49 6.34 9.82 NA ...
##  $ LYMPHOCYTE_PERCENT: num [1:952] 32.1 62.9 38.5 43.7 24.2 47.1 41 45.4 60.1 NA ...
##  $ MCH               : num [1:952] 38.3 22.3 35.1 29.8 30 28.6 33.1 32.9 29.6 NA ...
##  $ MCHC              : num [1:952] 33.6 32.1 32.8 33.3 32.9 32.5 34.3 33.2 32.7 NA ...
##  $ MCV               : num [1:952] 114 70 107 90 91 88 96 99 91 NA ...
##  $ MPV               : num [1:952] 8.8 7.5 10.9 10.1 9.9 9.7 8.8 10.3 8.9 NA ...
##  $ MONOCYTE          : num [1:952] 1.11 0.27 1.39 0.38 1.25 1.41 0.97 1.16 1.04 NA ...
##  $ MONOCYTE_PERCENT  : num [1:952] 14.3 5.3 11.3 6.3 10.4 11 7.3 8.3 6.4 NA ...
##  $ NEUTROPHIL        : num [1:952] 3.99 1.49 6 2.86 7.71 5.09 6.22 5.92 5.02 NA ...
##  $ NEUTROPHIL_PERCENT: num [1:952] 51.4 29 48.7 47.8 64.2 39.6 46.5 42.4 30.7 NA ...
##  $ PLATELET          : num [1:952] 373 274 244 383 441 411 388 293 439 NA ...
##  $ RBC               : num [1:952] 2.96 5.33 1.86 3.68 2.25 2.76 2.02 2.26 2.85 NA ...
##  $ RDW               : num [1:952] 14.5 17.2 17.7 19 22.7 23.2 27.7 19.5 18.7 NA ...
##  $ RDWSD             : num [1:952] 63.8 40.6 69.3 60.1 69.3 71.5 89.2 69.3 59.6 NA ...
##  $ WBC               : num [1:952] 7.8 5.2 12.3 6 12 12.8 13.4 14 16.3 NA ...
##  $ HbA               : num [1:952] 0 65.9 0 11.7 0 2.8 2.6 7.3 27.9 0 ...
##  $ HbA2              : num [1:952] 5.2 6.1 0 6.6 4.6 5.9 5.7 5.4 4.8 4.6 ...
##  $ HbF               : num [1:952] 15.9 NA 14.2 11.8 9 6.4 5.7 3.8 9 8.6 ...
##  $ HbS               : num [1:952] 78.9 28 85.8 70 86.6 84.8 86 83.4 58.1 86.8 ...
##  $ CURATED_STATUS    : chr [1:952] "HbSS" "HbAS" "HbSS" "NA" ...
# Get only HbSS
bt_pheno = bt_pheno[bt_pheno$CURATED_STATUS == "HbSS",]

3 Demographic data

3.1 AGE

summary(bt_pheno$AGE)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.07671  5.00000  8.00000  8.58932 11.50000 25.00000        2
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE", breaks = 20)

#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)

3.2 SEX

bt_pheno %>% 
   select(SEX) %>% 
   table() %>% 
   as.data.frame() %>% 
   na.omit() %>%
   mutate(Prop = (Freq/sum(Freq))*100) -> sex       # Sex

sex
colnames(sex) <- c("cat", "freq", "prop")
par(mar=c(4,4,3,4))
barplot(
   sex$prop, 
   ylim=c(0, 60), 
   col=c(2,4)
)
text(
   x=c(0.7,1.9), 
   y=30, 
   labels=c("Female", "Male")
)
mtext(
   text = "Proportion (%)",
   side = 2,
   line = 3
)
mtext(
   text = "Sex",
   side = 1,
   line = 1
)

4 Describe phenotypes and normalize if needed

4.1 RBC

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/uL)") 
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/uL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

# removing outliers
#abline(v=8, lwd=1, lty=2)
#bt_pheno$RBC <- as.numeric(ifelse(bt_pheno$RBC > 8, "NA", bt_pheno$RBC))
#plot(density(na.omit(bt_pheno$RBC)), main="RBC (>8) outlier pruned")
#polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 150 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 1.03 2.30 2.58 2.88 4.94
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC") 
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")

# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])

rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.030   2.300   2.580   2.606   2.880   4.940      12

4.2 RDW

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RDW, main="RDW (%)") 
plot(density(na.omit(bt_pheno$RDW)), main="RDW (%)")
polygon(density(na.omit(bt_pheno$RDW)), col = "coral") 

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$RDW <- as.numeric(ifelse(bt_pheno$RDW > 10, "NA", bt_pheno$RDW))
#plot(density(na.omit(bt_pheno$RDW)), main="RDW (>10) outlier pruned")
#polygon(density(na.omit(bt_pheno$RDW)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RDW)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 152 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 13.0 18.6 21.8 25.6 47.0
bt_pheno$nRDW <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRDW)), main="Normalized RDW") 
polygon(density(na.omit(bt_pheno$nRDW)), col = "lightblue")

# fit smooth spline for Age against RBC
rdw_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RDW"))])

rdw_plot <- qplot(AGE, RDW, data = rdw_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rdw_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RDW)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   18.60   21.80   22.58   25.60   47.00      12

4.3 RDWSD

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RDWSD, main="RDWSD (%)") 
plot(density(na.omit(bt_pheno$RDWSD)), main="RDWSD (%)")
polygon(density(na.omit(bt_pheno$RDWSD)), col = "coral") 

# removing outliers
abline(v=200, lwd=1, lty=2)
bt_pheno$RDWSD <- as.numeric(ifelse(bt_pheno$RDWSD > 200, "NA", bt_pheno$RDWSD))
plot(density(na.omit(bt_pheno$RDWSD)), main="RDWSD (>200) outlier pruned")
polygon(density(na.omit(bt_pheno$RDWSD)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RDWSD)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 204 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  41.1  60.3  67.8  76.4 118.0
bt_pheno$nRDWSD <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRDWSD)), main="Normalized RDWSD") 
polygon(density(na.omit(bt_pheno$nRDWSD)), col = "lightblue")

# fit smooth spline for Age against RBC
RDWSD_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RDWSD"))])

RDWSD_plot <- qplot(AGE, RDWSD, data = RDWSD_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
RDWSD_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$RDWSD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   41.10   60.30   67.80   69.04   76.40  118.00      12

4.4 Hb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb") 
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

# removing outliers
#abline(v=30, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 30, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>30) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 63 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  2.9  7.3  8.2  9.0 13.9
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb") 
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])

hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$Hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.900   7.300   8.200   8.174   9.000  13.900      12

4.5 HbF

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HbF, main="HbF", xlab="HbF") 
plot(density(na.omit(bt_pheno$HbF)), main="HbF")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

# removing outliers
abline(v=c(0,70), lwd=1, lty=2)
bt_pheno$HbF <- as.numeric(
  ifelse(bt_pheno$HbF > 70, "NA", 
    ifelse(bt_pheno$HbF == 0, "NA", bt_pheno$HbF)
  )
)
plot(density(na.omit(bt_pheno$HbF)), main="HbF (>70) outlier pruned")
polygon(density(na.omit(bt_pheno$HbF)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HbF)
BNphen 
## orderNorm Transformation with 279 nonmissing obs and ties
##  - 167 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  1.8  7.1 10.7 16.1 39.4
bt_pheno$nHbF <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHbF)), main="Normalized HbF") 
polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")

# CUBIC ROOT TRANSFORMATION
#bt_pheno$nHbF <- (bt_pheno$HbF)^(1/3)
#plot(density(na.omit(bt_pheno$nHbF)), main="Cubic root transformed HbF") 
#polygon(density(na.omit(bt_pheno$nHbF)), col = "lightblue")

# fit smooth spline for Age against Hb
HbF_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HbF"))])

HbF_plot <- qplot(AGE, HbF, data = HbF_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
HbF_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HbF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     1.8     7.1    10.7    12.3    16.1    39.4      14

4.6 MCV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCV, main="MCV (fL)")
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

# removing outliers
# abline(v=55, lwd=1, lty=2)
# bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV < 55, "NA", bt_pheno$MCV))
# plot(density(na.omit(bt_pheno$MCV)), main="MCV (>55) outlier pruned")
# polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 52 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   70   89   97  104  133
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")

# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])

mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   70.00   89.00   97.00   96.85  104.00  133.00      12

4.7 HCT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT") 
plot(density(na.omit(bt_pheno$HCT)), main="HCT")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HTC <- as.numeric(ifelse(bt_pheno$HTC > 43, "NA", bt_pheno$HTC))
#plot(density(na.omit(bt_pheno$HTC)), main="HTC (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HTC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 133 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  9.1 22.2 24.9 27.2 42.6
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT") 
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")

# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])

htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$HCT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    9.10   22.20   24.90   24.89   27.20   42.60      12

4.8 MCHC

#Mean Corpuscular Hemoglobin Concentration

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)") 
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$MCHC <- as.numeric(
#  ifelse(bt_pheno$MCHC < 20, "NA", 
#    bt_pheno$MCHC
#    #ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
#  )
#)
#plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (<20) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 59 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 27.8 32.0 32.9 33.7 35.3
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC") 
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")

# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])

mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCHC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   27.80   32.00   32.90   32.79   33.70   35.30      12

4.9 MCH

#The Mean Corpuscular Hemoglobin

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)") 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$MCH <- as.numeric(
#  ifelse(bt_pheno$MCH < 10, "NA", 
#    bt_pheno$MCH
#    #ifelse(bt_pheno$MCH < 10, "NA", bt_pheno$MCH)
#  )
#)
#plot(density(na.omit(bt_pheno$MCH)), main="MCH (<10) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 122 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 21.5 28.9 31.8 34.5 42.3
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH") 
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")

# fit smooth spline for Age against Hb
tcmh_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])

tcmh_plot <- qplot(AGE, MCH, data = tcmh_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
tcmh_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MCH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.50   28.90   31.80   31.76   34.50   42.30      12

4.10 WBC

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (counts/uL)") 
plot(density(na.omit(bt_pheno$WBC)), main="WBC (counts/uL)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

# removing
#abline(v=60000, lwd=1, lty=2)
#bt_pheno$WBC <- as.numeric(
#  ifelse(bt_pheno$WBC > 60000, "NA", 
#    bt_pheno$WBC
#    #ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
#  )
#)
#plot(density(na.omit(bt_pheno$WBC)), main="WBC (>60000) outlier pruned")
#polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

BNphen <- bestNormalize(bt_pheno$WBC)
BNphen 
## Best Normalizing transformation with 281 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.3158
##  - Box-Cox: 1.2405
##  - Center+scale: 1.4121
##  - Double Reversed Log_b(x+a): 2.6596
##  - Exp(x): 31.6407
##  - Log_b(x+a): 1.3112
##  - orderNorm (ORQ): 1.3273
##  - sqrt(x + a): 1.2894
##  - Yeo-Johnson: 1.2382
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized Yeo-Johnson Transformation with 281 nonmissing obs.:
##  Estimated statistics:
##  - lambda = -0.4513376 
##  - mean (before standardization) = 1.518601 
##  - sd (before standardization) = 0.09695185
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC") 
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")

# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])

wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$WBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.10   10.30   12.30   12.84   14.60   28.10      12

4.11 NEUTROPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL, main="NEUTROPHIL (10^3/uL)") 
plot(density(na.omit(bt_pheno$NEUTROPHIL)), main="NEUTROPHIL (10^3/uL)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$NEUTROPHIL <- as.numeric(
#  ifelse(bt_pheno$NEUTROPHIL > 20, "NA", 
#    bt_pheno$NEUTROPHIL
#    #ifelse(bt_pheno$NEUTROPHIL < 10, "NA", bt_pheno$NEUTROPHIL)
#  )
#)
#plot(density(na.omit(bt_pheno$NEUTROPHILE)), main="NEUTROPHIL (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$NEUTROPHILE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 229 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.93  3.94  5.34  6.90 16.42
bt_pheno$nNEUTROPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL)), main="Normalized NEUTROPHIL") 
polygon(density(na.omit(bt_pheno$nNEUTROPHIL)), col = "lightblue")

# fit smooth spline for Age against NEUTROPHILE
neu_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL"))])

neu_plot <- qplot(AGE, NEUTROPHIL, data = neu_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neu_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$NEUTROPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.930   3.940   5.340   5.702   6.900  16.420       6

4.12 NEUTROPHIL_PERCENT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$NEUTROPHIL_PERCENT, main="NEUTROPHIL_PERCENT (10^3/uL)") 
plot(density(na.omit(bt_pheno$NEUTROPHIL_PERCENT)), main="NEUTROPHIL_PERCENT (10^3/uL)")
polygon(density(na.omit(bt_pheno$NEUTROPHIL_PERCENT)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$NEUTROPHIL_PERCENT <- as.numeric(
#  ifelse(bt_pheno$NEUTROPHIL_PERCENT > 20, "NA", 
#    bt_pheno$NEUTROPHIL_PERCENT
#    #ifelse(bt_pheno$NEUTROPHIL_PERCENT < 10, "NA", bt_pheno$NEUTROPHIL_PERCENT)
#  )
#)
#plot(density(na.omit(bt_pheno$NEUTROPHIL_PERCENTE)), main="NEUTROPHIL_PERCENT (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$NEUTROPHIL_PERCENTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$NEUTROPHIL_PERCENT)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 205 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
## 16.20 36.70 43.70 51.95 89.00
bt_pheno$nNEUTROPHIL_PERCENT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nNEUTROPHIL_PERCENT)), main="Normalized NEUTROPHIL_PERCENT") 
polygon(density(na.omit(bt_pheno$nNEUTROPHIL_PERCENT)), col = "lightblue")

# fit smooth spline for Age against NEUTROPHIL_PERCENTE
neup_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","NEUTROPHIL_PERCENT"))])

neup_plot <- qplot(AGE, NEUTROPHIL_PERCENT, data = neup_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
neup_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$NEUTROPHIL_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.20   36.70   43.70   44.14   51.95   89.00       6

4.13 LYMPHOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE, main="LYMPHOCYTE (10^3/uL)") 
plot(density(na.omit(bt_pheno$LYMPHOCYTE)), main="LYMPHOCYTE (10^3/uL)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$LYMPHOCYTE <- as.numeric(
#  ifelse(bt_pheno$LYMPHOCYTE > 20, "NA", 
#    bt_pheno$LYMPHOCYTE
#    #ifelse(bt_pheno$LYMPHOCYTE < 10, "NA", bt_pheno$LYMPHOCYTE)
#  )
#)
#plot(density(na.omit(bt_pheno$LYMPHOCYTEE)), main="LYMPHOCYTE (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$LYMPHOCYTEE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 238 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  1.040  3.985  5.140  6.365 14.490
bt_pheno$nLYMPHOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE)), main="Normalized LYMPHOCYTE") 
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE)), col = "lightblue")

# fit smooth spline for Age against LYMPHOCYTEE
lym_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE"))])

lym_plot <- qplot(AGE, LYMPHOCYTE, data = lym_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lym_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$LYMPHOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.040   3.985   5.140   5.415   6.365  14.490       6

4.14 LYMPHOCYTE_PERCENT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE_PERCENT, main="LYMPHOCYTE_PERCENT (10^3/uL)") 
plot(density(na.omit(bt_pheno$LYMPHOCYTE_PERCENT)), main="LYMPHOCYTE_PERCENT (10^3/uL)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE_PERCENT)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$LYMPHOCYTE_PERCENT <- as.numeric(
#  ifelse(bt_pheno$LYMPHOCYTE_PERCENT > 20, "NA", 
#    bt_pheno$LYMPHOCYTE_PERCENT
#    #ifelse(bt_pheno$LYMPHOCYTE_PERCENT < 10, "NA", bt_pheno$LYMPHOCYTE_PERCENT)
#  )
#)
#plot(density(na.omit(bt_pheno$LYMPHOCYTE_PERCENTE)), main="LYMPHOCYTE_PERCENT (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$LYMPHOCYTE_PERCENTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE_PERCENT)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 214 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  6.60 34.95 42.50 49.10 75.60
bt_pheno$nLYMPHOCYTE_PERCENT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE_PERCENT)), main="Normalized LYMPHOCYTE_PERCENT") 
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE_PERCENT)), col = "lightblue")

# fit smooth spline for Age against LYMPHOCYTE_PERCENTE
lymp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE_PERCENT"))])

lymp_plot <- qplot(AGE, LYMPHOCYTE_PERCENT, data = lymp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lymp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$LYMPHOCYTE_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    6.60   34.95   42.50   42.54   49.10   75.60       6

4.15 MONOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE, main="MONOCYTE (10^3/uL)") 
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (10^3/uL)")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$MONOCYTE <- as.numeric(
#  ifelse(bt_pheno$MONOCYTE > 20, "NA", 
#    bt_pheno$MONOCYTE
#    #ifelse(bt_pheno$MONOCYTE < 10, "NA", bt_pheno$MONOCYTE)
#  )
#)
#plot(density(na.omit(bt_pheno$MONOCYTEE)), main="MONOCYTE (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$MONOCYTEE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 152 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
## 0.180 0.905 1.170 1.485 4.260
bt_pheno$nMONOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE)), main="Normalized MONOCYTE") 
polygon(density(na.omit(bt_pheno$nMONOCYTE)), col = "lightblue")

# fit smooth spline for Age against MONOCYTEE
mon_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE"))])

mon_plot <- qplot(AGE, MONOCYTE, data = mon_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mon_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MONOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.180   0.905   1.170   1.244   1.485   4.260       6

4.16 MONOCYTE_PERCENT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE_PERCENT, main="MONOCYTE_PERCENT (10^3/uL)") 
plot(density(na.omit(bt_pheno$MONOCYTE_PERCENT)), main="MONOCYTE_PERCENT (10^3/uL)")
polygon(density(na.omit(bt_pheno$MONOCYTE_PERCENT)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$MONOCYTE_PERCENT <- as.numeric(
#  ifelse(bt_pheno$MONOCYTE_PERCENT > 20, "NA", 
#    bt_pheno$MONOCYTE_PERCENT
#    #ifelse(bt_pheno$MONOCYTE_PERCENT < 10, "NA", bt_pheno$MONOCYTE_PERCENT)
#  )
#)
#plot(density(na.omit(bt_pheno$MONOCYTE_PERCENTE)), main="MONOCYTE_PERCENT (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$MONOCYTE_PERCENTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE_PERCENT)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 104 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  2.5  7.5  9.2 11.5 18.6
bt_pheno$nMONOCYTE_PERCENT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE_PERCENT)), main="Normalized MONOCYTE_PERCENT") 
polygon(density(na.omit(bt_pheno$nMONOCYTE_PERCENT)), col = "lightblue")

# fit smooth spline for Age against MONOCYTE_PERCENTE
monp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE_PERCENT"))])

monp_plot <- qplot(AGE, MONOCYTE_PERCENT, data = monp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
monp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MONOCYTE_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.500   7.500   9.200   9.587  11.500  18.600       6

4.17 BASOPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BASOPHIL, main="BASOPHIL (10^3/uL)") 
plot(density(na.omit(bt_pheno$BASOPHIL)), main="BASOPHIL (10^3/uL)")
polygon(density(na.omit(bt_pheno$BASOPHIL)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$BASOPHIL <- as.numeric(
#  ifelse(bt_pheno$BASOPHIL > 20, "NA", 
#    bt_pheno$BASOPHIL
#    #ifelse(bt_pheno$BASOPHIL < 10, "NA", bt_pheno$BASOPHIL)
#  )
#)
#plot(density(na.omit(bt_pheno$BASOPHILE)), main="BASOPHIL (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$BASOPHILE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BASOPHIL)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 18 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 0.00 0.03 0.05 0.07 0.19
bt_pheno$nBASOPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBASOPHIL)), main="Normalized BASOPHIL") 
polygon(density(na.omit(bt_pheno$nBASOPHIL)), col = "lightblue")

# fit smooth spline for Age against BASOPHILE
baso_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BASOPHIL"))])

baso_plot <- qplot(AGE, BASOPHIL, data = baso_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
baso_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$BASOPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.03000 0.05000 0.05265 0.07000 0.19000       6

4.18 BASOPHIL_PERCENT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BASOPHIL_PERCENT, main="BASOPHIL_PERCENT (10^3/uL)") 
plot(density(na.omit(bt_pheno$BASOPHIL_PERCENT)), main="BASOPHIL_PERCENT (10^3/uL)")
polygon(density(na.omit(bt_pheno$BASOPHIL_PERCENT)), col = "coral")

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$BASOPHIL_PERCENT <- as.numeric(
#  ifelse(bt_pheno$BASOPHIL_PERCENT > 20, "NA", 
#    bt_pheno$BASOPHIL_PERCENT
#    #ifelse(bt_pheno$BASOPHIL_PERCENT < 10, "NA", bt_pheno$BASOPHIL_PERCENT)
#  )
#)
#plot(density(na.omit(bt_pheno$BASOPHIL_PERCENTE)), main="BASOPHIL_PERCENT (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$BASOPHIL_PERCENTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BASOPHIL_PERCENT)
BNphen 
## orderNorm Transformation with 287 nonmissing obs and ties
##  - 12 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  0.0  0.3  0.4  0.5  1.2
bt_pheno$nBASOPHIL_PERCENT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBASOPHIL_PERCENT)), main="Normalized BASOPHIL_PERCENT") 
polygon(density(na.omit(bt_pheno$nBASOPHIL_PERCENT)), col = "lightblue")

# fit smooth spline for Age against BASOPHIL_PERCENTE
basop_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BASOPHIL_PERCENT"))])

basop_plot <- qplot(AGE, BASOPHIL_PERCENT, data = basop_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
basop_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$BASOPHIL_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.3000  0.4000  0.4098  0.5000  1.2000       6

4.19 EOSINOPHIL

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$EOSINOPHIL, main="EOSINOPHIL (10^3/uL)") 
plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (10^3/uL)")
polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")

# removing
abline(v=3, lwd=1, lty=2)
bt_pheno$EOSINOPHIL <- as.numeric(
  ifelse(bt_pheno$EOSINOPHIL > 3, "NA", 
    bt_pheno$EOSINOPHIL
    #ifelse(bt_pheno$EOSINOPHIL < 10, "NA", bt_pheno$EOSINOPHIL)
  )
)
plot(density(na.omit(bt_pheno$EOSINOPHIL)), main="EOSINOPHIL (>3) outlier pruned")
polygon(density(na.omit(bt_pheno$EOSINOPHIL)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$EOSINOPHIL)
BNphen 
## orderNorm Transformation with 282 nonmissing obs and ties
##  - 100 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 0.02 0.13 0.26 0.52 1.95
bt_pheno$nEOSINOPHIL <- BNphen$x.t
plot(density(na.omit(bt_pheno$nEOSINOPHIL)), main="Normalized EOSINOPHIL") 
polygon(density(na.omit(bt_pheno$nEOSINOPHIL)), col = "lightblue")

# fit smooth spline for Age against EOSINOPHILE
eos_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","EOSINOPHIL"))])

eos_plot <- qplot(AGE, EOSINOPHIL, data = eos_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eos_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$EOSINOPHIL)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0200  0.1300  0.2600  0.3852  0.5200  1.9500      11

4.20 EOSINOPHIL_PERCENT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$EOSINOPHIL_PERCENT, main="EOSINOPHIL_PERCENT (10^3/uL)") 
plot(density(na.omit(bt_pheno$EOSINOPHIL_PERCENT)), main="EOSINOPHIL_PERCENT (10^3/uL)")
polygon(density(na.omit(bt_pheno$EOSINOPHIL_PERCENT)), col = "coral")

# removing
abline(v=25, lwd=1, lty=2)
bt_pheno$EOSINOPHIL_PERCENT <- as.numeric(
  ifelse(bt_pheno$EOSINOPHIL_PERCENT > 25, "NA", 
    bt_pheno$EOSINOPHIL_PERCENT
    #ifelse(bt_pheno$EOSINOPHIL_PERCENT < 10, "NA", bt_pheno$EOSINOPHIL_PERCENT)
  )
)
plot(density(na.omit(bt_pheno$EOSINOPHIL_PERCENT)), main="EOSINOPHIL_PERCENT (>25) outlier pruned")
polygon(density(na.omit(bt_pheno$EOSINOPHIL_PERCENT)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$EOSINOPHIL_PERCENT)
BNphen 
## orderNorm Transformation with 285 nonmissing obs and ties
##  - 80 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  0.2  1.1  2.2  3.8 18.9
bt_pheno$nEOSINOPHIL_PERCENT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nEOSINOPHIL_PERCENT)), main="Normalized EOSINOPHIL_PERCENT") 
polygon(density(na.omit(bt_pheno$nEOSINOPHIL_PERCENT)), col = "lightblue")

# fit smooth spline for Age against EOSINOPHIL_PERCENTE
eosp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","EOSINOPHIL_PERCENT"))])

eosp_plot <- qplot(AGE, EOSINOPHIL_PERCENT, data = eosp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
eosp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$EOSINOPHIL_PERCENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.200   1.100   2.200   3.119   3.800  18.900       8

4.21 PLATELET

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (K/cu mm)") 
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (K/cu mm)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 220 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   44  333  440  548 1186
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET") 
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])

pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$PLATELET)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    44.0   333.0   440.0   456.9   548.0  1186.0      12

4.22 MPV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MPV, main="MPV") 
plot(density(na.omit(bt_pheno$MPV)), main="MPV")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral") 

# removing
#abline(v=20, lwd=1, lty=2)
#bt_pheno$MPV <- as.numeric(
#  ifelse(bt_pheno$MPV > 20, "NA", 
#    bt_pheno$MPV
#    #ifelse(bt_pheno$MPV < 10, "NA", bt_pheno$MPV)
#  )
#)
#plot(density(na.omit(bt_pheno$MPV)), main="MPV (>20) outlier pruned")
#polygon(density(na.omit(bt_pheno$MPV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MPV)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 53 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  6.9  8.2  8.8  9.6 14.0
bt_pheno$nMPV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMPV)), main="Normalized MPV") 
polygon(density(na.omit(bt_pheno$nMPV)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MPV"))])

pla_plot <- qplot(AGE, MPV, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$MPV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   6.900   8.200   8.800   9.012   9.600  14.000      12

4.23 SBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$SBP, main="SBP") 
plot(density(na.omit(bt_pheno$SBP)), main="SBP")
polygon(density(na.omit(bt_pheno$SBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$SBP)
BNphen 
## orderNorm Transformation with 239 nonmissing obs and ties
##  - 49 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   70   90   98  110  140
bt_pheno$nSBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nSBP)), main="Normalized SBP") 
polygon(density(na.omit(bt_pheno$nSBP)), col = "lightblue")

# fit smooth spline for Age against RETICS
sbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","SBP"))])

sbp_plot <- qplot(AGE, SBP, data = sbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
sbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$SBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      70      90      98     100     110     140      54

4.24 DBP

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$DBP, main="DBP") 
plot(density(na.omit(bt_pheno$DBP)), main="DBP")
polygon(density(na.omit(bt_pheno$DBP)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$DBP)
BNphen 
## orderNorm Transformation with 239 nonmissing obs and ties
##  - 47 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##   30   52   60   68   96
bt_pheno$nDBP <- BNphen$x.t
plot(density(na.omit(bt_pheno$nDBP)), main="Normalized DBP") 
polygon(density(na.omit(bt_pheno$nDBP)), col = "lightblue")

# fit smooth spline for Age against DBP
dbp_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","DBP"))])

dbp_plot <- qplot(AGE, DBP, data = dbp_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
dbp_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   30.00   52.00   60.00   60.38   68.00   96.00      54

4.25 BMI

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI") 
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

# removing outliers
abline(v=40, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
  ifelse(bt_pheno$BMI > 40, "NA", 
    bt_pheno$BMI
    #ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
  )
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>40) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen 
## orderNorm Transformation with 281 nonmissing obs and ties
##  - 276 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  8.889 14.284 15.503 16.667 31.267
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI") 
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")

# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])

bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
summary(bt_pheno$DBP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   30.00   52.00   60.00   60.38   68.00   96.00      54

5 Save new pheno to file

head(bt_pheno[, -c(3:4)])
outf <- paste0(
  sub(
    ".xlsx",
    "_new_pheno.tsv",
    params$data
  )
)

write.table(
  bt_pheno,
  outf,
  col.names=T,
  row.names=F,
  quote=F,
  sep="\t"
)