Back

setwd("/home/kesohku1/projects/pheno/gha/")

1 Load package

## Loading required package: tidyverse
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.4     v readr     2.1.5
## v forcats   1.0.0     v stringr   1.5.1
## v ggplot2   4.0.1     v tibble    3.3.0
## v lubridate 1.9.3     v tidyr     1.3.1
## v purrr     1.0.4     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: bestNormalize
## 
## Registered S3 method overwritten by 'butcher':
##   method                 from    
##   as.character.dev_topic generics
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE

2 Load phenotype data and extract blood traits with relevant additional columns

sheets <- c("gha-metadata", "hbss", "hbsc", "hbss-sc")
pheno <- readxl::read_xlsx(params$data, sheet = "hbss-sc")
#names(pheno)

write.table(
  pheno, 
  sub(".xlsx", ".tsv", params$data), 
  col.names = TRUE, 
  row.names = FALSE, 
  sep="\t", 
  quote = FALSE
)

str(pheno)
## tibble [2,108 x 25] (S3: tbl_df/tbl/data.frame)
##  $ FID                : chr [1:2108] "SGA-ACC-01-0004" "SGA-ACC-01-0006" "SGA-ACC-01-0009" "SGA-ACC-01-0011" ...
##  $ IID                : chr [1:2108] "SGA-ACC-01-0004" "SGA-ACC-01-0006" "SGA-ACC-01-0009" "SGA-ACC-01-0011" ...
##  $ AGE                : chr [1:2108] "12" "9" "9" "6" ...
##  $ GENDER             : chr [1:2108] "Female" "Female" "Female" "Male" ...
##  $ SEX                : chr [1:2108] "2" "2" "2" "1" ...
##  $ HEIGHT             : chr [1:2108] "1.36" "1.54" "1.27" "1.32" ...
##  $ WEIGHT             : chr [1:2108] "28.5" "32" "24" "38" ...
##  $ BMI                : chr [1:2108] "15.4087370242215" "13.4930005059875" "14.8800297600595" "21.8089990817264" ...
##  $ STATUS             : chr [1:2108] "SS" "SC" "SS" "SS" ...
##  $ RBC                : chr [1:2108] "2.71" "4.33" "3" "2.85" ...
##  $ Hb                 : num [1:2108] 8.9 11 9.2 7.4 7.4 10.8 10.7 6 11.1 10.6 ...
##  $ HCT                : num [1:2108] 25.8 33 27.4 22.2 22 31.5 31.9 17.6 31 30 ...
##  $ MCV                : chr [1:2108] "95.3" "76.1" "91.4" "78" ...
##  $ MCH                : num [1:2108] 32.8 25.4 30.7 25.9 36.5 29.1 24.7 28.3 25.9 34 ...
##  $ MCHC               : chr [1:2108] "34.5" "33.4" "33.6" "33.2" ...
##  $ RDW                : chr [1:2108] "18.1" "14.1" "14.6" "20.1" ...
##  $ WBC                : chr [1:2108] "8.65" "6.38" "9.15" "16.08" ...
##  $ LYMPHOCYTE_PERCENT : chr [1:2108] "51.3" "59.9" "45" "NA" ...
##  $ MONOCYTE_PERCENT   : chr [1:2108] "6.2" "4.5" "9.7" "2.8" ...
##  $ GRANULOCYTE_PERCENT: chr [1:2108] "NA" "NA" "NA" "NA" ...
##  $ LYMPHOCYTE         : chr [1:2108] "4.44" "3.82" "4.11" "NA" ...
##  $ MONOCYTE           : chr [1:2108] "0.54" "0.29" "0.89" "0.45" ...
##  $ GRANULOCYTE        : chr [1:2108] "NA" "NA" "NA" "NA" ...
##  $ PLATELET           : chr [1:2108] "413" "411" "323" "673" ...
##  $ MPV                : chr [1:2108] "8.6" "7.9" "7.7" "7.6" ...
pheno <- read.table( sub(".xlsx", ".tsv", params$data), sep="\t", header = TRUE)
keep_colmns <- c( 
  "FID","IID","AGE","GENDER","SEX","HEIGHT","WEIGHT","BMI",
  "STATUS","RBC","Hb","HCT","MCV","MCH","MCHC","RDW",
  "WBC","LYMPHOCYTE_PERCENT","MONOCYTE_PERCENT",
  "GRANULOCYTE_PERCENT","LYMPHOCYTE","MONOCYTE",
  "GRANULOCYTE","PLATELET","MPV"
)

bt_pheno <- pheno[, which(names(pheno) %in% keep_colmns)]

colmns_to_update <- c( 
  "AGE","SEX","HEIGHT","WEIGHT","BMI",
  "RBC","Hb","HCT","MCV","MCH","MCHC","RDW",
  "WBC","LYMPHOCYTE_PERCENT","MONOCYTE_PERCENT",
  "GRANULOCYTE_PERCENT","LYMPHOCYTE","MONOCYTE",
  "GRANULOCYTE","PLATELET","MPV"
)

set_var_type <- function(x) {
  as.numeric(x)
}

bt_pheno[, c(colmns_to_update)] <- lapply(bt_pheno[, c(colmns_to_update)] , set_var_type)

#bt_pheno$BMI <- as.numeric((bt_pheno$WEIGHT)/(bt_pheno$HEIGHT)^2)

str(bt_pheno)
## 'data.frame':    2108 obs. of  25 variables:
##  $ FID                : chr  "SGA-ACC-01-0004" "SGA-ACC-01-0006" "SGA-ACC-01-0009" "SGA-ACC-01-0011" ...
##  $ IID                : chr  "SGA-ACC-01-0004" "SGA-ACC-01-0006" "SGA-ACC-01-0009" "SGA-ACC-01-0011" ...
##  $ AGE                : num  12 9 9 6 13 7 12 7 12 10 ...
##  $ GENDER             : chr  "Female" "Female" "Female" "Male" ...
##  $ SEX                : num  2 2 2 1 2 1 1 2 1 2 ...
##  $ HEIGHT             : num  1.36 1.54 1.27 1.32 1.4 ...
##  $ WEIGHT             : num  28.5 32 24 38 31 21.5 37 23 31.5 28 ...
##  $ BMI                : num  15.4 13.5 14.9 21.8 15.8 ...
##  $ STATUS             : chr  "SS" "SC" "SS" "SS" ...
##  $ RBC                : num  2.71 4.33 3 2.85 2.43 3.71 4.02 2.11 4.27 3.13 ...
##  $ Hb                 : num  8.9 11 9.2 7.4 7.4 10.8 10.7 6 11.1 10.6 ...
##  $ HCT                : num  25.8 33 27.4 22.2 22 31.5 31.9 17.6 31 30 ...
##  $ MCV                : num  95.3 76.1 91.4 78 90.5 51.9 79.3 83.1 72.4 96 ...
##  $ MCH                : num  32.8 25.4 30.7 25.9 36.5 29.1 24.7 28.3 25.9 34 ...
##  $ MCHC               : num  34.5 33.4 33.6 33.2 33.7 NA 33.7 34 35.8 35.4 ...
##  $ RDW                : num  18.1 14.1 14.6 20.1 19.9 NA 14.2 29.9 17.2 14.6 ...
##  $ WBC                : num  8.65 6.38 9.15 16.08 8.57 ...
##  $ LYMPHOCYTE_PERCENT : num  51.3 59.9 45 NA 44.3 NA 15.7 56.4 43.7 51.2 ...
##  $ MONOCYTE_PERCENT   : num  6.2 4.5 9.7 2.8 1.2 NA NA 7 6 7.3 ...
##  $ GRANULOCYTE_PERCENT: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LYMPHOCYTE         : num  4.44 3.82 4.11 NA 3.78 NA 3.31 9.46 3.06 3.86 ...
##  $ MONOCYTE           : num  0.54 0.29 0.89 0.45 0.82 NA 0.4 1.17 0.42 0.54 ...
##  $ GRANULOCYTE        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PLATELET           : num  413 411 323 673 347 NA 387 355 274 374 ...
##  $ MPV                : num  8.6 7.9 7.7 7.6 9.7 NA 10.2 8.7 7.9 7.2 ...

3 Demographic data

3.1 AGE

summary(bt_pheno$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    6.00   12.00   16.59   24.00   73.00       6
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE")

#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)

3.2 SEX

bt_pheno %>% 
   select(SEX) %>% 
   table() %>% 
   as.data.frame() %>% 
   na.omit() %>%
   mutate(Prop = (Freq/sum(Freq))*100) -> sex       # Sex

sex
colnames(sex) <- c("cat", "freq", "prop")
par(mar=c(4,4,3,4))
barplot(
   sex$prop, 
   ylim=c(0, 60), 
   col=c(2,4)
)
text(
   x=c(0.7,1.9), 
   y=30, 
   labels=c("Female", "Male")
)
mtext(
   text = "Proportion (%)",
   side = 2,
   line = 3
)
mtext(
   text = "Sex",
   side = 1,
   line = 1
)

4 Describe phenotypes and normalize if needed

Definition of outliers removed:

  • Very extreme values that are likely to be experimentation or data capture error.

4.1 RBC

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/µL)") 
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/µL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral") 

# removing outliers
abline(v=40, lwd=1, lty=2)
bt_pheno$RBC <- as.numeric(ifelse(bt_pheno$RBC > 40, "NA", bt_pheno$RBC))
plot(density(na.omit(bt_pheno$RBC)), main="RBC (>40) outlier pruned")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
## Warning in bestNormalize::orderNorm(bt_pheno$RBC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2106 nonmissing obs and ties
##  - 383 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 0.31 2.56 3.10 3.94 5.95
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC") 
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")

# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])

rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
rbc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$RBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.310   2.560   3.100   3.266   3.940   5.950       2

4.2 RDW

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RDW, main="RDW (%)") 
plot(density(na.omit(bt_pheno$RDW)), main="RDW (%)")
polygon(density(na.omit(bt_pheno$RDW)), col = "coral") 

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$RDW <- as.numeric(ifelse(bt_pheno$RDW > 10, "NA", bt_pheno$RDW))
#plot(density(na.omit(bt_pheno$RDW)), main="RDW (>10) outlier pruned")
#polygon(density(na.omit(bt_pheno$RDW)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RDW)
## Warning in bestNormalize::orderNorm(bt_pheno$RDW): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2016 nonmissing obs and ties
##  - 191 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##   1.7  15.9  18.1  21.0 312.0
bt_pheno$nRDW <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRDW)), main="Normalized RDW") 
polygon(density(na.omit(bt_pheno$nRDW)), col = "lightblue")

# fit smooth spline for Age against RBC
rdw_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RDW"))])

rdw_plot <- qplot(AGE, RDW, data = rdw_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rdw_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$RDW)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.70   15.90   18.10   18.91   21.00  312.00      92

4.3 Hb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb") 
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
## Warning in bestNormalize::orderNorm(bt_pheno$Hb): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2108 nonmissing obs and ties
##  - 101 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  0.1  7.4  8.5 10.1 15.6
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb") 
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])

hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$Hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10    7.40    8.50    8.75   10.10   15.60

4.4 MCV

par(mar = c(4, 4, 2, 2))
hist(pheno$MCV, main="MCV (fL)") 
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV < 10, "NA", bt_pheno$MCV))
#plot(density(na.omit(bt_pheno$MCV)), main="MCV (<10) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
## Warning in bestNormalize::orderNorm(bt_pheno$MCV): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2107 nonmissing obs and ties
##  - 462 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  43.0  71.0  80.0  88.0 124.7
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized (orderNorm) MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")

# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])

mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$MCV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   43.00   71.00   80.00   79.62   88.00  124.70       1

4.5 HCT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT (%)") 
plot(density(na.omit(bt_pheno$HCT)), main="HCT (%)")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HCT <- as.numeric(ifelse(bt_pheno$HCT > 43, "NA", bt_pheno$HCT))
#plot(density(na.omit(bt_pheno$HCT)), main="HCT (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
## Warning in bestNormalize::orderNorm(bt_pheno$HCT): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2108 nonmissing obs and ties
##  - 284 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  2.9 21.1 24.7 28.9 70.0
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT") 
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")

# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])

htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$HCT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.90   21.10   24.70   25.46   28.90   70.00

4.6 MCHC

#Mean Corpuscular Hemoglobin Concentration

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)") 
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

# removing
abline(v=c(10, 60), lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
  ifelse(bt_pheno$MCHC > 60, "NA", 
    #bt_pheno$MCHC
    ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (>60 and <10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
## Warning in bestNormalize::orderNorm(bt_pheno$MCHC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2100 nonmissing obs and ties
##  - 182 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 18.3 32.2 34.5 36.6 51.4
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC") 
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")

# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])

mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$MCHC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.30   32.20   34.50   34.67   36.60   51.40       8

4.7 MCH

#The Mean Corpuscular Hemoglobin

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)") 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

# removing outliers
abline(v=c(60), lwd=1, lty=2)
bt_pheno$MCH <- as.numeric(
 ifelse(bt_pheno$MCH > 60, "NA", 
   bt_pheno$MCH
   #ifelse(bt_pheno$MCH > 50, "NA", bt_pheno$MCH)
 )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MCH)), main="MCH (>60) outlier pruned")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
## Warning in bestNormalize::orderNorm(bt_pheno$MCH): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2107 nonmissing obs and ties
##  - 205 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
## 16.00 24.70 27.20 30.05 58.50
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH") 
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")

# fit smooth spline for Age against Hb
MCH_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])

MCH_plot <- qplot(AGE, MCH, data = MCH_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
MCH_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$MCH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.00   24.70   27.20   27.49   30.05   58.50       1

4.8 WBC

Check standard units here

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (10^9/L)") 
plot(density(na.omit(bt_pheno$WBC)), main="WBC (10^9/L)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

# removing
#abline(v=35, lwd=1, lty=2)
#bt_pheno$WBC <- as.numeric(
#  ifelse(bt_pheno$WBC > 35, "NA", 
#    bt_pheno$WBC
#    #ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
#  )
#)

#plot(density(na.omit(bt_pheno$WBC)), main="WBC (>35) outlier pruned")
#polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$WBC)
## Warning in bestNormalize::orderNorm(bt_pheno$WBC): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2107 nonmissing obs and ties
##  - 924 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  0.0  7.7 10.3 13.5 68.2
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC") 
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")

# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])

wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$WBC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.70   10.30   11.11   13.50   68.20       1

4.9 LYMPHOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE, main="LYMPHOCYTE (10^9/L)") 
plot(density(na.omit(bt_pheno$LYMPHOCYTE)), main="LYMPHOCYTE (10^9/L)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1899 nonmissing obs and ties
##  - 584 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.63  2.88  3.80  5.15 19.90
bt_pheno$nLYMPHOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE)), main="Normalized LYMPHOCYTE") 
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
lym_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE"))])

lym_plot <- qplot(AGE, LYMPHOCYTE, data = lym_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lym_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$LYMPHOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.63    2.88    3.80    4.30    5.15   19.90     209

4.10 MONOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE, main="MONOCYTE (10^9/L)") 
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (10^9/L)")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

# removing
#abline(v=5, lwd=1, lty=2)
#bt_pheno$MONOCYTE <- as.numeric(
#  ifelse(bt_pheno$MONOCYTE > 5, "NA", 
#    bt_pheno$MONOCYTE
#    #ifelse(bt_pheno$MONOCYTE == 0, "NA", bt_pheno$MONOCYTE)
#  )
#)

#plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (>5) outlier pruned")
#polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE)
## Warning in bestNormalize::orderNorm(bt_pheno$MONOCYTE): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1903 nonmissing obs and ties
##  - 249 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##  0.00  0.50  0.70  1.09 60.00
bt_pheno$nMONOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE)), main="Normalized MONOCYTE") 
polygon(density(na.omit(bt_pheno$nMONOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
mon_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE"))])

mon_plot <- qplot(AGE, MONOCYTE, data = mon_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mon_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$MONOCYTE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.500   0.700   0.893   1.090  60.000     205

4.11 PLATELET

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (10^9/L)") 
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (10^9/L)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral") 
BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
## Warning in bestNormalize::orderNorm(bt_pheno$PLATELET): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 2102 nonmissing obs and ties
##  - 587 unique values 
##  - Original quantiles:
##      0%     25%     50%     75%    100% 
##   24.00  258.25  349.00  449.00 1601.00
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET") 
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])

pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$PLATELET)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    24.0   258.2   349.0   365.4   449.0  1601.0       6

4.12 MPV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MPV, main="MPV") 
plot(density(na.omit(bt_pheno$MPV)), main="MPV")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral") 

# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MPV <- as.numeric(
  ifelse(bt_pheno$MPV > 20, "NA", 
    bt_pheno$MPV
    #ifelse(bt_pheno$MPV < 10, "NA", bt_pheno$MPV)
  )
)
## Warning: NAs introduced by coercion
plot(density(na.omit(bt_pheno$MPV)), main="MPV (>20) outlier pruned")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MPV)
## Warning in bestNormalize::orderNorm(bt_pheno$MPV): Ties in data, Normal distribution not guaranteed
BNphen 
## orderNorm Transformation with 1927 nonmissing obs and ties
##  - 75 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
##  0.2  7.6  8.4  9.3 14.0
bt_pheno$nMPV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMPV)), main="Normalized MPV") 
polygon(density(na.omit(bt_pheno$nMPV)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MPV"))])

pla_plot <- qplot(AGE, MPV, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$MPV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.200   7.600   8.400   8.503   9.300  14.000     181

4.13 BMI

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI") 
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

# removing outliers
abline(v=50, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
  ifelse(bt_pheno$BMI > 50, "NA", 
    bt_pheno$BMI
    #ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
  )
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>50) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
BNphen 
## orderNorm Transformation with 2069 nonmissing obs and ties
##  - 1484 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##  8.333 15.151 17.492 21.028 42.999
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI") 
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")

# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])

bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
summary(bt_pheno$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   8.333  15.151  17.492  18.551  21.028  42.999      39

5 Save new pheno to file

#head(bt_pheno[, -c(3:4)])

sheet = "hbss-sc"

outf <- paste0(
  params$odir,
  "/",
  sub(".xlsx","", basename(params$data)),
  "_",
  sheet,
  "_new_pheno.tsv"
)

write.table(
  bt_pheno,
  outf,
  col.names=T,
  row.names=F,
  quote=F,
  sep="\t"
)