Back

1 Load package

list.of.packages <- c("tidyverse","bestNormalize")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) 
  install.packages(new.packages)
lapply(list.of.packages , require, character.only = T)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE

2 Load phenotype data and extract blood traits with relevant additional columns

sheets <- c("gha-metadata", "hbss", "hbsc", "hbss-sc")
pheno <- readxl::read_xlsx(params$data, sheet = "hbss-sc")

write.table(
  pheno, 
  sub(".xlsx", ".tsv", params$data), 
  col.names = TRUE, 
  row.names = FALSE, 
  sep="\t", 
  quote = FALSE
)
pheno <- read.table( sub(".xlsx", ".tsv", params$data), sep="\t", header = TRUE)
keep_colmns <- c( 
  "FID","IID","AGE","GENDER","SEX","HEIGHT","WEIGHT","BMI",
  "STATUS","RBC","Hb","HCT","MCV","MCH","MCHC","RDW",
  "WBC","LYMPHOCYTE_PERCENT","MONOCYTE_PERCENT",
  "GRANULOCYTE_PERCENT","LYMPHOCYTE","MONOCYTE",
  "GRANULOCYTE","PLATELET","MPV"
)

bt_pheno <- pheno[, which(names(pheno) %in% keep_colmns)]

colmns_to_update <- c( 
  "AGE","SEX","HEIGHT","WEIGHT","BMI",
  "RBC","Hb","HCT","MCV","MCH","MCHC","RDW",
  "WBC","LYMPHOCYTE_PERCENT","MONOCYTE_PERCENT",
  "GRANULOCYTE_PERCENT","LYMPHOCYTE","MONOCYTE",
  "GRANULOCYTE","PLATELET","MPV"
)

set_var_type <- function(x) {
  as.numeric(x)
}

bt_pheno[, c(colmns_to_update)] <- lapply(bt_pheno[, c(colmns_to_update)] , set_var_type)
bt_pheno = bt_pheno[bt_pheno$AGE > 0,]

3 Demographic data

3.1 AGE

summary(bt_pheno$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    6.00   12.00   16.61   24.00   73.00       6
hist(bt_pheno$AGE, main="AGE destribution", xlab="AGE")

#plot(density(na.omit(bt_pheno$AGE)))
#polygon(density(na.omit(bt_pheno$AGE)), col = "coral")
#abline(v=c(min(bt_pheno$AGE), max(bt_pheno$AGE)), lty=2, lwd=1)

3.2 SEX

bt_pheno %>% 
   select(SEX) %>% 
   table() %>% 
   as.data.frame() %>% 
   na.omit() %>%
   mutate(Prop = (Freq/sum(Freq))*100) -> sex       # Sex

sex
colnames(sex) <- c("cat", "freq", "prop")
par(mar=c(4,4,3,4))
barplot(
   sex$prop, 
   ylim=c(0, 60), 
   col=c(2,4)
)
text(
   x=c(0.7,1.9), 
   y=30, 
   labels=c("Female", "Male")
)
mtext(
   text = "Proportion (%)",
   side = 2,
   line = 3
)
mtext(
   text = "Sex",
   side = 1,
   line = 1
)

4 Normalize traits

Definition of outliers removed:

  • Very extreme values that are likely to be experimentation or data capture error.

4.1 RBC

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RBC, main="RBC (million/µL)") 
plot(density(na.omit(bt_pheno$RBC)), main="RBC (million/µL)")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral") 

# removing outliers
abline(v=40, lwd=1, lty=2)
bt_pheno$RBC <- as.numeric(ifelse(bt_pheno$RBC > 40, "NA", bt_pheno$RBC))
plot(density(na.omit(bt_pheno$RBC)), main="RBC (>40) outlier pruned")
polygon(density(na.omit(bt_pheno$RBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RBC)
 
bt_pheno$nRBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRBC)), main="Normalized RBC") 
polygon(density(na.omit(bt_pheno$nRBC)), col = "lightblue")

# fit smooth spline for Age against RBC
rbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RBC"))])

rbc_plot <- qplot(AGE, RBC, data = rbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rbc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.2 RDW

# figures-side,
par(mar = c(4, 4, 2, 2))
hist(bt_pheno$RDW, main="RDW (%)") 
plot(density(na.omit(bt_pheno$RDW)), main="RDW (%)")
polygon(density(na.omit(bt_pheno$RDW)), col = "coral") 

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$RDW <- as.numeric(ifelse(bt_pheno$RDW > 10, "NA", bt_pheno$RDW))
#plot(density(na.omit(bt_pheno$RDW)), main="RDW (>10) outlier pruned")
#polygon(density(na.omit(bt_pheno$RDW)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$RDW)
 
bt_pheno$nRDW <- BNphen$x.t
plot(density(na.omit(bt_pheno$nRDW)), main="Normalized RDW") 
polygon(density(na.omit(bt_pheno$nRDW)), col = "lightblue")

# fit smooth spline for Age against RBC
rdw_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","RDW"))])

rdw_plot <- qplot(AGE, RDW, data = rdw_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
rdw_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.3 Hb

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$Hb, main="Hb (g/dL)", xlab="Hb") 
plot(density(na.omit(bt_pheno$Hb)), main="Hb (g/dL)")
polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

# removing outliers
#abline(v=13, lwd=1, lty=2)
#bt_pheno$Hb <- as.numeric(ifelse(bt_pheno$Hb > 13, "NA", bt_pheno$Hb))
#plot(density(na.omit(bt_pheno$Hb)), main="Hb (>13) outlier pruned")
#polygon(density(na.omit(bt_pheno$Hb)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$Hb)
 
bt_pheno$nHb <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHb)), main="Normalized Hb") 
polygon(density(na.omit(bt_pheno$nHb)), col = "lightblue")

# fit smooth spline for Age against Hb
hb_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","Hb"))])

hb_plot <- qplot(AGE, Hb, data = hb_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
hb_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.4 MCV

par(mar = c(4, 4, 2, 2))
hist(pheno$MCV, main="MCV (fL)") 
plot(density(na.omit(bt_pheno$MCV)), main="MCV (fL)")
polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

# removing outliers
#abline(v=10, lwd=1, lty=2)
#bt_pheno$MCV <- as.numeric(ifelse(bt_pheno$MCV < 10, "NA", bt_pheno$MCV))
#plot(density(na.omit(bt_pheno$MCV)), main="MCV (<10) outlier pruned")
#polygon(density(na.omit(bt_pheno$MCV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCV)
 
bt_pheno$nMCV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCV)), main="Normalized (orderNorm) MCV")
polygon(density(na.omit(bt_pheno$nMCV)), col = "lightblue")

# fit smooth spline for Age against Hb
mcv_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCV"))])

mcv_plot <- qplot(AGE, MCV, data = mcv_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mcv_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.5 HCT

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$HCT, main="HCT (%)") 
plot(density(na.omit(bt_pheno$HCT)), main="HCT (%)")
polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

# removing ourliers
#abline(v=43, lwd=1, lty=2)
#bt_pheno$HCT <- as.numeric(ifelse(bt_pheno$HCT > 43, "NA", bt_pheno$HCT))
#plot(density(na.omit(bt_pheno$HCT)), main="HCT (>43) outlier pruned")
#polygon(density(na.omit(bt_pheno$HCT)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$HCT)
 
bt_pheno$nHCT <- BNphen$x.t
plot(density(na.omit(bt_pheno$nHCT)), main="Normalized HCT") 
polygon(density(na.omit(bt_pheno$nHCT)), col = "lightblue")

# fit smooth spline for Age against Hb
htc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","HCT"))])

htc_plot <- qplot(AGE, HCT, data = htc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
htc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.6 MCHC

#Mean Corpuscular Hemoglobin Concentration

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCHC, main="MCHC (g/dL)") 
plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (g/dL)")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

# removing
abline(v=c(10, 60), lwd=1, lty=2)
bt_pheno$MCHC <- as.numeric(
  ifelse(bt_pheno$MCHC > 60, "NA", 
    #bt_pheno$MCHC
    ifelse(bt_pheno$MCHC < 10, "NA", bt_pheno$MCHC)
  )
)

plot(density(na.omit(bt_pheno$MCHC)), main="MCHC (>60 and <10) outlier pruned")
polygon(density(na.omit(bt_pheno$MCHC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCHC)
 
bt_pheno$nMCHC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCHC)), main="Normalized MCHC") 
polygon(density(na.omit(bt_pheno$nMCHC)), col = "lightblue")

# fit smooth spline for Age against Hb
mchc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCHC"))])

mchc_plot <- qplot(AGE, MCHC, data = mchc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mchc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.7 MCH

#The Mean Corpuscular Hemoglobin

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MCH, main="MCH (pg/cell)") 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (pg/cell)")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

# removing outliers
abline(v=c(60), lwd=1, lty=2)
bt_pheno$MCH <- as.numeric(
 ifelse(bt_pheno$MCH > 60, "NA", 
   bt_pheno$MCH
   #ifelse(bt_pheno$MCH > 50, "NA", bt_pheno$MCH)
 )
)
 
plot(density(na.omit(bt_pheno$MCH)), main="MCH (>60) outlier pruned")
polygon(density(na.omit(bt_pheno$MCH)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MCH)
 
bt_pheno$nMCH <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMCH)), main="Normalized MCH") 
polygon(density(na.omit(bt_pheno$nMCH)), col = "lightblue")

# fit smooth spline for Age against Hb
MCH_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MCH"))])

MCH_plot <- qplot(AGE, MCH, data = MCH_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
MCH_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.8 WBC

Check standard units here

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$WBC, main="WBC (10^9/L)") 
plot(density(na.omit(bt_pheno$WBC)), main="WBC (10^9/L)")
polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

# removing
#abline(v=35, lwd=1, lty=2)
#bt_pheno$WBC <- as.numeric(
#  ifelse(bt_pheno$WBC > 35, "NA", 
#    bt_pheno$WBC
#    #ifelse(bt_pheno$WBC < 10, "NA", bt_pheno$WBC)
#  )
#)

#plot(density(na.omit(bt_pheno$WBC)), main="WBC (>35) outlier pruned")
#polygon(density(na.omit(bt_pheno$WBC)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$WBC)
 
bt_pheno$nWBC <- BNphen$x.t
plot(density(na.omit(bt_pheno$nWBC)), main="Normalized WBC") 
polygon(density(na.omit(bt_pheno$nWBC)), col = "lightblue")

# fit smooth spline for Age against Hb
wbc_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","WBC"))])

wbc_plot <- qplot(AGE, WBC, data = wbc_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
wbc_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.9 LYMPHOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$LYMPHOCYTE, main="LYMPHOCYTE (10^9/L)") 
plot(density(na.omit(bt_pheno$LYMPHOCYTE)), main="LYMPHOCYTE (10^9/L)")
polygon(density(na.omit(bt_pheno$LYMPHOCYTE)), col = "coral") 

BNphen <- bestNormalize::orderNorm(bt_pheno$LYMPHOCYTE)
 
bt_pheno$nLYMPHOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nLYMPHOCYTE)), main="Normalized LYMPHOCYTE") 
polygon(density(na.omit(bt_pheno$nLYMPHOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
lym_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","LYMPHOCYTE"))])

lym_plot <- qplot(AGE, LYMPHOCYTE, data = lym_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
lym_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.10 MONOCYTE

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MONOCYTE, main="MONOCYTE (10^9/L)") 
plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (10^9/L)")
polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

# removing
#abline(v=5, lwd=1, lty=2)
#bt_pheno$MONOCYTE <- as.numeric(
#  ifelse(bt_pheno$MONOCYTE > 5, "NA", 
#    bt_pheno$MONOCYTE
#    #ifelse(bt_pheno$MONOCYTE == 0, "NA", bt_pheno$MONOCYTE)
#  )
#)

#plot(density(na.omit(bt_pheno$MONOCYTE)), main="MONOCYTE (>5) outlier pruned")
#polygon(density(na.omit(bt_pheno$MONOCYTE)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MONOCYTE)
 
bt_pheno$nMONOCYTE <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMONOCYTE)), main="Normalized MONOCYTE") 
polygon(density(na.omit(bt_pheno$nMONOCYTE)), col = "lightblue")

# fit smooth spline for Age against Hb
mon_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MONOCYTE"))])

mon_plot <- qplot(AGE, MONOCYTE, data = mon_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
mon_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.11 PLATELET

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$PLATELET, main="PLATELET (10^9/L)") 
plot(density(na.omit(bt_pheno$PLATELET)), main="PLATELET (10^9/L)")
polygon(density(na.omit(bt_pheno$PLATELET)), col = "coral") 
BNphen <- bestNormalize::orderNorm(bt_pheno$PLATELET)
 
bt_pheno$nPLATELET <- BNphen$x.t
plot(density(na.omit(bt_pheno$nPLATELET)), main="Normalized PLATELET") 
polygon(density(na.omit(bt_pheno$nPLATELET)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","PLATELET"))])

pla_plot <- qplot(AGE, PLATELET, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.12 MPV

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$MPV, main="MPV") 
plot(density(na.omit(bt_pheno$MPV)), main="MPV")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral") 

# removing
abline(v=20, lwd=1, lty=2)
bt_pheno$MPV <- as.numeric(
  ifelse(bt_pheno$MPV > 20, "NA", 
    bt_pheno$MPV
    #ifelse(bt_pheno$MPV < 10, "NA", bt_pheno$MPV)
  )
)
plot(density(na.omit(bt_pheno$MPV)), main="MPV (>20) outlier pruned")
polygon(density(na.omit(bt_pheno$MPV)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$MPV)
 
bt_pheno$nMPV <- BNphen$x.t
plot(density(na.omit(bt_pheno$nMPV)), main="Normalized MPV") 
polygon(density(na.omit(bt_pheno$nMPV)), col = "lightblue")

# fit smooth spline for Age against PLATELET
pla_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","MPV"))])

pla_plot <- qplot(AGE, MPV, data = pla_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
pla_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

4.13 BMI

par(mar = c(4, 4, 2, 2))
hist(bt_pheno$BMI, main="BMI") 
plot(density(na.omit(bt_pheno$BMI)), main="BMI")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

# removing outliers
abline(v=50, lwd=1, lty=2)
bt_pheno$BMI <- as.numeric(
  ifelse(bt_pheno$BMI > 50, "NA", 
    bt_pheno$BMI
    #ifelse(bt_pheno$BMI < 10, "NA", bt_pheno$BMI)
  )
)
plot(density(na.omit(bt_pheno$BMI)), main="BMI (>50) outlier pruned")
polygon(density(na.omit(bt_pheno$BMI)), col = "coral")

BNphen <- bestNormalize::orderNorm(bt_pheno$BMI)
 
bt_pheno$nBMI <- BNphen$x.t
plot(density(na.omit(bt_pheno$nBMI)), main="Normalized BMI") 
polygon(density(na.omit(bt_pheno$nBMI)), col = "lightblue")

# fit smooth spline for Age against BMI
bmi_pheno <- na.omit(bt_pheno[, which(names(bt_pheno) %in% c("FID","GENDER","AGE","BMI"))])

bmi_plot <- qplot(AGE, BMI, data = bmi_pheno, geom='smooth', span =0.5, color = GENDER) + theme_bw()
bmi_plot
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

5 Save new pheno to file

#head(bt_pheno[, -c(3:4)])

sheet = "hbss-sc"

outf <- paste0(
  params$odir,
  "/",
  sub(".xlsx","", basename(params$data)),
  "_",
  sheet,
  "_new_pheno.tsv"
)

write.table(
  bt_pheno,
  outf,
  col.names=T,
  row.names=F,
  quote=F,
  sep="\t"
)