This is done to:
- check strand
- check reference overlap
- validate REF/ALT SNP assignments

Only SNPs that overlap the reference panel with the correct strand and allele assignments are retained.
The conform-gt tool from BEAGLE Utils was used for this purpose. The pipeline is written in Nextflow and available on GitHub: esohkevin/ei-imputationservice
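As a rough illustration of what these checks amount to, the sketch below classifies a study SNP against a panel SNP at the same position. The function name and logic are hypothetical, not part of conform-gt; real alignment also has to handle ambiguous A/T and C/G SNPs, which this sketch ignores.

```shell
#!/usr/bin/env bash
# Hypothetical sketch of the per-site checks that allele alignment
# automates: strand, reference overlap, and REF/ALT assignment.

# complement of a single base
comp() {
  case "$1" in A) echo T;; T) echo A;; C) echo G;; G) echo C;; *) echo N;; esac
}

# classify_snp STUDY_REF STUDY_ALT PANEL_REF PANEL_ALT
classify_snp() {
  sref=$1; salt=$2; pref=$3; palt=$4
  if [ "$sref" = "$pref" ] && [ "$salt" = "$palt" ]; then
    echo MATCH         # keep as-is
  elif [ "$sref" = "$palt" ] && [ "$salt" = "$pref" ]; then
    echo SWAP          # REF/ALT swapped relative to the panel
  elif [ "$(comp "$sref")" = "$pref" ] && [ "$(comp "$salt")" = "$palt" ]; then
    echo STRAND_FLIP   # typed on the opposite strand
  else
    echo MISMATCH      # alleles cannot be reconciled: drop the site
  fi
}

classify_snp A G A G   # prints MATCH
classify_snp G A A G   # prints SWAP
classify_snp T C A G   # prints STRAND_FLIP
```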
To impute with the TOPMed and H3Africa panels (hg38), each dataset above is lifted over to hg38 and then realigned to the reference as described above.
From our Nextflow pipeline:
...
script:
if(params.exclude_sample == "NULL")
"""
conform-gt \
gt="${input_vcf}" \
ref="${ref_vcf}" \
chrom=${chrom} \
match=POS \
out="chr${chrom}-aligned"
"""
else
"""
conform-gt \
gt="${input_vcf}" \
ref="${ref_vcf}" \
chrom=${chrom} \
match=POS \
excludesamples="${params.exclude_sample}" \
out="chr${chrom}-aligned"
"""
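Outside Nextflow, the branch above can be sketched as a plain bash helper that appends `excludesamples=` only when an exclusion list is supplied, mirroring the `params.exclude_sample == "NULL"` check. The function prints the command line instead of running conform-gt (a dry run); the function name and arguments are illustrative only.

```shell
#!/usr/bin/env bash
# Dry-run sketch of the conform-gt invocation built by the Nextflow
# branch above. Prints the command instead of executing it.

# build_conform_cmd INPUT_VCF REF_VCF CHROM [EXCLUDE_FILE]
build_conform_cmd() {
  input_vcf=$1; ref_vcf=$2; chrom=$3; exclude=${4:-NULL}
  cmd="conform-gt gt=${input_vcf} ref=${ref_vcf} chrom=${chrom} match=POS"
  # mirror the Nextflow check: only add excludesamples when a file is given
  if [ "$exclude" != "NULL" ]; then
    cmd="${cmd} excludesamples=${exclude}"
  fi
  echo "${cmd} out=chr${chrom}-aligned"
}

build_conform_cmd in.vcf.gz ref.vcf.gz 22
build_conform_cmd in.vcf.gz ref.vcf.gz 22 non.afr.excl
```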
...
$isdir/genemapis.sh \
align \
slurm \
--vcf /data/awonkam1/kesoh/gwas/cmr/raw/vcf/AW2018-2019-for-phasing_ref_fixed.vcf.gz \
--output_prefix AW2018-2019-clean \
--output_dir /data/awonkam1/kesoh/gwas/cmr/ \
--exclude_sample /vast/awonkam1/resources/extras/non.afr.excl \
--threads 4 \
--njobs 25
nextflow \
-c AW2018-2019-clean-align.config \
run esohkevin/ei-imputationservice/alignGenotypesToReference.nf \
-profile slurm,singularity,hg19 \
-w /scratch4/awonkam1/nfworkdirs/gwas/cmr/ \
-r dev \
-resume
$isdir/genemapis.sh \
align \
slurm \
--vcf /data/awonkam1/kesoh/gwas/gha/qc/sicklegen-gha-hg19-pass-qc.vcf.gz \
--output_prefix sicklegen-gha-hg19 \
--output_dir /data/awonkam1/kesoh/gwas/gha/ \
--exclude_sample /vast/awonkam1/resources/extras/non.afr.excl \
--threads 4 \
--njobs 25
nextflow \
-c sicklegen-gha-hg19-align.config \
run esohkevin/ei-imputationservice/alignGenotypesToReference.nf \
-profile slurm,singularity,hg19 \
-w /scratch4/awonkam1/nfworkdirs/gwas/gha/ \
-r dev \
-resume
$isdir/genemapis.sh \
align \
slurm \
--vcf /data/awonkam1/kesoh/gwas/tzn/tz_scd_for_phasing_ref_fixed.vcf.gz \
--output_prefix tz_scd-clean \
--output_dir /data/awonkam1/kesoh/gwas/tzn/ \
--exclude_sample /vast/awonkam1/resources/extras/non.afr.excl \
--threads 4 \
--njobs 25
nextflow \
-c tz_scd-clean-align.config \
run esohkevin/ei-imputationservice/alignGenotypesToReference.nf \
-profile slurm,singularity,hg19 \
-w /scratch4/awonkam1/nfworkdirs/gwas/tzn/ \
-r dev \
-resume
$isdir/genemapis.sh \
align \
slurm \
--vcf /data/awonkam1/kesoh/gwas/sitt/qc/sitt-pass-qc-crossmap-hg19.vcf.gz \
--output_prefix sitt-hg19-clean \
--output_dir /data/awonkam1/kesoh/gwas/sitt/ \
--exclude_sample /vast/awonkam1/resources/extras/non.afr.excl \
--threads 4 \
--njobs 25
nextflow \
-c sitt-hg19-clean-align.config \
run esohkevin/ei-imputationservice/alignGenotypesToReference.nf \
-profile slurm,singularity,hg19 \
-w /scratch4/awonkam1/nfworkdirs/gwas/sitt/ \
-r dev \
-resume
#!/usr/bin/env bash
#SBATCH --account awonkam1_bigmem
#SBATCH --partition bigmem
#SBATCH --nodes 1
#SBATCH --cpus-per-task 48
#SBATCH --time 24:00:00
ml parallel
#mkdir -p /vast/awonkam1/DATA/combined/wgs/hg19/vcf/
crossmap=/vast/awonkam1/containers/sickleinafrica-crossmap-latest.img
bgzip=/home/kesohku1/workflows/genemapngs/bin/bgzip
chain=/vast/awonkam1/resources/extras/hg19ToHg38.over.chain
ref=/vast/awonkam1/resources/hg38/refgenome/hg38.fa
bcftools=/vast/awonkam1/containers/staphb-bcftools-latest.img
plink=/vast/awonkam1/containers/sickleinafrica-plink-latest.img
# EDIT WITH VCF
vcf=/data/awonkam1/kesoh/gwas/tzn/aligned/tz_scd_aligned.vcf.gz
newbuild=hg38
vcfbase=$(basename ${vcf/.vcf.gz/})
outdir=/data/awonkam1/kesoh/gwas/tzn/aligned/
singularity \
exec ${crossmap} \
CrossMap \
vcf \
--chromid l \
${chain} \
${vcf} \
${ref} \
${outdir}${vcfbase}-${newbuild}.vcf
${bgzip} \
-@ ${SLURM_CPUS_PER_TASK} \
-f \
${outdir}${vcfbase}-${newbuild}.vcf
singularity \
run ${bcftools} \
bcftools \
sort \
-T ${outdir}bcftools.XXXX \
-Oz \
-o ${outdir}${vcfbase}-${newbuild}-sorted.vcf.gz \
${outdir}${vcfbase}-${newbuild}.vcf.gz
mv ${outdir}${vcfbase}-${newbuild}-sorted.vcf.gz ${outdir}${vcfbase}-${newbuild}.vcf.gz
singularity \
run ${bcftools} \
bcftools \
index \
-ft \
--threads ${SLURM_CPUS_PER_TASK} \
${outdir}${vcfbase}-${newbuild}.vcf.gz
singularity \
run ${bcftools} \
bcftools \
view \
-v snps \
-m2 -M2 \
--threads ${SLURM_CPUS_PER_TASK} \
-Oz \
-o ${outdir}${vcfbase}-${newbuild}-clean.vcf.gz \
${outdir}${vcfbase}-${newbuild}.vcf.gz
singularity \
run ${bcftools} \
bcftools \
index \
-ft \
--threads ${SLURM_CPUS_PER_TASK} \
${outdir}${vcfbase}-${newbuild}-clean.vcf.gz
for i in {1..22} X; do echo $i; done | \
parallel \
-j 22 \
singularity \
run ${bcftools} \
bcftools \
view \
-v snps \
-r chr{} \
-m2 -M2 \
--threads 4 \
-Oz \
-o ${outdir}/chr{}-${vcfbase}-${newbuild}.vcf.gz \
${outdir}/${vcfbase}-${newbuild}-clean.vcf.gz
for i in {1..22} X; do echo $i; done | \
parallel \
-j 22 \
singularity \
run ${bcftools} \
bcftools \
index \
-ft \
--threads 2 \
${outdir}/chr{}-${vcfbase}-${newbuild}.vcf.gz
echo "Done!"
#sbatch /home/kesohku1/workflows/genemapgwas/scripts/liftover-genomebuild.sh

Six panels are generally used for imputation in our HbF GWAS study, or seven in the case of Cameroon and SITT, which additionally include AGR.
| Panel | Sample Size | % African Ancestry | Number of Variants | Chromosomes Supported | Genome Build | Imputation Service | Link |
|---|---|---|---|---|---|---|---|
| GMP1V2 | 3,407 | 45.9 | 16,535,955 | 1-22 | hg19 | - | This study |
| GMP3V1 | 673 | 99.9 | 46,862,904 | 1-22, X | hg38 | - | This study |
| H3Africa | 4,447 | 50 | 130 million | 1-22 | hg38 | AfriGen-D | https://impute.afrigen-d.org/ |
| TOPMed | 97,256 | 29 | 308,107,085 | 1-22, X | hg38 | TOPMed | https://imputation.biodatacatalyst.nhlbi.nih.gov/ |
| KGP | 2,504 | 26.4 | 49,143,605 | 1-22, X | hg19 | Michigan | https://imputationserver.sph.umich.edu/ |
| CAAPA | 883 | 100 | 41,163,897 | 1-22 | hg19 | Michigan | https://imputationserver.sph.umich.edu/ |
| AGR | 4,956 | 62 | 93,421,145 | 1-22, X | hg19 | Sanger | Retired |
# Phasing
...
script:
"""
shapeit4 \
--input ${input_vcf} \
--reference ${ref_vcf} \
--map ${geneticMap} \
--region ${chrom} \
--thread ${task.cpus} \
--pbwt-depth ${params.pbwt} \
--log chr${chrom}.${params.output_prefix}.shapeit4.log \
--output chr${chrom}.${params.output_prefix}.shapeit4.vcf.gz
"""
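The process above is launched once per chromosome by Nextflow. Outside the pipeline, that fan-out can be sketched as a loop that emits one shapeit4 command per chromosome (a dry run with placeholder file names; the thread count and paths are made up):

```shell
#!/usr/bin/env bash
# Dry-run sketch: print one shapeit4 command per chromosome, as the
# Nextflow process above does. File names and thread count are placeholders.
emit_shapeit_cmds() {
  prefix=$1
  for chrom in $(seq 1 22) X; do
    echo "shapeit4 --input chr${chrom}.vcf.gz" \
         "--reference chr${chrom}.ref.vcf.gz --map chr${chrom}.gmap.gz" \
         "--region ${chrom} --thread 8" \
         "--output chr${chrom}.${prefix}.shapeit4.vcf.gz"
  done
}

emit_shapeit_cmds cmr_scd | wc -l   # 23 commands (chromosomes 1-22 plus X)
```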
...
# Imputation
...
script:
"""
minimac4 \
--all-typed-sites \
--chunk ${params.chunk_size} \
--threads ${task.cpus} \
--format GT,HDS,DS,GP \
--output chr${chrom}.dose.vcf.gz \
--empirical-output chr${chrom}.empiricalDose.vcf.gz \
${msav} \
${target_vcf}
"""
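minimac4's `--chunk` option windows the chromosome internally; the arithmetic it implies can be sketched as fixed-size, non-overlapping regions. The chromosome length and chunk size below are made-up numbers chosen only to keep the output short.

```shell
#!/usr/bin/env bash
# Sketch of fixed-size chunking over a chromosome, illustrating the
# windowing implied by minimac4's --chunk option. Lengths are illustrative.

# print_chunks CHROM LENGTH CHUNK_SIZE
print_chunks() {
  chrom=$1; length=$2; size=$3
  start=1
  while [ "$start" -le "$length" ]; do
    end=$((start + size - 1))
    # clamp the final chunk to the chromosome end
    if [ "$end" -gt "$length" ]; then end=$length; fi
    echo "${chrom}:${start}-${end}"
    start=$((end + 1))
  done
}

print_chunks chr20 100 40
# chr20:1-40
# chr20:41-80
# chr20:81-100
```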
...
## GMP1V2 and GMP3V1
$isdir/genemapis.sh \
phase \
slurm \
--phase_tool shapeit4 \
--impute_tool minimac4 \
--panel gmp1v2 \
--autosome \
--vcf /data/awonkam1/kesoh/gwas/cmr/aligned/combined/cms-aligned.vcf.gz \
--output_prefix cmr_scd \
--output_dir /data/awonkam1/kesoh/gwas/cmr/ \
--threads 12 \
--njobs 25
nextflow \
-c cmr_scd-impute.config \
run esohkevin/ei-imputationservice/phaseAndImputeGenotypes.nf \
-profile slurm,singularity,hg19 \
-w /scratch4/awonkam1/nfworkdirs/gwas/cmr/ \
-r dev \
-resume \
--account awonkam1_bigmem \
--queue bigmem
key_file="$(dirname $0)/topmed_imputation_server_api_token.txt"
test_jq="$(which jq 2>/dev/null)"
if [ -z "${test_jq}" ]; then
jq="$(dirname $0)/jq-linux-amd64"
else
jq=${test_jq}
fi
if [ ! -e "${key_file}" ]; then
echo "API token file 'topmed_imputation_server_api_token.txt' not found!";
exit 1;
else
TOKEN=$(cat "${key_file}");
fi
if [ $# -lt 5 ]; then
RED="\e[31m"
ENDCOLOR="\e[0m"
echo -e """
Usage: submit_to_topmed_imputation_server.sh [vcf directory] [build] [population] [r2Filter <0 - 0.3>] [meta]
vcf directory must contain only the VCF files to be processed
reference build: hg19 | hg38 (the reference build of your VCF files)
population: all
r2Filter: 0 | 0.001 | 0.1 | 0.2 | 0.3 [default: 0]
meta: generate meta-imputation file? (yes/no) [default: no]
NB: The TOPMed reference panel in use is 'apps@topmed-r3'
"""
else
mkdir -p tisjobs
vcfdir=$1
refbuild=$2
pop=$3
filter=$4
meta=$5
# get random string to append to job name
jobid=$(echo $RANDOM | md5sum | head -c 30)
echo """curl https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2/jobs/submit/imputationserver \\
-H \"X-Auth-Token: $TOKEN\" \\
-F \"refpanel=apps@topmed-r3\" \\
-F \"build=${refbuild}\" \\
-F \"meta=${meta}\" \\
-F \"population=${pop}\" \\\
""" > tisjobs/${jobid}.sh
for vcf in $(ls ${vcfdir}/*.vcf.gz | sort -V); do
echo " -F \"files=@${vcf}\" \\";
done >> tisjobs/${jobid}.sh
echo """ -F \"r2Filter=${filter}\"
""" >> tisjobs/${jobid}.sh
chmod 755 tisjobs/${jobid}.sh
echo -e "\nJob created with ID: ${jobid}\n"
sleep 1
echo -e "\nSubmitting the job ./tisjobs/${jobid}.sh\n"
./tisjobs/${jobid}.sh | $jq > ./tisjobs/${jobid}.log
echo ""
job_status=$(cat ./tisjobs/${jobid}.log | $jq '.success')
if [[ "${job_status}" == "false" ]]; then
echo ""
echo "Something went wrong! Your job could not be submitted successfully."
echo "Have you updated your API token?"
echo "Log in to the TOPMed Imputation Server, go to your profile, and check whether your API token has expired."
echo "Revoke the expired token and create a new one."
echo ""
cat ./tisjobs/${jobid}.log | $jq
fi
cat ./tisjobs/${jobid}.log
echo -e "\n\nSee this message in the log file './tisjobs/${jobid}.log'\n"
rm ./tisjobs/${jobid}.sh
fi

mkdir -p vcf
ln -s /data/awonkam1/kesoh/gwas/tzn/aligned/*.vcf.gz vcf/
submit_to_topmed_imputation_server.sh vcf/ hg19 all 0.3 yes
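After submission, progress can be polled through the same API. The sketch below only constructs the poll command (a dry run), since the job ID comes from the submission log and the token from the key file; the `/jobs/{id}/status` path follows the Cloudgene API that the TOPMed server is built on, and should be verified against the server documentation before use.

```shell
#!/usr/bin/env bash
# Dry-run sketch: build the status-poll command for a submitted job.
# Endpoint path is an assumption based on the Cloudgene API; verify it
# against the TOPMed Imputation Server documentation.

# build_status_cmd JOB_ID TOKEN
build_status_cmd() {
  jobid=$1; token=$2
  echo "curl -s -H \"X-Auth-Token: ${token}\" https://imputation.biodatacatalyst.nhlbi.nih.gov/api/v2/jobs/${jobid}/status"
}

# print the command with the token left as a shell variable reference
build_status_cmd job-20240101-123456 "\$TOKEN"
```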