Difference between revisions of "GSCAN dbGaP"

From Vrieze Wiki
Jump to navigation Jump to search
Line 41: Line 41:
  
 
===Phenotypes===
 
===Phenotypes===
###
+
#
 
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t")
 
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t")
  
Line 50: Line 50:
  
  
###to connect sample ids to geneva ids, sample id, geneva id, sample site columns taken from ###/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz
+
#to connect sample ids to geneva ids, sample id, geneva id, sample site columns taken from #/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz
 
id_map <-read.table("/rc_scratch/hayo0753/id_map.txt", head=TRUE)
 
id_map <-read.table("/rc_scratch/hayo0753/id_map.txt", head=TRUE)
  
###import genotype data to get family info
+
#import genotype data to get family info
 
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam")
 
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam")
###study doesn't have maternal or paternal ids so generate entire columns of NAs
+
#study doesn't have maternal or paternal ids so generate entire columns of NAs
 
fam_data$patid <- NA
 
fam_data$patid <- NA
 
fam_data$matid <- NA
 
fam_data$matid <- NA
Line 61: Line 61:
 
fam_data$V1[fam_data$V1==0] = "x"
 
fam_data$V1[fam_data$V1==0] = "x"
  
###make family data variables more readable
+
#make family data variables more readable
 
patid <- fam_data$patid
 
patid <- fam_data$patid
 
matid <- fam_data$matid
 
matid <- fam_data$matid
Line 72: Line 72:
 
height <- phenotypes$anta01
 
height <- phenotypes$anta01
 
weight <- phenotypes$anta04
 
weight <- phenotypes$anta04
###put phenotype covariates in table
+
#put phenotype covariates in table
 
covariates <- cbind(gen_id,race,sex,age, height, weight)
 
covariates <- cbind(gen_id,race,sex,age, height, weight)
 
colnames(covariates) = c("geneva_id", "race", "sex", "age", "height", "weight")
 
colnames(covariates) = c("geneva_id", "race", "sex", "age", "height", "weight")
Line 79: Line 79:
 
geneva_id <- id_map$gen_id
 
geneva_id <- id_map$gen_id
 
site <- id_map$center
 
site <- id_map$center
###sample_number and geneva_id for id mapping, site source was taken from to be used as covariate
+
#sample_number and geneva_id for id mapping, site source was taken from to be used as covariate
 
ids <- cbind(sample_number, geneva_id, site)
 
ids <- cbind(sample_number, geneva_id, site)
 
ids <- as.data.frame(ids)
 
ids <- as.data.frame(ids)
 
colnames(ids) <- c("sample_number", "geneva_id", "site")
 
colnames(ids) <- c("sample_number", "geneva_id", "site")
  
###merge ids with covariates
+
#merge ids with covariates
 
merged <- merge(covariates, ids, by="geneva_id", all=TRUE)
 
merged <- merge(covariates, ids, by="geneva_id", all=TRUE)
  
###change to non-drinker =1 , drinker =2
+
#change to non-drinker =1 , drinker =2
 
phenotypes$drnkr01 <- as.numeric(as.character(phenotypes$drnkr01))
 
phenotypes$drnkr01 <- as.numeric(as.character(phenotypes$drnkr01))
 
phenotypes$drnkr01[phenotypes$drnkr01 == 1 ] = 6
 
phenotypes$drnkr01[phenotypes$drnkr01 == 1 ] = 6
Line 96: Line 96:
 
dnd <- phenotypes$drnkr01
 
dnd <- phenotypes$drnkr01
  
###Age 1st regularly smoked cigarettes (visit 1)
+
#Age 1st regularly smoked cigarettes (visit 1)
 
phenotypes$hom29 <- as.character(phenotypes$hom29)
 
phenotypes$hom29 <- as.character(phenotypes$hom29)
 
phenotypes$hom29[phenotypes$hom29 == "A" ] = "x"
 
phenotypes$hom29[phenotypes$hom29 == "A" ] = "x"
 
phenotypes$hom29 <- as.numeric(phenotypes$hom29)
 
phenotypes$hom29 <- as.numeric(phenotypes$hom29)
###remove ages older than 35 and younger than 10
+
#remove ages older than 35 and younger than 10
 
phenotypes$hom29[(phenotypes$hom29 > 35) | (phenotypes$hom29 < 10)] = "x"
 
phenotypes$hom29[(phenotypes$hom29 > 35) | (phenotypes$hom29 < 10)] = "x"
 
ai <- phenotypes$hom29
 
ai <- phenotypes$hom29
  
###Overall number of cigarettes per day (visit 1)
+
#Overall number of cigarettes per day (visit 1)
###put cpd into bins from analysis plan
+
#put cpd into bins from analysis plan
 
phenotypes$hom35 <- as.character(phenotypes$hom35)
 
phenotypes$hom35 <- as.character(phenotypes$hom35)
 
phenotypes$hom35[phenotypes$hom35 == "A" ] = "x"
 
phenotypes$hom35[phenotypes$hom35 == "A" ] = "x"
Line 122: Line 122:
 
phenotypes$dtia98 <- as.integer(as.character(phenotypes$dtia98))
 
phenotypes$dtia98 <- as.integer(as.character(phenotypes$dtia98))
  
###divide wine by 5 to normalize to standard drink
+
#divide wine by 5 to normalize to standard drink
 
dpw <- (phenotypes$dtia96)/5 + (phenotypes$dtia97) + (phenotypes$dtia98)  
 
dpw <- (phenotypes$dtia96)/5 + (phenotypes$dtia97) + (phenotypes$dtia98)  
 
dpw <- as.data.frame(dpw)
 
dpw <- as.data.frame(dpw)
Line 128: Line 128:
 
dpw$dpw<-dpw[,1]+1
 
dpw$dpw<-dpw[,1]+1
 
dpw <- log(dpw)
 
dpw <- log(dpw)
###Number of drinks per week consumed, wine+beer+liquor
+
#Number of drinks per week consumed, wine+beer+liquor
  
 
phenotypes$evrsmk01 <- as.character(phenotypes$evrsmk01)
 
phenotypes$evrsmk01 <- as.character(phenotypes$evrsmk01)
Line 163: Line 163:
 
final_phen <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, cpd, si, sc2, ai, dpw, dnd)
 
final_phen <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, cpd, si, sc2, ai, dpw, dnd)
 
colnames(final_phen) <- c("famid", "iid", "patid", "matid", "sex", "cpd", "si", "sc", "ai", "dpw", "dnd")
 
colnames(final_phen) <- c("famid", "iid", "patid", "matid", "sex", "cpd", "si", "sc", "ai", "dpw", "dnd")
###change NAs to "x"
+
#change NAs to "x"
 
final_phens <- replace(final_phen, is.na(final_phen), "x")
 
final_phens <- replace(final_phen, is.na(final_phen), "x")
 
write.table(final_phens, "/rc_scratch/hay0753/aricphen2.ped", quote = FALSE, row.names =FALSE)
 
write.table(final_phens, "/rc_scratch/hay0753/aricphen2.ped", quote = FALSE, row.names =FALSE)

Revision as of 18:30, 6 September 2016

Studies

Framingham

ID Mapping

The following snppit describes how the PLINK genotype file IDs (which use "sampid") were mapped to the dbGaP_Subject_ID contained in the available phenotype files. The snippet only contains one of Joyce Ling's phenotype files for illustration only.

This script and the results it produces can be found in /work/KellerLab/GSCAN/dbGaP/Framingham/ID_mapping.r

### ID mapping file from dbGaP_Subject_ID to SAMPID 
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)

### Genotype files
genotype.IDs <-  read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")

length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954 

x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]

### One of Joyce's phenotype ID files (eventually all phenotypes for all available participants were merged in, but this is a good example.)
phenotypes <- read.table("OffC_Exam_1.txt", header=T, sep="\t")

xx <- merge(x, phenotypes, by="dbGaP_Subject_ID", all=TRUE)

xx <- xx[,c(2:ncol(xx),1)] ## Reorder so the dbGaP_Subject_ID is the last column 

write.table(xx, file="framingham_GSCAN_phenotypesCovariates.ped", quote=FALSE, sep="\t", row.names=F)

Phenotypes

(Joyce will update this section)

Genotypes

We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]


ARIC

(Hannah/Joyce to update this section following Framingham as a guide)

ID Mapping

Phenotypes

phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t")

phenotypes <- cbind(phenotypes$geneva_id, phenotypes$racegrp, phenotypes$gender, phenotypes$v1age01, phenotypes$anta01, phenotypes$anta04, phenotypes$drnkr01, phenotypes$hom29, phenotypes$hom35, phenotypes$hom32, phenotypes$cigt01, phenotypes$evrsmk01, phenotypes$dtia90, phenotypes$dtia96, phenotypes$dtia97, phenotypes$dtia98, phenotypes$cursmk01, phenotypes$forsmk01)

phenotypes <- as.data.frame(phenotypes) colnames(phenotypes) <- c("geneva_id", "racegrp", "gender", "v1age01", "anta01", "anta04", "drnkr01", "hom29", "hom35", "hom32", "cigt01", "evrsmk01", "dtia90", "dtia96", "dtia97", "dtia98", "cursmk01", "forsmk01")


  1. to connect sample ids to geneva ids, sample id, geneva id, sample site columns taken from #/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz

id_map <-read.table("/rc_scratch/hayo0753/id_map.txt", head=TRUE)

  1. import genotype data to get family info

fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam")

  1. study doesn't have maternal or paternal ids so generate entire columns of NAs

fam_data$patid <- NA fam_data$matid <- NA

  1. set individuals without related participant to "x"

fam_data$V1[fam_data$V1==0] = "x"

  1. make family data variables more readable

patid <- fam_data$patid matid <- fam_data$matid fam_id <- fam_data$V1

  1. make phenotype covariate variables more readable

gen_id <- phenotypes$geneva_id race <- phenotypes$racegrp sex <- phenotypes$gender age <- phenotypes$v1age01 height <- phenotypes$anta01 weight <- phenotypes$anta04

  1. put phenotype covariates in table

covariates <- cbind(gen_id,race,sex,age, height, weight) colnames(covariates) = c("geneva_id", "race", "sex", "age", "height", "weight")

sample_number <- id_map$sampid geneva_id <- id_map$gen_id site <- id_map$center

  1. sample_number and geneva_id for id mapping, site source was taken from to be used as covariate

ids <- cbind(sample_number, geneva_id, site) ids <- as.data.frame(ids) colnames(ids) <- c("sample_number", "geneva_id", "site")

  1. merge ids with covariates

merged <- merge(covariates, ids, by="geneva_id", all=TRUE)

  1. change to non-drinker =1 , drinker =2

phenotypes$drnkr01 <- as.numeric(as.character(phenotypes$drnkr01)) phenotypes$drnkr01[phenotypes$drnkr01 == 1 ] = 6 phenotypes$drnkr01[phenotypes$drnkr01 == 2 ] = 1 phenotypes$drnkr01[phenotypes$drnkr01 == 3 ] = 1 phenotypes$drnkr01[phenotypes$drnkr01 == 6 ] = 2 phenotypes$drnkr01[phenotypes$drnkr01 == 4 ] = "x" dnd <- phenotypes$drnkr01

  1. Age 1st regularly smoked cigarettes (visit 1)

phenotypes$hom29 <- as.character(phenotypes$hom29) phenotypes$hom29[phenotypes$hom29 == "A" ] = "x" phenotypes$hom29 <- as.numeric(phenotypes$hom29)

  1. remove ages older than 35 and younger than 10

phenotypes$hom29[(phenotypes$hom29 > 35) | (phenotypes$hom29 < 10)] = "x" ai <- phenotypes$hom29

  1. Overall number of cigarettes per day (visit 1)
  2. put cpd into bins from analysis plan

phenotypes$hom35 <- as.character(phenotypes$hom35) phenotypes$hom35[phenotypes$hom35 == "A" ] = "x" phenotypes$hom35[(phenotypes$hom35 >="61") ] = "x" phenotypes$hom35 <- as.integer(phenotypes$hom35) phenotypes$hom35[(phenotypes$hom35 <= 5) & (phenotypes$hom35 >= 1)] = 1 phenotypes$hom35[(phenotypes$hom35 <= 15) & (phenotypes$hom35 >= 6)] = 2 phenotypes$hom35[(phenotypes$hom35 <= 25) & (phenotypes$hom35 >= 16)] = 3 phenotypes$hom35[(phenotypes$hom35 <= 35) & (phenotypes$hom35 >= 26)] = 4 phenotypes$hom35[(phenotypes$hom35 >= 36) & (phenotypes$hom35 <=60)] = 5 cpd <- phenotypes$hom35


phenotypes$dtia96 <- as.integer(as.character(phenotypes$dtia96)) phenotypes$dtia97 <- as.integer(as.character(phenotypes$dtia97)) phenotypes$dtia98 <- as.integer(as.character(phenotypes$dtia98))

  1. divide wine by 5 to normalize to standard drink

dpw <- (phenotypes$dtia96)/5 + (phenotypes$dtia97) + (phenotypes$dtia98) dpw <- as.data.frame(dpw) dpw$dpw <- as.numeric(as.character(dpw$dpw)) dpw$dpw<-dpw[,1]+1 dpw <- log(dpw)

  1. Number of drinks per week consumed, wine+beer+liquor

phenotypes$evrsmk01 <- as.character(phenotypes$evrsmk01) phenotypes$evrsmk01[phenotypes$evrsmk01 == "T"] = "x" phenotypes$evrsmk01 <- as.integer(as.character(phenotypes$evrsmk01)) phenotypes$evrsmk01[phenotypes$evrsmk01 == 1] = 2 phenotypes$evrsmk01[phenotypes$evrsmk01 == 0] = 1 si2 <- phenotypes$evrsmk01 si2 <- as.data.frame(si2)

UsedData <- phenotypes[,17:18] X <- c() for(i in 1:nrow(UsedData)){

 Delin <- (UsedData[i,])
 if(is.na(Delin[,1]) | is.na(Delin[,2])){
   X <- append(X,"x")
 } 
 
 else if (Delin[,1]  == 0 & Delin[,2] == 0){
   X <- append(X,"x")
 }
 
 else if (Delin[,1]  == 0 & Delin[,2] == 1){
   X <- append(X,1)
 }
 
 else if (Delin[,1]  == 1 & Delin[,2] == 0){
   X <- append(X,2)
 }

} sc2 <- X

library(qpcR) final_phen <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, cpd, si, sc2, ai, dpw, dnd) colnames(final_phen) <- c("famid", "iid", "patid", "matid", "sex", "cpd", "si", "sc", "ai", "dpw", "dnd")

  1. change NAs to "x"

final_phens <- replace(final_phen, is.na(final_phen), "x") write.table(final_phens, "/rc_scratch/hay0753/aricphen2.ped", quote = FALSE, row.names =FALSE)

final_covar <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, age, height, weight) colnames(final_covar) <- c("famid", "iid", "patid", "matid", "sex", "age", "height", "weight") final_covars <- replace(final_covar, is.na(final_covar), "x") final_covars <- as.data.frame(final_covars) write.table(final_covars, "/rc_scratch/hay0753/ariccovar2.ped", quote = FALSE, row.names=FALSE)

Genotypes

MESA

(Hannah/Joyce to update following Framingham as a guide)

Phenotypes

Description of phenotypes can be found here: Media:MESA phenotypes - FINAL.pdf


eMERGE

(Hannah/Joyce to update following Framingham as a guide)

Phenotypes

Description of phenotypes can be found here: Media:EMERGE.pdf


Stroke

(Hannah/Joyce to update following Framingham as a guide)

Genotype Processing

Pre-Phasing QC

QC parameters that we chose: MAF > 0.01

SNP callrate > 0.95

Missingness per individual > 0.95

HWE = 0.05 / number of markers but greater than 5e-8

To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/


## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/
#!/bin/bash
#SBATCH --qos=blanca-ibg
#SBATCH --mem=40gb
perl HRC-1000G-check-bim.pl -b ARIC_b37_filtered.bim -f ARIC_b37_filtered.frq -r  1000GP_Phase3_combined.legend -g -p EUR


## Phasing using shapeit
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=48:00:00
#SBATCH -o shapeit_aric_%j.out
#SBATCH -e shapeit_aric_%j.err
#SBATCH --qos blanca-ibgc1
#SBATCH --ntasks-per-node 48
#SBATCH -J shapeit_aric
shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48

## To convert the shapeit output into vcf
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=24:00:00
#SBATCH -o shapeit_mesa_%j.out
#SBATCH -e shapeit_mesa_%j.err
#SBATCH --qos janus
#SBATCH --ntasks-per-node 12
#SBATCH -J shapeit_mesa
shapeit -convert --input-haps mesa-chr${1}.phased --output-vcf mesa-chr${1}.phased.vcf -T 12
## Imputation
#!/bin/bash
#SBATCH --mem=30gb
#SBATCH --time=72:00:00
#SBATCH -o impute_mesa_%j.out
#SBATCH -e impute_mesa_%j.err
#SBATCH --qos blanca-ibgc1
#SBATCH --ntasks-per-node 48
#SBATCH -J impute_mesa

/work/KellerLab/Zhen/bin/Minimac3/bin/Minimac3 --haps mesa-chr${1}.phased.vcf --cpus 48 --refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz --chr ${1} --noPhoneHome --format GT,DS,GP --allTypedSites --prefix mesa-chr${1}.phased.imputed