Difference between revisions of "GSCAN dbGaP"
Hannah Young (talk | contribs) |
|||
Line 57: | Line 57: | ||
### import genotype data to get family info | ### import genotype data to get family info | ||
− | fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles//ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam") | + | fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam") |
### study doesn't have maternal or paternal ids so generate entire | ### study doesn't have maternal or paternal ids so generate entire | ||
### columns of NAs | ### columns of NAs |
Revision as of 23:45, 6 September 2016
Studies
Framingham
ID Mapping
The following snppit describes how the PLINK genotype file IDs (which use "sampid") were mapped to the dbGaP_Subject_ID contained in the available phenotype files. The snippet only contains one of Joyce Ling's phenotype files for illustration only.
This script and the results it produces can be found in /work/KellerLab/GSCAN/dbGaP/Framingham/ID_mapping.r
### ID mapping file from dbGaP_Subject_ID to SAMPID
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)
### Genotype files
genotype.IDs <- read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")
length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954
x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]
### One of Joyce's phenotype ID files (eventually all phenotypes for all available participants were merged in, but this is a good example.)
phenotypes <- read.table("OffC_Exam_1.txt", header=T, sep="\t")
xx <- merge(x, phenotypes, by="dbGaP_Subject_ID", all=TRUE)
xx <- xx[,c(2:ncol(xx),1)] ## Reorder so the dbGaP_Subject_ID is the last column
write.table(xx, file="framingham_GSCAN_phenotypesCovariates.ped", quote=FALSE, sep="\t", row.names=F)
Phenotypes
(Joyce will update this section)
Genotypes
We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]
ARIC
(Hannah/Joyce to update this section following Framingham as a guide)
ID Mapping
Phenotypes
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",
header=T,
sep="\t")
phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp",
"gender", "v1age01", "anta01", "anta04",
"drnkr01", "hom29", 'hom35', "hom32", 'cigt01',
'evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98',
'cursmk01','forsmk01'))
### To connect sample ids to geneva ids, sample id, geneva id, sample
### site columns taken from
### /work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz
id_map <-read.table("/rc_scratch/hayo0753/id_map.txt", header=TRUE)
### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam")
### study doesn't have maternal or paternal ids so generate entire
### columns of NAs
fam_data$patid <- NA
fam_data$matid <- NA
### set individuals without related participant to "x"
fam_data$V1[fam_data$V1==0] = "x"
### make family data variables more readable
patid <- fam_data$patid
matid <- fam_data$matid
fam_id <- fam_data$V1
### make phenotype covariate variables more readable
gen_id <- phenotypes$geneva_id
race <- phenotypes$racegrp
sex <- phenotypes$gender
age <- phenotypes$v1age01
height <- phenotypes$anta01
weight <- phenotypes$anta04
### put phenotype covariates in table
covariates <- cbind(gen_id,race,sex,age, height, weight)
colnames(covariates) = c("geneva_id", "race", "sex", "age", "height", "weight")
sample_number <- id_map$sampid
geneva_id <- id_map$gen_id
site <- id_map$center
### sample_number and geneva_id for id mapping, site source was taken from to be used as covariate
ids <- cbind(sample_number, geneva_id, site)
ids <- as.data.frame(ids)
colnames(ids) <- c("sample_number", "geneva_id", "site")
### merge ids with covariates
merged <- merge(covariates, ids, by="geneva_id", all=TRUE)
### change to non-drinker =1 , drinker =2
phenotypes$drnkr01 <- as.numeric(as.character(phenotypes$drnkr01))
phenotypes$drnkr01[phenotypes$drnkr01 == 1 ] = 6
phenotypes$drnkr01[phenotypes$drnkr01 == 2 ] = 1
phenotypes$drnkr01[phenotypes$drnkr01 == 3 ] = 1
phenotypes$drnkr01[phenotypes$drnkr01 == 6 ] = 2
phenotypes$drnkr01[phenotypes$drnkr01 == 4 ] = "x"
dnd <- phenotypes$drnkr01
### Age 1st regularly smoked cigarettes (visit 1)
phenotypes$hom29 <- as.character(phenotypes$hom29)
phenotypes$hom29[phenotypes$hom29 == "A" ] = "x"
phenotypes$hom29 <- as.numeric(phenotypes$hom29)
### remove ages older than 35 and younger than 10
phenotypes$hom29[(phenotypes$hom29 > 35) | (phenotypes$hom29 < 10)] = "x"
ai <- phenotypes$hom29
### Overall number of cigarettes per day (visit 1)
### put cpd into bins from analysis plan
phenotypes$hom35 <- as.character(phenotypes$hom35)
phenotypes$hom35[phenotypes$hom35 == "A" ] = "x"
phenotypes$hom35[(phenotypes$hom35 >="61") ] = "x"
phenotypes$hom35 <- as.integer(phenotypes$hom35)
phenotypes$hom35[(phenotypes$hom35 <= 5) & (phenotypes$hom35 >= 1)] = 1
phenotypes$hom35[(phenotypes$hom35 <= 15) & (phenotypes$hom35 >= 6)] = 2
phenotypes$hom35[(phenotypes$hom35 <= 25) & (phenotypes$hom35 >= 16)] = 3
phenotypes$hom35[(phenotypes$hom35 <= 35) & (phenotypes$hom35 >= 26)] = 4
phenotypes$hom35[(phenotypes$hom35 >= 36) & (phenotypes$hom35 <=60)] = 5
cpd <- phenotypes$hom35
phenotypes$dtia96 <- as.integer(as.character(phenotypes$dtia96))
phenotypes$dtia97 <- as.integer(as.character(phenotypes$dtia97))
phenotypes$dtia98 <- as.integer(as.character(phenotypes$dtia98))
### divide wine by 5 to normalize to standard drink
dpw <- (phenotypes$dtia96)/5 + (phenotypes$dtia97) + (phenotypes$dtia98)
dpw <- as.data.frame(dpw)
dpw$dpw <- as.numeric(as.character(dpw$dpw))
dpw$dpw<-dpw[,1]+1
dpw <- log(dpw)
### Number of drinks per week consumed, wine+beer+liquor
phenotypes$evrsmk01 <- as.character(phenotypes$evrsmk01)
phenotypes$evrsmk01[phenotypes$evrsmk01 == "T"] = "x"
phenotypes$evrsmk01 <- as.integer(as.character(phenotypes$evrsmk01))
phenotypes$evrsmk01[phenotypes$evrsmk01 == 1] = 2
phenotypes$evrsmk01[phenotypes$evrsmk01 == 0] = 1
si2 <- phenotypes$evrsmk01
si2 <- as.data.frame(si2)
UsedData <- phenotypes[,17:18]
X <- c()
for(i in 1:nrow(UsedData)){
Delin <- (UsedData[i,])
if(is.na(Delin[,1]) | is.na(Delin[,2])){
X <- append(X,"x")
}
else if (Delin[,1] == 0 & Delin[,2] == 0){
X <- append(X,"x")
}
else if (Delin[,1] == 0 & Delin[,2] == 1){
X <- append(X,1)
}
else if (Delin[,1] == 1 & Delin[,2] == 0){
X <- append(X,2)
}
}
sc2 <- X
library(qpcR)
final_phen <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, cpd, si, sc2, ai, dpw, dnd)
colnames(final_phen) <- c("famid", "iid", "patid", "matid", "sex", "cpd", "si", "sc", "ai", "dpw", "dnd")
final_phens <- replace(final_phen, is.na(final_phen), "x")
write.table(final_phens, "/rc_scratch/hay0753/aricphen2.ped", quote = FALSE, row.names =FALSE)
final_covar <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, age, height, weight)
colnames(final_covar) <- c("famid", "iid", "patid", "matid", "sex", "age", "height", "weight")
final_covars <- replace(final_covar, is.na(final_covar), "x")
final_covars <- as.data.frame(final_covars)
write.table(final_covars, "/rc_scratch/hay0753/ariccovar2.ped", quote = FALSE, row.names=FALSE)
Genotypes
MESA
(Hannah/Joyce to update following Framingham as a guide)
Phenotypes
Description of phenotypes can be found here: Media:MESA phenotypes - FINAL.pdf
eMERGE
(Hannah/Joyce to update following Framingham as a guide)
Phenotypes
Description of phenotypes can be found here: Media:EMERGE.pdf
Stroke
(Hannah/Joyce to update following Framingham as a guide)
Genotype Processing
Pre-Phasing QC
QC parameters that we chose: MAF > 0.01
SNP callrate > 0.95
Missingness per individual > 0.95
HWE = 0.05 / number of markers but greater than 5e-8
To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/
## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/ #!/bin/bash #SBATCH --qos=blanca-ibg #SBATCH --mem=40gb perl HRC-1000G-check-bim.pl -b ARIC_b37_filtered.bim -f ARIC_b37_filtered.frq -r 1000GP_Phase3_combined.legend -g -p EUR
## Phasing using shapeit #!/bin/bash #SBATCH --mem=20gb #SBATCH --time=48:00:00 #SBATCH -o shapeit_aric_%j.out #SBATCH -e shapeit_aric_%j.err #SBATCH --qos blanca-ibgc1 #SBATCH --ntasks-per-node 48 #SBATCH -J shapeit_aric shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48
## To convert the shapeit output into vcf #!/bin/bash #SBATCH --mem=20gb #SBATCH --time=24:00:00 #SBATCH -o shapeit_mesa_%j.out #SBATCH -e shapeit_mesa_%j.err #SBATCH --qos janus #SBATCH --ntasks-per-node 12 #SBATCH -J shapeit_mesa shapeit -convert --input-haps mesa-chr${1}.phased --output-vcf mesa-chr${1}.phased.vcf -T 12
## Imputation #!/bin/bash #SBATCH --mem=30gb #SBATCH --time=72:00:00 #SBATCH -o impute_mesa_%j.out #SBATCH -e impute_mesa_%j.err #SBATCH --qos blanca-ibgc1 #SBATCH --ntasks-per-node 48 #SBATCH -J impute_mesa /work/KellerLab/Zhen/bin/Minimac3/bin/Minimac3 --haps mesa-chr${1}.phased.vcf --cpus 48 --refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz --chr ${1} --noPhoneHome --format GT,DS,GP --allTypedSites --prefix mesa-chr${1}.phased.imputed