Difference between revisions of "GSCAN dbGaP"
Hannah Young (talk | contribs) |
|||
(15 intermediate revisions by 2 users not shown) | |||
Line 568: | Line 568: | ||
==ARIC== | ==ARIC== | ||
− | |||
− | |||
===Phenotypes=== | ===Phenotypes=== | ||
Line 852: | Line 850: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
− | |||
− | |||
− | |||
==MESA== | ==MESA== | ||
Line 865: | Line 860: | ||
==eMERGE== | ==eMERGE== | ||
− | |||
===Phenotypes=== | ===Phenotypes=== | ||
− | + | <syntaxhighlight lang="rsplus"> | |
+ | |||
+ | options(stringsAsFactors=F) | ||
+ | |||
+ | |||
+ | ### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, and | ||
+ | |||
+ | |||
+ | emerge.hmb <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) | ||
+ | emerge.hmb.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam", header=FALSE, sep="\t", stringsAsFactors=F) | ||
+ | |||
+ | |||
+ | emerge.hmb.gso.nic <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) | ||
+ | emerge.hmb.gso.nic.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam", header=FALSE, sep="\t", stringsAsFactors=F) | ||
+ | |||
+ | |||
+ | emerge.hmb.gso <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) | ||
+ | emerge.hmb.gso.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam", header=FALSE, sep="\t", stringsAsFactors=F) | ||
+ | |||
+ | |||
+ | ### Merge all files above according to SUBJID, which is used in the | ||
+ | ### genotype files. | ||
+ | |||
+ | emerge <- merge(emerge.hmb, emerge.hmb.gso, all=T) | ||
+ | emerge <- merge(d, emerge.hmb.gso.nic, all=T) | ||
+ | |||
+ | ### SMOKING INITIATION | ||
+ | ### | ||
+ | ### The eMERGE variable name is SMOKING_STATUS | ||
+ | ### C65108 = never smoker | ||
+ | ### C67147 = current smoker | ||
+ | ### C67148 = past smoker | ||
+ | ### C67151 = Unknown if ever smoked | ||
+ | ### | ||
+ | ### Descriptives: | ||
+ | ### | ||
+ | ### table(emerge$SMOKING_STATUS) | ||
+ | ### | ||
+ | ### C65108 C67147 C67148 C67151 | ||
+ | ### 2217 1736 3457 9635 | ||
+ | |||
+ | si <- emerge$SMOKING_STATUS | ||
+ | si[si == "C67147" | si == "C67148"] <- 2 | ||
+ | si[si == "C65108"] <- 1 | ||
+ | si[si != 1 & si != 2] <- NA | ||
+ | |||
+ | ### SMOKING Cessation | ||
+ | ### | ||
+ | ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. | ||
+ | |||
+ | sc <- emerge$SMOKING_STATUS | ||
+ | sc[sc == "C67147"] <- 2 | ||
+ | sc[sc == "C67148"] <- 1 | ||
+ | sc[sc != 1 & sc != 2] <- NA | ||
+ | |||
+ | |||
+ | ### eMERGE age variable is tricky because there is no obvious age at | ||
+ | ### assessment. We will use their "DECADE_BIRTH" as a terrible | ||
+ | ### approximation. | ||
+ | ### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown | ||
+ | ### | ||
+ | ### Descriptives: | ||
+ | ### | ||
+ | ### table(emerge$DECADE_BIRTH) | ||
+ | ### | ||
+ | ### . 1 2 3 4 5 6 7 8 9 99 | ||
+ | ### 6 612 2667 3533 4439 3127 1291 761 490 10 109 | ||
+ | birthyear <- emerge$DECADE_BIRTH | ||
+ | birthyear[birthyear == "99"] <- NA | ||
+ | birthyear[birthyear == "."] <- NA | ||
+ | |||
+ | |||
+ | ### SEX | ||
+ | sex <- emerge$SEX | ||
+ | sex[sex == "C46109"] <- 1 | ||
+ | sex[sex == "C46110"] <- 2 | ||
+ | |||
+ | |||
+ | ### Scott decided not to correct for additional case-control variables | ||
+ | ### given what appears to be a highly complex sample and uncertainty | ||
+ | ### about the best course of action to account for disease status in | ||
+ | ### conducting smoking analyses. | ||
+ | |||
+ | phenotypes <- data.frame(fid = emerge$SUBJID, | ||
+ | iid = emerge$SUBJID, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = sex, | ||
+ | si = si, | ||
+ | sc = sc) | ||
+ | |||
+ | phenotypes[is.na(phenotypes)] <- "x" | ||
+ | |||
+ | write.table(phenotypes, | ||
+ | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped", | ||
+ | row.names=F, | ||
+ | quote = F, | ||
+ | sep="\t") | ||
+ | |||
+ | |||
+ | covariates <- data.frame(fid = emerge$SUBJID, | ||
+ | iid = emerge$SUBJID, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = sex, | ||
+ | birthyear = birthyear) | ||
+ | |||
+ | covariates[is.na(covariates)] <- "x" | ||
+ | |||
+ | write.table(covariates, | ||
+ | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped", | ||
+ | row.names=F, | ||
+ | quote = F, | ||
+ | sep="\t") | ||
+ | </syntaxhighlight> | ||
==Stroke== | ==Stroke== | ||
Line 875: | Line 983: | ||
<syntaxhighlight lang="rsplus"> | <syntaxhighlight lang="rsplus"> | ||
+ | ### Date: Feb 13 2017 | ||
+ | ### Author: Scott Vrieze | ||
− | options( | + | options(stringsAsFactors=F) |
− | ### Load in dataset ### | + | ### Load in dataset ### |
− | ninds <- read.table | + | ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz", |
+ | header = T, sep="\t") | ||
+ | ### The file reads into R incorrectly because of a weird trailing tab | ||
+ | ### character in the data file, so use the below code to shift column | ||
+ | ### names to the correct column. | ||
+ | names(ninds)[1] <- "XXX" | ||
+ | ninds$dbGaP_Subject_ID <- row.names(ninds) | ||
+ | ninds$smokingStatus <- NULL | ||
+ | names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID") | ||
+ | |||
− | ### | + | ### subset the only variables needed |
− | + | pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS")) | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
###————————————————————### | ###————————————————————### | ||
Line 898: | Line 1,011: | ||
### NINDS variable is “smokingStatus” | ### NINDS variable is “smokingStatus” | ||
− | ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. | + | ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. |
− | ### | + | ### |
− | ### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ### | + | ### ’CURRENT' defined as cigarette smoking within last 30 days, |
+ | ### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime | ||
+ | ### but no smoking within the last 30 ### days; ’NEVER' defined as | ||
+ | ### less than 100 cigarettes smoked in one's lifetime. | ||
### table(pheno$smokingStatus) | ### table(pheno$smokingStatus) | ||
### | ### | ||
− | ### CURRENT FORMER NEVER NO UNKNOWN | + | ### CURRENT FORMER NEVER NO UNKNOWN |
− | ### 63 727 1137 2397 4 251 | + | ### 63 727 1137 2397 4 251 |
− | pheno[ | + | si <- pheno$smokingStatus |
− | + | si[si == "CURRENT" | si == "FORMER"] <- 2 | |
+ | si[si == "NEVER" | si == "NO"] <- 1 | ||
+ | si[si != "1" & si != "2"] <- NA | ||
+ | si <- as.numeric(si) | ||
− | ### table( | + | ### table(si) |
### | ### | ||
− | ### 1 2 | + | ### 1 2 |
− | ### 2401 1864 | + | ### 2401 1864 |
− | |||
− | |||
Line 922: | Line 1,039: | ||
### NINDS variable is “smokingStatus” | ### NINDS variable is “smokingStatus” | ||
− | ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. | + | ### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank. |
− | ### | + | ### |
− | ### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as | + | ### ’CURRENT' defined as cigarette smoking within last 30 days, |
+ | ### 'FORMER' defined as more than 100 cigarettes in one's lifetime but | ||
+ | ### no smoking within the last 30 days; ’NEVER' defined as less than | ||
+ | ### 100 cigarettes smoked in one's lifetime. | ||
### table(pheno$smokingStatus) | ### table(pheno$smokingStatus) | ||
### | ### | ||
− | ### CURRENT FORMER NEVER NO UNKNOWN | + | ### CURRENT FORMER NEVER NO UNKNOWN |
− | ### 63 727 1137 2397 4 251 | + | ### 63 727 1137 2397 4 251 |
− | + | sc <- pheno$smokingStatus | |
− | + | sc[sc == "CURRENT"] <- 2 | |
+ | sc[sc == "FORMER"] <- 1 | ||
+ | sc[sc != "1" & sc != "2"] <- NA | ||
+ | sc <- as.numeric(sc) | ||
### table(pheno$V5) | ### table(pheno$V5) | ||
### | ### | ||
− | ### 1 2 | + | ### 1 2 |
− | ### 1137 727 | + | ### 1137 727 |
− | |||
###——————————### | ###——————————### | ||
Line 948: | Line 1,070: | ||
### table(pheno$gender) | ### table(pheno$gender) | ||
### | ### | ||
− | ### F M | + | ### F M |
− | ### 2627 1952 | + | ### 2627 1952 |
+ | sex <- pheno$gender | ||
+ | sex[sex == "M"] <- 1 | ||
+ | sex[sex == "F"] <- 2 | ||
+ | sex <- as.numeric(sex) | ||
− | + | ###----------------### | |
− | + | ### Write to files ### | |
+ | ###----------------### | ||
− | + | ### This study uses the "subject_id" ID field in the genotype files, | |
+ | ### so use that here, instead of the dbGaP_Subject_ID | ||
+ | phenotypes <- data.frame(fid = pheno$subject_id, | ||
+ | iid = pheno$subject_id, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = sex, | ||
+ | si = si, | ||
+ | sc = sc) | ||
− | + | write.table(phenotypes, | |
− | + | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped", | |
− | + | row.names=F, | |
− | + | quote = F, | |
− | + | sep="\t") | |
− | |||
− | |||
− | + | covariates <- data.frame(fid = pheno$subject_id, | |
+ | iid = pheno$subject_id, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = sex, | ||
+ | age = pheno$age, | ||
+ | age2 = pheno$age^2, | ||
+ | AFFECTION_STATUS = pheno$AFFECTION_STATUS) | ||
− | + | write.table(covariates, | |
− | + | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped", | |
+ | row.names=F, | ||
+ | quote = F, | ||
+ | sep="\t") ### save table | ||
+ | </syntaxhighlight> | ||
− | + | == BEAGESS == | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
+ | ===Phenotypes=== | ||
− | + | <syntaxhighlight lang="rsplus"> | |
− | + | options(stringsAsFactors=F) | |
− | |||
− | |||
− | |||
− | + | beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F) | |
− | |||
− | |||
− | |||
− | |||
− | + | genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F) | |
− | |||
− | + | ### The file reads into R incorrectly because of a weird trailing tab | |
+ | ### character in the data file, so use the below code to shift column | ||
+ | ### names to the correct column. | ||
+ | names(beagess)[1] <- "XXX" | ||
+ | beagess$dbGaP_Subject_ID <- row.names(beagess) | ||
+ | beagess$assocEABEvsCO <- NULL | ||
+ | names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID") | ||
− | + | ### SMOKING INITIATION | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | ### SMOKING INITIATION | ||
### | ### | ||
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". | ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". | ||
Line 1,024: | Line 1,144: | ||
### -99 = not consented | ### -99 = not consented | ||
### -9 = Missing | ### -9 = Missing | ||
− | ### 0 = Never | + | ### 0 = Never |
### 1 = Former | ### 1 = Former | ||
### 2 = Current | ### 2 = Current | ||
− | ### | + | ### |
− | ### Descriptives: | + | ### Descriptives: |
### | ### | ||
### > table(beagess_data$BMI) | ### > table(beagess_data$BMI) | ||
− | ### | + | ### |
− | ### -99 -9 0 1 2 | + | ### -99 -9 0 1 2 |
− | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
### | ### | ||
### > summary(beagess_data$BMI) | ### > summary(beagess_data$BMI) | ||
− | ### Min. 1st Qu. Median Mean 3rd Qu. Max. | + | ### Min. 1st Qu. Median Mean 3rd Qu. Max. |
− | ### -99.000 -9.000 0.000 -9.234 1.000 2.000 | + | ### -99.000 -9.000 0.000 -9.234 1.000 2.000 |
− | + | si <- beagess$cig_smk_status | |
− | + | si[si == 1 | si == 2] <- 2 | |
− | + | si[si == 0] <- 1 | |
− | + | si[si != 1 & si != 2] <- NA | |
− | ### SMOKING Cessation | + | ### SMOKING Cessation |
### | ### | ||
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". | ### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI". | ||
Line 1,051: | Line 1,171: | ||
### -99 = not consented | ### -99 = not consented | ||
### -9 = Missing | ### -9 = Missing | ||
− | ### 0 = Never | + | ### 0 = Never |
### 1 = Former | ### 1 = Former | ||
### 2 = Current | ### 2 = Current | ||
− | |||
− | |||
### | ### | ||
− | ### | + | ### Descriptives: |
− | ### | + | ### table(beagess$cig_smk_status) |
− | + | ### -99 -9 0 1 2 | |
− | ### 494 2051 1515 2288 575 | + | ### 494 2051 1515 2288 575 |
### | ### | ||
− | ### | + | ### Current == 2 & Former == 1 in GSCAN. This is already the case for these data. |
− | |||
− | |||
− | + | sc <- beagess$cig_smk_status | |
− | + | sc[sc == 0 | sc == -9 | sc == -99] <- NA | |
− | |||
− | + | ### BEAGESS variable name is "agegpcat" | |
− | |||
− | ### BEAGESS variable name is "agegpcat" | ||
### "Age in 5 year categories" | ### "Age in 5 year categories" | ||
− | ### Response options are integers 1 through 14 or -9. | + | ### Response options are integers 1 through 14 or -9. |
### -9 = Missing | ### -9 = Missing | ||
### 1 = 15-29 years of age | ### 1 = 15-29 years of age | ||
Line 1,091: | Line 1,204: | ||
### 14 = 90-100 years of age | ### 14 = 90-100 years of age | ||
### | ### | ||
− | ### Descriptives: | + | ### Descriptives: |
+ | ### | ||
+ | ### > table(beagess_data$agegpcat) | ||
### | ### | ||
− | + | ### -9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | |
− | + | ### 27 19 48 112 190 378 657 958 1134 1238 1018 794 246 87 17 | |
− | ### -9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | ||
− | ### 27 19 48 112 190 378 657 958 1134 1238 1018 794 246 87 17 | ||
### | ### | ||
− | ### | + | ### This is fine, but change the -9 to NA |
− | + | beagess$agegpcat[beagess$agegpcat == -9] <- NA | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | ### | + | ### looks like BEAGESS uses their own internal "SUBJECT_ID" in the |
− | + | ### genotype files, so we'll use that in our pedigree files | |
− | + | phenotypes <- data.frame(fid = beagess$SUBJECT_ID, | |
+ | iid = beagess$SUBJECT_ID, | ||
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = beagess$sex, | ||
+ | si = si, | ||
+ | sc = sc) | ||
− | + | phenotypes[is.na(phenotypes)] <- "x" | |
− | |||
+ | write.table(phenotypes, | ||
+ | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped", | ||
+ | row.names=F, | ||
+ | quote = F, | ||
+ | sep="\t") | ||
− | |||
− | |||
− | + | covariates <- data.frame(fid = beagess$SUBJECT_ID, | |
− | + | iid = beagess$SUBJECT_ID, | |
+ | patid = "x", | ||
+ | matid = "x", | ||
+ | sex = beagess$sex, | ||
+ | age = beagess$age, | ||
+ | age2 = beagess$age^2, | ||
+ | Barretts.esophagus.case_v_control = beagess$assocBEvsCO, | ||
+ | Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO) | ||
− | + | covariates[is.na(covariates)] <- "x" | |
− | |||
− | + | write.table(covariates, | |
− | + | "/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped", | |
− | write.table( | + | row.names=F, |
+ | quote = F, | ||
+ | sep="\t") ### save table | ||
− | + | </syntaxhighlight> | |
− | |||
− | < | + | == Jackson Heart Study == |
+ | |||
+ | ===Phenotypes=== | ||
+ | <syntaxhighlight lang="rsplus"> | ||
+ | |||
+ | hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F) | ||
+ | npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F) | ||
+ | phens <- rbind(hmb, npu) | ||
+ | hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F) | ||
+ | npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F) | ||
+ | geno <- rbind(hmb_geno, npu_geno) | ||
− | == | + | jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F) |
+ | jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F) | ||
− | + | names(jhs_pcs) [2] <- "SUBJID" | |
+ | names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen") | ||
+ | filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),] | ||
+ | filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),] | ||
− | + | mapped_geno <- merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T) | |
+ | mapped_geno_pcs <- merge(mapped_geno, jhs_pcs, by="SUBJID", all.x=T) | ||
− | + | covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)] | |
− | + | names(covariates) [1] <- "fid" | |
+ | names(covariates) [2] <- "iid" | ||
+ | names(covariates) [6] <- "age" | ||
− | + | ###--------------------### | |
+ | ### Smoking initiation ### | ||
+ | ###--------------------### | ||
+ | ### | ||
+ | ### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" | ||
+ | ### current_smoker_baseline = Current smoking status at first participant visit | ||
+ | ### former_smoker_baseline = Former smoking status at first participant visit | ||
+ | ### | ||
+ | ### Response options are | ||
+ | ### 0 - No | ||
+ | ### 1 - Yes | ||
+ | ### | ||
+ | ### table(si) | ||
+ | ### | ||
+ | ### 0 1 x | ||
+ | ### 1206 537 9 | ||
− | + | current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) | |
− | + | former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID")) | |
+ | N <- nrow(mapped_geno_pcs) | ||
+ | si <- rep(NA, N) | ||
+ | for(i in 1:N){ | ||
+ | if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ | ||
+ | si[i] <- "x" | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){ | ||
+ | si[i] <- "1" | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){ | ||
+ | si[i] <- 2 ### former smokers are coded as 2 | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){ | ||
+ | si[i] <- 2 ### current smokers are coded as 2 | ||
+ | } | ||
+ | } | ||
+ | mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs) | ||
− | |||
− | + | ###--------------------### | |
− | + | ### Smoking Cessation ### | |
+ | ###--------------------### | ||
+ | ### | ||
+ | ### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline" | ||
+ | ### current_smoker_baseline = Current smoking status at first participant visit | ||
+ | ### former_smoker_baseline = Former smoking status at first participant visit | ||
+ | ### | ||
+ | ### Response options are | ||
+ | ### 0 - No | ||
+ | ### 1 - Yes | ||
+ | ### | ||
+ | ### table(sc) | ||
+ | ### | ||
+ | ### 0 1 x | ||
+ | ### 1206 537 9 | ||
− | |||
− | + | current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID")) | |
− | - | + | former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID")) |
+ | N <- nrow(mapped_geno_pcs) | ||
+ | sc <- rep(NA, N) | ||
+ | for(i in 1:N){ | ||
+ | if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){ | ||
+ | sc[i] <- "x" | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){ | ||
+ | sc[i] <- "x" | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){ | ||
+ | sc[i] <- 1 ### former smokers are coded as 1 | ||
+ | } | ||
+ | else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){ | ||
+ | sc[i] <- 2 ### current smokers are coded as 2 | ||
+ | } | ||
+ | } | ||
+ | phenotypes <- cbind(sc, mapped_geno_pcs_phen) | ||
− | + | phenotypes <- phenotypes[c(3,71,72:74,1:2)] | |
+ | names(phenotypes)[1] <- "fid" | ||
− | + | write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F) | |
− | + | write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F) | |
+ | </syntaxhighlight> | ||
=Genotype Processing= | =Genotype Processing= |
Latest revision as of 20:51, 15 February 2017
UK Biobank
More information about the files used for UKBiobank are here. In brief, we used the UK10K + 1kgp3 imputed vcfs provided by UKBionank and added in dosages w/ this python script:
import gzip, argparse, re, os, datetime
from subprocess import Popen, PIPE
def add_dosage(pair):
a, b = pair
probs = b.split(b',')
dose = float(probs[1]) + (float(probs[2]) * 2)
return a + b':' + str(dose).encode('ascii') + b':' + b
def gziplines(fname):
f = Popen(['zcat', fname], stdout=PIPE)
for line in f.stdout:
yield line
parser = argparse.ArgumentParser()
parser.add_argument('inputVCF', help = 'The path to the VCF')
args = parser.parse_args()
flag = False
for line in gziplines(args.inputVCF):
if line.startswith(b'#'):
os.write(1, line.rstrip() + b'\n')
if not flag:
os.write(1, b'##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype Dosages">\n')
os.write(1, b'##Dosages added using the script add.dosages.subprocess.py at ' +
str(datetime.datetime.now()).encode('ascii') + b'\n')
flag = True
else:
elements = re.split(b'\t|:', line.rstrip())
first8 = elements[:8]
genotypes = elements[10:]
form = b'GT:DS:GP'
genotypes_split = zip(genotypes[::2], genotypes[1::2])
try:
dose_genos = [add_dosage(pair) for pair in genotypes_split]
except (ValueError, IndexError) as e:
os.write(2, "\n" + line)
os.write(2, line + "\n" + args.inputVCF + "\n\n")
raise e
os.write(1, b'\t'.join(first8) + b'\t' + form + b'\t' + b'\t'.join(dose_genos) + b'\n')
dbGaP Studies
Framingham
Phenotypes
options(stringsAsFactors=F)
### Offspring cohort
off <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000030.v7.p10.c1.ex1_1s.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")
### Generation 3 cohort
g3 <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000074.v9.p10.c1.ex3_1s.HMB-IRB-MDS.txt.gz"), header=T, skip=10, sep="\t", fill=T)
###---------------------###
### Cigarettes per Day ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable name is "A102".
### "Usual number of cigarettes smoked (now or formerly)"
### Response option is an integer value
###
### table(off$A102)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 1208 50 29 44 26 47 31 16 30 3 175 1 20 1 9 94
### 16 17 18 19 20 22 24 25 26 27 28 30 34 35 40 42
### 5 6 50 1 556 10 3 39 1 1 2 214 1 15 147 2
### 45 50 60 80 88 90
### 4 21 19 1 14 2
###
### summary(off$A102)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 0.00 7.00 12.11 20.00 90.00 8
cpd.off <- off$A102
cpd.off[cpd.off==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.off[cpd.off > 60] <- 60 ### Set those who report smoking >40 cpd to 40
cpd.off[cpd.off <= 5 & cpd.off >= 1] <- 1
cpd.off[cpd.off <= 15 & cpd.off >= 6] <- 2
cpd.off[cpd.off <= 25 & cpd.off >= 16] <- 3
cpd.off[cpd.off <= 35 & cpd.off >= 26] <- 4
cpd.off[cpd.off >= 36 & cpd.off <= 60] <- 5
cpd.off[cpd.off > 60 | is.na(cpd.off)] <- NA
### G3 cohort Exam 1
### Framingham variable name is "G3A074
### "IF EVER SMOKED CIGS REGULARLY: ON THE AVERAGE
### OF THE ENTIRE TIME YOU SMOKED, HOW MANY CIGARETTES
### DID YOU SMOKE PER DAY?"
### Response option is an integer value
###
### table(g3$G3A074)
###
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 3180 57 14 15 20 30 8 13 7 2 122 2 16 3 2 60
### 16 17 18 19 20 21 22 23 24 25 28 30 35 40 45 50
### 3 3 10 1 287 2 1 1 1 12 1 53 1 47 1 12
### 55 60 80
### 1 6 1
###
### summary(g3$G3A074)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### 0.000 0.000 0.000 3.528 0.000 80.000
cpd.g3 <- g3$G3A074
cpd.g3[cpd.g3==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.g3[cpd.g3 > 60] <- 60 ### Set those who report smoking >40 cpd to 40
cpd.g3[cpd.g3 <= 5 & cpd.g3 >= 1] <- 1
cpd.g3[cpd.g3 <= 15 & cpd.g3 >= 6] <- 2
cpd.g3[cpd.g3 <= 25 & cpd.g3 >= 16] <- 3
cpd.g3[cpd.g3 <= 35 & cpd.g3 >= 26] <- 4
cpd.g3[cpd.g3 >= 36 & cpd.g3 <= 60] <- 5
cpd.g3[cpd.g3 > 60 | is.na(cpd.g3)] <- NA
###--------------------###
### Smoking initiation ###
###--------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
### "Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
### 0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
### 0 1 2
### 1203 1126 566
si.off <- off$A99
si.off[si.off==1] <- 2 ### Set current smoker "1" to smoker "2"
si.off[si.off==0] <- 1 ### Set never smoker "0" to nonsmoker "1"
### Former smoker "2" remains as a "2" as smoker
### G3 Cohort Exam 1
### Framingham variables is "G3A070"
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS
### LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A
### LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)"
### Response options:
### "0" is no, "1" is yes, "0" is unknown
###
### table(g3$G3A070)
### 0 1 20
### 2137 1629 1
si.g3 <- g3$G3A070
si.g3[si.g3==20] <- NA
si.g3[si.g3==1] <- 2
si.g3[si.g3==0] <- 1
###---------------------###
### Smoking cessation ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
### Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
### 0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
### 0 1 2
### 1203 1126 566
sc.off <- off$A99
sc.off[sc.off==2] <- 3 ### temporarily set former smoker "2" as "3"
sc.off[sc.off==1] <- 2 ### set current smoker "1" as current smoker "2"
sc.off[sc.off==3] <- 1 ### set former smoker "3" as former smoker "1"
sc.off[sc.off==0] <- NA ### code all non smokers "0" as NA
### G3 Cohort Exam 1
### Framingham variables are “G3A070” and “G3A072”
### Response option is fill-in
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF ###TOBACCO IN A LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)
###“0” is no, “1” is yes, “20” is unknown
### G3A072 - IF EVER SMOKED CIGARETTES REGULARLY: DO YOU NOW SMOKE CIGARETTES (AS OF 1 MONTH AGO)?
###“0” is no or never smoked, “1” is yes
###
### table(g3$G3A070)
### 0 1 20
### 2137 1629 1
###
### table(g3$G3A072)
### 0 1
### 3179 585
sc.g3 <- g3$G3A072
sc.g3[sc.g3==1] <- 2 ## These are our current smokers
sc.g3[sc.g3==0] <- 1
sc.g3 <- ifelse(sc.g3==1 & (g3$G3A070 == 0 | is.na(g3$G3A070)), NA, sc.g3)
###---------------------###
### Age of initiation ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable "A100"
### Age started smoking cigarettes regularly\
### Response option is an integer value, 88 is doesn't smoke regularly
### table(off$A100)
###
### 0 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
### 6 2 2 2 8 4 24 37 61 147 266 233 330 166 164 85 42 26 13 25
### 26 27 28 29 30 31 33 34 35 37 38 39 40 44 45 46 51 88
### 2 8 4 3 7 1 1 1 3 1 2 1 3 1 1 2 1 105
###
ai.off <- off$A100
ai.off[ai.off < 10] <- NA ### Set AI less than 10 as missing\
ai.off[ai.off > 35] <- NA ### Set AI greater than 35 as missing \
### G3 Cohort Exam 1
### Framingham variable "G3A075"
### IF EVER SMOKED CIGARETTES REGULARLY: HOW OLD WERE YOU WHEN YOU FIRST STARTED REGULAR CIGARETTE SMOKING?
### Response option is an integer value, 0 is never smoked
### table(g3$G3A075)
###
### 0 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20
### 2138 1 2 2 3 5 14 52 108 159 152 277 193 256 98 107
### 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
### 51 31 19 13 31 6 4 6 2 13 2 1 1 4 1 4
### 37 38 40 42 46 50
### 2 2 1 1 1 2
###
ai.g3 <- g3$G3A075
ai.g3[ai.g3 < 10] <- NA ### Set AI less than 10 as missing
ai.g3[ai.g3 > 35] <- NA ### Set AI greater than 35 as missing
###-------------------###
### Drinks per week ###
###-------------------###
### Offspring Cohort Exam 1
### Framingham variables "A111", "A112", "A113"
### A111 - Beer (bottles, cans or glasses per week)
### A112 - Wine (glasses per week)
### A113 - Cocktails, highballs, straight drinks (# per week)
### Response options are integer values.
### table(off$A111)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 1452 524 171 131 78 49 125 34 33 4 46 68 21 21 5 13
### 20 21 24 25 26 28 30 35 36 40 42 48 49 50 70 90
### 18 7 48 5 1 6 6 2 2 4 4 6 1 2 1 1
###
### table(off$A112)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 1329 919 218 105 69 41 22 64 25 3 25 11 30 2 2 2
### 20 21 24 28 30 32 50
### 7 4 2 3 1 2 1
###
### table(off$A113)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 861 1094 267 157 96 64 59 65 31 9 49 16 57 13 2 1
### 19 20 21 24 25 28 30 35 38 42 50 55 60 61
### 1 12 10 4 2 2 6 2 1 1 1 1 1 1
###
dpw.off <- rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T)
### The above assumes all drinks are of equal ETOH content
dpw.off[dpw.off == 0] <- NA
dpw.off <- log(dpw.off)
### G3 Cohort Exam 1\
### Framingham variables are "G3A115", "G3A116", "G3A119", "G3A120",
### "G3A123", "G3A124", "G3A127", "G3A128",
### "G3A131", "G3A132"
### G3A115 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A116 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A119 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A120 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A123 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A124 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A127 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A128 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER MONTH OVERTHE PAST YEAR
### G3A131 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A132 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER MONTH OVER THE PAST YEAR
###
### *Participant was allowed to report alcohol consumption in either drinks per week or drinks per
### month. Therefore, to calculate total alcohol consumption, you must use both number of drinks per
### week and number of drinks per month (E.G. DRINKS PER MONTH = SUM(OF (DRINKS PER WEEK)(DRINKS PER
### MONTH/4)).
###
### Response options are integer values.
###
all <- subset(g3, select = c("G3A115", "G3A116", "G3A119", "G3A120", "G3A123", "G3A124", "G3A127", "G3A128",
"G3A131", "G3A132"))
names(all) <- c("beerpw", "beerpm", "wwinepw", "wwinepm", "rwinepw", "rwinepm", "liqpw", "liqpm", "othpw", "othpm")
w <- all[,c(1,3,5,7,9)]
m <- all[,c(2,4,6,8,10)]/4
dpw.tmp <- data.frame(beer = rowSums(cbind(w$beerpw, m$beerpm), na.rm=T),
wwine = rowSums(cbind(w$wwinepw, m$wwinepm), na.rm=T),
rwine = rowSums(cbind(w$rwinepw, m$rwinepm), na.rm=T),
liq = rowSums(cbind(w$liqpw, m$liqpm), na.rm=T),
oth = rowSums(cbind(w$othpw, m$othpm), na.rm=T))
### WHITE WINE - convert from 4oz to 5oz drink
dpw.tmp$wwine <- dpw.tmp$wwine*5/4
### RED WINE - convert from 4oz to 5oz drink
dpw.tmp$rwine <- dpw.tmp$rwine*5/4
### LIQUOR - convert from 1.25 to 1.50 drink
dpw.tmp$liq <- dpw.tmp$liq*1.5/1.25
### OTHER DRINK is an unknown. leave as-is.
dpw.g3 <- rowSums(dpw.tmp, na.rm=T)
### We have a few outliers, draw cuttoff at 70 drinks / week
dpw.g3[dpw.g3 > 70] <- NA
dpw.g3[dpw.g3 == 0] <- NA
dpw.g3 <- log(dpw.g3 + .75)
###---------------------------###
### Drinker vs. non-drinker ###
###---------------------------###
###
### Use the variables already computed for dpw above. Assume
### anyone who is drinking less than 1 drink / month is a non-drinker
x <- subset(off, select = c("A111", "A112", "A113"))
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
index[i] <- 1
} else {
index[i] <- 0
}
}
dnd.off <- ifelse(rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T) == 0, 1, 2)
dnd.off[index==1] <- NA
x <- all
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
index[i] <- 1
} else {
index[i] <- 0
}
}
dnd.g3 <- ifelse(rowSums(all, na.rm=T) == 0, 1, 2)
dnd.g3[index==1] <- NA
###----------------------------###
### Binge drinking in everyone ###
### (Only available for G3) ###
###----------------------------###
### G3 Cohort Exam 1
### Framingham variable is "G3A137"
### IF EVER CONSUMED ALCOHOL: WHAT WAS THE MAXIMUM NUMBER OF
### DRINKS YOU HAD IN A 24 HOUR PERIOD DURING THE PAST MONTH?
### Response option is an integer value
###
### summary(g3$G3A137)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 1.000 3.000 3.566 5.000 50.000 3
bde <- g3$G3A137
sex <- g3$G3A440 ### bde is defined differently for males & females
### sex == 1 is male, sex == 2 is female
tmp <- rep(NA, length(bde)) ### temporary vector to hold results and avoid conflicts
tmp[sex == 1 & bde < 5] <- 1
tmp[sex == 1 & bde >=5] <- 2
tmp[sex == 2 & bde < 4] <- 1
tmp[sex == 2 & bde >=4] <- 2
bde.g3 <- tmp
###-----------------------------------------###
### Binge-drinking (in lifetime drinkers) ###
### (Only available for G3) ###
###-----------------------------------------###
### G3 Cohort Exam 1
### Framingham variable is "G3A112"
### HAVE YOU EVER CONSUMED ALCOHOLIC BEVERAGES (BEER, WINE, LIQUOR/SPIRITS)? \
### 0 is no, 1 is yes
lifednd <- g3$G3A112 ## No missings, so next command has no problem
bdl.g3 <- bde.g3
bdl.g3 <- ifelse(lifednd == 0, NA, bde.g3)
##################
##################
### ###
### Covariates ###
### ###
##################
##################
### HEIGHT
### Offspring Cohort variable is “A51”
### Generation 3 Cohort variable is “G3A446”
height.off <- off$A51
height.g3 <- g3$G3A446
### WEIGHT
### Offspring Cohort variable is “A50”
### Generation 3 Cohort variable is “G3A444”
weight.off <- off$A50
weight.g3 <- g3$G3A444
###-------------###
### Age at exam ###
###-------------###
### Get BIRTHDATE
birth <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000740.v7.p10.c1.birthyr_alls.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")
off.IDs <- off$dbGaP_Subject_ID
g3.IDs <- g3$dbGaP_Subject_ID
birth.off <- subset(birth, dbGaP_Subject_ID %in% off.IDs)
birth.g3 <- subset(birth, dbGaP_Subject_ID %in% g3.IDs)
### True exam dates spanned 1971 - 1975 for offspring
### True exam dates spanned 2002 - 2005 for generation 3
### A good-enough approximation is to take the middle
### year and subtract birth year to get age at exam
### Offspring Cohort
off.age <- data.frame(dbGaP_Subject_ID = birth.off$dbGaP_Subject_ID,
Age = 1973 - birth.off$birthyr)
### Framingham Cohort
g3.age <- data.frame(dbGaP_Subject_ID = birth.g3$dbGaP_Subject_ID,
Age = 2004 - birth.g3$birthyr)
age <- rbind(off.age, g3.age)
####################################
####################################
### ###
### Create Phenotype Data Frames ###
### ###
####################################
####################################
offspring <- data.frame(dbGaP_Subject_ID = off$dbGaP_Subject_ID,
shareid = off$shareid,
cpd = cpd.off,
si = si.off,
sc = sc.off,
ai = ai.off,
dpw = dpw.off,
dnd = dnd.off,
bde = rep(NA, nrow(off)),
bdl = rep(NA, nrow(off)),
height = height.off,
weight = weight.off,
cohort = rep(1, nrow(off)))
generation3 <- data.frame(dbGaP_Subject_ID = g3$dbGaP_Subject_ID,
shareid = g3$shareid,
cpd = cpd.g3,
si = si.g3,
sc = sc.g3,
ai = ai.g3,
dpw = dpw.g3,
dnd = dnd.g3,
bde = bde.g3,
bdl = bdl.g3,
height = height.g3,
weight = weight.g3,
cohort = rep(2, nrow(g3)))
tmp <- rbind(offspring, generation3)
phenotypes <- merge(tmp, age, by="dbGaP_Subject_ID")
phenotypes$age2 <- phenotypes$Age^2
##################
### ID MAPPING ###
##################
### ID mapping file from dbGaP_Subject_ID to SAMPID
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)
### Genotype files
genotype.IDs <- read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")
length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954
x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]
### There are many duplicates, which I can remove because I only want
### a single entry for every SAMPID-SUBJID
x <- x[which(!duplicated(x$dbGaP_Subject_ID)),]
almost.final <- merge(phenotypes, x, by="dbGaP_Subject_ID", all.x=T)
####################################################
### Bring in genetic PCs and ancestry categories ###
####################################################
PCs <- read.table("/work/KellerLab/Zhen/FRAMINGHAM/PCA/FRAMINGHAM_pcs_and_ancestries.txt", header=T)
names(PCs)[2] <- c("SAMPID")
final <- merge(almost.final, PCs, by="SAMPID", all.x=T)
phenotypes.ped <- subset(final, ancestry=="EUR",
select=c("famid", "SAMPID", "patid",
"matid", "sex", "cpd", "ai", "si",
"sc", "dpw", "dnd", "bde", "bdl"))
phenotypes.ped[is.na(phenotypes.ped)] <- "x"
covariates.ped <- subset(final, ancestry=="EUR",
select=c("famid", "SAMPID", "patid",
"matid", "sex", "Age", "age2", "sc",
"height", "weight", "cohort",
"PC1", "PC2", "PC3", "PC4", "PC5",
"PC6", "PC7", "PC8", "PC9", "PC10"))
covariates.ped[is.na(covariates.ped)] <- "x"
write.table(phenotypes.ped, file="Framingham.EUR.phenotypes.ped", quote=F,
col.names=T, row.names=F)
write.table(covariates.ped, file="Framingham.EUR.covariates.ped", quote=F,
col.names=T, row.names=F)
Genotypes
We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]
ARIC
Phenotypes
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t",stringsAsFactors=F)
phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01","anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01','evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01','forsmk01'))
### rename phenotypes to be readable
names(phenotypes)[c(1,2,3,4,5,6)] <- c("geneva_id", "race", "sex", "age", "height", "weight")
### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) )
id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),header=T,sep=",",stringsAsFactors=F)[,c(2,1)]
names(id_map) <- c("SAMPID", "geneva_id")
phenotypes <- merge(phenotypes, id_map, by="geneva_id", all.x=TRUE)
### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy"))
### Replace 0's with "x" for rvTest preferred formatting
fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x"
########################################
###---- Derive GSCAN phenotypes -----###
########################################
### DRINKER VERSUS NON-DRINKER
### ARIC variable name is "drnkr01".
### Combination of "Do you presently drink alcoholic beverages?" and "Have you ever consumed alcoholic beverages?"
### Response option for both questions are "yes" or "no", which are turned into the options below.
### Response options:
### 1 = Current Drinker
### 2 = Former Drinker
### 3 = Never Drinker
### 4 = Unknown
###
### Descriptives:
### table(phenotypes$drnkr01)
### 1 2 3 4
### 7257 2309 3153 6
###
### To obtain GSCAN "DND" collapse across Former and Never Drinkers
### and make "Non-Drinkers". Current Drinkers will be made "Drinkers"
dnd <- phenotypes$drnkr01
dnd[dnd == 1] <- "Current Drinker"
dnd[dnd == 2 | dnd == 3] <- 1
dnd[dnd == "Current Drinker"] <- 2
dnd[dnd == 4 | is.na(dnd)] <- "x"
### AGE OF INITIATION OF SMOKING
###
### ARIC variable name is "hom29".
### "How old were you when you first started regular cigarette smoking?"
### Response option is an integer value.
###
### Descriptives:
###
### > table(phenotypes$hom29)
### 0 1 4 5 6 7 8 9 10 11 12 13 14 15 16 17
### 19 1 2 10 11 15 22 32 65 44 154 187 302 659 941 715
### 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
### 1219 567 703 447 275 129 88 247 56 45 59 22 100 8 34 8
### 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
### 17 51 9 9 10 8 35 3 11 5 5 16 3 4 2 3
### 50 51 52 55 57 59 60 62
### 7 1 2 1 1 2 2 1
###
### > summary(phenotypes$hom29)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 16.00 18.00 18.77 20.00 62.00 5377
###
ai <- phenotypes$hom29
### remove ages older than 35 and younger than 10
ai[ai > 35 | ai < 10 | is.na(ai)] <- "x"
### CIGARETTES PER DAY
### ARIC variable name is "hom35"
### "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
### Response option is integer, or "0" for <1 cigarette per day
###
### > table(phenotypes$hom35)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 46 54 118 187 133 254 146 84 89 17 990 23 106 30 12 520
### 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
### 31 30 62 5 2559 1 5 10 5 193 9 3 7 6 771 3
### 32 33 34 35 36 37 38 40 42 43 45 50 51 54 55 58
### 1 1 1 44 3 1 1 572 1 3 14 72 1 1 3 1
### 60 65 70 75 80 86 90 99
### 100 1 4 2 10 1 1 3
### > summary(phenotypes$hom35)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 10.00 20.00 19.67 24.00 99.00 5420
### Responses are binned in accordance with the GSCAN Analysis Plan.
cpd <- phenotypes$hom35
cpd[cpd <= 5 & cpd >= 1] <- 1
cpd[cpd <= 15 & cpd >= 6] <- 2
cpd[cpd <= 25 & cpd >= 16] <- 3
cpd[cpd <= 35 & cpd >= 26] <- 4
cpd[cpd >= 36 & cpd <= 60] <- 5
cpd[cpd > 60 | is.na(cpd)] <- "x"
### DRINKS PER WEEK
### ARIC variable names are "dtia96", "dtia97", and "dtia98"
### "dtia96" - "How many glasses of wine do you usualy have per week? (4oz. glasses; round down)."
### "dtia97" - "How many bottles of cans or beer do you usualy have per week? (12oz. bottles or cans; round down)."
### "dtia98" - "How many drinks of hard liquor do you usualy have per week? (4oz. glasses; round down)."
### Response option for all three is integer.
###
### Descriptives:
###
### >table(phenotypes$dtia96)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16
### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2
### 17 18 20 21 25 28 30 32 33 35 40
### 1 3 7 5 1 2 3 1 1 1 1
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 0.868 1.000 40.000 5478
###
### >table(phenotypes$dtia97)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28
### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40
### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6
### 42 45 48 49 50 56 60 63 70 72 80 92
### 13 2 6 1 3 2 4 1 1 2 1 1
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.609 2.000 92.000 5474
###
### >table(phenotypes$dtia98)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16
### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2
### 17 18 20 21 25 28 30 32 33 35 40
### 1 3 7 5 1 2 3 1 1 1 1
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 0.868 1.000 40.000 5478
###
### >table(phenotypes$dtia97)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28
### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40
### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6
### 42 45 48 49 50 56 60 63 70 72 80 92
### 13 2 6 1 3 2 4 1 1 2 1 1
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.609 2.000 92.000 5474
###
### >table(phenotypes$dtia98)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4387 735 551 295 239 190 148 138 70 11 151 14 57 3 103 33
### 16 17 18 20 21 24 25 26 27 28 30 32 33 34 35 36
### 9 9 7 36 30 1 10 1 1 9 6 2 1 2 4 1
### 39 40 44 45 47 48 50 51 52 54 55 56 63 64 75 77
### 1 7 1 1 1 2 5 1 1 1 1 3 1 2 1 2
### 90 99
### 1 2
###
### >summary(phenotypes$dtia98)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.227 2.000 99.000 5483
wine <- phenotypes$dtia96 # 1 drink = 4oz
beer <- phenotypes$dtia97 # 1 drink = 12oz
spirits <- phenotypes$dtia98 # 1 drink = 1.5oz
### muliply wine by 4 and divide wine by 5 to normalize to standard drink of 5 oz for wine
### Combine all alcohol types, left-anchor at 1, and log
dpw <- log((wine*4/5 + beer + spirits) + 1)
dpw[is.na(dpw)] <- "x"
### SMOKING INITIATION
### ARIC variable name is "evrsmk01"
### The variable checks answers to "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?".
### Response options are "yes" or "no".
###
### Descriptives:
###
### >table(phenotypes$evrsmk01)
### 0 1
### 5328 7434
###
### >summary(phenotypes$evrsmk01)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.0000 0.0000 1.0000 0.5825 1.0000 1.0000 9
si <- phenotypes$evrsmk01
si[si == 1] <- 2
si[si == 0] <- 1
si[is.na(si)] <- "x"
### SMOKING CESSATION
### ARIC variable names are "cursmk01" and "forsmk01"
### Both varaiables take into account the questions: "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?"
### Response options are "yes" or "no".
### Smoking inititation (si) is coded as "2" for "Smoker" if "yes" to "Have you ever smoked cigarettes?"
### If a subsequent "yes" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "2" for "Current Smoker".
### If a subsequent "no" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "1" for "Former Smoker".
### Smoking inititation (si) is coded as "1" for "Non Smoker" if "no" to "Have you ever smoked cigarettes?"
current.smoker <- subset(phenotypes, select=c("cursmk01"))
former.smoker <- subset(phenotypes, select=c("forsmk01"))
N <- nrow(phenotypes)
sc <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
sc[i] <- NA
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
sc[i] <- NA
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
sc[i] <- 1 ### former smokers are coded as 1
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
sc[i] <- 2 ### current smokers are coded as 2
}
}
sc[is.na(sc)] <- "x"
### Create dataframe with our new GSCAN variables
N <- nrow(phenotypes)
NAs <- rep("x", N)
gscan.phenotypes <- data.frame(SAMPID = phenotypes$SAMPID, famid = NAs,geneva_id = phenotypes$geneva_id,patid = NAs,matid = NAs,sex = ifelse(phenotypes$sex == "M", 1, 2),cpd = cpd,ai = ai,si = si,sc = sc,dnd = dnd,dpw = dpw,age = phenotypes$age,age2 = phenotypes$age^2,height = phenotypes$height,weight = phenotypes$weight,currentformersmoker = sc)
### Merge in the SAMPID, which is used in the genotype files
gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE)
### Reorder phenotype file to make pedigree file consistent with genotype IDs
gscan.phenotypes <- gscan.phenotypes[c(17,2,16,3:15)]
colnames(gscan.phenotypes) [2] <- "SAMPID"
### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file
### [ here read in PCs and merge into phenotype file (probably by the SAMPID) ]
pcs <- read.table("/rc_scratch/hayo0753/aric/aric_ancestry_and_pcs", head=TRUE, stringsAsFactors=F)
colnames(pcs) [1] <- "SAMPID"
pcs <- merge(pcs, gscan.phenotypes, by="SAMPID", all.x=TRUE)
############# PRELIMINARY #####################
### Write to file [NOTE TO HANNAH: will have to be changed once we
### have PCs and ancestry groups identified. PCs will have to be read
### in like with read.table() and we'll have to subset the dataset
### into European and African ancestry, and then write out one
### phenotype and covariate file per ancestry group.
### EUROPEANS
phenotypes.EUR.ped <- subset(pcs, ancestry == "EUR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")
covariates.EUR.ped <- subset(pcs, ancestry == "EUR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.EUR.ped, file="ARIC.EUR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")
phenotypes.AFR.ped <- subset(pcs, ancestry == "AFR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")
covariates.AFR.ped <- subset(pcs, ancestry == "AFR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.AFR.ped, file="ARIC.AFR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")
### Must remove duplicates from all files, use UNIX
### sort ARIC.EUR.phenotypes1.ped | uniq > ARIC.EUR.phenotypes.ped
### sort ARIC.EUR.covariates.ped | uniq > ARIC.EUR.covariates.ped
### sort ARIC.AFR.phenotypes1.ped | uniq > ARIC.AFR.phenotypes.ped
### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped
MESA
(Hannah/Joyce to update following Framingham as a guide)
Phenotypes
Description of phenotypes can be found here: Media:MESA phenotypes - FINAL.pdf
eMERGE
Phenotypes
options(stringsAsFactors=F)
### eMERGE is broken into different consent classes. We can conduct analyses on hmb, hmb-gso-nic, and
emerge.hmb <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c1.MergedSet_Subject_Phenotypes.HMB.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c1.HMB/GenotypeFiles/matrix/c1.HMB/eMerge_660_11212012_c1.fam", header=FALSE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso.nic <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c3.MergedSet_Subject_Phenotypes.HM-B-GSO-NIC.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso.nic.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c3.HM-B-GSO-NIC/GenotypeFiles/matrix/c3.HM-B-GSO-NIC/eMerge_660_11212012_c3.fam", header=FALSE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/PhenotypeFiles/phs000360.v3.pht003255.v2.p1.c4.MergedSet_Subject_Phenotypes.HMB-GSO.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
emerge.hmb.gso.genos <- read.table("/work/KellerLab/dbGaP/eMERGE-MergedSet/PhenoGenotypeFiles/RootStudyConsentSet_phs000360.eMERGE_MergedSet.v3.p1.c4.HMB-GSO/GenotypeFiles/matrix/c4.HMB-GSO/eMerge_660_11212012_c4.fam", header=FALSE, sep="\t", stringsAsFactors=F)
### Merge all files above according to SUBJID, which is used in the
### genotype files.
emerge <- merge(emerge.hmb, emerge.hmb.gso, all=T)
emerge <- merge(d, emerge.hmb.gso.nic, all=T)
### SMOKING INITIATION
###
### The eMERGE variable name is SMOKING_STATUS
### C65108 = never smoker
### C67147 = current smoker
### C67148 = past smoker
### C67151 = Unknown if ever smoked
###
### Descriptives:
###
### table(emerge$SMOKING_STATUS)
###
### C65108 C67147 C67148 C67151
### 2217 1736 3457 9635
si <- emerge$SMOKING_STATUS
si[si == "C67147" | si == "C67148"] <- 2
si[si == "C65108"] <- 1
si[si != 1 & si != 2] <- NA
### SMOKING Cessation
###
### Current == 2 & Former == 1 in GSCAN. This is already the case for these data.
sc <- emerge$SMOKING_STATUS
sc[sc == "C67147"] <- 2
sc[sc == "C67148"] <- 1
sc[sc != 1 & sc != 2] <- NA
### eMERGE age variable is tricky because there is no obvious age at
### assessment. We will use their "DECADE_BIRTH" as a terrible
### approximation.
### 1=1900-1919; 2=1920-1929, 3=1930-1939; 4=1940-1949; 5=1950-1959; 6=Unknown
###
### Descriptives:
###
### table(emerge$DECADE_BIRTH)
###
### . 1 2 3 4 5 6 7 8 9 99
### 6 612 2667 3533 4439 3127 1291 761 490 10 109
birthyear <- emerge$DECADE_BIRTH
birthyear[birthyear == "99"] <- NA
birthyear[birthyear == "."] <- NA
### SEX
sex <- emerge$SEX
sex[sex == "C46109"] <- 1
sex[sex == "C46110"] <- 2
### Scott decided not to correct for additional case-control variables
### given what appears to be a highly complex sample and uncertainty
### about the best course of action to account for disease status in
### conducting smoking analyses.
phenotypes <- data.frame(fid = emerge$SUBJID,
iid = emerge$SUBJID,
patid = "x",
matid = "x",
sex = sex,
si = si,
sc = sc)
phenotypes[is.na(phenotypes)] <- "x"
write.table(phenotypes,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_phenotypes.ped",
row.names=F,
quote = F,
sep="\t")
covariates <- data.frame(fid = emerge$SUBJID,
iid = emerge$SUBJID,
patid = "x",
matid = "x",
sex = sex,
birthyear = birthyear)
covariates[is.na(covariates)] <- "x"
write.table(covariates,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/eMERGE/GSCAN_eMERGE_covariates.ped",
row.names=F,
quote = F,
sep="\t")
Stroke
Phenotypes
### Date: Feb 13 2017
### Author: Scott Vrieze
options(stringsAsFactors=F)
### Load in dataset ###
ninds <- read.table("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz",
header = T, sep="\t")
### The file reads into R incorrectly because of a weird trailing tab
### character in the data file, so use the below code to shift column
### names to the correct column.
names(ninds)[1] <- "XXX"
ninds$dbGaP_Subject_ID <- row.names(ninds)
ninds$smokingStatus <- NULL
names(ninds) <- c(names(ninds)[2:17], "smokingStatus", "dbGaP_Subject_ID")
### subset the only variables needed
pheno <- subset(ninds, select=c("subject_id", "smokingStatus", "age", "gender", "AFFECTION_STATUS"))
###————————————————————###
### SMOKING INITIATION ###
###————————————————————###
### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days,
### 'FORMER' defined as ### more than 100 cigarettes in one's lifetime
### but no smoking within the last 30 ### days; ’NEVER' defined as
### less than 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
### CURRENT FORMER NEVER NO UNKNOWN
### 63 727 1137 2397 4 251
si <- pheno$smokingStatus
si[si == "CURRENT" | si == "FORMER"] <- 2
si[si == "NEVER" | si == "NO"] <- 1
si[si != "1" & si != "2"] <- NA
si <- as.numeric(si)
### table(si)
###
### 1 2
### 2401 1864
###————————————————————-###
### SMOKING CESSATION ###
###————————————————————-###
### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days,
### 'FORMER' defined as more than 100 cigarettes in one's lifetime but
### no smoking within the last 30 days; ’NEVER' defined as less than
### 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
### CURRENT FORMER NEVER NO UNKNOWN
### 63 727 1137 2397 4 251
sc <- pheno$smokingStatus
sc[sc == "CURRENT"] <- 2
sc[sc == "FORMER"] <- 1
sc[sc != "1" & sc != "2"] <- NA
sc <- as.numeric(sc)
### table(pheno$V5)
###
### 1 2
### 1137 727
###——————————###
### GENDER ###
###——————————###
### NINDS variable is “gender”
### Variables are “F” and “M”
### table(pheno$gender)
###
### F M
### 2627 1952
sex <- pheno$gender
sex[sex == "M"] <- 1
sex[sex == "F"] <- 2
sex <- as.numeric(sex)
###----------------###
### Write to files ###
###----------------###
### This study uses the "subject_id" ID field in the genotype files,
### so use that here, instead of the dbGaP_Subject_ID
phenotypes <- data.frame(fid = pheno$subject_id,
iid = pheno$subject_id,
patid = "x",
matid = "x",
sex = sex,
si = si,
sc = sc)
write.table(phenotypes,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_phenotypes.ped",
row.names=F,
quote = F,
sep="\t")
covariates <- data.frame(fid = pheno$subject_id,
iid = pheno$subject_id,
patid = "x",
matid = "x",
sex = sex,
age = pheno$age,
age2 = pheno$age^2,
AFFECTION_STATUS = pheno$AFFECTION_STATUS)
write.table(covariates,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/Stroke/GSCAN_STROKE_covariates.ped",
row.names=F,
quote = F,
sep="\t") ### save table
BEAGESS
Phenotypes
options(stringsAsFactors=F)
beagess <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
genos <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)
### The file reads into R incorrectly because of a weird trailing tab
### character in the data file, so use the below code to shift column
### names to the correct column.
names(beagess)[1] <- "XXX"
beagess$dbGaP_Subject_ID <- row.names(beagess)
beagess$assocEABEvsCO <- NULL
names(beagess) <- c(names(beagess)[2:11], "assocEABEvsCO", "dbGaP_Subject_ID")
### SMOKING INITIATION
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
### "Cigarette smoking status."
### Response options are "-99", "-9", "0", "1" and "2".
### -99 = not consented
### -9 = Missing
### 0 = Never
### 1 = Former
### 2 = Current
###
### Descriptives:
###
### > table(beagess_data$BMI)
###
### -99 -9 0 1 2
### 494 2051 1515 2288 575
###
### > summary(beagess_data$BMI)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### -99.000 -9.000 0.000 -9.234 1.000 2.000
si <- beagess$cig_smk_status
si[si == 1 | si == 2] <- 2
si[si == 0] <- 1
si[si != 1 & si != 2] <- NA
### SMOKING Cessation
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
### "Cigarette smoking status."
### Response options are "-99", "-9", "0", "1" and "2".
### -99 = not consented
### -9 = Missing
### 0 = Never
### 1 = Former
### 2 = Current
###
### Descriptives:
### table(beagess$cig_smk_status)
### -99 -9 0 1 2
### 494 2051 1515 2288 575
###
### Current == 2 & Former == 1 in GSCAN. This is already the case for these data.
sc <- beagess$cig_smk_status
sc[sc == 0 | sc == -9 | sc == -99] <- NA
### BEAGESS variable name is "agegpcat"
### "Age in 5 year categories"
### Response options are integers 1 through 14 or -9.
### -9 = Missing
### 1 = 15-29 years of age
### 2 = 30-34 years of age
### 3 = 35-39 years of age
### 4 = 40-44 years of age
### 5 = 45-49 years of age
### 6 = 50-54 years of age
### 7 = 55-59 years of age
### 8 = 60-64 years of age
### 9 = 65-69 years of age
### 10 = 70-74 years of age
### 11 = 75-79 years of age
### 12 = 80-84 years of age
### 13 = 85-89 years of age
### 14 = 90-100 years of age
###
### Descriptives:
###
### > table(beagess_data$agegpcat)
###
### -9 1 2 3 4 5 6 7 8 9 10 11 12 13 14
### 27 19 48 112 190 378 657 958 1134 1238 1018 794 246 87 17
###
### This is fine, but change the -9 to NA
beagess$agegpcat[beagess$agegpcat == -9] <- NA
### looks like BEAGESS uses their own internal "SUBJECT_ID" in the
### genotype files, so we'll use that in our pedigree files
phenotypes <- data.frame(fid = beagess$SUBJECT_ID,
iid = beagess$SUBJECT_ID,
patid = "x",
matid = "x",
sex = beagess$sex,
si = si,
sc = sc)
phenotypes[is.na(phenotypes)] <- "x"
write.table(phenotypes,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_phenotypes.ped",
row.names=F,
quote = F,
sep="\t")
covariates <- data.frame(fid = beagess$SUBJECT_ID,
iid = beagess$SUBJECT_ID,
patid = "x",
matid = "x",
sex = beagess$sex,
age = beagess$age,
age2 = beagess$age^2,
Barretts.esophagus.case_v_control = beagess$assocBEvsCO,
Esophageal.adenocarcinoma.case_v_control = beagess$assocEAvsCO)
covariates[is.na(covariates)] <- "x"
write.table(covariates,
"/work/KellerLab/vrieze/GSCAN/GWAS/summary_stats_generated_internally/BEAGESS/GSCAN_BEAGESS_covariates.ped",
row.names=F,
quote = F,
sep="\t") ### save table
Jackson Heart Study
Phenotypes
hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F)
npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F)
phens <- rbind(hmb, npu)
hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F)
npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F)
geno <- rbind(hmb_geno, npu_geno)
jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F)
jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F)
names(jhs_pcs) [2] <- "SUBJID"
names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen")
filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),]
filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),]
mapped_geno <- merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T)
mapped_geno_pcs <- merge(mapped_geno, jhs_pcs, by="SUBJID", all.x=T)
covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)]
names(covariates) [1] <- "fid"
names(covariates) [2] <- "iid"
names(covariates) [6] <- "age"
###--------------------###
### Smoking initiation ###
###--------------------###
###
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
### current_smoker_baseline = Current smoking status at first participant visit
### former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
### 0 - No
### 1 - Yes
###
### table(si)
###
### 0 1 x
### 1206 537 9
current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID"))
N <- nrow(mapped_geno_pcs)
si <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
si[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
si[i] <- "1"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
si[i] <- 2 ### former smokers are coded as 2
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
si[i] <- 2 ### current smokers are coded as 2
}
}
mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs)
###--------------------###
### Smoking Cessation ###
###--------------------###
###
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
### current_smoker_baseline = Current smoking status at first participant visit
### former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
### 0 - No
### 1 - Yes
###
### table(sc)
###
### 0 1 x
### 1206 537 9
current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID"))
N <- nrow(mapped_geno_pcs)
sc <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
sc[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
sc[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
sc[i] <- 1 ### former smokers are coded as 1
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
sc[i] <- 2 ### current smokers are coded as 2
}
}
phenotypes <- cbind(sc, mapped_geno_pcs_phen)
phenotypes <- phenotypes[c(3,71,72:74,1:2)]
names(phenotypes)[1] <- "fid"
write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F)
write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F)
Genotype Processing
Pre-Phasing QC
QC parameters that we chose: MAF > 0.01
SNP callrate > 0.95
Missingness per individual > 0.95
HWE = 0.05 / number of markers but greater than 5e-8
## To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/
## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/
perl HRC-1000G-check-bim.pl \
-b ARIC_b37_filtered.bim \
-f ARIC_b37_filtered.frq \
-r 1000GP_Phase3_combined.legend \
-g \
-p EUR
## Phasing using shapeit (ARIC used as an example)
shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48
## To convert the shapeit output into vcf
shapeit -convert --input-haps mesa-chr${1}.phased \
--output-vcf mesa-chr${1}.phased.vcf \
-T 12
## Imputation
Minimac3-omp3 --haps mesa-chr${1}.phased.vcf \
--cpus 48
--refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz \
--chr ${1} \
--noPhoneHome \
--format GT,DS,GP \
--allTypedSites \
--prefix mesa-chr${1}.phased.imputed