Difference between revisions of "GSCAN dbGaP"

From Vrieze Wiki
Jump to navigation Jump to search
Line 43: Line 43:
 
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",
 
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",
 
                         header=T,
 
                         header=T,
                         sep="\t")
+
                         sep="\t",
phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp",
+
                        stringsAsFactors=F)
                    "gender", "v1age01", "anta01", "anta04",
 
                    "drnkr01", "hom29", 'hom35', "hom32", 'cigt01',
 
                    'evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98',
 
                    'cursmk01','forsmk01'))
 
  
### To connect sample ids to geneva ids, sample id, geneva id, sample
+
phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01",
### site columns taken from
+
                        "anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01',
### /work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz
+
                        'evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01',
 +
                        'forsmk01'))
  
id_map <-read.table("/rc_scratch/hayo0753/id_map.txt", header=TRUE)
+
### rename phenotypes to be readable
 +
names(phenotypes)[c(1,2,3,4,5,6)] <- c("gen_id", "race", "sex", "age", "height", "weight")
 +
 
 +
### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) )
 +
id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),
 +
                    header=T,
 +
                    sep=",",
 +
                    stringsAsFactors=F)[,c(2,1)]
 +
names(id_map) <- c("SAMPID", "geneva_id")
  
 
### import genotype data to get family info
 
### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam")
+
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy"))
### study doesn't have maternal or paternal ids so generate entire
+
 
### columns of NAs
+
### Replace 0's with "x" for rvTest preferred formatting
fam_data$patid <- NA
+
fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x"
fam_data$matid <- NA
+
 
### set individuals without related participant to "x"
 
fam_data$V1[fam_data$V1==0] = "x"
 
  
### make family data variables more readable
+
########################################
patid <- fam_data$patid
+
###---- Derive GSCAN phenotypes -----###
matid <- fam_data$matid
+
########################################
fam_id <- fam_data$V1
+
 
### make phenotype covariate variables more readable
+
### DRINKER VERSUS NON-DRINKER
gen_id <- phenotypes$geneva_id
+
### ARIC variable name is "drnkr01".
race <- phenotypes$racegrp
+
### [Hannah will fill in question text]
sex <- phenotypes$gender
+
###  Response options:
age <- phenotypes$v1age01
+
###          1 = Current Drinker
height <- phenotypes$anta01
+
###          2 = Former Drinker
weight <- phenotypes$anta04
+
###          3 = Never Drinker
 +
###          4 = Unknown
 +
###
 +
### Descriptives:
 +
### table(phenotypes$drnkr01)
 +
###    1    2    3    4
 +
### 7257 2309 3153    6
 +
###
 +
###  To obtain GSCAN "DND" collapse across Former and Never Drinkers
 +
###  and make "Non-Drinkers". Current Drinkers will be made "Drinkers"
  
### put phenotype covariates in table
 
covariates <- cbind(gen_id,race,sex,age, height, weight)
 
colnames(covariates) = c("geneva_id", "race", "sex", "age", "height", "weight")
 
 
sample_number <- id_map$sampid
 
geneva_id <- id_map$gen_id
 
site <- id_map$center
 
### sample_number and geneva_id for id mapping, site source was taken from to be used as covariate
 
ids <- cbind(sample_number, geneva_id, site)
 
ids <- as.data.frame(ids)
 
colnames(ids) <- c("sample_number", "geneva_id", "site")
 
 
### merge ids with covariates
 
merged <- merge(covariates, ids, by="geneva_id", all=TRUE)
 
 
### change to non-drinker =1 , drinker =2
 
phenotypes$drnkr01 <- as.numeric(as.character(phenotypes$drnkr01))
 
phenotypes$drnkr01[phenotypes$drnkr01 == 1 ] = 6
 
phenotypes$drnkr01[phenotypes$drnkr01 == 2 ] = 1
 
phenotypes$drnkr01[phenotypes$drnkr01 == 3 ] = 1
 
phenotypes$drnkr01[phenotypes$drnkr01 == 6 ] = 2
 
phenotypes$drnkr01[phenotypes$drnkr01 == 4 ] = "x"
 
 
dnd <- phenotypes$drnkr01
 
dnd <- phenotypes$drnkr01
+
dnd[dnd == 1] <- "Current Drinker"
### Age 1st regularly smoked cigarettes (visit 1)
+
dnd[dnd == 2 | dnd == 3] <- 1
phenotypes$hom29 <- as.character(phenotypes$hom29)
+
dnd[dnd == "Current Drinker"] <- 2
phenotypes$hom29[phenotypes$hom29 == "A" ] = "x"
+
dnd[dnd == 4 | is.na(dnd)] <- "x"
phenotypes$hom29 <- as.numeric(phenotypes$hom29)
+
 
 +
### AGE OF INITIATION OF SMOKING
 +
###
 +
### ARIC variable name is "hom29".
 +
###    "How old were you when you first started regular cigarette smoking?"
 +
###    Response option is an integer value.
 +
###
 +
### Descriptives:
 +
###
 +
### > table(phenotypes$hom29)
 +
###    0    1    4    5    6    7    8    9  10  11  12  13  14  15  16  17
 +
###    19    1    2  10  11  15  22  32  65  44  154  187  302  659  941  715
 +
###    18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33
 +
###  1219  567  703  447  275  129  88  247  56  45  59  22  100    8  34    8
 +
###    34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49
 +
###    17  51    9    9  10    8  35    3  11    5    5  16    3    4    2    3
 +
###    50  51  52  55  57  59  60  62
 +
###    7    1    2    1    1    2    2    1
 +
###
 +
### > summary(phenotypes$hom29)
 +
###  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
 +
###  0.00  16.00  18.00  18.77  20.00  62.00    5377
 +
###
 +
ai <- phenotypes$hom29
 
### remove ages older than 35 and younger than 10
 
### remove ages older than 35 and younger than 10
phenotypes$hom29[(phenotypes$hom29 > 35) | (phenotypes$hom29 < 10)] = "x"
+
ai[ai > 35 | ai < 10 | is.na(ai)] <- "x"
ai <- phenotypes$hom29
+
 
  
### Overall number of cigarettes per day (visit 1)
+
### CIGARETTES PER DAY
### put cpd into bins from analysis plan
+
### ARIC variable name is "hom35"
phenotypes$hom35 <- as.character(phenotypes$hom35)
+
###    "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
phenotypes$hom35[phenotypes$hom35 == "A" ] = "x"
+
###    Response option is integer, or "0" for <1 cigarette per day
phenotypes$hom35[(phenotypes$hom35 >="61") ] = "x"
+
###
phenotypes$hom35 <- as.integer(phenotypes$hom35)
+
### > table(phenotypes$hom35)
phenotypes$hom35[(phenotypes$hom35 <= 5) & (phenotypes$hom35 >= 1)] = 1
+
###    0    1    2    3    4    5    6    7    8    9  10  11  12  13  14  15
phenotypes$hom35[(phenotypes$hom35 <= 15) & (phenotypes$hom35 >= 6)] = 2
+
###    46  54  118  187  133  254  146  84  89  17  990  23  106  30  12  520
phenotypes$hom35[(phenotypes$hom35 <= 25) & (phenotypes$hom35 >= 16)] = 3
+
###    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
phenotypes$hom35[(phenotypes$hom35 <= 35) & (phenotypes$hom35 >= 26)] = 4
+
###    31  30  62    5 2559    1    5  10    5 193    9    3    7    6  771    3
phenotypes$hom35[(phenotypes$hom35 >= 36) & (phenotypes$hom35 <=60)] = 5
+
###    32  33  34  35  36  37  38  40  42  43  45  50  51  54  55  58
 +
###    1    1    1  44    3    1    1  572    1    3  14  72    1    1   3    1
 +
###    60  65  70  75  80  86  90  99
 +
###  100    1    4    2   10    1    1    3
 +
###
 +
### > summary(phenotypes$hom35)
 +
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
 +
###    0.00  10.00  20.00  19.67  24.00  99.00    5420
 
cpd <- phenotypes$hom35
 
cpd <- phenotypes$hom35
 +
cpd[cpd <=  5 & cpd >=  1] <- 1
 +
cpd[cpd <= 15 & cpd >=  6] <- 2
 +
cpd[cpd <= 25 & cpd >= 16] <- 3
 +
cpd[cpd <= 35 & cpd >= 26] <- 4
 +
cpd[cpd >= 36 & cpd <= 60] <- 5
 +
cpd[cpd > 60 | is.na(cpd)] <- "x"
 +
 +
 +
### DRINKS PER WEEK
 +
### ARIC variable name is "hom35"
 +
###    "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
 +
###    Response option is integer, or "0" for <1 cigarette per day
  
phenotypes$dtia96 <- as.integer(as.character(phenotypes$dtia96))
+
wine <- phenotypes$dtia96 # 1 drink = 4oz
phenotypes$dtia97 <- as.integer(as.character(phenotypes$dtia97))
+
beer <- phenotypes$dtia97 # 1 drink = 12oz
phenotypes$dtia98 <- as.integer(as.character(phenotypes$dtia98))
+
spirits <- phenotypes$dtia98 # 1 drink = 1.5oz
  
 
### divide wine by 5 to normalize to standard drink
 
### divide wine by 5 to normalize to standard drink
dpw <- (phenotypes$dtia96)/5 + (phenotypes$dtia97) + (phenotypes$dtia98)  
+
dpw <- log((wine*4/5 + beer + spirits) + 1) ## Combine all alcohol types, left-anchor at 1, and log
dpw <- as.data.frame(dpw)
+
dpw[is.na(dpw)] <- "x"
dpw$dpw <- as.numeric(as.character(dpw$dpw))
+
 
dpw$dpw<-dpw[,1]+1
 
dpw <- log(dpw)
 
### Number of drinks per week consumed, wine+beer+liquor
 
  
phenotypes$evrsmk01 <- as.character(phenotypes$evrsmk01)
+
### SMOKING INITIATION
phenotypes$evrsmk01[phenotypes$evrsmk01 == "T"] = "x"
+
### ARIC variable name is "evrsmk01"
phenotypes$evrsmk01 <- as.integer(as.character(phenotypes$evrsmk01))
+
### [hannah fill in details]
phenotypes$evrsmk01[phenotypes$evrsmk01 == 1] = 2
+
si <- phenotypes$evrsmk01
phenotypes$evrsmk01[phenotypes$evrsmk01 == 0] = 1
+
si[si == 1] <- 2
si2 <- phenotypes$evrsmk01
+
si[si == 0] <- 1
si2 <- as.data.frame(si2)
+
si[is.na(si)] <- "x"
  
UsedData <- phenotypes[,17:18]
+
 
X <- c()
+
### SMOKING CESSATION
for(i in 1:nrow(UsedData)){
+
### ARIC variable names are "cursmk01" and "forsmk01"
    Delin <- (UsedData[i,])
+
current.smoker <- subset(phenotypes, select=c("cursmk01"))
     if(is.na(Delin[,1]) | is.na(Delin[,2])){
+
former.smoker <- subset(phenotypes, select=c("forsmk01"))
         X <- append(X,"x")
+
N <- nrow(phenotypes)
     }  
+
sc <- rep(NA, N)
     else if (Delin[,1]  == 0 & Delin[,2] == 0){
+
for(i in 1:N){
         X <- append(X,"x")
+
     if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
 +
         sc[i] <- NA
 +
     }
 +
     else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){
 +
         sc[i] <- NA
 
     }
 
     }
     else if (Delin[,1]  == 0 & Delin[,2] == 1){
+
     else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){
         X <- append(X,1)
+
         sc[i] <- 1 ### former smokers are coded as 1
 
     }
 
     }
     else if (Delin[,1]  == 1 & Delin[,2] == 0){
+
     else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){
         X <- append(X,2)
+
         sc[i] <- 2 ### current smokers are coded as 2
 
     }
 
     }
 
}
 
}
sc2 <- X
+
sc[is.na(sc)] <- "x"
 +
 
 +
 
 +
 
 +
### Create dataframe with our new GSCAN variables
 +
N <- nrow(phenotypes)
 +
NAs <- rep("x", N)
 +
gscan.phenotypes <- data.frame(famid = NAs,
 +
                              geneva_id = phenotypes$gen_id,
 +
                              patid = NAs,
 +
                              matid = NAs,
 +
                              sex = ifelse(phenotypes$sex == "M", 1, 2),
 +
                              cpd = cpd,
 +
                              ai = ai,
 +
                              si = si,
 +
                              sc = sc,
 +
                              dnd = dnd,
 +
                              dpw = dpw,
 +
                              age = phenotypes$age,
 +
                              age2 = phenotypes$age^2,
 +
                              height = phenotypes$height,
 +
                              weight = phenotypes$weight,
 +
                              currentformersmoker = sc)
 +
 
 +
### Merge in the SAMPID, which is used in the genotype files
 +
gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE)
 +
 
 +
### Reorder phenotype file to make pedigree file consistent with genotype IDs
 +
gscan.phenotypes <- gscan.phenotypes[c(2,16,3:15)]
 +
 
 +
### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file
 +
[ here read in PCs and merge into phenotype file (probably by the SAMPID) ]
 +
 
 +
 
 +
############# PRELIMINARY #####################
 +
 
 +
### Write to file [NOTE TO HANNAH: will have to be changed once we
 +
### have PCs and ancestry groups identified. PCs will have to be read
 +
### in like with read.table() and we'll have to subset the dataset
 +
### into European and African ancestry, and then write out one
 +
### phenotype and covariate file per ancestry group.
 +
 
 +
PCs <- read.table(xxxFile from Zhenxxx)
 +
 
 +
ancestry.IDs <- read.table(xxxFile from Zhenxxx)
 +
### EUROPEANS
 +
phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR",
 +
                        select=c("famid","SAMPID","patid","matid", "sex",
 +
                            "cpd", "ai","si", "sc", "dnd","dpw"))
 +
write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")
 +
 
 +
covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR"
 +
                        select=c("famid","SAMPID","patid","matid", "sex",
 +
                            "age", "age2", "height", "weight", "currentformersmoker",
 +
                            "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8",
 +
                            "PC9", "PC10"))
 +
write.table(covariates.EUR.ped, file="ARIC.EUR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")
 +
 
  
library(qpcR)
+
phenotypes.AFR.ped <- subset(gscan.phenotypes, ancestry == "AFR",
final_phen <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, cpd, si, sc2, ai, dpw, dnd)
+
                        select=c("famid","SAMPID","patid","matid", "sex",
colnames(final_phen) <- c("famid", "iid", "patid", "matid", "sex", "cpd", "si", "sc", "ai", "dpw", "dnd")
+
                            "cpd", "ai","si", "sc", "dnd","dpw"))
final_phens <- replace(final_phen, is.na(final_phen), "x")
+
write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")
write.table(final_phens, "/rc_scratch/hay0753/aricphen2.ped", quote = FALSE, row.names =FALSE)
 
  
final_covar <- qpcR:::cbind.na(fam_id, merged$geneva_id, patid, matid, sex, age, height, weight)
+
covariates.AFR.ped <- subset(gscan.phenotypes, ancestry == "AFR"
colnames(final_covar) <- c("famid", "iid", "patid", "matid", "sex", "age", "height", "weight")
+
                        select=c("famid","SAMPID","patid","matid", "sex",
final_covars <- replace(final_covar, is.na(final_covar), "x")
+
                            "age", "age2", "height", "weight", "currentformersmoker",
final_covars <- as.data.frame(final_covars)
+
                            "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8",
write.table(final_covars, "/rc_scratch/hay0753/ariccovar2.ped", quote = FALSE, row.names=FALSE)
+
                            "PC9", "PC10"))
 +
write.table(covariates.AFR.ped, file="ARIC.AFR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")
 
</syntaxhighlight>
 
</syntaxhighlight>
  

Revision as of 22:09, 7 September 2016

Studies

Framingham

ID Mapping

The following snppit describes how the PLINK genotype file IDs (which use "sampid") were mapped to the dbGaP_Subject_ID contained in the available phenotype files. The snippet only contains one of Joyce Ling's phenotype files for illustration only.

This script and the results it produces can be found in /work/KellerLab/GSCAN/dbGaP/Framingham/ID_mapping.r

### ID mapping file from dbGaP_Subject_ID to SAMPID 
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)
 
### Genotype files
genotype.IDs <-  read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")
 
length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954 
 
x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]
 
### One of Joyce's phenotype ID files (eventually all phenotypes for all available participants were merged in, but this is a good example.)
phenotypes <- read.table("OffC_Exam_1.txt", header=T, sep="\t")
xx <- merge(x, phenotypes, by="dbGaP_Subject_ID", all=TRUE)
xx <- xx[,c(2:ncol(xx),1)] ## Reorder so the dbGaP_Subject_ID is the last column 
write.table(xx, file="framingham_GSCAN_phenotypesCovariates.ped", quote=FALSE, sep="\t", row.names=F)

Phenotypes

(Joyce will update this section)

Genotypes

We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]


ARIC

(Hannah/Joyce to update this section following Framingham as a guide)

ID Mapping

Phenotypes

phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",
                         header=T,
                         sep="\t",
                         stringsAsFactors=F)

phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01",
                         "anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01',
                         'evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01',
                         'forsmk01'))

### rename phenotypes to be readable
names(phenotypes)[c(1,2,3,4,5,6)] <- c("gen_id", "race", "sex", "age", "height", "weight")

### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) )
id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),
                     header=T,
                     sep=",",
                     stringsAsFactors=F)[,c(2,1)]
names(id_map) <- c("SAMPID", "geneva_id")

### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy"))

### Replace 0's with "x" for rvTest preferred formatting
fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x"


########################################
###---- Derive GSCAN phenotypes -----###
########################################

### DRINKER VERSUS NON-DRINKER
### ARIC variable name is "drnkr01".
### [Hannah will fill in question text]
###   Response options:
###           1 = Current Drinker
###           2 = Former Drinker
###           3 = Never Drinker
###           4 = Unknown
###
### Descriptives: 
### table(phenotypes$drnkr01)
###    1    2    3    4 
### 7257 2309 3153    6 
###
###  To obtain GSCAN "DND" collapse across Former and Never Drinkers
###  and make "Non-Drinkers". Current Drinkers will be made "Drinkers"

dnd <- phenotypes$drnkr01
dnd[dnd == 1] <- "Current Drinker"
dnd[dnd == 2 | dnd == 3] <- 1
dnd[dnd == "Current Drinker"] <- 2
dnd[dnd == 4 | is.na(dnd)] <- "x"

### AGE OF INITIATION OF SMOKING
###
### ARIC variable name is "hom29".
###    "How old were you when you first started regular cigarette smoking?"
###    Response option is an integer value.
###
### Descriptives: 
###
### > table(phenotypes$hom29)
###     0    1    4    5    6    7    8    9   10   11   12   13   14   15   16   17
###    19    1    2   10   11   15   22   32   65   44  154  187  302  659  941  715
###    18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33
###  1219  567  703  447  275  129   88  247   56   45   59   22  100    8   34    8
###    34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49
###    17   51    9    9   10    8   35    3   11    5    5   16    3    4    2    3
###    50   51   52   55   57   59   60   62
###     7    1    2    1    1    2    2    1
###
### > summary(phenotypes$hom29)
###   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
###   0.00   16.00   18.00   18.77   20.00   62.00    5377 
###
ai <- phenotypes$hom29
### remove ages older than 35 and younger than 10
ai[ai > 35 | ai < 10 | is.na(ai)] <- "x"


### CIGARETTES PER DAY
### ARIC variable name is "hom35"
###    "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
###    Response option is integer, or "0" for <1 cigarette per day
###
### > table(phenotypes$hom35)
###     0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
###    46   54  118  187  133  254  146   84   89   17  990   23  106   30   12  520
###    16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31
###    31   30   62    5 2559    1    5   10    5  193    9    3    7    6  771    3
###    32   33   34   35   36   37   38   40   42   43   45   50   51   54   55   58
###     1    1    1   44    3    1    1  572    1    3   14   72    1    1    3    1
###    60   65   70   75   80   86   90   99
###   100    1    4    2   10    1    1    3
###
### > summary(phenotypes$hom35)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###    0.00   10.00   20.00   19.67   24.00   99.00    5420
cpd <- phenotypes$hom35
cpd[cpd <=  5 & cpd >=  1] <- 1
cpd[cpd <= 15 & cpd >=  6] <- 2
cpd[cpd <= 25 & cpd >= 16] <- 3
cpd[cpd <= 35 & cpd >= 26] <- 4
cpd[cpd >= 36 & cpd <= 60] <- 5
cpd[cpd > 60 | is.na(cpd)] <- "x"


### DRINKS PER WEEK
### ARIC variable name is "hom35"
###    "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
###    Response option is integer, or "0" for <1 cigarette per day

wine <- phenotypes$dtia96 # 1 drink = 4oz
beer <- phenotypes$dtia97 # 1 drink = 12oz
spirits <- phenotypes$dtia98 # 1 drink = 1.5oz

### divide wine by 5 to normalize to standard drink
dpw <- log((wine*4/5 + beer + spirits) + 1) ## Combine all alcohol types, left-anchor at 1, and log
dpw[is.na(dpw)] <- "x"


### SMOKING INITIATION
### ARIC variable name is "evrsmk01"
### [hannah fill in details]
si <- phenotypes$evrsmk01
si[si == 1] <- 2
si[si == 0] <- 1
si[is.na(si)] <- "x"


### SMOKING CESSATION
### ARIC variable names are "cursmk01" and "forsmk01"
current.smoker <- subset(phenotypes, select=c("cursmk01"))
former.smoker <- subset(phenotypes, select=c("forsmk01"))
N <- nrow(phenotypes)
sc <- rep(NA, N)
for(i in 1:N){
    if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
        sc[i] <- NA
    }
    else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 0){
        sc[i] <- NA
    }
    else if (current.smoker[i,1]  == 0 & former.smoker[i,1] == 1){
        sc[i] <- 1 ### former smokers are coded as 1
    }
    else if (current.smoker[i,1]  == 1 & former.smoker[i,1] == 0){
        sc[i] <- 2 ### current smokers are coded as 2
    }
}
sc[is.na(sc)] <- "x"



### Create dataframe with our new GSCAN variables
N <- nrow(phenotypes)
NAs <- rep("x", N)
gscan.phenotypes <- data.frame(famid = NAs,
                               geneva_id = phenotypes$gen_id,
                               patid = NAs,
                               matid = NAs,
                               sex = ifelse(phenotypes$sex == "M", 1, 2),
                               cpd = cpd,
                               ai = ai,
                               si = si,
                               sc = sc,
                               dnd = dnd,
                               dpw = dpw,
                               age = phenotypes$age,
                               age2 = phenotypes$age^2,
                               height = phenotypes$height,
                               weight = phenotypes$weight,
                               currentformersmoker = sc)

### Merge in the SAMPID, which is used in the genotype files
gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE)

### Reorder phenotype file to make pedigree file consistent with genotype IDs
gscan.phenotypes <- gscan.phenotypes[c(2,16,3:15)]

### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file
[ here read in PCs and merge into phenotype file (probably by the SAMPID) ]


############# PRELIMINARY #####################

### Write to file [NOTE TO HANNAH: will have to be changed once we
### have PCs and ancestry groups identified. PCs will have to be read
### in like with read.table() and we'll have to subset the dataset
### into European and African ancestry, and then write out one
### phenotype and covariate file per ancestry group.

PCs <- read.table(xxxFile from Zhenxxx)

ancestry.IDs <- read.table(xxxFile from Zhenxxx)
### EUROPEANS
phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR",
                        select=c("famid","SAMPID","patid","matid", "sex",
                            "cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")

covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR"
                        select=c("famid","SAMPID","patid","matid", "sex",
                            "age", "age2", "height", "weight", "currentformersmoker",
                            "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8",
                            "PC9", "PC10"))
write.table(covariates.EUR.ped, file="ARIC.EUR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")


phenotypes.AFR.ped <- subset(gscan.phenotypes, ancestry == "AFR",
                        select=c("famid","SAMPID","patid","matid", "sex",
                            "cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")

covariates.AFR.ped <- subset(gscan.phenotypes, ancestry == "AFR"
                        select=c("famid","SAMPID","patid","matid", "sex",
                            "age", "age2", "height", "weight", "currentformersmoker",
                            "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8",
                            "PC9", "PC10"))
write.table(covariates.AFR.ped, file="ARIC.AFR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")

Genotypes

MESA

(Hannah/Joyce to update following Framingham as a guide)

Phenotypes

Description of phenotypes can be found here: Media:MESA phenotypes - FINAL.pdf


eMERGE

(Hannah/Joyce to update following Framingham as a guide)

Phenotypes

Description of phenotypes can be found here: Media:EMERGE.pdf


Stroke

(Hannah/Joyce to update following Framingham as a guide)

Genotype Processing

Pre-Phasing QC

QC parameters that we chose: MAF > 0.01

SNP callrate > 0.95

Missingness per individual > 0.95

HWE = 0.05 / number of markers but greater than 5e-8

To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/


## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/
#!/bin/bash
#SBATCH --qos=blanca-ibg
#SBATCH --mem=40gb
perl HRC-1000G-check-bim.pl -b ARIC_b37_filtered.bim -f ARIC_b37_filtered.frq -r  1000GP_Phase3_combined.legend -g -p EUR


## Phasing using shapeit
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=48:00:00
#SBATCH -o shapeit_aric_%j.out
#SBATCH -e shapeit_aric_%j.err
#SBATCH --qos blanca-ibgc1
#SBATCH --ntasks-per-node 48
#SBATCH -J shapeit_aric
shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48

## To convert the shapeit output into vcf
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=24:00:00
#SBATCH -o shapeit_mesa_%j.out
#SBATCH -e shapeit_mesa_%j.err
#SBATCH --qos janus
#SBATCH --ntasks-per-node 12
#SBATCH -J shapeit_mesa
shapeit -convert --input-haps mesa-chr${1}.phased --output-vcf mesa-chr${1}.phased.vcf -T 12
## Imputation
#!/bin/bash
#SBATCH --mem=30gb
#SBATCH --time=72:00:00
#SBATCH -o impute_mesa_%j.out
#SBATCH -e impute_mesa_%j.err
#SBATCH --qos blanca-ibgc1
#SBATCH --ntasks-per-node 48
#SBATCH -J impute_mesa

/work/KellerLab/Zhen/bin/Minimac3/bin/Minimac3 --haps mesa-chr${1}.phased.vcf --cpus 48 --refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz --chr ${1} --noPhoneHome --format GT,DS,GP --allTypedSites --prefix mesa-chr${1}.phased.imputed