GSCAN dbGaP
UK Biobank
More information about the files used for UKBiobank are here. In brief, we used the UK10K + 1kgp3 imputed vcfs provided by UKBionank and added in dosages w/ this python script:
import gzip, argparse, re, os, datetime
from subprocess import Popen, PIPE
def add_dosage(pair):
a, b = pair
probs = b.split(b',')
dose = float(probs[1]) + (float(probs[2]) * 2)
return a + b':' + str(dose).encode('ascii') + b':' + b
def gziplines(fname):
f = Popen(['zcat', fname], stdout=PIPE)
for line in f.stdout:
yield line
parser = argparse.ArgumentParser()
parser.add_argument('inputVCF', help = 'The path to the VCF')
args = parser.parse_args()
flag = False
for line in gziplines(args.inputVCF):
if line.startswith(b'#'):
os.write(1, line.rstrip() + b'\n')
if not flag:
os.write(1, b'##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype Dosages">\n')
os.write(1, b'##Dosages added using the script add.dosages.subprocess.py at ' +
str(datetime.datetime.now()).encode('ascii') + b'\n')
flag = True
else:
elements = re.split(b'\t|:', line.rstrip())
first8 = elements[:8]
genotypes = elements[10:]
form = b'GT:DS:GP'
genotypes_split = zip(genotypes[::2], genotypes[1::2])
try:
dose_genos = [add_dosage(pair) for pair in genotypes_split]
except (ValueError, IndexError) as e:
os.write(2, "\n" + line)
os.write(2, line + "\n" + args.inputVCF + "\n\n")
raise e
os.write(1, b'\t'.join(first8) + b'\t' + form + b'\t' + b'\t'.join(dose_genos) + b'\n')
dbGaP Studies
Framingham
Phenotypes
options(stringsAsFactors=F)
### Offspring cohort
off <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000030.v7.p10.c1.ex1_1s.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")
### Generation 3 cohort
g3 <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000074.v9.p10.c1.ex3_1s.HMB-IRB-MDS.txt.gz"), header=T, skip=10, sep="\t", fill=T)
###---------------------###
### Cigarettes per Day ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable name is "A102".
### "Usual number of cigarettes smoked (now or formerly)"
### Response option is an integer value
###
### table(off$A102)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 1208 50 29 44 26 47 31 16 30 3 175 1 20 1 9 94
### 16 17 18 19 20 22 24 25 26 27 28 30 34 35 40 42
### 5 6 50 1 556 10 3 39 1 1 2 214 1 15 147 2
### 45 50 60 80 88 90
### 4 21 19 1 14 2
###
### summary(off$A102)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 0.00 7.00 12.11 20.00 90.00 8
cpd.off <- off$A102
cpd.off[cpd.off==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.off[cpd.off > 60] <- 60 ### Set those who report smoking >40 cpd to 40
cpd.off[cpd.off <= 5 & cpd.off >= 1] <- 1
cpd.off[cpd.off <= 15 & cpd.off >= 6] <- 2
cpd.off[cpd.off <= 25 & cpd.off >= 16] <- 3
cpd.off[cpd.off <= 35 & cpd.off >= 26] <- 4
cpd.off[cpd.off >= 36 & cpd.off <= 60] <- 5
cpd.off[cpd.off > 60 | is.na(cpd.off)] <- NA
### G3 cohort Exam 1
### Framingham variable name is "G3A074
### "IF EVER SMOKED CIGS REGULARLY: ON THE AVERAGE
### OF THE ENTIRE TIME YOU SMOKED, HOW MANY CIGARETTES
### DID YOU SMOKE PER DAY?"
### Response option is an integer value
###
### table(g3$G3A074)
###
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 3180 57 14 15 20 30 8 13 7 2 122 2 16 3 2 60
### 16 17 18 19 20 21 22 23 24 25 28 30 35 40 45 50
### 3 3 10 1 287 2 1 1 1 12 1 53 1 47 1 12
### 55 60 80
### 1 6 1
###
### summary(g3$G3A074)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### 0.000 0.000 0.000 3.528 0.000 80.000
cpd.g3 <- g3$G3A074
cpd.g3[cpd.g3==0] <- NA ### Set those who smoke "0" cpd to NA
cpd.g3[cpd.g3 > 60] <- 60 ### Set those who report smoking >40 cpd to 40
cpd.g3[cpd.g3 <= 5 & cpd.g3 >= 1] <- 1
cpd.g3[cpd.g3 <= 15 & cpd.g3 >= 6] <- 2
cpd.g3[cpd.g3 <= 25 & cpd.g3 >= 16] <- 3
cpd.g3[cpd.g3 <= 35 & cpd.g3 >= 26] <- 4
cpd.g3[cpd.g3 >= 36 & cpd.g3 <= 60] <- 5
cpd.g3[cpd.g3 > 60 | is.na(cpd.g3)] <- NA
###--------------------###
### Smoking initiation ###
###--------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
### "Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
### 0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
### 0 1 2
### 1203 1126 566
si.off <- off$A99
si.off[si.off==1] <- 2 ### Set current smoker "1" to smoker "2"
si.off[si.off==0] <- 1 ### Set never smoker "0" to nonsmoker "1"
### Former smoker "2" remains as a "2" as smoker
### G3 Cohort Exam 1
### Framingham variables is "G3A070"
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS
### LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A
### LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)"
### Response options:
### "0" is no, "1" is yes, "0" is unknown
###
### table(g3$G3A070)
### 0 1 20
### 2137 1629 1
si.g3 <- g3$G3A070
si.g3[si.g3==20] <- NA
si.g3[si.g3==1] <- 2
si.g3[si.g3==0] <- 1
###---------------------###
### Smoking cessation ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
### Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
### 0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
### 0 1 2
### 1203 1126 566
sc.off <- off$A99
sc.off[sc.off==2] <- 3 ### temporarily set former smoker "2" as "3"
sc.off[sc.off==1] <- 2 ### set current smoker "1" as current smoker "2"
sc.off[sc.off==3] <- 1 ### set former smoker "3" as former smoker "1"
sc.off[sc.off==0] <- NA ### code all non smokers "0" as NA
### G3 Cohort Exam 1
### Framingham variables are “G3A070” and “G3A072”
### Response option is fill-in
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF ###TOBACCO IN A LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)
###“0” is no, “1” is yes, “20” is unknown
### G3A072 - IF EVER SMOKED CIGARETTES REGULARLY: DO YOU NOW SMOKE CIGARETTES (AS OF 1 MONTH AGO)?
###“0” is no or never smoked, “1” is yes
###
### table(g3$G3A070)
### 0 1 20
### 2137 1629 1
###
### table(g3$G3A072)
### 0 1
### 3179 585
sc.g3 <- g3$G3A072
sc.g3[sc.g3==1] <- 2 ## These are our current smokers
sc.g3[sc.g3==0] <- 1
sc.g3 <- ifelse(sc.g3==1 & (g3$G3A070 == 0 | is.na(g3$G3A070)), NA, sc.g3)
###---------------------###
### Age of initiation ###
###---------------------###
### Offspring Cohort Exam 1
### Framingham variable "A100"
### Age started smoking cigarettes regularly\
### Response option is an integer value, 88 is doesn't smoke regularly
### table(off$A100)
###
### 0 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
### 6 2 2 2 8 4 24 37 61 147 266 233 330 166 164 85 42 26 13 25
### 26 27 28 29 30 31 33 34 35 37 38 39 40 44 45 46 51 88
### 2 8 4 3 7 1 1 1 3 1 2 1 3 1 1 2 1 105
###
ai.off <- off$A100
ai.off[ai.off < 10] <- NA ### Set AI less than 10 as missing\
ai.off[ai.off > 35] <- NA ### Set AI greater than 35 as missing \
### G3 Cohort Exam 1
### Framingham variable "G3A075"
### IF EVER SMOKED CIGARETTES REGULARLY: HOW OLD WERE YOU WHEN YOU FIRST STARTED REGULAR CIGARETTE SMOKING?
### Response option is an integer value, 0 is never smoked
### table(g3$G3A075)
###
### 0 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20
### 2138 1 2 2 3 5 14 52 108 159 152 277 193 256 98 107
### 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
### 51 31 19 13 31 6 4 6 2 13 2 1 1 4 1 4
### 37 38 40 42 46 50
### 2 2 1 1 1 2
###
ai.g3 <- g3$G3A075
ai.g3[ai.g3 < 10] <- NA ### Set AI less than 10 as missing
ai.g3[ai.g3 > 35] <- NA ### Set AI greater than 35 as missing
###-------------------###
### Drinks per week ###
###-------------------###
### Offspring Cohort Exam 1
### Framingham variables "A111", "A112", "A113"
### A111 - Beer (bottles, cans or glasses per week)
### A112 - Wine (glasses per week)
### A113 - Cocktails, highballs, straight drinks (# per week)
### Response options are integer values.
### table(off$A111)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 1452 524 171 131 78 49 125 34 33 4 46 68 21 21 5 13
### 20 21 24 25 26 28 30 35 36 40 42 48 49 50 70 90
### 18 7 48 5 1 6 6 2 2 4 4 6 1 2 1 1
###
### table(off$A112)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 1329 919 218 105 69 41 22 64 25 3 25 11 30 2 2 2
### 20 21 24 28 30 32 50
### 7 4 2 3 1 2 1
###
### table(off$A113)
###
### 0 1 2 3 4 5 6 7 8 9 10 12 14 15 16 18
### 861 1094 267 157 96 64 59 65 31 9 49 16 57 13 2 1
### 19 20 21 24 25 28 30 35 38 42 50 55 60 61
### 1 12 10 4 2 2 6 2 1 1 1 1 1 1
###
dpw.off <- rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T)
### The above assumes all drinks are of equal ETOH content
dpw.off[dpw.off == 0] <- NA
dpw.off <- log(dpw.off)
### G3 Cohort Exam 1\
### Framingham variables are "G3A115", "G3A116", "G3A119", "G3A120",
### "G3A123", "G3A124", "G3A127", "G3A128",
### "G3A131", "G3A132"
### G3A115 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A116 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A119 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A120 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A123 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A124 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
### G3A127 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A128 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER MONTH OVERTHE PAST YEAR
### G3A131 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER WEEK OVER THE PAST YEAR
### G3A132 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER MONTH OVER THE PAST YEAR
###
### *Participant was allowed to report alcohol consumption in either drinks per week or drinks per
### month. Therefore, to calculate total alcohol consumption, you must use both number of drinks per
### week and number of drinks per month (E.G. DRINKS PER MONTH = SUM(OF (DRINKS PER WEEK)(DRINKS PER
### MONTH/4)).
###
### Response options are integer values.
###
all <- subset(g3, select = c("G3A115", "G3A116", "G3A119", "G3A120", "G3A123", "G3A124", "G3A127", "G3A128",
"G3A131", "G3A132"))
names(all) <- c("beerpw", "beerpm", "wwinepw", "wwinepm", "rwinepw", "rwinepm", "liqpw", "liqpm", "othpw", "othpm")
w <- all[,c(1,3,5,7,9)]
m <- all[,c(2,4,6,8,10)]/4
dpw.tmp <- data.frame(beer = rowSums(cbind(w$beerpw, m$beerpm), na.rm=T),
wwine = rowSums(cbind(w$wwinepw, m$wwinepm), na.rm=T),
rwine = rowSums(cbind(w$rwinepw, m$rwinepm), na.rm=T),
liq = rowSums(cbind(w$liqpw, m$liqpm), na.rm=T),
oth = rowSums(cbind(w$othpw, m$othpm), na.rm=T))
### WHITE WINE - convert from 4oz to 5oz drink
dpw.tmp$wwine <- dpw.tmp$wwine*5/4
### RED WINE - convert from 4oz to 5oz drink
dpw.tmp$rwine <- dpw.tmp$rwine*5/4
### LIQUOR - convert from 1.25 to 1.50 drink
dpw.tmp$liq <- dpw.tmp$liq*1.5/1.25
### OTHER DRINK is an unknown. leave as-is.
dpw.g3 <- rowSums(dpw.tmp, na.rm=T)
### We have a few outliers, draw cuttoff at 70 drinks / week
dpw.g3[dpw.g3 > 70] <- NA
dpw.g3[dpw.g3 == 0] <- NA
dpw.g3 <- log(dpw.g3 + .75)
###---------------------------###
### Drinker vs. non-drinker ###
###---------------------------###
###
### Use the variables already computed for dpw above. Assume
### anyone who is drinking less than 1 drink / month is a non-drinker
x <- subset(off, select = c("A111", "A112", "A113"))
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
index[i] <- 1
} else {
index[i] <- 0
}
}
dnd.off <- ifelse(rowSums(subset(off, select = c("A111", "A112", "A113")), na.rm=T) == 0, 1, 2)
dnd.off[index==1] <- NA
x <- all
index <- rep(NA, nrow(x))
for(i in 1:nrow(x)) {
if(is.na(x[i,1]) & is.na(x[i,2]) & is.na(x[i,3])) {
index[i] <- 1
} else {
index[i] <- 0
}
}
dnd.g3 <- ifelse(rowSums(all, na.rm=T) == 0, 1, 2)
dnd.g3[index==1] <- NA
###----------------------------###
### Binge drinking in everyone ###
### (Only available for G3) ###
###----------------------------###
### G3 Cohort Exam 1
### Framingham variable is "G3A137"
### IF EVER CONSUMED ALCOHOL: WHAT WAS THE MAXIMUM NUMBER OF
### DRINKS YOU HAD IN A 24 HOUR PERIOD DURING THE PAST MONTH?
### Response option is an integer value
###
### summary(g3$G3A137)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 1.000 3.000 3.566 5.000 50.000 3
bde <- g3$G3A137
sex <- g3$G3A440 ### bde is defined differently for males & females
### sex == 1 is male, sex == 2 is female
tmp <- rep(NA, length(bde)) ### temporary vector to hold results and avoid conflicts
tmp[sex == 1 & bde < 5] <- 1
tmp[sex == 1 & bde >=5] <- 2
tmp[sex == 2 & bde < 4] <- 1
tmp[sex == 2 & bde >=4] <- 2
bde.g3 <- tmp
###-----------------------------------------###
### Binge-drinking (in lifetime drinkers) ###
### (Only available for G3) ###
###-----------------------------------------###
### G3 Cohort Exam 1
### Framingham variable is "G3A112"
### HAVE YOU EVER CONSUMED ALCOHOLIC BEVERAGES (BEER, WINE, LIQUOR/SPIRITS)? \
### 0 is no, 1 is yes
lifednd <- g3$G3A112 ## No missings, so next command has no problem
bdl.g3 <- bde.g3
bdl.g3 <- ifelse(lifednd == 0, NA, bde.g3)
##################
##################
### ###
### Covariates ###
### ###
##################
##################
### HEIGHT
### Offspring Cohort variable is “A51”
### Generation 3 Cohort variable is “G3A446”
height.off <- off$A51
height.g3 <- g3$G3A446
### WEIGHT
### Offspring Cohort variable is “A50”
### Generation 3 Cohort variable is “G3A444”
weight.off <- off$A50
weight.g3 <- g3$G3A444
###-------------###
### Age at exam ###
###-------------###
### Get BIRTHDATE
birth <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000740.v7.p10.c1.birthyr_alls.HMB-IRB-MDS.txt.gz"), header=T, sep="\t")
off.IDs <- off$dbGaP_Subject_ID
g3.IDs <- g3$dbGaP_Subject_ID
birth.off <- subset(birth, dbGaP_Subject_ID %in% off.IDs)
birth.g3 <- subset(birth, dbGaP_Subject_ID %in% g3.IDs)
### True exam dates spanned 1971 - 1975 for offspring
### True exam dates spanned 2002 - 2005 for generation 3
### A good-enough approximation is to take the middle
### year and subtract birth year to get age at exam
### Offspring Cohort
off.age <- data.frame(dbGaP_Subject_ID = birth.off$dbGaP_Subject_ID,
Age = 1973 - birth.off$birthyr)
### Framingham Cohort
g3.age <- data.frame(dbGaP_Subject_ID = birth.g3$dbGaP_Subject_ID,
Age = 2004 - birth.g3$birthyr)
age <- rbind(off.age, g3.age)
####################################
####################################
### ###
### Create Phenotype Data Frames ###
### ###
####################################
####################################
offspring <- data.frame(dbGaP_Subject_ID = off$dbGaP_Subject_ID,
shareid = off$shareid,
cpd = cpd.off,
si = si.off,
sc = sc.off,
ai = ai.off,
dpw = dpw.off,
dnd = dnd.off,
bde = rep(NA, nrow(off)),
bdl = rep(NA, nrow(off)),
height = height.off,
weight = weight.off,
cohort = rep(1, nrow(off)))
generation3 <- data.frame(dbGaP_Subject_ID = g3$dbGaP_Subject_ID,
shareid = g3$shareid,
cpd = cpd.g3,
si = si.g3,
sc = sc.g3,
ai = ai.g3,
dpw = dpw.g3,
dnd = dnd.g3,
bde = bde.g3,
bdl = bdl.g3,
height = height.g3,
weight = weight.g3,
cohort = rep(2, nrow(g3)))
tmp <- rbind(offspring, generation3)
phenotypes <- merge(tmp, age, by="dbGaP_Subject_ID")
phenotypes$age2 <- phenotypes$Age^2
##################
### ID MAPPING ###
##################
### ID mapping file from dbGaP_Subject_ID to SAMPID
ID.map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"), header=T, sep="\t", stringsAsFactors=F)
### Genotype files
genotype.IDs <- read.table("/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam", header=F)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")
length(which(genotype.IDs$SAMPID %in% ID.map$SAMPID))
## 6954
x <- merge(genotype.IDs, ID.map, by="SAMPID", all.x=T)
x <- x[,c(1:5,7)]
### There are many duplicates, which I can remove because I only want
### a single entry for every SAMPID-SUBJID
x <- x[which(!duplicated(x$dbGaP_Subject_ID)),]
almost.final <- merge(phenotypes, x, by="dbGaP_Subject_ID", all.x=T)
####################################################
### Bring in genetic PCs and ancestry categories ###
####################################################
PCs <- read.table("/work/KellerLab/Zhen/FRAMINGHAM/PCA/FRAMINGHAM_pcs_and_ancestries.txt", header=T)
names(PCs)[2] <- c("SAMPID")
final <- merge(almost.final, PCs, by="SAMPID", all.x=T)
phenotypes.ped <- subset(final, ancestry=="EUR",
select=c("famid", "SAMPID", "patid",
"matid", "sex", "cpd", "ai", "si",
"sc", "dpw", "dnd", "bde", "bdl"))
phenotypes.ped[is.na(phenotypes.ped)] <- "x"
covariates.ped <- subset(final, ancestry=="EUR",
select=c("famid", "SAMPID", "patid",
"matid", "sex", "Age", "age2", "sc",
"height", "weight", "cohort",
"PC1", "PC2", "PC3", "PC4", "PC5",
"PC6", "PC7", "PC8", "PC9", "PC10"))
covariates.ped[is.na(covariates.ped)] <- "x"
write.table(phenotypes.ped, file="Framingham.EUR.phenotypes.ped", quote=F,
col.names=T, row.names=F)
write.table(covariates.ped, file="Framingham.EUR.covariates.ped", quote=F,
col.names=T, row.names=F)
Genotypes
We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]
ARIC
(Hannah/Joyce to update this section following Framingham as a guide)
ID Mapping
Phenotypes
phenotypes <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/PhenotypeFiles/phs000090.v3.pht000114.v2.p1.c1.GENEVA_ARIC_Subject_Phenotypes.HMB-IRB.txt.gz",header=T,sep="\t",stringsAsFactors=F)
phenotypes <- subset(phenotypes, select=c("geneva_id", "racegrp", "gender","v1age01", "anta01","anta04", "drnkr01", "hom29", 'hom35', "hom32", 'cigt01','evrsmk01', 'dtia90','dtia96', 'dtia97','dtia98', 'cursmk01','forsmk01'))
### rename phenotypes to be readable
names(phenotypes)[c(1,2,3,4,5,6)] <- c("geneva_id", "race", "sex", "age", "height", "weight")
### To connect sample ids to geneva ids, take SAMPID (the ID used in the genotype fam file, and SUBJID (aka geneva_id) )
id_map <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-qc.MULTI/geno-qc/samp-subj-mapping.csv.gz"),header=T,sep=",",stringsAsFactors=F)[,c(2,1)]
names(id_map) <- c("SAMPID", "geneva_id")
phenotypes <- merge(phenotypes, id_map, by="geneva_id", all.x=TRUE)
### import genotype data to get family info
fam_data <- read.table("/work/KellerLab/GSCAN/dbGaP/ARIC/PhenoGenotypeFiles/ChildStudyConsentSet_phs000090.ARIC_RootStudy.v3.p1.c1.HMB-IRB/GenotypeFiles/phg000035.v1.ARIC_GEI.genotype-calls-matrixfmt.c1.GRU.update1/Genotypes_with_flagged_chromosomal_abnormalities_zeroed_out/ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out.fam", col.names = c("fam_id", "SAMPID", "patid", "matid", "sex", "dummy"))
### Replace 0's with "x" for rvTest preferred formatting
fam_data$patid <- fam_data$matid <- fam_data$fam_id[fam_data$fam_id == 0] <- "x"
########################################
###---- Derive GSCAN phenotypes -----###
########################################
### DRINKER VERSUS NON-DRINKER
### ARIC variable name is "drnkr01".
### Combination of "Do you presently drink alcoholic beverages?" and "Have you ever consumed alcoholic beverages?"
### Response option for both questions are "yes" or "no", which are turned into the options below.
### Response options:
### 1 = Current Drinker
### 2 = Former Drinker
### 3 = Never Drinker
### 4 = Unknown
###
### Descriptives:
### table(phenotypes$drnkr01)
### 1 2 3 4
### 7257 2309 3153 6
###
### To obtain GSCAN "DND" collapse across Former and Never Drinkers
### and make "Non-Drinkers". Current Drinkers will be made "Drinkers"
dnd <- phenotypes$drnkr01
dnd[dnd == 1] <- "Current Drinker"
dnd[dnd == 2 | dnd == 3] <- 1
dnd[dnd == "Current Drinker"] <- 2
dnd[dnd == 4 | is.na(dnd)] <- "x"
### AGE OF INITIATION OF SMOKING
###
### ARIC variable name is "hom29".
### "How old were you when you first started regular cigarette smoking?"
### Response option is an integer value.
###
### Descriptives:
###
### > table(phenotypes$hom29)
### 0 1 4 5 6 7 8 9 10 11 12 13 14 15 16 17
### 19 1 2 10 11 15 22 32 65 44 154 187 302 659 941 715
### 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
### 1219 567 703 447 275 129 88 247 56 45 59 22 100 8 34 8
### 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
### 17 51 9 9 10 8 35 3 11 5 5 16 3 4 2 3
### 50 51 52 55 57 59 60 62
### 7 1 2 1 1 2 2 1
###
### > summary(phenotypes$hom29)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 16.00 18.00 18.77 20.00 62.00 5377
###
ai <- phenotypes$hom29
### remove ages older than 35 and younger than 10
ai[ai > 35 | ai < 10 | is.na(ai)] <- "x"
### CIGARETTES PER DAY
### ARIC variable name is "hom35"
### "On the average of the entire time you smoked, how many cigarettes did you usually smoke per day?"
### Response option is integer, or "0" for <1 cigarette per day
###
### > table(phenotypes$hom35)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 46 54 118 187 133 254 146 84 89 17 990 23 106 30 12 520
### 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
### 31 30 62 5 2559 1 5 10 5 193 9 3 7 6 771 3
### 32 33 34 35 36 37 38 40 42 43 45 50 51 54 55 58
### 1 1 1 44 3 1 1 572 1 3 14 72 1 1 3 1
### 60 65 70 75 80 86 90 99
### 100 1 4 2 10 1 1 3
### > summary(phenotypes$hom35)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.00 10.00 20.00 19.67 24.00 99.00 5420
### Responses are binned in accordance with the GSCAN Analysis Plan.
cpd <- phenotypes$hom35
cpd[cpd <= 5 & cpd >= 1] <- 1
cpd[cpd <= 15 & cpd >= 6] <- 2
cpd[cpd <= 25 & cpd >= 16] <- 3
cpd[cpd <= 35 & cpd >= 26] <- 4
cpd[cpd >= 36 & cpd <= 60] <- 5
cpd[cpd > 60 | is.na(cpd)] <- "x"
### DRINKS PER WEEK
### ARIC variable names are "dtia96", "dtia97", and "dtia98"
### "dtia96" - "How many glasses of wine do you usualy have per week? (4oz. glasses; round down)."
### "dtia97" - "How many bottles of cans or beer do you usualy have per week? (12oz. bottles or cans; round down)."
### "dtia98" - "How many drinks of hard liquor do you usualy have per week? (4oz. glasses; round down)."
### Response option for all three is integer.
###
### Descriptives:
###
### >table(phenotypes$dtia96)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16
### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2
### 17 18 20 21 25 28 30 32 33 35 40
### 1 3 7 5 1 2 3 1 1 1 1
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 0.868 1.000 40.000 5478
###
### >table(phenotypes$dtia97)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28
### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40
### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6
### 42 45 48 49 50 56 60 63 70 72 80 92
### 13 2 6 1 3 2 4 1 1 2 1 1
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.609 2.000 92.000 5474
###
### >table(phenotypes$dtia98)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16
### 5226 844 461 255 147 90 75 50 27 3 34 1 15 28 9 2
### 17 18 20 21 25 28 30 32 33 35 40
### 1 3 7 5 1 2 3 1 1 1 1
###
### >summary(phenotypes$dtia96)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 0.868 1.000 40.000 5478
###
### >table(phenotypes$dtia97)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4356 674 528 312 214 107 297 93 76 10 97 3 186 4 40 28
### 16 18 19 20 21 22 23 24 25 28 30 32 33 35 36 40
### 5 28 1 36 19 1 1 89 8 8 12 2 1 10 8 6
### 42 45 48 49 50 56 60 63 70 72 80 92
### 13 2 6 1 3 2 4 1 1 2 1 1
###
### >summary(phenotypes$dtia97)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.609 2.000 92.000 5474
###
### >table(phenotypes$dtia98)
### 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
### 4387 735 551 295 239 190 148 138 70 11 151 14 57 3 103 33
### 16 17 18 20 21 24 25 26 27 28 30 32 33 34 35 36
### 9 9 7 36 30 1 10 1 1 9 6 2 1 2 4 1
### 39 40 44 45 47 48 50 51 52 54 55 56 63 64 75 77
### 1 7 1 1 1 2 5 1 1 1 1 3 1 2 1 2
### 90 99
### 1 2
###
### >summary(phenotypes$dtia98)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.000 0.000 0.000 2.227 2.000 99.000 5483
wine <- phenotypes$dtia96 # 1 drink = 4oz
beer <- phenotypes$dtia97 # 1 drink = 12oz
spirits <- phenotypes$dtia98 # 1 drink = 1.5oz
### muliply wine by 4 and divide wine by 5 to normalize to standard drink of 5 oz for wine
### Combine all alcohol types, left-anchor at 1, and log
dpw <- log((wine*4/5 + beer + spirits) + 1)
dpw[is.na(dpw)] <- "x"
### SMOKING INITIATION
### ARIC variable name is "evrsmk01"
### The variable checks answers to "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?".
### Response options are "yes" or "no".
###
### Descriptives:
###
### >table(phenotypes$evrsmk01)
### 0 1
### 5328 7434
###
### >summary(phenotypes$evrsmk01)
### Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
### 0.0000 0.0000 1.0000 0.5825 1.0000 1.0000 9
si <- phenotypes$evrsmk01
si[si == 1] <- 2
si[si == 0] <- 1
si[is.na(si)] <- "x"
### SMOKING CESSATION
### ARIC variable names are "cursmk01" and "forsmk01"
### Both varaiables take into account the questions: "Have you ever smoked cigarettes?" and "Do you now smoke cigarettes?"
### Response options are "yes" or "no".
### Smoking inititation (si) is coded as "2" for "Smoker" if "yes" to "Have you ever smoked cigarettes?"
### If a subsequent "yes" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "2" for "Current Smoker".
### If a subsequent "no" to "Do you now smoke cigarettes?", smoking cessation (sc) is coded as "1" for "Former Smoker".
### Smoking inititation (si) is coded as "1" for "Non Smoker" if "no" to "Have you ever smoked cigarettes?"
current.smoker <- subset(phenotypes, select=c("cursmk01"))
former.smoker <- subset(phenotypes, select=c("forsmk01"))
N <- nrow(phenotypes)
sc <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
sc[i] <- NA
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
sc[i] <- NA
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
sc[i] <- 1 ### former smokers are coded as 1
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
sc[i] <- 2 ### current smokers are coded as 2
}
}
sc[is.na(sc)] <- "x"
### Create dataframe with our new GSCAN variables
N <- nrow(phenotypes)
NAs <- rep("x", N)
gscan.phenotypes <- data.frame(SAMPID = phenotypes$SAMPID, famid = NAs,geneva_id = phenotypes$geneva_id,patid = NAs,matid = NAs,sex = ifelse(phenotypes$sex == "M", 1, 2),cpd = cpd,ai = ai,si = si,sc = sc,dnd = dnd,dpw = dpw,age = phenotypes$age,age2 = phenotypes$age^2,height = phenotypes$height,weight = phenotypes$weight,currentformersmoker = sc)
### Merge in the SAMPID, which is used in the genotype files
gscan.phenotypes <- merge(gscan.phenotypes, id_map, by="geneva_id", all.x=TRUE)
### Reorder phenotype file to make pedigree file consistent with genotype IDs
gscan.phenotypes <- gscan.phenotypes[c(17,2,16,3:15)]
colnames(gscan.phenotypes) [2] <- "SAMPID"
### Read in PCs and add to pedigree file, then write out to a phenotype and covariate file
### [ here read in PCs and merge into phenotype file (probably by the SAMPID) ]
pcs <- read.table("/rc_scratch/hayo0753/aric/aric_ancestry_and_pcs", head=TRUE, stringsAsFactors=F)
colnames(pcs) [1] <- "SAMPID"
pcs <- merge(pcs, gscan.phenotypes, by="SAMPID", all.x=TRUE)
############# PRELIMINARY #####################
### Write to file [NOTE TO HANNAH: will have to be changed once we
### have PCs and ancestry groups identified. PCs will have to be read
### in like with read.table() and we'll have to subset the dataset
### into European and African ancestry, and then write out one
### phenotype and covariate file per ancestry group.
### EUROPEANS
phenotypes.EUR.ped <- subset(pcs, ancestry == "EUR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.EUR.ped, file="ARIC.EUR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")
covariates.EUR.ped <- subset(pcs, ancestry == "EUR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.EUR.ped, file="ARIC.EUR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")
phenotypes.AFR.ped <- subset(pcs, ancestry == "AFR",select=c("famid","SAMPID","patid","matid", "sex","cpd", "ai","si", "sc", "dnd","dpw"))
write.table(phenotypes.AFR.ped, file="ARIC.AFR.phenotypes1.ped", quote=F, col.names=T, row.names=F, sep="\t")
covariates.AFR.ped <- subset(pcs, ancestry == "AFR", select=c("famid","SAMPID","patid","matid", "sex","age", "age2", "height", "weight", "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8","PC9", "PC10"))
write.table(covariates.AFR.ped, file="ARIC.AFR.covariates1.ped", quote=F, col.names=T, row.names=F, sep="\t")
### Must remove duplicates from all files, use UNIX
### sort ARIC.EUR.phenotypes1.ped | uniq > ARIC.EUR.phenotypes.ped
### sort ARIC.EUR.covariates.ped | uniq > ARIC.EUR.covariates.ped
### sort ARIC.AFR.phenotypes1.ped | uniq > ARIC.AFR.phenotypes.ped
### sort ARIC.AFR.covariates.ped | uniq > ARIC.AFR.covariates.ped
Genotypes
MESA
(Hannah/Joyce to update following Framingham as a guide)
Phenotypes
Description of phenotypes can be found here: Media:MESA phenotypes - FINAL.pdf
eMERGE
(Hannah/Joyce to update following Framingham as a guide)
Phenotypes
Description of phenotypes can be found here: Media:EMERGE.pdf
Stroke
Phenotypes
options(stringsAs Factors=F)
### Load in dataset ###
ninds <- read.table(gzfile("/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/PhenoGenotypeFiles/RootStudyConsentSet_phs000615.CIDR_International_StrokeGenetics.v1.p1.c3.GRU-NPU/PhenotypeFiles/phs000615.v1.pht003307.v1.p1.c3.IschemicStroke_Subject_Phenotypes.GRU-NPU.txt.gz"), header = T, sep="\t")
### The file reads into R incorrectly, so use the below code to shift column names to the correct column.
tmp <- colnames(ninds)
tmp2 <- tmp[c(2:19)]
tmp2 <- c(tmp2, "tmp")
colnames(ninds) <- tmp2
### subset the only variables needed
phenotypes <- c("smokingStatus", "age", "gender")
pheno <- ninds2[phenotypes]
###————————————————————###
### SMOKING INITIATION ###
###————————————————————###
### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ### more than 100 cigarettes in one's lifetime but no smoking within the last 30 ### days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
### CURRENT FORMER NEVER NO UNKNOWN
### 63 727 1137 2397 4 251
pheno[smokingStatus == "CURRENT" | smokingStatus == "FORMER", 4] <- 2
pheno[pheno$smokingStatus == "NEVER" | pheno$smokingStatus == “NO”, 4] <- 1
### table(pheno$V4)
###
### 1 2
### 2401 1864
colnames(pheno)[4] <- "si"
###————————————————————-###
### SMOKING CESSATION ###
###————————————————————-###
### NINDS variable is “smokingStatus”
### Variables are “NEVER”, “FORMER”, “CURRENT”, “NO”, “UNKNOWN”, and blank.
###
### ’CURRENT' defined as cigarette smoking within last 30 days, 'FORMER' defined as ### more than 100 cigarettes in one's lifetime but no smoking within the last 30 ### days; ’NEVER' defined as less than 100 cigarettes smoked in one's lifetime.
### table(pheno$smokingStatus)
###
### CURRENT FORMER NEVER NO UNKNOWN
### 63 727 1137 2397 4 251
pheno[pheno$smokingStatus == "CURRENT", 5] <- 2
pheno[pheno$smokingStatus == "FORMER", 5] <- 1
### table(pheno$V5)
###
### 1 2
### 1137 727
colnames(pheno)[5] <- "sc"
###——————————###
### GENDER ###
###——————————###
### NINDS variable is “gender”
### Variables are “F” and “M”
### table(pheno$gender)
###
### F M
### 2627 1952
pheno[gender=="F", 6] <- 2
pheno[gender=="M", 6] <- 1
colnames(pheno)[6] <- "sex"
#########################
#########################
###### FORM TABLES ######
#########################
#########################
cbind(pheno, ninds[,1:2]) -> new
unknown <- NA
### PHENOTYPE TABLE ###
phenotype <- new[,c(7,8,6,4,5)]
colnames(phenotype)[1] <- "iid"
### generate fid, matid, patid columns
phenotype$fid <- unknown
phenotype$matid <- unknown
phenotype$patid <- unkown
phenotype <- phenotype[,c(6,1,7,8,2,3,4,5)] ### re-order
phenotype <- replace(phenotype, is.na(phenotype), "x") ### generate x’s
write.table(phenotype, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_phen.ped", row.names=F, quote = F, sep="\t") ### save table
### COVARIATE TABLE ###
covariate <- new[,c(7,8,6,2)]
covariate$age2 <- covariate$age^2
colnames(covariate)[1] <- "iid"
### generate fid, matid, patid columns
covariate$fid <- unknown
covariate$matid <- unknown
covariate$patid <- unknown
covariate <- covariate[,c(6,1,7,8,2,3,4,5)] ### re-order
covariate <- replace(covariate, is.na(covariate), "x") ### generate x’s
write.table(covariate, "/work/KellerLab/GSCAN/dbGaP/CIDR_StrokeGenetics/STROKE_gscan_ANCESTRY_cov.ped", row.names=F, quote = F, sep="\t") ### save table
== BEAGESS ==
===Phenotypes===
beagess_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/PhenotypeFiles/phs000869.v1.pht004610.v1.p1.c1.BEAGESS_Subject_Phenotypes.GRU-MDS.txt.gz", header=TRUE, sep="\t", stringsAsFactors=F)
geno_data <- read.table("/work/KellerLab/GSCAN/dbGaP/Barretts/PhenoGenotypeFiles/RootStudyConsentSet_phs000869.BEAGESS.v1.p1.c1.GRU-MDS/GenotypeFiles/phg000580.v1.NCI_BEAGESS.genotype-calls-matrixfmt.c1.GRU-MDS.update/BEAGESS_dbGaP_29Jan2015.fam", header=FALSE, stringsAsFactors=F)
geno_data <- geno_data[-c(6)]
geno_data$V3[geno_data$V3 =="0"] = "x"
geno_data$V4[geno_data$V4 =="0"] = "x"
### IMPORTANT: All columns in original phenotype file are shifted one column to the the right, so the label of the column does not match the data in that column!!!
si <- beagess_data$BMI
sc <- beagess_data$BMI
age <- beagess_data$sex
beagess_data <- cbind(beagess_data, beagess_data[,6])
colnames(beagess_data)[13] <- "sc"
### SMOKING INITIATION
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
### "Cigarette smoking status."
### Response options are "-99", "-9", "0", "1" and "2".
### -99 = not consented
### -9 = Missing
### 0 = Never
### 1 = Former
### 2 = Current
###
### Descriptives:
###
### > table(beagess_data$BMI)
###
### -99 -9 0 1 2
### 494 2051 1515 2288 575
###
### > summary(beagess_data$BMI)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### -99.000 -9.000 0.000 -9.234 1.000 2.000
beagess_data$BMI[beagess_data$BMI == -99] = "x"
beagess_data$BMI[beagess_data$BMI == -9] = "x"
beagess_data$BMI[beagess_data$BMI == 1] = 2
beagess_data$BMI[beagess_data$BMI == 0] = 1
### SMOKING Cessation
###
### BEAGESS variable name is "cig_smk_status" and is listed under the column "BMI".
### "Cigarette smoking status."
### Response options are "-99", "-9", "0", "1" and "2".
### -99 = not consented
### -9 = Missing
### 0 = Never
### 1 = Former
### 2 = Current
###
### Descriptives:
###
### > table(beagess_data$BMI)
###
### -99 -9 0 1 2
### 494 2051 1515 2288 575
###
### > summary(beagess_data$BMI)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### -99.000 -9.000 0.000 -9.234 1.000 2.000
beagess_data$sc[beagess_data$sc == -99] = "x"
beagess_data$sc[beagess_data$sc == -9] = "x"
beagess_data$sc[beagess_data$sc == 0] = "x"
### AGE
###
### BEAGESS variable name is "agegpcat" and is listed under the column "sex".
### "Age in 5 year categories"
### Response options are integers 1 through 14 or -9.
### -9 = Missing
### 1 = 15-29 years of age
### 2 = 30-34 years of age
### 3 = 35-39 years of age
### 4 = 40-44 years of age
### 5 = 45-49 years of age
### 6 = 50-54 years of age
### 7 = 55-59 years of age
### 8 = 60-64 years of age
### 9 = 65-69 years of age
### 10 = 70-74 years of age
### 11 = 75-79 years of age
### 12 = 80-84 years of age
### 13 = 85-89 years of age
### 14 = 90-100 years of age
###
### Descriptives:
###
### > table(beagess_data$sex)
###
### -9 1 2 3 4 5 6 7 8 9 10 11 12 13 14
### 27 19 48 112 190 378 657 958 1134 1238 1018 794 246 87 17
###
### > summary(beagess_data$BMI)
### Min. 1st Qu. Median Mean 3rd Qu. Max.
### -9.000 7.000 8.000 8.227 10.000 14.000
beagess_data$sex[beagess_data$sex== 1] = 22
beagess_data$sex[beagess_data$sex== 2] = 32
beagess_data$sex[beagess_data$sex== 3] = 37
beagess_data$sex[beagess_data$sex== 4] = 42
beagess_data$sex[beagess_data$sex== 5] = 47
beagess_data$sex[beagess_data$sex== 6] = 52
beagess_data$sex[beagess_data$sex== 7] = 57
beagess_data$sex[beagess_data$sex== 8] = 62
beagess_data$sex[beagess_data$sex== 9] = 67
beagess_data$sex[beagess_data$sex== 10] = 72
beagess_data$sex[beagess_data$sex== 11] = 77
beagess_data$sex[beagess_data$sex== 12] = 82
beagess_data$sex[beagess_data$sex== 13] = 87
beagess_data$sex[beagess_data$sex== 14] = 95
beagess_data$sex[beagess_data$sex== -9] = "x"
beagess_data$sex <- as.numeric(beagess_data$sex)
### rename SUBJID column to dbGaP_Subject_ID in genotype file so it matches phenotype file where SUBJID is misslabelled as dbGaP_Subject_ID
colnames(geno_data) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex")
### merge geno and pheno files
phen <- merge(geno_data,beagess_data, by="dbGaP_Subject_ID", all =TRUE)
phen <- phen[,c(1:5,7,10,17)]
phen <- phen[,c(2,1,3,4,5,6,7,8)]
colnames(phen) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "age","si","sc")
phenotypes <- data.frame(famid=phen$famid, dbGaP_Subject_ID=phen$dbGaP_Subject_ID, patid=phen$patid, matid=phen$matid, sex=phen$sex, si=phen$si, sc=phen$sc, currentformersmoker=phen$sc, age=phen$age, age2=phen$age^2)
colnames(phenotypes) <- c("famid", "dbGaP_Subject_ID", "patid", "matid", "sex", "si", "sc", "currentformersmoker", "age", "age2")
pcs <- read.table("/rc_scratch/hayo0753/BEAGESS/BEAGESS_pcs_and_ancestrys.txt",head=TRUE, stringsAsFactors=F, sep=" ")
colnames(pcs) [1] <- "dbGaP_Subject_ID"
gscan.phenotypes <- merge(phenotypes, pcs, by="dbGaP_Subject_ID", all=TRUE)
gscan.phenotypes[is.na(gscan.phenotypes)] <- "x"
### EUROPEANS - entire sample is EUR
phenotypes.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "si", "sc"))
write.table(phenotypes.EUR.ped, file="BEAGESS.EUR.phenotypes.ped", quote=F, col.names=T, row.names=F, sep="\t")
covariates.EUR.ped <- subset(gscan.phenotypes, ancestry == "EUR", select=c("famid","dbGaP_Subject_ID","patid","matid", "sex", "age", "age2", "currentformersmoker", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"))
write.table(covariates.EUR.ped, file="BEAGESS.EUR.covariates.ped", quote=F, col.names=T, row.names=F, sep="\t")
Jackson Heart Study
Phenotypes
hmb <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c3.JHS_CARe_Subject_Phenotypes.HMB-IRB.txt.gz", header=T, sep="\t", stringsAsFactors=F)
npu <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/PhenotypeFiles/phs000499.v3.pht003209.v1.p1.c1.JHS_CARe_Subject_Phenotypes.HMB-IRB-NPU.txt.gz", header=T, sep="\t", stringsAsFactors=F)
phens <- rbind(hmb, npu)
hmb_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c3.HMB-IRB/JHS_PLINK_illu_GRU.fam", header=F, stringsAsFactors=F)
npu_geno <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c1.HMB-IRB-NPU/GenotypeFiles/phg000103.v1.JHS.genotype-calls-matrixfmt.c1.HMB-IRB-NPU/JHS_PLINK_illu_NCU.fam", header=F, stringsAsFactor=F)
geno <- rbind(hmb_geno, npu_geno)
jhs_map <- read.table("/work/KellerLab/GSCAN/dbGaP/JHS/PhenoGenotypeFiles/ChildStudyConsentSet_phs000499.JHS.v3.p1.c3.HMB-IRB/GenotypeFiles/phg000104.v1.JHS.sample-info.MULTI/JHS_subject_sample_map_8_10_2010.txt", header=T, sep="\t", stringsAsFactors=F)
jhs_pcs <- read.table("/rc_scratch/hayo0753/JHS_files/JHS_ancestry_and_pcs.txt", header=T, stringsAsFactors=F)
names(jhs_pcs) [2] <- "SUBJID"
names(geno) <- c("SUBJID", "iid", "patid", "matid", "sex", "phen")
filtered_geno <- geno[(geno$SUBJID %in% phens$SUBJID),]
filtered_phens <- phens[(phens$SUBJID %in% geno$SUBJID),]
mapped_geno <- merge(filtered_phens, filtered_geno, by="SUBJID", all.x=T)
mapped_geno_pcs <- merge(mapped_geno, jhs_pcs, by="SUBJID", all.x=T)
covariates <- mapped_geno_pcs[c(1,69,70:72,5,75:84)]
names(covariates) [1] <- "fid"
names(covariates) [2] <- "iid"
names(covariates) [6] <- "age"
###--------------------###
### Smoking initiation ###
###--------------------###
###
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
### current_smoker_baseline = Current smoking status at first participant visit
### former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
### 0 - No
### 1 - Yes
###
### table(si)
###
### 0 1 x
### 1206 537 9
current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline", "SUBJID"))
N <- nrow(mapped_geno_pcs)
si <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
si[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
si[i] <- "1"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
si[i] <- 2 ### former smokers are coded as 2
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
si[i] <- 2 ### current smokers are coded as 2
}
}
mapped_geno_pcs_phen <- cbind(si, mapped_geno_pcs)
###--------------------###
### Smoking Cessation ###
###--------------------###
###
### JHSCare variables are "current_smoker_baseline" and "former_smoking_baseline"
### current_smoker_baseline = Current smoking status at first participant visit
### former_smoker_baseline = Former smoking status at first participant visit
###
### Response options are
### 0 - No
### 1 - Yes
###
### table(sc)
###
### 0 1 x
### 1206 537 9
current.smoker <- subset(mapped_geno_pcs, select=c("current_smoker_baseline", "SUBJID"))
former.smoker <- subset(mapped_geno_pcs, select=c("former_smoker_baseline","SUBJID"))
N <- nrow(mapped_geno_pcs)
sc <- rep(NA, N)
for(i in 1:N){
if(is.na(current.smoker[i,1]) | is.na(former.smoker[i,1])){
sc[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 0){
sc[i] <- "x"
}
else if (current.smoker[i,1] == 0 & former.smoker[i,1] == 1){
sc[i] <- 1 ### former smokers are coded as 1
}
else if (current.smoker[i,1] == 1 & former.smoker[i,1] == 0){
sc[i] <- 2 ### current smokers are coded as 2
}
}
phenotypes <- cbind(sc, mapped_geno_pcs_phen)
phenotypes <- phenotypes[c(3,71,72:74,1:2)]
names(phenotypes)[1] <- "fid"
write.table(covariates, "AFR.JHS.covariates.ped", row.names=F, quote=F)
write.table(phenotypes, "AFR.JHS.phenotypes.ped", row.names=F, quote=F)
Genotype Processing
Pre-Phasing QC
QC parameters that we chose: MAF > 0.01
SNP callrate > 0.95
Missingness per individual > 0.95
HWE = 0.05 / number of markers but greater than 5e-8
## To update the strand builds: http://www.well.ox.ac.uk/~wrayner/strand/
## Check strands against latest 1000G: http://www.well.ox.ac.uk/~wrayner/tools/
perl HRC-1000G-check-bim.pl \
-b ARIC_b37_filtered.bim \
-f ARIC_b37_filtered.frq \
-r 1000GP_Phase3_combined.legend \
-g \
-p EUR
## Phasing using shapeit (ARIC used as an example)
shapeit -B ARIC_b37_filtered-updated-chr${1} -M /rc_scratch/meli7712/dbGAP/1000GP_Phase3/genetic_map_chr${1}_combined_b37.txt -O phased/ARIC_b37_filtered-updated-chr${1}.phased -T 48
## To convert the shapeit output into vcf
shapeit -convert --input-haps mesa-chr${1}.phased \
--output-vcf mesa-chr${1}.phased.vcf \
-T 12
## Imputation
Minimac3-omp3 --haps mesa-chr${1}.phased.vcf \
--cpus 48
--refHaps /rc_scratch/meli7712/dbGAP/references/${1}.1000g.Phase3.v5.With.Parameter.Estimates.m3vcf.gz \
--chr ${1} \
--noPhoneHome \
--format GT,DS,GP \
--allTypedSites \
--prefix mesa-chr${1}.phased.imputed