Editing GSCAN dbGaP (section)

==Framingham==


===Phenotypes===
<syntaxhighlight lang="rsplus">
### Keep character columns as character when data frames are built
options(stringsAsFactors = FALSE)

### Offspring cohort (tab-delimited, gzipped, header row on the first line)
off.file <- "/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000030.v7.p10.c1.ex1_1s.HMB-IRB-MDS.txt.gz"
off <- read.table(gzfile(off.file), header = TRUE, sep = "\t")

### Generation 3 cohort (tab-delimited, gzipped). The first 10 lines are
### skipped before the header row is read; fill = TRUE pads short rows.
g3.file <- "/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000074.v9.p10.c1.ex3_1s.HMB-IRB-MDS.txt.gz"
g3 <- read.table(gzfile(g3.file), header = TRUE, skip = 10, sep = "\t",
                 fill = TRUE)

###---------------------###
### Cigarettes per Day  ###
###---------------------###

### Recode raw cigarettes per day into the five CPD bins. The same rule is
### used for both cohorts:
###     1 = 1-5,  2 = 6-15,  3 = 16-25,  4 = 26-35,  5 = 36 or more
###
### cpd: numeric vector of raw cigarettes per day (NA allowed).
### Returns a numeric vector of the same length holding the bin code.
###   - 0 and missing values come back as NA
###   - reports above 60 are capped at 60, i.e. they land in the top bin
###   - any value that falls in no bin (negative or fractional, e.g. 5.5)
###     comes back as NA, so raw counts can never be mixed with bin codes
recode.cpd <- function(cpd) {
    cpd[cpd == 0] <- NA     ### Set those who smoke "0" cpd to NA
    cpd <- pmin(cpd, 60)    ### Set those who report smoking >60 cpd to 60

    bin.lower <- c(1,  6, 16, 26, 36)
    bin.upper <- c(5, 15, 25, 35, 60)

    ### Bins are written to a separate vector so that a freshly assigned
    ### bin code can never be re-binned by a later step.
    binned <- rep(NA_real_, length(cpd))
    for (k in seq_along(bin.lower)) {
        ### which() drops the NA comparisons produced by missing values
        binned[which(cpd >= bin.lower[k] & cpd <= bin.upper[k])] <- k
    }
    binned
}

### Offspring Cohort Exam 1
### Framingham variable name is "A102".
###   "Usual number of cigarettes smoked (now or formerly)"
### Response option is an integer value
###
### table(off$A102)
###     0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
###  1208   50   29   44   26   47   31   16   30    3  175    1   20    1    9   94
###    16   17   18   19   20   22   24   25   26   27   28   30   34   35   40   42
###     5    6   50    1  556   10    3   39    1    1    2  214    1   15  147    2
###    45   50   60   80   88   90
###     4   21   19    1   14    2
###
### summary(off$A102)
###  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###  0.00    0.00    7.00   12.11   20.00   90.00       8

cpd.off <- recode.cpd(off$A102)

### G3 cohort Exam 1
### Framingham variable name is "G3A074"
###   "IF EVER SMOKED CIGS REGULARLY: ON THE AVERAGE
###    OF THE ENTIRE TIME YOU SMOKED, HOW MANY CIGARETTES
###    DID YOU SMOKE PER DAY?"
### Response option is an integer value
###
### table(g3$G3A074)
###
###    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
### 3180   57   14   15   20   30    8   13    7    2  122    2   16    3    2   60
###   16   17   18   19   20   21   22   23   24   25   28   30   35   40   45   50
###    3    3   10    1  287    2    1    1    1   12    1   53    1   47    1   12
###   55   60   80
###    1    6    1
###
### summary(g3$G3A074)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
###   0.000   0.000   0.000   3.528   0.000  80.000

cpd.g3 <- recode.cpd(g3$G3A074)

###--------------------###
### Smoking initiation ###
###--------------------###
### Offspring Cohort Exam 1
### Framingham variable is "A99"
###     "Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
###     0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
###    0    1    2
### 1203 1126  566
###
### Output coding: 1 = never smoker, 2 = ever smoker (current or former)

si.off <- off$A99
### Current smokers ("1") join the former smokers ("2") as ever smokers.
### This must run before the next step, which reuses code 1 for never smokers.
si.off <- replace(si.off, which(si.off == 1), 2)
### Never smokers ("0") become "1"
si.off <- replace(si.off, which(si.off == 0), 1)


### G3 Cohort Exam 1
### Framingham variable is "G3A070"
###     G3A070 - "HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS
###              LESS THAN 20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A
###              LIFETIME OR LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)"
### Response options:
###              "0" is no, "1" is yes, "20" is unknown
###
### table(g3$G3A070)
###    0    1   20
### 2137 1629    1
###
### Output coding: 1 = never smoker, 2 = ever smoker, NA = unknown

si.g3 <- g3$G3A070
### The "unknown" code carries no information
si.g3 <- replace(si.g3, which(si.g3 == 20), NA)
### Ever smokers first ("1" -> 2), then never smokers ("0" -> 1); the order
### matters because code 1 is reused.
si.g3 <- replace(si.g3, which(si.g3 == 1), 2)
si.g3 <- replace(si.g3, which(si.g3 == 0), 1)


###---------------------###
###  Smoking cessation  ###
###---------------------###

### Offspring Cohort Exam 1
### Framingham variable is "A99"
###     "Smokes cigarettes: Yes(now)/No/Former"
### Response option is fill-in
###     0 is no, 1 is yes (now), 2 is former
###
### table(off$A99)
###
###    0    1    2
### 1203 1126  566
###
### Output coding: 1 = former smoker, 2 = current smoker, NA = never smoker

smoke.status.off <- off$A99
sc.off <- smoke.status.off
### Codes 1 and 2 swap places. Every mask is taken from the untouched raw
### vector, so no temporary code is needed to keep the two apart.
### (A raw code of 3, which the documented coding does not include, is
### treated like a former smoker.)
sc.off[which(smoke.status.off %in% c(2, 3))] <- 1 ### former smokers
sc.off[which(smoke.status.off == 1)] <- 2         ### current smokers
sc.off[which(smoke.status.off == 0)] <- NA        ### non smokers are excluded


### G3 Cohort Exam 1
### Framingham variables are "G3A070" and "G3A072"
### Response option is fill-in
### G3A070 - HAVE YOU EVER SMOKED CIGARETTES REGULARLY? (NO MEANS LESS THAN
###          20 PACKS OF CIGARETTES OR 12 OZ OF TOBACCO IN A LIFETIME OR
###          LESS THAN 1 CIGARETTE A DAY FOR A YEAR.)
###          "0" is no, "1" is yes, "20" is unknown
### G3A072 - IF EVER SMOKED CIGARETTES REGULARLY: DO YOU NOW SMOKE
###          CIGARETTES (AS OF 1 MONTH AGO)?
###          "0" is no or never smoked, "1" is yes
###
### table(g3$G3A070)
###    0    1   20
### 2137 1629    1
###
### table(g3$G3A072)
###    0    1
### 3179  585
###
### Output coding: 1 = former smoker, 2 = current smoker, NA = everyone else
sc.g3 <- g3$G3A072
sc.g3[sc.g3 == 1] <- 2 ## These are our current smokers
sc.g3[sc.g3 == 0] <- 1 ## Not smoking now: former smoker OR never smoker
### G3A072 == 0 mixes former and never smokers, so a "1" is kept only for
### participants who positively reported ever smoking regularly
### (G3A070 == 1). Never smokers (0), the "unknown" code (20) and missing
### answers are all set to NA.
ever.regular <- !is.na(g3$G3A070) & g3$G3A070 == 1
sc.g3 <- ifelse(sc.g3 == 1 & !ever.regular, NA, sc.g3)


###---------------------###
###  Age of initiation  ###
###---------------------###

### Offspring Cohort Exam 1
### Framingham variable "A100"
###     Age started smoking cigarettes regularly
### Response option is an integer value, 88 is doesn't smoke regularly
### table(off$A100)
###
###   0   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25
###   6   2   2   2   8   4  24  37  61 147 266 233 330 166 164  85  42  26  13  25
###   26  27  28  29  30  31  33  34  35  37  38  39  40  44  45  46  51  88
###   2   8   4   3   7   1   1   1   3   1   2   1   3   1   1   2   1 105
###

ai.off <- off$A100
### Keep ages of initiation from 10 to 35 inclusive; anything younger or
### older is set to missing. This also removes the 0 entries and the 88
### "doesn't smoke regularly" code.
ai.off <- replace(ai.off, which(ai.off < 10 | ai.off > 35), NA)


### G3 Cohort Exam 1
### Framingham variable "G3A075"
###     IF EVER SMOKED CIGARETTES REGULARLY: HOW OLD WERE YOU WHEN YOU
###     FIRST STARTED REGULAR CIGARETTE SMOKING?
### Response option is an integer value, 0 is never smoked
###  table(g3$G3A075)
###
###    0    5    7    8    9   10   11   12   13   14   15   16   17   18   19   20
### 2138    1    2    2    3    5   14   52  108  159  152  277  193  256   98  107
###   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36
###   51   31   19   13   31    6    4    6    2   13    2    1    1    4    1    4
###   37   38   40   42   46   50
###    2    2    1    1    1    2
###

ai.g3 <- g3$G3A075
### Keep ages of initiation from 10 to 35 inclusive; anything younger or
### older is set to missing. This also removes the 0 "never smoked" code.
ai.g3 <- replace(ai.g3, which(ai.g3 < 10 | ai.g3 > 35), NA)


###-------------------###
###  Drinks per week  ###
###-------------------###

### Offspring Cohort Exam 1
### Framingham variables "A111", "A112", "A113"
###     A111 - Beer (bottles, cans or glasses per week)
###     A112 - Wine (glasses per week)
###     A113 - Cocktails, highballs, straight drinks (# per week)
### Response options are integer values.
### table(off$A111)
###
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18
### 1452  524  171  131   78   49  125   34   33    4   46   68   21   21    5   13
###   20   21   24   25   26   28   30   35   36   40   42   48   49   50   70   90
###   18    7   48    5    1    6    6    2    2    4    4    6    1    2    1    1
###
### table(off$A112)
###
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18
### 1329  919  218  105   69   41   22   64   25    3   25   11   30    2    2    2
###   20   21   24   28   30   32   50
###    7    4    2    3    1    2    1
###
### table(off$A113)
###
###    0    1    2    3    4    5    6    7    8    9   10   12   14   15   16   18
###  861 1094  267  157   96   64   59   65   31    9   49   16   57   13    2    1
###   19   20   21   24   25   28   30   35   38   42   50   55   60   61
###    1   12   10    4    2    2    6    2    1    1    1    1    1    1
###

### Total weekly drinks = beer + wine + cocktails. Missing answers count as
### 0 in the sum. This assumes all drinks are of equal ETOH content.
dpw.off <- rowSums(off[, c("A111", "A112", "A113")], na.rm = TRUE)
### A total of 0 (non-drinkers, and rows where all three answers are
### missing) is excluded from the drinks-per-week phenotype.
is.na(dpw.off) <- which(dpw.off == 0)
### NOTE(review): the G3 drinks-per-week code in this file also drops totals
### above 70 and adds .75 before taking the log; neither is done here.
### Confirm the difference between cohorts is intended.
dpw.off <- log(dpw.off)


### G3 Cohort Exam 1
### Framingham variables are "G3A115", "G3A116", "G3A119", "G3A120",
###                          "G3A123", "G3A124", "G3A127", "G3A128",
###                          "G3A131", "G3A132"
###    G3A115 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A116 - BEER: NUMBER OF BEER (12 OZ. BOTTLE, GLASS, CAN) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A119 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A120 - WHITE WINE: NUMBER OF WHITE WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A123 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A124 - RED WINE: NUMBER OF RED WINE (4 OZ GLASS) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A127 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A128 - LIQUOR/SPIRITS: AVERAGE NUMBER OF LIQUOR/SPIRITS (1 1/4 OZ JIGGER) YOU DRINK PER MONTH OVER THE PAST YEAR
###    G3A131 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER WEEK OVER THE PAST YEAR
###    G3A132 - OTHER BEVERAGE: AVERAGE NUMBER OF OTHER BEVERAGE YOU DRINK PER MONTH OVER THE PAST YEAR
###
###  *Participant was allowed to report alcohol consumption in either drinks per week or drinks per
###   month. Therefore, to calculate total alcohol consumption, you must use both number of drinks per
###   week and number of drinks per month (E.G. DRINKS PER MONTH = SUM(OF (DRINKS PER WEEK)(DRINKS PER
###   MONTH/4)).
###
### Response options are integer values.
###
### NOTE: the data frame is named `all`, which masks the name of base::all
### as a variable (calls such as all(...) still reach the function). The
### name is kept because the drinker vs. non-drinker code reuses it.
all <- subset(g3, select = c("G3A115", "G3A116", "G3A119", "G3A120",
                             "G3A123", "G3A124", "G3A127", "G3A128",
                             "G3A131", "G3A132"))

names(all) <- c("beerpw", "beerpm", "wwinepw", "wwinepm", "rwinepw",
                "rwinepm", "liqpw", "liqpm", "othpw", "othpm")
w <- all[, c(1, 3, 5, 7, 9)]        ### per-week answers
m <- all[, c(2, 4, 6, 8, 10)] / 4   ### per-month answers as per-week (4 weeks / month)

### Weekly total per beverage = per-week answer + (per-month answer / 4).
### A missing answer counts as 0.
dpw.tmp <- data.frame(beer  = rowSums(cbind(w$beerpw,  m$beerpm),  na.rm = TRUE),
                      wwine = rowSums(cbind(w$wwinepw, m$wwinepm), na.rm = TRUE),
                      rwine = rowSums(cbind(w$rwinepw, m$rwinepm), na.rm = TRUE),
                      liq   = rowSums(cbind(w$liqpw,   m$liqpm),   na.rm = TRUE),
                      oth   = rowSums(cbind(w$othpw,   m$othpm),   na.rm = TRUE))

### Unit conversion. The questionnaire units are SMALLER than the drink
### sizes used here, so a count of questionnaire units is scaled by
### (questionnaire size / drink size): e.g. five 4oz glasses of wine are
### 20oz, which is four 5oz drinks.
### WHITE WINE - convert from 4oz to 5oz drink
dpw.tmp$wwine <- dpw.tmp$wwine * 4 / 5
### RED WINE - convert from 4oz to 5oz drink
dpw.tmp$rwine <- dpw.tmp$rwine * 4 / 5
### LIQUOR - convert from 1.25 to 1.50 drink
dpw.tmp$liq <- dpw.tmp$liq * 1.25 / 1.5
### OTHER DRINK is an unknown. leave as-is.
dpw.g3 <- rowSums(dpw.tmp, na.rm = TRUE)

### We have a few outliers, draw cutoff at 70 drinks / week
dpw.g3[dpw.g3 > 70] <- NA
### A total of 0 (non-drinkers and rows with no answers) is excluded
dpw.g3[dpw.g3 == 0] <- NA
dpw.g3 <- log(dpw.g3 + .75)

###---------------------------###
###  Drinker vs. non-drinker  ###
###---------------------------###
###
### Uses the same variables as drinks per week above. Assume
### anyone who is drinking less than 1 drink / month is a non-drinker.
###
### Output coding: 1 = non-drinker (weekly total of 0), 2 = drinker,
###                NA = none of the beverage questions was answered

### Offspring cohort: beer, wine and cocktails per week
drinks.off <- off[, c("A111", "A112", "A113")]

### TRUE for participants with no answer to any of the three questions.
### Computed on whole columns at once, so it also works for zero rows.
all.missing.off <- rowSums(!is.na(drinks.off)) == 0

dnd.off <- ifelse(rowSums(drinks.off, na.rm = TRUE) == 0, 1, 2)
### With na.rm = TRUE an all-missing row sums to 0 and would look like a
### non-drinker; it carries no information, so set it to NA.
dnd.off[all.missing.off] <- NA


### Generation 3 cohort. `all` holds the ten G3 beverage columns (weekly and
### monthly answers for beer, white wine, red wine, liquor and other).
###
### Output coding: 1 = non-drinker (total of 0), 2 = drinker,
###                NA = none of the beverage questions was answered

### TRUE only when EVERY one of the beverage columns is missing. Checking
### just the first three columns would wrongly blank out participants who
### skipped beer and white wine but did report red wine, liquor or other
### beverages.
all.missing.g3 <- rowSums(!is.na(all)) == 0

dnd.g3 <- ifelse(rowSums(all, na.rm = TRUE) == 0, 1, 2)
### With na.rm = TRUE an all-missing row sums to 0 and would look like a
### non-drinker; it carries no information, so set it to NA.
dnd.g3[all.missing.g3] <- NA



###----------------------------###
### Binge drinking in everyone ###
### (Only available for G3)    ###
###----------------------------###
### G3 Cohort Exam 1
### Framingham variable is "G3A137"
###     IF EVER CONSUMED ALCOHOL: WHAT WAS THE MAXIMUM NUMBER OF
###        DRINKS YOU HAD IN A 24 HOUR PERIOD DURING THE PAST MONTH?
### Response option is an integer value
###
### summary(g3$G3A137)
###    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
###   0.000   1.000   3.000   3.566   5.000  50.000       3
###
### Output coding: 1 = no binge, 2 = binge, NA = max drinks or sex unknown

bde <- g3$G3A137
sex <- g3$G3A440 ### bde is defined differently for males & females
### sex == 1 is male, sex == 2 is female

### Sex-specific number of drinks that counts as a binge: 5 or more for
### males, 4 or more for females. Any other sex value gives NA.
binge.cutoff <- ifelse(sex == 1, 5, ifelse(sex == 2, 4, NA))

### A missing cutoff or a missing drink count propagates to NA
bde.g3 <- ifelse(bde >= binge.cutoff, 2, 1)

###-----------------------------------------###
###  Binge-drinking (in lifetime drinkers)  ###
###  (Only available for G3)                ###
###-----------------------------------------###

### G3 Cohort Exam 1
### Framingham variable is "G3A112"
###     HAVE YOU EVER CONSUMED ALCOHOLIC BEVERAGES (BEER, WINE, LIQUOR/SPIRITS)?
### 0 is no, 1 is yes

lifednd <- g3$G3A112 ## Reported to have no missing values
### Same coding as bde.g3, but lifetime abstainers (G3A112 == 0) are set to
### NA so that only lifetime drinkers remain. A missing G3A112, should one
### ever occur, also gives NA.
bdl.g3 <- bde.g3
bdl.g3[is.na(lifednd) | lifednd == 0] <- NA




##################
##################
###            ###
### Covariates ###
###            ###
##################
##################


### Height and weight are copied from the phenotype files unchanged.
###
###            Offspring Cohort   Generation 3 Cohort
###   HEIGHT   "A51"              "G3A446"
###   WEIGHT   "A50"              "G3A444"

### Offspring Cohort
height.off <- off[["A51"]]
weight.off <- off[["A50"]]

### Generation 3 Cohort
height.g3 <- g3[["G3A446"]]
weight.g3 <- g3[["G3A444"]]


###-------------###
### Age at exam ###
###-------------###
### Get BIRTHDATE (birth year per subject, all cohorts in one file)
birth.file <- "/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht000740.v7.p10.c1.birthyr_alls.HMB-IRB-MDS.txt.gz"
birth <- read.table(gzfile(birth.file), header = TRUE, sep = "\t")

off.IDs <- off$dbGaP_Subject_ID
g3.IDs <- g3$dbGaP_Subject_ID

### Birth records of the subjects present in each cohort's phenotype file
birth.off <- birth[birth$dbGaP_Subject_ID %in% off.IDs, , drop = FALSE]
birth.g3 <- birth[birth$dbGaP_Subject_ID %in% g3.IDs, , drop = FALSE]

### True exam dates spanned 1971 - 1975 for offspring
### True exam dates spanned 2002 - 2005 for generation 3
### A good-enough approximation is to take the middle
### year and subtract birth year to get age at exam
exam.year.off <- 1973
exam.year.g3 <- 2004

### Offspring Cohort
off.age <- data.frame(dbGaP_Subject_ID = birth.off$dbGaP_Subject_ID,
                      Age = exam.year.off - birth.off$birthyr)
### Generation 3 Cohort
g3.age <- data.frame(dbGaP_Subject_ID = birth.g3$dbGaP_Subject_ID,
                     Age = exam.year.g3 - birth.g3$birthyr)

### One table of approximate ages, offspring rows first
age <- rbind(off.age, g3.age)
####################################
####################################
###                              ###
### Create Phenotype Data Frames ###
###                              ###
####################################
####################################

### One row per participant, with identical columns in both cohorts so the
### two frames can be stacked. cohort: 1 = Offspring, 2 = Generation 3.

n.off <- nrow(off)
n.g3 <- nrow(g3)

### Binge drinking (bde, bdl) is only available for G3, so those two
### columns are all NA for the offspring.
offspring <- data.frame(
    dbGaP_Subject_ID = off$dbGaP_Subject_ID,
    shareid = off$shareid,
    cpd = cpd.off,
    si = si.off,
    sc = sc.off,
    ai = ai.off,
    dpw = dpw.off,
    dnd = dnd.off,
    bde = rep(NA, n.off),
    bdl = rep(NA, n.off),
    height = height.off,
    weight = weight.off,
    cohort = rep(1, n.off)
)

generation3 <- data.frame(
    dbGaP_Subject_ID = g3$dbGaP_Subject_ID,
    shareid = g3$shareid,
    cpd = cpd.g3,
    si = si.g3,
    sc = sc.g3,
    ai = ai.g3,
    dpw = dpw.g3,
    dnd = dnd.g3,
    bde = bde.g3,
    bdl = bdl.g3,
    height = height.g3,
    weight = weight.g3,
    cohort = rep(2, n.g3)
)

### Stack the cohorts, offspring rows first
both.cohorts <- rbind(offspring, generation3)

### Attach age at exam. merge() keeps only the subjects found in both
### tables, so anyone without a birth-year record is dropped here.
phenotypes <- merge(both.cohorts, age, by = "dbGaP_Subject_ID")
phenotypes[["age2"]] <- phenotypes[["Age"]]^2



##################
### ID MAPPING ###
##################
### ID mapping file from dbGaP_Subject_ID to SAMPID
ID.map.file <- "/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v28.p10.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v28.pht001415.v16.p10.Framingham_Sample.MULTI.txt.gz"
ID.map <- read.table(gzfile(ID.map.file), header = TRUE, sep = "\t",
                     stringsAsFactors = FALSE)

### Genotype files (PLINK .fam: one row per genotyped sample, no header)
fam.file <- "/work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.fam"
genotype.IDs <- read.table(fam.file, header = FALSE)
names(genotype.IDs) <- c("famid", "SAMPID", "patid", "matid", "sex", "phenotype")

### Sanity check: number of genotyped samples that appear in the ID map
sum(genotype.IDs$SAMPID %in% ID.map$SAMPID)
## 6954

### Every genotyped sample is kept (all.x), with its ID-map columns attached
x <- merge(genotype.IDs, ID.map, by = "SAMPID", all.x = TRUE)
### Columns 1-5 of the merge are SAMPID, famid, patid, matid, sex; column 6
### (the .fam phenotype) is dropped. Column 7 is the first ID-map column
### other than SAMPID - presumably dbGaP_Subject_ID, which the next step
### needs. TODO confirm against the ID-map header.
x <- x[, c(1:5, 7)]

### There are many duplicates, which I can remove because I only want
### a single entry for every SAMPID-SUBJID. The first row seen for each
### dbGaP_Subject_ID is the one that is kept.
x <- x[!duplicated(x$dbGaP_Subject_ID), ]


### Add the genotype IDs to the phenotypes; participants without a
### genotyped sample are kept (all.x) with NA in the genotype columns.
almost.final <- merge(phenotypes, x, by = "dbGaP_Subject_ID", all.x = TRUE)

####################################################
### Bring in genetic PCs and ancestry categories ###
####################################################

PCs <- read.table("/work/KellerLab/Zhen/FRAMINGHAM/PCA/FRAMINGHAM_pcs_and_ancestries.txt",
                  header = TRUE)
### The second column is renamed so it can serve as the merge key
names(PCs)[2] <- "SAMPID"

### Every phenotype row is kept (all.x); PCs are NA where no match exists
final <- merge(almost.final, PCs, by = "SAMPID", all.x = TRUE)

### The first five columns of both output files
ped.id.cols <- c("famid", "SAMPID", "patid", "matid", "sex")

### Only rows whose ancestry is "EUR" are written out (a missing ancestry
### is excluded)
eur.rows <- which(final$ancestry == "EUR")

### Phenotype file
phenotypes.ped <- final[eur.rows,
                        c(ped.id.cols, "cpd", "ai", "si",
                          "sc", "dpw", "dnd", "bde", "bdl")]
### Missing values are written as "x"
phenotypes.ped[is.na(phenotypes.ped)] <- "x"

### Covariate file
### NOTE(review): "sc" appears here as well as in the phenotype file -
### confirm it is meant to be a covariate.
covariates.ped <- final[eur.rows,
                        c(ped.id.cols, "Age", "age2", "sc",
                          "height", "weight", "cohort",
                          "PC1", "PC2", "PC3", "PC4", "PC5",
                          "PC6", "PC7", "PC8", "PC9", "PC10")]
covariates.ped[is.na(covariates.ped)] <- "x"

### Both files are written to the current working directory, with a header
### row and without quotes or row names.
write.table(phenotypes.ped, file = "Framingham.EUR.phenotypes.ped",
            quote = FALSE, col.names = TRUE, row.names = FALSE)

write.table(covariates.ped, file = "Framingham.EUR.covariates.ped",
            quote = FALSE, col.names = TRUE, row.names = FALSE)

</syntaxhighlight>



===Genotypes===
We used the Affy 500K genotypes found here: /work/KellerLab/GSCAN/dbGaP/Framingham/PhenoGenotypeFiles/ChildStudyConsentSet_phs000342.Framingham.v16.p10.c1.HMB-IRB-MDS/GenotypeFiles/phg000006.v9.FHS_SHARe_Affy500K.genotype-calls-matrixfmt.c1/subject_level_PLINK_sets/FHS_SHARe_Affy500K_subjects_c1.[bed|bim|fam]