Homework 6: Ancestry

==== Notes ====

==== Code for Homework ====

<syntaxhighlight lang="bash">
# PSYC 7102 -- Statistical Genetics
# Homework #6: Ancestry
# Due: November 25th

# PLEASE CONTACT DAVID TO GET YOUR NEW VCF.

# Overview: Using the variant calls in your VCF files, combined with the
# existing variant calls from the 1000 Genomes Project, you will conduct
# a principal components analysis of the 1000 Genomes samples using PLINK
# and estimate your own position in the 1000G PCA space.

# This homework is a little different because the commands are slightly
# complicated, so I've written them all for you. In addition to specific
# conceptual questions, I ask you to run the commands below (changing
# directories as needed) and then explain to me what each command did
# and why we did it. I don't need super technical details, just a solid
# conceptual overview of the purpose of each command.

### ONLY EXPLAIN COMMANDS WHERE I SPECIFICALLY REQUEST IT! YOU DO NOT
### HAVE TO EXPLAIN EVERY COMMAND! But please run all of the commands so
### that, in the end, you produce a PCA plot comparing yourself to all
### 1000 Genomes samples.

# For many questions you'll want to run analyses by chromosome. To do
#  this, please create an interactive PBS session like this:

qsub -I -l walltime=23:00:00 -q short -l mem=10gb -l nodes=1:ppn=22
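
# (Once the interactive session starts, you can confirm how many cores you
#  were allocated -- PBS lists them, one per line, in the file named by
#  $PBS_NODEFILE. A quick check, assuming a standard PBS/Torque setup:)
wc -l < $PBS_NODEFILE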

### Load apigenome, plink, and tabix
module load apigenome_0.0.2
module load plink_latest
module load tabix_0.2.6
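
# (Optional sanity check: confirm all three modules actually loaded.)
module list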


###############
### STEP 1  ###
###############
# Here I take a minute to describe the various files you'll use to get started

#    The 1000 Genomes vcfs from the 1000 Genomes Consortium
#    /Users/scvr9332/1000Gp3/variant_calls/ALL.chr[1-22].phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
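
#    (Illustrative check: a VCF's #CHROM header line has 9 fixed columns
#     followed by one column per sample, so this prints the number of
#     1000 Genomes samples in one of those files.)
zgrep -m1 '^#CHROM' /Users/scvr9332/1000Gp3/variant_calls/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | gawk '{print NF-9}'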


#    A vcf file with, depending on your level of comfort, your own
#    genotypes or 1kg sample HG00096. All examples that follow use
#    HG00096 for convenience, which can be obtained as follows. To use
#    your own sample, just replace the $10 with $NF, assuming your
#    sample is in the last column of the vcf.
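
#    (Illustrative: to confirm which column your sample occupies in your
#     own vcf, print the #CHROM header line one field per line and count.)
zgrep -m1 '^#CHROM' chr1.filtered.PASS.beagled.vcf.gz | tr '\t' '\n' | cat -n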

   (zcat chr1.filtered.PASS.beagled.vcf.gz          | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr2.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr3.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr4.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr5.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr6.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr7.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr8.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr9.filtered.PASS.beagled.vcf.gz  | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr10.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr11.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr12.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr13.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr14.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr15.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr16.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr17.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr18.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr19.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr20.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr21.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}'; \
   zgrep -v '#' chr22.filtered.PASS.beagled.vcf.gz | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}') | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.vcf.gz
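
# (The 22 lines above can equivalently be written as a loop; a minimal
#  sketch that produces the same output file -- no need to run both:)
#
#  (zcat chr1.filtered.PASS.beagled.vcf.gz | gawk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'
#   for i in {2..22}; do
#       zgrep -v '#' chr$i.filtered.PASS.beagled.vcf.gz | gawk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'
#   done) | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.vcf.gz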

### Add in rsIDs from dbSNP. PLINK needs these to reconcile
### positions/alleles
vcf-add-rsid -vcf chrALL.filtered.PASS.beagled.HG00096.vcf.gz -db /Users/scvr9332/reference_data/dbsnp_144/dbsnp.144.b37.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz
### The previous command keeps only variants with rsIDs, otherwise
### plink throws an error that there are >1 variants with ID = "."
###
###------ QUESTION 1: WHAT DOES THIS COMMAND DO? (1 point)
zgrep -E '#|\srs[0-9]' chrALL.filtered.PASS.beagled.HG00096.rsID.vcf.gz | bgzip -c > chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz

##############
### STEP 2 ###
##############
### Whole bunch of data cleaning. Retain in the 1000 Genomes
###   Consortium VCFs only biallelic SNPs that also exist in your vcf
###   files. PLINK 1.9 can't handle all variant types gracefully.

#  Get list of your SNPs
zcat chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz | cut -f3 | grep '^rs' > MyrsIDs.txt
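#  (Quick sanity check: count how many of your SNPs made the list.)
wc -l MyrsIDs.txt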
#
#  Reduce the 1000 Genomes VCFs to bi-allelic SNPs (that's all PLINK
#  can handle anyway). BE SURE TO LOG INTO A COMPUTE NODE BEFORE
#  RUNNING ANY OF THESE LOOPS!!!!!
###
###------ QUESTION 2: WHAT DOES THIS COMMAND (THE ENTIRE FOR LOOP) DO? (2 points)
for i in {1..22}; do
    zgrep -E '^#|\s[ACTG]\s[ACTG]\s' /Users/scvr9332/1000Gp3/variant_calls/ALL.chr$i.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > chr$i.1000G.biallelic.vcf.gz &
done
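
# The 22 zgrep jobs above were started in the background ('&'). Wait for
# them all to finish before moving on; do the same after each backgrounded
# loop below.
wait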

### Retain in the 1000 Genomes VCF only your SNPs that are also fairly common
### because we're going to conduct PCA on these SNPs and only want common ones.
### 
###------ QUESTION 3: WHY WOULD WE RETAIN ONLY COMMON SNPS, OTHER THAN
###------             THAT IT MAKES EVERY LATER COMMAND FASTER? (2 points)
for i in {1..22}; do
    plink --vcf chr$i.1000G.biallelic.vcf.gz \
        --extract MyrsIDs.txt \
        --maf .05 \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink &
done
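wait
# (Optional check once the jobs finish: each .bim file has one line per
#  retained variant.)
wc -l chr*.1000G.biallelic.common.plink.bim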

### Remove any remaining duplicates from the 1000G reference files
### A) Identify duplicate sites
###
###------ QUESTION 4: WALK ME THROUGH WHAT THIS COMMAND (THE ENTIRE
###------             FOR LOOP) IS DOING. (3 points)
for i in {1..22}; do
    zcat chr$i.1000G.biallelic.vcf.gz | cut -f3 | sort | uniq -c | grep -E '\s2\s' | grep '^rs' | gawk '{print $2}' >  dups$i.txt & 
done

### B) Remove the duplicate sites
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink \
        --exclude dups$i.txt \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink.nodups &
done

rm chr*.1000G.biallelic.vcf.gz
rm chr*.1000G.biallelic.common.plink.bed
rm chr*.1000G.biallelic.common.plink.bim
rm chr*.1000G.biallelic.common.plink.fam


## Identify variants in LD
###
###------ QUESTION 5: EXPLAIN WHAT THE --indep-pairwise ARGUMENT BELOW
###------             IS DOING (1 point)
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
        --indep-pairwise 100k 5 .01 \
        --maf .05 \
        --out chr$i.snpsToExtract &
done
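wait
# (The loop above writes, per chromosome, a .prune.in list of variants to
#  keep and a .prune.out list of variants to drop; e.g., for chromosome 1:)
wc -l chr1.snpsToExtract.prune.in chr1.snpsToExtract.prune.out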


## Extract LD-independent variants
for i in {1..22}; do
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups \
        --extract chr$i.snpsToExtract.prune.in \
        --make-bed \
        --out chr$i.1000G.biallelic.common.plink.nodups.LDpruned &
done

rm chr*.1000G.biallelic.common.plink.nodups.bed
rm chr*.1000G.biallelic.common.plink.nodups.bim
rm chr*.1000G.biallelic.common.plink.nodups.fam
rm dups*


##############
### STEP 3 ###
##############
### Merge the new and improved 1000 Genomes plink files into a single
### file with all chromosomes. This is easier to do when they're vcf
### files
### Example Command:
for i in {1..22}; do 
    plink --bfile chr$i.1000G.biallelic.common.plink.nodups.LDpruned --recode vcf --out tmp$i &
done
    
(cat tmp1.vcf; grep -v '#' tmp2.vcf; grep -v '#' tmp3.vcf; grep -v '#' tmp4.vcf; grep -v '#' tmp5.vcf; \
    grep -v '#' tmp6.vcf; grep -v '#' tmp7.vcf; grep -v '#' tmp8.vcf; grep -v '#' tmp9.vcf; grep -v '#' tmp10.vcf; \
    grep -v '#' tmp11.vcf; grep -v '#' tmp12.vcf; grep -v '#' tmp13.vcf; grep -v '#' tmp14.vcf; grep -v '#' tmp15.vcf; \
    grep -v '#' tmp16.vcf; grep -v '#' tmp17.vcf; grep -v '#' tmp18.vcf; grep -v '#' tmp19.vcf; grep -v '#' tmp20.vcf; \
    grep -v '#' tmp21.vcf; grep -v '#' tmp22.vcf) | bgzip -c > chrALL.1000G.biallelic.common.plink.nodups.LDpruned.vcf.gz

###----- QUESTION 6: HOW MANY VARIANTS ARE IN THE OUTPUT FILE FROM THE
###-----             COMMAND ABOVE? (1 point)


plink --vcf chrALL.1000G.biallelic.common.plink.nodups.LDpruned.vcf.gz \
    --make-bed \
    --out chrALL.1000G.biallelic.common.plink.nodups.LDpruned   

rm tmp*
rm chr[0-9]*.bed
rm chr[0-9]*.bim
rm chr[0-9]*.fam
rm chr[0-9]*.nosex
rm chr[0-9]*.log
rm chr[0-9]*.prune.in
rm chr[0-9]*.prune.out


##############
### STEP 4 ###
##############
# Conduct a PCA on these new and improved 1000 Genomes plink files.
plink --bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned --pca var-wts
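
# --pca writes the sample-level PCs to plink.eigenvec (and eigenvalues to
# plink.eigenval); the 'var-wts' modifier additionally writes per-variant
# weights to plink.eigenvec.var, which is the file the R code below reads.
# (Optional peek:)
head -3 plink.eigenvec.var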

# Use R to organize the results from the PCA into a file that can then be
# imported into PLINK using the --score command
echo "### Extract per - SNP weights for each of the first 20 PCs and get the
### correct effect allele for later scoring
###
PCA <- read.table('plink.eigenvec.var', header = F, col.names = c('CHROM','RS',paste('PC',1:20,sep=''))) 
bim <- read.table('chrALL.1000G.biallelic.common.plink.nodups.LDpruned.bim', header = F, col.names = c('CHROM','RS','unknown','POS','A1','A2')) 
x   <- merge (PCA, bim) 
write.table(subset(x, select = c('RS','A1', paste('PC', 1:20, sep = '' ))) , file = 'PCA-score-file.txt', quote=F) 
### End: extract per - SNP weights for each of the first 20
" | R --vanilla

##############
### STEP 5 ###
##############
### Score yourself (or 1000 Genomes sample HG00096) on the PCs

### Calculate the score for each person by taking the PC-weighted
### average for each site in the score file.
###
###------ QUESTION 7: EXPLAIN WHAT THE FOLLOWING COMMANDS ARE DOING (2 points)
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz  --score PCA-score-file.txt 2 3 4 header --out PC1
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz  --score PCA-score-file.txt 2 3 5 header --out PC2
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz  --score PCA-score-file.txt 2 3 6 header --out PC3
plink --vcf chrALL.filtered.PASS.beagled.HG00096.rsIDsOnly.vcf.gz  --score PCA-score-file.txt 2 3 7 header --out PC4

plink --bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned  --score PCA-score-file.txt 2 3 4 header --out PC1.1kg
plink --bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned  --score PCA-score-file.txt 2 3 5 header --out PC2.1kg
plink --bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned  --score PCA-score-file.txt 2 3 6 header --out PC3.1kg
plink --bfile chrALL.1000G.biallelic.common.plink.nodups.LDpruned  --score PCA-score-file.txt 2 3 7 header --out PC4.1kg
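
# (Each --score run writes its result to a <--out>.profile file, which the
#  R code below reads; optional peek:)
head -3 PC1.profile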


##############
### STEP 6 ###
##############
### Plot the results!  
###
### To run the following command you'll have to install the R package
### 'car'. To do this on vieques, please run the following (you only
### need to run it once):

mkdir /Users/YourUniqueName/R
export R_LIBS=/Users/scvr9332/R ## replace my unique name with yours
module load R_3.2.2
R
### Then in this R session run: 'install.packages('car', repos='http://cran.r-project.org')'
### If you log out and back in, you'll have to run this command again to get the car library loaded:
export R_LIBS=/Users/scvr9332/R


###------ QUESTION 8: PLEASE INTERPRET THE RESULTING GRAPH. WHAT IS
###------             BEING PLOTTED? WHAT DOES IT SAY ABOUT THE
###------             ANCESTRY OF YOUR SAMPLE (OR WHATEVER SAMPLE YOU
###------             USED)? (2 points)
echo "

### Read in scores for PC1 and PC2 in 1000g
kg_PC1 <- read.table('PC1.1kg.profile', header=T)
kg_PC2 <- read.table('PC2.1kg.profile', header=T)
kg_PC3 <- read.table('PC3.1kg.profile', header=T)
kg_PC4 <- read.table('PC4.1kg.profile', header=T)

### 'Self-reported' ancestry of 1000g participants
kg_sf <- read.table('/Users/scvr9332/PCA/20130502.sequence.index', header=T, sep='\t', fill=T, stringsAsFactors=F)

sample_ids <- unique(data.frame(IID=kg_sf$SAMPLE_NAME, POPULATION=kg_sf$POPULATION, stringsAsFactors=F))

sample_ids$CONTINENT <- ifelse(sample_ids$POPULATION=='CHB' |
                               sample_ids$POPULATION=='JPT' |
                               sample_ids$POPULATION=='CHS' |
                               sample_ids$POPULATION=='CDX' |
                               sample_ids$POPULATION=='KHV' |
                               sample_ids$POPULATION=='CHD', 'EAS', sample_ids$POPULATION)
sample_ids$CONTINENT <- ifelse(sample_ids$CONTINENT=='CEU' |
                               sample_ids$CONTINENT=='TSI' |
                               sample_ids$CONTINENT=='GBR' |
                               sample_ids$CONTINENT=='FIN' |
                               sample_ids$CONTINENT=='IBS', 'EUR', sample_ids$CONTINENT)
sample_ids$CONTINENT <- ifelse(sample_ids$CONTINENT=='YRI' |
                               sample_ids$CONTINENT=='LWK' |
                               sample_ids$CONTINENT=='GWD' |
                               sample_ids$CONTINENT=='MSL' |
                               sample_ids$CONTINENT=='ESN', 'AFR', sample_ids$CONTINENT)
sample_ids$CONTINENT <- ifelse(sample_ids$CONTINENT=='ASW' |
                               sample_ids$CONTINENT=='ACB' |
                               sample_ids$CONTINENT=='MXL' |
                               sample_ids$CONTINENT=='PUR' |
                               sample_ids$CONTINENT=='CLM' |
                               sample_ids$CONTINENT=='PEL', 'ADM', sample_ids$CONTINENT)
sample_ids$CONTINENT <- ifelse(sample_ids$CONTINENT=='GIH' |
                               sample_ids$CONTINENT=='PJL' |
                               sample_ids$CONTINENT=='BEB' |
                               sample_ids$CONTINENT=='STU' |
                               sample_ids$CONTINENT=='ITU', 'SAS', sample_ids$CONTINENT)

sample_ids$CONTINENT_numeric <- ifelse(sample_ids$CONTINENT == 'EAS', 1, sample_ids$CONTINENT)
sample_ids$CONTINENT_numeric <- ifelse(sample_ids$CONTINENT == 'EUR', 1, sample_ids$CONTINENT_numeric)
sample_ids$CONTINENT_numeric <- ifelse(sample_ids$CONTINENT == 'AFR', 1, sample_ids$CONTINENT_numeric)
sample_ids$CONTINENT_numeric <- ifelse(sample_ids$CONTINENT == 'ADM', 1, sample_ids$CONTINENT_numeric)
sample_ids$CONTINENT_numeric <- ifelse(sample_ids$CONTINENT == 'SAS', 1, sample_ids$CONTINENT_numeric)

kg_PC1 <- merge(kg_PC1, sample_ids)

My_PC1 <- read.table('PC1.profile', header=T)
My_PC2 <- read.table('PC2.profile', header=T)
My_PC3 <- read.table('PC3.profile', header=T)
My_PC4 <- read.table('PC4.profile', header=T)

kg <- data.frame(PC1 = -kg_PC1$SCORE,
                 PC2 = -kg_PC2$SCORE,
                 PC3 = -kg_PC3$SCORE,
                 PC4 = -kg_PC4$SCORE,
                 ancestry = kg_PC1$CONTINENT)

all <- rbind(kg, data.frame(PC1=-My_PC1$SCORE,
                            PC2=-My_PC2$SCORE,
                            PC3=-My_PC3$SCORE,
                            PC4=-My_PC4$SCORE,
                            ancestry=rep('Me', nrow(My_PC1))))

### Scatterplot matrix
library(car)
pdf('ancestry_Scatterplot.pdf')
scatterplotMatrix(~PC1+PC2+PC3+PC4 | ancestry, data=all, smoother=FALSE, reg.line=FALSE, cex=.7)
dev.off()
" | R --vanilla