################################################################################################################################################################################################
################################################################################################################################################################################################
## This is the code used to perform the principle components analysis (PCA) using genotype data from TCGA
## for the manuscript titled Weak sharing of genetic association signals in three lung cancer subtypes: evidence at the SNP, gene, regulation and pathway levels.
################################################################################################################################################################################################
################################################################################################################################################################################################

### Section 1. Format genotype files downloaded from TCGA, extract blood normal samples, and exclude all non-white and white hispanic samples

################################################################################################################################################################################################
################################################################################################################################################################################################

### all genotype samples and clinical information downloaded from https://portal.gdc.cancer.gov/legacy-archive/search/f using the GDC transfer tool for six cancer types (after permission):
### LUAD, LUSC, HNSC, BLCA, GBM, and LGG
### for each cancer type, extract sdrf.txt file from one of the samples (file is same for every sample) to use for annotation
### 

## start new R session for working with LUAD

## load libraries

library('reshape2')

## load all samples from TCGA for LUAD 

all.luad <- list.files("LUAD/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("LUAD/") ## change directory for LUAD samples

## load sdrf file that contains the naming information for all LUAD samples

luad.names <- read.delim("broad.mit.edu_LUAD.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.luad.split <- colsplit(all.luad, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

luad.final.matched <- luad.names[match(all.luad.split$sample_name, luad.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

luad.tcga.split.barcode <- colsplit(luad.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

luad.germline.blood <- luad.final.matched[grep(10, luad.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

luad.clin <- read.delim('nationwidechildrens.org_clinical_patient_luad.txt', skip = 1)

luad.clin.to.use <- luad.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

luad.samples.white <- luad.clin.to.use[which(luad.clin.to.use$race == "WHITE"), ] ## extract white only samples

luad.samples.final.pop <- luad.samples.white[which(luad.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

luad.germline.blood.split <- colsplit(luad.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

luad.germline.blood$patient_ID <- luad.germline.blood.split$patient_ID

luad.germline.blood.samples.white.only <- luad.germline.blood[which(luad.germline.blood$patient_ID %in% luad.samples.final.pop$bcr_patient_barcode), ]

## write this list to a new file 

write.table(luad.germline.blood.samples.white.only, file = "/LUAD/blood_normal_white_only_LUAD.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________

## start new R session for working with LUSC

## load libraries

library('reshape2')

## load all samples from TCGA for LUAD 

all.lusc <- list.files("LUSC/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("LUSC/") ## change directory for LUSC samples

## load sdrf file that contains the naming information for all LUSC samples

lusc.names <- read.delim("broad.mit.edu_LUSC.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.lusc.split <- colsplit(all.lusc, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

lusc.final.matched <- lusc.names[match(all.lusc.split$sample_name, lusc.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

lusc.tcga.split.barcode <- colsplit(lusc.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

lusc.germline.blood <- lusc.final.matched[grep(10, lusc.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

lusc.clin <- read.delim('nationwidechildrens.org_clinical_patient_lusc.txt', skip = 1)

lusc.clin.to.use <- lusc.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

lusc.samples.white <- lusc.clin.to.use[which(lusc.clin.to.use$race == "WHITE"), ] ## extract white only samples

lusc.samples.final.pop <- lusc.samples.white[which(lusc.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

lusc.germline.blood.split <- colsplit(lusc.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

lusc.germline.blood$patient_ID <- lusc.germline.blood.split$patient_ID

lusc.germline.blood.samples.white.only <- lusc.germline.blood[which(lusc.germline.blood$patient_ID %in% lusc.samples.final.pop$bcr_patient_barcode), ]

## write this list to a new file 

write.table(lusc.germline.blood.samples.white.only, file = "/LUSC/blood_normal_white_only_LUSC.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________

## start new R session for working with HNSC

## load libraries

library('reshape2')

## load all samples from TCGA for HNSC 

all.hnsc <- list.files("HNSC/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("HNSC/") ## change directory for HNSC samples

## load sdrf file that contains the naming information for all HNSC samples

hnsc.names <- read.delim("broad.mit.edu_HNSC.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.hnsc.split <- colsplit(all.hnsc, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

hnsc.final.matched <- hnsc.names[match(all.hnsc.split$sample_name, hnsc.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

hnsc.tcga.split.barcode <- colsplit(hnsc.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

hnsc.germline.blood <- hnsc.final.matched[grep(10, hnsc.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

hnsc.clin <- read.delim('nationwidechildrens.org_clinical_patient_hnsc.txt', skip = 1)

hnsc.clin.to.use <- hnsc.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

hnsc.samples.white <- hnsc.clin.to.use[which(hnsc.clin.to.use$race == "WHITE"), ] ## extract white only samples

hnsc.samples.final.pop <- hnsc.samples.white[which(hnsc.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

hnsc.germline.blood.split <- colsplit(hnsc.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

hnsc.germline.blood$patient_ID <- hnsc.germline.blood.split$patient_ID

hnsc.germline.blood.samples.white.only <- hnsc.germline.blood[which(hnsc.germline.blood$patient_ID %in% hnsc.samples.final.pop$bcr_patient_barcode), ]

## write this list to a new file 

write.table(hnsc.germline.blood.samples.white.only, file = "/HNSC/blood_normal_white_only_HNSC.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________

## start new R session for working with BLCA

## load libraries

library('reshape2')

## load all samples from TCGA for BLCA 

all.blca <- list.files("BLCA/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("BLCA/") ## change directory for BLCA samples

## load sdrf file that contains the naming information for all BLCA samples

blca.names <- read.delim("broad.mit.edu_BLCA.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.blca.split <- colsplit(all.blca, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

blca.final.matched <- blca.names[match(all.blca.split$sample_name, blca.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

blca.tcga.split.barcode <- colsplit(blca.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

blca.germline.blood <- blca.final.matched[grep(10, blca.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

blca.clin <- read.delim('nationwidechildrens.org_clinical_patient_blca.txt', skip = 1)

blca.clin.to.use <- blca.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

blca.samples.white <- blca.clin.to.use[which(blca.clin.to.use$race == "WHITE"), ] ## extract white only samples

blca.samples.final.pop <- blca.samples.white[which(blca.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

blca.germline.blood.split <- colsplit(blca.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

blca.germline.blood$patient_ID <- blca.germline.blood.split$patient_ID

blca.germline.blood.samples.white.only <- blca.germline.blood[which(blca.germline.blood$patient_ID %in% blca.samples.final.pop$bcr_patient_barcode), ]

write.table(blca.germline.blood.samples.white.only, file = "/BLCA/blood_normal_white_only_BLCA.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________

## start new R session for working with GBM

## load libraries

library('reshape2')

## load all samples from TCGA for GBM 

all.gbm <- list.files("GBM/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("GBM/") ## change directory for GBM samples

## load sdrf file that contains the naming information for all GBM samples

gbm.names <- read.delim("broad.mit.edu_GBM.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.gbm.split <- colsplit(all.gbm, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

gbm.final.matched <- gbm.names[match(all.gbm.split$sample_name, gbm.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

gbm.tcga.split.barcode <- colsplit(gbm.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

gbm.germline.blood <- gbm.final.matched[grep(10, gbm.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

gbm.clin <- read.delim('nationwidechildrens.org_clinical_patient_gbm.txt', skip = 1)

gbm.clin.to.use <- gbm.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

gbm.samples.white <- gbm.clin.to.use[which(gbm.clin.to.use$race == "WHITE"), ] ## extract white only samples

gbm.samples.final.pop <- gbm.samples.white[which(gbm.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

gbm.germline.blood.split <- colsplit(gbm.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

gbm.germline.blood$patient_ID <- gbm.germline.blood.split$patient_ID

gbm.germline.blood.samples.white.only <- gbm.germline.blood[which(gbm.germline.blood$patient_ID %in% gbm.samples.final.pop$bcr_patient_barcode), ]
 
write.table(gbm.germline.blood.samples.white.only, file = "/GBM/blood_normal_white_only_GBM.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________

## start new R session for working with LGG

## load libraries

library('reshape2')

## load all samples from TCGA for LGG 

all.lgg <- list.files("LGG/", recursive = TRUE, pattern = "*.birdseed.data.txt$")

setwd("LGG/") ## change directory for GBM samples

## load sdrf file that contains the naming information for all GBM samples

lgg.names <- read.delim("broad.mit.edu_LGG.Genome_Wide_SNP_6.sdrf.txt", header = TRUE)

all.lgg.split <- colsplit(all.lgg, "/", c("folder_name", "sample_name")) ## split the file names to be compatible with the sdrf.txt file

lgg.final.matched <- lgg.names[match(all.lgg.split$sample_name, lgg.names$Derived.Array.Data.Matrix.File.1), c(1,2,31)] ## match file names to sdrf 

lgg.tcga.split.barcode <- colsplit(lgg.final.matched$Comment..TCGA.Barcode., "-", c("project", "TSS", "participant", "sample", "portion", "plate", "center"))

## extract the list of germline blood only files using the TCGA code

lgg.germline.blood <- lgg.final.matched[grep(10, lgg.tcga.split.barcode$sample), ]

## load the clinical file from TCGA

lgg.clin <- read.delim('nationwidechildrens.org_clinical_patient_lgg.txt', skip = 1)

lgg.clin.to.use <- lgg.clin[-1, ] ## remove first row to use file for matching

### extract only white non-hispanic samples

lgg.samples.white <- lgg.clin.to.use[which(lgg.clin.to.use$race == "WHITE"), ] ## extract white only samples

lgg.samples.final.pop <- lgg.samples.white[which(lgg.samples.white$ethnicity == "NOT HISPANIC OR LATINO"), ] ## remove Hispanic or Latino samples

## split columns and filter blood germline list to exclude non-white or white hispanic samples

lgg.germline.blood.split <- colsplit(lgg.germline.blood$Comment..TCGA.Barcode., "-10", c("patient_ID", "sample_info"))

lgg.germline.blood$patient_ID <- lgg.germline.blood.split$patient_ID

lgg.germline.blood.samples.white.only <- lgg.germline.blood[which(lgg.germline.blood$patient_ID %in% lgg.samples.final.pop$bcr_patient_barcode), ]

write.table(lgg.germline.blood.samples.white.only, file = "/LGG/blood_normal_white_only_LGG.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
______________________________________________________________________________________________________________________________________________________________________________________________________


################################################################################################################################################################################################
################################################################################################################################################################################################

### Section 2. Extract file names, copy files to keep backup, and format low-confidence calls

################################################################################################################################################################################################
################################################################################################################################################################################################

### first, write individual shell code to extract the last column of germline blood names, and copy the correct files to new directories
### read them from file with three columns, take those names and paste them in new directories

### first have to extract just the third column from the genotype files

## extract third column from each file to make a new file to use

cut -f3 blood_normal_white_only_LUAD.tsv > blood_normal_white_only_for_copy_LUAD.tsv

cut -f3 blood_normal_white_only_LUSC.tsv > blood_normal_white_only_for_copy_LUSC.tsv

cut -f3 blood_normal_white_only_HNSC.tsv > blood_normal_white_only_for_copy_HNSC.tsv

cut -f3 blood_normal_white_only_BLCA.tsv > blood_normal_white_only_for_copy_BLCA.tsv

cut -f3 blood_normal_white_only_GBM.tsv > blood_normal_white_only_for_copy_GBM.tsv

cut -f3 blood_normal_white_only_LGG.tsv > blood_normal_white_only_for_copy_LGG.tsv

## create new directories for copied files to use 

mkdir LUAD_TCGA_normal_blood_white_only
mkdir LUSC_TCGA_normal_blood_white_only
mkdir HNSC_TCGA_normal_blood_white_only
mkdir BLCA_TCGA_normal_blood_white_only
mkdir GBM_TCGA_normal_blood_white_only
mkdir LGG_TCGA_normal_blood_white_only

#### now, make scripts and run the scripts from the germline_genotypes directories generated above

#################### LUAD ############################# 

vi LUAD_blood_genotype_white_only_copy.sh ## make new file

#!/bin/bash

while read line
do 
	find LUAD -iname "$line" -exec cp '{}' LUAD_TCGA_normal_blood_white_only \; 
done < LUAD/blood_normal_white_only_for_copy_LUAD.tsv

chmod u+x LUAD_blood_genotype_white_only_copy.sh ## change permissions to run

#################### LUSC ############################# 

vi LUSC_blood_genotype_white_only_copy.sh

#!/bin/bash

while read line
do 
	find LUSC -iname "$line" -exec cp '{}' LUSC_TCGA_normal_blood_white_only \; 
done < LUSC/blood_normal_white_only_for_copy_LUSC.tsv


chmod u+x LUSC_blood_genotype_white_only_copy.sh ## change permissions to run

#################### HNSC ############################# 

vi HNSC_blood_genotype_white_only_copy.sh

#!/bin/bash

while read line
do 
	find HNSC -iname "$line" -exec cp '{}' HNSC_TCGA_normal_blood_white_only \; 
done < HNSC/blood_normal_white_only_for_copy_HNSC.tsv

chmod u+x HNSC_blood_genotype_white_only_copy.sh ## change permissions to run

#################### BLCA ############################# 

vi BLCA_blood_genotype_white_only_copy.sh

#!/bin/bash

while read line
do 
	find BLCA -iname "$line" -exec cp '{}' BLCA_TCGA_normal_blood_white_only \; 
done < BLCA/blood_normal_white_only_for_copy_BLCA.tsv

chmod u+x BLCA_blood_genotype_white_only_copy.sh

#################### GBM ############################# 

vi GBM_blood_genotype_white_only_copy.sh

#!/bin/bash

while read line
do 
	find GBM -iname "$line" -exec cp '{}' GBM_TCGA_normal_blood_white_only \; 
done < GBM/blood_normal_white_only_for_copy_GBM.tsv

chmod u+x GBM_blood_genotype_white_only_copy.sh

#################### LGG ############################# 

vi LGG_blood_genotype_white_only_copy.sh

#!/bin/bash

while read line
do 
	find LGG -iname "$line" -exec cp '{}' LGG_TCGA_normal_blood_white_only \; 
done < LGG/blood_normal_white_only_for_copy_LGG.tsv

chmod u+x LGG_blood_genotype_white_only_copy.sh

#########################################################################################################
### birdseed non-confident calls have score > 0.1, so need to remove these calls
### Take each of those files and replace genotype with -9 if confidence score is > 0.1
#########################################################################################################

## Replace all values with low confidence calls (> 0.1) to -9 in all files

for f in LUAD_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done



for f in LUSC_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done



for f in HNSC_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done



for f in BLCA_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done



for f in GBM_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done



for f in LGG_TCGA_normal_blood_white_only/*.birdseed.data.txt
	do sed 's/ //g' $f | awk '{if ($3>0.1){$2=-9}}{print $0}' | sed 's/ /\t/g' > $f.-9_conversion_white_only.txt
done


################################################################################################################################################################################################
################################################################################################################################################################################################

### Section 3. Combine all of the genotype calls together per cancer type

################################################################################################################################################################################################
################################################################################################################################################################################################


############ now need to combine all of these files together only keeping the genotype data

### code below will be used for separate shell scripts that are executed on the linux command line


######################################################## LUAD ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_LUAD.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in LUAD_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_LUAD.sh

./combine_reformatted_tcga_blood_genotype_white_only_LUAD.sh ## run the script 
/tmp/tmp.60ypaUcH1Q ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.60ypaUcH1Q /reformatted_files/LUAD_combined_reformatted_genotype_files_white_only.txt

######################################################## LUSC ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_LUSC.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in LUSC_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_LUSC.sh

./combine_reformatted_tcga_blood_genotype_white_only_LUSC.sh ## run the script 
/tmp/tmp.D9raYSwbz7 ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.D9raYSwbz7 /reformatted_files/LUSC_combined_reformatted_genotype_files_white_only.txt

######################################################## HNSC ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_HNSC.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in HNSC_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_HNSC.sh

./combine_reformatted_tcga_blood_genotype_white_only_HNSC.sh ## run the script 
/tmp/tmp.WPM03Qv36Q ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.WPM03Qv36Q /reformatted_files/HNSC_combined_reformatted_genotype_files_white_only.txt

######################################################## BLCA ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_BLCA.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in BLCA_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_BLCA.sh

./combine_reformatted_tcga_blood_genotype_white_only_BLCA.sh ## run the script 
/tmp/tmp.fPreKxKsGF ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.fPreKxKsGF /reformatted_files/BLCA_combined_reformatted_genotype_files_white_only.txt

######################################################## GBM ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_GBM.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in GBM_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_GBM.sh

./combine_reformatted_tcga_blood_genotype_white_only_GBM.sh ## run the script 
/tmp/tmp.9fkiRWhmni ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.9fkiRWhmni /reformatted_files/GBM_combined_reformatted_genotype_files_white_only.txt

######################################################## LGG ########################################################

vi combine_reformatted_tcga_blood_genotype_white_only_LGG.sh

#!/bin/bash

tmp=$(mktemp) 
tmp2=$(mktemp) 
tmp3=$(mktemp) ## make temp directories to hold results

echo $tmp;

for file in LGG_TCGA_normal_blood_white_only/*.-9_conversion_white_only.txt
do
        if [ -s "$tmp" ]
        then
                cut -f 1,2 "$file" > "$tmp3"
                join "$tmp" "$tmp3" > "$tmp2"
        else
                cut -f 1,2 "$file" > "$tmp3"
                cp "$tmp3" "$tmp2"
        fi
        cp "$tmp2" "$tmp"
done

chmod u+x combine_reformatted_tcga_blood_genotype_white_only_LGG.sh

./combine_reformatted_tcga_blood_genotype_white_only_LGG.sh ## run the script 
/tmp/tmp.xNE03ajAQI ## note the tmp location 

## when finished running script copy tmp files to new location

cp -i /tmp/tmp.xNE03ajAQI /reformatted_files/LGG_combined_reformatted_genotype_files_white_only.txt

################################################################################################################################################################################################
################################################################################################################################################################################################

### Section 4. Use R to convert these combined genotype files to PLINK allele coded format of A,C,G,T and 0 0 for missing calls

################################################################################################################################################################################################
################################################################################################################################################################################################


######################################################## LUAD ########################################################

### start new R session 

## read in combined genotypes file

luad.combined <- read.table("/reformatted_files/LUAD_combined_reformatted_genotype_files_white_only.txt")

## need to remove top two header lines of the combined file

luad.combined.2 <- luad.combined[-c(1,2), ] 

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA data

luad.combined.merge <- merge(affymap[, c(1:5)], luad.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

luad.map.file.for.plink <- luad.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

luad.sample.names <- as.data.frame(paste("LUAD_sample", seq(1:ncol(luad.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(luad.map.file.for.plink$Physical.Position == "---")

luad.map.new.to.use <- luad.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

luad.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], luad.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

luad.combined.merge.3 <- luad.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove luad.combined.merge.2. so no mistakes

rm(luad.combined.merge.2)

## now, run the conversion

luad.ped <- ifelse(luad.combined.merge.3 == 0, paste(luad.combined.merge.3$Allele.A, luad.combined.merge.3$Allele.A, sep = " "), ifelse(luad.combined.merge.3 == 1, paste(luad.combined.merge.3$Allele.A, luad.combined.merge.3$Allele.B, sep = " "), ifelse(luad.combined.merge.3 == 2, paste(luad.combined.merge.3$Allele.B, luad.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

luad.ped.for.plink <- luad.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

luad.ped.for.plink.t <- t(luad.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(luad.ped.for.plink.t, file = "LUAD_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(luad.map.new.to.use, file = "LUAD_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(luad.sample.names, file = "LUAD_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

######################################################## LUSC ########################################################

### start new R session 

## read in combined genotypes file

lusc.combined <- read.table("/reformatted_files/LUSC_combined_reformatted_genotype_files_white_only.txt") 

## need to remove top two header lines of the combined file

lusc.combined.2 <- lusc.combined[-c(1,2), ]

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA data

lusc.combined.merge <- merge(affymap[, c(1:5)], lusc.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

lusc.map.file.for.plink <- lusc.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

lusc.sample.names <- as.data.frame(paste("LUSC_sample", seq(1:ncol(lusc.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(lusc.map.file.for.plink$Physical.Position == "---")

lusc.map.new.to.use <- lusc.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

lusc.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], lusc.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

lusc.combined.merge.3 <- lusc.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove lusc.combined.merge.2. so no mistakes

rm(lusc.combined.merge.2)

## now, run the conversion

lusc.ped <- ifelse(lusc.combined.merge.3 == 0, paste(lusc.combined.merge.3$Allele.A, lusc.combined.merge.3$Allele.A, sep = " "), ifelse(lusc.combined.merge.3 == 1, paste(lusc.combined.merge.3$Allele.A, lusc.combined.merge.3$Allele.B, sep = " "), ifelse(lusc.combined.merge.3 == 2, paste(lusc.combined.merge.3$Allele.B, lusc.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

lusc.ped.for.plink <- lusc.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

lusc.ped.for.plink.t <- t(lusc.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(lusc.ped.for.plink.t, file = "LUSC_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(lusc.map.new.to.use, file = "LUSC_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(lusc.sample.names, file = "LUSC_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

######################################################## HNSC ########################################################

### start new R session 

## read in combined genotypes file

hnsc.combined <- read.table("/reformatted_files/HNSC_combined_reformatted_genotype_files_white_only.txt") 

## need to remove top two header lines of the combined file

hnsc.combined.2 <- hnsc.combined[-c(1,2), ]

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA data

hnsc.combined.merge <- merge(affymap[, c(1:5)], hnsc.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

hnsc.map.file.for.plink <- hnsc.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

hnsc.sample.names <- as.data.frame(paste("HNSC_sample", seq(1:ncol(hnsc.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(hnsc.map.file.for.plink$Physical.Position == "---")

hnsc.map.new.to.use <- hnsc.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

hnsc.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], hnsc.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

hnsc.combined.merge.3 <- hnsc.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove hnsc.combined.merge.2. so no mistakes

rm(hnsc.combined.merge.2)

## now, run the conversion

hnsc.ped <- ifelse(hnsc.combined.merge.3 == 0, paste(hnsc.combined.merge.3$Allele.A, hnsc.combined.merge.3$Allele.A, sep = " "), ifelse(hnsc.combined.merge.3 == 1, paste(hnsc.combined.merge.3$Allele.A, hnsc.combined.merge.3$Allele.B, sep = " "), ifelse(hnsc.combined.merge.3 == 2, paste(hnsc.combined.merge.3$Allele.B, hnsc.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

hnsc.ped.for.plink <- hnsc.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

hnsc.ped.for.plink.t <- t(hnsc.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(hnsc.ped.for.plink.t, file = "HNSC_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(hnsc.map.new.to.use, file = "HNSC_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(hnsc.sample.names, file = "HNSC_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

######################################################## BLCA ########################################################

### start new R session 

## read in combined genotypes file

blca.combined <- read.table("/reformatted_files/BLCA_combined_reformatted_genotype_files_white_only.txt")

## need to remove top two header lines of the combined file

blca.combined.2 <- blca.combined[-c(1,2), ]

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA results

blca.combined.merge <- merge(affymap[, c(1:5)], blca.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

blca.map.file.for.plink <- blca.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

blca.sample.names <- as.data.frame(paste("BLCA_sample", seq(1:ncol(blca.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(blca.map.file.for.plink$Physical.Position == "---")

blca.map.new.to.use <- blca.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

blca.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], blca.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

blca.combined.merge.3 <- blca.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove blca.combined.merge.2. so no mistakes

rm(blca.combined.merge.2)

## now, run the conversion

blca.ped <- ifelse(blca.combined.merge.3 == 0, paste(blca.combined.merge.3$Allele.A, blca.combined.merge.3$Allele.A, sep = " "), ifelse(blca.combined.merge.3 == 1, paste(blca.combined.merge.3$Allele.A, blca.combined.merge.3$Allele.B, sep = " "), ifelse(blca.combined.merge.3 == 2, paste(blca.combined.merge.3$Allele.B, blca.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

blca.ped.for.plink <- blca.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

blca.ped.for.plink.t <- t(blca.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(blca.ped.for.plink.t, file = "BLCA_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(blca.map.new.to.use, file = "BLCA_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(blca.sample.names, file = "BLCA_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

######################################################## GBM ########################################################

### start new R session 

## read in combined genotypes file

gbm.combined <- read.table("/reformatted_files/GBM_combined_reformatted_genotype_files_white_only.txt")

## need to remove top two header lines of the combined file

gbm.combined.2 <- gbm.combined[-c(1,2), ]

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA results

gbm.combined.merge <- merge(affymap[, c(1:5)], gbm.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

gbm.map.file.for.plink <- gbm.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

gbm.sample.names <- as.data.frame(paste("GBM_sample", seq(1:ncol(gbm.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(gbm.map.file.for.plink$Physical.Position == "---")

gbm.map.new.to.use <- gbm.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

gbm.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], gbm.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

gbm.combined.merge.3 <- gbm.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove gbm.combined.merge.2. so no mistakes

rm(gbm.combined.merge.2)

## now, run the conversion

gbm.ped <- ifelse(gbm.combined.merge.3 == 0, paste(gbm.combined.merge.3$Allele.A, gbm.combined.merge.3$Allele.A, sep = " "), ifelse(gbm.combined.merge.3 == 1, paste(gbm.combined.merge.3$Allele.A, gbm.combined.merge.3$Allele.B, sep = " "), ifelse(gbm.combined.merge.3 == 2, paste(gbm.combined.merge.3$Allele.B, gbm.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

gbm.ped.for.plink <- gbm.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

gbm.ped.for.plink.t <- t(gbm.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(gbm.ped.for.plink.t, file = "GBM_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(gbm.map.new.to.use, file = "GBM_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(gbm.sample.names, file = "GBM_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

######################################################## LGG ########################################################

### start new R session 

## read in combined genotypes file

lgg.combined <- read.table("/reformatted_files/LGG_combined_reformatted_genotype_files_white_only.txt")

## need to remove top two header lines of the combined file

lgg.combined.2 <- lgg.combined[-c(1,2), ]

### load the affy file downloaded from http://www.affymetrix.com/Auth/analysis/downloads/na35/genotyping and skip all the heading information

affymap <- read.csv("GenomeWideSNP_6.na35.annot.csv", header = TRUE, skip = 18)

## merge together some of the attributes from mapping file with the TCGA results

lgg.combined.merge <- merge(affymap[, c(1:5)], lgg.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1")) ## this merges the files to only keep the first 5 columns from affymapping file

lgg.map.file.for.plink <- lgg.combined.merge[ ,c(3,2,4)] ## this makes the initial map file for plink

lgg.sample.names <- as.data.frame(paste("LGG_sample", seq(1:ncol(lgg.combined.2[ ,-1])), sep = "")) ## this is needed for ped file of sample names.

## need to remove probe IDs without SNP positions and make the new map and genotype files

positions.to.remove <- which(lgg.map.file.for.plink$Physical.Position == "---")

lgg.map.new.to.use <- lgg.map.file.for.plink[-positions.to.remove, ]

## now make genotype ped file   

### first make new merged file with the alleles 

lgg.combined.merge.2 <- merge(affymap[, c(1:5,9,10)], lgg.combined.2, by.x=c("Probe.Set.ID"), by.y=c("V1"))

### remove bad probes without SNP positions

lgg.combined.merge.3 <- lgg.combined.merge.2[-positions.to.remove, ]

## now recode the 0,1,2 alleles to the actual bases for PLINK

## first remove lgg.combined.merge.2. so no mistakes

rm(lgg.combined.merge.2)

## now, run the conversion

lgg.ped <- ifelse(lgg.combined.merge.3 == 0, paste(lgg.combined.merge.3$Allele.A, lgg.combined.merge.3$Allele.A, sep = " "), ifelse(lgg.combined.merge.3 == 1, paste(lgg.combined.merge.3$Allele.A, lgg.combined.merge.3$Allele.B, sep = " "), ifelse(lgg.combined.merge.3 == 2, paste(lgg.combined.merge.3$Allele.B, lgg.combined.merge.3$Allele.B, sep = " "), paste("0","0", sep = " "))))

lgg.ped.for.plink <- lgg.ped[ ,-c(1:7)] ### removes the non sample stuff from the ped file

lgg.ped.for.plink.t <- t(lgg.ped.for.plink) ### transposes file to be used for PLINK

## write new files for PLINK

write.table(lgg.ped.for.plink.t, file = "LGG_white_only_genotype_file_for_plink.ped", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(lgg.map.new.to.use, file = "LGG_white_only_genotype_file_for_plink.map", row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(lgg.sample.names, file = "LGG_sample_names_white_only.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)


################################################################################################################################################################################################
################################################################################################################################################################################################

### Section 5. Combine the newly coded genotype files together using PLINK and run the PCA

################################################################################################################################################################################################
################################################################################################################################################################################################

### First, paste the sample names to the ped files and change map name to match. 

paste -d" " LUAD_sample_names_white_only.txt LUAD_white_only_genotype_file_for_plink.ped > LUAD_white_only_genotype_file_for_plink_with_sample_names.ped

cp LUAD_white_only_genotype_file_for_plink.map LUAD_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name


paste -d" " LUSC_sample_names_white_only.txt LUSC_white_only_genotype_file_for_plink.ped > LUSC_white_only_genotype_file_for_plink_with_sample_names.ped

cp LUSC_white_only_genotype_file_for_plink.map LUSC_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name


paste -d" " HNSC_sample_names_white_only.txt HNSC_white_only_genotype_file_for_plink.ped > HNSC_white_only_genotype_file_for_plink_with_sample_names.ped

cp HNSC_white_only_genotype_file_for_plink.map HNSC_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name


paste -d" " BLCA_sample_names_white_only.txt BLCA_white_only_genotype_file_for_plink.ped > BLCA_white_only_genotype_file_for_plink_with_sample_names.ped

cp BLCA_white_only_genotype_file_for_plink.map BLCA_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name


paste -d" " GBM_sample_names_white_only.txt GBM_white_only_genotype_file_for_plink.ped > GBM_white_only_genotype_file_for_plink_with_sample_names.ped

cp GBM_white_only_genotype_file_for_plink.map GBM_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name


paste -d" " LGG_sample_names_white_only.txt LGG_white_only_genotype_file_for_plink.ped > LGG_white_only_genotype_file_for_plink_with_sample_names.ped

cp LGG_white_only_genotype_file_for_plink.map LGG_white_only_genotype_file_for_plink_with_sample_names.map ## make new map file with identical new ped file name

######## now, make bfiles from the map and ped files using PLINK

./plink --file LUAD_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out LUAD_white_only_genotype_file_for_plink_with_sample_names

./plink --file LUSC_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out LUSC_white_only_genotype_file_for_plink_with_sample_names

./plink --file HNSC_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out HNSC_white_only_genotype_file_for_plink_with_sample_names

./plink --file BLCA_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out BLCA_white_only_genotype_file_for_plink_with_sample_names

./plink --file GBM_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out GBM_white_only_genotype_file_for_plink_with_sample_names

./plink --file LGG_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --make-bed --out LGG_white_only_genotype_file_for_plink_with_sample_names

### now, use mergelist function in PLINK to merge the bfiles together

### first, need to combine all file names together into new file

vi tcga_genotype_files_white_only_for_merging_in_plink.txt

LUAD_white_only_genotype_file_for_plink_with_sample_names
LUSC_white_only_genotype_file_for_plink_with_sample_names
HNSC_white_only_genotype_file_for_plink_with_sample_names
BLCA_white_only_genotype_file_for_plink_with_sample_names
GBM_white_only_genotype_file_for_plink_with_sample_names
LGG_white_only_genotype_file_for_plink_with_sample_names

### run mergelist

./plink --merge-list tcga_genotype_files_white_only_for_merging_in_plink.txt --make-bed --out all_tcga_genotypes_merged_white_only

### above command produced a missnp file due to one trialleclic SNP. Use this missnp file with original bed files to remove triallelic SNP
### then need to generate new merged genotype file

### make new bed files using missnp

./plink --file LUAD_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out LUAD_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed


./plink --file LUSC_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out LUSC_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed


./plink --file HNSC_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out HNSC_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed


./plink --file BLCA_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out BLCA_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed


./plink --file GBM_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out GBM_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed


./plink --file LGG_white_only_genotype_file_for_plink_with_sample_names --no-fid --no-parents --no-sex --no-pheno --exclude all_tcga_genotypes_merged_white_only-merge.missnp --make-bed --out LGG_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed

############### now, make new final mergelist

vi tcga_genotype_files_white_only_for_merging_in_plink_tri_allelic_removed.txt

LUAD_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed
LUSC_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed
HNSC_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_remove
BLCA_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed
GBM_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed
LGG_white_only_genotype_file_for_plink_with_sample_names_tri_allelic_removed

### now run new final merge command 

./plink --merge-list tcga_genotype_files_white_only_for_merging_in_plink_tri_allelic_removed.txt --make-bed --out all_tcga_genotypes_merged_white_only_tri_allelic_removed


### first need to trim the SNPs by LD before PCA. Use r2 value of 0.5 

### first, need to make a set of SNPs for the LD trim. Need to get this from the merged file.

### use PLINK's write-snplist command to get the list of SNPs needed for LD pruning

./plink -bfile all_tcga_genotypes_merged_white_only_tri_allelic_removed --write-snplist --out all_tcga_genotypes_merged_white_only_tri_allelic_removed_for_ld_trim

## now, run the LD trim on these SNPs using PLINK formatted 1000 Genomes EUROPEAN file from https://vegas2.qimrberghofer.edu.au/g1000p3_EUR.tar.gz

./plink -bfile g1000p3_EUR --indep-pairwise 1000 kb 1 0.5 --extract all_tcga_genotypes_merged_white_only_tri_allelic_removed_for_ld_trim.snplist --out all_tcga_genotypes_merged_white_only_tri_allelic_removed_ld_snps_pruned

### run PCA using the prune.in SNPs

./plink -bfile all_tcga_genotypes_merged_white_only_tri_allelic_removed --extract all_tcga_genotypes_merged_white_only_tri_allelic_removed_ld_snps_pruned.prune.in --pca --out pca_all_tcga_genotypes_merged_no_tri_using_ld_pruned_snps

### group of GBM outliers in the plot. Remove these samples and run new PCA
### filename of samples is gbm_samples_to_exculde_pca.txt

./plink -bfile all_tcga_genotypes_merged_white_only_tri_allelic_removed --remove gbm_samples_to_exculde_pca.txt --extract all_tcga_genotypes_merged_white_only_tri_allelic_removed_ld_snps_pruned.prune.in --pca --out pca_all_tcga_genotypes_merged_no_tri_using_ld_pruned_snps_gbm_outliers_exculded

### Still outliers in the plot. Remove these samples and run new PCA

### filename of all outlier samples is all_samples_to_exclude_final_pca

### run the final PCA

./plink -bfile all_tcga_genotypes_merged_white_only_tri_allelic_removed --remove all_samples_to_exclude_final_pca --extract all_tcga_genotypes_merged_white_only_tri_allelic_removed_ld_snps_pruned.prune.in --pca --out pca_all_tcga_genotypes_merged_no_tri_using_ld_pruned_snps_all_outliers_exculded