################################################################################################################################################################################################ ################################################################################################################################################################################################ ## This is the code used to identify the unique genomic regions represented by the overlapping genes from the germline-regulated gene results for LUSC unique ## for the manuscript titled Weak sharing of genetic association signals in three lung cancer subtypes: evidence at the SNP, gene, regulation and pathway levels. ################################################################################################################################################################################################ ################################################################################################################################################################################################ ### Section 1. Load compiled list of all germline-regulated genes, get genomic location, and extract all genes per functional category ################################################################################################################################################################################################ ################################################################################################################################################################################################ ## start new R session for working with LUSC ## load libraries library("biomaRt") ## load the compiled list of germline-regulated genes per subtype luad.final.germline.genes <- read.csv("luad_final_germline_genes.csv", header = TRUE) lusc.final.germline.genes <- read.csv("lusc_final_germline_genes.csv", header = TRUE) sclc.final.germline.genes <- read.csv("sclc_final_germline_genes.csv", header = TRUE) # below pull out columns going to use, so gene, Regulatory category, and the dataset where gene was located. Remove one NA from LUAD luad.final.germline.genes.to.use.with.na <- luad.final.germline.genes[ ,c(1,3,4)] # pull out columns to use luad.final.germline.genes.to.use <- luad.final.germline.genes.to.use.with.na[-which(is.na(luad.final.germline.genes.to.use.with.na$Gene)), ] # remove one NA in the file lusc.final.germline.genes.to.use <- lusc.final.germline.genes[ ,c(1,3,4)] # pull out columns to use sclc.final.germline.genes.to.use <- sclc.final.germline.genes[ ,c(1,3,4)] # pull out columns to use ### Use R library biomaRt to extract genomic positions for all germline-regulated genes ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice" ,dataset="hsapiens_gene_ensembl") # mark parameters to use luad.gene.positions.biomart <- getBM(attributes = c('external_gene_name', 'chromosome_name', 'start_position', 'end_position', 'band'), filters = 'external_gene_name', values = luad.final.germline.genes.to.use$Gene, mart = ensembl) # extract genomic location for each gene luad.genes.with.position <- merge(luad.final.germline.genes.to.use, luad.gene.positions.biomart, by.x = "Gene", by.y = "external_gene_name", all.x = TRUE) # merge genomic positions to the gene list ### now run the same commands for LUSC and SCLC ## lusc lusc.gene.positions.biomart <- getBM(attributes = c('external_gene_name', 'chromosome_name', 'start_position', 'end_position', 'band'), filters = 'external_gene_name', values = lusc.final.germline.genes.to.use$Gene, mart = ensembl) # extract genomic location for each gene lusc.genes.with.position <- merge(lusc.final.germline.genes.to.use, lusc.gene.positions.biomart, by.x = "Gene", by.y = "external_gene_name", all.x = TRUE) ## sclc sclc.gene.positions.biomart <- getBM(attributes = c('external_gene_name', 'chromosome_name', 'start_position', 'end_position', 'band'), filters = 'external_gene_name', values = sclc.final.germline.genes.to.use$Gene, mart = ensembl) # extract genomic location for each gene sclc.genes.with.position <- merge(sclc.final.germline.genes.to.use, sclc.gene.positions.biomart, by.x = "Gene", by.y = "external_gene_name", all.x = TRUE) ## now extract the different regulatory categories per subtype #################### LUAD ############################# luad.gtex.single <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "GTEx single tissue V6 Lung eQTL"), ] luad.gtex.multi <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "GTEx multi-tissue eQTL in lung > 0.80"), ] luad.hao <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "Hao single tissue lung eQTL"), ] luad.fantom <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "FANTOM5 enhancer target gene"), ] luad.impet.imr90 <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "IM-PET IMR90"), ] luad.impet.nhlf <- luad.genes.with.position[which(luad.genes.with.position$Data.source == "IM-PET NHLF"), ] ### now combine gtex and im-pet genes luad.gtex.combined <- rbind(luad.gtex.single, luad.gtex.multi) luad.impet.combined <- rbind(luad.impet.imr90, luad.impet.nhlf) #################### LUSC ############################# lusc.gtex.single <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "GTEx single tissue V6 Lung eQTL"), ] lusc.gtex.multi <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "GTEx multi-tissue eQTL in lung > 0.80"), ] lusc.hao <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "Hao single tissue lung eQTL"), ] lusc.fantom <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "FANTOM5 enhancer target gene"), ] lusc.impet.imr90 <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "IM-PET IMR90"), ] lusc.impet.nhlf <- lusc.genes.with.position[which(lusc.genes.with.position$Data.source == "IM-PET NHLF"), ] ### now combine gtex and im-pet genes lusc.gtex.combined <- rbind(lusc.gtex.single, lusc.gtex.multi) lusc.impet.combined <- rbind(lusc.impet.imr90, lusc.impet.nhlf) #################### SCLC ############################# sclc.gtex.single <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "GTEx single tissue V6 Lung eQTL"), ] sclc.gtex.multi <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "GTEx multi-tissue eQTL in lung > 0.80"), ] sclc.hao <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "Hao single tissue lung eQTL"), ] sclc.fantom <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "FANTOM5 enhancer target gene"), ] sclc.impet.imr90 <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "IM-PET IMR90"), ] sclc.impet.nhlf <- sclc.genes.with.position[which(sclc.genes.with.position$Data.source == "IM-PET NHLF"), ] ### now combine gtex and im-pet genes sclc.gtex.combined <- rbind(sclc.gtex.single, sclc.gtex.multi) sclc.impet.combined <- rbind(sclc.impet.imr90, sclc.impet.nhlf) ################################################################################################################################################################################################ ################################################################################################################################################################################################ ### Section 2. In same R session, extract genes unique to LUSC in every category and make bed files for bedtools ################################################################################################################################################################################################ ################################################################################################################################################################################################ ### now identify genes just for LUSC unique lusc.unique.gtex.single <- setdiff(setdiff(lusc.gtex.single$Gene, luad.gtex.single$Gene), sclc.gtex.single$Gene) lusc.unique.gtex.multi <- setdiff(setdiff(lusc.gtex.multi$Gene, luad.gtex.multi$Gene), sclc.gtex.multi$Gene) lusc.unique.hao <- setdiff(setdiff(lusc.hao$Gene, luad.hao$Gene), sclc.hao$Gene) lusc.unique.fantom <- setdiff(setdiff(lusc.fantom$Gene, luad.fantom$Gene), sclc.fantom$Gene) lusc.unique.impet.imr90 <- setdiff(setdiff(lusc.impet.imr90$Gene, luad.impet.imr90$Gene), sclc.impet.imr90$Gene) lusc.unique.impet.nhlf <- setdiff(setdiff(lusc.impet.nhlf$Gene, luad.impet.nhlf$Gene), sclc.impet.nhlf$Gene) lusc.unique.all <- setdiff(setdiff(lusc.genes.with.position$Gene, luad.genes.with.position$Gene), sclc.genes.with.position$Gene) ### now combine gtex and im-pet genes lusc.unique.gtex.combined <- setdiff(setdiff(lusc.gtex.combined$Gene, luad.gtex.combined$Gene), sclc.gtex.combined$Gene) lusc.unique.impet.combined <- setdiff(setdiff(lusc.impet.combined$Gene, luad.impet.combined$Gene), sclc.impet.combined$Gene) ### now merge these unique genes with the location information from biomaRt lusc.unique.gtex.single.FINAL <- merge(lusc.gtex.single, as.data.frame(lusc.unique.gtex.single), by.x = "Gene", by.y = "lusc.unique.gtex.single", all.x = FALSE) lusc.unique.gtex.multi.FINAL <- merge(lusc.gtex.multi, as.data.frame(lusc.unique.gtex.multi), by.x = "Gene", by.y = "lusc.unique.gtex.multi", all.x = FALSE) lusc.unique.gtex.combined.FINAL <- merge(lusc.gtex.combined, as.data.frame(lusc.unique.gtex.combined), by.x = "Gene", by.y = "lusc.unique.gtex.combined", all.x = FALSE) lusc.unique.hao.FINAL <- merge(lusc.hao, as.data.frame(lusc.unique.hao), by.x = "Gene", by.y = "lusc.unique.hao", all.x = FALSE) lusc.unique.fantom.FINAL <- merge(lusc.fantom, as.data.frame(lusc.unique.fantom), by.x = "Gene", by.y = "lusc.unique.fantom", all.x = FALSE) lusc.unique.impet.imr90.FINAL <- merge(lusc.impet.imr90, as.data.frame(lusc.unique.impet.imr90), by.x = "Gene", by.y = "lusc.unique.impet.imr90", all.x = FALSE) lusc.unique.impet.nhlf.FINAL <- merge(lusc.impet.nhlf, as.data.frame(lusc.unique.impet.nhlf), by.x = "Gene", by.y = "lusc.unique.impet.nhlf", all.x = FALSE) lusc.unique.impet.combined.FINAL <- merge(lusc.impet.combined, as.data.frame(lusc.unique.impet.combined), by.x = "Gene", by.y = "lusc.unique.impet.combined", all.x = FALSE) lusc.unique.all.FINAL <- merge(lusc.genes.with.position, as.data.frame(lusc.unique.all), by.x = "Gene", by.y = "lusc.unique.all", all.x = FALSE) ### now generate the bed files for each category and write to new files for bedtools #################### GTEx single tissue eQTLs ############################# bed.gtex.single <- lusc.unique.gtex.single.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.gtex.single) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.gtex.single <- paste("chr", bed.gtex.single$chrom, sep = "") bed.gtex.single$chrom.2 <- chr.for.paste.gtex.single ## make final bed file bed.gtex.single.final <- bed.gtex.single[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.gtex.single.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.gtex.single.final, file = "lusc_unique_gtex_single_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### GTEx multi tissue eQTLs ############################# bed.gtex.multi <- lusc.unique.gtex.multi.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.gtex.multi) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.gtex.multi <- paste("chr", bed.gtex.multi$chrom, sep = "") bed.gtex.multi$chrom.2 <- chr.for.paste.gtex.multi ## make final bed file bed.gtex.multi.final <- bed.gtex.multi[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.gtex.multi.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.gtex.multi.final, file = "lusc_unique_gtex_multi_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### GTEx combined eQTLs ############################# bed.gtex.combined <- lusc.unique.gtex.combined.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.gtex.combined) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.gtex.combined <- paste("chr", bed.gtex.combined$chrom, sep = "") bed.gtex.combined$chrom.2 <- chr.for.paste.gtex.combined ## make final bed file bed.gtex.combined.final <- bed.gtex.combined[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.gtex.combined.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.gtex.combined.final, file = "lusc_unique_gtex_combined_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### Hao single tissue eQTLs ############################# bed.hao <- lusc.unique.hao.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.hao) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.hao <- paste("chr", bed.hao$chrom, sep = "") bed.hao$chrom.2 <- chr.for.paste.hao ## make final bed file bed.hao.final <- bed.hao[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.hao.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.hao.final, file = "lusc_unique_hao_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### FANTOM5 enhancer targets ############################# bed.fantom <- lusc.unique.fantom.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.fantom) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.fantom <- paste("chr", bed.fantom$chrom, sep = "") bed.fantom$chrom.2 <- chr.for.paste.fantom ## make final bed file bed.fantom.final <- bed.fantom[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.fantom.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.fantom.final, file = "lusc_unique_fantom_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### IM-PET IMR90 enhancer targets ############################# bed.impet.imr90 <- lusc.unique.impet.imr90.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.impet.imr90) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.impet.imr90 <- paste("chr", bed.impet.imr90$chrom, sep = "") bed.impet.imr90$chrom.2 <- chr.for.paste.impet.imr90 ## make final bed file bed.impet.imr90.final <- bed.impet.imr90[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.impet.imr90.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.impet.imr90.final, file = "lusc_unique_impet_imr90_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### IM-PET NHLF enhancer targets ############################# bed.impet.nhlf <- lusc.unique.impet.nhlf.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.impet.nhlf) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.impet.nhlf <- paste("chr", bed.impet.nhlf$chrom, sep = "") bed.impet.nhlf$chrom.2 <- chr.for.paste.impet.nhlf ## make final bed file bed.impet.nhlf.final <- bed.impet.nhlf[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.impet.nhlf.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.impet.nhlf.final, file = "lusc_unique_impet_nhlf_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### IM-PET combined enhancer targets ############################# bed.impet.combined <- lusc.unique.impet.combined.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.impet.combined) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.impet.combined <- paste("chr", bed.impet.combined$chrom, sep = "") bed.impet.combined$chrom.2 <- chr.for.paste.impet.combined ## make final bed file bed.impet.combined.final <- bed.impet.combined[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.impet.combined.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.impet.combined.final, file = "lusc_unique_impet_combined_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) #################### all genes ############################# bed.all.overlap <- lusc.unique.all.FINAL[ ,c(4,5,6,1,7)] ## extract columns for bedtools colnames(bed.all.overlap) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## change column names ## add 'chr' to each chromosome number chr.for.paste.all.overlap <- paste("chr", bed.all.overlap$chrom, sep = "") bed.all.overlap$chrom.2 <- chr.for.paste.all.overlap ## make final bed file bed.all.overlap.final <- bed.all.overlap[, c(6, 2, 3, 4, 5)] ## change format of columns colnames(bed.all.overlap.final) <- c("chrom", "chromStart", "chromEnd", "Gene", "Band") ## rename new columns ## write new bed file write.table(bed.all.overlap.final, file = "lusc_unique_all_overlap_for_bedtools.bed", sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) ################################################################################################################################################################################################ ################################################################################################################################################################################################ ### Section 3. Upload bed files to linux command line and convert to unix, sort, and then run bedtools to identify clusters of genes ################################################################################################################################################################################################ ################################################################################################################################################################################################ #################################################################################################################################################################################### #################### Before running bedtools need to manually add genomic location for any genes that were not found with biomaRt. ######################################## #################### Use the NCBI gene website https://www.ncbi.nlm.nih.gov/gene for this information ######################################## #################### Add to existing file and rename as _positions_added. ######################################## #################################################################################################################################################################################### #### convert all bed files from dos to unix dos2unix lusc_unique_gtex_single_for_bedtools.bed dos2unix lusc_unique_gtex_multi_for_bedtools.bed dos2unix lusc_unique_gtex_combined_for_bedtools.bed dos2unix lusc_unique_hao_for_bedtools_positions_added.bed dos2unix lusc_unique_fantom_for_bedtools_positions_added.bed dos2unix lusc_unique_impet_imr90_for_bedtools.bed dos2unix lusc_unique_impet_nhlf_for_bedtools.bed dos2unix lusc_unique_impet_combined_for_bedtools.bed dos2unix lusc_unique_all_overlap_for_bedtools_positions_added.bed #### sort all of the bed files sort -k1,1 -k2,2n lusc_unique_gtex_single_for_bedtools.bed > lusc_unique_gtex_single_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_gtex_multi_for_bedtools.bed > lusc_unique_gtex_multi_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_gtex_combined_for_bedtools.bed > lusc_unique_gtex_combined_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_hao_for_bedtools_positions_added.bed > lusc_unique_hao_for_bedtools_positions_added.sorted.bed sort -k1,1 -k2,2n lusc_unique_fantom_for_bedtools_positions_added.bed > lusc_unique_fantom_for_bedtools_positions_added.sorted.bed sort -k1,1 -k2,2n lusc_unique_impet_imr90_for_bedtools.bed > lusc_unique_impet_imr90_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_impet_nhlf_for_bedtools.bed > lusc_unique_impet_nhlf_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_impet_combined_for_bedtools.bed > lusc_unique_impet_combined_for_bedtools.sorted.bed sort -k1,1 -k2,2n lusc_unique_all_overlap_for_bedtools_positions_added.bed > lusc_unique_all_overlap_for_bedtools_positions_added.sorted.bed #### run the cluster function in bedtools bedtools cluster -i lusc_unique_gtex_single_for_bedtools.sorted.bed -d 1000000 > lusc_unique_gtex_single_regions_by_loci.bed bedtools cluster -i lusc_unique_gtex_multi_for_bedtools.sorted.bed -d 1000000 > lusc_unique_gtex_multi_regions_by_loci.bed bedtools cluster -i lusc_unique_gtex_combined_for_bedtools.sorted.bed -d 1000000 > lusc_unique_gtex_combined_regions_by_loci.bed bedtools cluster -i lusc_unique_hao_for_bedtools_positions_added.sorted.bed -d 1000000 > lusc_unique_hao_regions_by_loci.bed bedtools cluster -i lusc_unique_fantom_for_bedtools_positions_added.sorted.bed -d 1000000 > lusc_unique_fantom_regions_by_loci.bed bedtools cluster -i lusc_unique_impet_imr90_for_bedtools.sorted.bed -d 1000000 > lusc_unique_impet_imr90_regions_by_loci.bed bedtools cluster -i lusc_unique_impet_nhlf_for_bedtools.sorted.bed -d 1000000 > lusc_unique_impet_nhlf_regions_by_loci.bed bedtools cluster -i lusc_unique_impet_combined_for_bedtools.sorted.bed -d 1000000 > lusc_unique_impet_combined_regions_by_loci.bed bedtools cluster -i lusc_unique_all_overlap_for_bedtools_positions_added.sorted.bed -d 1000000 > lusc_unique_all_overlap_regions_by_loci.bed