### changes in W5: not only Y needs to be scaled, X needs also to be scaled across samples, such that t-values across cancer types are comparable
### About X: no difference whether scale or not -- the rank remains unchanged
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

if (length(args)==0) {
	stop("Must provide start and end\n", call.=FALSE)
} else if (length(args)==1) {
	cur.cancer = args[1]
} 
print( cur.cancer )

setwd("/data1_2/jiap/projects/18-CCLE-VAE/new/V15.2/NOPEER.RANK.Sigmoid/result.EN/dr.CCLE/04-mix/09.Expr/")

###########################################################################################################

drug.ccle = read.table(file="/data1_2/jiap/projects/18-CCLE-VAE/new/V15.2/NOPEER.RANK.Sigmoid/result.EN/dr.CCLE/01/MIX-F1-W5-PCC/MIX-F1-W5-PCC.best.pred_TCGA.txt", header=T, as.is=T, sep="\t")
colnames(drug.ccle)[3:ncol(drug.ccle)] -> drugs
cancer.types = unique(drug.ccle[,2])
sample.type = substr(drug.ccle[,1], 14, 15)
ss = gsub("\\.", "-", drug.ccle[,1])
drug.ccle[,1] = ss

cancer.drug.ccle = c()
for(ct in 1:length(cancer.types)){
	cancer = cancer.types[ct]
	
	type.code = "01"
	if(cancer == "LAML"){ type.code = "03" }
	if(cancer == "SKCM"){ type.code = "06" }
	
	blca.ccle = drug.ccle[which(drug.ccle[,2] == cancer & sample.type == type.code), ]
	cancer.drug.ccle = rbind(cancer.drug.ccle, blca.ccle)
}
drug.ccle = cancer.drug.ccle

###########################################################################################################

	gene.drug.expr.mat = c()
	
	original.TCGA.RPKM = read.delim(paste("/data/mshao/TCGA/",cur.cancer,"/HiSeqV2", sep=""), as.is=T)
	
	apply(original.TCGA.RPKM[,-1],1,sum) -> rowCheck
	non0.TCGA.RPKM = original.TCGA.RPKM[which(rowCheck!=0),]
	
	apply(original.TCGA.RPKM[,-1],1,var) -> rowCheck
	non0.TCGA.RPKM = original.TCGA.RPKM[which(rowCheck!=0),]
	
	TCGA.ss = gsub("\\.", "-", colnames(non0.TCGA.RPKM))
	colnames(non0.TCGA.RPKM) = TCGA.ss; rm(TCGA.ss)
	
	match(unique(non0.TCGA.RPKM[,1]), non0.TCGA.RPKM[,1]) -> ii ### keep unique genes
	non0.TCGA.RPKM = non0.TCGA.RPKM[ii, ]
	
	blca.ccle = drug.ccle[which(drug.ccle[,2] == cur.cancer), ]
	
	fixed.ss = intersect(colnames(non0.TCGA.RPKM), blca.ccle[,1])
	
	blca.ccle = blca.ccle[match(fixed.ss, blca.ccle[,1]),]
	apply(blca.ccle[, 3:ncol(blca.ccle)], 2, scale) -> new.ccle  ### unique in W5
	TCGA.RPKM = non0.TCGA.RPKM[, match(fixed.ss, colnames(non0.TCGA.RPKM))]
	rownames(TCGA.RPKM) = non0.TCGA.RPKM[,1]
	apply(TCGA.RPKM, 1, sd) -> rowCheck
	TCGA.RPKM = TCGA.RPKM[rowCheck!=0, ]
	
	apply(TCGA.RPKM,1,mean) -> rowMean
	TCGA.RPKM = TCGA.RPKM[rowMean > 1, ]
	
	for(kgene in 1:nrow(TCGA.RPKM)){
		t(TCGA.RPKM[kgene, ]) -> gene.expr
		
		X2 = rep(1, length(gene.expr))
		X2[which(gene.expr < quantile(gene.expr, probs=.25))] = 0
		X2[which(gene.expr > quantile(gene.expr, probs=.75))] = 2
		if(length(unique(X2)) < 3)next
		
		apply(new.ccle, 2, function(Y){
				summary(glm(Y ~ X2)) -> sfit
				coef(sfit)[2, 1:4]
		}) -> ps.twosided
		
		betas  = ps.twosided[1,]
		stds   = ps.twosided[2,]
		tvalue = ps.twosided[3,]
		ps     = ps.twosided[4,]
		
		gene.drug.expr.mat = rbind(gene.drug.expr.mat, cbind(cur.cancer, rownames(TCGA.RPKM)[kgene], drugs, ps, tvalue, betas, stds, mean(gene.expr[which(X2==0)]), mean(gene.expr[which(X2==1)]), mean(gene.expr[which(X2==2)]) ))
		
		if(kgene %% 100 == 0)cat(kgene, ".", sep="")
	}
	cat(cur.cancer, ".", sep="")
	write.table(gene.drug.expr.mat, file=paste("W5.",cur.cancer,".Expr.txt", sep=""), quote=F, row.names=F, sep="\t")