# NormPCA.R
# 
# Autogenerated content
# TODO: Add comment
#
# Author: Xiao-Qin Xia
###############################################################################

DEBUG <- FALSE 
#DEBUG <- TRUE

HAS_RESULT = FALSE
N_digits <- 5

Angles <- function(x,y) {
	#lx = length(x); ly = length(y)
	#if (lx != ly) break
	ang <- c()
	for (i in 1:length(x)) {
		xi <- x[[i]]; yi <- y[[i]]
		if (yi == 0) {ang[i] <- ifelse(xi > 0, pi/2, ifelse(xi==0, 0, pi))
		}else ang[i] <- atan(xi/yi) + ifelse(yi<0, pi, ifelse(xi<0, pi+pi, 0))
		}
	ang
	}

CutFrequency <- function(x, freq) {
	if (freq==0) return(1)
	freq_sum <- 0
	for (i in 1:length(x)) {
		freq_sum <- freq_sum + x[[i]]
		if (freq_sum >= freq) break
		}
	if (DEBUG) print(sprintf('freq_sum = %f, freq = %f, sum(x) = %i', freq_sum, freq, sum(x)))
	i+1
	}

JointSegs <- function(x, gate) {
	segs <- c()
	for (i in 1:length(x)) {
		xi <- x[[i]]
		n <- length(segs)
		if (xi >= gate) {segs[n+1] <- i
		}else if (n == 1) {segs <- c()
		}else if (n>1) return(segs)
		}
	segs
	}

GenesInSegs <- function(x, segs) {
	genes <- c()
	lsg <- 1:length(segs)
	for (i in 1:length(x)) {
		xi <- x[[i]]
		#if xi contained by segs: genes[length(genes)+1] <- xi
		for (j in lsg) {
			if (xi == segs[[j]]) {genes[length(genes)+1] <- i; break}
			}
		}
	genes
	}	

MyPCA <- function(D) {
	mD <- as.matrix(D)
	for (i in 1:ncol(mD)) mD[,i] <- mD[,i] - mean(mD[,i])
	C <- t(mD) %*% mD
	C <- C / (nrow(D)-1)
	eigen(C)
	}


GetPC1_Radka <- function(D, sig_level=0.05, ave_num=10, ...) {
	MyData <- list()
	gene_mean_num <- ave_num

	#D <- read.table('C:/temp/mark_pivotdata.txt', header=TRUE)
	#D <- read.table('C:/temp/D1.txt', header=TRUE)

	aD <- as.matrix(D)
	# replace NAs with median
	lapply(1:ncol(aD), function(i) aD[is.na(aD[,i]), i] <<- median(aD[,i], na.rm=TRUE) )
	
	Q <- prcomp(aD)
	Q <- data.frame(Q$rotation)
	if (sum(Q$PC1)<0) Q$PC1 <- -Q$PC1
	if (sum(Q$PC2)<0) Q$PC2 <- -Q$PC2
	R1 <- aD %*% Q$PC1
	R2 <- aD %*% Q$PC2

	#Q <- princomp(D)
	#Q <- Q$loadings
	#R1 <- aD %*% Q[,'Comp.1']
	#R2 <- aD %*% Q[,'Comp.2']

	if (DEBUG) print('Complete PCA with the original data')

	#plot(R1,R2)

	angles <- Angles(R1, R2) # fi

	if (DEBUG) print('Got angle vector')

	#gene_mean_num <- 10
	angle_max <- max(angles); angle_min <- min(angles)
	angle_range <- angle_max - angle_min # theta
	angle_num <- as.integer(length(R1)/gene_mean_num + 1) # p
	angle_step <- angle_range/angle_num # delta
	angle_factor <- as.integer((angles-angle_min)/angle_step)+1
	angle_factor[angle_factor>angle_num] <- angle_num # the max angle should be included in the angle_num_th group
	angle_factor <- as.factor(angle_factor)
	angle_freq <- tapply(angles, angle_factor, length)
	angle_num_valid <- length(angle_freq) # should be equal to angle_num? No. might be less than angle_num if there are vectors without any data points.

	if (DEBUG) {print(sprintf('length of angle_freq = %d',length(angle_freq)));print(summary(angle_freq))}
	#freq_min <- min(angle_freq); freq_max <- max(angle_freq)
	angle_freq_dist_num <- tapply(angle_freq, as.factor(angle_freq), length)
	#angle_freq_dist_rate <- angle_freq_dist_num/angle_num
	if (DEBUG) print(sprintf('angle_num = %d',angle_num))
	#i <- CutFrequency(angle_freq_dist_num, angle_num*0.95)
	i <- CutFrequency(angle_freq_dist_num, angle_num_valid*(1-sig_level))

	#print(sprintf('i=%f, length_angle_freq_dist_num=%d, angle_num_valid=%d, angle_num_valid_(1-sig_level)=%f', i, length(angle_freq_dist_num), angle_num_valid, (angle_num_valid*(1-sig_level))))

	#attr(MyData, 'R1') <- R1
	MyData$R1 <- R1; MyData$R2 <- R2
	MyData$angle_max <- angle_max; MyData$angle_min <- angle_min
	MyData$angle_num <- angle_num
	MyData$angle_freq <- angle_freq; MyData$angle_freq_dist_num <- angle_freq_dist_num
	MyData$first_PCs <- Q
	PC <- Q$PC1
	MyData$PC <- PC #Q[,'Comp.1']
	MyData$NormPC <- PC/(sum(PC)/length(PC))

	if (DEBUG) print(sprintf('Proceeded to Step 1: i = %f, length(angle_freq_dist_num) = %d', i, length(angle_freq_dist_num)))
	#if (i<=length(angle_freq_dist_num)) cut_freq_num <- angle_freq_dist_num[[i]]
	if (i<=length(angle_freq_dist_num)) {
		cut_freq_num <- as.integer(attr(angle_freq_dist_num[i],'names'))
		#print('yes')
	}else { 
		#print(length(1:nrow(D)))
		MyData$house_keeping_indice <- 1:nrow(D)
		#print(length(MyData$house_keeping_indice))
		return(MyData) # either none or all genes will serve as 'house_keeping' gene.
		}

	if (DEBUG) print(sprintf('cut_freq_num is %d', cut_freq_num))
	joint_segs <- JointSegs(angle_freq, cut_freq_num)
	if (length(joint_segs) == 0) {
		MyData$house_keeping_indice <- 1:nrow(D)
		return(MyData)
		}
	if (DEBUG) {print(3); print(joint_segs)}

	# Find related genes from the segments with subscripts specified by joint_segs
	house_keeping_indice <- GenesInSegs(angle_factor, joint_segs)
	if (DEBUG) {print(typeof(house_keeping_indice)); print(class(house_keeping_indice)); print(house_keeping_indice)}
	house_keeping_genes <- D[house_keeping_indice,]

	if (DEBUG) print(summary(house_keeping_genes))
	
	Q <- prcomp(house_keeping_genes)
	Q <- data.frame(Q$rotation)
	if (sum(Q$PC1)<0) Q$PC1 <- -Q$PC1
	if (DEBUG) print(Q$PC1)
	#Q$PC1
	
	#Q <- princomp(house_keeping_genes)
	#Q <- Q$loadings
	#if (DEBUG) print(Q[,'Comp.1'])
	
	MyData$joint_segs <- joint_segs; MyData$house_keeping_indice <- house_keeping_indice
	MyData$second_PCs <- Q; 
	PC <- Q$PC1
	MyData$PC <- PC #Q[,'Comp.1']
	MyData$NormPC <- PC/(sum(PC)/length(PC))
	
	MyData

	}

Get_Index <- function(D) {
	IDX <- c()
	for (i in 1:length(D)) if (D[i]) IDX[length(IDX)+1] <- i
	IDX
	}

GetPC1_My_Method <- function(D, sig_level=0.01, ...) {
	MyData <- list()
	# replace NAs with median
	lapply(1:ncol(D), function(i) D[is.na(D[,i]), i] <<- median(D[,i], na.rm=TRUE) )
	Q <- prcomp(D, retx=TRUE)#, center=FALSE, scale.=FALSE)
	SUMX <- apply(abs(Q$x), 1, sum)
	house_keeping_sign <- abs(Q$x[,1]) / SUMX >= 1-sig_level
	house_keeping_indice <- Get_Index(house_keeping_sign)
	house_keeping_genes <- D[house_keeping_sign,]
	
	PC <- Q$rotation[,'PC1']
	if (sum(PC)<0) PC <- -PC
	
	MyData$Q <- Q
	MyData$house_keeping_indice <- house_keeping_indice	
	MyData$PC <- PC
	
	MyData
	}

GetPC1 <- GetPC1_Radka #GetPC1_My_Method

NormedPC1 <- function(...) {
	pcs <- GetPC1(...)
	pcs$NormPC
	}

Normalize <- function(D, PC) { #No use
	ND <- matrix()
	for (i in length(D)) ND[i] <- D[i]/PC[[i]]
	ND
	}

NormalizeData <- function(D, PC) {
	#ND <- D/DataPCA$PC
	
	#ND <- matrix()
	#for (i in length(D)) ND[i] <- D[i]/PC[[i]]
	#ND <- D/PC
	
	#ND <- list()
	#for (i in 1:length(PC)) ND[[i]] <- D[,i]/PC[[i]]
	#ND <- as.matrix(ND)
	
	#ND <- t(t(D)/PC)

	for (i in 1:length(PC)) D[, i] <- D[, i]/PC[i]

	#apply(D, 

	D
	
	}
		
NormPCA <- function(D, sig_level=0.025, ave_num=10, method='default', debug=DEBUG){
	DEBUG <<- debug
	HAS_RESULT <<- FALSE
	
	if (method == 'Radka') {GetPC1 <- GetPC1_Radka 
	}else if (method == 'My_Method') GetPC1 <- GetPC1_My_Method 
	
	DataList <- list()
	DataPCA <- GetPC1(D, sig_level=sig_level, ave_num=ave_num)
	DataList$DataPCA <- DataPCA
	#PC <- DataPCA$PC
	#DataList$NormPC <- PC/(sum(PC)/length(PC))
	##DataList$ND <- NormalizeData(D, DataPCA$PC)
	DataList$ND <- NormalizeData(D, DataList$DataPCA$NormPC)
	HAS_RESULT <<- TRUE
	DataList
	}
	
normGUI <- function(sig_level=0.025, ave_num=10, debug=DEBUG, ...) {
	
	DataList <- list()
	
	D <- matrix(); ND <- D
	file_src <- ''; file_obj <- ''; file_idx <- ''
	
	DEBUG <<- debug
	
	normalized = FALSE
	
	GetFileSrc <- function() {
		file_src <<- tclvalue(tkgetOpenFile(initialfile=file_src))
		tkconfigure(file_src_label, text=file_src)
		file_obj <<- file_src
		tkconfigure(file_obj_label, text=file_obj)
		PrintMsg(sprintf("%s%s",'\nUse data from file: ', file_src))
		file_src
		}
	SaveFileObj <- function() {
		if (HAS_RESULT == FALSE) return()
		file_obj <<- tclvalue(tkgetSaveFile(initialfile=file_obj))
		tkconfigure(file_obj_label, text=file_obj)
		if (DEBUG) {print(file_obj); print('Before Saving');print(typeof(ND));print(class(ND))}
		write.table(ND, file=file_obj)#, seq='\t')
		PrintMsg(sprintf("%s%s", '\nSaved data to file: ', file_obj))
		file_obj
		}
	SaveHouseIndex <- function() {
		if (HAS_RESULT == FALSE) return()
		file_idx <<- tclvalue(tkgetSaveFile(initialfile=file_idx))
		write.table(DataList$DataPCA$house_keeping_indice, file=file_idx)
		PrintMsg(sprintf('\nSaved indice of house-keeping genes to file %s', file_idx))
		file_idx
		}
	PrintMsg <- function(m) {
		#if (typeof(m) == 'character'
		tkinsert(info_txt, 'end', paste(m,'\n'))
		}
	MainNormalizeData <- function() {
		HAS_RESULT <<- FALSE
	
		if (DEBUG) {print('Before reading data');print(file_src);print(class(file_src));print(typeof(file_src))}
		
		D <<- read.table(file_src, header=ifelse(tclvar$src_header, TRUE, FALSE))
		if (DEBUG) print('After reading data')
		#PrintMsg(summary(D))
		if (DEBUG) print('After summarized data')
		
		sig_level <<- as.double(tclvalue(tkget(sig_entry)))
		ave_num <<- as.integer(tclvalue(tkget(ave_entry)))
		#DataPCA <- GetPC1(D, sig_level=sig_level, ave_num=ave_num)
		#DataList$DataPCA <<- DataPCA
		#PC <- DataPCA$PC
		DataList <- NormPCA(D, sig_level=sig_level, ave_num=ave_num, debug=DEBUG)
		if (DEBUG) print('After PCA data')
		PrintMsg('The First PC (principal component):')
		PrintMsg(DataList$DataPCA$PC)
		PrintMsg('\nAnalysis completed.')

		if (DEBUG) print('After normlizing data')
		#PrintMsg(summary(ND))
		#DataList$ND <<- ND
		ND <<- DataList$ND
		HAS_RESULT <<- TRUE
		#ND
		DataList <<- DataList
		}
		
	PlotFigures <- function() {
		if (HAS_RESULT == FALSE) return()
		windows(); plot(DataList$DataPCA$R1, DataList$DataPCA$R2)
		windows(); plot(DataList$DataPCA$angle_freq)
		windows(); plot(DataList$DataPCA$angle_freq_dist_num)
		house_genes <- D[DataList$DataPCA$house_keeping_indice, ]
		windows(); pairs(house_genes)
		}
	
	library(tcltk)
	root <- tktoplevel()
	tktitle(root) <- 'Normalization by PCA'
	btn_frame <- tkframe(root)
	tkpack(btn_frame, side='top', fill='both', expand=1)
	
	i <- 0
	file_src_btn <- tkbutton(btn_frame, text='Open source data file:', command=GetFileSrc)
	tkgrid(file_src_btn, row=i, column=0,sticky='e')
	file_src_label <- tklabel(btn_frame, text='')
	tkgrid(file_src_label, row=i, column=1, columnspan=2, sticky='w')
	
	header_src_btn <- tkcheckbutton(btn_frame, text='Data with titles', variable='src_header')
	tkgrid(header_src_btn, row=i, column=3, sticky='w')
	tclvar$src_hearder <- 1
	
	i <- i+1
	sig_label <- tklabel(btn_frame, text='Significace level')
	tkgrid(sig_label, row=i, column=0, sticky='e')
	sig_entry <- tkentry(btn_frame)
	tkgrid(sig_entry, row=i, column=1, sticky='w')
	tkinsert(sig_entry, 0, sig_level)

	ave_label <- tklabel(btn_frame, text='Average dot No. in each sector')
	tkgrid(ave_label, row=i, column=2, sticky='e')
	ave_entry <- tkentry(btn_frame)
	tkgrid(ave_entry, row=i, column=3, sticky='w')
	tkinsert(ave_entry, 0, ave_num)
	
	i <- i+1
	norm_btn <- tkbutton(btn_frame, text='Normalize data!', command=MainNormalizeData)
	tkgrid(norm_btn, row=i, column=0, columnspan=1, sticky='e')
	plot_btn <- tkbutton(btn_frame, text='Plot related data', command=PlotFigures)
	tkgrid(plot_btn, row=i, column=3, columnspan=1, sticky='w')
	
	i <- i+1
	file_obj_btn <- tkbutton(btn_frame, text='Save transformed data:', command=SaveFileObj)
	tkgrid(file_obj_btn, row=i, column=0,sticky='e')
	file_obj_label <- tklabel(btn_frame, text='')
	tkgrid(file_obj_label, row=i, column=1, columnspan=2, sticky='w')
	save_index_btn <- tkbutton(btn_frame, text='Save indice of housekeeping genes', command=SaveHouseIndex)
	tkgrid(save_index_btn, row=i, column=3, sticky='w')
	
	txt_frame <- tkframe(root)
	info_txt <- tktext(txt_frame)
	scr_y <- tkscrollbar(txt_frame, orient='vertical', command=function(...) tkyview(info_txt,...))
	scr_x <- tkscrollbar(txt_frame, orient='horizontal', command=function(...) tkxview(info_txt,...))
	tkconfigure(info_txt, yscrollcommand=function(...) tkset(scr_y,...), xscrollcommand=function(...) tkset(scr_x,...))
	tkpack(scr_y, side='right', fill='y')
	tkpack(scr_x, side='bottom', fill='x')
	tkpack(info_txt, fill='both', expand=1)
	#tkgrid(txt_frame, row=2, column=0, columnspan=3)
	tkpack(txt_frame, side='top', fill='both', expand=1)
	
	#D <- read.table(filename, header=ifelse(tclvar$src_header, TRUE, FALSE))
	#pc <- GetPC1(D)
	#ND <- Normalize(D, pc)
	}

fullNormalize <- function(intensity_file, output_dir, chart_dir, plotPCA=TRUE, plotAF=TRUE, plotAFD=TRUE, plotHK=TRUE, ...) {
	#if (intensity_file == '') intensity_file='NormPCA.R'
	file_sep <- .Platform$file.sep
	D <- read.table(intensity_file, header=TRUE)
	DataList <- NormPCA(D, ...)
	write.table(DataList$ND, paste(output_dir, 'normalized_by_pca.txt', sep='') )
	if (plotPCA) {
		#bitmap(paste(chart_dir,'/','R1_R2.pdf', sep=''), type='pdfwrite', width=12, heigh=12)
		pdf(paste(chart_dir, 'R1_R2.pdf', sep=file_sep), width=12, heigh=12)
		plot(DataList$DataPCA$R1, DataList$DataPCA$R2)
		dev.off() 
		}
	if (plotAF) {
		#bitmap(paste(chart_dir,file_sep,'Angle_freq.pdf', sep=''), type='pdfwrite', width=12, heigh=12)
		pdf(paste(chart_dir,'Angle_freq.pdf', sep=file_sep), width=12, heigh=12)
		plot(DataList$DataPCA$angle_freq)
		dev.off() 
		}
	if (plotAFD) {
		#bitmap(paste(chart_dir, 'Angle_freq_dist_num.pdf', sep='file_sep'), type='pdfwrite', width=12, heigh=12)
		pdf(paste(chart_dir, 'Angle_freq_dist_num.pdf', sep='file_sep'), width=12, heigh=12)
		plot(DataList$DataPCA$angle_freq_dist_num)
		dev.off()
		}
	if (plotHK) {
		#bitmap(paste(chart_dir, 'House_keeping.pdf', sep='file_sep'), type='pdfwrite', width=12, heigh=12)
		pdf(paste(chart_dir, 'House_keeping.pdf', sep='file_sep'), width=12, heigh=12)
		house_genes <- D[DataList$DataPCA$house_keeping_indice, ]
		pairs(house_genes)
		dev.off()
		}
	}	

normTest <- function(...) {
	D <- read.table('D1.txt', header=TRUE)
	NormPCA(D, ...)
	}

#Main()

NORM_HELP <- c('Usage of NormPCA.R:', 
'At present two functions are offered in NormPCA.R.', 
'After loading NormPCA.R by R "source" command, run Main() will open a simple GUI;',
'the alternative is call function NormPCA(D, sig_level=xxx, ave_num=xx),',
'where,',
'	"D" is a data.frame containing original array data in columns;',
'	"sig_level" has a default value 0.025, the significant level to determine house-keeping genes;',
'	"ave_num" is the average number of gene points in each sector on the plate of PC1 and PC2.',
'The return value of NormPCA() is list consists of two items:',
'	1. DataPCA, a list of intermidiate data gathered during the course of ',
'		searching for house-keeping genes and computing the normalizing factor, ',
'		roughly the following items might be found: ',
'		R1, R2, ',
'		angle_max, angle_min, angle_num, ',
'		angle_freq, angle_freq_dist_num,',
'		first_PCs, second_PCs, PC,',
'		joint_segs, house_keeping_indice ',
'		',
'		meanings of these items refer to the paper of Radka Stoyanova ',
'		on Bioinformatics (2004, Vol.20, pp1772-1784) ',
'		',
'	2. ND - the normalized data ',
'	',
'To view this message again, type "HELP"')

#for (s in NORM_HELP) print(s)
