library(limma)
library(affy)

# accept two vectors (unique_id and probe_id, order by probe_id) from python
# accept a vector of db_colnms from python (in order to keep the col in correct order for database. 
# 		usually it is c('id', array_id', 'channel_No', 'probe_id', 'fg', 'bg', 'flag')

readIntensity <- function(src, obj, fmt, data_type, channel_num, array_id, current_id) {
	#save(src, file='src.Rdata')
	#save(obj, file='obj.Rdata')
	
	#sink('/home/xxia/temp/RI.txt', append=TRUE)
	DTLIST <- list()
	#ids <- as.integer((current_id+1):(current_id+length(probe_id)*channel_num))
	DTLIST[['id']] <- as.character(as.integer((current_id+1):(current_id+length(probe_id)*channel_num)))
	names(probe_id) <- unique_id
	DTLIST[['probe_id']] <- as.character(as.integer(probe_id))
	rown = length(unique_id)

	DTLIST[['array_id']] <- as.character(as.integer(array_id))

	#print(paste('ids', class(ids), 'array_id', class(array_id), 'probe_id', class(probe_id)))
	
	if (toupper(fmt) == 'CEL') {
		DT <- mas5(ReadAffy(filenames=src))
		#DT <- exprs(DT)[unique_id,] #DT@exprs[unique_id,] # sort by unique_id. The row names should be unique_id, and the column number should be 1.
		DT <- exprs(DT)
		DT <- DT[match(unique_id, rownames(DT)), ]
		DTLIST[['channel_No']] <- 1
		DTLIST[['fg']] <- DT # DT is a vector! #DT[,1] #the column number should be 1.
		DTLIST[['bg']] <- 0
		DTLIST[['flag']] <- 1
		#DT <- cbind(ids, array_id, channel_No, probe_id, fg, bg, flag) # id, array_id, channel_No, probe_id, fg, bg, flag
		}
	else if(identical(fmt, 'user.defined')) {
		RawDT <- read.delim(src)
		colnms <- tolower(colnames(RawDT))
		colnames(RawDT) <- colnms
		# IMPORTANT: if no column name 'id', RawDT[,'id'] is NULL in R-2.5, but will be an error in R-2.8
		has_id <- 'id' %in% colnms || 'unique_id' %in% colnms 
		if (has_id) id_str <- if ('id' %in% colnms) 'id' else 'unique_id'
		if (has_id && #!is.null(RawDT[,'id']) &&  # has the "ID" column
			(length(RawDT[, id_str])==length(unique(RawDT[, id_str]))) && # no replicate
			all(sort(unique_id)==sort(RawDT[, id_str])) ) { # completed same to unique_id
			#rownames(RawDT) <- RawDT$ID
			#RawDT <- RawDT[unique_id, ] # sort by unique_id
			RawDT <- RawDT[match(unique_id, RawDT[, id_str]), ] # this method is much much much faster!!!
			rownames(RawDT) <- unique_id
			}
		if (data_type == 'intensity') {
			DT <- NULL
			for (i in 1:channel_num) {
				fg <- paste('ch', i, '.intensity', sep='')
				fg <- RawDT[fg]
				bg <- paste('ch', i, '.background', sep='')
				if (bg %in% colnms) bg <- RawDT[bg]
				else bg <- 0
				flag <- paste('ch', i, '.flag', sep='')
				if (flag %in% colnms) flag <- RawDT[flag]
				else flag <- 1
				DT_i <- cbind(fg, bg, flag)
				colnames(DT_i) <- c('fg', 'bg', 'flag') # this line is necessary for rbinding data.frame
				DT <- rbind(DT, DT_i)
				}
			DTLIST[['channel_No']] <- rep(1:channel_num, each=rown)
			DTLIST[['fg']] <- DT[,1]
			DTLIST[['bg']] <- DT[,2]
			DTLIST[['flag']] <- DT[,3]
			#DT <- cbind(ids, array_id, channel_No, probe_id, DT)
			}
		else { #  data_type is ratio or log-ratio
			DTLIST[['channel_No']] <- 1 #0 #rep(0, rown)
			DTLIST[['fg']] <- RawDT[if(data_type=='log-ratio') 'log.ratio' else 'ratio'] #RawDT['ratio']
			DTLIST[['bg']] <- 0 #rep(0, rown)
			if ('flag' %in% colnms) DTLIST[['flag']] <- RawDT['flag']
			else DTLIST[['flag']] <- 1
			}
		}
	else { # Then channel_num should be 2 (or 1 for some agilent data). use limma for two color arrays: ch1 is green, ch2 is red.
		if (fmt %in% c('genepix', 'genepix.median', 'genepix.custom')) RG <- readGenepix(src, source=fmt)
		else if (fmt == 'agilent' && channel_num==1) RG <- read.maimages(src, source=fmt, columns=list(G="gMeanSignal", Gb="gBGMedianSignal", R="gMeanSignal", Rb="gBGMedianSignal"))
		else RG <- read.maimages(src, source=fmt) 
		#DT <- rbind(cbind(RG$G[,1], RG$Gb[,1]), cbind(RG$R[,1], RG$Rb[,1]))
		DT <- cbind(RG$G[,1], RG$Gb[,1])
		if (channel_num == 2) DT <- rbind(DT, cbind(RG$R[,1], RG$Rb[,1]))
		if (fmt == 'scanarrayexpress') DT <- correctDT4ScanarrayExpress(DT, src)
		if (is.null(RG$weights)) DT <- cbind(DT, 1)
		else DT <- cbind(DT, RG$weights[,1])
		DTLIST[['channel_No']] <- rep(1:channel_num, each=rown)
		DTLIST[['fg']] <- DT[,1]
		DTLIST[['bg']] <- DT[,2]
		DTLIST[['flag']] <- DT[,3]
		#DT <- cbind(ids, array_id, channel_No, probe_id, DT)
		}

	DT <- cbind(DTLIST[[db_colnms[1]]], DTLIST[[db_colnms[2]]], DTLIST[[db_colnms[3]]], DTLIST[[db_colnms[4]]], DTLIST[[db_colnms[5]]], DTLIST[[db_colnms[6]]], DTLIST[[db_colnms[7]]])
	write.table(DT, file=obj, sep='\t', row.names=FALSE, col.names=FALSE, quote=FALSE)
	invisible(DT)
		
	}

readGenepix <- function(src, source='genepix') { # LIMMA only recognize wave length 635/532 by default. this function will fix this
	f <- file(src)
	nln <- as.integer(scan(f, nlines=2, what='character')[3]) + 2
	lines <- scan(f, nlines=nln, what='character', sep='\n')
	close(f)
	line <- grep('Wavelengths\\s*=\\s*\\d+\\s*\\d+\\b', lines, perl=TRUE, value=TRUE)
	line <- strsplit(line, '\\s*=\\s*')[[1]][2]
	wave <- as.integer(unlist(strsplit(line, '\\D+', perl=TRUE)))
	sort(wave, decreasing=TRUE)
	wave <- wave[wave > 0][1:2]
	cols <- if (source=='genepix.median') rep(c(' Median'), each=4) else rep(c(' Mean', ' Median'), each=2)
	cols <- as.list(paste(c('F', 'F', 'B', 'B'), wave, cols, sep=''))
	names(cols) <- c('R', 'G', 'Rb', 'Gb')
	RG <- read.maimages(src, source=source, columns=cols)
	}

correctDT4ScanarrayExpress <- function(DT, fn) { 
	# this function is fixing LIMMA's bug in read scanarrayexpress file - when its end row (END DATA) contains TAB character, LIMMA (or read.table?) will thinks it is a data row.
	S <- scan(fn, sep='\n', what='')
	i1 <- grep('BEGIN DATA', S)
	if (length(i1) != 1) return(DT)
	i2 <- grep('END DATA', S)
	if (length(i2) != 1 || i2 < i1) return(DT)
	dif <- i2 - i1 - 2
	if (dif < nrow(DT)) DT <- if(dif>1) cbind(DT[1:dif, ]) else rbind(DT[1:dif, ])
	DT
	}

checkIntensity <- function(src, fmt, data_type='intensity', ch_num=1) {
	rltstr <- function(x) paste('<checkIntensity_result>', x, '</checkIntensity_result>', sep='')
	if (toupper(fmt) == 'CEL') { 
		#DT <- try(mas5(ReadAffy(filenames=src)))
		DT <- try(exprs(rma(ReadAffy(filenames=src))), silent=TRUE) # rma is much faster!
		if ('try-error' %in% class(DT)) { print(rltstr(0)); return(0) }
		}
	else if(identical(fmt, 'user.defined')) {
		DT <- try(read.delim(src))
		if ('try-error' %in% class(DT))  { print(rltstr(0)); return(0) }
		cnm <- tolower(colnames(DT))
		colnames(DT) <- cnm
		cnm <- sort(cnm)
		nc <- length(cnm)
		# check col names:
		if (data_type %in%  c('ratio', 'log-ratio')) {
			if (nc < 1) return('No data in file')
			#if (!('ratio' %in% cnm)) { print(rltstr('No valid ratio column')); return('No valid ratio column') }
			if ((data_type=='ratio' && !('ratio' %in% cnm)) || (data_type=='log-ratio' && !('log.ratio' %in% cnm))) { print(rltstr(paste('No valid', data_type, 'column'))); return(paste('No valid', data_type, 'column')) }
			#if (nc > 2) return('Too many columns found in file')
			#if (nc == 1 && cnm != 'Ratio') return(paste('Invalid column name ', cnm, ' (Should be "Ratio") found in file', sep=''))
			#if (nc == 2 && sum(cnm != c('Flag', 'Ratio'))>0 ) return('Invalid column name(s) ("Ratio" and "Flag" are allowed) found in file')
			}
		else if (data_type == 'intensity') { # should be intensity
			# check intensity only
			check_intensity_only <- TRUE
			if (check_intensity_only) {
				titl <-paste('ch', seq(ch_num), '.intensity', sep='')
				if (length(intersect(titl, cnm)) != ch_num) {
					print(rltstr('Invalid column name(s) or column number'))
					return('Invalid column name(s) or column number')}
				}
			else {
				if (length(grep('ch1.intensity', cnm))<1) {
					print(rltstr('No valid intensity column'))
					return('No valid intensity column')}
				nrep <- 1
				titl <- c('.intensity')
				if (length(grep('ch1.background', cnm))==1) {
					nrep <- nrep + 1
					titl <- c('.background', titl)
					}
				if (length(grep('ch1.flag', cnm))==1) {
					nrep <- nrep + 1
					titl <- c(titl, '.flag')
					}
				#if (length(grep('ch1.Flag', cnm))==1) { # with Flag
				#	nrep <- 3
				#	titl <- c('.Background', '.Intensity', '.Flag')
				#	}
				#else {
				#	nrep <- 2
				#	titl <- c('.Background', '.Intensity')
				#	}
				titl <- sort(paste(c('ch'), rep(seq(ch_num), each=nrep), titl, sep=''))
				#if (nc != length(titl) || any(cnm != titl))
				if (length(intersect(titl, cnm)) != length(titl)) {
					print(rltstr('Invalid column name(s) or column number'))
					return('Invalid column name(s) or column number') }
				}
			}
		else {
			print(rltstr(paste('Wrong data type ("', data_type, '" - only "intensity", "ratio" or "log-ratio" are allowed) for file'), sep=''))
			return(paste('Wrong data type ("', data_type, '" - only "intensity", "ratio" or "log-ratio" are allowed) for file'), sep='')
			}
		}
	else {
		if (fmt %in% c('genepix', 'genepix.median', 'genepix.custom')) DT <- try(readGenepix(src, source=fmt)$R) 
		else if (fmt == 'agilent' && ch_num==1) DT <- try(read.maimages(src, source=fmt, columns=list(G="gMeanSignal", Gb="gBGMedianSignal", R="gMeanSignal", Rb="gBGMedianSignal"))$R)
		else DT <- try(read.maimages(src, source=fmt)$R)
		if ('try-error' %in% class(DT) || is.null(DT)) {print(rltstr(0)); return(0) }
		if (fmt == 'scanarrayexpress') DT <- correctDT4ScanarrayExpress(DT, src)
		}
	print(rltstr(nrow(DT)))
	return(nrow(DT))
	}

