#!/usr/bin/env python

import sys, os, re, sets, types, time, string #, MySQLdb
from tools import getConnectionCursor, getCursor, descNT, getTC, readTable, get_dbstr, readINI, MYSQL_KEYS, unescape, inquireDB, esc_sql, escSql, getErrInfo, getPfPairs, ProbeFileType, join_chs
from pipeR import pipeR

# Debug flag; nothing in this part of the file reads it.
__DEBUG__ = False
#__DEBUG__ = False

# Default column separator (a tab) for the table-text readers defined below.
sep = '\t'
#join_chs = ' /// '   # join_chs is imported from tools now

# Table title line: "[name]" -- the line has to start with '[' and contain
# nothing else after the closing ']'. Older variants are kept for reference.
#title_str = re.compile(r'^\[\w+\]$') # enclosed in square brackets
#title_str = re.compile(r'(?<=^\[)\s*(\w+)(?=\s*\]\s*$)') # use search, groups()[0] to get the title. Should be start with [
title_str = re.compile(r'^\[\s*(\w+)(?=\s*\]\s*$)') # use search/match, groups()[0] to get the title. Should be start with [
# Definition line: "[name] : definition"; the name may consist of several
# whitespace-separated words.
#def_str = re.compile(r'^\s*\[\s*(\w+)\s*\]\s*:\s*(\S+.*)$') # use match, groups()[0], groups()[1] to get the name and definition.
def_str = re.compile(r'^\s*\[\s*(\w+(\s+\w+)*)\s*\]\s*:\s*(\S+.*)$') # use match, groups()[0], groups()[2] to get the name and definition.
#tab_str = re.compile(r'((?<=[^\\])\\t)|(^\\t)')
#line_str = re.compile(r'((?<=[^\\])\\n)|(^\\n)')
#esc_str = re.compile(r'\\\\')

# SQL escaping: esc_sql is imported from tools now. The historical local
# definitions are kept below for reference only.
# escape \, ' and " for sql 
#esc_sql = re.compile(r'([\\\'\"])' # or r'''([\\'"])'''    Syntax: esc_sql.sub(r'\\\1', obj_str)
# only escape ' and " for sql 
#esc_sql = re.compile(r'([\'\"])') # or r'''([\\'"])'''    Syntax: esc_sql.sub(r'\\\1', obj_str)

# Value validators: a whole string that is an integer / a float.
# Both also accept the empty string and the literal 'NULL'.
#dec_str = re.compile(r"\s*[\+\-]?\s*\d+|NULL|")
#float_str = re.compile(r"[\+\-]?\s*(\d+(\.\d*)?|\d*\.\d+)([Ee][\+\-]?\d+)?")
dec_str = re.compile(r"^\s*[\+\-]?\s*\d+$|^NULL$|^$")
float_str = re.compile(r"^\s*[\+\-]?\s*(\d+(\.\d*)?|\d*\.\d+)([Ee][\+\-]?\d+)?$|^NULL$|^$")

# Classifiers for a MySQL column type string (the second field of a DESCRIBE
# row). The functions in this file apply them with .match(), i.e. anchored at
# the start of the type string, case-insensitively.
tb_col_int = re.compile(r'TINYINT|SMALLINT|MEDIUMINT|INT|BIGINT', re.I)
tb_col_float = re.compile(r'FLOAT|DOUBLE|DECIMAL|REAL|NUMERIC', re.I)
tb_col_char = re.compile(r'CHAR|VARCHAR|TINYBLOB|BLOB|MEDIUMBLOB|LONGBLOB|TINYTEXT|TEXT|MEDIUMTEXT|LONGTEXT', re.I)
# YEAR is deliberately kept out of the date pattern and has its own pattern.
#tb_col_date = re.compile(r'DATE|TIME|YEAR|DATETIME|TIMESTAMP', re.I)
tb_col_date = re.compile(r'DATE|TIME|DATETIME|TIMESTAMP', re.I)
#tb_col_date = re.compile(r'YEAR', re.I)
tb_col_year = re.compile(r'YEAR', re.I)
tb_col_enum = re.compile(r'ENUM', re.I)
tb_col_set = re.compile(r'SET', re.I)


# Locations and database names used by the rest of the module.
# The ini file is looked up next to this source file.
ini_file = 'mpmdb.ini'
ini_file_path = os.path.join(os.path.split(os.path.abspath(__file__))[0], ini_file)
# Executes the code string returned by tools.get_dbstr() in this namespace.
# Presumably this defines `db` (it is read on the next line) -- confirm in tools.
exec get_dbstr()
USER_DB = db # the db with user information
# NOTE(review): if `db` were not defined, the line above would already have
# raised NameError, so the 'mpmdb' fallback only takes effect when `db` is
# defined but empty/false.
CUR_DB = globals().has_key('db') and db or 'mpmdb'
# Default data directory name, relative to this source file.
file_location = 'arraydb_files'

def prepareDataDir(dbname=CUR_DB, ini_file=ini_file_path, file_location=file_location):
	'''
	Work out the data directory, store it in the module global data_dir,
	create it when it does not exist yet, and return it.

	The directory comes from the 'data_dir' entry of section dbname in
	ini_file (read with readINI). When the ini file is missing or the entry is
	absent/empty, the directory file_location next to this source file is used.
	'''
	global data_dir
	configured = ''
	if os.path.exists(ini_file):
		section = readINI(ini_file).get(dbname, {})
		configured = section.get('data_dir', '')
	if configured:
		data_dir = configured
	else:
		here = os.path.dirname(os.path.abspath(__file__))
		data_dir = os.path.join(here, file_location)
	if not os.path.exists(data_dir):
		os.makedirs(data_dir)
	return data_dir
		
# prepareDataDir() is not called at import time; data_dir simply defaults to
# the file_location directory next to this source file (it is not created here).
#prepareDataDir()
data_dir = os.path.join(os.path.split(os.path.abspath(__file__))[0], file_location)

# Name of the user table; it is looked up in USER_DB, not in CUR_DB.
tbl_user = 'users'

tbcols_auto = { # the columns that are automatically produced when filling data
		'probe':['id', 'xpf_id', 'mapf_id', 'platform_id']
		} 

# Static map {table name: {column name: default SQL literal 'NULL'}}.
# Within this file getColInfo() and fillAll() bind their own local `tbcols`
# (built from DESCRIBE), so this module-level dict is kept as a reference only.
tbcols = { # don't need this dict any more since it can be produced by getColInfo automatically
	"sample":{"name":"NULL", "organism":"NULL", "tissue":"NULL", "gender":"NULL", "age":"NULL", "description":"NULL", "provider":"NULL", "state_develop":"NULL", "state_disease":"NULL", "relapse_status":"NULL", "relapse_time":"NULL", "pre_op_PSA":"NULL", "patient_id":"NULL", "cell_type":"NULL", "sample_type":"NULL", "GLSN":"NULL", "meta_site":"NULL", "tissue_perc_T":"NULL", "tissue_perc_S":"NULL", "tissue_perc_B":"NULL", "tissue_perc_G":"NULL", "tissue_perc_SMS":"NULL", "tissue_perc_NSS":"NULL", "capsularlnv":"NULL", "SM":"NULL", "TNN_1992":"NULL", "race":"NULL"},
	"sampxref":{"sample_id":"NULL", "array_id":"NULL", "channel_No":"NULL", "dye":"NULL", "prot_proc_id":"NULL", "prot_tech_id":"NULL", "protocol_label":"NULL", "exp_factor":"NULL"},
	"protocol":{"name":"NULL", "category":"NULL", "description":"NULL"},
	"project":{"name":"NULL", "keywords":"NULL", "factors":"NULL", "tissue":"NULL", "design":"NULL", "QC":"NULL", "description":"NULL", "authors":"NULL", "journal":"NULL", "publish_year":"NULL", "pubmed_id":"NULL", "data_link":"NULL"},
	"project_array":{"project_id":"NULL", "array_id":"NULL"},
	"array":{"platform_id":"NULL", "channel_num":"NULL", "hyb_date":"NULL", "prot_hyb_id":"NULL", "prot_img_id":"NULL", "prot_data_id":"NULL", "identifier":"NULL", "description":"NULL"},
	#"intensity":{"array_id":"NULL", "probe_id":"NULL", "channel_No":"NULL", "fg":"NULL", "bg":"NULL", "flag":"NULL"},
	"fileinfo":{"name":"NULL", "location":"NULL", "category":"NULL", "format":"NULL"},
	"filexref":{"file_id":"NULL", "tb_id":"NULL", "tbname":"NULL"},
	"platform":{"name":"NULL", "category":"NULL", "probe_num":"NULL", "manufacturer":"NULL", "organism":"NULL", "description":"NULL"}
}

# Static map {table name: {column name: True}} of the columns whose values are
# strings and therefore need quoting in SQL. Like tbcols above, the functions
# in this file bind their own local `char_cols` obtained from getColInfo().
# NOTE(review): the first key is "sample_char" while every other key is a plain
# table name ("sample" in tbcols) -- possibly a typo; verify before relying on it.
char_cols = { # don't need this dict any more since it can be produced getColInfo automatically
	"sample_char" : {"name":True, "organism":True, "tissue":True, "gender":True, "description":True, "provider":True, "state_develop":True, "state_disease":True, "relapse_status":True, "patient_id":True, "cell_type":True, "sample_type":True, "GLSN":True, "meta_site":True, "capsularlnv":True, "SM":True, "TNN_1992":True, "race":True},
	"sampxref":{"dye":True, "protocol_label":True, "exp_factor":True},
	"protocol":{"name":True, "category":True, "description":True},
	"project":{"name":True, "keywords":True, "factors":True, "tissue":True, "design":True, "QC":True, "description":True, "authors":True, "journal":True, "publish_year":True, "pubmed_id":True, "data_link":True},
	"array":{"hyb_date":True, "identifier":True, "description":True},
	"fileinfo":{"name":True, "location":True, "category":True, "format":True},
	"filexref":{"tbname":True}
}

#s/ /":"NULL", "/g
#s/ /":True, "/g

def getColInfo(cur, tbsrc, DB=CUR_DB, skip_id=True, skip_cols=[], lower=False, return_dic=True):
	'''
	Describe the columns of one or several tables via "DESCRIBE DB.table".

	cur        -- DB cursor used for the DESCRIBE statements
	tbsrc      -- a table name, or a list/tuple of table names
	DB         -- database name
	skip_id    -- if true, the column 'id' is left out
	skip_cols  -- further column names to leave out (never modified here)
	lower      -- if true, column names are lower-cased in the result
	return_dic -- choose dict (True) or list (False) output

	return value is a tuple:
	if tbsrc is a table name, it is ({col_name:'NULL'...}, {char_col_name:True...}) or ([col_names], [char_col_names])
	if tbsrc is a list of table names, then it become a dict with table names as keys and the tuples above as values.
	Tables for which DESCRIBE yields no rows are left out; tables without any
	string-like column are left out of the second element.
	'''
	# Work on a private copy. The original appended 'id' to skip_cols itself:
	# with the shared default list that made 'id' stay skipped on later
	# skip_id=False calls, and it modified a list passed in by the caller.
	skip = list(skip_cols)
	if skip_id and 'id' not in skip: skip.append('id')

	is_multi = isinstance(tbsrc, (list, tuple))
	if is_multi: tbs = tbsrc # tbsrc is a list of table names
	else: tbs = [tbsrc]

	tbcols = {}
	char_cols = {}
	char_col_str = re.compile(r'(date)|(datetime)|(time\s)|(char)|(text)|(blob)|(enum)|(set)', re.I) # ENUM is considered as string, didn't consider SET yet
	for tb in tbs:
		cur.execute('DESCRIBE %s.%s' % (DB, tb))
		if not cur.rowcount: continue
		# cols: [(column name, is_string_like), ...] in table order
		cols = []
		for row in cur.fetchall():
			name = row[0]
			if name in skip: continue # filtering happens before lower-casing
			if lower: name = name.lower()
			cols.append((name, bool(char_col_str.search(row[1]))))
		if return_dic:
			cola = dict([(name, 'NULL') for name, is_char in cols])
			colb = dict([(name, True) for name, is_char in cols if is_char])
		else:
			cola = [name for name, is_char in cols]
			colb = [name for name, is_char in cols if is_char]
		tbcols[tb] = cola
		if colb: char_cols[tb] = colb

	if is_multi:
		return (tbcols, char_cols)
	# single table: unwrap, falling back to an empty container of the right kind
	if return_dic: return (tbcols.get(tbsrc, {}), char_cols.get(tbsrc, {}))
	return (tbcols.get(tbsrc, []), char_cols.get(tbsrc, []))

def getColDef(cur, tb, DB=CUR_DB, skip_id=True, skip_cols=[]):
	'''
	return a list of tuple: [(colname, definition_str), ...]

	The rows come from "DESCRIBE DB.tb" (first two fields of every row).
	Columns named in skip_cols are left out, as is 'id' when skip_id is true.
	A cursor is obtained from getCursor() when cur is false.
	'''
	if not cur: cur = getCursor()
	cur.execute('DESCRIBE %s.%s' % (DB, tb))
	# Copy before adding 'id': the original appended to skip_cols on every
	# call, growing the shared default list and the caller's own list.
	skip = list(skip_cols)
	if skip_id and 'id' not in skip: skip.append('id')
	return [row[0:2] for row in cur.fetchall() if row[0] not in skip]

def getIntCols(cur, tb, DB=CUR_DB, skip_id=True, skip_cols=[]):
	'''
	return a list of column names

	Only columns of "DESCRIBE DB.tb" whose type matches tb_col_int are
	returned, in table order. Columns named in skip_cols are left out, as is
	'id' when skip_id is true. A cursor is obtained from getCursor() when cur
	is false.
	'''
	if not cur: cur = getCursor()
	cur.execute('DESCRIBE %s.%s' % (DB, tb))
	# Copy before adding 'id' so neither the shared default list nor a list
	# owned by the caller is modified (the original appended on every call).
	skip = list(skip_cols)
	if skip_id and 'id' not in skip: skip.append('id')
	# A plain filter keeps the table's column order; the former set difference
	# returned the names in arbitrary order.
	return [row[0] for row in cur.fetchall() if tb_col_int.match(row[1]) and row[0] not in skip]
	
def getAllColType(cur, DB=CUR_DB, skip_id=True, 
		skip_cols={'project':['user_id', 'id'], 'protocol':['id', 'project_id'], 'platform':['id', 'project_id'], 'array':['project_id', 'prot_hyb_id', 'id', 'prot_data_id', 'platform_id', 'prot_img_id'], 'sample':['project_id', 'id']},
		tbs=['project', 'protocol', 'platform', 'array', 'sample'], 
		tps={'int':lambda a:tb_col_int.match(a), 'float':lambda a:tb_col_float.match(a), 'date':lambda a:tb_col_date.match(a), 'enum':lambda a:tb_col_enum.match(a), 'year':lambda a:tb_col_year.match(a)}
		):
	'''return a dict with type as key while the value is dict too, 
	the value dict have table name as key and a list of column name as value, but for ENUM, the value is a list of tuple (column name, (possible_val1, possible_val2, ...))

	cur       -- DB cursor; getCursor() is used when it is false
	DB        -- database name
	skip_id   -- passed on to getColDef
	skip_cols -- {table name: [column names to leave out]}
	tbs       -- tables to inspect
	tps       -- {type name: predicate on the column definition string}
	Tables without any column of a given type are left out of that type's dict.
	For ENUM columns the tuple of values starts with "" and "NULL".
	'''
	if not cur: cur = getCursor()
	coldic = {}
	for tb in tbs:
		# Pass a copy: getColDef may append 'id' to the list it receives, which
		# would otherwise grow the lists inside the shared default skip_cols
		# dict on every call.
		coldic[tb] = getColDef(cur, tb, DB=DB, skip_id=skip_id, skip_cols=list(skip_cols.get(tb, [])))
	tpdic = {}
	for tp, fun in tps.items():
		dic = {}
		for tb, coldefs in coldic.items():
			matched = [coldef for coldef in coldefs if fun(coldef[1])]
			if tp == 'enum':
				# coldef[1][5:] drops the leading "enum(" so that the rest closes the
				# tuple literal built here.
				# NOTE(review): eval() runs text taken from the database schema; it
				# is safe only as long as the schema itself is trusted.
				tpcols = [(coldef[0], eval('("","NULL",' + coldef[1][5:])) for coldef in matched]
			else:
				tpcols = [coldef[0] for coldef in matched]
			if tpcols: dic[tb] = tpcols
		tpdic[tp] = dic
	return tpdic

def getColType_no_use(cur, tbsrc=None, DB=CUR_DB, skip_id=True, skip_cols=[], lower=False, return_dic=True):
	'''
	Don't use this since it is not finished yet.

	Intended return value (NOT what the code below does yet) is a dict:
	if tbsrc is a table name, it is {'INTEGER':[col_name_list], 'FLOAT':[...], 'DATE':[...], 'ENUM':[...], 'DATE':[...], 'CHAR':[...]} #, 'BLOB':[...]}
	if tbsrc is a list of table names, then it become a dict with table names as keys and the tuples above as values.
	if tbsrc is None, tbsrc will be all the tables in the database

	What the code currently does: apart from the "tbsrc is None" handling
	(SHOW TABLES) it is a copy of getColInfo and returns the same
	(columns, string-like columns) tuple.

	NOTE(review), known problems of the current body:
	- skip_cols is a mutable default and 'id' is appended to it in place;
	- with tbsrc=None and no tables the result is {} while every other path
	  returns a tuple;
	- with tbsrc=None the final "tbs is tbsrc" test is false, so only the
	  entry of the last table is returned.
	'''
	if skip_id and 'id' not in skip_cols: skip_cols.append('id')
	if tbsrc is None: # use all tables
		cur.execute('SHOW TABLES FROM %s' % DB)
		if not cur.rowcount: return {}
		tbs = map(lambda a:a[0], cur.fetchall())
	elif type(tbsrc) not in (types.ListType, types.TupleType): tbs = [tbsrc]
	else: tbs = tbsrc # tbsrc is a list of table names


	tbcols = {}
	char_cols = {}
	char_col_str = re.compile(r'(date)|(datetime)|(time\s)|(char)|(text)|(blob)|(enum)', re.I) # ENUM is considered as string, didn't consider SET yet
	for tb in tbs:
		n = cur.execute('DESCRIBE %s.%s' % (DB, tb))
		if not cur.rowcount: continue #if not n: continue
		# cols: list of (column name, True if the type looks string-like)
		cols = map(lambda a:char_col_str.search(a[1]) and (a[0], True) or (a[0], False), cur.fetchall())
		#if skip_id: cols = filter(lambda a:a[0] != 'id', cols)
		cols = filter(lambda a:a[0] not in skip_cols, cols)
		if lower: cols = map(lambda a:(a[0].lower(), a[1]), cols)
		if return_dic: 
			cola = dict(map(lambda a:(a[0], 'NULL'), cols))
			colb = dict(filter(lambda a:a[1], cols))
		else: 
			cola = map(lambda a:a[0], cols)
			colb = map(lambda a:a[0], filter(lambda b:b[1], cols))
		tbcols[tb] = cola
		if colb: char_cols[tb] = colb
	if tbs is tbsrc: # tbsrc is a list of table names
		return (tbcols, char_cols)
	# single table: unwrap, falling back to an empty dict/list
	else: return (tbcols.get(tb, (return_dic and ({},) or ([],))[0]), char_cols.get(tb, (return_dic and ({},) or ([],))[0]))

def getTC_in_tools_now(title_case=None):
	'''
	Return a one-argument function that converts the case of a title string.

	title_case -- 'lower' or 'upper' (any letter case) selects lower-/upper-
	              casing; anything else (None, other strings, non-strings)
	              gives the identity function.

	The converters call the string's own lower()/upper() method, which is what
	the deprecated string.lower/string.upper functions used before did.
	'''
	if isinstance(title_case, str): title_case = title_case.lower()
	if title_case == 'lower': return lambda a: a.lower()
	if title_case == 'upper': return lambda a: a.upper()
	return lambda a: a

def readTbName(f, title_case=None):
	'''
	Read lines from f until one is a table title ("[name]", see title_str)
	and return that name converted by getTC(title_case).
	Return None when f is exhausted without finding a title line.
	Lines are stripped before they are tested; non-title lines are consumed
	and dropped.
	'''
	convert = getTC(title_case)
	for raw in f:
		found = title_str.search(raw.strip())
		if not found: continue
		return convert(found.group(1))
	return None

def readTable_in_tools_now(f, colnames = None, skip_blank=False, stop_at=None, use_list=True, title_case=None, sep_str=sep, delim='', NULL_str=[], NULL_fill='NULL'):
	'''
	Read one separator-delimited table from f.

	f          -- an iterable of lines, or a file name (str); a file opened
	              here from a name is closed again before returning
	colnames   -- column names; if None the first non-skipped line is the header
	skip_blank -- if true blank lines are skipped, otherwise a blank line ends
	              the table
	stop_at    -- optional compiled pattern; a line it matches ends the table
	use_list   -- rows as lists (the column-name list is put in front of the
	              first data row) or, if false, rows as {colname: value} dicts
	title_case -- passed to getTC() to convert header names and the next title
	sep_str    -- column separator
	delim      -- optional delimiter wrapped around every field
	NULL_str   -- field values that are replaced by NULL_fill
	NULL_fill  -- filler for such values and for missing trailing fields

	if stop_at is False, then return the list tb -- the table content, otherwise, the return value is a tuple, the table content and the name of next table (or None)
	'''
	opened_here = False
	if type(f) is str:
		f = open(f)
		opened_here = True
	tc = getTC(title_case)
	tb = []
	sep_str = delim+sep_str+delim
	ndelim = len(delim)

	try:
		for line in f:
			if stop_at:
				rlt = stop_at.search(line)
				if rlt: return tb, tc(rlt.groups()[0])
			if not line.strip():
				if skip_blank: continue
				if stop_at: return tb, None
				return tb
			cols = line.replace('\n','').replace('\r','')
			if delim:
				# drop the delimiter in front of the first and behind the last field
				if cols[:ndelim] == delim: cols = cols[ndelim:]
				if cols[-ndelim:] == delim: cols = cols[:-ndelim]
			cols = [descNT(a) for a in cols.split(sep_str)] # recover escaped chars

			if colnames is None:
				# the first line is the header
				colnames = [tc(a.strip()) for a in cols]
				continue

			if NULL_str:
				for i in range(len(cols)):
					if cols[i] in NULL_str: cols[i] = NULL_fill
			# pad or truncate the row to the header length
			lc = len(cols)
			ln = len(colnames)
			if lc < ln: cols.extend([NULL_fill]*(ln-lc))
			elif lc > ln: del cols[ln:]
			if use_list:
				if not tb: tb.append(colnames) # the first item is a list of column names
				tb.append(cols)
			else: # use dict
				tb.append(dict(zip(colnames, cols)))
	finally:
		# only close what was opened here; a caller-supplied file object
		# stays open so that the caller can keep reading from it
		if opened_here: f.close()

	if stop_at: return tb, None
	return tb

def parseFile(fn, title_case=None):
	'''
	return a dict with table names as keys and table content as value. The table content is a list of lists - the first list is the column names.

	fn         -- a file name (str) or a file-like object; a file opened here
	              from a name is closed again before returning
	title_case -- passed on to readTbName/readTable for title and header case

	Parsing starts at the first table title line ("[name]") and stops when no
	further title follows.
	NOTE(review): when the same title occurs more than once, whatever
	readTable returns for the later section is appended to the earlier content
	-- presumably including another header row; verify against tools.readTable.
	'''
	tbs = {}
	opened_here = type(fn) is type('')
	if opened_here: f = open(fn) # should be file name
	else: f = fn # should be a file-like object
	try:
		title = readTbName(f, title_case=title_case)
		while title:
			# readTable also consumes the next title line and hands its name back
			tb, title_next = readTable(f, skip_blank=True, stop_at=title_str, title_case=title_case, NULL_fill='')
			tbs.setdefault(title, []).extend(tb)
			title = title_next
	finally:
		if opened_here: f.close()
	return tbs

def getPos(tbv):
	'''
	Return {column name: index} for the header row tbv[0].
	An empty or missing table gives {}; for a repeated name the last
	position wins.
	'''
	pos = {}
	if tbv:
		for idx, colname in enumerate(tbv[0]):
			pos[colname] = idx
	return pos
	
def getIDs_no_userid(cur, tbv, tbids, tb_nm_id, identifier='name'):
	'''
	get IDs from tb_nm_id regardless of user_id

	cur        -- DB cursor used when a name is not found in tbids
	tbv        -- table as list of rows, tbv[0] being the header; modified in
	              place: missing id columns are appended to the header, the
	              data rows are padded with 'NULL' and the id cells are filled
	tbids      -- {table name: {record name: id}} of records already inserted
	tb_nm_id   -- sequence of (table, name column in tbv, id column in tbv)
	identifier -- column of the looked-up table that holds the name

	For every row and every (table, name column, id column) triple the names
	in the name cell (separated by join_chs) are translated to ids, first via
	tbids and then via the database; the ids are joined with join_chs and
	stored in the id cell, after the value that was there before (if any).
	Names that cannot be resolved are dropped silently.

	Returns {column name: index} for the (possibly extended) header, or {}
	when tb_nm_id is empty.
	'''
	if not tb_nm_id: return {}
	head = tbv[0]
	pos = dict(zip(head, range(len(head))))
	add_num = 0
	for tni in tb_nm_id:
		k = tni[2] # column for id
		if k not in pos:
			head.append(k)
			pos[k] = len(head)-1
			add_num = add_num + 1
	if add_num: # add NULL to the rest lines to make sure all lines has the same length
		for row in tbv[1:]: row.extend(['NULL'] * add_num)

	# get related ids
	for line in tbv[1:]:
		for tb, nm, id_col in tb_nm_id:
			if nm not in pos or line[pos[nm]] == 'NULL': continue
			known = tbids.get(tb) or {}
			ids = []
			for idnm in line[pos[nm]].split(join_chs):
				idnm = idnm.strip()
				if idnm in known: # search in tbids first
					ids.append(str(known[idnm]))
					continue
				# search database then; the lookup is best effort, a failing query
				# just leaves the name unresolved. "except Exception" (instead of a
				# bare except) lets KeyboardInterrupt/SystemExit through.
				try:
					cur.execute('SELECT id FROM %s.%s WHERE %s="%s"' % (CUR_DB, tb, identifier, idnm.replace('"','\\"') ))
					if cur.rowcount:
						ids.append(str(cur.fetchone()[0]))
				except Exception:
					continue
			if line[pos[id_col]] != 'NULL': ids.insert(0, line[pos[id_col]])
			line[pos[id_col]] = join_chs.join(ids)
	return pos
	
def getIDs_with_userid(cur, tbv, tbids, tb_nm_id, user_id=None):
	'''
	get IDs from tb_nm_id regarding of user_id

	cur      -- DB cursor used when a name is not found in tbids
	tbv      -- table as list of rows, tbv[0] being the header; modified in
	            place: missing id columns are appended to the header, the data
	            rows are padded with 'NULL' and the id cells are filled
	tbids    -- {table name: {record name: id}} of records already inserted
	tb_nm_id -- ( (tb, nm, id), ...) or ( (tb, (nm, tbnm), (id, tbid)), ...):
	            nm/id are column names of tbv; tbnm/tbid are the matching
	            columns of table tb and default to 'name' and 'id'
	user_id  -- owner used to restrict the lookup of per-user records

	Names (separated by join_chs) are resolved via tbids first, then via the
	database: the user table in USER_DB, 'platform'/'protocol'/'sample'
	regardless of owner, 'project' by its user_id, and every other table via
	the user_id of its project. Unresolved names are dropped silently.
	NOTE(review): with user_id None the per-user queries compare against NULL
	using "=", which presumably never matches -- confirm this is intended.

	Returns {column name: index} for the (possibly extended) header, or {}
	when tb_nm_id is empty.
	'''
	if not tb_nm_id: return {}

	def split_spec(spec, default_col):
		# 'x' -> ('x', default_col); ('x', 'col') -> ('x', 'col')
		if isinstance(spec, str): return spec, default_col
		first, second = spec # should be a two-element tuple
		return first, second

	# Normalise the specs once. The original used the raw third element as the
	# header entry, so the documented (id, tbid) tuple form ended up in the
	# header as a tuple and raised KeyError when the id cell was accessed.
	specs = []
	for tb, nm_spec, id_spec in tb_nm_id:
		nm, tbnm = split_spec(nm_spec, 'name')
		id_col, tbid = split_spec(id_spec, 'id')
		specs.append((tb, nm, tbnm, id_col, tbid))

	head = tbv[0]
	pos = dict(zip(head, range(len(head))))
	add_num = 0
	for spec in specs:
		k = spec[3] # column for id
		if k not in pos:
			head.append(k)
			pos[k] = len(head)-1
			add_num = add_num + 1
	if add_num: # add NULL to the rest lines to make sure all lines has the same length
		for row in tbv[1:]: row.extend(['NULL'] * add_num)

	# get related ids
	user_id_str = (user_id is None) and 'NULL' or str(user_id)
	for line in tbv[1:]:
		for tb, nm, tbnm, id_col, tbid in specs:
			if nm not in pos or line[pos[nm]] == 'NULL': continue
			known = tbids.get(tb) or {}
			ids = []
			for idnm in line[pos[nm]].split(join_chs):
				idnm = idnm.strip()
				if idnm in known: # search in tbids first
					ids.append(str(known[idnm]))
					continue
				# search database then; best effort, a failing lookup leaves the name
				# unresolved. "except Exception" (instead of a bare except) lets
				# KeyboardInterrupt/SystemExit through.
				try:
					quoted = idnm.replace('"','\\"')
					if tb == tbl_user:
						SQL = 'SELECT %s FROM %s.%s WHERE %s="%s"' % (tbid, USER_DB, tb, tbnm, quoted)
					elif tb in ('platform', 'protocol', 'sample'): # consist in nm4id. 'platform', 'protocol', 'sample' are unique for all users
						SQL = 'SELECT %s FROM %s.%s WHERE %s="%s"' % (tbid, CUR_DB, tb, tbnm, quoted)
					elif tb == 'project':
						SQL = 'SELECT %s FROM %s.%s WHERE %s="%s" AND user_id=%s' % (tbid, CUR_DB, tb, tbnm, quoted, user_id_str)
					else:
						SQL = 'SELECT a.%s FROM %s.%s AS a INNER JOIN %s.project AS p ON (a.project_id=p.id) WHERE a.%s="%s" AND p.user_id=%s' % (tbid, CUR_DB, tb, CUR_DB, tbnm, quoted, user_id_str)
					cur.execute(SQL)
					if cur.rowcount > 0:
						ids.append(str(cur.fetchone()[0]))
				except Exception:
					continue
			if line[pos[id_col]] != 'NULL': ids.insert(0, line[pos[id_col]])
			line[pos[id_col]] = join_chs.join(ids)
	return pos

# getIDs is the variant used by the rest of the module.
getIDs = getIDs_with_userid

def fillTableDict(cur, tbnm, tbv, tbcol, chcol, idx_col='name'):
	'''
	Insert the rows of tbv into table CUR_DB.tbnm.

	cur     -- mysql cursor
	tbnm    -- table name
	tbv     -- list of dicts {column name: value} to be filled into the table
	tbcol   -- {column name: default SQL value} of the columns to fill; keys
	           of a row that are not in tbcol are ignored
	chcol   -- {column name: True} for columns with string values, which get
	           escaped (esc_sql) and double-quoted
	idx_col -- key of the row dicts used to index the result

	return a dict {row[idx_col]: id} if idx_col is available, otherwise return a list of id
	The id is read back with SELECT MAX(id) while the table is write-locked.
	'''
	if idx_col: ids = {}
	else: ids = []
	kw = list(tbcol.keys())
	kwstr = ', '.join(kw)
	cur.execute('LOCK TABLES %s.%s LOW_PRIORITY WRITE' % (CUR_DB, tbnm))
	for line in tbv:
		cols = tbcol.copy()
		for k, v in line.items():
			if k in cols:
				# add '"' for char values
				if chcol.get(k, False): cols[k] = '"%s"' % esc_sql.sub(r'\\\1', v)
				else: cols[k] = v
		# Emit the values in exactly the order of the column list instead of
		# relying on cols.values() happening to match tbcol.keys().
		valstr = ', '.join([cols[k] for k in kw])
		cur.execute('INSERT INTO %s.%s (%s) VALUES (%s)' % (CUR_DB, tbnm, kwstr, valstr))
		cur.execute('SELECT MAX(id) from %s.%s' % (CUR_DB, tbnm))
		new_id = cur.fetchone()[0]
		if idx_col: ids[line[idx_col]] = new_id
		else: ids.append(new_id) # was "idx.append", a NameError
	cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	return ids

#def fillKeyword(cur, kws, tbnm, rec_id, sep=re.compile(r'[,;\t]')):
def fillKeyword(cur, kws, tbnm, rec_id, sep=re.compile(r'[;]')):
	'''
	Store the keywords of one record in table CUR_DB.keyword.

	kws is split with the pattern sep (default: ';'); every non-empty,
	stripped piece is inserted as one row (kw, tb_name, rec_id). Keyword and
	table name are escaped with esc_sql. Commits on cur.connection at the end.
	'''
	tb_escaped = esc_sql.sub(r'\\\1', tbnm)
	insert_tpl = 'INSERT INTO %s.keyword (kw, tb_name, rec_id) VALUES ("%s", "%s", %d)'
	for piece in sep.split(kws):
		word = piece.strip()
		if not word: continue
		cur.execute(insert_tpl % (CUR_DB, esc_sql.sub(r'\\\1', word), tb_escaped, rec_id))
	cur.connection.commit()


# Patterns for the "name = value; name = value" text of user-added columns.
#sep_dyncol = re.compile(r'[,;]')
sep_dyncol = re.compile(r'[;]') # separator between two "name = value" items
# Type detection for the value of a new column; an optional "[int]",
# "[float]" or "[string]" tag may precede the value.
new_int_str = re.compile(r'^\s*(\[\s*int(eger)?\s*\])?\s*([\+\-]?\s*\d+)\s*$', re.I) # Ignore case, .groups()[2] is the int string
# The float pattern is anchored at the end like the int pattern. Without the
# anchor a value such as "3 apples" matched as the float "3" and the rest of
# the value was dropped.
new_float_str = re.compile(r'^\s*(\[\s*float\s*\])?\s*([\+\-]?\s*(\d+\.?\d*|\d*\.?\d+)([Ee][\+\-]?\d+)?)\s*$', re.I) # an integer can be used as float, .groups()[1]
new_str_str_1 = re.compile(r'^\s*([\'\"])(.*\S+.*)\1\s*$', re.I) # use quote ' or ", groups()[1]
new_str_str_2 = re.compile(r'^\s*(\[\s*str(ing)?\s*\])?\s*(\S+.*?)\s*$', re.I) # use [string] or nothing but string itself, groups()[2]
#new_str_str_3 = re.compile(r'^\s*(\S+.*?)\s*$', re.I) # use nothing but string itself. groups()[0]
# new_str_str_1 and new_str_str_2 should be used in order
def parseDyncols(kws, col_dic, sep=sep_dyncol, int_s=dec_str, float_s=float_str, new_int_s=new_int_str, new_float_s=new_float_str, new_str_s1=new_str_str_1, new_str_s2=new_str_str_2):
	'''
	Parse the "colnm = value" items of kws (separated by sep).

	col_dic is a dict like {colnm:type, ...} with the types 'int', 'float' and
	'string'. For a column already in col_dic the value is checked against the
	column type. For a new column the type is detected from the value (int,
	float, quoted string, plain string -- in that order), the detected type is
	stored in col_dic (i.e. col_dic is modified) and the tag/quotes are
	removed from the value.

	Returns {'kv': {colnm: value string}, 'error': [messages], 'warn': [messages]}.
	Items without exactly one '=' or without a name are errors; items with an
	empty or 'NULL' value are skipped with a warning.
	'''
	validators = {'int':int_s.match, 'float':float_s.match, 'string':lambda a:True}
	# (pattern, index into groups() holding the value, type name), tried in order
	detectors = ((new_int_s, 2, 'int'), (new_float_s, 1, 'float'), (new_str_s1, 1, 'string'), (new_str_s2, 2, 'string'))
	result = {'kv':{}, 'error':[], 'warn':[]}
	accepted = result['kv']
	errors = result['error']
	warnings = result['warn']

	for chunk in sep.split(kws):
		chunk = chunk.strip()
		if not chunk: continue
		kv = [part.strip() for part in chunk.split('=')]
		if len(kv) != 2:
			errors.append(repr(kv) + ' is an invalid "column_value = value" pair!')
		elif not kv[0]: # no col name
			errors.append(repr(kv) + ' is a invalid "column_value = value" pair!')
		elif not kv[1] or kv[1]=='NULL': # no value
			warnings.append(repr(kv) + 'is skipped!')
		elif kv[0] in col_dic: # existed/defined columns
			k, v = kv
			tp = col_dic[k]
			if validators[tp](v):
				accepted[k] = v
			else:
				errors.append('"%s" (User-defined column: %s) was offered an invalid value (%s)!' % (k, tp, v))
		else: # new column name, need to determine type
			k, v = kv
			hit = None
			for pattern, grp, tp in detectors:
				hit = pattern.match(v)
				if hit:
					accepted[k] = hit.groups()[grp]
					col_dic[k] = tp
					break
			if not hit: errors.append('Cannot determine the type of User-defined column "%s" by its value (%s)!' % (k, v))

	return result

def fillDyncol_old(cur, kws, tbnm, rec_id, sep=sep_dyncol):
	tbnm = esc_sql.sub(r'\\\1', tbnm) #tbnm.replace('"','\\"')
	for item in filter(lambda a:a, map(lambda a:a.strip(), sep.split(kws))):
		kv = map(lambda a:a.strip(), item.split('=') )
		if len(kv) != 2: 
			print kv, 'is a invalid "column_value = value" pair, skipped!'
			continue # maybe a warning is better here
		k, v = kv
		if not k or not v: 
			print kv, 'is a invalid "column_value = value" pair, skipped!'
			continue # maybe a warning is better here
		
		# check dyncoldef first
		# Now here we may check/determin the type of the new defined column
		k = esc_sql.sub(r'\\\1', k) #k = k.replace('"','\\"')
		v = esc_sql.sub(r'\\\1', v) #v = v.replace('"','\\"')
		cur.execute('SELECT tb_name FROM %s.dyncoldef WHERE tb_name="%s" AND col_name="%s"' % (CUR_DB, tbnm, k) )
		if not cur.rowcount:
			cur.execute('INSERT INTO %s.dyncoldef (tb_name, col_name) VALUES ("%s", "%s")' % (CUR_DB, tbnm, k) ) 

		cur.execute('INSERT INTO %s.dyncol (tb_name, col_name, value, rec_id) VALUES ("%s", "%s", "%s", %d)' % (CUR_DB, tbnm, k, v, rec_id) ) 
	cur.connection.commit()

def fillDyncol(cur, kws, tbnm, rec_id, col_dic):
	'''
	Store the user-added columns of one record.

	kws is parsed with parseDyncols (which may add new columns to col_dic);
	only the accepted "name = value" pairs are stored, parse errors and
	warnings are ignored here. Every column is registered in CUR_DB.dyncoldef
	with its type when it is not there yet, and the value is inserted into
	CUR_DB.dyncol in the value_str / value_int / value_float column that
	matches the type. Columns of any other type are registered but get no
	value row. Commits on cur.connection at the end.
	'''
	# INSERT statement per column type; only the string value is quoted
	insert_tpl = {
		'string': 'INSERT INTO %s.dyncol (col_id, value_str, rec_id) VALUES (%d, "%s", %d)',
		'int': 'INSERT INTO %s.dyncol (col_id, value_int, rec_id) VALUES (%d, %s, %d)',
		'float': 'INSERT INTO %s.dyncol (col_id, value_float, rec_id) VALUES (%d, %s, %d)',
	}
	select_tpl = 'SELECT id FROM %s.dyncoldef WHERE tb_name="%s" AND col_name="%s"'

	tbnm = esc_sql.sub(r'\\\1', tbnm)
	parsed = parseDyncols(kws, col_dic)
	for name, value in parsed['kv'].items():
		tp = col_dic.get(name, 'string')
		name = esc_sql.sub(r'\\\1', name)
		value = esc_sql.sub(r'\\\1', value)
		# check dyncoldef first
		cur.execute(select_tpl % (CUR_DB, tbnm, name) )
		if not cur.rowcount:
			cur.execute('INSERT INTO %s.dyncoldef (tb_name, col_name, col_type) VALUES ("%s", "%s", "%s")' % (CUR_DB, tbnm, name, tp) ) 
			cur.execute(select_tpl % (CUR_DB, tbnm, name) )
		col_id = cur.fetchone()[0]
		tpl = insert_tpl.get(tp)
		if tpl is not None:
			cur.execute(tpl % (CUR_DB, col_id, value, rec_id) ) 
	cur.connection.commit()
	
def fillTable(cur, tbnm, tbv, tbcol=None, chcol=None, idx_col='name', get='id', tb_nm_id=None, tbids={}, get_pos=False, skip_null=True, user_id=None, strip_str=True): # use strip_str to avoid some name start/end with blank character
	'''
	"cur" is mysql cursor, tbnm is table name, tbv is a list of lists to be filled into table, the first list is column names
	"tbcol" is columns to be filled in the table, chcol is columns with string value 
	"get" must be a valid table col name.
	return a dict if idx_col is available, 
	otherwise return a list of id
	'''
	if idx_col: ids = {}
	else: ids = []
	if not tbv or len(tbv) < 2: return get_pos and (ids, {}) or ids
	
	if tb_nm_id: pos = getIDs(cur, tbv, tbids, tb_nm_id, user_id=user_id)

	#colnames = tbv[0] = map(string.lower, tbv[0])
	# why set tbv[0] to lower? 
	colnames = map(string.lower, tbv[0])

	# find key words
	if 'keyword' in colnames:
		ikw = colnames.index('keyword')
		HAS_KW = True
		kwlock = ', %s.keyword LOW_PRIORITY WRITE' % (CUR_DB)
	else: 
		HAS_KW = False
		kwlock = ''
	if 'user_added_cols' in colnames:
		iusercol = colnames.index('user_added_cols')
		HAS_USERCOL = True
		usercollock = ', %s.dyncoldef LOW_PRIORITY WRITE, %s.dyncol LOW_PRIORITY WRITE' % (CUR_DB, CUR_DB)
		dyncol_type = dict(inquireDB('SELECT col_name, col_type FROM %s.dyncoldef WHERE tb_name="%s"' % (CUR_DB, tbnm), cursor=cur, fetch=True))
	else: 
		HAS_USERCOL = False
		usercollock = ''

	cur.execute('LOCK TABLES %s.%s LOW_PRIORITY WRITE%s%s' % (CUR_DB, tbnm, kwlock, usercollock))

	#cur.execute('LOCK TABLES %s.%s WRITE' % (CUR_DB, tbnm))
	if tbcol is None and chcol is None:
		tbcol, chcol = getColInfo(cur, tbnm, skip_id=True, lower=True, DB=CUR_DB)
	elif tbcol is None:
		tbcol = getColInfo(cur, tbnm, skip_id=True, lower=True, DB=CUR_DB)[0]
	elif chcol is None:
		chcol = getColInfo(cur, tbnm, skip_id=True, lower=True, DB=CUR_DB)[1]

	# !!!!!!!! All tables used by statements between the LOCK and UNLOCK must have been locked using the initial LOCK TABLES, including any alias you used !!!!!!!!!!!
	# otherwise, will got error: Table 'xxx' was not locked with LOCK TABLES
	# dyncol_type = dict(inquireDB('SELECT col_name, col_type FROM %s.dyncoldef WHERE tb_name="%s"' % (CUR_DB, tbnm), cursor=cur, fetch=True))

	if skip_null: # now it is always so.
		colnmSet = sets.Set(colnames)
		col_share = list(colnmSet & sets.Set(tbcol.keys()))
		col_share_idx = map(lambda a:colnames.index(a), col_share)
		col_share_idx_set = sets.Set(col_share_idx)
		chcol_share = list(colnmSet & sets.Set(chcol.keys()))
		chcol_share_idx = map(lambda a:colnames.index(a), chcol_share)
		sqls = esc_sql
		for line in tbv[1:]:
			vs = line[:]
			v_empty_idx = sets.Set(filter(lambda a:vs[a]=='NULL' or not vs[a], range(len(vs)) ))
			v_use_idx = list(col_share_idx_set - v_empty_idx)
			#map(lambda a:vs.__setitem__(a, '"' + vs[a].replace('"', '\\"') + '"'), chcol_share_idx)
			map(lambda a:vs.__setitem__(a, '"' + sqls.sub(r'\\\1', vs[a]) + '"'), chcol_share_idx)
			
			colnmStr = ', '.join(map(lambda a:colnames[a], v_use_idx))
			if strip_str: vsStr = ', '.join(map(lambda a:vs[a].strip(), v_use_idx))
			else: vsStr = ', '.join(map(lambda a:vs[a], v_use_idx))
			cur.execute('INSERT INTO %s.%s (%s) VALUES (%s)' % (CUR_DB, tbnm, colnmStr, vsStr) )
			if get: # should always be true
				cur.execute('SELECT MAX(%s) from %s.%s' % (get, CUR_DB, tbnm)) # or by "SELECT LAST_INSERT_ID()"
				if cur.rowcount > 0: # should always be true
					last_id = cur.fetchone()[0]
					if idx_col and (idx_col in colnames): ids[line[colnames.index(idx_col)]] = last_id
					else: ids.append(last_id)
					# fill table "keyword", "dyncoldef" and "dyncol"
					if HAS_KW and line[ikw].strip(): fillKeyword(cur, line[ikw], tbnm, last_id)
					if HAS_USERCOL and line[iusercol].strip(): fillDyncol(cur, line[iusercol], tbnm, last_id, dyncol_type)
		
	else: 	
		kw = tbcol.keys()
		kwstr = ', '.join(kw)
		if idx_col and idx_col not in tbcol: ids = [] 
		for line in tbv[1:]:
			cols = tbcol.copy()
			for k,v in zip(colnames, line):
				if cols.has_key(k): 
					# add '"' for char values
					if chcol.get(k, False): cols[k] = '"' + sqls.sub(r'\\\1', v) + '"' #'"' + v.replace('"', '\\"') + '"'
					else: cols[k] = v or 'NULL'
						
			if strip_str: cur.execute('INSERT INTO %s.%s (%s) VALUES (%s)' % (CUR_DB, tbnm, kwstr, ', '.join(map(lambda a:cols[a].strip(), kw))) )
			else: cur.execute('INSERT INTO %s.%s (%s) VALUES (%s)' % (CUR_DB, tbnm, kwstr, ', '.join(map(lambda a:cols[a], kw))) )
			if get: 
				cur.execute('SELECT MAX(%s) from %s.%s' % (get, CUR_DB, tbnm))
				if cur.rowcount > 0:
					last_id = cur.fetchone()[0]
					if idx_col and (idx_col in colnames): ids[line[colnames.index(idx_col)]] = last_id
					else: ids.append(last_id)
					# fill table "keyword", "dyncoldef" and "dyncol"
					if HAS_KW and line[ikw].strip(): fillKeyword(cur, line[ikw], tbnm, last_id)
					if HAS_USERCOL and line[iusercol].strip(): fillDyncol(cur, line[iusercol], tbnm, last_id, dyncol_type)
	
	cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	return get_pos and (ids, pos) or ids
	


def fillAll(tbs, file_location=data_dir, src_dir={}, cur=None, user_id=None):
	'''
	Fill all tables of a parsed submission into the database.

	tbs           -- {table name: table content} as returned by parseFile
	file_location -- passed to fillFileinfo as "location"
	src_dir       -- source directories; the entries 'probe' and 'intensity'
	                 are passed to fillPlatform / fillArray, the whole dict to
	                 fillFileinfo
	cur           -- DB cursor; when false, a connection and cursor are
	                 obtained from getConnectionCursor() and closed again
	                 before returning, on errors too
	user_id       -- numeric user id, or a user name (str) to be looked up;
	                 when None the 'user_name' column of the first project row
	                 is used if present

	Order: project first (many tables need its id), then sample and protocol,
	platform, array, sampxref, project_array, fileinfo.

	Returns {'success': False, 'info': message} when the user name is
	unknown; returns None otherwise.
	'''
	need_close = False
	dbcon = None
	if not cur:
		dbcon, cur = getConnectionCursor()
		need_close = True
	try:
		tbs_to_fill = ['sample', 'sampxref', 'protocol', 'project', 'project_array', 'array', 'fileinfo', 'filexref', 'platform']
		tbcols, char_cols = getColInfo(cur, tbs_to_fill, skip_id=True, lower=True, DB=CUR_DB)

		tbids = {}

		# get user_id
		tbprj = tbs.get('project', [[]])
		if type(user_id)==type('') or (user_id is None and len(tbprj) > 1 and 'user_name' in tbprj[0]):
			if type(user_id)==type(''): user_name = user_id
			else: user_name = tbprj[1][tbprj[0].index('user_name')].strip()
			if user_name:
				cur.execute('SELECT id FROM %s.%s WHERE user_name="%s"' % (USER_DB, tbl_user, user_name.replace('"','\\"')) )
				if not cur.rowcount:
					return {'success':False, 'info':'ERROR: no such user name -- %s! Please correct it.' % user_name}
				user_id = cur.fetchone()[0]

		# fill project first since many table needs its id
		fillProject(cur, tbs, tbids, tbcols, char_cols, user_id=user_id)

		# fill independent tables: sample & protocol
		for tbnm in ('sample', 'protocol'):
			tbv = tbs.get(tbnm, None)
			if not tbv: continue
			tb_nm_id = (('project', 'project', 'project_id'),)
			tbids[tbnm] = fillTable(cur, tbnm, tbv, tbcol=tbcols[tbnm], chcol=char_cols.get(tbnm, {}), tb_nm_id=tb_nm_id, tbids=tbids, user_id=user_id)

		# fill platform & probe & db & dbkw & dbxref
		fillPlatform(cur, tbs, tbids, tbcols, char_cols, user_id=user_id, src_dir=src_dir.get('probe',None))

		# fill array & intensity
		fillArray(cur, tbs, tbids, tbcols, char_cols, user_id=user_id, src_dir=src_dir.get('intensity',None))

		# fill sampxref
		fillSampxref(cur, tbs, tbids, tbcols, char_cols, user_id=user_id)

		# fill project_array
		fillProjectArray(cur, tbs, tbids, tbcols, char_cols, user_id=user_id)

		# fill fileinfo & filexref
		fillFileinfo(cur, tbs, tbids, tbcols, char_cols, location=file_location, user_id=user_id, src_dir=src_dir)
	finally:
		# Release what was opened here. The original skipped this on the
		# "no such user" return and whenever a fill step raised.
		if need_close:
			cur.close()
			dbcon.close()


def fillSampxref(cur, tbs, tbids, tbcols, char_cols, user_id=None):
	"""Insert the rows of the parsed 'sampxref' table via fillTable.

	Does nothing unless tbs['sampxref'] holds a header row plus at least one data row.
	Always returns None; the ids returned by fillTable are not kept.
	"""
	table = 'sampxref'
	rows = tbs.get(table, None)
	if rows and len(rows) >= 2:
		# reference descriptions handed to fillTable as tb_nm_id
		# (presumably: referenced table, name column(s), id column -- confirm against fillTable)
		xrefs = (
			('sample', 'sample', 'sample_id'),
			('array', ('array', 'identifier'), 'array_id'),
			('protocol', 'protocol_process', 'prot_proc_id'),
			('protocol', 'protocol_tech', 'prot_tech_id'),
		)
		fillTable(cur, table, rows, tbcol=tbcols[table], chcol=char_cols.get(table,{}), idx_col=None, get=None, tb_nm_id=xrefs, tbids=tbids, user_id=user_id)

def fillProject(cur, tbs, tbids, tbcols, char_cols, user_id=None):
	"""Insert the parsed 'project' table and store fillTable's result in tbids['project'].

	Before inserting, the 'release_date' column is passed through convertDate and a
	'submit_date' column is guaranteed: blank/'NULL' cells (or a missing column) are
	filled with today's date (GMT, YYYY-MM-DD). The rows in `tbs` are modified in place.
	Does nothing unless there is a header row plus at least one data row.
	"""
	table = 'project'
	rows = tbs.get(table, None)
	if not rows or len(rows) < 2:
		return

	# convert date format
	convertDate(rows, 'release_date')

	# add submit date
	date_col = 'submit_date'
	stamp = time.strftime('%Y-%m-%d', time.gmtime())
	header_low = [name.lower() for name in rows[0]]
	if date_col not in header_low:
		# column absent: append it and stamp every data row
		rows[0].append(date_col)
		for rec in rows[1:]:
			rec.append(stamp)
	else:
		# column present: normalise it, then stamp only the empty cells
		convertDate(rows, date_col)
		at = header_low.index(date_col)
		for rec in rows[1:]:
			if rec[at].strip() in ['', 'NULL']:
				rec[at] = stamp

	owner_ref = ((tbl_user, ('user_name', 'user_name'), 'user_id'),)
	# fill table and remember what fillTable returned for later loaders
	inserted = fillTable(cur, table, rows, tbcol=tbcols[table], chcol=char_cols.get(table,{}), tb_nm_id=owner_ref, tbids=tbids, get_pos=False, user_id=user_id)
	tbids[table] = inserted

def fillProjectArray_use_project(cur, tbs, tbids, tbcols, char_cols, user_id=None):
	"""Fill project_array from the 'array_id' cells of the parsed 'project' table.

	Needs both tbs['project'] and tbids['project']; returns silently otherwise.
	For every project row, the join_chs-separated entries of its 'array_id' cell are
	stripped, de-duplicated and sorted, and one (project_id, array_id) row is inserted
	per entry. Commits at the end.
	"""
	rows = tbs.get('project', None)
	project_ids = tbids.get('project', [])
	if not (rows and project_ids):
		return

	array_ref = (('array', ('array', 'identifier'), 'array_id'),)
	# getIDs returns the column positions used below ('name', 'array_id')
	col = getIDs(cur, rows, tbids, array_ref, user_id=user_id)
	insert_sql = 'INSERT INTO %s.project_array (project_id, array_id) VALUES (%s, %s)'
	for rec in rows[1:]:
		pid = str(project_ids[rec[col['name']]])
		pieces = [piece.strip() for piece in rec[col['array_id']].split(join_chs)]
		linked = list(sets.Set(pieces))
		linked.sort()
		for aid in linked:
			cur.execute(insert_sql % (CUR_DB, pid, aid))
	cur.connection.commit()
	
def fillProjectArray_use_array(cur, tbs, tbids, tbcols, char_cols, user_id=None):
	"""Fill project_array from the parsed 'array' table.

	Needs both tbs['array'] and tbids['array']; returns silently otherwise.
	For each array row, inserts (project_id cell, id looked up in tbids['array'] by the
	row's 'identifier' cell). Commits at the end.
	"""
	rows = tbs.get('array', None)
	id_by_identifier = tbids.get('array', {})
	if not (rows and id_by_identifier):
		return

	header = rows[0]
	at_identifier = header.index('identifier')
	at_project = header.index('project_id')
	for rec in rows[1:]:
		pid = str(rec[at_project])
		aid = id_by_identifier[rec[at_identifier]]
		cur.execute('INSERT INTO %s.project_array (project_id, array_id) VALUES (%s, %s)' % (CUR_DB, pid, aid))
	cur.connection.commit()

# Implementation used under the name fillProjectArray: the variant that reads the 'array' table.
fillProjectArray = fillProjectArray_use_array
		 
def fillFileinfo(cur, tbs, tbids, tbcols, char_cols, location=data_dir, user_id=None, src_dir={}, re_subdir=re.compile(r'^\d+$') ):
	"""Fill fileinfo & filexref and copy every listed file into the storage tree.

	cur       -- open database cursor (its connection is committed at the end).
	tbs       -- parsed tables; only tbs['fileinfo'] is used. It is modified in place:
	             the header column 'name' is renamed to 'raw_name' and the columns
	             'location' and 'name' are appended.
	tbids     -- passed to fillTable; on success tbids['fileinfo'] is set to a dict built
	             from each row's pos['name'] cell -> inserted id.
	tbcols, char_cols -- column information, indexed by table name.
	location  -- root directory of the storage tree; files go to <location>/<category>.
	user_id   -- passed through to fillTable.
	src_dir   -- dict: category -> directory holding the source files (category
	             'project' is looked up under the key 'prjanno'). Only read here.
	re_subdir -- pattern recognising numbered sub-folder names inside a category folder.

	Returns None. Does nothing unless there is a header row plus at least one data row.
	"""
	tbnm ='fileinfo'
	tbv = tbs.get(tbnm, None)
	if not tbv or len(tbv)<2: return
	
	import shutil
	# rename 'name' to 'raw_name', add column 'name'
	DT_LEN = 6 # width the id is zero-padded to inside the stored file name
	# category -> two-letter prefix of the stored file name
	nmdic = {'other':'AD', 'image':'IM', 'intensity':'DT', 'project':'PJ', 'platform':'PF', 'protocol':'PC', 'map':'MP'} # 'AD'-ArrayDb

	## update src_dir for platform.
	#if src_dir.get('probe', None) and not src_dir.has_key('platform'): src_dir['platform'] = src_dir['probe']

	# category -> sub-directory name (identity mapping over the known categories)
	locdic = dict(zip(nmdic.keys(), nmdic.keys()))
	idx_raw = tbv[0].index('name')
	idx_cat = tbv[0].index('category')
	tbv[0][idx_raw] = 'raw_name'
	tbv[0].extend(['location', 'name'])
	idx_nm, idx_loc = -1, -2 # NOTE(review): not referenced again in this function
	# add location
	#map(lambda a:a.append(['','']), tbv[1:])
	# make new name and copy to new location
	# The stored name embeds a number starting at MAX(fileinfo.id)+1 (1 for an empty table)
	# and incremented per file.
	# NOTE(review): presumably meant to equal the id the row receives on insert -- confirm.
	cur.execute('SELECT MAX(id) FROM %s.fileinfo' % CUR_DB)
	#current_id = cur.rowcount and cur.fetchone()[0]+1 or 1 
	if  cur.rowcount: 
		current_id = cur.fetchone()[0]
		if not current_id: current_id = 1
		else: current_id = current_id + 1
	else: current_id = 1
	for i in range(1, len(tbv)): 
		v = tbv[i]
		raw = v[idx_raw]
		rext = os.path.splitext(raw)[1] # don't consider compressed or not
		#v[idx_nm] = nm = nmdic.get(v[idx_cat].strip().lower(), nmdic['other']) + str(current_id).zfill(DT_LEN) + rext
		#v[idx_loc] = loc = os.path.join(location, locdic.get(v[idx_cat].strip().lower(), locdic['other']))
		file_cat = v[idx_cat].strip().lower()
		if src_dir:
			# prefix the raw name with the source directory registered for this category
			fpth = src_dir.get(file_cat == 'project' and 'prjanno' or file_cat, None)
			if fpth: raw = os.path.join(fpth, raw)

		# stored name = category prefix + zero-padded number + original extension;
		# unknown categories fall back to 'other'
		nm = nmdic.get(file_cat, nmdic['other']) + str(current_id).zfill(DT_LEN) + rext
		loc = os.path.join(location, locdic.get(v[idx_cat].strip().lower(), locdic['other']))

		# check file numbers
		# Fan-out: once a category folder holds n_max entries, files go into numbered
		# sub-folders (zero-padded to n_fill digits) of at most n_max_sub entries each.
		if os.path.exists(loc):
			n_max = 500
			n_max_sub = 1000
			n_fill = 6
			fns = os.listdir(loc)
			fdirs = filter(lambda a:re_subdir.match(a), fns) # folder names consist of digits. and names consist of digits should be folder!
			if fdirs: # has sub folder
				fmax = max(map(int, fdirs)) # find the last sub folder
				last_dir = str(fmax).zfill(n_fill)
				# last sub-folder is full: move on to the next number (created further below)
				if len(os.listdir(os.path.join(loc, last_dir))) >= n_max_sub: last_dir = str(fmax+1).zfill(n_fill)
				loc = os.path.join(loc, last_dir)
			elif len(fns) >= n_max: loc = os.path.join(loc, '1'.zfill(n_fill))

		# on Windows the backslashes of the path are doubled before it is stored in the row
		loc_sql = sys.platform == 'win32' and loc.replace('\\', '\\\\') or loc
		v.extend([loc_sql, nm])

		# copy file
		#shutil.copytree(raw, os.path.join(loc, nm))
		if not os.path.exists(loc): os.makedirs(loc)
		fnobj = os.path.join(loc, nm)
		shutil.copy(raw, fnobj)
		os.chmod(fnobj, 0774)
		current_id = current_id + 1

		# remove path from the raw name. We can do this since the file path info won't be use any more
		v[idx_raw] = os.path.split(raw)[1]
	
	
	tb_nm_id = (('project', 'project', 'project_id'), ('platform', 'platform', 'platform_id'), ('protocol', 'protocol', 'protocol_id'), ('array', ('array', 'identifier'), 'array_id'))
	# fill table
	# with get_pos=True fillTable returns a pair (ids, pos); pos is indexed by column name
	# below (presumably column name -> position in a row -- confirm against fillTable)
	ids, pos = fillTable(cur, tbnm, tbv, tbcol=tbcols[tbnm], chcol=char_cols.get(tbnm,{}), idx_col=None, tb_nm_id=tb_nm_id, tbids=tbids, get_pos=True, user_id=user_id)
	if ids:
		tbids[tbnm] = dict(zip(map(lambda a:a[pos['name']], tbv[1:]), ids))
		# fill filexref
		# one filexref row per (file, referenced table, referenced id); a cell may list
		# several ids separated by join_chs
		for i in range(1, len(tbv)):
			file_id = str(ids[i-1])
			info_code = tbv[i][pos['info_code']]
			# a missing/empty info_code is written as SQL NULL
			info_code = (info_code is None or info_code == '') and 'NULL' or str(info_code)
			for tb, nm, id in tb_nm_id:
				idnms = list(sets.Set(map(lambda a:a.strip(), tbv[i][pos[id]].split(join_chs))))
				tb = tb.replace('"','\\"')
				for idnm in idnms:
					#rlt = cur.execute('SELECT file_id FROM %s.filexref WHERE file_id=%s AND tb_id=%s AND tbname=%s' % (CUR_DB, file_id, idnm, tb))
					#if not rlt.rowcount:
					if idnm: cur.execute('INSERT INTO %s.filexref (file_id, tb_id, tbname, info_code) VALUES (%s, %s, "%s", %s)' % (CUR_DB, file_id, idnm, tb, info_code))
	cur.connection.commit()
				

	
def fillPlatform(cur, tbs, tbids, tbcols, char_cols, user_id=None, src_dir=None):
	"""Insert the parsed 'platform' table, then load the probe file of each platform.

	Does nothing unless tbs['platform'] has a header row plus at least one data row.
	When fillTable returns ids, tbids['platform'] becomes {platform name: id}, and if
	the table has a 'probe_file' column, fillProbe is called once per row with the
	row's id, its probe file name and `src_dir`. Returns None.
	"""
	table = 'platform'
	rows = tbs.get(table, None)
	if not rows or len(rows) < 2:
		return

	# fill table
	project_ref = (('project', 'project', 'project_id'),)
	new_ids = fillTable(cur, table, rows, tbcol=tbcols[table], chcol=char_cols.get(table,{}), tb_nm_id=project_ref, tbids=tbids, idx_col=None, user_id=user_id)
	if not new_ids:
		return

	header = rows[0]
	name_at = header.index('name')
	tbids[table] = dict(zip([rec[name_at] for rec in rows[1:]], new_ids))

	# fill probes
	if 'probe_file' not in header:
		return
	file_at = header.index('probe_file')
	for n in range(len(rows) - 1):
		# ids are in data-row order: new_ids[n] belongs to rows[n+1]
		fillProbe(cur, new_ids[n], rows[n + 1][file_at], src_dir)

def fillProbe_old(cur, platform_id, fn, src_dir=None):
	"""Older probe loader: read a probe table file and fill probe, db, dbkw and dbxref.

	cur         -- open database cursor; its connection is committed at the end.
	platform_id -- id written into a 'platform_id' column prepended to every row.
	fn          -- probe file name, joined onto `src_dir` when that is given.

	Returns the list of probe ids from fillTable, or [] when the file has no data rows.

	Every column of the file that is not a column of the probe table is treated as an
	external database name; each join_chs-separated keyword in such a cell is recorded
	in dbkw (created on demand) and linked to the probe through dbxref.
	"""
	if src_dir: fn = os.path.join(src_dir, fn)
	tbnm = 'probe'
	platform_id_str = str(platform_id)
	probe_ids = []
	# read the whole table, then release the file handle no matter what happens
	f = open(fn)
	try:
		lines = readTable(f, skip_blank=True)
	finally:
		f.close()
	if len(lines) < 2: return probe_ids
	for line in lines: line.insert(0, platform_id_str)
	lines[0][0] = 'platform_id'

	# fill table probe
	probe_ids = fillTable(cur, tbnm, lines, idx_col=None)

	# then fill the table db & dbkw & dbxref
	colnms = lines[0] # the first item is column name
	colsites = dict(zip(colnms, range(len(colnms))))
	tbcols, char_cols = getColInfo(cur, tbnm, skip_id=True, lower=True, DB=CUR_DB)
	# (column name, position) pairs of the columns that are not probe-table columns;
	# a list of pairs rather than a dict so that repeated column names are kept
	left_cols = [(nm, colsites[nm]) for nm in colnms if nm not in tbcols]

	# names already present in the db table
	cur.execute('SELECT name FROM %s.db' % CUR_DB)
	dbnms = {}
	for rec in cur.fetchall():
		nm = rec[0]
		# a CHAR column may come back as a set-like object; plain strings are used as is
		if type(nm) != type(''): nm = nm.pop()
		dbnms[nm] = True

	sqls = esc_sql
	for i in range(1, len(lines)):
		probe_id = probe_ids[i-1]
		for db, site in left_cols:
			vals = lines[i][site]
			if vals == 'NULL': continue
			kws = [kw for kw in [part.strip() for part in vals.split(join_chs)] if kw]
			if not kws: continue
			db_s = sqls.sub(r'\\\1', db)
			if db not in dbnms: # update db
				cur.execute('INSERT INTO %s.db SET name="%s"' % (CUR_DB, db_s))
				dbnms[db] = True
			for kw in kws:
				kw = sqls.sub(r'\\\1', kw)
				# NOTE(review): relies on cur.execute returning the number of rows found
				rlt = cur.execute('SELECT id FROM %s.dbkw WHERE dbname="%s" AND kw="%s"' % (CUR_DB, db_s, kw))
				if rlt:
					db_id = cur.fetchone()[0]
				else: # update dbkw, then read back the id of the new row
					cur.execute('INSERT INTO %s.dbkw (dbname, kw) VALUES ("%s", "%s")' % (CUR_DB, db_s, kw))
					cur.execute('SELECT id FROM %s.dbkw WHERE dbname="%s" AND kw="%s"' % (CUR_DB, db_s, kw))
					db_id = cur.fetchone()[0]
				# update dbxref, skipping links that already exist
				rlt = cur.execute('SELECT probe_id FROM %s.dbxref WHERE probe_id=%d and dbkw_id=%d' % (CUR_DB, probe_id, db_id))
				if not rlt:
					cur.execute('INSERT INTO %s.dbxref SET probe_id=%d, dbkw_id=%d' % (CUR_DB, probe_id, db_id))
	cur.connection.commit()

	return probe_ids

def ProbeType_by_str(s):
	"""Return 'affy' when `s` contains both 'Probe Set ID' and 'GeneChip Array', else 'tsv'."""
	for marker in ('Probe Set ID', 'GeneChip Array'):
		if marker not in s:
			return 'tsv'
	return 'affy'

def readAffyProbe(fp, head=''):
	"""Read an Affymetrix-style annotation table and reduce it to the known columns.

	fp   -- open, seekable file positioned at the header line of the table.
	head -- the header line, if the caller already has it; otherwise it is peeked
	        from `fp` (the file position is restored afterwards).

	The header decides the dialect: a line starting with '"' is a quoted file
	(comma separated when '","' occurs, otherwise `sep`); anything else is treated as
	the unquoted `sep`-separated style of GPL4685.annot, whose multi-value cells use
	'///' and '//' without surrounding blanks.

	Returns a list of rows (row 0 = header) holding only the columns named in
	`nm_dic` (renamed), an 'Affymetrix' copy of the probe id column when present,
	a 'GO' column when GO columns were found, and the grid columns
	'idx', 'block_row', 'block_col', 'row', 'col'. An empty table is returned as is.
	"""
	affy_join_chs, affy_sub_join_chs = join_chs, ' // '
	if not head:
		# peek at the header line, then step back so readTable sees it again
		head = fp.readline()
		fp.seek(-len(head), 1)
	if not head:
		delim = '"'
		sep_str = ','
	elif head[0] == '"': # traditional CSV files
		delim = '"'
		if '","' in head: sep_str = ','
		else: sep_str = sep
	else: # GPL4685.annot
		delim = ''
		sep_str = sep
		affy_join_chs, affy_sub_join_chs = '///', '//'

	lines = readTable(fp, skip_blank=True, sep_str=sep_str, delim=delim, NULL_str=['---'], NULL_fill='', autodelim=False)

	# GPL4685.annot ends with this token
	if lines and lines[-1][0] == '!platform_table_end': lines.pop()
	if not lines: return lines

	line0 = lines[0]
	# deal with GO terms: locate the GO columns and choose the matching id pattern
	i_GO = []
	for GO_name in ("Gene Ontology Biological Process", "Gene Ontology Cellular Component","Gene Ontology Molecular Function"):
		if GO_name in line0: i_GO.append(line0.index(GO_name))
	if i_GO:
		# these columns list bare 7-digit ids; two groups, so findall yields pairs
		GO_pat = re.compile(r'^(\d{7})|///\s(\d{7})')
		GO_prefix = 'GO:'
	else: # try new style annotation files, e.g. GPL4685.annot and exon arrays.
		GO_COLS = ("GO:Function", "GO:Component","GO:Process") # GPL4685.annot
		GO_COLS = GO_COLS + ('GO_biological_process', 'GO_cellular_component', 'GO_molecular_function') # exon arrays
		for i in range(len(line0)):
			if line0[i] in GO_COLS: i_GO.append(i)
		# ids already carry the 'GO:' prefix; one group, so findall yields plain strings
		GO_pat = re.compile(r'(GO:\d{7,})')
		GO_prefix = ''
	GO_col = ['GO']
	if i_GO:
		for line in lines[1:]:
			terms = sets.Set()
			for i in i_GO:
				for hit in GO_pat.findall(line[i]):
					# a pair comes from the two-group pattern: exactly one half is filled
					if type(hit) == type(()): hit = hit[0] or hit[1]
					terms.add(GO_prefix + hit)
			GO_col.append(join_chs.join(list(terms)))

	# change column names
	nm_dic =  {
			"ID":'unique_id', # GPL4685.annot
			# exon arrays
			"probeset_id":'unique_id', 'seqname':'chromosome', 
			"strand":'gene_strand', "start":'gene_start', "stop":'gene_end',
			"unigene":'UniGene',
			"Probe Set ID":'unique_id', "Gene Title":'gene_title', "Gene Symbol":'gene_symbol', "Chromosomal Location":'chromosome',
			"UniGene ID":'UniGene',		
			"Ensembl":"Ensembl", "Entrez Gene":'Entrez', "SwissProt":"SwissProt", "EC":"EC", "OMIM":"OMIM", 
			"Representative Public ID":'Representative Public ID', "RefSeq Protein ID":"RefSeq Protein ID",
			"RefSeq Transcript ID":"RefSeq Transcript ID", "FlyBase":"FlyBase", "AGI":"AGI", "WormBase":"WormBase", 
			"MGI Name":'MGI', "RGD Name":'RGD', "SGD accession number":'SGD', "Pathway":"Pathway", "InterPro":"InterPro", 
			"Trans Membrane":"Trans Membrane", "QTL":"QTL"
			}

	# match header names case-insensitively: add a lower-case alias for every key
	line0 = [name.lower() for name in line0]
	for k, v in list(nm_dic.items()): nm_dic[k.lower()] = v
	# filter columns here
	nm_idx = dict(zip(line0, range(len(line0))))
	nm_avail = list(sets.Set(nm_dic.keys()).intersection(line0))
	i_avail = [nm_idx[a] for a in nm_avail]
	# "Probe Set ID" will be used both for "unique_id" and "Affymetrix" column. # Perhaps should not add the "Affymetrix" column if Affymetrix ID is not good for matching.
	lines[0] = [nm_dic[a] for a in nm_avail]
	if 'probe set id' in line0:
		i_avail.append(nm_idx["probe set id"])
		lines[0].append('Affymetrix')
	elif 'id' in line0:
		i_avail.append(nm_idx["id"])
		lines[0].append('Affymetrix')

	def mapCell(s): # 1. remove sub-annotation after '//'; 2. remove blank item '---'
		items = [part.strip().split(affy_sub_join_chs)[0] for part in s.split(affy_join_chs)]
		return join_chs.join(sets.Set([b for b in items if b and b != '---']))
	for i in range(1, len(lines)):
		row = lines[i]
		lines[i] = [mapCell(row[a]) for a in i_avail]

	# add GO terms
	if i_GO:
		for i in range(len(lines)): lines[i].append(GO_col[i])

	# create columns for grid: every probe sits in block (1,1), row 1, column = its index
	lines[0].extend(('idx', 'block_row', 'block_col', 'row', 'col'))
	for i in range(1, len(lines)): lines[i].extend((str(i), '1','1','1',str(i)))
	
	return lines

def readOtherProbe(fp, cur):
	"""Read a generic (non-Affymetrix) probe table and normalise its columns.

	fp  -- open file positioned at the header line.
	cur -- database cursor, used to look up the allowed column names.

	The returned rows (row 0 = header) are guaranteed to contain 'idx', 'block_row',
	'block_col', 'row' and 'col' (synthesised when the file lacks them), and only
	columns that are probe-table columns or names returned by readOtherDBs; their
	names are restored to the spelling used there. An empty table is returned as is.
	"""
	lines = readTable(fp, skip_blank=True)
	if not lines: return lines
	n = len(lines)-1
	# check columns here. `head` is kept in step with lines[0] whenever a column is
	# inserted, because a position is looked up in it further down.
	head = [a.lower() for a in lines[0]]
	# change 'column' to 'col'
	if 'col' not in head and 'column' in head:
		at = head.index('column')
		lines[0][at] = head[at] = 'col'
	if 'block_col' not in head and 'block_column' in head:
		at = head.index('block_column')
		lines[0][at] = head[at] = 'block_col'

	# check unique_id
	if 'unique_id' not in head and 'id' in head: lines[0][head.index('id')] = 'unique_id'

	# check row and col: without them every probe goes to row 1, column = its index
	if 'col' not in head:
		lines[0].insert(0, 'col')
		head.insert(0, 'col')
		for i in range(1, n+1): lines[i].insert(0, str(i))
	if 'row' not in head:
		lines[0].insert(0, 'row')
		head.insert(0, 'row')
		for i in range(1, n+1): lines[i].insert(0, '1')

	# make block_row and block_col from block
	if 'block_row' not in head and 'block_col' not in head and 'block' in head:
		# guess grid
		ib = head.index('block')
		blocks = [int(a[ib]) for a in lines[1:]]
		if blocks and max(blocks) % 4 == 0:
			# assume the blocks are laid out four per block row
			blkrow = ['block_row'] + [str((b-1)//4+1) for b in blocks]
			blkcol = ['block_col'] + [str((b-1)%4+1) for b in blocks]
		else: # assume all in a single block
			blkrow = ['block_row'] + ['1']*n
			blkcol = ['block_col'] + ['1']*n
		for i in range(n+1):
			lines[i].pop(ib) # remove the block column first
			lines[i].insert(0, blkcol[i])
			lines[i].insert(0, blkrow[i])
	else:
		if 'block_col' not in head:
			lines[0].insert(0, 'block_col')
			for i in range(1, n+1): lines[i].insert(0, '1')
		if 'block_row' not in head:
			lines[0].insert(0, 'block_row')
			for i in range(1, n+1): lines[i].insert(0, '1')

	# add idx
	if 'idx' not in head:
		lines[0].insert(0, 'idx')
		for i in range(1, n+1): lines[i].insert(0, str(i))

	# get allow columns: columns in table probe and names returned by readOtherDBs
	cols_allowed = getColInfo(cur, 'probe', skip_id=True, lower=False, return_dic=False)[0]
	cols_allowed.extend(readOtherDBs(cur, lower=False))
	cols_allowed_lower = [a.lower() for a in cols_allowed]
	colsdic = dict(zip(cols_allowed_lower, cols_allowed))
	line0 = [a.lower() for a in lines[0]]
	nm_idx = dict(zip(line0, range(len(line0))))
	# compare lower-case with lower-case, so that allowed names written in mixed case
	# are not dropped
	nm_avail = list(sets.Set(cols_allowed_lower).intersection(line0))
	i_avail = [nm_idx[a] for a in nm_avail]
	# filter out useless columns
	for i in range(len(lines)):
		this_line = lines[i]
		lines[i] = [this_line[a] for a in i_avail]
	# recover the column name to normal case (instead of lowercase)
	lines[0] = [colsdic.get(a.lower(), a) for a in lines[0]]

	return lines

def readOtherDBsByFile(fn='DB_ALLOW.txt', lower=True):
	"""Return the database names listed in the text file `fn`, one per line.

	For each line, everything from the first ':' on is dropped, the rest is stripped,
	lower-cased when `lower` is true and passed through unescape. Lines that end up
	empty are left out.
	"""
	def getName(s, lower=lower):
		if ':' in s: s = s[:s.index(':')]
		s = s.strip()
		if lower: s = s.lower()
		return unescape(s)
	# close the file explicitly instead of leaving it to garbage collection
	f = open(fn)
	try:
		raw_lines = f.readlines()
	finally:
		f.close()
	return [nm for nm in [getName(s) for s in raw_lines] if nm]

def readOtherDBs(cur, lower=True):
	"""Return the non-empty names stored in USER_DB.refdbs, ordered by name.

	cur   -- database cursor handed to inquireDB.
	lower -- when true, every name is lower-cased.
	"""
	rows = inquireDB('SELECT name FROM %s.refdbs ORDER BY name' % USER_DB, cursor=cur, fetch=True)
	names = []
	for row in rows:
		name = row[0]
		if lower:
			name = name.lower()
		names.append(name)
	# drop empty names only after the whole result has been converted
	return [name for name in names if name]

def readProbe(fn, cur, platform_id=None):
	"""Read the probe file `fn` into a list of rows (row 0 = header).

	fn          -- path of the probe file.
	cur         -- database cursor, needed by the non-Affymetrix reader.
	platform_id -- when not None, a leading 'platform_id' column holding this value
	               (as a string) is added to every row.

	ProbeFileType(fn) supplies the file type and the number of leading lines to skip
	before the table starts. Returns [] when the file has no data rows.
	"""
	pf_type, annotation_ln = ProbeFileType(fn)
	f = open(fn)
	# close the file even when one of the readers raises
	try:
		# skip the lines that precede the table
		for i in range(annotation_ln): f.readline()
		if pf_type == 'affy':
			lines = readAffyProbe(f)
		else:
			lines = readOtherProbe(f, cur)
	finally:
		f.close()
	if len(lines) < 2: return []

	if platform_id is not None:
		platform_id_str = str(platform_id)
		for line in lines: line.insert(0, platform_id_str)
		lines[0][0] = 'platform_id'
	return lines

def chkXpf(cur, sql, xpf_id=None):
	"""Run `sql` to collect xpf ids and collapse them, together with `xpf_id`, onto the smallest.

	cur    -- database cursor; its connection is committed when rows were found.
	sql    -- query whose first result column is an xpf_id.
	xpf_id -- the id found so far, or None.

	Returns `xpf_id` unchanged when the query finds nothing. Otherwise returns the id
	that survives; probes carrying any of the other ids are repointed to it and the
	other ids are deleted from the xpf table.
	"""
	cur.execute(sql)
	if not cur.rowcount:
		return xpf_id

	found = [row[0] for row in cur.fetchall()]
	found.sort()
	# choose the survivor; whatever is left in `found` afterwards gets merged into it
	if xpf_id is None or xpf_id in found:
		xpf_id = found.pop(0)
	elif xpf_id > found[0]:
		found.append(xpf_id)
		xpf_id = found.pop(0)

	if found:
		losers = ','.join([str(one) for one in found])
		cur.execute('UPDATE %s.probe SET xpf_id=%d WHERE xpf_id IN (%s)' % (CUR_DB, xpf_id, losers))
		cur.execute('DELETE FROM %s.xpf WHERE id IN (%s)' % (CUR_DB, losers))
	cur.connection.commit()
	return xpf_id

def updateTable(cur, tbnm, ids, lines):
	"""Placeholder with no implementation: ignores its arguments and returns None."""
	return None

def updateProbe(cur, platform_id, fn, src_dir=None, bind_idx_id=False):
	"""Update the probes of an existing platform from a new probe file.

	cur         -- database cursor (one is obtained from getCursor when falsy).
	platform_id -- numeric platform id, or the platform name (str) to be looked up.
	fn          -- probe file name, joined onto `src_dir` when that is given.
	bind_idx_id -- when true, the (idx, unique_id) pairs in the file must equal those in
	               the database; otherwise only the set of idx values must match.

	Returns {'success': True}, or {'success': False, 'info': [message]} when nothing
	was changed because a consistency check failed.

	Steps: verify that the probe count agrees between file, platform row and probe
	table; update the changed cells of the directly stored probe columns; reset the
	xpf ids of the platform and remove its cross references; rebuild them through
	fillProbeXref; finally replace the stored copy of the platform file, if one is
	registered.
	"""
	if not cur: cur = getCursor()
	sqls = esc_sql
	if type(platform_id) == type(''):
		# resolve a platform name to its id
		platform_id =  inquireDB('SELECT id FROM %s.platform WHERE name="%s"' % (CUR_DB, sqls.sub(r'\\\1', platform_id)), cursor=cur, fetch=True)
		if platform_id: platform_id = platform_id[0][0]
	if not platform_id: return {'success':False, 'info':['No platform matching, did nothing!']}
	#read file
	if src_dir: fn = os.path.join(src_dir, fn)
	lines = readProbe(fn, cur, platform_id)

	# check file - probe number
	cur.execute('SELECT probe_num FROM %s.platform WHERE id=%d' % (CUR_DB, platform_id))
	if cur.rowcount != 1: return {'success':False, 'info':['Found %d matching platform, did nothing!' % cur.rowcount]}
	probe_num = cur.fetchone()[0]
	if len(lines)-1 != probe_num: return {'success':False, 'info':['Probe number doesn\'t match (expecting %d but got %d), did nothing!' % (probe_num, len(lines)-1)]}

	# find int_cols and fixed_cols from the table structure
	auto_cols = tbcols_auto['probe'] + ['idx']
	col_defs = getColDef(cur, 'probe', DB=CUR_DB)
	int_cols = [d[0] for d in col_defs if tb_col_int.match(d[1])]
	fixed_cols = list(sets.Set([d[0] for d in col_defs]).difference(auto_cols))

	j_col = dict(zip(fixed_cols, range(1, 1+len(fixed_cols)))) # since 0 is for id in recs later
	recs = inquireDB('SELECT idx, id, %s, xpf_id, mapf_id FROM %s.probe WHERE platform_id=%d' % (', '.join(fixed_cols), CUR_DB, platform_id), cursor=cur, fetch=True)
	if not recs or len(recs) != probe_num: # probe number should match
		return {'success':False, 'info':['Probe number doesn\'t match between the tables "platform" and "probe" (expecting %d but got %d), did nothing!' % (probe_num, recs and len(recs) or 0)]}
	recs = dict([(r[0], r[1:]) for r in recs]) # now it is {idx:[id, ...fixed_cols..., xpf_id, mapf_id], ...}
	head_low = [name.lower() for name in lines[0]]
	i_col = dict(zip(head_low, range(len(head_low))))
	i_idx, i_unid = i_col['idx'], i_col['unique_id']
	j_unid = j_col['unique_id']
	if bind_idx_id:
		in_db = sets.Set([(idx, rec[j_unid]) for idx, rec in recs.items()])
		in_file = sets.Set([(int(line[i_idx]), line[i_unid]) for line in lines[1:]])
		if in_db != in_file:
			return {'success':False, 'info':['The column "idx" and/or "unique_id" changed, this is not allowed. Did nothing!']}  # all idx, unique_id should match
	else:
		if sets.Set(recs.keys()) != sets.Set([int(line[i_idx]) for line in lines[1:]]):
			return {'success':False, 'info':['The column "idx" changed, this is not allowed. Did nothing!']}  # all idx should match
	for k in list(j_col.keys()): # don't touch those columns not in the new file
		if k not in i_col: del j_col[k]

	probe_ids = []
	for line in lines[1:]:
		idx = int(line[i_idx])
		# compare data in file with those in database
		v_rec = recs[idx]
		probe_id = v_rec[0]
		probe_ids.append(probe_id)
		col_dif = []
		for col, j in j_col.items():
			if v_rec[j] is None and line[i_col[col]]=='NULL': continue
			if str(v_rec[j]) != line[i_col[col]]: col_dif.append(col)
		if col_dif: # update this record
			v_dif = []
			for k in col_dif:
				if k in int_cols:
					try: v_dif.append('%s=%d' % (k, int(line[i_col[k]])))
					except (ValueError, TypeError): v_dif.append('%s=NULL' % k) # invalid value will become NULL
				else: # string
					v_dif.append('%s="%s"' % (k, sqls.sub(r'\\\1', line[i_col[k]])))
			cur.execute('UPDATE %s.probe SET %s WHERE id=%d' % (CUR_DB, ', '.join(v_dif), probe_id))

	# set xpf_id to NULL
	cur.execute('UPDATE %s.probe SET xpf_id=NULL WHERE platform_id=%d' % (CUR_DB, platform_id))
	# clear xref for this platform, but mapf_id will be kept
	cur.execute('DELETE x.* FROM %s.xpf x LEFT JOIN %s.probe pb ON x.id=pb.xpf_id WHERE pb.xpf_id IS NULL' % (CUR_DB, CUR_DB))
	# clear dbxref
	cur.execute('DELETE x.* FROM %s.dbxref x LEFT JOIN %s.probe pb ON x.probe_id=pb.id WHERE pb.platform_id=%d' % (CUR_DB, CUR_DB, platform_id))
	# clear dbkw that not used
	cur.execute('DELETE k.* FROM %s.dbkw k LEFT JOIN %s.dbxref x ON k.id=x.dbkw_id WHERE x.dbkw_id IS NULL' % (CUR_DB, CUR_DB))
	# clear db
	cur.execute('DELETE d.* FROM %s.db d LEFT JOIN %s.dbkw k ON d.name=k.dbname WHERE k.dbname IS NULL' % (CUR_DB, CUR_DB))

	# check other columns for xref
	fillProbeXref(cur, platform_id, lines, probe_ids)

	# copy file over the stored copy registered for this platform
	fobj = inquireDB('SELECT id, name, raw_name, location FROM %s.filexref x, %s.fileinfo f WHERE x.tbname="platform" && x.tb_id=%d && x.file_id=f.id && f.category="platform"' % (CUR_DB, CUR_DB, platform_id), cursor=cur, fetch=True)
	if fobj:
		fid, nm, raw, loc = fobj[0]
		import shutil, stat
		if not os.path.exists(loc): os.makedirs(loc)
		fnobj = os.path.join(loc, nm)
		shutil.copy(fn, fnobj)
		# rwxrwxr-- (octal 774)
		os.chmod(fnobj, stat.S_IRWXU | stat.S_IRWXG | stat.S_IROTH)
		sql = 'UPDATE %s.fileinfo SET raw_name="%s" WHERE id=%d' % (CUR_DB, sqls.sub(r'\\\1', os.path.split(fn)[1]), fid)
		cur.execute(sql)
	else: # should do something too
		pass
	cur.connection.commit()
	return {'success':True}
	

def fillProbe(cur, platform_id, fn, src_dir=None):
	"""Load the probe file of one platform into the probe table and its cross references.

	cur         -- database cursor.
	platform_id -- id of the platform the probes belong to.
	fn          -- probe file name, joined onto `src_dir` when that is given.

	Returns [] when the file has no data rows; otherwise whatever fillProbeXref
	returns for the inserted probes.
	"""
	if src_dir: fn = os.path.join(src_dir, fn)
	tbnm = 'probe'
	lines = readProbe(fn, cur, platform_id)
	if len(lines) < 2: return []

	# fill table probe
	probe_ids = fillTable(cur, tbnm, lines, idx_col=None)

	# then fill the tables db & dbkw & dbxref and assign xpf ids
	return fillProbeXref(cur, platform_id, lines, probe_ids)

def fillProbeXref(cur, platform_id, lines, probe_ids, updating=False):
	"""Fill tables db, dbkw and dbxref for a batch of probes and set each probe's xpf_id.

	Arguments:
	cur -- database cursor; the work is committed through cur.connection.
	platform_id -- integer id of the platform the probes belong to.
	lines -- list of rows; lines[0] holds the column names (it must contain
		'unique_id'), every other row describes one probe.
	probe_ids -- ids of the probes; probe_ids[i-1] belongs to lines[i].
	updating -- accepted but not used by this function.

	Every column of `lines` that is not a column of table probe is handled as
	a reference database: its name is stored in table db, each of its values
	(split on join_chs) in table dbkw, and the probe is linked to the keyword
	in table dbxref.

	The xpf_id of a probe is looked up with chkXpf (defined elsewhere in this
	file), first by unique_id within the platform, then by gene_symbol, then
	through already known keywords of the columns listed in cols4xpf.  When
	no xpf_id is found a new row is inserted into table xpf.

	Returns probe_ids.
	"""
	# then fill the tables db & dbkw & dbxref
	colnms = lines[0] # the first item is column name
	colsites = dict(zip(colnms, range(len(colnms))))
	tbcols, char_cols = getColInfo(cur, 'probe', skip_id=True, lower=True, DB=CUR_DB)
	for k,v in tbcols.items(): tbcols[k.lower()] = v # add lower keys
	# Replicated column names are allowed, so take the position of every
	# column from enumerate(); a name -> index dict would map all replicates
	# to the last one.
	left_cols = []
	for site, colnm in enumerate(colnms):
		if colnm.lower() not in tbcols: left_cols.append((colnm, colnm.lower(), site))

	#cur.execute('LOCK TABLES %s.probe, %s.xpf, %s.db, %s.dbkw, %s.dbxref LOW_PRIORITY WRITE' % (CUR_DB, CUR_DB, CUR_DB, CUR_DB, CUR_DB))

	cur.execute('SELECT name FROM %s.db' % CUR_DB)
	dbnms = map(lambda a:((type(a[0]) == type('')) and [a[0]] or [a[0].pop()])[0], cur.fetchall()) # CHAR column is convert to sets.Set type
	dbnms = dict(zip(dbnms, [True]*len(dbnms)))

	# only these columns will be used for global search for xpf_id, while unique_id will be used for searching xpf_id within platform
	cols4xpf = sets.Set(map(lambda a:a[0].lower(), inquireDB('SELECT name FROM %s.refdbs WHERE update_xpf_id="yes"' % USER_DB, cursor=cur, fetch=True)))
	# !!! perhaps it is not good to use gene_symbol for cross-platform match since some researchers arbitrarily create gene_symbol !!!  Disable the following line if decide not to use it.
	cols4xpf.add('gene_symbol')

	i_unid = colsites['unique_id']
	has_gsym = 'gene_symbol' in colsites and 'gene_symbol' in cols4xpf
	if has_gsym:
		i_gsym = colsites['gene_symbol']

	cur.execute('SELECT MAX(id) FROM %s.xpf' % CUR_DB)
	xpf_max = cur.fetchone()
	xpf_max = xpf_max and xpf_max[0] or 0
	sqls = esc_sql
	tailEsc = re.compile(r'(^|[^\\]+)(\\\\)*\\$') # tailEsc.search(s)
	for i in xrange(1, len(lines)):
		line = lines[i]
		probe_id = probe_ids[i-1]
		# try to find xpf_id by gene_symbol, (gene_title) and unique_id in the same platform
		# check xpf_id for unique_id in the same platform
		# unique_id is escaped like the keywords below so that a quote in it cannot break the query
		unique_id = sqls.sub(r'\\\1', line[i_unid])
		sql = 'SELECT DISTINCT xpf_id FROM %s.probe WHERE platform_id=%d && unique_id="%s" && xpf_id>0 ORDER BY xpf_id ASC' % (CUR_DB, platform_id, unique_id)
		xpf_id = chkXpf(cur, sql)
		if has_gsym:
			gene_symbol =  line[i_gsym]
			sql = 'SELECT DISTINCT xpf_id FROM %s.probe WHERE gene_symbol="%s" && xpf_id>0 ORDER BY xpf_id ASC' % (CUR_DB, gene_symbol.replace('"', '\\"'))
			xpf_id = chkXpf(cur, sql, xpf_id)

		for db, dblower, site in left_cols:
			db_str = sqls.sub(r'\\\1', db) #db.replace('"', '\\"')
			vals = line[site]
			if vals == 'NULL': continue
			kws = filter(lambda b:b, map(lambda a:a.strip(), vals.split(join_chs)) )
			if not kws: continue
			kws = list(sets.Set(kws)) # remove replicate items
			if db not in dbnms: # update db
				cur.execute('INSERT INTO %s.db SET name="%s"' % (CUR_DB, db_str))
				dbnms[db] = True
			for kw in kws:
				kw_str = sqls.sub(r'\\\1', kw) #kw.replace('"', '\\"')
				if len(kw_str) > 255:
					kw_str = kw_str[:255]
					# do not leave a lone backslash at the end of the truncated keyword
					if tailEsc.search(kw_str): kw_str = kw_str[:-1]
				rlt = cur.execute('SELECT id FROM %s.dbkw WHERE dbname="%s" AND kw="%s"' % (CUR_DB, db_str[:55], kw_str))
				if rlt:
					db_id = cur.fetchone()[0]
					# find xpf_id for this db_id
					if dblower in cols4xpf: # only those in cols4xpf will be used for xpf_id
						sql = '''
							SELECT DISTINCT p.xpf_id FROM %s.probe p, %s.dbxref x 
							WHERE x.dbkw_id=%d && p.id=x.probe_id && p.xpf_id>0 
							ORDER BY p.xpf_id ASC
							''' % (CUR_DB, CUR_DB, db_id)
						xpf_id = chkXpf(cur, sql, xpf_id)
				else: # update dbkw
					cur.execute('INSERT INTO %s.dbkw (dbname, kw) VALUES ("%s", "%s")' % (CUR_DB, db_str, kw_str))
					cur.execute('SELECT id FROM %s.dbkw WHERE dbname="%s" AND kw="%s"' % (CUR_DB, db_str[:55], kw_str))
					db_id = cur.fetchone()[0]
				# update dbxref
				rlt = cur.execute('SELECT probe_id FROM %s.dbxref WHERE probe_id=%d and dbkw_id=%d' % (CUR_DB, probe_id, db_id))
				if not rlt:
					cur.execute('INSERT INTO %s.dbxref SET probe_id=%d, dbkw_id=%d' % (CUR_DB, probe_id, db_id))

		if xpf_id is None:
			# no related probe found: open a new cross-platform id
			cur.execute('INSERT INTO %s.xpf () VALUES ()' % CUR_DB )
			xpf_id = xpf_max = xpf_max + 1
		# update xpf_id in table probe
		cur.execute('UPDATE %s.probe SET xpf_id=%d WHERE id=%d' % (CUR_DB, xpf_id, probe_id) )

	#cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	return probe_ids

def convertDate(tbv, colname):
	"""Rewrite US-style dates in column `colname` of table `tbv` as yyyy-mm-dd.

	tbv[0] holds the column names; the other rows are changed in place.
	A cell is rewritten when it starts with mm-dd-yyyy or mm/dd/yyyy (the
	year must have 4 digits); other cells are left untouched.  Nothing is
	done when `colname` is not a column of the table.
	"""
	header = tbv[0]
	if colname in header:
		col = header.index(colname)
		us_date = re.compile(r'\s*(\d\d?)\s*[-/]\s*(\d\d?)\s*[-/]\s*(\d\d\d\d)\s*') # must have 4 digits for year
		for row in tbv[1:]:
			found = us_date.match(row[col])
			if found:
				month, day, year = found.groups()
				row[col] = '%s-%s-%s' % (year, month, day)
		
def fillArray_rpy(cur, tbs, tbids, tbcols, char_cols, user_id=None, src_dir=None):
	"""Fill table array, then load the intensity data of every array through rpy.

	Arguments:
	cur -- database cursor.
	tbs -- dict of tables; tbs['array'] is a list of rows whose first row
		holds the column names.  Nothing is done when it has no data row.
	tbids -- dict updated with tbids['array'] = {identifier: id} for the
		rows stored by fillTable.
	tbcols, char_cols -- column information per table, passed to fillTable.
	user_id -- passed to fillTable.
	src_dir -- passed to fillIntensity_rpy.

	The R interface is the module-level `rpy_r` when it exists and is set,
	otherwise it is created with initR().
	"""
	tbnm = 'array'
	tbv = tbs.get(tbnm, None)
	if not tbv or len(tbv)<2: return

	convertDate(tbv, 'hyb_date')

	tb_nm_id = (('project', 'project', 'project_id'), ('platform', 'platform', 'platform_id'), ('protocol', 'protocol_hyb', 'prot_hyb_id'), ('protocol', 'protocol_image', 'prot_img_id'), ('protocol', 'protocol_data', 'prot_data_id'))
	# fill table array
	ids, pos = fillTable(cur, tbnm, tbv, tbcol=tbcols[tbnm], chcol=char_cols.get(tbnm,{}), idx_col=None, tb_nm_id=tb_nm_id, tbids=tbids, get_pos=True, user_id=user_id)
	if ids: tbids[tbnm] = dict(zip(map(lambda a:a[pos['identifier']], tbv[1:]), ids))

	# then fill intensity data
	# The module-level initialisation of rpy_r may be disabled, so fall back
	# to initR() instead of failing with a NameError.
	r = globals().get('rpy_r', None)
	if r is None: r = initR()
	platform_dic = {} # contain probe_ids of related platforms
	for i in range(1, len(tbv)):
		platform_id = int(tbv[i][pos['platform_id']])
		data_type = ('data_type' not in pos) and 'intensity' or tbv[i][pos['data_type']]

		if platform_id not in platform_dic:
			# probes of a platform are read once and reused for all its arrays
			cur.execute('SELECT unique_id, id FROM %s.probe WHERE platform_id=%d ORDER BY id' % (CUR_DB, platform_id))
			rlt = cur.fetchall()
			platform_dic[platform_id] = {'unique_id':map(lambda a:a[0], rlt), 'probe_id':map(lambda a:a[1], rlt)}
		# need to consider multi-file data here
		fillIntensity_rpy(r, cur, ids[i-1], tbv[i][pos['intensity_file']], tbv[i][pos['intensity_format']], tbv[i][pos['channel_num']], platform_dic[platform_id]['unique_id'], platform_dic[platform_id]['probe_id'], data_type=data_type, src_dir=src_dir)
	
def initR():
	"""Import rpy, send its output to a function that discards it, source
	readIntensity.R (located next to this file) and return rpy.r."""
	import rpy
	def discard(text): pass
	rpy.set_rpy_output(discard)
	here = os.path.split(os.path.abspath(__file__))[0]
	script = os.path.join(here, 'readIntensity.R')
	rpy.r('source("%s")' % script)
	return rpy.r

#try: rpy_r = initR()
#except: rpy_r = None

def fillArray_expect(cur, tbs, tbids, tbcols, char_cols, user_id=None, src_dir=None):
	"""Fill table array, then load the intensity data of every array through an R child process.

	Arguments:
	cur -- database cursor.
	tbs -- dict of tables; tbs['array'] is a list of rows whose first row
		holds the column names.  Nothing is done when it has no data row.
	tbids -- dict updated with tbids['array'] = {identifier: id} for the
		rows stored by fillTable.
	tbcols, char_cols -- column information per table, passed to fillTable.
	user_id -- passed to fillTable.
	src_dir -- passed to fillIntensity_expect.

	R is started with pexpect ('R --vanilla') and is always closed again,
	also when loading one of the arrays raises an exception.
	"""
	tbnm = 'array'
	tbv = tbs.get(tbnm, None)
	if not tbv or len(tbv)<2: return

	convertDate(tbv, 'hyb_date')

	tb_nm_id = (('project', 'project', 'project_id'), ('platform', 'platform', 'platform_id'), ('protocol', 'protocol_hyb', 'prot_hyb_id'), ('protocol', 'protocol_image', 'prot_img_id'), ('protocol', 'protocol_data', 'prot_data_id'))
	# fill table array
	ids, pos = fillTable(cur, tbnm, tbv, tbcol=tbcols[tbnm], chcol=char_cols.get(tbnm,{}), idx_col=None, tb_nm_id=tb_nm_id, tbids=tbids, get_pos=True, user_id=user_id)
	if ids: tbids[tbnm] = dict(zip(map(lambda a:a[pos['identifier']], tbv[1:]), ids))

	# then fill intensity data
	import pexpect
	r = pexpect.spawn('R --vanilla')
	try:
		r.sendline('source("%s")' % os.path.join(os.path.split(os.path.abspath(__file__))[0], 'readIntensity.R'))
		platform_dic = {} # contain probe_ids of related platforms
		for i in range(1, len(tbv)):
			platform_id = int(tbv[i][pos['platform_id']])
			data_type = ('data_type' not in pos) and 'intensity' or tbv[i][pos['data_type']]

			if platform_id not in platform_dic:
				# probes of a platform are read once and reused for all its arrays
				cur.execute('SELECT unique_id, id FROM %s.probe WHERE platform_id=%d ORDER BY id' % (CUR_DB, platform_id))
				rlt = cur.fetchall()
				platform_dic[platform_id] = {'unique_id':map(lambda a:a[0], rlt), 'probe_id':map(lambda a:a[1], rlt)}
			# need to consider multi-file data here
			fillIntensity_expect(r, cur, ids[i-1], tbv[i][pos['intensity_file']], tbv[i][pos['intensity_format']], tbv[i][pos['channel_num']], platform_dic[platform_id]['unique_id'], platform_dic[platform_id]['probe_id'], data_type=data_type, src_dir=src_dir)
		r.sendline('q(save="no")')
	finally:
		# do not leave the R process behind when an array fails to load
		r.close()

def fillArray_pipe(cur, tbs, tbids, tbcols, char_cols, user_id=None, src_dir=None):
	"""Store the rows of tbs['array'] with fillTable, then load each array's
	intensity data with fillIntensity_pipe.

	tbs['array'] is a list of rows whose first row holds the column names;
	nothing is done when it is missing or has no data row.  When fillTable
	returns ids, tbids['array'] is set to a dict mapping the identifier of
	each row to its id.  user_id is passed to fillTable and src_dir to
	fillIntensity_pipe.
	"""
	tbnm = 'array'
	tbv = tbs.get(tbnm, None)
	if not tbv: return
	if len(tbv) < 2: return

	convertDate(tbv, 'hyb_date')

	tb_nm_id = (
		('project', 'project', 'project_id'),
		('platform', 'platform', 'platform_id'),
		('protocol', 'protocol_hyb', 'prot_hyb_id'),
		('protocol', 'protocol_image', 'prot_img_id'),
		('protocol', 'protocol_data', 'prot_data_id'),
	)
	# fill table array
	ids, pos = fillTable(cur, tbnm, tbv, tbcol=tbcols[tbnm], chcol=char_cols.get(tbnm,{}), idx_col=None, tb_nm_id=tb_nm_id, tbids=tbids, get_pos=True, user_id=user_id)
	if ids:
		i_ident = pos['identifier']
		tbids[tbnm] = dict(zip([row[i_ident] for row in tbv[1:]], ids))

	# then fill intensity data
	probes_of = {} # platform_id -> (unique ids, probe ids), read once per platform
	has_type = 'data_type' in pos
	for n, row in enumerate(tbv[1:]):
		platform_id = int(row[pos['platform_id']])
		if has_type: data_type = row[pos['data_type']]
		else: data_type = 'intensity'

		if platform_id not in probes_of:
			cur.execute('SELECT unique_id, id FROM %s.probe WHERE platform_id=%d ORDER BY id' % (CUR_DB, platform_id))
			found = cur.fetchall()
			probes_of[platform_id] = ([a[0] for a in found], [a[1] for a in found])
		# need to consider multi-file data here
		array_id = ids[n]
		int_file = row[pos['intensity_file']]
		int_format = row[pos['intensity_format']]
		channel_num = row[pos['channel_num']]
		unique_ids, probe_ids = probes_of[platform_id]
		fillIntensity_pipe(cur, array_id, int_file, int_format, channel_num, unique_ids, probe_ids, data_type=data_type, src_dir=src_dir)

# fillArray is the implementation used to fill table array and the intensity
# data.  The rpy- and pexpect-based variants are kept as disabled alternatives.
#fillArray = fillArray_rpy
#fillArray = fillArray_expect
fillArray = fillArray_pipe
	
def fillIntensity_rpy(r, cur, array_id, filename, format, channel_num, unique_id, probe_id, data_type='intensity', src_dir=None):
	# need to consider multi-file data here

	# create tmp file to store intensity table
	fntmp = os.tempnam()
	ftmp = open(fntmp, 'w')
	ftmp.close()
	os.chmod(fntmp, 0664) # to let MySQL read it.
	ftmp = open(fntmp)
	# get the current id number
	cur.execute('LOCK TABLES %s.intensity LOW_PRIORITY WRITE' % CUR_DB)
	cur.execute('DESCRIBE %s.intensity' % CUR_DB)
	db_colnms = map(lambda a:a[0], cur.fetchall())
	cur.execute('SELECT MAX(id) from %s.intensity' % CUR_DB)
	current_id = cur.fetchone()[0] or 0
	
	if src_dir:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join( map(lambda a:os.path.join(src_dir, a), filename.split(join_chs)) )+'")' + ', nrow=1)'
		else: filename = '"' + os.path.join(src_dir, filename) + '"'
	else:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join(filename.split(join_chs))+'")' + ', nrow=1)'
		else: filename = '"'+filename+'"'
		
	# rpy cannot use long integer, so use int to coerse them. may cause error if the number is beyond -2147483648 to 2147483647on 32-bit system
	r.assign('unique_id', unique_id)
	r.assign('probe_id', map(lambda a:int(a), probe_id))
	r.assign('db_colnms', db_colnms)
	#r('readIntensity(src=%s, obj="%s", fmt="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format, channel_num, array_id, current_id) )
	r('readIntensity(src=%s, obj="%s", fmt="%s", data_type="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format.lower(), data_type, int(channel_num), array_id, current_id) )

	rlt = None

	try:
		cur.execute('LOAD DATA INFILE "%s" INTO TABLE %s.intensity' % (fntmp.replace('"','\\"'), CUR_DB))
	except: 
		rlt = {'success':True, 'info':'Error (skipped) when LOAD DATA INFILE "%s" INTO TABLE intensity' % fntmp}
	cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	ftmp.close()
	os.unlink(fntmp)
	return rlt

def fillIntensity_expect(r, cur, array_id, filename, format, channel_num, unique_id, probe_id, data_type='intensity', src_dir=None):
	# need to consider multi-file data here

	# create tmp file to store intensity table
	fntmp = os.tempnam()
	ftmp = open(fntmp, 'w')
	ftmp.close()
	os.chmod(fntmp, 0664) # to let MySQL read it.
	ftmp = open(fntmp)
	# get the current id number
	cur.execute('LOCK TABLES %s.intensity LOW_PRIORITY WRITE' % CUR_DB)
	cur.execute('DESCRIBE %s.intensity' % CUR_DB)
	db_colnms = map(lambda a:a[0], cur.fetchall())
	cur.execute('SELECT MAX(id) from %s.intensity' % CUR_DB)
	current_id = cur.fetchone()[0] or 0
	
	fntmpR = os.tempnam()
	ftmpR = open(fntmpR, 'w')
	ftmpR.write('unique_id <- c(%s)\n' % str(unique_id)[1:-1] )
	ftmpR.write('probe_id <- c(%s)\n' % str(map(lambda a:int(a), probe_id))[1:-1])
	ftmpR.write('db_colnms <- c(%s)\n' % str(db_colnms)[1:-1])
	ftmpR.close()
	os.chmod(fntmpR, 0664) # to let MySQL read it.
	ftmpR = open(fntmpR)

	if src_dir:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join( map(lambda a:os.path.join(src_dir, a), filename.split(join_chs)) )+'")' + ', nrow=1)'
		else: filename = '"' + os.path.join(src_dir, filename) + '"'
	else:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join(filename.split(join_chs))+'")' + ', nrow=1)'
		else: filename = '"'+filename+'"'
		
	# rpy cannot use long integer, so use int to coerse them. may cause error if the number is beyond -2147483648 to 2147483647on 32-bit system
	#fn = file('/home/xxia/temp/test_filldbs.log', 'w'); fn.close()
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('Expect *\n'); fn.close()
	r.expect('.*')
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('unique_id <- c(%s)\n' % str(unique_id)[1:-1] ); fn.close()
	r.sendline('source("%s")' % fntmpR)
	#r.sendline('unique_id <- c(%s)' % str(unique_id)[1:-1] ) # too long for expect #r.assign('unique_id', unique_id) 
	#r.expect('.*')
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('probe_id <- c(%s)\n' % str(map(lambda a:int(a), probe_id))[1:-1]); fn.close()
	#r.sendline('probe_id <- c(%s)' % str(map(lambda a:int(a), probe_id))[1:-1] ) #r.assign('probe_id', map(lambda a:int(a), probe_id))
	#r.expect('.*> ')
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('db_colnms <- c(%s)\n' % str(db_colnms)[1:-1]); fn.close()
	#r.sendline('db_colnms <- c(%s)' % str(db_colnms)[1:-1] ) #r.assign('db_colnms', db_colnms)
	r.expect('.*> ')
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('readIntensity(src=%s, obj="%s", fmt="%s", channel_num=%d, array_id=%d, current_id=%d)\n' % (filename, fntmp, format.lower(), int(channel_num), array_id, current_id)); fn.close()
	#r('readIntensity(src=%s, obj="%s", fmt="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format, channel_num, array_id, current_id) )
	#r('readIntensity(src=%s, obj="%s", fmt="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format.lower(), int(channel_num), array_id, current_id) )
	r.sendline('readIntensity(src=%s, obj="%s", fmt="%s", data_type="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format.lower(), data_type, int(channel_num), array_id, current_id) )
	r.expect('.*> ')
	#fn = file('/home/xxia/temp/test_filldbs.log', 'a'); fn.write('5\n'); fn.close()

	rlt = None

	try:
		cur.execute('LOAD DATA INFILE "%s" INTO TABLE %s.intensity' % (fntmp.replace('"','\\"'), CUR_DB))
	except: 
		rlt = {'success':True, 'info':'Error (skipped) when LOAD DATA INFILE "%s" INTO TABLE intensity' % fntmp}
	cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	ftmp.close()
	ftmpR.close()
	os.unlink(fntmp)
	os.unlink(fntmpR)
	return rlt

import tempfile
def fillIntensity_pipe(cur, array_id, filename, format, channel_num, unique_id, probe_id, data_type='intensity', src_dir=None):
	# need to consider multi-file data here

	# create tmp file to store intensity table
	ftmp = tempfile.NamedTemporaryFile('r')
	fntmp = ftmp.name
	#fntmp = os.tempnam()
	#ftmp = open(fntmp, 'w')
	#ftmp.close()
	os.chmod(fntmp, 0664) # to let MySQL read it.
	#ftmp = open(fntmp)
	# get the current id number
	cur.execute('LOCK TABLES %s.intensity LOW_PRIORITY WRITE' % CUR_DB)
	cur.execute('DESCRIBE %s.intensity' % CUR_DB)
	db_colnms = map(lambda a:a[0], cur.fetchall())
	cur.execute('SELECT MAX(id) from %s.intensity' % CUR_DB)
	current_id = cur.fetchone()[0] or 0
	
	#fntmpR = os.tempnam() # temp file to pass parameters to R
	#ftmpR = open(fntmpR, 'w')
	ftmpR = tempfile.NamedTemporaryFile('w')
	fntmpR = ftmpR.name
	ftmpR.write('unique_id <- c(%s)\n' % str(unique_id)[1:-1] )
	ftmpR.write('probe_id <- c(%s)\n' % str(map(lambda a:int(a), probe_id))[1:-1])
	ftmpR.write('db_colnms <- c(%s)\n' % str(db_colnms)[1:-1])
	#ftmpR.close()
	ftmpR.flush()
	os.chmod(fntmpR, 0664) # to let MySQL read it.
	#ftmpR = open(fntmpR)


	if src_dir:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join( map(lambda a:os.path.join(src_dir, a), filename.split(join_chs)) )+'")' + ', nrow=1)'
		else: filename = '"' + os.path.join(src_dir, filename) + '"'
	else:
		if filename.find(join_chs) >= 0: # fm should be "imagene"
			filename = 'matrix(c("' + '", "'.join(filename.split(join_chs))+'")' + ', nrow=1)'
		else: filename = '"'+filename+'"'

	#open('/tmp/filenames.txt','a').write(filename+'\n')

	#from pipeR import pipeR
	freadint = os.path.join(os.path.split(os.path.abspath(__file__))[0], 'readIntensity.R')
	if sys.platform == 'win32':
		fntmpR = fntmpR.replace('\\', '\\\\')
		filename = filename.replace('\\', '\\\\')
		fntmp = fntmp.replace('\\', '\\\\')
		freadint = freadint.replace('\\', '\\\\')
	CMDS = ['source("%s")' % fntmpR]
	CMDS.append('source("%s")' % freadint)
	CMDS.append('readIntensity(src=%s, obj="%s", fmt="%s", data_type="%s", channel_num=%d, array_id=%d, current_id=%d)' % (filename, fntmp, format.lower(), data_type, int(channel_num), array_id, current_id) )
	pipeR(CMDS=CMDS, init_lam=False)

	rlt = None

	try:
		cur.execute('LOAD DATA INFILE "%s" INTO TABLE %s.intensity' % (fntmp.replace('"','\\"'), CUR_DB))
	except: 
		#rlt = {'success':True, 'info':'Error (skipped) when LOAD DATA INFILE "%s" INTO TABLE intensity' % fntmp}
		#raise
		# server may be not the localhost, try to save data by SQL overhere
		try:
			sql_mod = """INSERT INTO %s.intensity VALUES (%%s)""" % CUR_DB
			for line in open(fntmp):
				line = line.replace('\n', '').replace('\r', '').replace('\t', ', ') #DT = map(lambda a:a.split('\t', open(fntmp))
				line = sql_mod % line
				cur.execute(line)
		except:
			rlt = {'success':True, 'info':'Error (skipped) when INSERT data INTO TABLE intensity'}
	cur.connection.commit()
	cur.execute('UNLOCK TABLES')
	ftmp.close()
	ftmpR.close()
	#os.unlink(fntmp)
	#os.unlink(fntmpR)
	return rlt


def addCol(tb, col):
	"""Append column `col` to table `tb` in place and return `tb`.

	The first row receives the column name and every other row an empty
	string; an empty table is returned unchanged.
	"""
	if tb:
		tb[0].append(col)
		for row in tb[1:]:
			row.append('')
	return tb

def convertFileInfo(tbs, file_names, file_types, file_fmts, col, val, info=None, prj=None):
	"""Register files in table 'fileinfo' of `tbs` and link them to `val`.

	Arguments:
	tbs -- dict of tables; tbs['fileinfo'] is created when missing or empty.
	file_names, file_types, file_fmts -- strings holding one or more values
		joined with join_chs.  When fewer types or formats than names are
		given, the last type/format is used for the remaining files.
	col -- name of the fileinfo column that receives `val` (e.g. 'array').
	val -- value stored in column `col`; it is appended with join_chs when
		the file is already listed and the column is not empty.
	info -- stored as string in column 'info_code' of new rows ('NULL' when
		None or empty).
	prj -- project name; when None the name is taken from getPrjName(tbs).
		A non-empty project name is added to column 'project' of the file.

	Files named '' or 'NULL' are skipped.  Nothing is returned.
	"""
	head = ['name', 'category', 'format', 'project', 'platform', 'protocol', 'array', 'info_code']
	col = col.strip()
	val = val.strip()
	prj_nm = (prj is None) and getPrjName(tbs) or prj

	file_names = [a.strip() for a in file_names.split(join_chs)]
	file_types = [a.strip() for a in file_types.split(join_chs)]
	file_fmts = [a.strip() for a in file_fmts.split(join_chs)]
	if not file_names or not file_types or not file_fmts: return # fill only if all three values are present.
	lnm, lty, lfm = len(file_names), len(file_types), len(file_fmts)
	if lty < lnm: file_types.extend(file_types[-1:]*(lnm-lty)) # use the last items for extra files
	# pad by the number of missing formats (not of missing types), otherwise
	# zip() below silently drops the extra files
	if lfm < lnm: file_fmts.extend(file_fmts[-1:]*(lnm-lfm)) # use the last items for extra files
	tb_file = tbs.get('fileinfo', None)
	if not tb_file: tb_file = tbs['fileinfo'] = [head]
	tb_head = tb_file[0]
	if len(tb_file)>=2 and 'name' not in tb_head: return # needn't convert since it wouldn't pass chkCols
	for cl in head:
		if cl not in tb_head: addCol(tb_file, cl)
	tb_line = [''] * len(tb_head)
	idx_cols = dict(zip(tb_head, range(len(tb_head))))
	idx_col, idx_prj, idx_fn, idx_tp, idx_fmt, idx_info = idx_cols[col], idx_cols['project'], idx_cols['name'], idx_cols['category'], idx_cols['format'], idx_cols['info_code']
	# names of the files listed before this call; 'name' need not be the first column
	tb_fname = [a[idx_fn] for a in tb_file[1:]]
	info_str = (info is not None) and str(info) or 'NULL'
	tb_line[idx_info] = info_str
	for (file_name, file_type, file_fmt) in zip(file_names, file_types, file_fmts):
		if not file_name or file_name=='NULL': continue
		if file_name in tb_fname: # this is impossible for [project] since project is checked first.
			idx_row = tb_fname.index(file_name)+1
			v = tb_file[idx_row][idx_col].strip()
			if v: v = v + join_chs + val
			else: v = val
		else:
			new_row = tb_line[:]
			new_row[idx_fn] = file_name
			new_row[idx_tp] = file_type
			new_row[idx_fmt] = file_fmt
			tb_file.append(new_row)
			idx_row = -1 # the row just appended
			v = val
		tb_file[idx_row][idx_col] = v
		if prj_nm:
			vs = tb_file[idx_row][idx_prj].strip()
			if vs: vs = vs.split(join_chs)
			else: vs = []
			if prj_nm not in vs:
				vs.append(prj_nm)
				tb_file[idx_row][idx_prj] = join_chs.join(vs)
			
	
def convertPrj(cur, tbs):
	"""Complete the project table of `tbs` and name the arrays of the project.

	Arguments:
	cur -- database cursor, used to derive a project name from the highest
		project id when the name has to be generated.
	tbs -- dict of tables, modified in place.

	A project table is generated when tbs has none (or not exactly one data
	row), and a 'name' column is added when it is missing.  Files listed in
	column 'related_files' are passed to convertFileInfo.  Rows of table
	array without an identifier get one built from the project name.

	Returns a dict: empty, or with an 'info' message; for a table with more
	than one project it has 'success' set to False.
	"""
	str_file = 'related_files'
	v_type, v_fmt = 'project', 'text'
	def genPrjname():
		cur.execute('SELECT MAX(id) FROM %s.project' % CUR_DB)
		row = cur.fetchone()
		# MAX(id) is NULL (None) when table project is empty
		last_id = row and row[0] or 0
		return 'prj'+str(last_id+1).zfill(5)

	# check the name column
	rlt = {}
	if len(tbs.get('project', [])) > 2:
		return {'success':False, 'info':'Multi-project file, please use batch loading command'}
	if 'project' not in tbs or len(tbs['project']) != 2: # automatically generate the porject table
		rlt['info'] = 'project information is generated automatically!'
		tbs['project'] = tb = [['name'], [genPrjname()]]
	else: # tb should be a 2-line list
		tb = tbs['project']
		if 'name' not in tb[0]:
			tb[0].insert(0, 'name')
			tb[1].insert(0, genPrjname())
	# check related files
	head, val = tb
	if str_file in head:
		i_name, i_file = head.index('name'), head.index(str_file)
		v = val[i_file].strip()
		if v and v != 'NULL': convertFileInfo(tbs, v, v_type, v_fmt, 'project', val[i_name], info='1')
	# check/make identifiers in array for project (in order to create table "project_array")
	tb_array = tbs.get('array', [])
	if (len(tb_array)<2): return rlt
	head_a, val_a = tb_array[0], tb_array[1:]
	n_a = len(val_a)
	col_nm = 'identifier'
	prj_nm = val[head.index('name')]
	if col_nm not in head_a:
		head_a.append(col_nm)
		for row in val_a: row.append(None)
	col_i = head_a.index(col_nm)
	for i in range(n_a):
		if not val_a[i][col_i] or val_a[i][col_i]=='NULL': # make name for arrays
			NO = '_array_' + str(i+1).zfill(len(str(n_a)))
			# the project name is cut so that the identifier is at most 254 characters long
			val_a[i][col_i] = prj_nm[:(254-len(NO))] + NO
	return rlt

def convertPf(cur, tbs):
	"""Pass the files named in table platform to convertFileInfo.

	Columns 'probe_file' and 'related_files' of tbs['platform'] are scanned
	in that order; every value that is neither empty nor 'NULL' is registered
	for the platform of its row as a 'platform' file in 'text' format, with
	the project of the row when the table has a 'project' column.  `cur` is
	not used.  Always returns an empty dict.
	"""
	rlt = {}
	v_type, v_fmt = 'platform', 'text'
	# check related files
	tb = tbs.get('platform', [])
	if len(tb) < 2: return rlt
	head = tb[0] # lowered already
	if 'project' in head: i_prj = head.index('project')
	else: i_prj = None

	for file_col in ('probe_file', 'related_files'):
		if file_col not in head: continue
		i_name = head.index('name')
		i_f = head.index(file_col)
		for val in tb[1:]:
			v_f = val[i_f].strip()
			if not v_f or v_f == 'NULL': continue
			if i_prj is not None: prj = val[i_prj] or None
			else: prj = None
			convertFileInfo(tbs, v_f, v_type, v_fmt, 'platform', val[i_name], prj=prj)
	return rlt
	
def convertProt(cur, tbs):
	"""Pass the files named in column 'prot_file' of table protocol to convertFileInfo.

	Every value that is neither empty nor 'NULL' is registered for the
	protocol of its row as a 'protocol' file; the format is the lower-cased
	extension of the value, and the project of the row is passed on when the
	table has a 'project' column.  `cur` is not used.  Always returns an
	empty dict.
	"""
	rlt = {}
	v_type = 'protocol'
	# check related files
	tb = tbs.get('protocol', [])
	if len(tb) < 2: return rlt
	head = tb[0]
	i_prj = None
	if 'project' in head: i_prj = head.index('project')

	if 'prot_file' not in head: return rlt
	i_name = head.index('name')
	i_f = head.index('prot_file')
	for val in tb[1:]:
		v_f = val[i_f].strip()
		if not v_f or v_f == 'NULL': continue
		ext = os.path.splitext(v_f)[1]
		v_fmt = ext[1:].lower()
		if i_prj is not None: prj = val[i_prj] or None
		else: prj = None
		convertFileInfo(tbs, v_f, v_type, v_fmt, 'protocol', val[i_name], prj=prj)
	return rlt
	
def convertArray(cur, tbs):
	"""Derive the fileinfo, protocol, sample and sampxref tables from table array.

	Arguments:
	cur -- database cursor, used to read the columns of table sample.
	tbs -- dict of tables, modified in place; tbs['array'] must have an
		'identifier' column.

	Steps, in this order:
	1. intensity files and image files (columns image_file[_chN] with their
	   image_format[_chN]) are registered through convertFileInfo;
	2. cells of columns starting with 'protocol_' that hold a definition
	   "[name]: description" are replaced by the name and the definition is
	   appended to table protocol;
	3. for every column sample[_chN], rows that fill at least one sample
	   column of the same channel are appended to table sample;
	4. table sampxref is rebuilt with one row per sample and array.

	Returns an empty dict, or {'success': False, 'info': [messages]} when
	problems were found.  Nothing is done when table array has no data row.
	"""
	rlt = {}
	err_msg = []
	tb = tbs.get('array', [])
	if len(tb) < 2: return rlt

	# local helper: true when at least one item of x is true
	def any(x): return bool(x) and reduce(lambda a,b:a or b, x) or False

	head = tb[0]
	i_name = head.index('identifier')
	i_prj = (('project' in head) and [head.index('project')] or [None])[0]

	# check intensity file first
	str_file, str_fmt = 'intensity_file', 'intensity_format'
	file_type = 'intensity'
	if str_file in head and str_fmt in head:
		# the positions are looked up only when both columns exist
		i_file, i_fmt = head.index(str_file), head.index(str_fmt)
		for val in tb[1:]:
			vf = val[i_file].strip()
			if vf and vf != 'NULL': convertFileInfo(tbs, vf, file_type, val[i_fmt], 'array', val[i_name], prj=(i_prj is not None) and val[i_prj] or None)

	# check image files
	img_str = re.compile(r'image_file(_ch\d+)?\Z') # use match
	img_cols = filter(lambda a:img_str.match(a), head)
	suffix = map(lambda a:img_str.match(a).groups()[0] or '', img_cols)

	for suf in suffix:
		str_file, str_fmt = 'image_file'+suf, 'image_format'+suf
		if str_fmt not in head:
			err_msg.append(str_fmt+' is not offered')
			break
		i_file, i_fmt = head.index(str_file), head.index(str_fmt)
		file_type = 'image'
		if not suf: info = None
		else:
			# the channel number is stored as info code of the file
			try: chN = int(suf[3:])
			except:
				err_msg.append('Incorrect channel nubmer for image file')
				break
			if chN>0 and chN<100: info = str(chN)
			else:
				err_msg.append('Incorrect channel number: %d!' % chN)
				break
		if str_file in head and str_fmt in head:
			for val in tb[1:]:
				vf = val[i_file].strip()
				if vf and vf != 'NULL': convertFileInfo(tbs, vf, file_type, val[i_fmt], 'array', val[i_name], info=info, prj=(i_prj is not None) and val[i_prj] or None)

	# check for new protocols
	cols = filter(lambda a:not a.find('protocol_'), head) # columns starting with 'protocol_'
	idx = map(lambda a:head.index(a), cols)
	cat_str = re.compile(r'protocol_([a-zA-Z0-9]+)(_.*)*') # use match, groups()[0] get fetch result
	cats = map(lambda a:cat_str.match(a).groups()[0], cols)
	cat_map = {'hyb':'hybridization', 'tech':'technique'}
	cats = map(lambda a:cat_map.get(a,a), cats)
	idx_cat = zip(idx, cats)
	prot_head = ['name', 'category', 'description']
	tb_protocol = tbs.get('protocol', None) or tbs.setdefault('protocol', [prot_head] )
	if len(tb_protocol)==1 and 'name' not in tb_protocol[0]: addCol(tb_protocol, 'name')
	if 'name' in tb_protocol[0]:
		for col in ('category', 'description'):
			if col not in tb_protocol[0]: addCol(tb_protocol, col)
		prot_line = [''] * len(tb_protocol[0])
		prot_idx = dict(zip(tb_protocol[0], range(len(tb_protocol[0]))))
		for val in tb[1:]:
			for i, cat in idx_cat:
				v = val[i]
				vdf = def_str.match(v)
				if not vdf: continue # a plain protocol name, not a definition
				nm, nm_rest, content = vdf.groups()
				val[i] = nm
				if content is None: content = ''
				new_line = prot_line[:]
				new_line[prot_idx['name']] = nm
				new_line[prot_idx['category']] = cat
				new_line[prot_idx['description']] = content
				tb_protocol.append(new_line)

	# check for new sample. Any content in sample columns ("organism", "tissue", "gender", "age", "description") means a new sample definition
	nm_str = re.compile(r'sample(_ch\d+)?\Z') # use match
	nm_cols = filter(lambda a:nm_str.match(a), head)
	suffix = map(lambda a:nm_str.match(a).groups()[0] or '', nm_cols)
	if '' in suffix and len(suffix)>1: err_msg.append('Sample/channel is in confusion in Array section')

	tbcols, char_cols = getColInfo(cur, 'sample', skip_cols=['id', 'name', 'project_id'], lower=True, return_dic=False, DB=CUR_DB)
	cols_exclude = ['project_id']
	for col_exclude in cols_exclude:
		if col_exclude in tbcols: tbcols.remove(col_exclude)
		if col_exclude in char_cols: char_cols.remove(col_exclude)
	tb_samp = tbs.get('sample', None) or tbs.setdefault('sample', [['name'] + tbcols] )
	if len(tb_samp)==1 and 'name' not in tb_samp[0]: addCol(tb_samp, 'name')
	if 'name' in tb_samp[0]:
		samp_head = tb_samp[0]
		for col in tbcols:
			if col not in samp_head: addCol(tb_samp, col)
		tb_line = ['']*len(tb_samp[0])
		tbcol_sites = dict(zip(tb_samp[0], range(len(tb_samp[0]))))
		# len(suffix) should be equal to channel_num
		for suf in suffix:
			tbcols_suf = map(lambda a:a+suf, tbcols)
			headcols_suf = filter(lambda a:a in head, tbcols_suf)
			cols_j = map(lambda a:head.index(a), headcols_suf)
			i_samp = head.index('sample'+suf)
			n_suf = len(suf)
			# strip the channel suffix; a[:-n_suf] would give '' for an empty suffix
			headcols = map(lambda a:a[:len(a)-n_suf], headcols_suf)
			for val in tb[1:]:
				if not val[i_samp]: continue # sample number must be offered, then here should raise an error
				vs = map(lambda a:val[a], cols_j)
				if not any(vs): continue # not a new sample definition
				line = tb_line[:]
				for h, v in zip(headcols, vs): line[tbcol_sites[h]] = v
				# 'name' need not be the first column of a user-offered sample table
				line[tbcol_sites['name']] = val[i_samp]
				tb_samp.append(line)

	# check sampxref
	tbcols = ['sample', 'array', 'channel_no', 'dye', 'protocol_process', 'protocol_tech', 'protocol_label', 'exp_factor']
	tbs['sampxref'] = tb_xref = [tbcols[:]]
	tb_line = ['']*len(tbcols)
	tbcol_sites = dict(zip(tbcols, range(len(tbcols))))
	tbcols_head = tbcols[:]
	tbcols_head.remove('array')
	tbcols_head.remove('channel_no')
	i_array = head.index('identifier')
	for suf in suffix:
		tbcols_suf = map(lambda a:a+suf, tbcols_head)
		headcols_suf = filter(lambda a:a in head, tbcols_suf)
		cols_j = map(lambda a:head.index(a), headcols_suf)
		i_samp = head.index('sample'+suf)
		n_suf = len(suf)
		channel_No = suf[3:] # '' when the column has no channel suffix
		# strip the channel suffix; a[:-n_suf] would give '' for an empty suffix
		headcols = map(lambda a:a[:len(a)-n_suf], headcols_suf)
		for val in tb[1:]:
			if not val[i_samp]: continue # sample number must be offered, then here should raise an error
			vs = map(lambda a:val[a], cols_j)
			line =  tb_line[:]
			for h, v in zip(headcols, vs): line[tbcol_sites[h]] = v
			line[tbcol_sites['channel_no']] = channel_No
			line[tbcol_sites['array']] = val[i_array]
			tb_xref.append(line)

	if err_msg: return {'success':False, 'info':err_msg}
	return rlt

def getPrjName(tbs):
	"""Return the stripped project name from the "project" form, or ''.

	The name is only taken when the form consists of exactly two rows (a
	header plus one data row) and the header has a "name" column.
	"""
	project_tb = tbs.get('project', [])
	if len(project_tb) == 2:
		header, row = project_tb
		if 'name' in header:
			return row[header.index('name')].strip()
	return ''

def addPrjCol(tbs):
	"""Add a "project" column to the forms that do not have one yet.

	The project name is obtained with getPrjName(tbs).  When it is empty
	nothing is changed.  Otherwise, for each of the forms "sample",
	"protocol", "array", "fileinfo" and "platform" that has at least one data
	row and no "project" header, "project" is appended to the header and the
	project name is appended to every data row (in place).

	Always returns True.
	"""
	prj = getPrjName(tbs)
	if not prj: return True
	for tbnm in ('sample', 'protocol', 'array', 'fileinfo', 'platform'):
		tb = tbs.get(tbnm, [])
		if len(tb) < 2: continue
		if 'project' in tb[0]: continue
		tb[0].append('project')
		# plain loop instead of map(): map() used only for its side effect
		# builds a throwaway list and does nothing at all where map is lazy
		for row in tb[1:]:
			row.append(prj)
	return True

def convertUserCols(tbs, cur):
	"""Fold user-added columns into the "user_added_cols" column of each form.

	A header cell that starts with "[user_added]" (case-insensitive, after
	stripping) and is followed by a non-empty name is a user-added column.
	For every data row the non-empty values of these columns are rendered as
	"name=value", joined with "; " and stored in the "user_added_cols" cell
	(appended after "; " when that cell already has content).  If the form has
	no "user_added_cols" column, one is appended to the header.

	The forms are modified in place; the "[user_added]..." columns themselves
	are left untouched.  Rows shorter than the header are padded with '' so
	that every row has a cell under "user_added_cols" (previously rows without
	user values stayed one cell short and short rows raised IndexError).

	tbs -- dict: form name -> list of rows, the first row being the header.
	cur -- unused; kept so the call signature stays unchanged.
	"""
	user_str = 'user_added_cols'
	prefix = '[user_added]'
	for tbv in tbs.values():
		if len(tbv) < 2: continue
		head = tbv[0]
		# (name without the prefix, column position) of every user-added column
		user_cols = []
		for pos, colnm in enumerate(head):
			colnm = colnm.strip()
			if not colnm.lower().startswith(prefix): continue
			colnm = colnm[len(prefix):].strip()
			if colnm: user_cols.append((colnm, pos))
		if not user_cols: continue
		if user_str in head: idx = head.index(user_str)
		else:
			idx = len(head)
			head.append(user_str)
		width = len(head)
		for line in tbv[1:]:
			# keep the table rectangular so line[idx] and line[ui] always exist
			if len(line) < width: line.extend([''] * (width - len(line)))
			vs = []
			for uc, ui in user_cols:
				v = line[ui].strip()
				if v: vs.append('%s=%s' % (uc, v))
			if not vs: continue
			vs = '; '.join(vs)
			old = line[idx].strip()
			if old: line[idx] = '%s; %s' % (old, vs)
			else: line[idx] = vs

def convertTbs(tbs, cur=None, simplified=True):
	'''
	convert user offered 3-form (or 5-form: project, array, platform, sample, protocol) data into 7-form data.
	[array] is required, [project] is optional, [platform] is required only if new platform is used.
	one file for one project.

	tbs        -- dict of forms, converted in place.
	cur        -- database cursor; when omitted a connection is opened here and
	              closed again before returning (also on early return or error).
	simplified -- when true convertPrj is run before the other converters.

	Runs convertUserCols first, then the converters in order, merging each
	result with mergeInfo and stopping at the first failure; addPrjCol is only
	applied when all converters succeeded.
	Returns the merged result dict {'success':bool, 'info':[...]}.
	'''

	if not cur:
		dbcon, cur = getConnectionCursor()
		need_close = True
	else: need_close = False

	try:
		convertUserCols(tbs, cur)

		rlt = {'success':True, 'info':[]}
		# convert project, platform, array in order
		if simplified: converts = (convertPrj, convertPf, convertProt, convertArray)
		else: converts = (convertPf, convertProt, convertArray)
		for fun in converts:
			mergeInfo(rlt, fun(cur, tbs))
			if not rlt['success']: return rlt

		addPrjCol(tbs)
		return rlt
	finally:
		# release our own connection on every exit path
		if need_close:
			cur.close()
			dbcon.close()

def mergeInfo(rlt, rltmp):
	"""Merge the result dict rltmp into rlt (in place) and return rlt.

	Both are dicts like {'success':True, 'info':[] | ''}.  rltmp may also be
	None/empty, in which case rlt is returned unchanged.

	- A non-empty rltmp['info'] is added to rlt['info']: a list is extended
	  into it, anything else (normally a string) is appended as one item.
	  If rlt['info'] is not a list yet it is converted to one first; an empty
	  placeholder such as '' becomes [] rather than [''].
	- rlt['success'] is set to False when rltmp['success'] is false; it is
	  never set back to True.
	"""
	if not rltmp: return rlt
	infotmp = rltmp.get('info', [])
	if infotmp:
		info = rlt.setdefault('info', [])
		if not isinstance(info, list):
			if info: info = [info]
			else: info = []
			rlt['info'] = info
		if isinstance(infotmp, list): info.extend(infotmp)
		else: info.append(infotmp) # should be string
	if not rltmp.get('success', True):
		rlt['success'] = False
	return rlt


def getUser(cur, tbs, user_name=''):
	"""Return (user_name, user_id_str) for the submitting user.

	If user_name is empty/None it is taken from the "user_name" column of the
	first data row of the "project" form, when that exists.  The id is then
	looked up in USER_DB.tbl_user; user_id_str is str(id) when a record is
	found and the literal string 'NULL' otherwise (also when no user name is
	known, in which case no query is issued).

	The name is passed to the driver as a bound parameter instead of being
	spliced into the SQL text (the old code only escaped double quotes, so a
	backslash in the name could break or alter the statement).
	"""
	if user_name is None: user_name = ''
	if not user_name: # find it from project
		prj = tbs.get('project', [])
		# all user_name should be same in all lines in the form "project"
		if prj and len(prj) >= 2 and 'user_name' in prj[0]:
			user_name = prj[1][prj[0].index('user_name')].strip()
	if user_name:
		# %%s survives the first formatting as the driver's %s placeholder
		cur.execute('SELECT id FROM %s.%s WHERE user_name=%%s' % (USER_DB, tbl_user), (user_name,))
		if cur.rowcount > 0:
			return user_name, str(cur.fetchone()[0])
	return user_name, 'NULL'
			

def chkCols(cur, tbs, chkavail, user_name=None):
	''' check if all required names are available (and unique)

	cur       -- database cursor.
	tbs       -- dict: form name -> rows (first row is the header).
	chkavail  -- dict: form name -> {'necessary':[...], 'unique4user':[...],
	             'unique4all':[...]}; column names are lower-cased before
	             being looked up in the header.
	user_name -- passed to getUser() to find the user id.

	For every form with at least one data row this checks that
	  1. each "necessary" column exists and is non-empty in every row,
	  2. values of "unique4user"/"unique4all" columns do not repeat within
	     the form,
	  3. "unique4user" values do not exist yet in the database for this user,
	  4. "unique4all" values do not exist yet in the database at all.
	Values are sent to the database as bound parameters.  Cells missing from
	a short row are treated as empty.

	Returns the list of error messages (empty when everything is fine).  If a
	per-user lookup on a non-project form fails, the failing query is added
	to the list and the list is returned immediately.
	'''

	def cell(line, idx):
		# stripped value of column idx; '' when the row is too short
		if idx < len(line): return line[idx].strip()
		return ''

	error_msg = []
	# get user_id first
	user_name, user_id_str = getUser(cur, tbs, user_name)
	user_str = user_name and ('for user "%s" ' % user_name)
	user_id_str = (user_id_str == 'NULL') and ' IS NULL' or (' = ' +  user_id_str)

	for tb, tpnms in chkavail.items():
		tbv = tbs.get(tb, [])
		if not tbv or len(tbv)<2: continue
		head = tbv[0]
		body = tbv[1:]
		pos = dict(zip(head, range(len(head)) ))

		# check necessary items
		for colnm in tpnms.get('necessary', []):
			colnm = colnm.lower()
			idx = pos.get(colnm, None)
			if idx is None:
				error_msg.append('The form "%s" doesn\'t has the necessary column "%s"!' % (tb, colnm) )
				continue
			# check every line in tbv
			for i, line in enumerate(body):
				if not cell(line, idx): error_msg.append('The form %s doesn\'t has the necessary item "%s" in the line %d!' % (tb, colnm, i+1) )

		# check columns that should be unique in the input file (both unique4user and unique4all)
		uniq_cols = []
		for colnm in tpnms.get('unique4user', []) + tpnms.get('unique4all', []):
			if colnm not in uniq_cols: uniq_cols.append(colnm)
		for colnm in uniq_cols:
			colnm = colnm.lower()
			idx = pos.get(colnm, None)
			if idx is None: continue
			item_dic = {}
			for line in body:
				item = cell(line, idx)
				if item: item_dic[item] = item_dic.get(item, 0) + 1 # empty items are ignored
			for item, num in sorted(item_dic.items()):
				if num > 1: error_msg.append('The value in the column "%s" of form "%s" should be unique, but %d copies occurred for the value "%s"!' % (colnm, tb, num, item) )

		# check columns unique for the user.
		for colnm in tpnms.get('unique4user', []):
			colnm = colnm.lower()
			idx = pos.get(colnm, None)
			if idx is None: continue
			# %%s is left in the statement as the driver's value placeholder
			if tb == 'project':
				sql = 'SELECT %s FROM %s.%s WHERE %s=%%s AND user_id %s' % (colnm, CUR_DB, tb, colnm, user_id_str)
			else:
				sql = 'SELECT a.id FROM %s.%s AS a INNER JOIN %s.project AS p ON (a.project_id=p.id) WHERE a.%s=%%s AND p.user_id %s' % (CUR_DB, tb, CUR_DB, colnm, user_id_str)
			# check every line in tbv
			for i, line in enumerate(body):
				item = cell(line, idx)
				if not item: continue
				if tb == 'project':
					cur.execute(sql, (item,))
				else:
					try:
						cur.execute(sql, (item,))
					except Exception:
						error_msg.append('error in query: %s' % (sql % ('"%s"' % item)) )
						return error_msg
				if cur.rowcount > 0: error_msg.append('The item "%s" (Form: "%s", Line: %d, Column "%s") already exists %sin Database!' % (item, tb, i+1, colnm, user_str) )

		# check columns unique in the whole database.
		for colnm in tpnms.get('unique4all', []):
			colnm = colnm.lower()
			idx = pos.get(colnm, None)
			if idx is None: continue
			sql = 'SELECT %s FROM %s.%s WHERE %s=%%s' % (colnm, CUR_DB, tb, colnm)
			# check every line in tbv
			for i, line in enumerate(body):
				item = cell(line, idx)
				if not item: continue
				cur.execute(sql, (item,))
				if cur.rowcount: error_msg.append('The item "%s" (Form: "%s", Line: %d, Column: "%s") already exists in Database!' % (item, tb, i+1, colnm) )

	return error_msg
	
def chkRefs(cur, tbs, nm4id, user_name=None):
	''' check if all referenced names exist

	cur       -- database cursor.
	tbs       -- dict: form name -> rows (first row is the header).
	nm4id     -- dict: form name -> {column name: {'tb':referenced form,
	             'col':referenced column, 'this_user':bool (optional)}}.
	user_name -- passed to getUser() to find the user id.

	Every non-empty value of a referencing column must be defined either in
	the referenced form of this file or in the database.  With 'this_user'
	set the database search is limited to records of this user (except for
	the user table itself); otherwise all records are searched.  Values are
	sent to the database as bound parameters, in every branch.

	Returns a list of error messages; each missing (form, column, value) is
	reported once, with the 1-based data row where it was first seen.
	'''

	error_msg = []
	error_dic = {}
	idx_dic = {}

	# get user_name first
	user_name, user_id_str = getUser(cur, tbs, user_name)
	user_id_str = (user_id_str == 'NULL') and ' IS NULL' or (' = ' +  user_id_str)

	for tb, nmdic in nm4id.items():
		tbv = tbs.get(tb, [])
		if not tbv or len(tbv)<2: continue
		head = tbv[0]
		body = tbv[1:]
		pos_tb = idx_dic.setdefault(tb, dict(zip(head, range(len(head)) )) )

		# check name reference
		for nm, v in nmdic.items():
			nm = nm.lower()
			if nm not in head: continue
			tbref = v['tb']
			tbvref = tbs.get(tbref, [])
			col =  v['col']
			i_nm = pos_tb[nm]

			# values defined in this file, collected once per referencing column
			local_vals = set()
			if len(tbvref) >= 2:
				pos_tbref = idx_dic.setdefault(tbref, dict(zip(tbvref[0], range(len(tbvref[0])))) )
				i_ref = pos_tbref.get(col.lower(), None)
				if i_ref is not None:
					local_vals = set([a[i_ref] for a in tbvref[1:] if i_ref < len(a)])

			# statement used when the value is not defined in this file;
			# %%s is left in as the driver's value placeholder
			if v.get('this_user'):
				# tbref must be a form that connected to form project directly
				if tbref == tbl_user:
					SQL = 'SELECT %s FROM %s.%s WHERE %s=%%s' % (col, USER_DB, tbref, col)
				elif tbref == 'project':
					SQL = 'SELECT %s FROM %s.project WHERE %s=%%s AND user_id%s' % (col, CUR_DB, col, user_id_str)
				else:
					SQL = 'SELECT a.%s FROM %s.%s AS a, %s.project AS p WHERE a.%s=%%s AND a.project_id=p.id AND p.user_id%s' % (col, CUR_DB, tbref, CUR_DB, col, user_id_str)
			else: # must be all_user. Same to this_user AND tbl_user
				SQL = 'SELECT %s FROM %s.%s WHERE %s=%%s' % (col, tbref==tbl_user and USER_DB or CUR_DB, tbref, col)

			for i, line in enumerate(body): # check each line in the tbv
				if i_nm >= len(line): continue # short row: no value to check
				val = line[i_nm].strip()
				if not val: continue
				if val in local_vals: continue # found reference within this file
				# not exists in this file. Search database then.
				cur.execute(SQL, (val,))
				if not cur.rowcount:
					error_key = '.'.join([tbref, col, val])
					if error_key not in error_dic:
						error_dic[error_key] = True
						error_msg.append('No definition of "%s" in form "%s", referred in (Form: "%s", row: %d, colname: "%s")' % (val, tbref, tb, i+1, nm) )

	return error_msg

def chkValType(tb, cols, fun):
	"""Validate the values of selected columns of a table.

	tb   -- list of rows; tb[0] is the header (compared lower-cased).
	cols -- column names to check; names not found in the lower-cased header
	        are ignored.
	fun  -- callable taking a cell value; a false result marks the value bad.

	A row that is too short to have a given column is checked as if the cell
	were '' instead of raising IndexError.

	Returns a list of (row, col, colname, value) tuples for the bad cells,
	where row is the 1-based data row, col the 1-based column position and
	colname the lower-cased header name.
	"""
	head = [a.lower() for a in tb[0]]
	idx = [head.index(a) for a in cols if a in head]
	err_lns = [] # (row, col, colname, Val)
	for i, line in enumerate(tb[1:]):
		for j in idx:
			if j < len(line): val = line[j]
			else: val = ''
			if not fun(val): err_lns.append((i+1, j+1, head[j], val))
	return err_lns

def chkVals(cur, tbs):
	"""Check that cell values match the type of their database column.

	The columns to check come from getAllColType(cur), which is indexed here
	with the keys 'int', 'float', 'date', 'year' and 'enum'.  The first four
	are used as {form name: [column names]}; 'enum' as
	{form name: [(column name, allowed options), ...]}.

	Integer and float values are matched with dec_str / float_str.  Dates
	must look like year-month-day or month-day-year (any single non-digit
	separator; the month-first form needs a 4-digit year).  Years must be
	199x or 2xxx.  Enum values are compared case-insensitively with the
	allowed options.  The number, date and year patterns also accept '' and
	'NULL'.

	Forms without data rows are skipped.  Returns a list of error messages:
	one heading per (type, form) with problems followed by one line per bad
	cell.
	"""
	# US date must use 4-digit year
	date_s = re.compile(r'^$|^NULL$|^\d{1,4}\D((0?[1-9])|1[012])\D([0-2]?\d|3[01])$|^((0?[1-9])|1[012])\D([0-2]?\d|3[01])\D\d{4}$')
	year_s = re.compile(r'^$|^NULL$|^\s*(199\d|2\d{3})\s*$')

	tp_cols = getAllColType(cur)
	int_cols, float_cols, date_cols, enum_cols, year_cols = tp_cols['int'], tp_cols['float'], tp_cols['date'], tp_cols['enum'], tp_cols['year']
	tp_dic_fun = [('INTEGER', int_cols, dec_str.match), ('FLOAT', float_cols, float_str.match), ('DATE', date_cols, date_s.match), ('YEAR', year_cols, year_s.match)]
	cell_fmt = '   Line %d, cols %d (%s) - %s' # filled with a chkValType tuple

	error_msg = []
	for tp, tpdic, fun in tp_dic_fun:
		for tbnm, cols in tpdic.items():
			tb = tbs.get(tbnm, None)
			if not tb or len(tb)<2: continue
			err_i_j_v = chkValType(tb, cols, fun)
			if err_i_j_v:
				error_msg.append('Incorrect %s values in the %s definition:' % (tp, tbnm))
				error_msg.extend([cell_fmt % a for a in err_i_j_v])

	for tbnm, colnm_opts in enum_cols.items():
		tb = tbs.get(tbnm, None)
		if not tb or len(tb)<2: continue
		for colnm, opts in colnm_opts:
			opts = set([a.lower() for a in opts])
			err_i_j_v = chkValType(tb, [colnm], lambda a:a.lower() in opts)
			if err_i_j_v:
				error_msg.append('Incorrect ENUM values in the %s definition:' % tbnm)
				error_msg.extend([cell_fmt % a for a in err_i_j_v])

	return error_msg

#def chkUserCol(cur, tbs, sep=re.compile(r'[,;]')):
def chkUserCol(cur, tbs, sep=sep_dyncol):
	'''Check the name in the user-added column - "user_added_cols"

	For every form that has data rows and a "user_added_cols" column, the
	known dynamic columns of that form are read from the dyncoldef table as
	{col_name: col_type} and each cell of "user_added_cols" is handed to
	parseDyncols(); whatever it reports under 'error' is collected.

	cur -- database cursor, passed to inquireDB().
	tbs -- dict: form name -> rows (first row is the header).
	sep -- not used any more (parsing is done by parseDyncols); kept so that
	       existing callers keep working.

	Returns the list of error messages.
	'''
	error_msg = []
	for tbnm, tbv in tbs.items():
		if not tbv or len(tbv)<2: continue
		if 'user_added_cols' not in tbv[0]: continue
		idx = tbv[0].index('user_added_cols')
		colnm_type = inquireDB('SELECT col_name, col_type FROM %s.dyncoldef WHERE tb_name="%s"' % (CUR_DB, tbnm), cursor=cur, fetch=True)
		colnm_type = dict(colnm_type)
		for line in tbv[1:]:
			# a short row simply has no user-added content
			if idx < len(line): kws = line[idx]
			else: kws = ''
			errs = parseDyncols(kws, colnm_type)['error']
			if errs:
				error_msg.extend(['Error in column "user_added_cols" in "%s" section: %s' % (tbnm, a) for a in errs])
	return error_msg

def _safeInt(val, default=None):
	"Return int(val), or default when val cannot be converted (e.g. '' or 'NULL')."
	try: return int(val)
	except (TypeError, ValueError): return default

def chkFiles(tbs, cur=None, src_dir=None):
	''' check if mentioned files exist  and rows/columns of probe file

	tbs     -- dict: form name -> rows (first row is the header).  Uses the
	           forms "fileinfo" (columns name, category, format, platform,
	           array), "platform" (name, probe_num) and "array" (identifier,
	           channel_num, platform, optional data_type).
	cur     -- database cursor; if missing, a connection is opened only for
	           the time a platform has to be looked up.
	src_dir -- optional dict: file category -> source directory.  The
	           categories "annotation" and "project" both use the key
	           "prjanno".

	Checks, in this order:
	  1. every file listed in "fileinfo" exists;
	  2. every existing platform (probe) file can be read, has valid integer
	     columns, has as many probes as the platform claims and, unless it is
	     an "affy" file, has a "unique_id" or "id" column;
	  3. every existing intensity file has a channel number that fits its
	     format and, when read through readIntensity.R, yields as many rows
	     as its platform has probes.
	A probe_num or channel_num that is not an integer no longer aborts the
	whole check; the probe-count comparison is skipped for such a platform.

	Returns the list of error messages (empty if "fileinfo" has no data).
	'''
	error_msg = []
	tbv = tbs.get('fileinfo', None)
	if not tbv or len(tbv)<2: return error_msg
	head = tbv[0]
	idx_raw = head.index('name')
	idx_cat = head.index('category')
	idx_fmt = head.index('format')
	idx_pf = head.index('platform')
	idx_array = head.index('array')

	dir_cats = {'annotation':'prjanno'}
	dir_cats['project'] = dir_cats['annotation']

	fns_pf = []   # (platform name, file name as given, resolved path)
	fns_int = []  # (array identifier, file name as given, resolved path, format)
	for line in tbv[1:]:
		file_cat, fnraw, file_fmt = line[idx_cat], line[idx_raw], line[idx_fmt]
		dir_cat = dir_cats.get(file_cat, file_cat)
		if src_dir and src_dir.get(dir_cat, None): fn = os.path.join(src_dir[dir_cat], fnraw)
		else: fn = fnraw
		if not os.path.exists(fn):
			error_msg.append('File "%s" doesn\'t exist!' % fnraw)
		elif file_cat == 'platform': fns_pf.append((line[idx_pf], fnraw, fn))
		elif file_cat == 'intensity': fns_int.append((line[idx_array], fnraw, fn, file_fmt))

	# check platform-probe first: probe number
	tbv_pf = tbs.get('platform', [])
	pf_params = {} # platform name -> claimed probe number (None if not an integer)
	if tbv_pf and len(tbv_pf)>=2:
		i_pfnm, i_pbnum = tbv_pf[0].index('name'), tbv_pf[0].index('probe_num')
		pf_params = dict([(a[i_pfnm], _safeInt(a[i_pbnum])) for a in tbv_pf[1:]])
		for pfnm, fnraw, fn in fns_pf:
			try:
				pf_type, ann_ln = ProbeFileType(fn)
				lines = readProbe(fn, cur)
				# NOTE: "unique_id" is allowed to repeat, so uniqueness is not checked
				# check file content here
				err_loc_val = chkValType(lines, getIntCols(cur, 'probe', skip_cols=tbcols_auto['probe']), dec_str.match)
				if err_loc_val:
					error_msg.append('Incorrect integer values in the probe file "%s":' % fnraw)
					error_msg.extend(['   Line %d, cols %d (%s) - %s' % a for a in err_loc_val])

				# a platform missing from the form raises KeyError -> reported below
				pb_claimed = pf_params[pfnm]
				if pb_claimed is not None and len(lines)-1 != pb_claimed: error_msg.append('Probe number doesn\'t match: claimed %d for platform (%s) but found %d in probe file (%s)!' % (pb_claimed, pfnm, len(lines)-1, fnraw))
				if pf_type != 'affy':
					cols_avail = [a.lower() for a in lines[0]]
					if 'unique_id' not in cols_avail and 'id' not in cols_avail:
						error_msg.append('Necessary columns missed in probe file (%s): %s' % (fnraw, str(['unique_id', 'id'])[1:-1]) )
			except Exception:
				error_msg.append('Error in reading file "%s"! -- error info:<br>%s' % (fnraw, getErrInfo()))
	elif fns_pf: error_msg.append('Found probe files without platform description information!')

	# then check intensity with platform
	# get platform from tbs and database
	tbv_ary = tbs.get('array', [])
	re_check = re.compile(r'(?<=<checkIntensity_result>).*(?=</checkIntensity_result>)') # use re_check.search('xxx').group()
	ffn = os.path.join(os.path.split(os.path.abspath(__file__))[0], 'readIntensity.R')
	if sys.platform == 'win32': ffn = ffn.replace('\\', '\\\\')
	if tbv_ary and len(tbv_ary)>=2:
		ary_head = tbv_ary[0]
		i_ary_nm, i_ary_chnum, i_ary_pf = ary_head.index('identifier'), ary_head.index('channel_num'), ary_head.index('platform')
		if 'data_type' in ary_head: i_ary_dtype = ary_head.index('data_type')
		else: i_ary_dtype = -1
		# array identifier -> (channel_num as given, data type, platform name)
		ary_params = dict([(a[i_ary_nm], (a[i_ary_chnum], (i_ary_dtype >= 0) and a[i_ary_dtype] or 'intensity', a[i_ary_pf])) for a in tbv_ary[1:]])
		for ary, fnraw, fn, fmt in fns_int:
			if ary not in ary_params:
				error_msg.append('No array definition found for intensity file "%s" (array: "%s")!' % (fnraw, ary))
				continue
			ch_raw, data_type, pfnm = ary_params[ary]
			ch_num = _safeInt(ch_raw, 0) # 0 never is a valid channel number
			fmt_l = fmt.lower()
			if fmt_l == 'cel':
				if ch_num != 1:
					error_msg.append('Wrong channel number (%s - should be 1) for CEL data!' % str(ch_raw))
					continue
			elif fmt_l != 'user.defined' and ch_num != 2 and not (fmt_l=='agilent' and ch_num==1):
				error_msg.append('Wrong channel number (%s - should be 2) for data with format "%s"!' % (str(ch_raw), fmt) )
				continue

			# try to read data
			try:
				fails = 'Failed to read intensity file "%s" (format: "%s")!' % (fnraw, fmt)

				CMDS = ['source("%s")' % ffn]
				CMDS.append('checkIntensity(src="%s", fmt="%s", data_type="%s", ch_num=%s)' % (sys.platform=='win32' and fn.replace('\\','\\\\') or fn, fmt, data_type, str(ch_raw)) )
				rlt0 = pipeR(CMDS=CMDS, init_lam=False)
				if type(rlt0) is not tuple:
					error_msg.append(fails + ' -- rlt is not tuple!')
					continue
				rlt = re_check.search(rlt0[0])
				if not rlt:
					error_msg.append(fails + ' -- no value in: ' + rlt0[0])
					continue
				rlt = rlt.group()

				# a number is the row count; any other text is an error message
				try:
					rlt = int(rlt)
				except ValueError: pass

				if type(rlt) is str: error_msg.append('%s "%s" (format: "%s")!' % (rlt, fn, fmt))
				elif not rlt: error_msg.append(fails + ' -- ' + repr(rlt))
				else: # check the row number
					if pfnm in pf_params: # new platform
						pb_num = pf_params[pfnm]
					else: # try to find it in database
						if not cur:
							dbcon, cur = getConnectionCursor()
							need_close = True
						else: need_close = False
						try:
							# platform name is unique for all users
							cur.execute('SELECT DISTINCT probe_num FROM %s.platform WHERE name=%%s' % CUR_DB, (pfnm,))
							pb_num = pf_params[pfnm] = cur.rowcount and cur.fetchone()[0] or 0
						finally:
							if need_close:
								cur.close()
								dbcon.close()
								cur = None
					if pb_num is not None and rlt != pb_num:
						error_msg.append('Incorrect probe number in file "%s" (found %d probes in file, but %d were defined in platform %s)!' % (fnraw, rlt, pb_num, pfnm))
			except Exception: error_msg.append('Failed in reading intensity file "%s" (format: "%s")!' % (fnraw, fmt))
	elif fns_int: error_msg.append('Found intensity files without array definition!')

	return error_msg

def rmBlanks(tbs):
	"""Strip leading and trailing whitespace from every cell of every form, in place."""
	for rows in tbs.values():
		for row in rows:
			row[:] = [cell.strip() for cell in row]
	
def checkTbs(tbs, cur=None, src_dir=None, user_name=None):
	"""Validate all forms in tbs before they are written to the database.

	tbs       -- dict: form name -> rows (first row is the header); all cells
	             are stripped in place first (rmBlanks).
	cur       -- database cursor; when omitted a connection is opened here
	             and closed again before returning, even if a check raises.
	src_dir   -- passed to chkFiles().
	user_name -- passed to chkCols() and chkRefs().

	Runs chkCols, chkRefs, chkUserCol, chkVals and chkFiles in that order and
	collects all their messages.

	Returns {'success':False, 'info':[messages]} when any check complained,
	and None otherwise.
	"""
	# check name/identifier
	# some names are unique for each user, e.g. project, array
	# some names should be unique for all users, e.g. platform names.
	chkavail = {
		# These columns should be offered and/or unique for the user (or all user).
		# Keys: "necessary", "unique4user", "unique4all", "files"
		'platform' : {'necessary':['name', 'probe_file', 'category', 'probe_num', 'replicate', 'space'],
					'unique4all':['name'], 'files':['probe_file']},
		'project' : {'necessary':['name', 'user_name'], 'unique4user':['name']},
		'sample' : {'necessary':['name', 'project'], 'unique4all':['name']},
		'protocol' : {'necessary':['name', 'category', 'project'], 'unique4all':['name']},
		'fileinfo' : {'necessary':['name', 'project']},
		'array' : {'necessary':['identifier', 'project', 'platform', 'channel_num', 'intensity_file', 'intensity_format'],
					'unique4user':['identifier'], 'files':['intensity_file']},
		'sampxref' : {'necessary':['sample', 'array', 'channel_No']}
		}

	nm4id = { # actually only the id of platform and users should be search in "all_user". The same in getIDs.
		# nm, tb, nm_in_tb. nm will be used for ID searching by match nm_tb in tb, so they should be available in this file or in DB.
		# "this_user", "all_user" # now platform, sample, and protocol should be unique for all_user, project and array unique for the user
		'platform' : {'project':{'this_user':True, 'tb':'project', 'col':'name'}},
		'project' : {'user_name':{'all_user':True, 'tb':tbl_user, 'col':'user_name'}},
		'sample' : {'project':{'this_file':True, 'tb':'project', 'col':'name'}},
		'protocol' : {'project':{'this_user':True, 'tb':'project', 'col':'name'}},
		'fileinfo' : {'project':{'this_user':True, 'tb':'project', 'col':'name'},
					'array':{'this_user':True, 'tb':'array', 'col':'identifier'},
					'platform':{'all_user':True, 'tb':'platform', 'col':'name'} },
		'array' : {'project':{'this_user':True, 'tb':'project', 'col':'name'},
					'protocol_hyb':{'all_user':True, 'tb':'protocol', 'col':'name'},
					'protocol_image':{'all_user':True, 'tb':'protocol', 'col':'name'},
					'protocol_data':{'all_user':True, 'tb':'protocol', 'col':'name'},
					'platform':{'all_user':True, 'tb':'platform', 'col':'name'} },
		'sampxref' : {'sample':{'all_user':True, 'tb':'sample', 'col':'name'},
					'array':{'this_user':True, 'tb':'array', 'col':'identifier'},
					'protocol_process':{'all_user':True, 'tb':'protocol', 'col':'name'},
					'protocol_tech':{'all_user':True, 'tb':'protocol', 'col':'name'},
					'protocol_label':{'all_user':True, 'tb':'protocol', 'col':'name'} },
		}

	if not cur:
		dbcon, cur = getConnectionCursor()
		need_close = True
	else: need_close = False

	error_msg = []
	try:
		# remove leading and tailing blank first
		rmBlanks(tbs)

		error_msg.extend(chkCols(cur, tbs, chkavail, user_name=user_name))
		error_msg.extend(chkRefs(cur, tbs, nm4id, user_name=user_name))
		error_msg.extend(chkUserCol(cur, tbs))
		error_msg.extend(chkVals(cur, tbs))
		error_msg.extend(chkFiles(tbs, src_dir=src_dir, cur=cur))
	finally:
		# release our own connection on every exit path
		if need_close:
			cur.close()
			dbcon.close()

	if error_msg:
		return {'success':False, 'info':error_msg}
	return None

def updateTbs(tbs, cur=None):
	"""Placeholder: performs no update yet and always returns True.

	When no cursor is given, a connection is opened with
	getConnectionCursor() and closed again straight away.
	"""
	if cur:
		return True
	dbcon, cur = getConnectionCursor()
	cur.close()
	dbcon.close()
	return True
				
def saveTbs(tbs, fn='tbs.txt'):
	"""Write all forms to a text file (used for debugging).

	tbs -- dict: form name -> rows; every row is a list of strings.
	fn  -- a file name, or an object with a write() method (an opened or
	       in-memory file).  Anything without write() is opened as a path,
	       so unicode names work as well.

	Each form is written as a "[name]" line, its rows as tab-separated lines
	and one empty line.  The file is always closed at the end -- also a file
	object handed in by the caller, and also when writing fails.  If a row
	cannot be written, the section name and the row are reported on stderr
	and the exception is re-raised.
	"""
	if hasattr(fn, 'write'): f = fn
	else: f = open(fn, 'w')
	try:
		for tbnm, tb in tbs.items():
			f.write('[%s]\n' % tbnm)
			for line in tb:
				try:
					f.write('\t'.join(line) + '\n')
				except Exception:
					sys.stderr.write('saveTbs: cannot write a row of section "%s": %r\n' % (tbnm, line))
					raise
			f.write('\n')
	finally:
		f.close()

def fillBatch(fn, cur=None, data_dir=data_dir, src_dir={}, user_name=None, simplified=False, fill_type=None):
	"""Convert, validate and store one submission.

	fn         -- dict of already parsed forms, or anything parseFile()
	              accepts (file name, opened file, memory file).
	cur        -- database cursor; when omitted a connection is opened here
	              and closed again on every exit path.
	data_dir   -- passed to fillAll() as file_location.
	src_dir    -- passed to checkTbs() and fillAll(); not modified here.
	user_name  -- when given it is written into the "user_name" column of the
	              "project" form (which is created if missing) and passed on
	              to checkTbs() and, as user_id, to fillAll().
	simplified -- request simplified mode.  It is switched off, with a note
	              in the result, when more than one project is offered, when
	              there is no project but some form has a "project" column,
	              or when forms other than project/array/platform/sample/
	              protocol are present.
	fill_type  -- with the value 'Add prjanno' the "project" form is emptied
	              after conversion.

	Steps: convertTbs -> (saveTbs if __DEBUG__) -> checkTbs -> fillAll; the
	first failing step ends the run.

	Returns the merged result dict {'success':bool, 'info':[...]}, or None
	when there are no forms at all.
	"""
	rlt = {'success':True, 'info':[]}
	if isinstance(fn, dict): # fn can be a dict (a read file)
		tbs = fn
	else: # should be file name or an opened file or a memory file
		tbs = parseFile(fn, title_case='lower')
	if not tbs: return

	if not cur:
		dbcon, cur = getConnectionCursor()
		need_close = True
	else: need_close = False

	try:
		if simplified: # if simplified, a project section will be generated automatically if no one provided.
			n_prj = len(tbs.get('project', []))
			if n_prj > 2:
				simplified = False
				rlt['info'].append('More than one project offered, trying batch filling mode!')
			elif n_prj < 2: # no project section
				# check if there is a project column in each section
				for k, v in tbs.items():
					if v and 'project' in v[0]:
						simplified = False
						rlt['info'].append('Has the "project" column in section "%s", trying batch filling mode!' % k)
			elif [k for k in tbs if k not in ('project', 'array', 'platform', 'sample', 'protocol')]: # has more forms
				simplified = False
				rlt['info'].append('Have items other than "project", "array", "platform", "sample" or "protocol", trying batch filling mode!')

		if user_name: # add user_name to project
			tbprj = tbs.setdefault('project', [])
			n = len(tbprj)
			if not n:
				if simplified:
					tbprj.extend([['user_name'],[user_name]]) # create a new project if in simplified mode
				else:
					# header only: a project section with content would mean a new
					# project, which is not wanted e.g. when adding data to an existing one
					tbprj.extend([['user_name']])
			else:
				if 'user_name' not in tbprj[0]: tbprj[0].append('user_name')
				if n==1: tbprj.append(['']*len(tbprj[0]))
				st = tbprj[0].index('user_name')
				ncol = len(tbprj[0])
				for row in tbprj[1:]:
					if len(row) < ncol: row.extend(['']*(ncol-len(row)))
					row[st] = user_name

		# NOW, the only difference of simplified mode is whether generate project automatically or not.
		rlt = mergeInfo(rlt, convertTbs(tbs, cur=cur, simplified=simplified))
		if not rlt.get('success', True): return rlt
		if fill_type == 'Add prjanno': # remove the project section
			tbs['project'] = []

		# for debug
		if __DEBUG__: saveTbs(tbs)

		rlt = mergeInfo(rlt, checkTbs(tbs, cur=cur, src_dir=src_dir, user_name=user_name))
		if not rlt.get('success', True):
			rlt['info'].append("\nDidn't fill tables due to errors.")
			return rlt

		return mergeInfo(rlt, fillAll(tbs, cur=cur, file_location=data_dir, src_dir=src_dir, user_id=user_name))
	finally:
		# release our own connection on every exit path (it used to stay
		# open when validation failed)
		if need_close:
			cur.close()
			dbcon.close()

def fillWith(fn, cur=None, data_dir=data_dir, src_dir={}, user_name=None, cur_db=None, fill_type=None):
	"""Run fillBatch() in simplified mode, optionally switching the database.

	data_dir is the folder to store data file.
	src_dir is a dict of dir for src data {'intensity':'', 'project':'', 'image':'', 'probe':''}.
	A true cur_db replaces the module-level CUR_DB before filling.
	Returns whatever fillBatch() returns.
	"""
	global CUR_DB
	if cur_db:
		CUR_DB = cur_db
	return fillBatch(
		fn,
		cur=cur,
		data_dir=data_dir,
		src_dir=src_dir,
		user_name=user_name,
		simplified=True,
		fill_type=fill_type)

def chkMapf(cur, sql, mapf_id=None): # similar to chkXpf
	"""Merge the mapf ids selected by sql into one id and return it.

	sql is executed as given and the first column of each row is taken as a
	mapf id.  Without rows mapf_id is returned unchanged and nothing is
	committed.  Otherwise the id to keep is the smallest of the found ids and
	mapf_id (when mapf_id is given); all other ids are re-pointed to it in
	CUR_DB.probe and deleted from CUR_DB.mapf, and the connection is
	committed.
	"""
	cur.execute(sql)
	if not cur.rowcount:
		return mapf_id
	others = sorted([row[0] for row in cur.fetchall()])
	if mapf_id is None or mapf_id in others:
		# keep the minimum one (first one)
		mapf_id = others.pop(0)
	elif mapf_id > others[0]:
		# the given id loses against a smaller found one
		others.append(mapf_id)
		mapf_id = others.pop(0)
	if others:
		# correct other mapf_ids to the mapf_id
		id_list = ','.join([str(one) for one in others])
		cur.execute('UPDATE %s.probe SET mapf_id=%d WHERE mapf_id IN (%s)' % (CUR_DB, mapf_id, id_list))
		cur.execute('DELETE FROM %s.mapf WHERE id IN (%s)' % (CUR_DB, id_list))
	cur.connection.commit()
	return mapf_id

def getPfPairs_now_in_tools(fn, sep='\t'):
	# Read a probe-mapping file and return its platform/probe pairs.
	#   fn  - the name of a probe-mapping file; its first line is the header
	#   sep - column separator
	# The header (compared case-insensitively) must contain either the four head_other
	# columns or the four head_affy columns; head_other will override head_affy.
	# Returns a list of rows [platform_a, unique_id_a, platform_b, unique_id_b] (possibly empty),
	# or None when the header has neither column set (this includes an empty file).
	# Blank lines are skipped; a non-blank line with too few columns raises IndexError.
	head_other = ['platform_a', 'unique_id_a', 'platform_b', 'unique_id_b']
	head_affy = ['a array name', 'a probe set name', 'b array name', 'b probe set name']
	fp = open(fn)
	try:
		# Strip the line terminator before splitting: otherwise the last column name keeps
		# its '\n' and is never recognised when a required column is the last one.
		head = [h.lower() for h in fp.readline().rstrip('\r\n').split(sep)]
		if len([h for h in head_other if h in head]) == 4: head_use = head_other
		elif len([h for h in head_affy if h in head]) == 4: head_use = head_affy
		else: return None
		idx = [head.index(h) for h in head_use]
		lines = []
		for raw in fp:
			raw = raw.rstrip('\r\n') # same reason as for the header: keep '\n' out of the last value
			if not raw: continue # e.g. a trailing empty line
			cols = raw.split(sep)
			lines.append([cols[i] for i in idx])
		return lines
	finally:
		fp.close() # the handle used to be left open

def mapPfs(fn, cur=None, data_dir=data_dir): #, src_dir={}, user_name=None, cur_db=None):
	# Read platform/probe pairs from the mapping file fn and give both probes of every pair
	# the same mapf_id, then record fn in the fileinfo table through fillFileinfo.
	#   fn       - name of the probe-mapping file (parsed by getPfPairs)
	#   cur      - database cursor; getCursor() is used when none is supplied
	#   data_dir - passed to fillFileinfo as location
	# Returns {'success':True}, or {'success':False, 'info':[...]} when getPfPairs yields nothing.
	# Pairs naming a platform that is not in the platform table are skipped.
	# NOTE(review): platform names and unique ids come from the mapping file and are put into
	# the SQL text unescaped, so a value containing a double quote breaks (or alters) the
	# statement. Escape or parameterize them once the cursor's parameter style is confirmed.
	lines = getPfPairs(fn) # each row is a list: [platform_name_a, unique_id_a, platform_name_b, unique_id_b]
	if not lines: return {'success':False, 'info':['Invalid file "%s"!' % os.path.basename(fn)]}

	if not cur: cur = getCursor()
	#cur.execute('LOCK TABLES %s.probe, %s.mapf LOW_PRIORITY WRITE' % (CUR_DB, CUR_DB))
	# Only used as a fallback for the id of a newly inserted mapf row (see below).
	cur.execute('SELECT MAX(id) FROM %s.mapf' % CUR_DB)
	mapf_max = cur.fetchone()
	mapf_max = mapf_max and mapf_max[0] or 0
	def getPfid(pfnm, cur=cur):
		# id of the platform called pfnm, or None if there is no such platform
		cur.execute('SELECT id FROM %s.platform WHERE name="%s"' % (CUR_DB, pfnm))
		if not cur.rowcount: return None
		pfid = cur.fetchone()
		return pfid and pfid[0] or None
	pfdic = {} # platform name -> platform id (None for unknown names)
	for pna, unia, pnb, unib in lines:
		# Look a name up only the first time it is seen. setdefault(key, getPfid(key)) would
		# evaluate getPfid - one query - on every row, cached or not.
		if pna not in pfdic: pfdic[pna] = getPfid(pna)
		pia = pfdic[pna]
		if pia is None: continue
		if pnb not in pfdic: pfdic[pnb] = getPfid(pnb)
		pib = pfdic[pnb]
		if pib is None: continue
		sql = 'SELECT DISTINCT mapf_id FROM %s.probe WHERE (platform_id=%d && unique_id="%s" && mapf_id IS NOT NULL) || (platform_id=%d && unique_id="%s" && mapf_id IS NOT NULL) ORDER BY mapf_id ASC' % (CUR_DB, pia, unia, pib, unib)
		mapf_id = chkMapf(cur, sql)
		if mapf_id is None: # neither probe is mapped yet: create a mapf row and point both probes to it
			cur.execute('INSERT INTO %s.mapf () VALUES ()' % CUR_DB )
			# Use the id the database really generated for that row. MAX(id)+1 is only a guess:
			# chkMapf deletes mapf rows, so the next generated id can be larger than the maximum.
			new_id = getattr(cur, 'lastrowid', None)
			if new_id: mapf_max = new_id
			else: mapf_max = mapf_max + 1 # cursor cannot tell: fall back to the previous scheme
			cur.execute('UPDATE %s.probe SET mapf_id=%d WHERE (platform_id=%d && unique_id="%s") || (platform_id=%d && unique_id="%s")' % (CUR_DB, mapf_max, pia, unia, pib, unib))
		else: # update only those records with mapf_id != mapf_id or IS NULL (other records should be set to mapf_id already by chkMapf)
			# "mapf_id != n" is never true for a NULL mapf_id, so the NULL case must be spelled
			# out; without it a still-unmapped probe of the pair would never be linked.
			cur.execute('UPDATE %s.probe SET mapf_id=%d WHERE (platform_id=%d && unique_id="%s" && (mapf_id IS NULL || mapf_id!=%d)) || (platform_id=%d && unique_id="%s" && (mapf_id IS NULL || mapf_id!=%d))' % (CUR_DB, mapf_id, pia, unia, mapf_id, pib, unib, mapf_id))
	#cur.execute('UNLOCK TABLES')
	cur.connection.commit()
	# save fileinfo and copy file
	v_fmt = os.path.splitext(fn)[1][1:].lower()
	tbs = {'fileinfo':[['name', 'category', 'format', 'project_id', 'platform_id', 'protocol_id', 'array_id', 'info_code'],[fn, 'map', v_fmt, '', '', '', '', '']]}
	tbids, tbcols, char_cols = {}, {'fileinfo':None}, {'fileinfo':None} # None value will force fillTable to find them by itself.
	fillFileinfo(cur, tbs, tbids, tbcols, char_cols, location=data_dir)
	return {'success':True}

if __name__ == '__main__': 
	# Script entry point. Two modes:
	#   --ACCEPT_DATA_FROM_PIPE : a pickled parameter dict is read from stdin; the job's info
	#       lines are written to stdout followed by a success or failure token for the caller.
	#   otherwise               : [--TYPE map-platforms | --TYPE update-probe --platform NAME] FILE
	#       (any other or missing --TYPE fills the database from FILE).
	if len(sys.argv) < 2: sys.exit(0)
	
	from getopt import getopt
	# 'TYPE=' and 'platform=' need the trailing '=' to accept a value. With a bare 'TYPE' the
	# value could never be given, and '--platform' was not accepted at all, so neither
	# 'map-platforms' nor 'update-probe' could be selected from the command line.
	optlist, args =  getopt(sys.argv[1:], '', ['ACCEPT_DATA_FROM_PIPE', 'TYPE=', 'platform='])
	optdict = dict(optlist)
	by_pipe = '--ACCEPT_DATA_FROM_PIPE' in optdict
	cmd_type = optdict.get('--TYPE', 'FILL') # not needed for PIPE mode

	#fn = sys.argv[1]
	if by_pipe: #fn == '--ACCEPT_DATA_FROM_PIPE': 
		import cPickle
		# NOTE(review): unpickling can run arbitrary code; this is only safe as long as stdin
		# is fed by a trusted caller.
		params = cPickle.loads(sys.stdin.read())
		if 'cur_db' in params: CUR_DB = params['cur_db'] # CUR_DB should be set before prepareDataDir()
		#sys.stdout.write(repr(params)+'\n\n')
		fill_type = params.get('fill_type') # a missing key is reported below as an unknown type
		# These strings tell the caller about failure/success. They are matched literally,
		# so they must stay exactly as they are (spelling included).
		failure_token = '\n\nError ocurred!'
		success_token = '\n\nJob was done.'
		try:
			data_dir = prepareDataDir(dbname=CUR_DB)
			if fill_type == 'By table':
				tbs, src_dir, user_name = params['tbs'], params['src_dir'], params['user_name']
				## update src_dir for platform. # this is done by CGI (dbs_fill_single)
				rlt = fillWith(tbs, data_dir=data_dir, src_dir = src_dir, user_name=user_name)
			elif fill_type == 'By file':
				fn, src_dir, user_name = params['prj_file'], params['src_dir'], params['user_name']
				## update src_dir for platform. # this is done by CGI (dbs_fill_file)
				rlt = fillWith(fn, data_dir=data_dir, src_dir=src_dir, user_name=user_name)
			elif fill_type == 'Map platforms':
				fn, src_dir, user_name = params['map_file'], params['src_dir'], params['user_name']
				rlt = mapPfs(fn, data_dir=data_dir) #, src_dir=src_dir, user_name=user_name)
			elif fill_type == 'Update probes':
				fn, src_dir, user_name, platform_id = params['probe_file_name'], params['src_dir'], params['user_name'], params['rec_id']
				rlt = updateProbe(None, platform_id, fn)
			elif fill_type == 'Add prjanno':
				tbs, src_dir, user_name = params['tbs'], params['src_dir'], params['user_name']
				rlt = fillWith(tbs, data_dir=data_dir, src_dir = src_dir, user_name=user_name, fill_type=fill_type)
			else:
				# Previously rlt stayed undefined here and the failure only surfaced as a NameError.
				rlt = {'success':False, 'info':['Unknown fill type "%s"!' % fill_type]}

			if rlt: # display info
				info = rlt.get('info',[])
				if not isinstance(info, list): info = [info]
				sys.stdout.write('\n'.join(info))
			# add token to result in PIPE mode
			if not rlt or rlt.get('success', True): sys.stdout.write(success_token)
			else: sys.stdout.write(failure_token)
		except: 
			# Deliberately catches everything: whatever goes wrong, the caller must still
			# receive the failure token. The traceback goes to stderr.
			import traceback
			traceback.print_exc()
			sys.stdout.write(failure_token)
		sys.stdout.close()
	else:
		if not args: sys.stdout.write('\n\nNo file provided.\n')
		else:
			fn = args[0]
			if os.path.exists(fn):
				data_dir = prepareDataDir(dbname=CUR_DB)
				if cmd_type == 'map-platforms': # map platform
					rlt = mapPfs(fn, data_dir=data_dir)
				elif cmd_type == 'update-probe':
					pfnm = optdict.get('--platform', None)
					# Mark the missing platform as a failure; without 'success' it was reported as done.
					if not pfnm: rlt = {'success':False, 'info':['No platform name offered!']}
					# NOTE(review): PIPE mode passes a platform id (rec_id) in this position, here
					# it is a platform name - confirm updateProbe accepts both.
					else: rlt = updateProbe(None, pfnm, fn)
				else: # 'fill'
					rlt = fillWith(fn, data_dir=data_dir)
				if rlt:
					info = rlt.get('info', [])
					# same normalization as in PIPE mode: joining a plain string would put
					# a newline between each of its characters
					if not isinstance(info, list): info = [info]
					if info: sys.stdout.write('\n'.join(info))
				if not rlt or rlt.get('success', True): sys.stdout.write('\n\nAll done successfully.\n')
				else: sys.stdout.write('\n\nFailed.\n')
			else: 
				sys.stdout.write('\n\nFile "%s" doesn\'t exist, did nothing.\n' % fn)
	

	
