#!/usr/bin/env python

# in order to be called by CGI, this program should run on the web server.

from tools import getConnectionCursor, inquireDB, esc_sql_1, Log, py_egg_dir, killProcs, delReqs, getPids, getCursor, CPUInfo, bug_report_fmt, JOB_ON_LOCAL_ONLY#, delOldStuff
import os, sys, re
import cPickle, time, cStringIO
from db_vars import *

from socket import socket, AF_INET, SOCK_DGRAM, gethostname #*
from threading import *
import traceback
from sets import Set
import copy

# Optional third-party modules: import failures are tolerated so the daemon
# can still start on hosts where some of them are missing.
try:
	from Crypto.Cipher import RC5;
	import MySQLdb
	import numpy
	#import rpy
except: pass
# Best-effort: make the python-egg cache usable by the apache/CGI user.
try:
	if os.name == 'posix' and os.path.exists(py_egg_dir):
		os.system('chown -R apache: %s > /dev/null' % py_egg_dir)
		os.system('chmod -R 777 %s > /dev/null' % py_egg_dir) # then any user can use these eggs.
except: pass

# Directory this script lives in; log/pid files and job programs resolve from here.
work_dir = os.path.split(os.path.realpath(__file__))[0]
#R_code_dir = os.path.join(work_dir, 'R_code') #os.path.abspath('R_code') # os.path.realpath('R_code') # under cgi-bin

#HOST = ''
PORT = 1970 # UDP control port (may be overridden by -p or the DAEMON_HOST file)
BUFSIZ = 1024
USER_NAME = GRP_NAME = 'apache'
LOG_NAME = 'analyze_d.log'
PID_NAME = 'analyze_d.pid'
#REQUEST_LIFE = 31*6

# An optional DAEMON_HOST file is exec'd to override settings (e.g. PORT).
# NOTE(review): exec of a local file -- assumed admin-writable only; confirm.
if os.path.exists(os.path.join(work_dir, 'DAEMON_HOST')):
	exec open(os.path.join(work_dir, 'DAEMON_HOST')).read()

#ADDR_SRV = ('', PORT) #(HOST, PORT)
ADDR_SRV = ('localhost', PORT) #(HOST, PORT) 'localhost' is required under Windows
LOGFILE = os.path.join(work_dir, LOG_NAME) #'/var/log/analyze_d.log'
PIDFILE = os.path.join(work_dir, PID_NAME) 
#PIDFILE = '/var/tmp/analyze_d.pid'
LOCKFD = None # descriptor of the locked PID file (set by lockPID)
if os.name == 'posix': 
	import pwd, grp
	# numeric uid/gid the daemon drops privileges to after forking
	USER_ID = pwd.getpwnam(USER_NAME)[2]
	GRP_ID = grp.getgrnam(GRP_NAME)[2]

NODES = CPUInfo()['nodes'] # a list of HOST names (replicates allowed)
# Count how many cores each host contributes: {hostname: n_cores}.
HOST_NODES = {}
for node in NODES: 
	HOST_NODES[node] = HOST_NODES.get(node, 0) + 1


# UDP protocol vocabulary shared between this daemon and its clients (CGI + CLI).
REQUESTS = {'analysis':'New requests', 'quit':'Please quit', 'ask_identity':'Please tell me your name', 'identity':'This is analyze_d', 'ready_to_quit':'Ready to quit', 'del_jobs':'Delete jobs', 'stop_jobs':'Stop jobs'}


def sendQuit():
	'''Tell the running daemon to quit and wait for its acknowledgement.'''
	sock = socket(AF_INET, SOCK_DGRAM)
	sock.sendto(REQUESTS['quit'], ADDR_SRV)
	sock.recvfrom(BUFSIZ) # block until the daemon confirms it is ready to quit
	sock.close()

def sendValue(s):
	'''Fire-and-forget: send the raw request string s to the daemon.'''
	sock = socket(AF_INET, SOCK_DGRAM)
	sock.sendto(s, ADDR_SRV)
	sock.close()

def getInfo():
	'''Ask the daemon for its human-readable status line and return it.'''
	sock = socket(AF_INET, SOCK_DGRAM)
	sock.sendto('GET INFO', ADDR_SRV)
	reply, _addr = sock.recvfrom(BUFSIZ)
	sock.close()
	return reply

def findMe():
	'''Probe the daemon's UDP port to see whether an analyze_d instance is alive.

	Sends the ask_identity request to ADDR_SRV and waits up to one second for
	the identity reply.  Returns 1 when the daemon answered, 0 otherwise.
	'''
	udpCliSock = socket(AF_INET, SOCK_DGRAM)
	udpCliSock.sendto(REQUESTS['ask_identity'], ADDR_SRV)
	udpCliSock.settimeout(1) #udpCliSock.setblocking(0)
	isThere = 0
	try:
		data, addr = udpCliSock.recvfrom(BUFSIZ)
		if data == REQUESTS['identity']: isThere = 1
	# a timeout or socket error simply means "not running"; was a bare except,
	# which also swallowed SystemExit/KeyboardInterrupt -- narrowed.
	except Exception: pass
	udpCliSock.close()
	return isThere

class MyTimer(Thread):
	'''Periodic ticker thread: roughly every `interval` seconds job() fires,
	which wakes the master (JobManager) by setting master.thread_event.'''
	interval = 60.00 # seconds between ticks (class default)
	def __init__(self, master, interval=None):
		'''master: object exposing a thread_event Event; interval overrides the class default.'''
		Thread.__init__(self)
		self.master = master
		self.event = Event() # per-tick event; also set by quit() to abort the wait
		self.rlock = RLock() # guards to_quit
		self.to_quit = 0
		if interval is not None: self.interval = interval

	def job(self):
		'''Timer callback: release our own wait and wake the master's main loop.'''
		self.event.set()
		self.master.thread_event.set()

	def quit(self):
		'''Request shutdown: cancel any pending timer and unblock run().'''
		self.rlock.acquire()
		self.to_quit = 1
		if hasattr(self, 'timer'): self.timer.cancel()
		self.event.set()
		self.rlock.release()

	def keepRunning(self):
		'''Thread-safely report whether the loop in run() should continue.'''
		self.rlock.acquire()
		go_on = not self.to_quit
		self.rlock.release()
		return go_on
	
	def run(self):
		# Arm a one-shot Timer per cycle and sleep on self.event; job() (or
		# quit()) sets the event, after which either a new Timer is armed or
		# the loop ends.
		while self.keepRunning():
			#self.job()
			self.timer = Timer(self.interval, self.job)
			self.timer.start()
			#self.rlock.acquire()
			self.event.clear()
			#self.rlock.release()
			self.event.wait()

class Connector(Thread):
	'''UDP control channel: receives protocol datagrams (see REQUESTS) on
	ADDR_SRV and dispatches them to the master JobManager.'''
	def __init__(self, master):
		Thread.__init__(self)
		self.master = master
		#self.funs = {'THREAD_NUM_MAX':self.master.setMaxJob}

	def run(self):
		# Serve one datagram per iteration; a 'quit' request ends the loop.
		udpSerSock = socket(AF_INET, SOCK_DGRAM)
		udpSerSock.bind(ADDR_SRV)
		reqs = REQUESTS #{'analysis':'New requests', 'quit':'Please quit', 'identity':'Pease tell me your name'}
		
		while 1:
			data, addr = udpSerSock.recvfrom(BUFSIZ)
			if data[0:4] == 'SET ':
				# 'SET <name> = <value>' -- runtime re-configuration.
				# NOTE(review): eval() of network-supplied text; the socket is
				# normally bound to localhost, but this is unsafe if ever exposed.
				var_val = map(lambda a:a.strip(), data[4:].split('='))
				#if len(var_val)==2: globals()[var_val[0]] = eval(var_val[1]) # set global varible
				#if len(var_val)==2: setattr(self.master, var_val[0], eval(var_val[1]) ) # set global varible
				#if len(var_val)==2: self.funs[var_val[0]](eval(var_val[1]) )
				if len(var_val)==2: 
					var, val = var_val
					self.master.setAttr(var, eval(val) )
			elif data == 'GET INFO':
				info = self.master.getInfo()
				udpSerSock.sendto(info, addr)
			elif data == reqs['analysis']: 
				self.master.hasNewReq() #thread_event.set() # requests from web CGI
			elif data == reqs['quit']: 
				self.quit(); 
				udpSerSock.sendto(reqs['ready_to_quit'], addr)
				break
			elif data == reqs['ask_identity']:
				udpSerSock.sendto(reqs['identity'], addr)
			elif data[:9] == reqs['stop_jobs']:
				# 'Stop jobs<id>;<id>...': kill the processes of every listed
				# request that is still STATE_WORKING and mark it STATE_STOPPED.
				req_ids = map(int, filter(lambda a:a.strip(), data[9:].split(';')))
				cur = getCursor()
				for req_id in req_ids: 
					r_state = inquireDB('SELECT req_state FROM requests WHERE req_id=%d' % req_id, cursor=cur, fetch=True)
					if r_state and r_state[0][0] == STATE_WORKING:
						killProcs(getPids(req_id, cursor=cur))
						inquireDB('UPDATE requests SET req_state=%d WHERE req_id=%d' % (STATE_STOPPED, req_id), cursor=cur, fetch=False )
						self.master.delJob(req_id)
				cur.close()
			elif data[:11] == reqs['del_jobs']:
				# 'Delete jobs<id>;...': as above but mark STATE_TO_DELETE, then
				# delete the stopped requests after a short grace period.
				req_ids = map(int, filter(lambda a:a.strip(), data[11:].split(';')))
				cur = getCursor()
				working_reqs = []
				for req_id in req_ids: 
					r_state = inquireDB('SELECT req_state FROM requests WHERE req_id=%d' % req_id, cursor=cur, fetch=True)
					if r_state and r_state[0][0] == STATE_WORKING:
						killProcs(getPids(req_id, cursor=cur))
						inquireDB('UPDATE requests SET req_state=%d WHERE req_id=%d' % (STATE_TO_DELETE, req_id), cursor=cur, fetch=False )
						self.master.delJob(req_id)
						working_reqs.append(req_id)
				cur.close()
				time.sleep(10) # give killed job processes time to exit first
				#delReqs(r_ids=req_ids)
				delReqs(r_ids=working_reqs)
		udpSerSock.close()

	def quit(self):
		'''Forward a shutdown request to the master JobManager.'''
		self.master.quit()
		
class Job(Thread):
	'''One worker thread per accepted request: runs the external analysis
	program for the request and writes the outcome back to the requests table.'''
	sql_set_working = 'UPDATE requests SET req_state=%d %s' % (STATE_WORKING, 'WHERE req_id=%d') 
	sql_get_state = 'SELECT req_state FROM requests WHERE req_id=%d'

	def __init__(self, master, req_type, req_id, user_params, user_name, hostname=None):
		'''hostname: cluster node to run on; None/localhost means run locally.'''
		Thread.__init__(self)
		self.master = master
		self.req_type = req_type
		self.id = req_id
		self.user_params = user_params
		self.user_name = user_name
		self.hostname = hostname

	def getReqState(self, req_id, new_state=STATE_SOLVED):
		'''Return new_state if the request is still STATE_WORKING in the DB,
		otherwise the state found there (e.g. the user stopped it meanwhile).'''
		st = inquireDB(self.sql_get_state % req_id, fetch=True)
		if st: st = st[0][0]
		if st == STATE_WORKING: return new_state
		return st

	def saveParams(self, params, rlt_dir):
		'''Dump the user's input parameters (minus internal keys) to a text file
		inside the result directory; silently gives up on any file error.'''
		# params is a dict
		if not os.path.exists(rlt_dir): return
		rlt_dir = rlt_dir.strip()
		if rlt_dir[-1] == os.sep: rlt_dir = rlt_dir[:-1] # drop trailing separator
		dir_name = os.path.basename(rlt_dir)
		try: f = open(os.path.join(rlt_dir, dir_name + '_input_parameters.txt'), 'w')
		except: return
		# bookkeeping keys that are not user-meaningful, per originating form:
		keys_exclude = ['chart.dir', 'result.data.dir', 'path.sep', 'script.dir', 'raw.data.dir', # WebArray - Dual, Affy
				'output_dir', 'chart_dir', # WebArray - normPCA
				'result_dir', 'groups', 'groups_text', 'arrays', 'arrays_text', # WebArrayDB - dbs_analyze
				'page_values', 'tbs', 'path_sep', 'remove_unit', 'hyb_date_entry', 'new_dyes' # WebArrayDB - fill_single
				]
		keys = list(Set(params.keys()).difference(keys_exclude))
		keys.sort()
		kvs = map(lambda a:'%s\t=\t%s' % (a, params[a]), keys)
		kvs.insert(0, 'HOSTNODE\t=\t' + str(self.hostname) )
		kvs.insert(0, 'Variable\t\tValue')
		f.write('\n'.join(kvs))
		f.close()
		
	def run(self):
		# Execute the request, then unregister ourselves from the manager.
		self.runReq(self.req_type, self.id, self.user_params, self.user_name)
		self.master.delJob(self.id)

	def runReq(self, req_type, req_id, user_params, user_name):
		'''Run the external program for req_type: feed it the pickled request on
		stdin, read the pickled (result, sql_update) pair from its stdout, and
		apply the returned UPDATE statement (or an error UPDATE) to the DB.
		Returns the result object, or the error message text on failure.'''
		error_msg = []

		# banner in the log file for this job
		#print >>sys.stdout, '\n\n\n'+'-'*60, '\n[', time.asctime(), ']   -- req_type:', req_type, '-- req_id:', req_id, '\n'+'-'*60+'\n'
		s = '| [%s] %s  --req_type:%d --req_id:%d |' % (time.asctime(), user_name, req_type, req_id)
		l = '-' * len(s)
		print >>sys.stdout, '\n\n\n%s\n%s\n%s\n' % (l, s, l)
		print >>sys.stdout, '\nRunning on %s\n' % str(self.hostname)

		# set the working status
		inquireDB(self.sql_set_working % req_id, fetch=False)

		rlt_sql_update = None
		try:
			#pin, pout = os.popen2(os.path.join(os.path.split(os.path.abspath(__file__))[0], 'runrpy'))
			#pin, pout = os.popen2(os.path.join(work_dir, 'runR'))
			# unknown type: re-exec db_vars.py once in case TYPE_CMDS was extended
			if not TYPE_CMDS.has_key(req_type):
				exec open(os.path.join(work_dir, 'db_vars.py'))
			if not TYPE_CMDS.has_key(req_type): 
				date_time = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
				new_state = self.getReqState(req_id, STATE_ERROR)
				inquireDB('UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (new_state, date_time, 'No corresponding program!', req_id))
				return
			
			# try to write user_params here
			rlt_dir = user_params.get('result_dir',  user_params.get('result.data.dir', None))
			if rlt_dir: self.saveParams(user_params, rlt_dir)

			# launch the job program: locally, or via ssh on another node
			if sys.platform == 'win32': pin, pout = os.popen2('python %s' % TYPE_CMDS[req_type]) # windows has bug in pipe redirect, you have to use python explicitly
			else:
				if self.hostname and self.hostname not in ('localhost', gethostname()):
					pin, pout = os.popen2('ssh %s %s' % (self.hostname, os.path.join(work_dir, TYPE_CMDS[req_type])))
				else:
					pin, pout = os.popen2(os.path.join(work_dir, TYPE_CMDS[req_type]))
			#os.write(pin.fileno(), cPickle.dumps((req_type, req_id, user_params, R_code_dir)) )
			os.write(pin.fileno(), cPickle.dumps((req_type, req_id, user_params)) )
			pin.close()
			rlt_sql_update = pout.read()
			pout.close()
			# the child prints a pickled (result, UPDATE-statement) tuple
			rlt_sql_update = cPickle.loads(rlt_sql_update)
			if type(rlt_sql_update) is tuple and len(rlt_sql_update)==2:
				rlt, sql_update = rlt_sql_update
			else:
				error_msg.append('\n<p>Inccorrect results returned: \n<pre>"""\n%s\n"""</pre>\n' % str(rlt_sql_update))
			# for debug
			#sql_update = '''UPDATE requests SET req_state=%d, error_msg="%s" WHERE req_id=%d''' % (1, sql_update, req_id)
			# update solve_time over here
			# NOTE: if the tuple check above failed, sql_update is unbound here and
			# this line raises, deliberately falling into the except branch below.
			sql_update = re.sub(r'\bsolve_time=".+?"', 'solve_time="%s"' % time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()), sql_update, 1 )
		except:
			# any failure: record a traceback-based error message in the DB
			error_msg.append('Fatal Error ocurred') #might caused by wrong parameters from user.')
			if rlt_sql_update is not None: error_msg.append('\n<p>Inccorrect results returned: \n<pre>"""\n%s\n"""</pre>\n' % str(rlt_sql_update))
			cfile = cStringIO.StringIO()
			traceback.print_exc(None,cfile)
			value = cfile.getvalue()
			value = value.replace('\n', '<br>')
			value = value.replace('"', '\'\'')
			err_tk = 'error: '
			err_st = value.find(err_tk)
			if err_st >= 0: value = value[err_st+len(err_tk):] # keep only the text after 'error: '
			error_msg.append(value)
			error_msg.append(bug_report_fmt)
			rlt = msg = ' '.join(error_msg)
			req_state = self.getReqState(req_id, STATE_ERROR) #STATE_ERROR
			date_time = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
			sql_update = 'UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (req_state, date_time, esc_sql_1.sub(r'\\\1', msg), req_id)
			
		# write to LOG
		print >>sys.stdout, sql_update
		print >>sys.stdout, rlt
		if sql_update[:6].upper() != 'UPDATE': # make a final check
			date_time = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
			#sql_update = 'UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (STATE_ERROR, date_time, 'unclear error in SQL statements:\n%s' % sql_update.replace('"', "'"), req_id)
			new_state = self.getReqState(req_id, STATE_ERROR)
			sql_update = 'UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (new_state, date_time, 'unclear error in SQL statements:\n%s' % esc_sql_1.sub(r'\\\1', sql_update), req_id)
		try:
			n = inquireDB(sql_update)
		except: 
			# even the cleaned-up statement failed; record that fact instead
			date_time = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
			#inquireDB('UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (STATE_ERROR, date_time, 'unclear error in SQL statements:\n%s' % sql_update.replace('"', "'"), req_id))
			inquireDB('UPDATE requests SET req_state=%d, solve_time="%s", error_msg="%s" WHERE req_id=%d' % (self.getReqState(req_id, STATE_ERROR), date_time, 'unclear error in SQL statements:\n%s' % esc_sql_1.sub(r'\\\1', sql_update), req_id))

		return rlt

class JobManager:
	'''manager jobs'''
	sql_request_working = 'SELECT category, req_info, req_id, user_id, user_name FROM requests r LEFT JOIN users u ON r.user_id=u.id WHERE req_state=%d ORDER BY req_id' % STATE_WORKING
	sql_request_waiting = 'SELECT category, req_info, req_id, user_id, user_name FROM requests r LEFT JOIN users u ON r.user_id=u.id WHERE req_state=%d ORDER BY req_id' % STATE_WAITING
	#sql_request = 'SELECT req_state, req_id, category, req_info, user_id FROM requests WHERE req_state=%d OR req=%d' % (STATE_WORKING, STATE_WAITING)
	#sql_still_exist = 'SELECT req_id FROM requests WHERE req_id=%d'
	#sql_set_working = 'UPDATE requests SET req_state=%d %s' % (STATE_WORKING, 'WHERE req_id=%d') 
	parallel_types = TYPE_PARALLEL


	def __init__(self, **kv):
		#Thread.__init__(self)
		
		self.THREAD_NUM_MAX = 4
		self.THREAD_NUM_MAX_lock = RLock()

		self.USER_JOB_MAX = 0.5
		self.USER_JOB_MAX_lock = RLock()
		self.max_exclude_req = 1 # for data depositaing, not limit by THREAD_NUM_MAX

		self.hostname = gethostname() #os.uname()[1]
		self.IDLE_NODES = HOST_NODES.copy() # host : n_idle_cores
		self.job_host = {} # job_id : host
		self.host_lock = RLock()
		self.multi_node = not globals().get('JOB_ON_LOCAL_ONLY', False)

		self.thread_event = Event()
		self.lock = RLock()
		self.go_on = True
		self.go_on_lock = RLock()

		self.jobs = {} # {job_id : {'user':user_id, 'type':req_type,...}, ...}
		self.jobs_done = [] # [(job_id, {'user':user_id, 'type':req_type,...}), ...]
		self.my_timer = MyTimer(self)
		self.my_timer.start()
		self.connector = Connector(self)
		self.connector.start()

		for k, v in kv.items(): 
			#setattr(self, k, v)
			self.setAttr(k, v)

	def hasNewReq(self):
		self.thread_event.set()

	def engageHost(self, job_id, user_id=None, use_host=None):
		if not self.multi_node: return None
		self.host_lock.acquire()
		if len(self.IDLE_NODES) < 2: host=None
		else: # find the one of more idle cores
			vk = zip(self.IDLE_NODES.values(), self.IDLE_NODES.keys())
			vk.sort(reverse=True)
			if True: 
				if use_host and use_host in self.IDLE_NODES:
					host = use_host
				else:
					host = vk[0][1]
					#self.IDLE_NODES[host] = vk[0][0] - 1
				self.IDLE_NODES[host] -= 1
			else : # test dell5
				host = user_id and user_id==1 and 'dell5' or 'dell4' # for test only, user_name is xqxia
				self.IDLE_NODES[host] = self.IDLE_NODES[host] - 1
			self.job_host[job_id] = host
		self.host_lock.release()
		return host

	def releaseHost(self, job_id):
		if not self.multi_node: return
		self.host_lock.acquire()
		if len(self.IDLE_NODES) > 1: 
			host = self.job_host[job_id]
			self.IDLE_NODES[host] = self.IDLE_NODES[host] + 1
			del self.job_host[job_id]
		self.host_lock.release()

	def waitJobs(self):
		while 1:
			self.lock.acquire()
			if self.jobs:
				self.lock.release()
			else:
				self.lock.release()
				break
			time.sleep(1)

	def setAttr(self, nm, val):
		funs = {'THREAD_NUM_MAX':self.MaxJob, 'USER_JOB_MAX':self.MaxUserJob}
		funs[nm](val)
		self.thread_event.set()

	def getInfo(self):
		n_max, n_user, n_job = self.MaxJob(), self.MaxUserJob(), self.JobNum()
		return '%d job%s can be run simultaneously while no more than %d job%s per user. \nNow there\'s %d job%s running.' % (n_max, n_max > 1 and 's' or '', n_user, n_user > 1 and 's' or '', n_job, n_job > 1 and 's' or '')

	def MaxJob(self, n=None):
		self.THREAD_NUM_MAX_lock.acquire()
		if n is not None: self.THREAD_NUM_MAX = n
		else: n = self.THREAD_NUM_MAX
		self.THREAD_NUM_MAX_lock.release()
		return n

	def MaxUserJob(self, n=None, get_n_only=False):
		#return(max(1, self.MaxJob()/2))
		#return(max(1, int(self.MaxJob()-1) ))
		self.USER_JOB_MAX_lock.acquire()
		if n is not None: self.USER_JOB_MAX = n
		else: n = self.USER_JOB_MAX
		self.USER_JOB_MAX_lock.release()
		if get_n_only: return n
		if n > 1: return int(n)
		elif n==1 and repr(n) == '1': return n 
		else: return max(1, int(self.MaxJob()*n) )

	def JobNum(self):
		self.lock.acquire()
		n = len(self.jobs)
		self.lock.release()
		return n

	def addJob(self, user_id, req_type, req_id, user_params, user_name, use_host=None):
		self.lock.acquire()
		if req_id in self.jobs:
			newjob = None
		else:
			host = self.engageHost(req_id, user_id=user_id, use_host=use_host)
			newjob = Job(self, req_type, req_id, user_params, user_name, hostname=host)
			self.jobs[req_id] = {'user':user_id, 'type':req_type}
		self.lock.release()
		if newjob: newjob.start()

	def delJob(self, job_id):
		'maintain running jobs'
		# set self data
		self.lock.acquire()
		if job_id in self.jobs:
			self.jobs_done.append((job_id, self.jobs[job_id]))
			del self.jobs[job_id]
			del self.jobs_done[:-10] # remember 10 jobs done
			self.releaseHost(job_id)
		self.lock.release()
		self.thread_event.set()

	def isWorking(self):
		self.go_on_lock.acquire()
		go_on = self.go_on
		self.go_on_lock.release()
		return go_on

	def run(self):
		last_del_day = 0
		last_del_day = int(time.time()/60/60/24)
		para_set = Set(TYPE_PARALLEL)
		while self.isWorking():

			# clear outdated stuff: requests and files once a day
			today = int(time.time()/60/60/24)
			#if today != last_del_day: # do it every day
			if today - last_del_day >= 7: # do it every 7 days
				last_del_day = today
				#delOldStuff(REQUEST_LIFE) # this may block the program over here
				os.spawnlp(os.P_NOWAIT, 'python', 'python', '-c', 'import sys; sys.path.insert(0, "%s"); from tools import *; delOldStuff()' % work_dir) # use NOWAIT

			# get self data
			self.lock.acquire()
			jobs = copy.deepcopy(self.jobs)
			jobs_done = copy.deepcopy(self.jobs_done)
			self.lock.release()
			
			job_tp_set = Set(map(lambda a:a['type'], jobs.values()))
			n_exclude_req = len(job_tp_set - para_set)
			
			self.lock.acquire()
			n = len(self.jobs)
			self.lock.release()
			nmax = self.MaxJob() + n_exclude_req
			if n >= nmax and n_exclude_req >= self.max_exclude_req:
				self.thread_event.clear()
				self.thread_event.wait() # a waiting point: MyTimer,  done jobs or new requests from WebArray will arise this
			# check unexpectedly stopped working jobs first
			try:
				rlt = list(inquireDB(self.sql_request_working, fetch=True))
			except: rlt = []
			if rlt: # exclude those are in running.
				for i in range(len(rlt)-1, -1, -1):
					req_type, user_params, req_id, user_id, user_name = rlt[i]
					if self.jobs.has_key(req_id): rlt.pop(i)

			if not rlt:
				# check jobs in queue
				try:
					rlt = inquireDB(self.sql_request_waiting, fetch=True)
				except: rlt = []
				if len(rlt) < 1:
					self.thread_event.clear()
					self.thread_event.wait() # a waiting point: MyTimer,  done jobs  or new requests from WebArray will arose this
					continue

			users = {} # { 'user_id':job_num, ... }
			for v in jobs.values(): users[v['user']] = users.get(v['user'],0) + 1
			# sort new users by the running job number
			new_user_list = map(lambda a:a[3], rlt)
			site_user = zip(range(len(new_user_list)), new_user_list) # insert the site index
			jbnum_site_user = map(lambda a:(users.get(a[1],0), a[0], a[1]), site_user) # insert job number
			jbnum_site_user.sort() # sort by job number & site index

			user_jobs = {}
			new_job_tp_set = Set()
			for req_type, user_params, req_id, user_id, user_name in rlt:
				user_jobs.setdefault(user_id, []).append((req_type, user_params, req_id, user_name))
				new_job_tp_set.add(req_type)
			new_n_exclude_req = len(new_job_tp_set - para_set)
			
			# find the job for the user with least jobs on working.
			found = False
			use_host = None
			if n_exclude_req < self.max_exclude_req and new_n_exclude_req >= 1: # find data-depositing jobs
				for jbnum, site, user_id in jbnum_site_user:
					for req_type, user_params, req_id, user_name in user_jobs[user_id]:
						if req_type in para_set: continue
						found = True
						use_host = self.hostname
						break
					if found: break
			else: # find jobs that can be done in parallel
				for jbnum, site, user_id in jbnum_site_user:
					if jbnum >= self.MaxUserJob() + n_exclude_req: 
						self.thread_event.clear()
						self.thread_event.wait() # a waiting point: MyTimer,  done jobs  or new requests from WebArray will arise this
						break
					for req_type, user_params, req_id, user_name in user_jobs[user_id]:
						#if n_exclude_req and (req_type not in para_set): continue
						if req_type not in para_set: continue
						found = True
						break
					if found: break

			if found:
				# set the working status
				#inquireDB(self.sql_set_working % req_id, fetch=False)
				try:
					user_params = cPickle.loads(user_params) # very occasionally here's EOFError
					self.addJob(user_id, req_type, req_id, user_params, user_name, use_host=use_host)	
				except: pass

		self.waitJobs()
	
	def quit(self):
		#if hasattr(self, 'timer'): self.timer.cancel()
		self.my_timer.quit()
		self.go_on_lock.acquire()
		self.go_on = False
		self.thread_event.set()
		self.go_on_lock.release()


	
if os.name == 'posix': import fcntl # needed for file locking and other file operations
def lockPID():
	global LOCKFD
	#folder = os.path.split(os.path.realpath(PIDFILE))[0]
	#if not os.path.exists(folder): os.makedirs(folder, mode=0770)
	fp = os.open(PIDFILE, os.O_WRONLY | os.O_CREAT) #, FILE_MODE)	
	try:
		LOCKFD = fcntl.flock(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
	except IOError: 
		print 'Another copy of %s has already run!' % os.path.split(__file__)[1]
		sys.exit(0)
	os.ftruncate(fp, 0)
	mypid = '%d\n' % os.getpid()
	if os.write(fp, mypid) != len(mypid):
		print 'write error'
		sys.exit(1)
	val = fcntl.fcntl(fp, fcntl.F_GETFD, 0)
	if val < 0:
		print 'fcntl F_GETFD error'
		sys.exit(1)
	val |= fcntl.FD_CLOEXEC
	if fcntl.fcntl(fp, fcntl.F_SETFD, val) < 0:
		print('fcntl F_SETFD error')
		sys.exit(1)
		
def unlockPID():
	'''Close the lock descriptor (when one is held) and delete the PID file.

	Returns 1 on success, 0 when closing or removing failed.'''
	try:
		if LOCKFD is not None: # or type(LOCKFD) is type(1)
			os.close(LOCKFD)
		os.remove(PIDFILE)
	except OSError:
		return 0
	return 1

def isRunning():
	'''Return True when another instance currently holds the PID-file lock.

	If the PID file exists but is unlocked (stale), it is removed and False
	is returned.  Only meaningful on POSIX (uses fcntl.flock).
	'''
	global LOCKFD
	if not os.path.exists(PIDFILE): return(False)
	#folder = os.path.split(os.path.realpath(PIDFILE))[0]
	#if not os.path.exists(folder): os.makedirs(folder, mode=0770)
	fp = os.open(PIDFILE, os.O_WRONLY | os.O_CREAT)
	try:
		fcntl.flock(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
		os.close(fp)
		os.remove(PIDFILE) # stale PID file: nobody holds the lock
		return(False)
	except IOError:
		os.close(fp) # FIX: descriptor used to leak on this path
		return(True)


def exitUsage(s=None):
	print 'Usage: %s [--stop|quit|restart] [-n max_job_num] [-u max_job_per_user(absolute number or coefficient)] [-p port] [-i | --information] [-h | --help]' % os.path.split(sys.argv[0])[1]
	if s: print '\n%s\n' % s
	sys.exit(0)

#if __name__ == "__main__":
#if __name__ == "__main__":
def Main(): #pass
	'''Command-line entry point: parse options, then either report status,
	push new settings to a running daemon, stop it, and/or daemonize and run
	a new JobManager.  The daemonization path (POSIX) uses the classic
	double-fork, drops privileges to USER_NAME/GRP_NAME and redirects
	stdout/stderr into LOGFILE.'''
	if os.name == 'posix': is_running = isRunning() #findMe() # now use lockPID()
	else: is_running = findMe()

	from getopt import getopt
	optlist, args = getopt(sys.argv[1:], 'n:u:p:hi', ['stop', 'quit', 'restart', 'help', 'information'])
	optdict = dict(optlist)
	if optdict.has_key('-h') or optdict.has_key('--help'): exitUsage()
	if optdict.has_key('-i') or optdict.has_key('--information'):
		if is_running: print getInfo()
		else: print 'Not running!'
		sys.exit(0)
	vals = {}
	if optdict.has_key('-n'): vals['THREAD_NUM_MAX'] = nproc = max(1, int(optdict['-n']))
	# NOTE(review): eval of the -u option allows expressions like 1/2; assumes a
	# trusted operator at the console.
	if optdict.has_key('-u'): vals['USER_JOB_MAX'] = nuser = eval(optdict['-u'])
	global PORT, ADDR_SRV
	if optdict.has_key('-p'):
		PORT = int(optdict['-p'])
		ADDR_SRV = ('', PORT)
	to_stop = optdict.has_key('--stop') or optdict.has_key('--quit') or optdict.has_key('--restart')
	to_start = optdict.has_key('--restart') or (not is_running and not to_stop)
	to_set = is_running and not to_stop and (optdict.has_key('-n') or optdict.has_key('-u'))# and vals

	# push new limits to the already-running daemon over UDP, then exit
	if to_set: 
		print 'set parameters'
		#sendValue('THREAD_NUM_MAX = %d' % THREAD_NUM_MAX)# set
		for k, v in vals.items():
			s = 'SET %s = %s' % (k, repr(v) )
			sendValue( s )
			print s
		sys.exit(0)

	# fill in defaults for any limit not given on the command line
	if to_start and (not optdict.has_key('-n') or not optdict.has_key('-u') ):
		if is_running:
			# parse the current limits out of the daemon's status line
			info = getInfo() # '%d jobs can be run at the same time, and no more than %d jobs per user.'
			#print info, '\n\n'
			#nproc, nuser = int(info[:info.index(' ')]), int(info[info.index('no more than ')+13:info.index(' jobs per user')])
			nproc = int(info[:info.index(' ')]) 
			nuser = info[info.index('no more than ')+13:]
			nuser = int(nuser[:nuser.index(' job')])
		else:
			if not optdict.has_key('-n'): # check cpu number automatically
				#nproc = max(1, len(os.popen('cat /proc/cpuinfo | grep processor').readlines()))
				#nproc = CPUInfo()['n_cores']
				nproc = len(NODES)
			if not optdict.has_key('-u') or nuser > nproc:
				nuser = max(int(nproc/2), 1)
		if 'THREAD_NUM_MAX' not in vals: vals['THREAD_NUM_MAX'] = nproc
		if 'USER_JOB_MAX' not in vals: vals['USER_JOB_MAX'] = nuser

	# stop the running daemon (and fall through to a restart when requested)
	if to_stop and is_running: 
		print 'to stop'
		sendQuit()
		while findMe(): #isRunning(): 
			time.sleep(1)
		if not to_start: sys.exit(0)
		# restart this programs here
		#pid = os.fork()
		#if pid > 0: sys.exit(0)
		#os.fsync()
		#params = sys.argv[1:] # skip this file's name
		#params.insert(0, os.path.normpath(__file__))
		#params.remove('--restart') # should have this
		#print params
		
		# use the following three lines can work too!
		#params = [os.path.abspath(__file__), '-n', str(nproc), '-u', str(nuser), '-p', str(PORT)]
		#print params
		#os.execvp(os.path.abspath(__file__), params)
	if not to_start: 
		if is_running: print 'A copy is runing!'
		print 'not to start'
		sys.exit(0)
	#while isRunning(): # or findMe()
	#	time.sleep(1)
	print 'to start'

	if os.name == 'posix':
		# do the UNIX double-fork magic, see Stevens' "Advanced
		# Programming in the UNIX Environment" for details (ISBN 0201563177)
		try:
			pid = os.fork()
			if pid > 0:
				# exit first parent
				sys.exit(0)
		except OSError, e:
			print >>sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror)
			sys.exit(1)

		# make log/pid folder
		#os.umask(007)
		os.umask(005)
		folders = Set()
		for fname in (LOGFILE, PIDFILE):
			folder = os.path.split(os.path.realpath(fname))[0]
			if not os.path.exists(folder): 
				os.makedirs(folder)
				folders.add(folder)

		os.umask(113)
		# from here on, all print output goes to the log file
		sys.stdout = sys.stderr = Log(open(LOGFILE, 'a+'))
		lockPID()
		os.chmod(PIDFILE, 0660)
		#os.umask(007)
		os.umask(002)

		print "\n\n=================== Started at %s =================\n\n" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

		# decouple from parent environment
		#if not os.getuid(): # 0 is root
		# hand ownership of log/pid files to the unprivileged user, best-effort
		try:
			thegid = os.getgid()
			for folder in folders: os.chown(folder, USER_ID, thegid)
			os.chown(LOGFILE, USER_ID, thegid)
			os.chown(PIDFILE, USER_ID, thegid)
		except: pass
		try:
			os.setgid(GRP_ID) # GRP_ID have to be set before USER_ID to avoid error !!!!
			#os.setregid(GRP_ID, GRP_ID)
		except: pass #raise
		try:
			os.setuid(USER_ID)
			#os.setreuid(USER_ID, USER_ID)
		except: pass
		os.chdir("/")   #don't prevent unmounting....
		os.chdir("/")   #don't prevent unmounting....
		os.setsid()

		# set mask for R produced files
		#os.umask(0)
		#os.umask(006)
		os.umask(002)

		# do second fork
		try:
			pid = os.fork()
			if pid > 0:
				sys.exit(0)
		except OSError, e:
			print >>sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror)
			sys.exit(1)

		#sys.stdout = sys.stderr = Log(open(LOGFILE, 'a+'))
		#if os.name == 'posix':
		#lockPID()
		JobManager(**vals).run() # blocks until quit
		unlockPID()
	else:
		# non-POSIX (Windows): no daemonization, just log and run in foreground
		sys.stdout = sys.stderr = Log(open(LOGFILE, 'a+'))
		print "\n\n=================== Started at %s =================\n\n" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
		JobManager(**vals).run()
	

# script entry point
if __name__ == "__main__":
	Main()
	

