#!/usr/bin/env python

'''
Purpose:
    Summarise a prediction array in a (fake) ensemble way.
Detail:
    This is not true ensemble learning; it only combines the
    predictions of several methods in an ensemble-like way.
    Rules:
	A	B	C	RESULT
	1	1	1	1
	1	1	0	0
	1	0	1	0
	0	1	1	0
	1	0	0	0
	0	0	1	0
	0	1	0	0
	0	0	0	0

    It's a simulation of what most papers do when predicting lncRNAs.
    1. Metrics that need to be exported: CNCI, CPC, CPAT, hmmscan, PLEK.
    2. Out file:
    3. Steps
        3.1 Iterate over all prediction results of each transcript, and
            make sure every one of the (5) methods has a result.
        3.2 Get ensemble results of each Transcript on every combination.
        3.3 Output ensemble array.

'''

import os

def get_ci(k,n,isbase0=True):
    '''
    Yield every k-element combination of n items as a tuple of indexes.

    Combinations are produced in lexicographic order, e.g. get_ci(2,4)
    yields (0,1) (0,2) (0,3) (1,2) (1,3) (2,3).

    Args:
        k: size of each combination.
        n: number of items to choose from.
        isbase0: when true the indexes run over 0..n-1, otherwise over 1..n.

    Yields:
        One tuple of k strictly increasing indexes per combination.
        When k > n a single None is yielded (kept for backward
        compatibility). When k <= 0 (and k <= n) nothing is yielded.

    Raises:
        AssertionError: if k or n is not an int. Because this is a
            generator, the check runs on the first iteration, not at call
            time.
    '''
    # Imported locally so the function does not depend on file-level imports.
    from itertools import combinations

    assert isinstance(k,int)
    assert isinstance(n,int)
    if k > n:
        # Historical behaviour: signal "impossible" with a single None.
        yield None
        return
    if k <= 0:
        # The original backtracking produced nothing for k == 0; negative
        # sizes are treated the same way instead of failing on an index.
        return
    first = 0 if isbase0 else 1
    for combo in combinations(range(first,n+first),k):
        yield combo

# Collect the combinations of every size from 2 up to n (out of n items).
def get_ci_detail(n):
    '''
    Return a list with everything get_ci(size,n) yields for size = 2..n,
    smaller sizes first. The list is empty when n < 2.
    '''
    return [combo for size in range(2,n+1) for combo in get_ci(size,n)]

def is_all_01(mylst):
    '''
    Return True when every element after the first is the string "1" or "0".

    The element at index 0 is never inspected, so an empty or one-element
    sequence is accepted.
    '''
    return all(field == "1" or field == "0" for field in mylst[1:])

# Split each input line and only let well-formed rows through.
def iter_line(infile,has_header=True,sep="\t",coll="_"):
    '''
    Yield the fields of each usable line of infile as a list of strings.

    Every line is stripped and split on sep. When has_header is true the
    first line is yielded as-is, without any validation. A line is then
    yielded when it has at least METHODLENGTH fields and is_all_01()
    accepts it; all other lines are dropped silently.

    NOTE(review): the header line is not skipped after being yielded, it
    goes through the same validation as the data lines, so a first line
    that passes validation is yielded twice.

    coll is accepted but not used here.
    '''
    header_pending = has_header
    for raw in infile:
        fields = raw.strip().split(sep)
        if header_pending:
            header_pending = False
            yield fields
        if len(fields) >= METHODLENGTH and is_all_01(fields):
            yield fields

def iter_coding(infile,res_index,has_header=True,sep="\t",coll="_"):#coll collapse
    '''
    Yield, for every row from iter_line(), one label per selected column.

    Each label is built as  <field 0> + coll + <field 1> + coll + <value>,
    where <value> is the row's field at an index listed in res_index. The
    header row that iter_line() yields is labelled the same way.

    Args:
        infile: iterable of text lines, handed to iter_line().
        res_index: list of column indexes to pick from each row.
        has_header, sep, coll: forwarded to iter_line(); coll is also the
            string used to join the parts of a label.

    Yields:
        A list of strings, one per entry of res_index. A real list is built
        (not a lazy map object) so the result can be indexed and measured
        on Python 3 as well as Python 2.

    Raises:
        AssertionError: if res_index is not a list (checked on the first
            iteration because this is a generator).
    '''
    assert isinstance(res_index,list)

    for tmp in iter_line(infile,has_header,sep,coll):
        prefix = tmp[0]+coll+tmp[1]+coll
        yield [prefix+tmp[x] for x in res_index]

# Iter infile
# Tx_name act_cls method1 method2 ...(according to METHODINDEX)
def iter_methods_res(infile):
    '''
    Yield one list per row of iter_line(infile): field 0, the field at
    ACTINDEX, then the fields at every index in METHODINDEX, in that order.

    A real list is yielded (not a lazy map object) because ensemble_file()
    indexes and slices each row, which a Python 3 map object does not
    support.
    '''
    # Build the column selection once. Besides avoiding per-row work, this
    # consumes METHODINDEX a single time, so the selection stays complete
    # for every row even if METHODINDEX is a one-shot iterator.
    columns = [0,ACTINDEX]
    columns.extend(METHODINDEX)
    for res in iter_line(infile):
        yield [res[x] for x in columns]

# Basic(default) rules:
# 1+1+1 --> 1
# 1+1+0 --> 0
# 1+0+0 --> 0
# 0+0+0 --> 0
def ensemble_res(pred_code,mode):
    '''
    Merge the predictions of several methods into a single value.

    Args:
        pred_code: iterable of per-method codes (strings such as "1"/"0",
            or column names in header mode). Lazy iterables are accepted.
        mode: one of
            "d" (default rule) - logical AND of the codes converted with
                int(); returns an int: 0 as soon as one code is 0,
                otherwise the int value of the last code (1 for no codes).
            "h" (header) - returns the codes joined with "-".
            "v" (vote) - returns the string "1" when at least half of the
                codes equal "1" (a tie counts as "1"), otherwise "0".

    Returns:
        The merged code; an int in "d" mode, a str in "h" and "v" modes.

    Raises:
        KeyError: if mode is not one of "d", "h", "v".
        ValueError: in "d" mode, if a code that gets converted is not a
            valid int literal.
    '''
    # Materialise first: vote mode needs len() and count(), which lazy
    # iterables (e.g. a Python 3 map object) do not provide.
    pred_code = list(pred_code)
    # default
    if mode == "d":
        merge_code = 1
        for each_code in pred_code:
            merge_code = int(merge_code) and int(each_code)
    # header
    elif mode == "h":
        merge_code = "-".join(pred_code)
    # vote
    elif mode == "v":
        if len(pred_code) > pred_code.count("1")*2:
            merge_code = "0"
        else:
            merge_code = "1"
    else:
        raise KeyError("unknown ensemble mode: %r" % (mode,))
    return merge_code

# Get ensemble_res of every combinatorial pair.
def ensemble_ress(pred_code,mode="d"):
    '''
    Return the list of merged codes for one row of predictions.

    When the global TOTALONLY is true the list holds a single value: the
    merge of all codes. Otherwise it holds one value per index combination
    returned by get_ci_detail(len(pred_code)), in that order.

    Args:
        pred_code: indexable sequence of per-method codes.
        mode: merge mode forwarded to ensemble_res().
    '''
    if TOTALONLY:
        return [ensemble_res(pred_code,mode)]
    # Hand ensemble_res() a real list per combination; a lazy map object
    # has no len()/count() on Python 3.
    return [ensemble_res([pred_code[x] for x in pair_indx],mode)
            for pair_indx in get_ci_detail(len(pred_code))]

def ensemble_file(infile,outfile,sep="\t",header=True,mode="d"):
    '''
    Write one output line per row produced by iter_methods_res(infile).

    Each line holds the row's first two fields followed by the values
    returned by ensemble_ress() for the remaining fields, joined with sep
    and terminated by a newline. When header is true, the first row is
    merged in "h" mode (column labels) instead of mode.
    '''
    pending_header = header
    for row in iter_methods_res(infile):
        # Only the very first row can be the header row.
        if pending_header:
            row_mode = "h"
            pending_header = False
        else:
            row_mode = mode
        # name, actual class, then the merged codes
        fields = [row[0],row[1]] + list(ensemble_ress(row[2:],row_mode))
        outfile.write(sep.join(str(field) for field in fields)+"\n")

def main(argv):
    '''
    Command-line entry point.

    Parses argv[1:], publishes the settings the other functions read as
    module globals, then runs ensemble_file() from the input file (or
    stdin) to the output file (stdout by default).

    Args:
        argv: full argument vector; argv[0] (the program name) is skipped.

    NOTE(review): METHODNAME, HasHeader, --sep and --coding_nonc are parsed
    or stored here but are not read by any code visible in this file -
    confirm whether they are meant to be wired into ensemble_file().
    '''
    import argparse
    # Imported here as well: the module only binds `sys` under the
    # __main__ guard, so main() would fail when called from an import.
    import sys

    parser = argparse.ArgumentParser(description="Get prediction overlap information of name_coding file")
    parser.add_argument('infile',nargs='?',help="Array file, \"-\" for stdin")
    parser.add_argument('-o','--outfile',nargs='?',help="output file",default=sys.stdout,type=argparse.FileType('w'))
    parser.add_argument('-n','--method_name',nargs='?',default="CNCI,CPAT,CPC,hmmscan,PLEK",help="Methods names.")
    parser.add_argument('-i','--method_index',nargs='?',default="2,3,4,5,9",help="Column index of methods.")
    parser.add_argument('-m','--mode',nargs='?',default="d",choices=["d","v"],help="Ensemble mode.")
    parser.add_argument('-l','--row_length',nargs='?',default=10,type=int,help="Max row length, useful for filter out incomplete predicted Tx.")
    parser.add_argument('-a','--act_index',nargs='?',default=1,type=int,help="Column index of transcripts actual class.")
    parser.add_argument('-c','--coding_nonc',default=False,action='store_true',help="Output coding, noncoding, all 3 name_coding files")
    parser.add_argument('--has_header',default=True,action='store_false',help="if file hasn't header,set this option.")
    parser.add_argument('--total-only',action='store_true',help="Set this option to aviod computing every combination results while only output all.(Huge method_index needed)")
    parser.add_argument('-s','--sep',nargs='?',default="\t")
    args = parser.parse_args(argv[1:])

    global METHODNAME, METHODINDEX, METHODLENGTH, HasHeader, ACTINDEX, TOTALONLY
    METHODNAME = args.method_name.strip().split(",")
    # A real list: the indexes are reused for every input row, and a
    # Python 3 map object would be exhausted after the first one.
    METHODINDEX = [int(x) for x in args.method_index.strip().split(",")]
    METHODLENGTH = args.row_length
    HasHeader = args.has_header
    ACTINDEX = args.act_index
    TOTALONLY = args.total_only

    # The positional is optional (nargs='?'); a missing value is treated
    # like "-" instead of crashing in open(None).
    use_stdin = args.infile is None or args.infile == '-'
    if use_stdin:
        infile=sys.stdin
    else:
        infile=open(args.infile)

    try:
        ensemble_file(infile,args.outfile,mode=args.mode)
    finally:
        # Only close what was opened here; stdin belongs to the caller.
        if not use_stdin:
            infile.close()
    

if __name__ == '__main__':

    # Script entry point. sys is imported here, inside the guard, so the
    # name is only bound at module level when the file runs as a script.
    import sys

    # main() receives the full argv and skips argv[0] itself.
    main(sys.argv)