#!/usr/bin/env python
"""
Purpose:
    Stat array in an ensemble (fake) way.
Detail:
    This is not true ensemble learning, only an ensemble-like conversion of
    existing predictions.
    Rules:
        A B C RESULT
        1 1 1 1
        1 1 0 0
        1 0 1 0
        0 1 1 0
        1 0 0 0
        0 0 1 0
        0 1 0 0
        0 0 0 0
    It is a simulation of what most papers do when predicting lncRNAs.
    1. Metrics that need to be exported: CNCI, CPC, CPAT, hmmscan, PLEK.
    2. Out file:
    3. Steps
    3.1 Iterate over all prediction results of each transcript, and make sure
        every (5) method has a result.
    3.2 Get the ensemble result of each transcript for every combination.
    3.3 Output the ensemble array.
"""
import itertools
import os


def get_ci(k, n, isbase0=True):
    """Yield every k-element combination of n indexes in lexicographic order.

    Example: ``get_ci(2, 4, isbase0=False)`` yields
    (1, 2) (1, 3) (1, 4) (2, 3) (2, 4) (3, 4).

    Args:
        k: Size of each combination (must be an int).
        n: Number of items to choose from (must be an int).
        isbase0: When true the indexes run from 0 to n-1, otherwise from
            1 to n.

    Yields:
        Tuples of ``k`` strictly increasing ints.  When ``k > n`` a single
        ``None`` is yielded instead (kept for backward compatibility).  When
        ``k <= 0`` nothing is yielded.
    """
    assert isinstance(k, int), "k must be an int"
    assert isinstance(n, int), "n must be an int"
    if k > n:
        # Historical sentinel: an impossible request produces one None.
        yield None
        return
    if k <= 0:
        # k == 0 never produced a combination; negative k is treated alike
        # instead of crashing inside the generator.
        return
    indexes = range(n) if isbase0 else range(1, n + 1)
    for combination in itertools.combinations(indexes, k):
        yield combination


# Get combinatorial pair from 2 to n of n.
def get_ci_detail(n):
    """Return all 0-based index combinations of size 2, 3, ..., n of n items.

    Combinations are grouped by size (all pairs first, then all triples, ...)
    and are in lexicographic order inside each group.  The list is empty when
    ``n < 2``.
    """
    res = []
    for size in range(2, n + 1):
        res.extend(get_ci(size, n))
    return res


def is_all_01(mylst):
    """Return True when every element after the first is "0" or "1".

    The first element (index 0) is never inspected, so an empty or
    one-element sequence is accepted.
    """
    return all(value == "1" or value == "0" for value in mylst[1:])


# Make sure each line is correct.
def iter_line(infile, has_header=True, sep="\t", coll="_"):
    """Yield the split fields of each usable line of *infile*.

    Args:
        infile: Iterable of text lines.
        has_header: When true, the first line is yielded unconditionally
            (as the header) before any validation.
        sep: Field separator used to split each line.
        coll: Unused here; accepted so the signature matches iter_coding.

    Yields:
        Lists of fields.  Data rows are yielded only when they have at least
        METHODLENGTH fields and every field after the first is "0" or "1".

    Relies on the module-level METHODLENGTH (set by main) and on is_all_01.
    """
    for line in infile:
        tmp = line.strip().split(sep)
        if has_header:
            yield tmp
            has_header = False
            # NOTE: the header line is not skipped here; it goes through the
            # data-row checks below and is yielded again if it passes them.
        if len(tmp) < METHODLENGTH:
            continue
        if not is_all_01(tmp):
            continue
        yield tmp


def iter_coding(infile, res_index, has_header=True, sep="\t", coll="_"):
    """Yield, per row, the labels ``name<coll>true_cls<coll>value``.

    Args:
        infile: Iterable of text lines.
        res_index: List of column indexes whose values are labelled.
        has_header, sep: Forwarded to iter_line.
        coll: String used to collapse (join) name, class and value.

    Yields:
        A list with one label per index in *res_index*.
    """
    assert isinstance(res_index, list)
    for tmp in iter_line(infile, has_header, sep, coll):
        prefix = tmp[0] + coll + tmp[1] + coll
        # A real list (not a lazy map) so callers can index it on Python 3.
        yield [prefix + tmp[x] for x in res_index]


# Iter infile
# Tx_name act_cls method1 method2 ...(according to METHODINDEX)
def iter_methods_res(infile, has_header=True, sep="\t"):
    """Yield ``[name, actual_class, method1, method2, ...]`` for each row.

    Args:
        infile: Iterable of text lines.
        has_header: Forwarded to iter_line (defaults to the former
            hard-wired behaviour of always expecting a header).
        sep: Field separator of the input, forwarded to iter_line.

    Column 0 is the name, ACTINDEX the actual class and METHODINDEX the
    method columns; both globals are set by main.
    """
    for res in iter_line(infile, has_header, sep):
        myindex = [0, ACTINDEX]
        myindex.extend(METHODINDEX)
        # A real list so that recs[0] / recs[2:] work on Python 3 as well.
        yield [res[x] for x in myindex]


# Basic(default) rules:
# 1+1+1 --> 1
# 1+1+0 --> 0
# 1+0+0 --> 0
# 0+0+0 --> 0
def ensemble_res(pred_code, mode):
    """Merge the predictions of several methods into one value.

    Args:
        pred_code: Iterable of prediction strings ("0"/"1"), or of column
            names when *mode* is "h".  One-shot iterables are accepted.
        mode: "d" (default rule: logical AND, returns an int),
            "h" (header: names joined with "-"), or
            "v" (vote: "1" when at least half of the codes are "1",
            otherwise "0").

    Returns:
        The merged code (int for "d", str for "h" and "v").

    Raises:
        KeyError: If *mode* is not one of "d", "h", "v".
    """
    # Materialise once: len()/count() below need a sequence, and on Python 3
    # a map object would not provide them.
    pred_code = list(pred_code)
    # default
    if mode == "d":
        merge_code = 1
        for each_code in pred_code:
            merge_code = int(merge_code) and int(each_code)
    # header
    elif mode == "h":
        merge_code = "-".join(pred_code)
    # vote
    elif mode == "v":
        if len(pred_code) > (pred_code.count("1") * 2):
            merge_code = "0"
        else:
            merge_code = "1"
    else:
        raise KeyError("unknown ensemble mode: %r" % (mode,))
    return merge_code


# Get ensemble_res of every combinatorial pair.
def ensemble_ress(pred_code, mode="d"):
    """Return the merged result for every combination of methods.

    When the global TOTALONLY is true, only the combination of all methods
    is computed; otherwise one result is produced for each index tuple
    returned by get_ci_detail(len(pred_code)).
    """
    pred_code = list(pred_code)
    if TOTALONLY:
        return [ensemble_res(pred_code, mode)]
    return [
        ensemble_res([pred_code[x] for x in pair_indx], mode)
        for pair_indx in get_ci_detail(len(pred_code))
    ]


def ensemble_file(infile, outfile, sep="\t", header=True, mode="d"):
    """Write one ensemble line per input row to *outfile*.

    Args:
        infile: Iterable of input lines.
        outfile: Object with a ``write`` method.
        sep: Field separator, used for reading the input and writing the
            output.
        header: When true, the first row is treated as a header and merged
            in "h" mode (column names joined with "-").
        mode: Ensemble mode applied to data rows ("d" or "v").
    """
    # `header` is bound into the generator here, before it is cleared below.
    for recs in iter_methods_res(infile, has_header=header, sep=sep):
        name = recs[0]
        act_cls = recs[1]
        pred_codes = recs[2:]
        if header:
            ensembled_codes = ensemble_ress(pred_codes, "h")
            header = False
        else:
            ensembled_codes = ensemble_ress(pred_codes, mode)
        # Prepare outline
        # Output one line of one transcript.
        outdata = [name, act_cls]
        outdata.extend(ensembled_codes)
        outfile.write(sep.join(map(str, outdata)) + "\n")


def main(argv):
    """Command-line entry point; *argv* is the full argument vector.

    Parses the options, publishes them through the module-level globals
    that the helper functions read, and runs ensemble_file.
    """
    import argparse
    # Imported locally so main() also works when this module is imported
    # rather than executed as a script.
    import sys

    global METHODNAME, METHODINDEX, METHODLENGTH, HasHeader, ACTINDEX, TOTALONLY

    parser = argparse.ArgumentParser(description="Get prediction overlap information of name_coding file")
    parser.add_argument('infile', nargs='?', help="Array file, \"-\" for stdin")
    parser.add_argument('-o', '--outfile', nargs='?', help="output file", default=sys.stdout, type=argparse.FileType('w'))
    parser.add_argument('-n', '--method_name', nargs='?', default="CNCI,CPAT,CPC,hmmscan,PLEK", help="Methods names.")
    parser.add_argument('-i', '--method_index', nargs='?', default="2,3,4,5,9", help="Column index of methods.")
    parser.add_argument('-m', '--mode', nargs='?', default="d", choices=["d", "v"], help="Ensemble mode.")
    parser.add_argument('-l', '--row_length', nargs='?', default=10, type=int, help="Max row length, useful for filter out incomplete predicted Tx.")
    parser.add_argument('-a', '--act_index', nargs='?', default=1, type=int, help="Column index of transcripts actual class.")
    parser.add_argument('-c', '--coding_nonc', default=False, action='store_true', help="Output coding, noncoding, all 3 name_coding files")
    parser.add_argument('--has_header', default=True, action='store_false', help="if file hasn't header,set this option.")
    parser.add_argument('--total-only', action='store_true', help="Set this option to aviod computing every combination results while only output all.(Huge method_index needed)")
    parser.add_argument('-s', '--sep', nargs='?', default="\t")
    args = parser.parse_args(argv[1:])

    if args.infile is None:
        # Previously this fell through to open(None) and a TypeError.
        parser.error("infile is required (use \"-\" for stdin)")

    METHODNAME = args.method_name.strip().split(",")
    # A list, not a lazy map: it is re-read for every input row.
    METHODINDEX = [int(x) for x in args.method_index.strip().split(",")]
    METHODLENGTH = args.row_length
    HasHeader = args.has_header
    ACTINDEX = args.act_index
    TOTALONLY = args.total_only

    from_stdin = args.infile == '-'
    infile = sys.stdin if from_stdin else open(args.infile)
    try:
        ensemble_file(
            infile,
            args.outfile,
            # "-s" given without a value yields None; fall back to a tab.
            sep=args.sep or "\t",
            header=HasHeader,
            mode=args.mode,
        )
    finally:
        if not from_stdin:
            infile.close()


if __name__ == '__main__':
    import sys
    main(sys.argv)