Libraries and software:
cgat (Computational Genomics Analysis Tools)
EMBOSS 6.6.0.0
Stringtie 1.3.1c
Gffcompare v0.10.6
# Add scripts/base to PYTHONPATH first.
# "cmd" is the file recording the commands.
scripts/
prepare_data/ # scripts for generating testing data from source data
1.Golden_data/ # For golden data
cmd
prepare_gencode.sh
prepare_refSeq.sh
random5k.sh
2.Mitrans/ # For mitrans data
cmd
prepare_mitrans.sh
random5k.sh
3.Erroneous_tx/ # For simulation data
# The core script for simulation can be found at “../../../../data/processing_data/Simu/[species]/[depth]/simu.R”
human/ # Scripts for simulating reads and assembling transcripts from golden_human.
cmd
asscmd
assembly.sh
mouse/ # Scripts for simulating reads and assembling transcripts from golden_mouse.
cmd
asscmd
assembly.sh
cmd
tracking_split.py # Splitting the tracking file generated by "gffcompare" according to its class code (illustrative sketch at the end of this file).
write_mt.R # Generating the matrix required by polyester for simulation.
4.Joint_prediction/ # Joint prediction depends on the prediction results of sections "1.Golden_data" and "2.Mitrans". Scripts for joint prediction can be found at "../interpret/coding2array/".
5.Different_species/ # For different species data
cmd
mk_all_data.sh # Pooling data downloaded from EnsemblPlants, Ensembl, NONCODE and RefSeq.
data_prepare.py # Encode transcripts according to their source and classification.
filter_fasta.py # Extracting sequences according to a list of transcript names (illustrative sketch at the end of this file).
split_fasta.py # Splitting a FASTA file into several smaller FASTA files.
subRNE.py # Sampling sequences (outputs the names of the sampled sequences).
core_spe_code.txt # The codes for the core species.
core_spe.txt # The names of the core species.
peri_spe_code.txt # The codes for the periphery species.
peri_spe.txt # The names of the periphery species.
species_name2code.table # Table mapping species names to codes.
txclass2code.table # Table mapping sequence types to codes.
6.Real_data/ # Assemble transcripts from sequencing data.
cmd
cmd_rt # Commands for rainbow trout.
cmd_sh # Commands for seahorse.
assembly_SER.sh # Assemble transcripts from single-end reads for rainbow trout.
assembly_PER.sh # Assemble transcripts from paired-end reads for seahorse.
mk_real_data.sh # Generate data for testing.
7.Misc/ # Scripts for preparing testing data.
illegal_filter.py # Filter out illegal sequences that contain characters other than A, T, C, G, filter out short sequences (<200 bp), and output one-line-format FASTA (illustrative sketch at the end of this file).
illegal_filter2.py # Trim sequence names after running illegal_filter.
subset_fasta.py # Sampling sequences without replacement using Python's random module (illustrative sketch at the end of this file).
genome_convert/ # Convert the human GTF file from genome version 38 (hg38) to version 19 (hg19).
ncbi_chr.py # Renaming chromosomes in a FASTA file, e.g. changing "NC_000010.11" to "chr10" for the human genome.
del_multi_chromosome_transcripts.py # Deleting chromosome lines in a GTF file.
gtf2COME.py # Transforming a GTF file for use with COME (specifically for the MiTranscriptome file).
prediction/ # Scripts for performing predictions.
cmd
randome5k_predict.sh # Prediction for golden data (part 1) and mitrans (part 2).
simu_predict.sh # Prediction for simulation data (part 3).
smallRNE_predict.sh # Prediction for data of different species (part 5).
realdata_predict.sh # Prediction for transcripts assembled from real sequencing data (part 6).
predict_fa.sh # Prediction for models that only need FASTA file as input.
predict_gtf.sh # Prediction for models that need GTF file as input (human hg38 genome).
predict_gtf_hg19.sh # Prediction for models that need GTF file as input (human hg19 genome).
interpret/ # Scripts for reorganizing the prediction results to a uniform format.
res2coding/ # Interpreting the result files produced by the different tools into the coding format.
coding2array/ # Organizing coding-format files into array files.
cmd
array2ensemble.sh (vote and rough) # Performing joint prediction.
mcn_for_ens.sh # Statistical analysis for joint prediction.
arrayStat.py # Script used for statistical analysis.
mkResArray.sh # Creating array files for golden_data (part 1), mitrans (part 2), data from different species (part 5) and real data (part 6).
mkResArray1.sh # Creating array files for transcripts assembled from simulated data (part 3).
filter_line.py # Filtering out results for transcripts that some tools did not classify (illustrative sketch at the end of this file).
base/ # Some supporting scripts
filter.py # Script for basic file manipulation.
annot_from_file.py # Script for basic file manipulation.
bio/
base.py # Providing basic classes and functions.
seq/
base.py # Providing basic classes and functions.
data/
confs/ # Pre-built models
train_data/ # Training data (used only by FEELnc).
source_data/ # Data downloaded from the databases.
processing_data/ # Intermediate files for generating testing data.
Simu/ # Intermediate files for simulating RNA-seq data with transcripts in golden data and assembling transcripts from the simulated RNA-seq data.
Real_data/ # Transcripts assembled from real RNA-seq data.
Different_species/
testing_data/ # Data used for testing the performance of the lncRNA identification tools.
predictions/ # Prediction results given by the tools.
codingf/ # Coding files generated from prediction results.
arrays/ # Array files generated from coding files.
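
The sketches below are simplified, illustrative re-implementations of a few of the helper scripts listed above. They reflect our assumptions about the file layouts involved and use hypothetical function and file names; they are not the scripts themselves.

tracking_split.py: gffcompare writes a .tracking file in which, in the standard layout, the fourth tab-separated column holds the class code, so splitting by class code is a per-line dispatch on that column. A minimal sketch, assuming that layout:

    #!/usr/bin/env python
    # Sketch: split a gffcompare .tracking file into one file per class code.
    # Assumes the standard layout with the class code in column 4.
    import sys

    def split_tracking(tracking_path, out_prefix):
        handles = {}
        with open(tracking_path) as fin:
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if len(fields) < 4:
                    continue
                code = fields[3]  # gffcompare class code, e.g. "=", "u", "x"
                if code not in handles:
                    handles[code] = open("{}.{}.tracking".format(out_prefix, code), "w")
                handles[code].write(line)
        for fh in handles.values():
            fh.close()

    if __name__ == "__main__":
        split_tracking(sys.argv[1], sys.argv[2])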
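filter_fasta.py: keeping only the FASTA records whose transcript names appear in a name list is set-membership filtering on the headers. A minimal sketch; treating the first whitespace-separated token of the header as the transcript name is an assumption:

    #!/usr/bin/env python
    # Sketch: extract FASTA records whose names appear in a list (one name per line).
    import sys

    def extract_by_name(fasta_path, names_path, out_path):
        with open(names_path) as fh:
            wanted = {line.strip() for line in fh if line.strip()}
        keep = False
        with open(fasta_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                if line.startswith(">"):
                    tokens = line[1:].split()
                    name = tokens[0] if tokens else ""  # first header token (assumed)
                    keep = name in wanted
                if keep:
                    fout.write(line)

    if __name__ == "__main__":
        extract_by_name(sys.argv[1], sys.argv[2], sys.argv[3])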
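illegal_filter.py: the filtering rule described above (only A/T/C/G, length of at least 200 bp, one sequence per line in the output) can be sketched as follows:

    #!/usr/bin/env python
    # Sketch: drop sequences containing characters other than A/T/C/G, drop
    # sequences shorter than 200 bp, and write one-line-per-sequence FASTA.
    import sys

    def illegal_filter(in_path, out_path, min_len=200):
        def emit(header, seq, out):
            if len(seq) >= min_len and set(seq.upper()) <= set("ATCG"):
                out.write(header + "\n" + seq + "\n")
        header, seq = None, []
        with open(in_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                line = line.rstrip("\n")
                if line.startswith(">"):
                    if header is not None:
                        emit(header, "".join(seq), fout)
                    header, seq = line, []
                else:
                    seq.append(line)
            if header is not None:
                emit(header, "".join(seq), fout)

    if __name__ == "__main__":
        illegal_filter(sys.argv[1], sys.argv[2])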
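subset_fasta.py: sampling records without replacement with Python's random module amounts to random.sample over the parsed records. A minimal sketch; the default of 5000 records mirrors the "random5k" naming but is only a placeholder here:

    #!/usr/bin/env python
    # Sketch: sample N FASTA records without replacement using random.sample.
    import random
    import sys

    def read_fasta(path):
        header, seq = None, []
        with open(path) as fh:
            for line in fh:
                line = line.rstrip("\n")
                if line.startswith(">"):
                    if header is not None:
                        yield header, "".join(seq)
                    header, seq = line, []
                else:
                    seq.append(line)
            if header is not None:
                yield header, "".join(seq)

    def sample_fasta(in_path, out_path, n=5000, seed=None):
        records = list(read_fasta(in_path))
        if seed is not None:
            random.seed(seed)
        chosen = random.sample(records, min(n, len(records)))  # without replacement
        with open(out_path, "w") as out:
            for header, seq in chosen:
                out.write(header + "\n" + seq + "\n")

    if __name__ == "__main__":
        sample_fasta(sys.argv[1], sys.argv[2])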
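filter_line.py: dropping transcripts that not every tool classified amounts to keeping only the rows of an array file that contain no missing-value marker. A minimal sketch; the tab-separated layout and the "NA" marker are assumptions, not taken from the repository:

    #!/usr/bin/env python
    # Sketch: keep only rows of an array file in which every tool gave a call.
    # The tab-separated layout and the "NA" missing marker are assumptions.
    import sys

    def filter_complete_rows(in_path, out_path, missing="NA"):
        with open(in_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if missing not in fields:
                    fout.write(line)

    if __name__ == "__main__":
        filter_complete_rows(sys.argv[1], sys.argv[2])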