Libraries and software:
cgat (Computational Genomics Analysis Tools)
EMBOSS 6.6.0.0
Stringtie 1.3.1c
Gffcompare v0.10.6
# Add scripts/base to PYTHONPATH first.
# "cmd" is the file recording the commands.
scripts/
prepare_data/ # scripts for generating testing data from source data
1.Golden_data/ # For golden data
cmd
prepare_gencode.sh
prepare_refSeq.sh
random5k.sh
2.Mitrans/ # For mitrans data
cmd
prepare_mitrans.sh
random5k.sh
3.Erroneous_tx/ # For simulation data
# The core script for simulation can be found at “../../../../data/processing_data/Simu/[species]/[depth]/simu.R”
human/ # Scripts for simulating reads and assembling transcripts from golden_human.
cmd
asscmd
assembly.sh
mouse/ # Scripts for simulating reads and assembling transcripts from golden_mouse.
cmd
asscmd
assembly.sh
cmd
tracking_split.py # Splitting the tracking file generated by "gffcompare" according to its class code (illustrative sketch at the end of this file).
write_mt.R # Generating the matrix required by polyester for simulation.
4.Joint_prediction/ # Joint prediction depends on the prediction results of sections "1.Golden_data" and "2.Mitrans". Scripts for joint prediction can be found at "../interpret/coding2array/".
5.Different_species/ # For different species data
cmd
mk_all_data.sh # Pooling data downloaded from EnsemblPlants, Ensembl, NONCODE and RefSeq.
data_prepare.py # Encode transcripts according to their source and classification.
filter_fasta.py # Extracting sequences according to a list of transcript names (illustrative sketch at the end of this file).
split_fasta.py # Splitting a FASTA file into several smaller FASTA files.
subRNE.py # Sampling sequences (outputs the names of the sampled sequences).
core_spe_code.txt # The codes for the core species.
core_spe.txt # The names of the core species.
peri_spe_code.txt # The codes for the periphery species.
peri_spe.txt # The names of the periphery species.
species_name2code.table # Table mapping species names to codes.
txclass2code.table # Table mapping sequence types to codes.
6.Real_data/ # Assemble transcripts from sequencing data.
cmd
cmd_rt # Commands for rainbow trout.
cmd_sh # Commands for seahorse.
assembly_SER.sh # Assemble transcripts from single-end reads for rainbow trout.
assembly_PER.sh # Assemble transcripts from paired-end reads for seahorse.
mk_real_data.sh # Generate data for testing.
7.Misc/ # Scripts for preparing testing data.
illegal_filter.py # Filter out illegal sequences that contain characters other than A, T, C, G, filter out short sequences (<200 bp), and output one-line-format FASTA (illustrative sketch at the end of this file).
illegal_filter2.py # Trim sequence names after running illegal_filter.
subset_fasta.py # Sampling sequences without replacement using Python's random module (illustrative sketch at the end of this file).
genome_convert/ # Convert the human GTF file from genome version 38 (hg38) to version 19 (hg19).
ncbi_chr.py # Renaming chromosomes in a FASTA file, e.g. changing "NC_000010.11" to "chr10" for the human genome.
del_multi_chromosome_transcripts.py # Deleting chromosome lines in a GTF file.
gtf2COME.py # Transforming a GTF file for use with COME (specifically for the MiTranscriptome file).
prediction/ # Scripts for performing predictions.
cmd
randome5k_predict.sh # Prediction for golden data (part 1) and mitrans (part 2).
simu_predict.sh # Prediction for simulation data (part 3).
smallRNE_predict.sh # Prediction for data of different species (part 5).
realdata_predict.sh # Prediction for transcripts assembled from real sequencing data (part 6).
predict_fa.sh # Prediction for models that only need FASTA file as input.
predict_gtf.sh # Prediction for models that need GTF file as input (human hg38 genome).
predict_gtf_hg19.sh # Prediction for models that need GTF file as input (human hg19 genome).
interpret/ # Scripts for reorganizing the prediction results to a uniform format.
res2coding/ # Interpreting the result files produced by the different tools into the coding format.
coding2array/ # Organizing coding-format files into array files.
cmd
array2ensemble.sh (vote and rough) # Performing joint prediction.
mcn_for_ens.sh # Statistical analysis for joint prediction.
arrayStat.py # Script used for statistical analysis.
mkResArray.sh # Creating array files for golden_data (part 1), mitrans (part 2), data from different species (part 5) and real data (part 6).
mkResArray1.sh # Creating array files for transcripts assembled from simulated data (part 3).
filter_line.py # Filtering out results for transcripts that some tools did not classify (illustrative sketch at the end of this file).
base/ # Some supporting scripts
filter.py # Script for basic file manipulation.
annot_from_file.py # Script for basic file manipulation.
bio/
base.py # Providing basic classes and functions.
seq/
base.py # Providing basic classes and functions.
data/
confs/ # Pre-built models
train_data/ # Training data (used only by FEELnc).
source_data/ # Data downloaded from the databases.
processing_data/ # Intermediate files for generating testing data.
Simu/ # Intermediate files for simulating RNA-seq data with transcripts in golden data and assembling transcripts from the simulated RNA-seq data.
Real_data/ # Transcripts assembled from real RNA-seq data.
Different_species/
testing_data/ # Data used for testing the performance of the lncRNA identification tools.
predictions/ # Prediction results given by the tools.
codingf/ # Coding files generated from prediction results.
arrays/ # Array files generated from coding files.
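
The sketches below are simplified, illustrative re-implementations of a few of the helper scripts listed above. They reflect our assumptions about the file layouts involved and use hypothetical function and file names; they are not the scripts themselves.

tracking_split.py: gffcompare writes a .tracking file in which, in the standard layout, the fourth tab-separated column holds the class code, so splitting by class code is a per-line dispatch on that column. A minimal sketch, assuming that layout:

    #!/usr/bin/env python
    # Sketch: split a gffcompare .tracking file into one file per class code.
    # Assumes the standard layout with the class code in column 4.
    import sys

    def split_tracking(tracking_path, out_prefix):
        handles = {}
        with open(tracking_path) as fin:
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if len(fields) < 4:
                    continue
                code = fields[3]  # gffcompare class code, e.g. "=", "u", "x"
                if code not in handles:
                    handles[code] = open("{}.{}.tracking".format(out_prefix, code), "w")
                handles[code].write(line)
        for fh in handles.values():
            fh.close()

    if __name__ == "__main__":
        split_tracking(sys.argv[1], sys.argv[2])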
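filter_fasta.py: keeping only the FASTA records whose transcript names appear in a name list is set-membership filtering on the headers. A minimal sketch; treating the first whitespace-separated token of the header as the transcript name is an assumption:

    #!/usr/bin/env python
    # Sketch: extract FASTA records whose names appear in a list (one name per line).
    import sys

    def extract_by_name(fasta_path, names_path, out_path):
        with open(names_path) as fh:
            wanted = {line.strip() for line in fh if line.strip()}
        keep = False
        with open(fasta_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                if line.startswith(">"):
                    tokens = line[1:].split()
                    name = tokens[0] if tokens else ""  # first header token (assumed)
                    keep = name in wanted
                if keep:
                    fout.write(line)

    if __name__ == "__main__":
        extract_by_name(sys.argv[1], sys.argv[2], sys.argv[3])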
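illegal_filter.py: the filtering rule described above (only A/T/C/G, length of at least 200 bp, one sequence per line in the output) can be sketched as follows:

    #!/usr/bin/env python
    # Sketch: drop sequences containing characters other than A/T/C/G, drop
    # sequences shorter than 200 bp, and write one-line-per-sequence FASTA.
    import sys

    def illegal_filter(in_path, out_path, min_len=200):
        def emit(header, seq, out):
            if len(seq) >= min_len and set(seq.upper()) <= set("ATCG"):
                out.write(header + "\n" + seq + "\n")
        header, seq = None, []
        with open(in_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                line = line.rstrip("\n")
                if line.startswith(">"):
                    if header is not None:
                        emit(header, "".join(seq), fout)
                    header, seq = line, []
                else:
                    seq.append(line)
            if header is not None:
                emit(header, "".join(seq), fout)

    if __name__ == "__main__":
        illegal_filter(sys.argv[1], sys.argv[2])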
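subset_fasta.py: sampling records without replacement with Python's random module amounts to random.sample over the parsed records. A minimal sketch; the default of 5000 records mirrors the "random5k" naming but is only a placeholder here:

    #!/usr/bin/env python
    # Sketch: sample N FASTA records without replacement using random.sample.
    import random
    import sys

    def read_fasta(path):
        header, seq = None, []
        with open(path) as fh:
            for line in fh:
                line = line.rstrip("\n")
                if line.startswith(">"):
                    if header is not None:
                        yield header, "".join(seq)
                    header, seq = line, []
                else:
                    seq.append(line)
            if header is not None:
                yield header, "".join(seq)

    def sample_fasta(in_path, out_path, n=5000, seed=None):
        records = list(read_fasta(in_path))
        if seed is not None:
            random.seed(seed)
        chosen = random.sample(records, min(n, len(records)))  # without replacement
        with open(out_path, "w") as out:
            for header, seq in chosen:
                out.write(header + "\n" + seq + "\n")

    if __name__ == "__main__":
        sample_fasta(sys.argv[1], sys.argv[2])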
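filter_line.py: dropping transcripts that not every tool classified amounts to keeping only the rows of an array file that contain no missing-value marker. A minimal sketch; the tab-separated layout and the "NA" marker are assumptions, not taken from the repository:

    #!/usr/bin/env python
    # Sketch: keep only rows of an array file in which every tool gave a call.
    # The tab-separated layout and the "NA" missing marker are assumptions.
    import sys

    def filter_complete_rows(in_path, out_path, missing="NA"):
        with open(in_path) as fin, open(out_path, "w") as fout:
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if missing not in fields:
                    fout.write(line)

    if __name__ == "__main__":
        filter_complete_rows(sys.argv[1], sys.argv[2])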