run_finder

#! /usr/bin/env python3

import argparse
import os
import shlex
import shutil
import sys
from argparse import RawTextHelpFormatter


def parseCommandLineArguments():
    """
    Parses the command line of run_finder

    Reads sys.argv through argparse. The help text is rendered with
    RawTextHelpFormatter so that the line breaks of the --checkpoint
    description are preserved.

    Returns the argparse.Namespace holding one attribute per option.
    argparse terminates the process with a usage message when a required
    argument is missing or a value is invalid.
    """
    parser = argparse.ArgumentParser( prog = "run_finder", description = "Generates gene annotation from RNA-Seq data", formatter_class = RawTextHelpFormatter )
    parser.add_argument( '--version', action = 'version', version = '%(prog)s-v1.1.0' )

    required_named = parser.add_argument_group( 'Required arguments' )
    optional_named = parser.add_argument_group( 'Optional arguments' )

    # Mandatory arguments
    required_named.add_argument( "--metadatafile", "-mf", help = "Please enter the name of the metadata file. Enter 0 in the last column of those samples which you wish to skip processing. The columns should represent the following in order --> BioProject, SRA Accession, Tissues, Description, Date, Read Length, Ended (PE or SE), RNA-Seq, process, Location. If the sample is skipped it will not be downloaded. Leave the directory path blank if you are downloading the samples. In the end of the run the program will output a csv file with the directory path filled out. Please check the provided csv file for more information on how to configure the metadata file. ", required = True )
    required_named.add_argument( "--output_directory", "-out_dir", help = "Enter the name of the directory where all other operations will be performed", required = True )
    required_named.add_argument( "--genome", "-g", help = "Enter the SOFT-MASKED genome file of the organism", required = True )
    required_named.add_argument( "--organism_model", "-om", help = "Enter the type of organism", choices = ["VERT", "INV", "PLANTS", "FUNGI"] , required = True )
    required_named.add_argument( "--genemark_path", "-gm", help = "Enter the path to genemark" , required = True )
    required_named.add_argument( "--genemark_license", "-gml", help = "Enter the licence file. Please make sure your license file is less than 365 days old"  , required = True )

    # Optional arguments
    # The CPU count is validated as an integer here because it is later
    # interpolated into the container command line
    optional_named.add_argument( "--cpu", "-n", help = "Enter the number of CPUs to be used.", default = 1, type = int )
    optional_named.add_argument( "--genome_dir_star", "-gdir_star", help = "Please enter the location of the genome index directory of STAR" )
    optional_named.add_argument( "--genome_dir_olego", "-gdir_olego", help = "Please enter the location of the genome index directory of OLego" )
    optional_named.add_argument( "--verbose", "-verb", default = 1, help = "Enter a verbosity level" )
    optional_named.add_argument( "--protein", "-p", help = "Enter the protein fasta" )
    optional_named.add_argument( "--no_cleanup", "-no_cleanup", help = "Provide this option if you do not wish to remove any intermediate files. Please note that this will NOT remove any files and might take up a large amount of space", action = "store_true" )
    optional_named.add_argument( "--preserve_raw_input_data", "-preserve", help = "Set this argument if you want to preserve the raw fastq files. All other temporary files will be removed. These fastq files can be later used. ", action = "store_true" )
    optional_named.add_argument( "--checkpoint", "-c", help = """Enter a value if you wish to restart operations from a certain check point. Please note if you have new RNA-Seq samples, then FINDER will override this argument and computation will take place from read alignment. If there are missing data in any step then also FINDER will enforce restart of operations from a previous checkpoint. For example, if you wish to run assembly on samples for which alignments are not available then FINDER will readjust this value and set it to 1.
    1. Align reads to reference genome (Will trigger removal of all alignments and start from beginning)
    2. Assemble with PsiCLASS (Will remove all assemblies)
    3. Find genes with FINDER (entails changepoint detection)
    4. Predict genes using BRAKER2 (Will remove previous results of gene predictions with BRAKER2)
    5. Annotate coding regions
    6. Merge FINDER annotations with BRAKER2 predictions and protein sequences
    """, default = 0, type = int )
    optional_named.add_argument( "--perform_post_completion_data_cleanup", "-pc_clean", help = "Set this field if you wish to clean up all the intermediate files after the completion of the execution. If this operation is requested prior to generation of all the important files then it will be ignored and finder will proceed to annotate the genome. ", action = "store_true" )
    optional_named.add_argument( "--run_tests", "-rt", help = "Modify behaviour of finder to accelerate tests. This will reduce the downloaded fastq files to a bare minimum and also check the other installations", action = "store_true" )
    # These three options have no short alias, so the option string is given once
    optional_named.add_argument( "--addUTR", help = "Turn on this option if you wish BRAKER to add UTR sequences", action = "store_true" )
    optional_named.add_argument( "--skip_cpd", help = "Turn on this option to skip changepoint detection. Could be effective for grasses", action = "store_true" )
    optional_named.add_argument( "--exonerate_gff3", "-egff3", help = "Enter the exonerate output in gff3 format" )
    optional_named.add_argument( "--star_shared_mem", help = "Turn on this option if you want STAR to load the genome index into shared memory. This saves memory if multiple finder runs are executing on the same host, but might not work in your cluster environment.", action = "store_true" )
    optional_named.add_argument( "--framework", "-fm", help = "Enter your choice of framework", choices = ["docker", "singularity"], default = "docker" )

    # optional_named.add_argument("--intron_gff3","-intron_gff3",help="Enter the name and location of the file containing introns in gff3 format")
    # optional_named.add_argument("--ground_truth_gtf","-gt_gtf",help="Enter the gtf filename of the actual annotation [for developmental purposes]")
    # optional_named.add_argument("--error_correct_reads","-ecr",help="Set this argument if you wish to perform error corrections using Rcorrector. Please note that setting this option does not guarantee correction. Short read error correction is a time consuming task. Hence, only those samples will be error corrected which have a low mapping rate. Please refer to page no. <> of the manual for more details. ",action="store_true")

    return parser.parse_args()


def runDockerCommand( name, version, image_location, container_name, volumes, command , cpus = 1, memory = '1g' ):
    """
    Runs the command in a docker container

    name: not used when building the command
    version: tag of the image to run
    image_location: repository of the image to run
    container_name: value for --name; the flag is omitted when this is empty
    volumes: iterable of "host_path:container_path" bind mounts
    command: shell text appended verbatim after `bash -c`. It is NOT quoted
             here, so the caller must supply it already wrapped in quotes
    cpus: value for --cpus
    memory: value for --memory

    Returns the value reported by os.system for the docker invocation
    """
    # Every value except `command` is shell-quoted so that paths containing
    # spaces or shell metacharacters reach docker as a single argument
    docker_cmd_parts = [
        "docker run",
        # "-ti" is currently disabled
        "--rm",
        f"--cpus={shlex.quote( str( cpus ) )}",
        f"--memory={shlex.quote( str( memory ) )}",
    ]
    # An empty name would make docker consume the next flag as the name
    if container_name:
        docker_cmd_parts.append( f"--name {shlex.quote( str( container_name ) )}" )
    docker_cmd_parts.extend( f"-v {shlex.quote( str( mapping ) )}" for mapping in volumes )
    docker_cmd_parts.append( shlex.quote( f"{image_location}:{version}" ) )
    docker_cmd_parts.append( f"bash -c {command}" )
    return os.system( " ".join( docker_cmd_parts ) )


def runSingularityCommand( name, version, image_location, container_name, volumes, command , cpus = 1, memory = '1g' ):
    """
    Runs the command in a Singularity container

    name, container_name, cpus, memory: not used when building the command;
        the signature mirrors runDockerCommand
    version: tag of the docker image to execute
    image_location: repository of the docker image to execute
    volumes: iterable of "host_path:container_path" bind mounts
    command: shell text appended verbatim after `bash -c`. It is NOT quoted
             here, so the caller must supply it already wrapped in quotes

    Returns the value reported by os.system for the singularity invocation
    """
    # Every value except `command` is shell-quoted so that paths containing
    # spaces or shell metacharacters reach singularity as a single argument
    singularity_cmd_parts = ["singularity exec --disable-cache"]
    singularity_cmd_parts.extend( f"-B {shlex.quote( str( mapping ) )}" for mapping in volumes )
    singularity_cmd_parts.append( shlex.quote( f"docker://{image_location}:{version}" ) )
    singularity_cmd_parts.append( f"bash -c {command}" )
    return os.system( " ".join( singularity_cmd_parts ) )


def _hostDirectoryOf( file_path ):
    """
    Returns the absolute path of the directory that contains file_path
    """
    return os.path.dirname( os.path.abspath( file_path ) )


def _buildFinderInvocation( options ):
    """
    Builds the finder command line and the bind mounts it requires

    All paths are made absolute: the mappings use the same path on the host
    and in the container, and a relative path has no parent directory
    component to mount.

    options: the namespace returned by parseCommandLineArguments

    Returns a tuple (cmd, volumes_list). cmd is the finder command already
    wrapped in double quotes, as expected after `bash -c` by the run*Command
    functions. volumes_list holds unique "path:path" mappings.
    """
    output_directory = os.path.abspath( options.output_directory )
    metadatafile = os.path.abspath( options.metadatafile )
    genome = os.path.abspath( options.genome )
    genemark_path = os.path.abspath( options.genemark_path )
    genemark_license = os.path.abspath( options.genemark_license )

    # The metadata file must be visible inside the container as well
    mount_points = [output_directory, _hostDirectoryOf( metadatafile ), _hostDirectoryOf( genome )]

    finder_args = [
        "finder",
        f"--metadatafile {metadatafile}",
        f"--output_directory {output_directory}",
        f"--genome {genome}",
        f"--organism_model {options.organism_model}",
        f"--cpu {options.cpu}",
    ]
    if options.genome_dir_star is not None:
        genome_dir_star = os.path.abspath( options.genome_dir_star )
        finder_args.append( f"--genome_dir_star {genome_dir_star}" )
        mount_points.append( genome_dir_star )
    if options.genome_dir_olego is not None:
        genome_dir_olego = os.path.abspath( options.genome_dir_olego )
        finder_args.append( f"--genome_dir_olego {genome_dir_olego}" )
        mount_points.append( genome_dir_olego )
    finder_args.append( f"--verbose {options.verbose}" )
    if options.protein is not None:
        protein = os.path.abspath( options.protein )
        finder_args.append( f"--protein {protein}" )
        mount_points.append( _hostDirectoryOf( protein ) )
    finder_args.append( f"--checkpoint {options.checkpoint}" )

    # Boolean switches are forwarded under the same name they were given
    for flag in ( "no_cleanup", "preserve_raw_input_data", "perform_post_completion_data_cleanup", "run_tests", "addUTR", "skip_cpd", "star_shared_mem" ):
        if getattr( options, flag ):
            finder_args.append( f"--{flag}" )

    if options.exonerate_gff3 is not None:
        exonerate_gff3 = os.path.abspath( options.exonerate_gff3 )
        finder_args.append( f"--exonerate_gff3 {exonerate_gff3}" )
        mount_points.append( _hostDirectoryOf( exonerate_gff3 ) )

    finder_args.append( f"--genemark_path {genemark_path}" )
    mount_points.append( genemark_path )
    finder_args.append( f"--genemark_license {genemark_license}" )
    mount_points.append( _hostDirectoryOf( genemark_license ) )

    # Deduplicate while keeping a stable order
    volumes_list = []
    for directory in mount_points:
        mapping = f"{directory}:{directory}"
        if mapping not in volumes_list:
            volumes_list.append( mapping )

    # NOTE(review): the command is wrapped in double quotes, so the shell
    # started by os.system expands $(ulimit -Hn) before the container
    # starts -- confirm that the host limit is the intended one
    cmd = " \" ulimit -n $(ulimit -Hn) && " + " ".join( finder_args ) + " \" "
    return cmd, volumes_list


def main():
    """
    Entry point of run_finder

    Parses the command line, creates the output directory, picks docker or
    singularity (falling back to the other one when the requested framework
    is not on PATH), pulls the finder image and runs finder inside it.
    Exits with status 1 when neither framework is available.
    """
    if len( sys.argv ) == 1:
        print( "Please use the --help option to get usage information" )
    options = parseCommandLineArguments()

    output_directory = os.path.abspath( options.output_directory )
    os.makedirs( output_directory, exist_ok = True )

    cmd, volumes_list = _buildFinderInvocation( options )

    # shutil.which returns None when the executable is not on PATH
    docker_installed = shutil.which( "docker" ) is not None
    singularity_installed = shutil.which( "singularity" ) is not None

    if not docker_installed and not singularity_installed:
        print( "You need to have either docker or singularity installed" )
        sys.exit( 1 )

    if options.framework == "docker":
        framework_of_choice = "docker" if docker_installed else "singularity"
    else:
        framework_of_choice = "singularity" if singularity_installed else "docker"

    image_location = "sagnikbanerjee15/finder"
    version = "1.1.0"
    launch_arguments = dict( name = "finder",
                             version = version,
                             image_location = image_location,
                             # abspath dropped any trailing slash, so the basename is not empty
                             container_name = os.path.basename( output_directory ),
                             volumes = volumes_list,
                             command = cmd,
                             cpus = options.cpu,
                             memory = '300g' )

    if framework_of_choice == "docker":
        os.system( f"docker pull {image_location}:{version}" )
        runDockerCommand( **launch_arguments )
    else:
        os.system( f"singularity pull docker://{image_location}:{version}" )
        runSingularityCommand( **launch_arguments )


# Run the wrapper only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()