nextflow_schema.json

{
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
    "title": ". pipeline parameters",
    "description": "",
    "type": "object",
    "defs": {
        "input_output_options": {
            "title": "input_output_options",
            "type": "object",
            "description": "Define where the pipeline should find input data and save output data.",
            "default": "",
            "properties": {
                "input": {
                    "type": "string",
                    "default": "/nfs/production/rdf/metagenomics/users/vangelis/plp_flatfiles_pgsql_2/sequence_explorer_protein.csv.bz2",
                    "description": "Main input data for the pipeline: either the sequence_explorer_protein CSV from MGnify proteins, or a FASTA file when fasta_input_mode is set to true."
                },
                "fasta_input_mode": {
                    "type": "boolean",
                    "default": false,
                    "description": "If this flag is set, the pipeline starts from a fasta-file, without executing the preprocessing steps required for the sequence_explorer_protein file from MGnify proteins."
                },
                "esm_conda_path": {
                    "type": "string",
                    "default": "/hps/nobackup/rdf/metagenomics/service-team/users/vangelis/miniconda3/envs/esmfold_gpu",
                    "description": "Path to the conda environment with ESMFold (used in predict_structures)"
                },
                "db_config_file": {
                    "type": "string",
                    "default": "/nfs/production/rdf/metagenomics/users/vangelis/mgnifams/assets/db_config.ini",
                    "description": "Path to the configuration file holding the secrets for the sqlite db"
                },
                "db_schema_file": {
                    "type": "string",
                    "default": "/hps/nobackup/rdf/metagenomics/service-team/users/vangelis/mgnifams/assets/db_schema.sqlite",
                    "description": "Path to the file with the schema for the sqlite db"
                },
                "outdir": {
                    "type": "string",
                    "default": "/nfs/production/rdf/metagenomics/users/vangelis/mgnifams/data/output",
                    "description": "The output directory where the results will be saved."
                }
            },
            "required": [
                "input",
                "esm_conda_path",
                "outdir"
            ]
        },
        "module_parameters": {
            "title": "module_parameters",
            "type": "object",
            "description": "Input values for modules",
            "default": "",
            "properties": {
                "compress_mode": {
                    "type": "string",
                    "default": "none",
                    "description": "compression mode of starting CSV input"
                },
                "min_sequence_length": {
                    "type": "integer",
                    "default": 100,
                    "description": "Sequence (and slice) length threshold to continue to clustering"
                },
                "linclust_seq_identity": {
                    "type": "number",
                    "default": 0.5,
                    "description": "mmseqs/linclust parameter for minimum sequence identity"
                },
                "linclust_coverage": {
                    "type": "number",
                    "default": 0.9,
                    "description": "mmseqs/linclust parameter for minimum sequence coverage ratio"
                },
                "linclust_cov_mode": {
                    "type": "integer",
                    "default": 0,
                    "description": "mmseqs/linclust parameter for coverage mode: 0 for both, 1 for target and 2 for query sequence"
                },
                "input_csv_chunk_size": {
                    "type": "integer",
                    "default": 50000000,
                    "description": "Chunk size for parallel slicing and filtering of sequence space"
                },
                "minimum_members": {
                    "type": "integer",
                    "default": 50,
                    "description": "Minimum number of members a cluster must have to continue with family generation"
                },
                "num_cluster_chunks": {
                    "type": "integer",
                    "description": "Number of chunks the linclust clustering file will be divided into for parallel family generation"
                },
                "deeptmhmm_chunk_size": {
                    "type": "integer",
                    "default": 100,
                    "description": "Chunk size for parallel transmembrane (TM) prediction"
                },
                "tm_fraction_threshold": {
                    "type": "number",
                    "description": "Predicted transmembrane fraction of total length allowed for families"
                },
                "redundant_threshold": {
                    "type": "number",
                    "default": 0.95,
                    "description": "AA Jaccard Index threshold to deem two families similar, hence removing one as redundant"
                },
                "similarity_threshold": {
                    "type": "number",
                    "default": 0.5,
                    "description": "Family Jaccard Index threshold to deem two families similar, requiring further investigation at AA level"
                },
                "db_name": {
                    "type": "string",
                    "default": "pfam",
                    "description": "Name of the database used in hhblits/hhsearch"
                },
                "hh_mode": {
                    "type": "string",
                    "default": "hhblits",
                    "description": "hhblits (fast) or hhsearch (sensitive)"
                },
                "pdb_chunk_size": {
                    "type": "integer",
                    "default": 50,
                    "description": "Chunk size for parallel structure prediction"
                },
                "pdb_chunk_size_long": {
                    "type": "integer",
                    "default": 10,
                    "description": "Chunk size for parallel structure prediction for long sequences"
                },
                "compute_mode": {
                    "type": "string",
                    "default": "gpu",
                    "description": "GPU or CPU mode, depending on sequence length"
                }
            }
        },
        "reference_databases": {
            "title": "reference_databases",
            "type": "object",
            "description": "Paths to downloaded databases used by the pipeline.",
            "default": "",
            "properties": {
                "hhdb_folder_path": {
                    "type": "string",
                    "description": "Path to the HH-suite database folder (used in annotate_models)",
                    "default": "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/hh/pfamA_35.0"
                },
                "foldseek_db_path": {
                    "type": "string",
                    "description": "Path to the Foldseek database (used in annotate_structures)",
                    "default": "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/foldseek/8-ef4e960"
                }
            },
            "required": [
                "hhdb_folder_path",
                "foldseek_db_path"
            ]
        },
        "generic_options": {
            "title": "generic_options",
            "type": "object",
            "description": "General",
            "default": "",
            "properties": {
                "help": {
                    "type": "boolean",
                    "hidden": true,
                    "description": "Display help text."
                },
                "publish_dir_mode": {
                    "type": "string",
                    "default": "copy",
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
                    "enum": [
                        "symlink",
                        "rellink",
                        "link",
                        "copy",
                        "copyNoFollow",
                        "move"
                    ],
                    "hidden": true
                },
                "validate_params": {
                    "type": "boolean",
                    "description": "Boolean whether to validate parameters against the schema at runtime",
                    "default": true,
                    "fa_icon": "fas fa-check-square",
                    "hidden": true
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/defs/input_output_options"
        },
        {
            "$ref": "#/defs/module_parameters"
        },
        {
            "$ref": "#/defs/reference_databases"
        },
        {
            "$ref": "#/defs/generic_options"
        }
    ]
}