From 8056f57859e307da4d3f8f8dc8766728a33341e9 Mon Sep 17 00:00:00 2001 From: SwiftSeal Date: Thu, 19 Dec 2024 21:27:49 +0000 Subject: [PATCH 1/4] init thread restriction --- resistify/nlrexpress.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resistify/nlrexpress.py b/resistify/nlrexpress.py index 37d46b7..98c1704 100644 --- a/resistify/nlrexpress.py +++ b/resistify/nlrexpress.py @@ -6,6 +6,7 @@ import logging import tempfile from multiprocessing import Pool, cpu_count, get_context +from threadpoolctl import threadpool_limits import shutil import warnings from resistify.utility import log_percentage @@ -273,7 +274,8 @@ def nlrexpress_subprocess(params): matrix = np.array(matrix, dtype=float) - result = model.predict_proba(matrix) + with threadpool_limits(limits=2): + result = model.predict_proba(matrix) result_index = 0 for sequence in sequences: From 48f68aa61b2bf1fa00793a811ec363eaa7555bba Mon Sep 17 00:00:00 2001 From: SwiftSeal Date: Thu, 19 Dec 2024 21:45:50 +0000 Subject: [PATCH 2/4] add thread arg --- resistify/main.py | 10 ++++++++-- resistify/nlrexpress.py | 11 ++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/resistify/main.py b/resistify/main.py index 250f8cf..57a0722 100755 --- a/resistify/main.py +++ b/resistify/main.py @@ -68,6 +68,12 @@ def add_common_args(parser): default=None, type=int, ) + parser.add_argument( + "-t", "--threads", + help="Number of threads available for nlrexpress. Default is the number of available CPUs.", + default=None, + type=int, + ) def validate_input_file(filepath): @@ -199,7 +205,7 @@ def nlr(args, log): else: chunksize = args.chunksize - sequences = nlrexpress(sequences, "all", chunksize) + sequences = nlrexpress(sequences, "all", chunksize, args.threads) if args.coconat: log.info("Running CoCoNat to identify additional CC domains...") @@ -234,7 +240,7 @@ def prr(args, log): sequences = [sequence for sequence in sequences if sequence.is_rlp()] if len(sequences) > 0: log.info(f"{len(sequences)} PRRs identified...") - sequences = nlrexpress(sequences, "lrr", chunksize) + sequences = nlrexpress(sequences, "lrr", chunksize, args.threads) log.info("Classifying PRRs...") for sequence in sequences: diff --git a/resistify/nlrexpress.py b/resistify/nlrexpress.py index 98c1704..207d651 100644 --- a/resistify/nlrexpress.py +++ b/resistify/nlrexpress.py @@ -131,11 +131,12 @@ def parse_jackhmmer(file, iteration=False): return hmm_dict -def nlrexpress(sequences, search_type, chunk_size): - try: - threads = len(os.sched_getaffinity(0)) - except AttributeError: - threads = cpu_count() +def nlrexpress(sequences, search_type, chunk_size, threads): + if threads is None: + try: + threads = len(os.sched_getaffinity(0)) + except AttributeError: + threads = cpu_count() models = load_models(search_type) From 6a7b646d63605524e3f5564d7220fd94249c1de3 Mon Sep 17 00:00:00 2001 From: SwiftSeal Date: Thu, 19 Dec 2024 22:02:57 +0000 Subject: [PATCH 3/4] bump version and add dependency --- pyproject.toml | 3 ++- resistify/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d73138..224259b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "resistify" -version = "1.0.0" +version = "1.0.1" dependencies = [ "scikit-learn>=0.24.2", "numpy", @@ -14,6 +14,7 @@ dependencies = [ "fair-esm", "transformers", "sentencepiece", + "threadpoolctl", ] authors = [ { name="Moray Smith", email="moraysmith98@gmail.com" }, diff --git a/resistify/__version__.py b/resistify/__version__.py index 5becc17..5c4105c 100644 --- a/resistify/__version__.py +++ b/resistify/__version__.py @@ -1 +1 @@ -__version__ = "1.0.0" +__version__ = "1.0.1" From f3afbcc7c93f327b5c07b4c173e57237ac7520bf Mon Sep 17 00:00:00 2001 From: Moray Smith <65286772+SwiftSeal@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:48:01 +0000 Subject: [PATCH 4/4] update README --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3de494f..9e77d87 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ ![Conda Downloads](https://img.shields.io/conda/dn/bioconda/resistify) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/swiftseal/resistify/blob/main/assets/resistify.ipynb) -*More than 2,500 downloads - thank you all!* Resistify is a program which rapidly identifies and classifies plant resistance genes from protein sequences. @@ -36,7 +35,7 @@ To use these with - for example - `singularity`, simply run: If you are having issues with `conda`, you can instead try installing directly from the repository: ```sh -pip install https://github.com/SwiftSeal/resistify/archive/refs/tags/v0.6.2.tar.gz +pip install https://github.com/SwiftSeal/resistify/archive/refs/tags/v1.0.1.tar.gz ``` Note that `resistify` requires `hmmer` to be installed and available in your system's PATH, which will not be installed automatically when using `pip`. @@ -116,9 +115,9 @@ Approximately 13G of disk space is required. ### results.tsv (nlr) -| Sequence | Length | Motifs | Domains | Classification | NBARC_motifs | MADA | MADAL | CJID | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | -| ZAR1 | 852 | CNNNNNNNNNLLLLLLLLLL | mCNL | CNL | 9 | False | True | False | +| Sequence | Length | LRR_Length | Motifs | Domains | Classification | NBARC_motifs | MADA | MADAL | CJID | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ZAR1 | 852 | 307 | CNNNNNNNNNLLLLLLLLLL | mCNL | CNL | 9 | False | True | False | The main column of interest is "Classification", where we can see that it has been identified as a canonical CNL. The "Motifs" column indicates the series of NLR-associated motifs identified across the sequence - this can be useful if an NLR has an undetermined or unexpected classification. @@ -127,9 +126,9 @@ Here, it appears that ZAR1 has a MADA-like motif. ### results.tsv (prr) -| Sequence | Length | Type | Classification | Signal_peptide | -| --- | --- | --- | --- | --- | -| fls2 | 1174 | RLK | LRR | True | +| Sequence | Length | Extracellular_Length | LRR_Length | Type | Classification | Signal_peptide | +| --- | --- | --- | --- | --- | --- | --- | +| fls2 | 1173 | 806 | 675 | RLK | LRR | True | For PRRs, sequences can be of the type RLP or RLK - both are single pass transmembrane proteins, and RLKs have an internal kinase domain. Classification refers to the domains identified in the external region.