diff --git a/guidemaker/core.py b/guidemaker/core.py index ec4c479..7d2fdd3 100644 --- a/guidemaker/core.py +++ b/guidemaker/core.py @@ -1079,11 +1079,11 @@ def get_fastas(filelist, input_format="genbank", tempdir=None): if is_gzip(file): with gzip.open(file, 'rt') as f: records = SeqIO.parse(f, input_format) - SeqIO.write(records, f1, "fasta") + SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta") else: with open(file, 'r') as f: records = (SeqIO.parse(f, input_format)) - SeqIO.write(records, f1, "fasta") + SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta") return fastpath except Exception as e: logger.exception("An error occurred in the input file %s" % file) @@ -1152,7 +1152,8 @@ def get_max_cfd(cfdlist): def get_doench_efficiency_score(df, pam_orientation, num_threads=1): checkset={'AGG','CGG','TGG','GGG'} if pam_orientation == "3prime" and set(df.PAM)==checkset: - doenchscore = doench_predict.predict(np.array(df.target_seq30), num_threads=num_threads) + + doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads) df["Efficiency"] = doenchscore else: logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only be used for NGG PAM).Check PAM sequence and PAM orientation") diff --git a/guidemaker/doench_featurization.py b/guidemaker/doench_featurization.py index 3332785..b439d26 100644 --- a/guidemaker/doench_featurization.py +++ b/guidemaker/doench_featurization.py @@ -22,8 +22,6 @@ Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437. """ from itertools import product, islice -from time import time -from typing import List import logging from functools import partial from multiprocessing import Pool @@ -52,7 +50,6 @@ def featurize_data(data: pd.DataFrame, learn_options: dict, pam_audit: bool=True if np.any(data["30mer"].str.len() != 30): raise AssertionError(f"Sequences should be 30 nt long") - feature_sets = {} if learn_options["nuc_features"]: @@ -171,6 +168,7 @@ def one_hot(seq, idmat, lookup): pos = lookup[let] featurevect.extend(list(idmat[pos,:])) return pd.Series(featurevect) + def sliding_window(iterable, n): """Create a generator of substrings diff --git a/guidemaker/doench_predict.py b/guidemaker/doench_predict.py index c1c3bda..7ccd927 100644 --- a/guidemaker/doench_predict.py +++ b/guidemaker/doench_predict.py @@ -92,7 +92,7 @@ def predict( Args: seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts. - model_file (str): file path of the onnx Boosted Gradien Regressor model file without position data + model_file (str): file path of the onnx Boosted Gradient Regressor model file without position data model_metadata (str): file path of the json model parameters metadata file. pam_audit (bool): check PAM of each sequence. length_audit(bool) : check length of each sequence.