Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed error in handling of soft-masked (lower case) DNA sequences #30

Merged
merged 1 commit into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions guidemaker/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,11 +1079,11 @@ def get_fastas(filelist, input_format="genbank", tempdir=None):
if is_gzip(file):
with gzip.open(file, 'rt') as f:
records = SeqIO.parse(f, input_format)
SeqIO.write(records, f1, "fasta")
SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
else:
with open(file, 'r') as f:
records = (SeqIO.parse(f, input_format))
SeqIO.write(records, f1, "fasta")
SeqIO.write(map(lambda obj: obj.upper(), records), f1, "fasta")
return fastpath
except Exception as e:
logger.exception("An error occurred in the input file %s" % file)
Expand Down Expand Up @@ -1152,7 +1152,8 @@ def get_max_cfd(cfdlist):
def get_doench_efficiency_score(df, pam_orientation, num_threads=1):
checkset={'AGG','CGG','TGG','GGG'}
if pam_orientation == "3prime" and set(df.PAM)==checkset:
doenchscore = doench_predict.predict(np.array(df.target_seq30), num_threads=num_threads)

doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads)
df["Efficiency"] = doenchscore
else:
logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only be used for NGG PAM).Check PAM sequence and PAM orientation")
Expand Down
4 changes: 1 addition & 3 deletions guidemaker/doench_featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437.
"""
from itertools import product, islice
from time import time
from typing import List
import logging
from functools import partial
from multiprocessing import Pool
Expand Down Expand Up @@ -52,7 +50,6 @@ def featurize_data(data: pd.DataFrame, learn_options: dict, pam_audit: bool=True
if np.any(data["30mer"].str.len() != 30):
raise AssertionError(f"Sequences should be 30 nt long")


feature_sets = {}

if learn_options["nuc_features"]:
Expand Down Expand Up @@ -171,6 +168,7 @@ def one_hot(seq, idmat, lookup):
pos = lookup[let]
featurevect.extend(list(idmat[pos,:]))
return pd.Series(featurevect)


def sliding_window(iterable, n):
"""Create a generator of substrings
Expand Down
2 changes: 1 addition & 1 deletion guidemaker/doench_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def predict(

Args:
seq (numpy.ndarray): numpy array of 30 nt sequences with 25 nt of guide, NGG PAM in 25:27 and the following 2 nt.
model_file (str): file path of the onnx Boosted Gradien Regressor model file without position data
model_file (str): file path of the onnx Boosted Gradient Regressor model file without position data
model_metadata (str): file path of the json model parameters metadata file.
pam_audit (bool): check PAM of each sequence.
length_audit (bool): check length of each sequence.
Expand Down
Loading