Merge pull request #39 from USDA-ARS-GBRU/issue38

fixed an issue where N's after the PAM site caused an error
USDA-ARS-GBRU · Nov 28, 2023 · 5979759 · 5979759
2 parents a699c6c + b6d55e4
commit 5979759
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# v0.4.2
+
+Fixed a bug where calculating Doench efficiency scores raised an error if there was an 'N' in the first three nucleotides past the PAM in the flanking genomic sequence.  Guidemaker now removes those guides from consideration and reports this as a warning if the flag `--doench_efficiency_score` is used.
+
 # v0.4.1
 
 * Changed how Guidemaker handles DNA sequences that are soft-masked with lowercase letters. The new behavior unmasks all 
@@ -14,4 +18,5 @@
 *  replaced append methods with concat methods for Pandas 2.1.1
 *  output data is now gzipped
 *  updated Dockerfile to use Minimamba base image
-*  Updates to Python dependencies
+*  Updates to Python dependencies
+
diff --git a/guidemaker/core.py b/guidemaker/core.py
@@ -1151,11 +1151,16 @@ def get_max_cfd(cfdlist):
 
 def get_doench_efficiency_score(df, pam_orientation, num_threads=1):
     checkset={'AGG','CGG','TGG','GGG'}
-    if pam_orientation == "3prime" and set(df.PAM)==checkset:
-
-        doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads)
-        df["Efficiency"] = doenchscore
+    # filter out lines with N's after the PAM, these cannot be scored
+    df2 = df[-df.target_seq30.str.contains('N')]
+    if len(df) != len(df2):
+        n_removed = len(df) - len(df2)
+        logger.warning("{} guides were removed from consideration becasue there were N's in the region flanking the PAM site. These cannot be scored.".format(n_removed) )
+    if pam_orientation == "3prime" and set(df2.PAM)==checkset:
+
+        doenchscore = doench_predict.predict(np.array([x.upper() for x in df2.target_seq30]), num_threads=num_threads)
+        df2["Efficiency"] = doenchscore
     else:
         logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only  be used for NGG PAM).Check PAM sequence and PAM orientation")
-        df["Efficiency"] = "Not Available"
-    return df.drop('target_seq30', axis=1)
+        df2["Efficiency"] = "Not Available"
+    return df2.drop('target_seq30', axis=1)
diff --git a/guidemaker/doench_predict.py b/guidemaker/doench_predict.py
@@ -88,7 +88,7 @@ def predict(
     length_audit: bool = False,
     num_threads: int = 1
 ) -> np.array:
-    """Pedicts regressions scored from sequences.
+    """Predicts regression scores from sequences.
 
     Args:
         seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts.

diff --git a/requirements.txt b/requirements.txt
@@ -70,7 +70,7 @@ tornado==6.3.3
 typing_extensions==4.7.1
 tzdata==2023c 
 tzlocal==4.3.1
-urllib3==2.0.6
+urllib3==2.0.7
 validators==0.22.0
 watchdog==3.0.0
 zipp==3.16.2