add primary_target_only to prob_escape (becomes version 1.4.3) (#84)

* fix deprecation warning for `corr` * use `altair` version 5 in tests * add `primary_target_only` to `prob_escape` * start testing on Python 3.11 * update version to 1.4.3 * `black` formatting and pass tests * fix `primary_target_only` filtering in `prob_escape` * add check in `prob_escape` that the neutralization standard target exists * test on Python 3.10 * try testing on Python 3.8 * pass `flake8`
jbloomlab · Mar 19, 2023 · 1432d62 · 1432d62
1 parent 7974f4d
commit 1432d62
Show file tree

Hide file tree

Showing 22 changed files with 491 additions and 437 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,17 @@ All notable changes to this project will be documented in this file.
 
 The format is based on `Keep a Changelog <https://keepachangelog.com>`_.
 
+1.4.3
+-----
+
+Added
+++++++
+- Added ``primary_target_only`` option to ``prob_escape``.
+
+Changed
++++++++
+- Use ``altair`` version 5.0.0rc1
+
 1.4.2
 ------
 

diff --git a/dms_variants/__init__.py b/dms_variants/__init__.py
@@ -10,5 +10,5 @@
 
 __author__ = "`the Bloom lab <https://research.fhcrc.org/bloom/en.html>`_"
 __email__ = "[email protected]"
-__version__ = "1.4.2"
+__version__ = "1.4.3"
 __url__ = "https://github.com/jbloomlab/dms_variants"
diff --git a/dms_variants/codonvarianttable.py b/dms_variants/codonvarianttable.py
@@ -717,6 +717,7 @@ def prob_escape(
         min_neut_standard_count=1e3,
         ceil_n_aa_substitutions=4,
         drop_neut_standard_target=True,
+        primary_target_only=False,
     ):
         r"""Compute probability of escape relative to a neutralization standard.
 
@@ -756,6 +757,9 @@ def prob_escape(
         drop_neut_standard_target : bool
             Drop the neutralization standard variant-level results from the
             returned data frames.
+        primary_target_only : bool
+            Drop everything except the primary target and neutralization standard
+            target before beginning calculations.
 
         Returns
         -------
@@ -811,10 +815,18 @@ def prob_escape(
         if len(invalid_samples):
             raise ValueError(f"invalid samples in selections_df\n{invalid_samples}")
 
+        valid_targets = self.barcode_variant_df["target"].unique()
+        if neut_standard_target not in valid_targets:
+            raise ValueError(f"{neut_standard_target=} not in targets {valid_targets}")
+
         # get neut_standard fracs for each library / sample
+        fracs = self.n_variants_df(primary_target_only=False)
+        if primary_target_only:
+            fracs = fracs.query(
+                "(target in [@self.primary_target, @neut_standard_target])"
+            )
         fracs = (
-            self.n_variants_df(primary_target_only=False)
-            .assign(
+            fracs.assign(
                 n=lambda x: x.groupby(["library", "sample"])["count"].transform("sum"),
                 frac=lambda x: x["count"] / x["n"],
             )
@@ -852,6 +864,10 @@ def prob_escape(
 
         # get variant counts grouped by `by`
         count_df = self.variant_count_df
+        if primary_target_only:
+            count_df = count_df.query(
+                "(target in [@self.primary_target, @neut_standard_target])"
+            )
         group_cols = [
             "codon_substitutions",
             "n_codon_substitutions",
@@ -2146,7 +2162,7 @@ def plotCountsPerVariant(
             if not classifyVariants_kwargs:
                 kw_args = {}
             else:
-                kw_args = {k: v for k, v in classifyVariants_kwargs.items()}
+                kw_args = dict(classifyVariants_kwargs.items())
             if "primary_target" not in kw_args:
                 kw_args["primary_target"] = self.primary_target
             if "class_as_categorical" not in kw_args:
@@ -3025,7 +3041,6 @@ def _parseCodonMut(mutstr):
         for lib, sample in itertools.product(
             df["library"].unique().tolist(), df["sample"].unique().tolist()
         ):
-
             i_df = df.query("library == @lib & sample == @sample")
             if len(i_df) == 0:
                 continue  # no data for this library and sample

diff --git a/dms_variants/fastq.py b/dms_variants/fastq.py
@@ -147,7 +147,6 @@ def iterate_fastq_pair(
     )
 
     for r1_entry, r2_entry in itertools.zip_longest(r1_iterator, r2_iterator):
-
         if (r1_entry is None) or (r2_entry is None):
             raise IOError(
                 f"{r1filename} and {r2filename} have unequal " "number of entries"

diff --git a/dms_variants/globalepistasis.py b/dms_variants/globalepistasis.py
@@ -1017,9 +1017,7 @@ def phenotypes_frombinary(
                 assert latents.shape[0] == binary_variants.shape[1]
                 latent_phenos = binary_variants.dot(latents).transpose()
             else:
-                latents = self._latenteffects.transpose()[
-                    :-1,
-                ]
+                latents = self._latenteffects.transpose()[:-1,]
                 assert latents.shape[0] == binary_variants.shape[1]
                 latent_phenos = (
                     binary_variants.dot(latents)
@@ -2223,6 +2221,7 @@ def __init__(
                     "increasing `pseudocount` if you have fitting "
                     "problems",
                     EpistasisFittingWarning,
+                    stacklevel=2,
                 )
             f.flags.writeable = False
             setattr(self, f"_f_{cond}", f)
@@ -2891,6 +2890,7 @@ def _prescale_params(self, k, g_k_range):
                     f"({currentrange}); so cannot pre-scale. Just "
                     "setting all latent effects to zero",
                     EpistasisFittingWarning,
+                    stacklevel=2,
                 )
                 rescaled_latenteffects[ki] = 0
                 rescaled_latenteffects[ki] = numpy.append(
@@ -2964,6 +2964,7 @@ def _postscale_params(self):
                     f"is nearly zero ({mean_abs_latent_effect}); "
                     "so cannot rescale",
                     EpistasisFittingWarning,
+                    stacklevel=2,
                 )
             else:
                 rescaled_latenteffects[ki] = (

diff --git a/dms_variants/illuminabarcodeparser.py b/dms_variants/illuminabarcodeparser.py
@@ -228,7 +228,6 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
                 )
 
             for entry in iterator:
-
                 if r1only:
                     readlist = [entry[1]]
                     qlist = [entry[2]]

diff --git a/dms_variants/simulate.py b/dms_variants/simulate.py
@@ -160,7 +160,7 @@ def codon_muts(codonseq, nmuts, nvariants):
     mutlist = []
     for _ in range(nvariants):
         sitemuts = []
-        for site, wt in sorted(random.sample(codons.items(), nmuts)):
+        for site, wt in sorted(random.sample(list(codons.items()), nmuts)):
             mut = random.choice([c for c in CODONS if c != wt])
             sitemuts.append(f"{wt}{site}{mut}")
         mutlist.append(" ".join(sitemuts))
@@ -312,15 +312,13 @@ def random_sample(sites, n, p):
 
     barcode_variant_dict = collections.defaultdict(list)
     for lib, specs_dict in sorted(library_specs.items()):
-
         nvariants = specs_dict["nvariants"]
         avgmuts = specs_dict["avgmuts"]
         if 10 * nvariants > (len(NTS)) ** bclen:  # safety factor 10
             raise ValueError("barcode too short for nvariants")
         existing_barcodes = set()
 
         for _ivariant in range(nvariants):
-
             barcode = "".join(random.choices(NTS, k=bclen))
             while barcode in existing_barcodes:
                 barcode = "".join(random.choices(NTS, k=bclen))
@@ -583,7 +581,6 @@ def _bottleneck_freqs(pre_freq, bottleneck):
     for lib, (sample, sample_dict) in itertools.product(  # noqa: B007
         libraries, sorted(post_samples.items())
     ):
-
         if set(sample_dict.keys()) != post_req_keys:
             raise ValueError(f"post_samples {sample} lacks {post_req_keys}")
 

diff --git a/dms_variants/utils.py b/dms_variants/utils.py
@@ -357,7 +357,11 @@ def tidy_to_corr(
     if group_cols:
         df = df.groupby(group_cols)
 
-    corr = df.corr(method=method).dropna(how="all", axis="index").reset_index()
+    corr = (
+        df.corr(method=method, numeric_only=True)
+        .dropna(how="all", axis="index")
+        .reset_index()
+    )
 
     corr.columns.name = None  # remove name of columns index
 

diff --git a/notebooks/bottleneck_likelihood.ipynb b/notebooks/bottleneck_likelihood.ipynb
@@ -193,7 +193,6 @@
     "for N_bottle, p_v in itertools.product(\n",
     "    [5e4, 1e5, 5e5, 1e6, 5e6], [-0.5, -0.1, 0, 0.1, 0.5]\n",
     "):\n",
-    "\n",
     "    # bottleneck log likelihood\n",
     "    n_v_bottle = f_post_v * N_bottle * mean_enrichment / 2**p_v\n",
     "    L_v = (\n",

diff --git a/notebooks/codonvariant_plot_formatting.ipynb b/notebooks/codonvariant_plot_formatting.ipynb
@@ -362,6 +362,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumCodonMutsByType(\"all\", samples=None)\n",
     "_ = p.draw()"
    ]
@@ -392,6 +394,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = p + theme(panel_grid_major_x=element_blank())  # no vertical grid lines\n",
     "_ = p.draw()"
    ]
@@ -426,6 +430,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "theme_set(theme_bw())\n",
     "p = variants.plotNumCodonMutsByType(\"all\", samples=None)\n",
     "_ = p.draw()"
@@ -464,6 +470,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "theme_set(theme_xkcd())\n",
     "p = variants.plotNumCodonMutsByType(\n",
     "    \"all\", samples=None, heightscale=1.2, widthscale=1.2\n",
@@ -503,6 +511,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "theme_set(dms_variants.plotnine_themes.theme_graygrid())  # restore gray-grid theme\n",
     "\n",
     "p = variants.plotNumCodonMutsByType(\n",
@@ -544,6 +554,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumCodonMutsByType(\n",
     "    \"all\",\n",
     "    samples=\"all\",\n",
@@ -580,6 +592,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumCodonMutsByType(\n",
     "    \"all\",\n",
     "    samples=\"all\",\n",
@@ -616,6 +630,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumCodonMutsByType(\n",
     "    \"all\",\n",
     "    samples=\"all\",\n",
@@ -653,6 +669,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumCodonMutsByType(\n",
     "    \"all\",\n",
     "    samples=\"all\",\n",
@@ -690,6 +708,8 @@
     }
    ],
    "source": [
+    "# NBVAL_IGNORE_OUTPUT\n",
+    "\n",
     "p = variants.plotNumMutsHistogram(\n",
     "    mut_type=\"codon\",\n",
     "    samples=[\"pre-selection\", \"tight_bottle\"],\n",
@@ -749,7 +769,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.11.0"
   },
   "toc": {
    "nav_menu": {},