update to use newer versions of dependencies (#87)

* remove deprecated `scipy` functions in favor of `numpy` ones
* re-format with latest version of `black`
* lint with `ruff`
* add `pyarrow` dependency as required by new `pandas`
* pass tests with new versions of dependencies
* test via GitHub actions, not Travis
* remove Travis test file
jbloomlab · Feb 6, 2024 · f5e0a8f · f5e0a8f
1 parent 1432d62
commit f5e0a8f
Show file tree

Hide file tree

Showing 35 changed files with 1,595 additions and 682 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -0,0 +1,35 @@
+name: Run tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  test:
+    name: Run tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - name: checkout
+        uses: actions/checkout@v4
+
+      - name: install python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: install package and dependencies
+        run: pip install -e . && pip install -r test_requirements.txt
+
+      - name: lint code with ruff
+        run: ruff check .
+
+      - name: check code format with black
+        run: black --check .
+
+      - name: test code with `pytest`
+        run: pytest
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@
 _*
 
 !.gitignore
+!.github
 !.travis.yml
 !.flake8
 !.nojekyll

diff --git a/.travis.yml b/.travis.yml
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,15 @@ All notable changes to this project will be documented in this file.
 
 The format is based on `Keep a Changelog <https://keepachangelog.com>`_.
 
+1.5.0
+-----
+- Remove use of deprecated ``scipy`` functions like ``flip`` to use ``numpy`` alternatives instead (fixes `this issue <https://github.com/jbloomlab/dms_variants/issues/86>`_).
+- Re-format code with latest version of ``black``.
+- Lint with ``ruff`` rather than ``flake8``.
+- Add ``pyarrow`` as dependency as required by ``pandas``.
+- Tweaks to work with new versions of ``pandas`` and ``plotnine``.
+- Test with GitHub Actions rather than Travis CI.
+
 1.4.3
 -----
 

diff --git a/README.rst b/README.rst
@@ -5,8 +5,14 @@ dms_variants
 .. image:: https://img.shields.io/pypi/v/dms_variants.svg
         :target: https://pypi.python.org/pypi/dms_variants
 
-.. image:: https://app.travis-ci.com/jbloomlab/dms_variants.svg
-        :target: https://app.travis-ci.com/github/jbloomlab/dms_variants
+.. image:: https://github.com/jbloomlab/dms_variants/actions/workflows/test.yaml/badge.svg
+        :target: https://github.com/jbloomlab/dms_variants/actions/workflows/test.yaml
+
+.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
+        :target: https://github.com/psf/black
+
+.. image:: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json
+        :target: https://github.com/astral-sh/ruff
 
 .. image:: https://mybinder.org/badge_logo.svg
         :target: https://mybinder.org/v2/gh/jbloomlab/dms_variants/master?filepath=notebooks

diff --git a/dms_variants/__init__.py b/dms_variants/__init__.py
@@ -10,5 +10,5 @@
 
 __author__ = "`the Bloom lab <https://research.fhcrc.org/bloom/en.html>`_"
 __email__ = "[email protected]"
-__version__ = "1.4.3"
+__version__ = "1.5.0"
 __url__ = "https://github.com/jbloomlab/dms_variants"
diff --git a/dms_variants/bottlenecks.py b/dms_variants/bottlenecks.py
@@ -7,7 +7,6 @@
 
 """
 
-
 import numpy
 
 import scipy.optimize

diff --git a/dms_variants/codonvarianttable.py b/dms_variants/codonvarianttable.py
@@ -222,12 +222,12 @@ def from_variant_count_df(
             if "target" in set(df.columns):
                 raise ValueError('primary_target is None but "target" col')
 
-        if not set(req_cols).issubset((df.columns)):
+        if not set(req_cols).issubset(df.columns):
             raise ValueError(
                 f"{variant_count_df_file} lacks required "
                 f"columns {req_cols}. It has: {set(df.columns)}"
             )
-        if extra_cols and not set(extra_cols).issubset((df.columns)):
+        if extra_cols and not set(extra_cols).issubset(df.columns):
             raise ValueError(
                 f"{variant_count_df_file} lacks `extra_cols` "
                 f"columns {extra_cols}. Has: {set(df.columns)}"
@@ -827,7 +827,9 @@ def prob_escape(
             )
         fracs = (
             fracs.assign(
-                n=lambda x: x.groupby(["library", "sample"])["count"].transform("sum"),
+                n=lambda x: x.groupby(["library", "sample"], observed=False)[
+                    "count"
+                ].transform("sum"),
                 frac=lambda x: x["count"] / x["n"],
             )
             .query("target == @neut_standard_target")
@@ -1101,7 +1103,7 @@ def escape_scores(
         or :math:`B_v`.
 
         Parameters
-        -----------
+        ----------
         sample_df : pandas.DataFrame
             Comparisons we use to compute the functional scores. Should have
             these columns: 'pre_sample' (pre-selection sample), 'post_sample'
@@ -2090,7 +2092,7 @@ def plotCountsPerVariant(
         """Plot variant index versus counts (or frac counts).
 
         Parameters
-        -----------
+        ----------
         ystat : {'frac_counts', 'count'}
             Is y-axis counts from variant, or fraction of counts in
             library / sample from variant?
@@ -2610,7 +2612,7 @@ def plotNumCodonMutsByType(
             )
             + p9.theme(
                 figure_size=(width, height),
-                axis_title_x=p9.element_blank(),
+                axis_title_x=None,
                 axis_text_x=p9.element_text(angle=90),
                 legend_position="none",
             )

diff --git a/dms_variants/constants.py b/dms_variants/constants.py
@@ -7,7 +7,6 @@
 
 """
 
-
 import Bio.Data.IUPACData
 import Bio.Seq
 

diff --git a/dms_variants/fastq.py b/dms_variants/fastq.py
@@ -7,7 +7,6 @@
 
 """
 
-
 import collections
 import gzip
 import itertools
@@ -148,12 +147,12 @@ def iterate_fastq_pair(
 
     for r1_entry, r2_entry in itertools.zip_longest(r1_iterator, r2_iterator):
         if (r1_entry is None) or (r2_entry is None):
-            raise IOError(
+            raise OSError(
                 f"{r1filename} and {r2filename} have unequal " "number of entries"
             )
 
         if r1_entry[0] != r2_entry[0]:
-            raise IOError(
+            raise OSError(
                 f"{r1filename} and {r2filename} specify different "
                 f"read IDs:\n{r1_entry[0]}\n{r2_entry[0]}"
             )
@@ -255,7 +254,7 @@ def iterate_fastq(filename, *, trim=None, check_pair=None, qual_format="str"):
             raise ValueError(f"invalid `check_pair` of {check_pair}")
 
     if not os.path.isfile(filename):
-        raise IOError(f"no FASTQ file {filename}")
+        raise OSError(f"no FASTQ file {filename}")
 
     if qual_format == "array":
         qual_to_array = True
@@ -273,7 +272,7 @@ def iterate_fastq(filename, *, trim=None, check_pair=None, qual_format="str"):
         head = f.readline()
         while head:
             if head[0] != "@":
-                raise IOError(f"id starts with {head[0]}, not @:\n{head}")
+                raise OSError(f"id starts with {head[0]}, not @:\n{head}")
             else:
                 head = head.rstrip()
                 headspl = head[1:].split()
@@ -282,7 +281,7 @@ def iterate_fastq(filename, *, trim=None, check_pair=None, qual_format="str"):
             plusline = f.readline().rstrip()
             qs = f.readline().rstrip()
             if (not seq) or (len(seq) != len(qs)) or (plusline != "+"):
-                raise IOError(
+                raise OSError(
                     f"invalid entry for {read_id} in {filename}:\n"
                     f"{head}\n{seq}\n{plusline}\n{qs}"
                 )

diff --git a/dms_variants/globalepistasis.py b/dms_variants/globalepistasis.py
@@ -603,7 +603,6 @@
 
 """
 
-
 import abc
 import collections
 import re
@@ -742,9 +741,9 @@ def _set_lower_latent_phenotype_params(self, model_one_less_latent):
         for k in range(1, self.n_latent_phenotypes):
             ki = k - 1
             new_latenteffects[ki] = model_one_less_latent._latenteffects[ki]
-            new_epistasis_func_params[
-                ki
-            ] = model_one_less_latent._epistasis_func_params[ki]
+            new_epistasis_func_params[ki] = (
+                model_one_less_latent._epistasis_func_params[ki]
+            )
         self._latenteffects = new_latenteffects
         self._epistasis_func_params = new_epistasis_func_params
 
@@ -910,7 +909,7 @@ def latent_phenotype_wt(self, k=None):
             is just one latent phenotype, can also be `None`.
 
         Returns
-        ---------
+        -------
         float
             Wildtype latent phenotype, which is :math:`\beta_{\rm{wt}}` in
             Eq. :eq:`latent_phenotype` or :math:`\beta_{\rm{wt}}^k` in
@@ -990,7 +989,7 @@ def phenotypes_frombinary(
             if `phenotype` is 'observed'.
 
         Returns
-        --------
+        -------
         numpy.ndarray
             Latent phenotypes calculated using Eq. :eq:`latent_phenotype` or
             observed phenotypes calculated using Eq. :eq:`observed_phenotype`
@@ -1308,7 +1307,7 @@ def single_mut_effects(
         reported only for mutations present in `AbstractEpistasis.binarymap`.
 
         Parameters
-        -----------
+        ----------
         phenotype : {'latent', 'observed'}
             Get effect on this phenotype. If there are multiple latent
             phenotypes, you must also set `k`.
@@ -1564,7 +1563,7 @@ def _dloglik_by_allparams(self, allparams, negative=True):
             optimize.
 
         Returns
-        --------
+        -------
         numpy.ndarray
             (Negative) derivative of log likelihood with respect to
             :meth:`AbstractEpistasis._allparams`.
@@ -1660,7 +1659,7 @@ def _latent_phenotypes(self, k=None):
         """Latent phenotypes.
 
         Parameters
-        -----------
+        ----------
         k : int or None
             Latent phenotype number (1 <= `k` <= `n_latent_phenotypes`),
             or can be `None` if just one latent phenotype.
@@ -1696,7 +1695,7 @@ def _observed_phenotypes(self, latent_phenos="all"):
             :math:`k` values listed here.
 
         Returns
-        --------
+        -------
         numpy.ndarray
             Observed phenotypes.
 
@@ -1835,7 +1834,7 @@ def epistasis_func(self, latent_phenotype, k=None):
         """The :ref:`global_epistasis_function` :math:`g`.
 
         Parameters
-        -----------
+        ----------
         latent_phenotype : numpy.ndarray
             Latent phenotype(s) of one or more variants.
         k : int or None
@@ -1857,7 +1856,7 @@ def _depistasis_func_dlatent(self, latent_phenotype, k=None):
         """Derivative of epistasis function by latent phenotype.
 
         Parameters
-        -----------
+        ----------
         latent_phenotype : numpy.ndarray
             Latent phenotype(s) of one or more variants.
         k : int or None
@@ -1933,7 +1932,7 @@ def _prescale_params(self, k, g_k_range):
         for `_epistasis_func_params`.
 
         Parameters
-        -----------
+        ----------
         k : int
             Latent phenotype number (1 <= `k` <= `n_latent_phenotypes`).
         g_k_range : tuple
@@ -2495,9 +2494,7 @@ def _dloglik_dlikelihood_calc_params(self):
             self._cache[key] = numpy.array(
                 [
                     0.5
-                    * (
-                        self._dloglik_dobserved_phenotype**2 - 1 / self._variances
-                    ).sum()
+                    * (self._dloglik_dobserved_phenotype**2 - 1 / self._variances).sum()
                 ]
             )
             self._cache[key].flags.writeable = False
@@ -2673,14 +2670,14 @@ def _isplines_total(self, k=None):
         """I-splines for global epistasis function.
 
         Parameters
-        -----------
+        ----------
         k : int or None
             Which global epistasis function to get I-splines for (1 <= k <=
             :attr:`AbstractEpistasis.n_latent_phenotypes`). If there
             is just one latent phenotype, can also be `None`.
 
         Returns
-        --------
+        -------
         :class:`dms_variants.ispline.Isplines_total`
             The I-spline family defined with the current values of
             the latent phenotypes as `x`.

diff --git a/dms_variants/illuminabarcodeparser.py b/dms_variants/illuminabarcodeparser.py
@@ -9,12 +9,12 @@
 
 import collections
 
+import numpy
+
 import pandas as pd
 
 import regex
 
-import scipy
-
 from dms_variants.fastq import (
     iterate_fastq,
     iterate_fastq_pair,
@@ -290,11 +290,11 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
                     if self.bc_orientation == "R1":
                         if not r1only:
                             bc["R2"] = reverse_complement(bc["R2"])
-                            bc_q["R2"] = scipy.flip(bc_q["R2"], axis=0)
+                            bc_q["R2"] = numpy.flip(bc_q["R2"], axis=0)
                     else:
                         assert self.bc_orientation == "R2"
                         bc["R1"] = reverse_complement(bc["R1"])
-                        bc_q["R1"] = scipy.flip(bc_q["R1"], axis=0)
+                        bc_q["R1"] = numpy.flip(bc_q["R1"], axis=0)
                     if r1only:
                         if (bc_q["R1"] >= self.minq).all():
                             if self.valid_barcodes and (
@@ -313,7 +313,7 @@ def parse(self, r1files, *, r2files=None, add_cols=None):
                             ):
                                 fates["invalid barcode"] += 1
                             elif (
-                                scipy.maximum(bc_q["R1"], bc_q["R2"]) >= self.minq
+                                numpy.maximum(bc_q["R1"], bc_q["R2"]) >= self.minq
                             ).all():
                                 barcodes[bc["R1"]] += 1
                                 fates["valid barcode"] += 1