Merge pull request #17 from KrishnaswamyLab/dev

v0.1.8.1: check for duplicates, improved documentation
KrishnaswamyLab · Jul 11, 2018 · 7899467 · 7899467
2 parents e217bac + 20ac601
commit 7899467
Show file tree

Hide file tree

Showing 12 changed files with 116 additions and 22 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -6,8 +6,15 @@
 
   sudo: required
 
-  before_install:
-    - pip install nose2 pandas
+  addons:
+    apt:
+      packages:
+        libjs-mathjax
 
   script:
-    - python -m nose2 -v
+    - pip install -U .[test,doc]
+    - python setup.py test
+    - cd doc; make html; cd ..
+
+  after_success:
+    - coveralls
diff --git a/README.rst b/README.rst
@@ -11,6 +11,9 @@ graphtools
 .. image:: https://img.shields.io/readthedocs/graphtools.svg
     :target: https://graphtools.readthedocs.io/
     :alt: Read the Docs
+.. image:: https://coveralls.io/repos/github/KrishnaswamyLab/graphtools/badge.svg?branch=master
+    :target: https://coveralls.io/github/KrishnaswamyLab/graphtools?branch=master
+    :alt: Coverage Status
 .. image:: https://img.shields.io/twitter/follow/KrishnaswamyLab.svg?style=social&label=Follow
     :target: https://twitter.com/KrishnaswamyLab
     :alt: Twitter
@@ -42,3 +45,8 @@ Use it as follows::
         P = G.diff_op
         G = graphtools.Graph(digits['data'], n_landmark=300)
         L = G.landmark_op
+
+Help
+----
+
+If you have any questions or require assistance using graphtools, please contact us at https://krishnaswamylab.org/get-help
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -36,6 +36,7 @@
               'sphinx.ext.napoleon',
               'sphinx.ext.doctest',
               'sphinx.ext.coverage',
+              'sphinx.ext.mathjax',
               'sphinx.ext.viewcode']
 
 # Add any paths that contain templates here, relative to this directory.

diff --git a/doc/source/index.rst b/doc/source/index.rst
@@ -14,6 +14,10 @@ graphtools
 
     <a href="https://graphtools.readthedocs.io/"><img src="https://img.shields.io/readthedocs/graphtools.svg" alt="Read the Docs"></img></a>
 
+.. raw:: html
+
+    <a href="https://coveralls.io/github/KrishnaswamyLab/graphtools?branch=master"><img src="https://coveralls.io/repos/github/KrishnaswamyLab/graphtools/badge.svg?branch=master" alt="Coverage Status"></img></a>
+
 .. raw:: html
 
     <a href="https://twitter.com/KrishnaswamyLab"><img src="https://img.shields.io/twitter/follow/KrishnaswamyLab.svg?style=social&label=Follow" alt="Twitter"></a>
@@ -53,3 +57,8 @@ To use `graphtools` with `pygsp`, create a `graphtools.Graph` class with `use_py
     N = G.N
     W = G.W
     basis = G.compute_fourier_basis()
+
+Help
+====
+
+If you have any questions or require assistance using graphtools, please contact us at https://krishnaswamylab.org/get-help
diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt
@@ -2,4 +2,6 @@ numpy>=1.10.0
 scipy>=0.18.0
 pygsp>=0.5.1
 scikit-learn>=0.19.1
-future
+future
+sphinx
+sphinxcontrib-napoleon
diff --git a/graphtools/base.py b/graphtools/base.py
@@ -96,9 +96,7 @@ class Data(Base):
 
     def __init__(self, data, n_pca=None, random_state=None, **kwargs):
 
-        if len(data.shape) != 2:
-            raise ValueError("Expected a 2D matrix. data has shape {}".format(
-                data.shape))
+        self._check_data(data)
         if n_pca is not None and data.shape[1] <= n_pca:
             warnings.warn("Cannot perform PCA to {} dimensions on "
                           "data with {} dimensions".format(n_pca,
@@ -119,6 +117,16 @@ def __init__(self, data, n_pca=None, random_state=None, **kwargs):
         self.data_nu = self._reduce_data()
         super().__init__(**kwargs)
 
+    def _check_data(self, data):
+        if len(data.shape) != 2:
+            msg = "ValueError: Expected 2D array, got {}D array " \
+                "instead (shape: {}.) ".format(len(data.shape), data.shape)
+            if len(data.shape) < 2:
+                msg += "\nReshape your data either using " \
+                    "array.reshape(-1, 1) if your data has a single feature " \
+                    "or array.reshape(1, -1) if it contains a single sample."
+            raise ValueError(msg)
+
     def _reduce_data(self):
         """Private method to reduce data dimension.
 
@@ -233,6 +241,10 @@ def inverse_transform(self, Y, columns=None):
         ----------
         Y : array-like, shape=[n_samples_y, n_pca]
             n_features must be the same as `self.data_nu`.
+        columns : list-like
+            list of integers referring to column indices in the original data
+            space to be returned. Avoids recomputing the full matrix where only
+            a few dimensions of the ambient space are of interest
 
         Returns
         -------
@@ -546,21 +558,17 @@ class DataGraph(with_metaclass(abc.ABCMeta, Data, BaseGraph)):
 
     data : array-like, shape=[n_samples,n_features]
         accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
-        TODO: accept pandas dataframes
 
     n_pca : `int` or `None`, optional (default: `None`)
         number of PC dimensions to retain for graph building.
         If `None`, uses the original data.
         Note: if data is sparse, uses SVD instead of PCA
-        TODO: should we subtract and store the mean?
 
     random_state : `int` or `None`, optional (default: `None`)
         Random state for random PCA and graph building
 
     verbose : `bool`, optional (default: `True`)
         Verbosity.
-        TODO: should this be an integer instead to allow multiple
-        levels of verbosity?
 
     n_jobs : `int`, optional (default : 1)
         The number of jobs to use for the computation.

diff --git a/graphtools/graphs.py b/graphtools/graphs.py
@@ -191,7 +191,7 @@ def build_kernel_to_data(self, Y, knn=None):
         Parameters
         ----------
 
-        Y: array-like, [n_samples_y, n_dimensions]
+        Y: array-like, [n_samples_y, n_features]
             new data for which an affinity matrix is calculated
             to the existing data. `n_features` must match
             either the ambient or PCA dimensions
@@ -232,6 +232,20 @@ def build_kernel_to_data(self, Y, knn=None):
             search_knn = min(knn * 20, self.data_nu.shape[0])
             distances, indices = knn_tree.kneighbors(
                 Y, n_neighbors=search_knn)
+            if np.any(distances[:, 1] == 0):
+                has_duplicates = distances[:, 1] == 0
+                idx = np.argwhere((distances == 0) & has_duplicates[:, None])
+                duplicate_ids = np.array(
+                    [[indices[i[0], i[1]], i[0]]
+                     for i in idx if indices[i[0], i[1]] < i[0]])
+                duplicate_ids = duplicate_ids[np.argsort(duplicate_ids[:, 0])]
+                duplicate_names = ", ".join(["{} and {}".format(i[0], i[1])
+                                             for i in duplicate_ids])
+                warnings.warn(
+                    "Detected zero distance between samples {}. "
+                    "Consider removing duplicates to avoid errors in "
+                    "downstream processing.".format(duplicate_names),
+                    RuntimeWarning)
             log_complete("KNN search")
             log_start("affinities")
             bandwidth = distances[:, knn - 1]
@@ -493,7 +507,7 @@ def extend_to_data(self, data, **kwargs):
         Parameters
         ----------
 
-        Y: array-like, [n_samples_y, n_dimensions]
+        Y: array-like, [n_samples_y, n_features]
             new data for which an affinity matrix is calculated
             to the existing data. `n_features` must match
             either the ambient or PCA dimensions
@@ -529,7 +543,7 @@ def interpolate(self, transform, transitions=None, Y=None):
         transitions : array-like, optional, shape=[n_samples_y, n_samples]
             Transition matrix from `Y` (not provided) to `self.data`
 
-        Y: array-like, optional, shape=[n_samples_y, n_dimensions]
+        Y: array-like, optional, shape=[n_samples_y, n_features]
             new data for which an affinity matrix is calculated
             to the existing data. `n_features` must match
             either the ambient or PCA dimensions
@@ -706,7 +720,21 @@ def build_kernel(self):
             if self.precomputed == "distance":
                 pdx = self.data_nu
             elif self.precomputed is None:
-                pdx = squareform(pdist(self.data_nu, metric=self.distance))
+                pdx = pdist(self.data_nu, metric=self.distance)
+                if np.any(pdx == 0):
+                    pdx = squareform(pdx)
+                    duplicate_ids = np.array(
+                        [i for i in np.argwhere(pdx == 0)
+                         if i[1] > i[0]])
+                    duplicate_names = ", ".join(["{} and {}".format(i[0], i[1])
+                                                 for i in duplicate_ids])
+                    warnings.warn(
+                        "Detected zero distance between samples {}. "
+                        "Consider removing duplicates to avoid errors in "
+                        "downstream processing.".format(duplicate_names),
+                        RuntimeWarning)
+                else:
+                    pdx = squareform(pdx)
             else:
                 raise ValueError(
                     "precomputed='{}' not recognized. "
@@ -744,7 +772,7 @@ def build_kernel_to_data(self, Y, knn=None):
         Parameters
         ----------
 
-        Y: array-like, [n_samples_y, n_dimensions]
+        Y: array-like, [n_samples_y, n_features]
             new data for which an affinity matrix is calculated
             to the existing data. `n_features` must match
             either the ambient or PCA dimensions
@@ -789,7 +817,8 @@ class MNNGraph(DataGraph):
     ----------
 
     data : array-like, shape=[n_samples,n_features]
-        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.,
+        accepted types: `numpy.ndarray`,
+        `scipy.sparse.spmatrix`,
         `pandas.DataFrame`, `pandas.SparseDataFrame`.
 
     sample_idx: array-like, shape=[n_samples]
@@ -798,13 +827,13 @@ class MNNGraph(DataGraph):
     beta: `float`, optional (default: 1)
         Downweight within-batch affinities by beta
 
-    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
+    adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: 'sqrt')
         Weights MNN kernel adaptively using the number of cells in
         each sample according to the selected method.
 
     Attributes
     ----------
-    subgraphs : list of `kNNGraph`s
+    subgraphs : list of :class:`~graphtools.graphs.kNNGraph` objects
         Graphs representing each batch separately
     """
 
@@ -1061,7 +1090,7 @@ def build_kernel_to_data(self, Y, gamma=None):
         Parameters
         ----------
 
-        Y : array-like, [n_samples_y, n_dimensions]
+        Y : array-like, [n_samples_y, n_features]
             new data for which an affinity matrix is calculated
             to the existing data. `n_features` must match
             either the ambient or PCA dimensions

diff --git a/graphtools/version.py b/graphtools/version.py
@@ -1 +1 @@
-__version__ = "0.1.8"
+__version__ = "0.1.8.1"
diff --git a/setup.py b/setup.py
@@ -12,6 +12,14 @@
 
 test_requires = [
     'nose2',
+    'pandas',
+    'coverage',
+    'coveralls'
+]
+
+doc_requires = [
+    'sphinx',
+    'sphinxcontrib-napoleon',
 ]
 
 if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 5):
@@ -32,7 +40,8 @@
       packages=['graphtools', ],
       license='GNU General Public License Version 2',
       install_requires=install_requires,
-      extras_require={'test': test_requires},
+      extras_require={'test': test_requires,
+                      'doc': doc_requires},
       test_suite='nose2.collector.collector',
       long_description=readme,
       url='https://github.com/KrishnaswamyLab/graphtools',

diff --git a/test/test_exact.py b/test/test_exact.py
@@ -68,6 +68,14 @@ def test_precomputed_negative():
                 n_pca=None)
 
 
+@warns(RuntimeWarning)
+def test_duplicate_data():
+    build_graph(np.vstack([data, data[:10]]),
+                n_pca=20,
+                decay=10,
+                thresh=0)
+
+
 #####################################################
 # Check kernel
 #####################################################

diff --git a/test/test_knn.py b/test/test_knn.py
@@ -37,6 +37,14 @@ def test_build_knn_with_sample_idx():
     build_graph(data, graphtype='knn', sample_idx=np.arange(len(data)))
 
 
+@warns(RuntimeWarning)
+def test_duplicate_data():
+    build_graph(np.vstack([data, data[:10]]),
+                n_pca=20,
+                decay=10,
+                thresh=1e-4)
+
+
 #####################################################
 # Check kernel
 #####################################################

diff --git a/unittest.cfg b/unittest.cfg
@@ -0,0 +1,5 @@
+[unittest]
+verbose = True
+
+[coverage]
+always-on = True