Skip to content

Commit

Permalink
Merge pull request #17 from KrishnaswamyLab/dev
Browse files Browse the repository at this point in the history
v0.1.8.1: check for duplicates, improved documentation
  • Loading branch information
scottgigante authored Jul 11, 2018
2 parents e217bac + 20ac601 commit 7899467
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 22 deletions.
13 changes: 10 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@

sudo: required

before_install:
- pip install nose2 pandas
addons:
apt:
packages:
libjs-mathjax

script:
- python -m nose2 -v
- pip install -U .[test,doc]
- python setup.py test
- cd doc; make html; cd ..

after_success:
- coveralls
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ graphtools
.. image:: https://img.shields.io/readthedocs/graphtools.svg
:target: https://graphtools.readthedocs.io/
:alt: Read the Docs
.. image:: https://coveralls.io/repos/github/KrishnaswamyLab/graphtools/badge.svg?branch=master
:target: https://coveralls.io/github/KrishnaswamyLab/graphtools?branch=master
:alt: Coverage Status
.. image:: https://img.shields.io/twitter/follow/KrishnaswamyLab.svg?style=social&label=Follow
:target: https://twitter.com/KrishnaswamyLab
:alt: Twitter
Expand Down Expand Up @@ -42,3 +45,8 @@ Use it as follows::
P = G.diff_op
G = graphtools.Graph(digits['data'], n_landmark=300)
L = G.landmark_op

Help
----

If you have any questions or require assistance using graphtools, please contact us at https://krishnaswamylab.org/get-help
1 change: 1 addition & 0 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'sphinx.ext.napoleon',
'sphinx.ext.doctest',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode']

# Add any paths that contain templates here, relative to this directory.
Expand Down
9 changes: 9 additions & 0 deletions doc/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ graphtools

<a href="https://graphtools.readthedocs.io/"><img src="https://img.shields.io/readthedocs/graphtools.svg" alt="Read the Docs"></img></a>

.. raw:: html

<a href="https://coveralls.io/github/KrishnaswamyLab/graphtools?branch=master"><img src="https://coveralls.io/repos/github/KrishnaswamyLab/graphtools/badge.svg?branch=master" alt="Coverage Status"></img></a>

.. raw:: html

<a href="https://twitter.com/KrishnaswamyLab"><img src="https://img.shields.io/twitter/follow/KrishnaswamyLab.svg?style=social&label=Follow" alt="Twitter"></a>
Expand Down Expand Up @@ -53,3 +57,8 @@ To use `graphtools` with `pygsp`, create a `graphtools.Graph` class with `use_py
N = G.N
W = G.W
basis = G.compute_fourier_basis()

Help
====

If you have any questions or require assistance using graphtools, please contact us at https://krishnaswamylab.org/get-help
4 changes: 3 additions & 1 deletion doc/source/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ numpy>=1.10.0
scipy>=0.18.0
pygsp>=0.5.1
scikit-learn>=0.19.1
future
future
sphinx
sphinxcontrib-napoleon
22 changes: 15 additions & 7 deletions graphtools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,7 @@ class Data(Base):

def __init__(self, data, n_pca=None, random_state=None, **kwargs):

if len(data.shape) != 2:
raise ValueError("Expected a 2D matrix. data has shape {}".format(
data.shape))
self._check_data(data)
if n_pca is not None and data.shape[1] <= n_pca:
warnings.warn("Cannot perform PCA to {} dimensions on "
"data with {} dimensions".format(n_pca,
Expand All @@ -119,6 +117,16 @@ def __init__(self, data, n_pca=None, random_state=None, **kwargs):
self.data_nu = self._reduce_data()
super().__init__(**kwargs)

def _check_data(self, data):
if len(data.shape) != 2:
msg = "ValueError: Expected 2D array, got {}D array " \
"instead (shape: {}.) ".format(len(data.shape), data.shape)
if len(data.shape) < 2:
msg += "\nReshape your data either using array.reshape(-1, 1) "
"if your data has a single feature or array.reshape(1, -1) if "
"it contains a single sample."
raise ValueError(msg)

def _reduce_data(self):
"""Private method to reduce data dimension.
Expand Down Expand Up @@ -233,6 +241,10 @@ def inverse_transform(self, Y, columns=None):
----------
Y : array-like, shape=[n_samples_y, n_pca]
n_features must be the same as `self.data_nu`.
columns : list-like, optional (default: `None`)
list of integer column indices in the original data space to be
returned. Avoids recomputing the full matrix when only a few
dimensions of the ambient space are of interest.
Returns
-------
Expand Down Expand Up @@ -546,21 +558,17 @@ class DataGraph(with_metaclass(abc.ABCMeta, Data, BaseGraph)):
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
TODO: accept pandas dataframes
n_pca : `int` or `None`, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA and graph building
verbose : `bool`, optional (default: `True`)
Verbosity.
TODO: should this be an integer instead to allow multiple
levels of verbosity?
n_jobs : `int`, optional (default : 1)
The number of jobs to use for the computation.
Expand Down
47 changes: 38 additions & 9 deletions graphtools/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def build_kernel_to_data(self, Y, knn=None):
Parameters
----------
Y: array-like, [n_samples_y, n_dimensions]
Y: array-like, [n_samples_y, n_features]
new data for which an affinity matrix is calculated
to the existing data. `n_features` must match
either the ambient or PCA dimensions
Expand Down Expand Up @@ -232,6 +232,20 @@ def build_kernel_to_data(self, Y, knn=None):
search_knn = min(knn * 20, self.data_nu.shape[0])
distances, indices = knn_tree.kneighbors(
Y, n_neighbors=search_knn)
if np.any(distances[:, 1] == 0):
has_duplicates = distances[:, 1] == 0
idx = np.argwhere((distances == 0) & has_duplicates[:, None])
duplicate_ids = np.array(
[[indices[i[0], i[1]], i[0]]
for i in idx if indices[i[0], i[1]] < i[0]])
duplicate_ids = duplicate_ids[np.argsort(duplicate_ids[:, 0])]
duplicate_names = ", ".join(["{} and {}".format(i[0], i[1])
for i in duplicate_ids])
warnings.warn(
"Detected zero distance between samples {}. "
"Consider removing duplicates to avoid errors in "
"downstream processing.".format(duplicate_names),
RuntimeWarning)
log_complete("KNN search")
log_start("affinities")
bandwidth = distances[:, knn - 1]
Expand Down Expand Up @@ -493,7 +507,7 @@ def extend_to_data(self, data, **kwargs):
Parameters
----------
Y: array-like, [n_samples_y, n_dimensions]
Y: array-like, [n_samples_y, n_features]
new data for which an affinity matrix is calculated
to the existing data. `n_features` must match
either the ambient or PCA dimensions
Expand Down Expand Up @@ -529,7 +543,7 @@ def interpolate(self, transform, transitions=None, Y=None):
transitions : array-like, optional, shape=[n_samples_y, n_samples]
Transition matrix from `Y` (not provided) to `self.data`
Y: array-like, optional, shape=[n_samples_y, n_dimensions]
Y: array-like, optional, shape=[n_samples_y, n_features]
new data for which an affinity matrix is calculated
to the existing data. `n_features` must match
either the ambient or PCA dimensions
Expand Down Expand Up @@ -706,7 +720,21 @@ def build_kernel(self):
if self.precomputed == "distance":
pdx = self.data_nu
elif self.precomputed is None:
pdx = squareform(pdist(self.data_nu, metric=self.distance))
pdx = pdist(self.data_nu, metric=self.distance)
if np.any(pdx == 0):
pdx = squareform(pdx)
duplicate_ids = np.array(
[i for i in np.argwhere(pdx == 0)
if i[1] > i[0]])
duplicate_names = ", ".join(["{} and {}".format(i[0], i[1])
for i in duplicate_ids])
warnings.warn(
"Detected zero distance between samples {}. "
"Consider removing duplicates to avoid errors in "
"downstream processing.".format(duplicate_names),
RuntimeWarning)
else:
pdx = squareform(pdx)
else:
raise ValueError(
"precomputed='{}' not recognized. "
Expand Down Expand Up @@ -744,7 +772,7 @@ def build_kernel_to_data(self, Y, knn=None):
Parameters
----------
Y: array-like, [n_samples_y, n_dimensions]
Y: array-like, [n_samples_y, n_features]
new data for which an affinity matrix is calculated
to the existing data. `n_features` must match
either the ambient or PCA dimensions
Expand Down Expand Up @@ -789,7 +817,8 @@ class MNNGraph(DataGraph):
----------
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.,
accepted types: `numpy.ndarray`,
`scipy.sparse.spmatrix`,
`pandas.DataFrame`, `pandas.SparseDataFrame`.
sample_idx: array-like, shape=[n_samples]
Expand All @@ -798,13 +827,13 @@ class MNNGraph(DataGraph):
beta: `float`, optional (default: 1)
Downweight within-batch affinities by beta
adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
adaptive_k : {'min', 'mean', 'sqrt', `None`} (default: 'sqrt')
Weights MNN kernel adaptively using the number of cells in
each sample according to the selected method.
Attributes
----------
subgraphs : list of `kNNGraph`s
subgraphs : list of :class:`~graphtools.graphs.kNNGraph` objects
Graphs representing each batch separately
"""

Expand Down Expand Up @@ -1061,7 +1090,7 @@ def build_kernel_to_data(self, Y, gamma=None):
Parameters
----------
Y : array-like, [n_samples_y, n_dimensions]
Y : array-like, [n_samples_y, n_features]
new data for which an affinity matrix is calculated
to the existing data. `n_features` must match
either the ambient or PCA dimensions
Expand Down
2 changes: 1 addition & 1 deletion graphtools/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.8"
__version__ = "0.1.8.1"
11 changes: 10 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@

test_requires = [
'nose2',
'pandas',
'coverage',
'coveralls'
]

doc_requires = [
'sphinx',
'sphinxcontrib-napoleon',
]

if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 5):
Expand All @@ -32,7 +40,8 @@
packages=['graphtools', ],
license='GNU General Public License Version 2',
install_requires=install_requires,
extras_require={'test': test_requires},
extras_require={'test': test_requires,
'doc': doc_requires},
test_suite='nose2.collector.collector',
long_description=readme,
url='https://github.com/KrishnaswamyLab/graphtools',
Expand Down
8 changes: 8 additions & 0 deletions test/test_exact.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ def test_precomputed_negative():
n_pca=None)


@warns(RuntimeWarning)
def test_duplicate_data():
    # Stack the first ten rows of `data` beneath the full matrix so the
    # input contains repeated samples (zero pairwise distance); the
    # decorator presumably asserts that a RuntimeWarning is emitted.
    data_with_repeats = np.vstack([data, data[:10]])
    build_graph(data_with_repeats, n_pca=20, decay=10, thresh=0)


#####################################################
# Check kernel
#####################################################
Expand Down
8 changes: 8 additions & 0 deletions test/test_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def test_build_knn_with_sample_idx():
build_graph(data, graphtype='knn', sample_idx=np.arange(len(data)))


@warns(RuntimeWarning)
def test_duplicate_data():
    # Stack the first ten rows of `data` beneath the full matrix so the
    # input contains repeated samples (zero pairwise distance); the
    # decorator presumably asserts that a RuntimeWarning is emitted.
    data_with_repeats = np.vstack([data, data[:10]])
    build_graph(data_with_repeats, n_pca=20, decay=10, thresh=1e-4)


#####################################################
# Check kernel
#####################################################
Expand Down
5 changes: 5 additions & 0 deletions unittest.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[unittest]
verbose = True

[coverage]
always-on = True

0 comments on commit 7899467

Please sign in to comment.