From 0ef77da97aef3e2b53e2ad5acdcf0c9d1f539601 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Feb 2019 16:58:02 -0500 Subject: [PATCH 001/125] first attempt at pandas 0.24 support --- requirements.txt | 6 ++--- scprep/filter.py | 7 +++--- scprep/io.py | 6 ++--- scprep/plot/marker.py | 2 +- scprep/reduce.py | 2 +- scprep/select.py | 35 +++++++++++++++++++++++------ scprep/utils.py | 51 +++++++++++++++---------------------------- scprep/version.py | 2 +- setup.py | 6 ++--- test/test_io.py | 12 +++++----- test/test_plot.py | 2 +- test/test_select.py | 28 ++++++++++++------------ test/tools/matrix.py | 8 ++++--- 13 files changed, 87 insertions(+), 80 deletions(-) diff --git a/requirements.txt b/requirements.txt index b2d05390..988d9183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.10.0 -scipy>=0.18.0,!=1.2.0.* +numpy>=1.12.0 +scipy>=0.18.1 scikit-learn>=0.19.1 -pandas>=0.19.0,<0.24 +pandas>=0.24 decorator diff --git a/scprep/filter.py b/scprep/filter.py index 647f5047..3a171a87 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -28,14 +28,14 @@ def remove_empty_cells(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. " "Use `scprep.filter.filter_empty_cells` instead.", DeprecationWarning) - return filter_empty_cells(data, *extra_data) + return filter_empty_cells(data, *extra_data, sample_labels=sample_labels) def remove_duplicates(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_duplicates` is deprecated. " "Use `scprep.filter.filter_duplicates` instead.", DeprecationWarning) - return filter_duplicates(data, *extra_data) + return filter_duplicates(data, *extra_data, sample_labels=sample_labels) def filter_empty_genes(data, *extra_data): @@ -302,7 +302,8 @@ def filter_gene_set_expression(data, *extra_data, genes=None, Filtered extra data, if passed. """ cell_sums = measure.gene_set_expression( - data, genes, + data, genes=genes, + starts_with=starts_with, ends_with=ends_with, regex=regex, library_size_normalize=library_size_normalize) return filter_values(data, *extra_data, values=cell_sums, cutoff=cutoff, percentile=percentile, diff --git a/scprep/io.py b/scprep/io.py index caaf5d9b..e18f8fb6 100644 --- a/scprep/io.py +++ b/scprep/io.py @@ -61,7 +61,7 @@ def _parse_header(header, n_expected, header_type="gene_names"): else: delimiter = "," columns = pd.read_csv(header, delimiter=delimiter, - header=None).values.flatten().astype(str) + header=None).to_numpy().flatten().astype(str) if not len(columns) == n_expected: raise ValueError("Expected {} entries in {}. Got {}".format( n_expected, header, len(columns))) @@ -480,8 +480,8 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', cell_names = barcodes[0] if allow_duplicates is None: allow_duplicates = not sparse - gene_names = _parse_10x_genes(genes['symbol'].values.astype(str), - genes['id'].values.astype(str), + gene_names = _parse_10x_genes(genes['symbol'].to_numpy().astype(str), + genes['id'].to_numpy().astype(str), gene_labels=gene_labels, allow_duplicates=allow_duplicates) diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py index 0f332d5d..5307947a 100644 --- a/scprep/plot/marker.py +++ b/scprep/plot/marker.py @@ -64,7 +64,7 @@ def marker_plot(data, clusters, markers, gene_names=None, "be provided. " "Got gene_names=None, data as a {}".format(type(data))) gene_names = data.columns - for gene in np.concatenate(list(markers.values())): + for gene in np.concatenate(list(markers.to_numpy()())): if gene not in gene_names: raise ValueError('All genes in `markers` must appear ' 'in gene_names. Did not find: {}'.format(gene)) diff --git a/scprep/reduce.py b/scprep/reduce.py index 0ef7129f..80792e60 100644 --- a/scprep/reduce.py +++ b/scprep/reduce.py @@ -280,7 +280,7 @@ def pca(data, n_components=100, eps=0.3, if isinstance(data, pd.SparseDataFrame): data = data.to_coo() elif isinstance(data, pd.DataFrame): - data = data.values + data = data.to_numpy() # handle sparsity if sparse.issparse(data): diff --git a/scprep/select.py b/scprep/select.py index 1c239f98..ece3c887 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -4,6 +4,7 @@ from scipy import sparse import warnings import re +from . import utils def _is_1d(data): @@ -67,7 +68,10 @@ def _convert_dataframe_1d(idx): if (not _is_1d(idx)) and np.prod(idx.shape) != np.max(idx.shape): raise ValueError( "Expected idx to be 1D. Got shape {}".format(idx.shape)) - idx = idx.iloc[:, 0] if idx.shape[1] == 1 else idx.iloc[0, :] + if idx.shape[1] == 1: + idx = idx.loc[:, idx.columns[0]] + else: + idx = idx.loc[idx.index[0], :] return idx @@ -125,7 +129,7 @@ def get_gene_set(data, starts_with=None, ends_with=None, regex=None): """ if not _is_1d(data): try: - data = data.columns.values + data = data.columns.to_numpy() except AttributeError: raise TypeError("data must be a list of gene names or a pandas " "DataFrame. Got {}".format(type(data).__name__)) @@ -157,7 +161,7 @@ def get_cell_set(data, starts_with=None, ends_with=None, regex=None): """ if not _is_1d(data): try: - data = data.index.values + data = data.index.to_numpy() except AttributeError: raise TypeError("data must be a list of cell names or a pandas " "DataFrame. Got {}".format(type(data).__name__)) @@ -220,21 +224,31 @@ def select_cols(data, *extra_data, idx=None, if isinstance(idx, pd.DataFrame): idx = _convert_dataframe_1d(idx) + idx = utils.toarray(idx) + if isinstance(data, pd.DataFrame): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError data = data.loc[:, idx] except (KeyError, TypeError): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[:, np.array(data.columns)[idx]] else: raise elif isinstance(data, pd.Series): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError data = data.loc[idx] except (KeyError, TypeError): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[np.array(data.index)[idx]] else: raise @@ -310,16 +324,23 @@ def select_rows(data, *extra_data, idx=None, ends_with=ends_with, regex=regex) if isinstance(idx, pd.DataFrame): idx = _convert_dataframe_1d(idx) + + idx = utils.toarray(idx) + if isinstance(data, (pd.DataFrame, pd.Series)): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError with warnings.catch_warnings(): warnings.filterwarnings( "error", "Passing list-likes to .loc") data = data.loc[idx] except (KeyError, TypeError, FutureWarning): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): - data = data.iloc[idx] + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): + data = data.loc[np.array(data.index)[idx]] else: raise elif _is_1d(data): diff --git a/scprep/utils.py b/scprep/utils.py index b45d3e29..ff422606 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -3,8 +3,6 @@ import numbers from scipy import sparse import warnings -import re -from . import select def toarray(x): @@ -19,12 +17,9 @@ def toarray(x): ------- x : np.ndarray """ - if isinstance(x, pd.SparseDataFrame): - x = x.to_coo().toarray() - elif isinstance(x, pd.SparseSeries): - x = x.to_dense().values - elif isinstance(x, (pd.DataFrame, pd.Series)): - x = x.values + if isinstance(x, (pd.SparseDataFrame, pd.SparseSeries, + pd.DataFrame, pd.Series)): + x = x.to_numpy() elif isinstance(x, sparse.spmatrix): x = x.toarray() elif isinstance(x, np.matrix): @@ -54,9 +49,9 @@ def to_array_or_spmatrix(x): if isinstance(x, pd.SparseDataFrame): x = x.to_coo() elif isinstance(x, pd.SparseSeries): - x = x.to_dense().values + x = x.to_dense().to_numpy() elif isinstance(x, (pd.DataFrame, pd.Series)): - x = x.values + x = x.to_numpy() elif isinstance(x, np.matrix): x = np.array(x) elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)): @@ -127,7 +122,7 @@ def matrix_sum(data, axis=None): sums = pd.Series(np.array(data.to_coo().sum(axis)).flatten(), index=index) elif axis is None: - sums = data.values.sum() + sums = data.to_numpy().sum() else: sums = data.sum(axis) else: @@ -159,7 +154,7 @@ def matrix_min(data): elif isinstance(data, sparse.lil_matrix): data = [np.min(d) for d in data.data] + [0] elif isinstance(data, sparse.dok_matrix): - data = list(data.values()) + [0] + data = list(data.to_numpy()()) + [0] elif isinstance(data, sparse.dia_matrix): data = [np.min(data.data), 0] return np.min(data) @@ -269,37 +264,25 @@ def combine_batches(data, batch_labels, append_to_cell_names=False): def select_cols(data, idx): - warnings.warn("`scprep.utils.select_cols` is deprecated. Use " - "`scprep.select.select_cols` instead.", - DeprecationWarning) - return select.select_cols(data, idx=idx) + raise RuntimeError("`scprep.utils.select_cols` is deprecated. Use " + "`scprep.select.select_cols` instead.") def select_rows(data, idx): - warnings.warn("`scprep.utils.select_rows` is deprecated. Use " - "`scprep.select.select_rows` instead.", - DeprecationWarning) - return select.select_rows(data, idx=idx) + raise RuntimeError("`scprep.utils.select_rows` is deprecated. Use " + "`scprep.select.select_rows` instead.") def get_gene_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_gene_set` is deprecated. Use " - "`scprep.select.get_gene_set` instead.", - DeprecationWarning) - return select.get_gene_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_gene_set` is deprecated. Use " + "`scprep.select.get_gene_set` instead.") def get_cell_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_cell_set` is deprecated. Use " - "`scprep.select.get_cell_set` instead.", - DeprecationWarning) - return select.get_cell_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_cell_set` is deprecated. Use " + "`scprep.select.get_cell_set` instead.") def subsample(*data, n=10000, seed=None): - warnings.warn("`scprep.utils.subsample` is deprecated. Use " - "`scprep.select.subsample` instead.", - DeprecationWarning) - return select.subsample(*data, n=n, seed=seed) + raise RuntimeError("`scprep.utils.subsample` is deprecated. Use " + "`scprep.select.subsample` instead.") diff --git a/scprep/version.py b/scprep/version.py index cf957409..6c50b11c 100644 --- a/scprep/version.py +++ b/scprep/version.py @@ -1,4 +1,4 @@ # author: Scott Gigante # (C) 2018 Krishnaswamy Lab GPLv2 -__version__ = "0.10.2" +__version__ = "1.0.0-alpha" diff --git a/setup.py b/setup.py index d6e31c96..dccd741d 100644 --- a/setup.py +++ b/setup.py @@ -3,10 +3,10 @@ from setuptools import setup, find_packages install_requires = [ - 'numpy>=1.10.0', - 'scipy>=0.18.0', + 'numpy>=1.12.0', + 'scipy>=0.18.1', 'scikit-learn>=0.19.1', - 'pandas>=0.19.0,<0.24', + 'pandas>=0.24', 'decorator' ] diff --git a/test/test_io.py b/test/test_io.py index e884f7b4..22cc8f77 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -191,7 +191,7 @@ def test_csv_and_tsv(): sparse=True, skiprows=1, usecols=range(1, 101)) - assert np.sum(np.sum(X.values != X_csv.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_csv.to_numpy())) == 0 assert isinstance(X_csv, pd.SparseDataFrame) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, @@ -216,7 +216,7 @@ def test_mtx(): cell_names=os.path.join( data.data_dir, "barcodes.tsv"), cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert isinstance(X_mtx, pd.SparseDataFrame) @@ -225,7 +225,7 @@ def test_mtx(): gene_names=X.columns, cell_names=X.index, cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert isinstance(X_mtx, pd.SparseDataFrame) @@ -235,7 +235,7 @@ def test_mtx(): cell_names=None, sparse=False, cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx)) == 0 assert isinstance(X_mtx, np.ndarray) assert_raise_message( ValueError, @@ -260,13 +260,13 @@ def test_fcs(): assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) - np.testing.assert_array_equal(X.values, data[X.columns].values) + np.testing.assert_array_equal(X.to_numpy(), data[X.columns].to_numpy()) _, _, X = scprep.io.load_fcs(path, sparse=True) assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal( - X.to_dense().values, data[X.columns].values) + X.to_dense().to_numpy(), data[X.columns].to_numpy()) def test_parse_header(): diff --git a/test/test_plot.py b/test/test_plot.py index e9039827..61607ad7 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -805,7 +805,7 @@ def test_marker_plot_no_gene_names(self): "be provided. " "Got gene_names=None, data as a ", scprep.plot.marker_plot, - data=self.X.values, + data=self.X.to_numpy(), clusters=np.random.choice( np.arange(10), replace=True, size=self.X.shape[0]), markers={'tissue': ['z']}) diff --git a/test/test_select.py b/test/test_select.py index 3307938d..8f951d42 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -40,7 +40,7 @@ def test_get_gene_set_ndarray(self): "data must be a list of gene names or a pandas " "DataFrame. Got ndarray", scprep.select.get_gene_set, - data=self.X.values, regex="8$") + data=self.X.to_numpy(), regex="8$") def test_get_gene_set_no_condition(self): assert_warns_message( @@ -74,7 +74,7 @@ def test_get_cell_set_ndarray(self): "data must be a list of cell names or a pandas " "DataFrame. Got ndarray", scprep.select.get_cell_set, - data=self.X.values, regex="G\\-1$") + data=self.X.to_numpy(), regex="G\\-1$") def test_get_cell_set_no_condition(self): assert_warns_message( @@ -105,7 +105,7 @@ def test_select_rows_integer_index(self): def test_select_rows_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.index.values, self.X.shape[0] // 2)) + idx=np.random.choice(self.X.index.to_numpy(), self.X.shape[0] // 2)) def test_select_rows_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -141,11 +141,11 @@ def test_select_rows_sparse_series_data_integer_index(self): def test_select_rows_1d_array_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, self.X.to_numpy()[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) def test_select_rows_list_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_rows_get_cell_set(self): matrix.test_pandas_matrix_types( @@ -188,7 +188,7 @@ def test_select_cols_integer_index(self): def test_select_cols_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.columns.values, self.X.shape[1] // 2)) + idx=np.random.choice(self.X.columns.to_numpy(), self.X.shape[1] // 2)) def test_select_cols_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -224,11 +224,11 @@ def test_select_cols_sparse_series_data_integer_index(self): def test_select_cols_1d_array_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_list_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_get_gene_set(self): matrix.test_pandas_matrix_types( @@ -250,14 +250,14 @@ def test_select_cols_no_condition(self): def test_select_rows_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_cell] is not in the [index]", + "'not_a_cell'", scprep.select.select_rows, self.X, idx='not_a_cell') def test_select_cols_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_gene] is not in the [columns]", + "'not_a_gene'", scprep.select.select_cols, self.X, idx='not_a_gene') @@ -283,7 +283,7 @@ def test_select_cols_unequal_columns(self): "columns. Got [100, 50]", scprep.select.select_cols, self.X, - self.X.values[:, :50]) + self.X.to_numpy()[:, :50]) def test_select_rows_unequal_rows(self): assert_raise_message( @@ -292,7 +292,7 @@ def test_select_rows_unequal_rows(self): "rows. Got [100, 50]", scprep.select.select_rows, self.X, - self.X.values[:50, :]) + self.X.to_numpy()[:50, :]) def test_select_cols_conflicting_data(self): assert_raise_message( @@ -319,7 +319,7 @@ def test_select_cols_get_gene_set_ndarray_data(self): ValueError, "Can only select based on column names with DataFrame input. " "Please set `idx` to select specific columns.", - scprep.select.select_cols, self.X.values, starts_with="A" + scprep.select.select_cols, self.X.to_numpy(), starts_with="A" ) def test_select_rows_get_cell_set_ndarray_data(self): @@ -327,7 +327,7 @@ def test_select_rows_get_cell_set_ndarray_data(self): ValueError, "Can only select based on row names with DataFrame input. " "Please set `idx` to select specific rows.", - scprep.select.select_rows, self.X.values, starts_with="A" + scprep.select.select_rows, self.X.to_numpy(), starts_with="A" ) def test_subsample(self): diff --git a/test/tools/matrix.py b/test/tools/matrix.py index 2c443307..20007f03 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -16,6 +16,8 @@ def _no_warning_dia_matrix(*args, **kwargs): return sparse.dia_matrix(*args, **kwargs) +SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) + _scipy_matrix_types = [ sparse.csr_matrix, sparse.csc_matrix, @@ -34,12 +36,12 @@ def _no_warning_dia_matrix(*args, **kwargs): ] _pandas_sparse_matrix_types = [ - partial(pd.SparseDataFrame, default_fill_value=0.0), + SparseDataFrame, ] _pandas_matrix_types = [ pd.DataFrame, - partial(pd.SparseDataFrame, default_fill_value=0.0), + SparseDataFrame, ] _indexable_matrix_types = [ @@ -49,7 +51,7 @@ def _no_warning_dia_matrix(*args, **kwargs): sparse.dok_matrix, np.array, pd.DataFrame, - pd.SparseDataFrame + SparseDataFrame ] From 5026837554ca2fbdc90715f7305d7b12008af975 Mon Sep 17 00:00:00 2001 From: Haarith Vohra Date: Fri, 31 May 2019 15:36:44 -0400 Subject: [PATCH 002/125] converting type of input matrix data to float64 --- scprep/io.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scprep/io.py b/scprep/io.py index e18f8fb6..94bb93c0 100644 --- a/scprep/io.py +++ b/scprep/io.py @@ -155,6 +155,9 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): if sp.issparse(data): data = data.toarray() data = pd.DataFrame(data, index=cell_names, columns=gene_names) + + # convert data to float + data = data.astype(float) return data From d223fbeacf74671d89bcd1fe2b08541bcb682d1c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 21 Feb 2019 16:58:02 -0500 Subject: [PATCH 003/125] first attempt at pandas 0.24 support --- requirements.txt | 3 ++- scprep/filter.py | 6 +++--- scprep/reduce.py | 2 +- scprep/select.py | 41 ++++++++++++++++++++++++++++++++++------ scprep/utils.py | 45 ++++++++++++++++++-------------------------- scprep/version.py | 2 +- setup.py | 8 ++++---- test/test_io.py | 12 ++++++------ test/test_plot.py | 2 +- test/test_select.py | 28 +++++++++++++-------------- test/tools/matrix.py | 2 ++ 11 files changed, 87 insertions(+), 64 deletions(-) diff --git a/requirements.txt b/requirements.txt index b5819b87..9830c87c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +<<<<<<< HEAD numpy>=1.10.0 scipy>=0.18.0 scikit-learn>=0.19.1 -pandas>=0.19.0,<0.24 decorator>=4.3.0 +pandas>=0.24 diff --git a/scprep/filter.py b/scprep/filter.py index 1737cb14..696d9797 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -28,14 +28,14 @@ def remove_empty_cells(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. " "Use `scprep.filter.filter_empty_cells` instead.", DeprecationWarning) - return filter_empty_cells(data, *extra_data) + return filter_empty_cells(data, *extra_data, sample_labels=sample_labels) def remove_duplicates(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_duplicates` is deprecated. " "Use `scprep.filter.filter_duplicates` instead.", DeprecationWarning) - return filter_duplicates(data, *extra_data) + return filter_duplicates(data, *extra_data, sample_labels=sample_labels) def filter_empty_genes(data, *extra_data): @@ -303,7 +303,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None, Filtered extra data, if passed. """ cell_sums = measure.gene_set_expression( - data, genes, + data, genes=ganes, starts_with=starts_with, ends_with=ends_with, exact_word=exact_word, regex=regex, library_size_normalize=library_size_normalize) diff --git a/scprep/reduce.py b/scprep/reduce.py index 028d36c6..04e6d3e5 100644 --- a/scprep/reduce.py +++ b/scprep/reduce.py @@ -280,7 +280,7 @@ def pca(data, n_components=100, eps=0.3, if isinstance(data, pd.SparseDataFrame): data = data.to_coo() elif isinstance(data, pd.DataFrame): - data = data.values + data = data.to_numpy() # handle sparsity if sparse.issparse(data): diff --git a/scprep/select.py b/scprep/select.py index 81404edd..f22856cf 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -77,8 +77,18 @@ def _check_rows_compatible(*data): def _convert_dataframe_1d(idx): +<<<<<<< HEAD _check_idx_1d(idx) idx = idx.iloc[:, 0] if idx.shape[1] == 1 else idx.iloc[0, :] +======= + if (not _is_1d(idx)) and np.prod(idx.shape) != np.max(idx.shape): + raise ValueError( + "Expected idx to be 1D. Got shape {}".format(idx.shape)) + if idx.shape[1] == 1: + idx = idx.loc[:, idx.columns[0]] + else: + idx = idx.loc[idx.index[0], :] +>>>>>>> first attempt at pandas 0.24 support return idx @@ -218,7 +228,7 @@ def get_gene_set(data, starts_with=None, ends_with=None, """ if not _is_1d(data): try: - data = data.columns.values + data = data.columns.to_numpy() except AttributeError: raise TypeError("data must be a list of gene names or a pandas " "DataFrame. Got {}".format(type(data).__name__)) @@ -254,7 +264,7 @@ def get_cell_set(data, starts_with=None, ends_with=None, """ if not _is_1d(data): try: - data = data.index.values + data = data.index.to_numpy() except AttributeError: raise TypeError("data must be a list of cell names or a pandas " "DataFrame. Got {}".format(type(data).__name__)) @@ -326,21 +336,31 @@ def select_cols(data, *extra_data, idx=None, _check_idx_1d(idx) idx = idx.flatten() + idx = utils.toarray(idx) + if isinstance(data, pd.DataFrame): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError data = data.loc[:, idx] except (KeyError, TypeError): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[:, np.array(data.columns)[idx]] else: raise elif isinstance(data, pd.Series): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError data = data.loc[idx] except (KeyError, TypeError): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[np.array(data.index)[idx]] else: raise @@ -421,21 +441,30 @@ def select_rows(data, *extra_data, idx=None, if isinstance(idx, pd.DataFrame): idx = _convert_dataframe_1d(idx) +<<<<<<< HEAD elif not isinstance(idx, (numbers.Integral, str)): idx = utils.toarray(idx) _check_idx_1d(idx) idx = idx.flatten() +======= + + idx = utils.toarray(idx) +>>>>>>> first attempt at pandas 0.24 support if isinstance(data, (pd.DataFrame, pd.Series)): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError with warnings.catch_warnings(): warnings.filterwarnings( "error", "Passing list-likes to .loc") data = data.loc[idx] except (KeyError, TypeError, FutureWarning): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): - data = data.iloc[idx] + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): + data = data.loc[np.array(data.index)[idx]] else: raise elif _is_1d(data): diff --git a/scprep/utils.py b/scprep/utils.py index d8724d1c..f5e60dfd 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -4,10 +4,13 @@ import re from scipy import sparse import warnings +<<<<<<< HEAD import importlib from decorator import decorator from . import select +======= +>>>>>>> first attempt at pandas 0.24 support try: ModuleNotFoundError @@ -93,13 +96,13 @@ def toarray(x): if isinstance(x, pd.SparseDataFrame): x = x.to_coo().toarray() elif isinstance(x, pd.SparseSeries): - x = x.to_dense().values + x = x.to_dense().to_numpy() elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)): - x = x.values + x = x.to_numpy() elif isinstance(x, sparse.spmatrix): x = x.toarray() elif isinstance(x, np.matrix): - x = np.array(x) + x = x.A elif isinstance(x, list): x_out = [] for xi in x: @@ -214,7 +217,7 @@ def matrix_sum(data, axis=None): sums = pd.Series(np.array(data.to_coo().sum(axis)).flatten(), index=index) elif axis is None: - sums = data.values.sum() + sums = data.to_numpy().sum() else: sums = data.sum(axis) else: @@ -323,7 +326,7 @@ def matrix_min(data): elif isinstance(data, sparse.lil_matrix): data = [np.min(d) for d in data.data] + [0] elif isinstance(data, sparse.dok_matrix): - data = list(data.values()) + [0] + data = list(data.to_numpy()()) + [0] elif isinstance(data, sparse.dia_matrix): data = [np.min(data.data), 0] return np.min(data) @@ -458,37 +461,25 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): def select_cols(data, idx): - warnings.warn("`scprep.utils.select_cols` is deprecated. Use " - "`scprep.select.select_cols` instead.", - FutureWarning) - return select.select_cols(data, idx=idx) + raise RuntimeError("`scprep.utils.select_cols` is deprecated. Use " + "`scprep.select.select_cols` instead.") def select_rows(data, idx): - warnings.warn("`scprep.utils.select_rows` is deprecated. Use " - "`scprep.select.select_rows` instead.", - FutureWarning) - return select.select_rows(data, idx=idx) + raise RuntimeError("`scprep.utils.select_rows` is deprecated. Use " + "`scprep.select.select_rows` instead.") def get_gene_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_gene_set` is deprecated. Use " - "`scprep.select.get_gene_set` instead.", - FutureWarning) - return select.get_gene_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_gene_set` is deprecated. Use " + "`scprep.select.get_gene_set` instead.") def get_cell_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_cell_set` is deprecated. Use " - "`scprep.select.get_cell_set` instead.", - FutureWarning) - return select.get_cell_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_cell_set` is deprecated. Use " + "`scprep.select.get_cell_set` instead.") def subsample(*data, n=10000, seed=None): - warnings.warn("`scprep.utils.subsample` is deprecated. Use " - "`scprep.select.subsample` instead.", - FutureWarning) - return select.subsample(*data, n=n, seed=seed) + raise RuntimeError("`scprep.utils.subsample` is deprecated. Use " + "`scprep.select.subsample` instead.") diff --git a/scprep/version.py b/scprep/version.py index e76593b0..6c50b11c 100644 --- a/scprep/version.py +++ b/scprep/version.py @@ -1,4 +1,4 @@ # author: Scott Gigante # (C) 2018 Krishnaswamy Lab GPLv2 -__version__ = "0.12.0" +__version__ = "1.0.0-alpha" diff --git a/setup.py b/setup.py index c4b0971a..72b32dfb 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,11 @@ from setuptools import setup, find_packages install_requires = [ - 'numpy>=1.10.0', - 'scipy>=0.18.0', + 'numpy>=1.12.0', + 'scipy>=0.18.1', 'scikit-learn>=0.19.1', - 'pandas>=0.19.0,<0.24', - 'decorator>=4.3.0' + 'decorator>=4.3.0', + 'pandas>=0.24', ] test_requires = [ diff --git a/test/test_io.py b/test/test_io.py index 401188e9..a9d61b8a 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -308,7 +308,7 @@ def test_csv_and_tsv(): sparse=True, skiprows=1, usecols=range(1, 101)) - assert np.sum(np.sum(X.values != X_csv.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_csv.to_numpy())) == 0 assert isinstance(X_csv, pd.SparseDataFrame) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, @@ -333,7 +333,7 @@ def test_mtx(): cell_names=os.path.join( data.data_dir, "barcodes.tsv"), cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert isinstance(X_mtx, pd.SparseDataFrame) @@ -342,7 +342,7 @@ def test_mtx(): gene_names=X.columns, cell_names=X.index, cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert isinstance(X_mtx, pd.SparseDataFrame) @@ -352,7 +352,7 @@ def test_mtx(): cell_names=None, sparse=False, cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx)) == 0 assert isinstance(X_mtx, np.ndarray) assert_raise_message( ValueError, @@ -377,13 +377,13 @@ def test_fcs(): assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) - np.testing.assert_array_equal(X.values, data[X.columns].values) + np.testing.assert_array_equal(X.to_numpy(), data[X.columns].to_numpy()) _, _, X = scprep.io.load_fcs(path, sparse=True) assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal( - X.to_dense().values, data[X.columns].values) + X.to_dense().to_numpy(), data[X.columns].to_numpy()) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=False, override=True) assert set(meta.keys()) == set(X_meta.keys()) diff --git a/test/test_plot.py b/test/test_plot.py index 960ada3f..84a242d1 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -931,7 +931,7 @@ def test_marker_plot_no_gene_names(self): "be provided. " "Got gene_names=None, data as a ", scprep.plot.marker_plot, - data=self.X.values, + data=self.X.to_numpy(), clusters=np.random.choice( np.arange(10), replace=True, size=self.X.shape[0]), markers={'tissue': ['z']}) diff --git a/test/test_select.py b/test/test_select.py index 0ed3e096..f3380759 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -41,7 +41,7 @@ def test_get_gene_set_ndarray(self): "data must be a list of gene names or a pandas " "DataFrame. Got ndarray", scprep.select.get_gene_set, - data=self.X.values, regex="8$") + data=self.X.to_numpy(), regex="8$") def test_get_gene_set_no_condition(self): assert_warns_message( @@ -75,7 +75,7 @@ def test_get_cell_set_ndarray(self): "data must be a list of cell names or a pandas " "DataFrame. Got ndarray", scprep.select.get_cell_set, - data=self.X.values, regex="G\\-1$") + data=self.X.to_numpy(), regex="G\\-1$") def test_get_cell_set_no_condition(self): assert_warns_message( @@ -106,7 +106,7 @@ def test_select_rows_integer_index(self): def test_select_rows_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.index.values, self.X.shape[0] // 2)) + idx=np.random.choice(self.X.index.to_numpy(), self.X.shape[0] // 2)) def test_select_rows_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -142,11 +142,11 @@ def test_select_rows_sparse_series_data_integer_index(self): def test_select_rows_1d_array_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, self.X.to_numpy()[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) def test_select_rows_list_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_rows_get_cell_set(self): matrix.test_pandas_matrix_types( @@ -189,7 +189,7 @@ def test_select_cols_integer_index(self): def test_select_cols_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.columns.values, self.X.shape[1] // 2)) + idx=np.random.choice(self.X.columns.to_numpy(), self.X.shape[1] // 2)) def test_select_cols_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -241,11 +241,11 @@ def test_select_cols_sparse_series_data_integer_index(self): def test_select_cols_1d_array_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_list_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_get_gene_set(self): matrix.test_pandas_matrix_types( @@ -267,14 +267,14 @@ def test_select_cols_no_condition(self): def test_select_rows_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_cell] is not in the [index]", + "'not_a_cell'", scprep.select.select_rows, self.X, idx='not_a_cell') def test_select_cols_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_gene] is not in the [columns]", + "'not_a_gene'", scprep.select.select_cols, self.X, idx='not_a_gene') @@ -318,7 +318,7 @@ def test_select_cols_unequal_columns(self): "columns. Got [100, 50]", scprep.select.select_cols, self.X, - self.X.values[:, :50]) + self.X.to_numpy()[:, :50]) def test_select_rows_unequal_rows(self): assert_raise_message( @@ -327,7 +327,7 @@ def test_select_rows_unequal_rows(self): "rows. Got [100, 50]", scprep.select.select_rows, self.X, - self.X.values[:50, :]) + self.X.to_numpy()[:50, :]) def test_select_cols_conflicting_data(self): assert_raise_message( @@ -354,7 +354,7 @@ def test_select_cols_get_gene_set_ndarray_data(self): ValueError, "Can only select based on column names with DataFrame input. " "Please set `idx` to select specific columns.", - scprep.select.select_cols, self.X.values, starts_with="A" + scprep.select.select_cols, self.X.to_numpy(), starts_with="A" ) def test_select_rows_get_cell_set_ndarray_data(self): @@ -362,7 +362,7 @@ def test_select_rows_get_cell_set_ndarray_data(self): ValueError, "Can only select based on row names with DataFrame input. " "Please set `idx` to select specific rows.", - scprep.select.select_rows, self.X.values, starts_with="A" + scprep.select.select_rows, self.X.to_numpy(), starts_with="A" ) def test_subsample(self): diff --git a/test/tools/matrix.py b/test/tools/matrix.py index ec741ddd..1027e34e 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -17,6 +17,8 @@ def _no_warning_dia_matrix(*args, **kwargs): SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) +SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) + _scipy_matrix_types = [ sparse.csr_matrix, sparse.csc_matrix, From fdddf974722abeb589c8006434b9353b4166e08d Mon Sep 17 00:00:00 2001 From: Haarith Vohra Date: Tue, 18 Jun 2019 17:20:58 -0400 Subject: [PATCH 004/125] fixing errors, type errors, error messages, converting data to float64 --- scprep/filter.py | 2 +- scprep/io/utils.py | 2 ++ scprep/select.py | 32 +++++++++++++++++++++----------- test/test_filter.py | 2 +- test/test_io.py | 2 +- test/test_utils.py | 10 +++++----- 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/scprep/filter.py b/scprep/filter.py index 696d9797..58051c37 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -303,7 +303,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None, Filtered extra data, if passed. """ cell_sums = measure.gene_set_expression( - data, genes=ganes, + data, genes=genes, starts_with=starts_with, ends_with=ends_with, exact_word=exact_word, regex=regex, library_size_normalize=library_size_normalize) diff --git a/scprep/io/utils.py b/scprep/io/utils.py index 30a6eafa..85cd9249 100644 --- a/scprep/io/utils.py +++ b/scprep/io/utils.py @@ -126,4 +126,6 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): if sp.issparse(data): data = data.toarray() data = pd.DataFrame(data, index=cell_names, columns=gene_names) + # convert data to float + data = data.astype(float) return data diff --git a/scprep/select.py b/scprep/select.py index edb19f42..9e6b7810 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -328,11 +328,16 @@ def select_cols(data, *extra_data, idx=None, if isinstance(data, pd.DataFrame): try: - if np.issubdtype(idx.dtype, np.dtype(bool).type): - # temporary workaround for pandas error - raise TypeError - data = data.loc[:, idx] + if isinstance(idx, (numbers.Integral, str)): + data = data.loc[:, idx] + else: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError + data = data.loc[:, idx] except (KeyError, TypeError): + if isinstance(idx, str): + raise if isinstance(idx, numbers.Integral) or \ np.issubdtype(idx.dtype, np.dtype(int)) or \ np.issubdtype(idx.dtype, np.dtype(bool)): @@ -436,14 +441,19 @@ def select_rows(data, *extra_data, idx=None, if isinstance(data, (pd.DataFrame, pd.Series)): try: - if np.issubdtype(idx.dtype, np.dtype(bool).type): - # temporary workaround for pandas error - raise TypeError - with warnings.catch_warnings(): - warnings.filterwarnings( - "error", "Passing list-likes to .loc") - data = data.loc[idx] + if isinstance(idx, (numbers.Integral, str)): + data = data.loc[:, idx] + else: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", "Passing list-likes to .loc") + data = data.loc[idx] except (KeyError, TypeError, FutureWarning): + if isinstance(idx, str): + raise if isinstance(idx, numbers.Integral) or \ np.issubdtype(idx.dtype, np.dtype(int)) or \ np.issubdtype(idx.dtype, np.dtype(bool)): diff --git a/test/test_filter.py b/test/test_filter.py index 24e30c97..4421855c 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -209,7 +209,7 @@ def test_gene_expression_filter_warning(self): self.X_sparse, genes=genes, cutoff=None, percentile=None) assert_raise_message( KeyError, - "the label [not_a_gene] is not in the [columns]", + "not_a_gene", scprep.filter.filter_gene_set_expression, self.X_sparse, genes=no_genes, percentile=90.0, keep_cells='below') diff --git a/test/test_io.py b/test/test_io.py index a9d61b8a..302770a1 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -116,7 +116,7 @@ def test_10X_zip_url_not_a_zip(): def test_10X_zip_url_not_a_real_website(): assert_raise_message( urllib.error.URLError, - "", + "", scprep.io.load_10X_zip, 'http://invalid.not.a.url/scprep') diff --git a/test/test_utils.py b/test/test_utils.py index d3ab0da3..f2512195 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -370,30 +370,30 @@ def test_deprecated(): assert_warns_message(FutureWarning, "`scprep.utils.select_cols` is deprecated. Use " "`scprep.select.select_cols` instead.", - scprep.utils.select_cols, + scprep.select.select_cols, X, [1, 2, 3]) assert_warns_message(FutureWarning, "`scprep.utils.select_rows` is deprecated. Use " "`scprep.select.select_rows` instead.", - scprep.utils.select_rows, + scprep.select.select_rows, X, [1, 2, 3]) assert_warns_message(FutureWarning, "`scprep.utils.get_gene_set` is deprecated. Use " "`scprep.select.get_gene_set` instead.", - scprep.utils.get_gene_set, + scprep.select.get_gene_set, X, starts_with="D") assert_warns_message(FutureWarning, "`scprep.utils.get_cell_set` is deprecated. Use " "`scprep.select.get_cell_set` instead.", - scprep.utils.get_cell_set, + scprep.select.get_cell_set, X, starts_with="A") assert_warns_message(FutureWarning, "`scprep.utils.subsample` is deprecated. Use " "`scprep.select.subsample` instead.", - scprep.utils.subsample, + scprep.select.subsample, X, n=10) From 27ce522eefa911c9370dc2d49ee358a476babcc7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 1 Jul 2019 10:31:02 +0200 Subject: [PATCH 005/125] bump version --- scprep/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/version.py b/scprep/version.py index 54306210..385667a0 100644 --- a/scprep/version.py +++ b/scprep/version.py @@ -1,4 +1,4 @@ # author: Scott Gigante # (C) 2018 Krishnaswamy Lab GPLv2 -__version__ = "0.12.2" +__version__ = "0.12.3-alpha0" From 2554a17779418ccd899ef23423fb1c6d3ff67764 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 11 Jul 2019 12:02:07 +0200 Subject: [PATCH 006/125] add generic tab function, allow int input to scatter (fixes #53) --- scprep/plot/colors.py | 23 +++++++++++ scprep/plot/scatter.py | 22 +++++++---- scprep/utils.py | 2 +- test/test_plot.py | 90 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 122 insertions(+), 15 deletions(-) diff --git a/scprep/plot/colors.py b/scprep/plot/colors.py index bdbc347b..a9b378e4 100644 --- a/scprep/plot/colors.py +++ b/scprep/plot/colors.py @@ -74,3 +74,26 @@ def tab40(): colors = np.vstack([mpl.cm.tab20c.colors, mpl.cm.tab20b.colors]) return mpl.colors.ListedColormap(colors) + + +def tab(n=10): + if n < 1: + raise ValueError( + "Expected n >= 1. Got {}".format(n)) + n_shades = int(np.ceil(n / 10)) + if n_shades == 1: + cmap = mpl.cm.tab10 + elif n_shades == 2: + cmap = mpl.cm.tab20 + elif n_shades == 3: + cmap = tab30() + elif n_shades == 4: + cmap = tab40() + else: + cmap = tab10_continuous(n_colors=10, n_step=n_shades) + # restrict to n values + if n > 1 and n < cmap.N: + select_idx = np.tile(np.arange(10), n_shades) * \ + n_shades + np.repeat(np.arange(n_shades), 10) + cmap = mpl.colors.ListedColormap(np.array(cmap.colors)[select_idx[:n]]) + return cmap diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 6821c049..ded4241f 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -9,20 +9,30 @@ _with_default) from .tools import (create_colormap, create_normalize, label_axis, generate_colorbar, generate_legend) +from . import colors from .._lazyload import matplotlib as mpl plt = mpl.pyplot +def _squeeze_array(x): + x = utils.toarray([x]).squeeze() + try: + len(x) + except TypeError: + x = x[None] + return x + + class _ScatterParams(object): def __init__(self, x, y, z=None, c=None, discrete=None, cmap=None, cmap_scale=None, vmin=None, vmax=None, s=None, legend=None, colorbar=None, shuffle=True): - self._x = utils.toarray(x).squeeze() - self._y = utils.toarray(y).squeeze() - self._z = utils.toarray(z).squeeze() if z is not None else None + self._x = _squeeze_array(x) + self._y = _squeeze_array(y) + self._z = _squeeze_array(z) if z is not None else None self._c = c self._discrete = discrete self._cmap = cmap @@ -257,11 +267,7 @@ def cmap(self): if self.constant_c() or self.array_c(): return None elif self.discrete: - n_unique_colors = self.n_c_unique - if n_unique_colors <= 10: - return self.process_string_cmap('tab10') - else: - return self.process_string_cmap('tab20') + return colors.tab(n=self.n_c_unique) else: return self.process_string_cmap('inferno') diff --git a/scprep/utils.py b/scprep/utils.py index 6c57022d..bf02c252 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -63,7 +63,7 @@ def check_version(pkg, min_version=None): "Please install it with e.g. `pip install --user {0}`".format(pkg)) if not _version_check(module.__version__, min_version): raise ImportError( - "scprep requires {0}>={1} (installed: {2}). " + "{0}>={1} is required (installed: {2}). " "Please upgrade it with e.g." " `pip install --user --upgrade {0}`".format( pkg, min_version, module.__version__)) diff --git a/test/test_plot.py b/test/test_plot.py index 4b3c34be..05ae7111 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -89,10 +89,6 @@ def test_tab30(): 10, 12, 13, 14, 16, 17, 18]]) -def test_is_color_array_none(): - assert not scprep.plot.utils._is_color_array(None) - - def test_tab40(): cmap = scprep.plot.colors.tab40() np.testing.assert_array_equal( @@ -142,6 +138,80 @@ def test_tab10_continuous_invalid_n_colors(): n_step=1) +def test_tab_exact(): + assert scprep.plot.colors.tab(1) is plt.cm.tab10 + np.testing.assert_array_equal( + scprep.plot.colors.tab(10).colors, plt.cm.tab10.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(20).colors, plt.cm.tab20.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(30).colors, scprep.plot.colors.tab30().colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(40).colors, scprep.plot.colors.tab40().colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(50).colors, + scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors) + + +def test_tab_first10(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(19).colors[:10], plt.cm.tab10.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[:10], + scprep.plot.colors.tab30().colors[::3]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[:10], + scprep.plot.colors.tab40().colors[::4]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(49).colors[:10], + scprep.plot.colors.tab10_continuous( + n_colors=10, n_step=5).colors[::5]) + + +def test_tab_first20(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[10:20], + scprep.plot.colors.tab30().colors[1::3]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[10:20], + scprep.plot.colors.tab40().colors[1::4]) + + +def test_tab_first30(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[20:30], + scprep.plot.colors.tab40().colors[2::4]) + + +def test_tab_overhang(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(9).colors, plt.cm.tab10.colors[:9]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(19).colors[10:], plt.cm.tab20.colors[1:-1:2]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[20:], + scprep.plot.colors.tab30().colors[2:-1:3]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[30:], + scprep.plot.colors.tab40().colors[3:-1:4]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(49).colors[40:], + scprep.plot.colors.tab10_continuous( + n_colors=10, n_step=5).colors[4:-1:5]) + + +def test_tab_invalid(): + assert_raise_message( + ValueError, + "Expected n >= 1. Got 0", + scprep.plot.colors.tab, + n=0) + + +def test_is_color_array_none(): + assert not scprep.plot.utils._is_color_array(None) + + class TestScatterParams(unittest.TestCase): @classmethod @@ -180,6 +250,11 @@ def test_plot_idx_no_shuffle(self): np.testing.assert_equal(params.c, self.c) np.testing.assert_equal(params.s, np.abs(self.x)) + def test_data_int(self): + params = _ScatterParams(x=1, y=2) + np.testing.assert_equal(params._data, [np.array([1]), np.array([2])]) + assert params.subplot_kw == {} + def test_data_2d(self): params = _ScatterParams(x=self.x, y=self.y) np.testing.assert_equal(params._data, [self.x, @@ -301,8 +376,11 @@ def test_discrete_tab20(self): assert params.extend is None assert isinstance(params.cmap, matplotlib.colors.ListedColormap) np.testing.assert_equal( - params.cmap.colors, - plt.cm.tab20.colors[:len(np.unique(np.round(self.c % 1, 1)))]) + params.cmap.colors[:10], + plt.cm.tab10.colors) + np.testing.assert_equal( + params.cmap.colors[10:], + plt.cm.tab20.colors[1:1 + (len(params.cmap.colors) - 10) * 2:2]) def test_continuous_less_than_20(self): params = _ScatterParams(x=self.x, y=self.y, From eb40a7fd0126d130d24e40bb81a187a395e3aa96 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 11 Jul 2019 12:09:57 +0200 Subject: [PATCH 007/125] document scprep.plot.colors.tab --- scprep/plot/colors.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scprep/plot/colors.py b/scprep/plot/colors.py index a9b378e4..4d688dd5 100644 --- a/scprep/plot/colors.py +++ b/scprep/plot/colors.py @@ -77,6 +77,28 @@ def tab40(): def tab(n=10): + """A discrete colormap with an arbitrary number of colors + + This colormap chooses the best of the following, in order: + - `plt.cm.tab10` + - `plt.cm.tab20` + - `scprep.plot.colors.tab30` + - `scprep.plot.colors.tab40` + - `scprep.plot.colors.tab10_continuous` + + If the number of colors required is less than the number of colors + available, colors are selected specifically in order to reduce similarity + between selected colors. + + Parameters + ---------- + n : int, optional (default: 10) + Number of required colors. + + Returns + ------- + cmap : `matplotlib.colors.ListedColormap` + """ if n < 1: raise ValueError( "Expected n >= 1. Got {}".format(n)) From 5c311a60f432fc2a3f3a60833a10e9277e6a2f80 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 11 Jul 2019 12:19:51 +0200 Subject: [PATCH 008/125] update version requirement error message --- test/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index d3ab0da3..9a044cdf 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -66,7 +66,7 @@ def test_with_pkg_version_fail_major(): def test(): return True assert_raise_message(ImportError, - "scprep requires numpy>={0} (installed: {1}). " + "numpy>={0} is required (installed: {1}). " "Please upgrade it with e.g." " `pip install --user --upgrade numpy".format( major + 1, np.__version__), @@ -80,7 +80,7 @@ def test_with_pkg_version_fail_minor(): def test(): return True assert_raise_message(ImportError, - "scprep requires numpy>={0}.{1} (installed: {2}). " + "numpy>={0}.{1} is required (installed: {2}). " "Please upgrade it with e.g." " `pip install --user --upgrade numpy".format( major, minor + 1, np.__version__), From 9f4272abca3fe2b37eb238f94078c9d9032b27c2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 26 Jul 2019 15:14:04 +1000 Subject: [PATCH 009/125] Add logo image --- logo.png | Bin 0 -> 17164 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 logo.png diff --git a/logo.png b/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..0bb30d36bedf32c3acb3f1d7499bb6bf3999aa9d GIT binary patch literal 17164 zcmeHvbySpJzb`Su&_jpR020z5(lLa9v~-6E3J3_2lEcs~qI63)3JB662qH*#h|=AC zx4+-}-gE9b=dOF#y7#X8&+B>|1KrOvdw=(*zIzDQR97UxrNKo*LnBaDl7EDThF%JO zIzlnP-#m(Fln1|{yFOBsMJpemT?2oB*do*rXlPZjc$b#9z@KrPl?+|c&c3|)|siWsvt z45w`*m|OL-GlFM$NIA zebYwWoJXebz?p_QM?_9a6%+wpKmNaX#fP8n4ZV)I6w$ak8hxN~WqA|zgSxp8%6**Gh#xA3z8=OVIB`g%1TFZb1-JNfy0H!;15fg*_1pbu65ajhrA$iS5Zdy@nkxetG8 zFRN)KtNBz`Qv;RN`R}d_97H@0A`k}=j)QnkimYXY++l?TvqFyK|7|QV=)Z>hPjg3r z*VgyAZ-8iTByn;tqv7RNrH5;X=lZw`t8}{Plz2;;_~mf3miZ4hua)A`&SIo!#`#qh zwbMJolL03ZY;qJ~XUKB;JlDGLd6+;gz)BI)0@ zcV*XlU`Qttu?xNlxB0CfUA8T5;}u`7Vq*_(Y6`BMMLXG=+{W)_jvY;|^XJoZj8(bnx?;0YTt;-^1X zig=%Ri!|)y+QHb#*FmedxQpJ~>vDRP&?(n{cVMTk+lRworHIopYB$+;{EExFGnW`K zclK>vD|4q}e?+&-D?(FD0A(@Mi7u6zy>adBG&6pwxpT?$sZ3*gtwDM|<(I3bhiNkX zrr^`^i>uduQfp-!*J(|yw$dKsnHi&<0%7sI0QT%B2Vg!A<;+T45qZVvr)e|AHET zHsJcG2A%r5`chycB>ybEe+~D)p1bQBM5T+IeP&1F_K(Q4?=8SNRj)XZhbh*Kb#6Vx7f}o3?BpXh@|dZ1|DRuWZFid$02vCio=(s*R-N z*%d8p+Q*eEwSlJp*`VaZ?H!1ytk^VK{f1Ja51pM@(>{pAq23YMMdau^2<#MIc zZf_R#eIM}3TuwFUjvzXakXpA;l5C zam{zb;G5Iu=I_hbYa3m^F|T)Wlr{=0OAVX@>c`5;CMS_E{Eyb&ZCvkL zm1qe4E-xupzXbSJ*IZ`|>=`O6^q(?B0mzWQ-IECdiVFZJ74|{suadY2FJ$|7n#NjI zv+r*@Bu8bnqOxjn5XOILU|Aq)nEuF_zpsA{_&=Y+w1`$nuRL8HawWaZtfmdl0?as4M67v3K->-^pp{Z&fd1z*2DJksX&JwH^wlGoA+ zTI~oWS}!m<-~SbPUKGx`z-XYW``#~y{cgZffnhN^HllcK{bb%>Us_Aeu-)r&Sk>dF z-0s23cXD!a|CEuGoRsp`j~ba$mXgKusi8?YkI3GmaO*yQHo5xYAP;>+Hd?9xUXONy zookv{z;AV2^#%PEB4K&##fJZfuj}N}%kg%EOpu}mUGPIsFW+yk#OC_=RdG>CHazT> zwxQwKVh6V0JEZ_T(!=f9#+;m-SO#go>q}}GpNr$?&z^DkDTIXGbV=yx=)?u+XzODl zD(mY{M@#h2&dxk`=9=6#^wiY{e)YW*6cp4*7fXEm)_s*|G4$t`SX*oBg&j&Blb(Tr zX8)V(jq!>rpLyL}s5}}Wy`=Y>)YMnoGoopH>KD6<9eTW3%!)A!u>~5L&F))Mye*f= z7))BTRty%|z8yCmGc%sEA5yK`iyfhchK8#>;iPOa^pXv)PPU&<*S^os-=3`bvTpHh zuBgC^!~Q~NHcolM5K1JMH{0m3^ZQ%sDrNrBa&JPCo@$jhYP$UaQu23i0-Hzo43SFTx`|N(`-)Pxs8GJ*q-_1xwkae^y(dtX-ZripZ!?r+HgVaZDGgL-9=1n z!p*TV;}z+V_bTtQva%3CSOmB4a|sBze0{?aNx?l;{RA%uUC#aDcngWdz$c)}n|dx| zcyMyM_gJ3g_p|YG)4q4SvC+{4cz7K+CtxbUTL*In=5ZkwJYY;rjJm4o%!g;BdSn5} zH4hKF7Y`@y!XCJwe>3o0KfBQ~cx>=&l{sg3p)IKIEq8~iWl9J>^^ZCy^M3U&F?8ZD zR|5!-*M`|xSRhTtl@@Ja&Z-hD4KgHOg(NPRT7`8JD;3`su+*{?Em|4Bqqz^3dm`z3 zhGe9YalP|m6t&mHF~kL>*CeOvm zj(N3pZ>;n+6*+mZ*<)1Muh^KA^-;VaKC8~Kq$H{UEASqkl42fhRJ3@TI@e<%>`z*& zJKdulAHVBxX;*z5-=8L014?k-?_$gR=6w97pu(y7EZ_Iym?~Y=^*19yo4cujLD;e6 z!wNgxt;ZOA!DwHq9)FR!R~oHXW>h3NtOXW;|3t)}OEe4iO)T~?i`7$YI+=X1*qF}i zXa1A}kRpa-s?F|+_plgG>m%!{jh5>T>6-)To4%Gyl_PVX-&G2c6chbuYSt$fxVW~w zE)-lLEmvDk>u~~>wo}c%joy9)mPRjL_gGe#e5wXJU?^9y&VGD*{(5VEWca##r{#wJ zjsEu+toYEFFl*9j(J-tJA020n=pK~5a)=W=GQq-Mw??R--@&q4-hon0*#` zu_Rxr%A&Xa;;2Uy5?^G*bok)3HzUNc-$h5~?s>Lk0S$fV# zO)c4o2nn&E?mKhQb_0oh!3!8;y`Q}f;(C#K>q?Gz2N5*>tAp9nSBIq4zBPg#7>I>6 z0Gph8rBwGFsF?cC&HJa!ulFhKw|AY?)(sP&hAromzu%j;_zBt$$hSxCeHa+arj^|t zF3=#Nmyp@RbJjT-Z%LDZ>9iYlX(}ozKJG6gn|8D8Yja5zb6W>Pmke1TcE6Yco(9N+DX~ev#cJiz~Ayi+GYR{@EX^a8rM0lOJBom z2wZKSY@edvh$SoP@b~baY|lm(fAe~{3PCENIn8|zk0zKzol6Hs!NVk0kGH1LcWhJqDDA`v)bF+TdmxT&SeEsQ&S`Rp&U_mRoRfRT<3{8Cx)?4Vz-m~8N)Co zy$_%M`kKf-IAU%O9jRd@c4A`811CWy7lVn;)IfdqBy~w zQ4!X8d=21JlX~GxvTj9YNGK!jE})oZpR-5l$@wy^rKMaiJ9DcQm{>T9ii(U^>h zcW>XSr`*HQ(Oc0C3UVxC$gb??!xp*Y<|2miHD30?g9pX6G#aa2`D&@l)CuU2HmSS@ zcJ`5-qaHUx_tinm-q&|3%$fvJnS1b&%F4>(9@`G{%}wVCF)!f;hKAuZ&wJwF>u1q$ zYfF(pK#I)8{38>y@tknc(FR&eG(?nhZdS|VFEP`#&s`0ysbmM481*bt??3m6@VCFZ zxBH}RV#1&X4+)JqJUkp6)C^K>bAxAY4HtX|LTI#ivKI|$z%q7)yMA*1Yg=L|4D8qk zwK2Ey2h6mwU=S~@C{S|Kr82|F1t41p>BPuLNS3xs(6Oq^%L9T$I8i5{VB?95gsR<& zYXt%rAJl82=_vgNc!u<@NLDUrdlknd@t2Vj5KN;Y`xn3%)Q$oIBEgKDL}d2@KUrY#?@$BmDUM3;5VZf$$TQIs!1MP5H4(e0sM_~%!jx^s+vMW zd8ll0??ZMw|51Xf4{!q`qXzqN9ybGD%M|`E&!;{h9t<}7UI3M4_0gE26NAMtQyY2H zj#*kFT06vNJCFthpPP$7woJXxnaj=fl`bhd$4grKIcbg*b_kqHT{UowJPHlPIKaSB zmhdi!dmZpT4S(zcwrWsN5Wti$g+TK?#k8OVeIu9x9wiSKC+9HDIx8-MorR^d0VB(s zDd9dOQ`%q3vK`{4s%H6>zhXFFP2A_y+1>sA7+**u0H0ZgcA`_Th+lxZ21Tn_4I?IA zY%x(ldtXqnF-IAJnT;3dKZ@Ch=5taLEo=(43XP13xh_0FpwOX+);p}M-O9hhR)8e; zzB)e|!`>Y+!3z!!Hh_Y?mLlO*onWBAI~b#i|SNqF2zo`uEpFDX_RI|4M@Nv&vK|vu5$9iAbaYji+B~{$RG2Ll7EGnw= zY!&H$@cY|@3&MIdB7TtBVLX9V?cu|RV$KV#KzQ?-HoOFyT}D($$gL}ayfvZY1b+c$ zK<{TJw~K+C&<`P`i|A?r5)e2QFgx_x?u+fWE|G=c~Q1I+shiLmg|>x1voL3Mv~n~NtLfASj;L%;N~nt~Y5 zV~~$q1JIctL{SOa;6EBEEM(uTV3&XN=+VfcJx(59IotWkPK#wn$jr>lj~|bEDPpL4 z`ue6D+&Haz1IUsLxfs0KmU$rYoc+my*nKx&6FI{eiid>J9@rKD`%P_g!t2#g4Y6KgrQuU)13qG}h>({@3`)y}T2-kmIGJY*Q> z<{d^XsN;qAt^1f3Gj6DzP$1s#W>=7 z^v6oo*bp!QM$QNIX{i_Wh4SNfV8W_W*IrW!T>$^3Nq3=~cdK()0V1kuY6@FVY4tss9w_^n zNJ6ciu(Gfe%JrfWW4~%i&P&2+V9M=B+GSujBsWEq$zIp{U3vfXp?D%9Vle;8VfPEr zh|G2A(*DhtSN;QVC~a1|>}0vo2-#O^mY-qzLEa3q6^$>~kPiBfNE(2qBtESy*3z+T z*en?XLep0&Se1!u!0X_5;HP48=EXFR&55A`4F*d~Ep8TqcBUaw2(f;4b~ZaD{`Mo( z;FqRXCsrv@8~7t;N_sl3?mvPS64*5I)zgb(n6*KI5r0{-B3xYgBLKf9QY0>cfVO>z z*jOY`Y(8fktitN*Q?iR5{yXxO%e2VW?RaLzC_}VG)Xsd1lMFM;9&c{_?w|$9a&`7zbOsB}II*Po7m!|Pq&JbVVl@^td1K{9<&Nh(QYv@F~v18>wai*Tw)G|$y3z2{CYThx4zbS3M3mgU3i7}YzJtq-xQaf z|Dr`s__3;|g~=OljHOMr56j8Qet2!(U;mNon*cKdJc(E-K0e;4Eh^Cyz$Q)-umBf3 zE#G^3WOPI(Z5Z?VCmi}6t6DcV=UdDTh&K>0Jh$V`$#oHdKt|xx5(@I^aa?1=`Rj9{ z4it_2Ky){kW>*I+W8iZde22+1D9LV zb-*-W(RK=SJMt=5(aFMcB^@UC;I}TJ>EC*LhfI^Ho|oE|DuUnv6+7{R48v%m$ zb;*G`V;GgeXU{4_iE#=S_f2D)(c4(GpC1vNCDKF8n|+?`ehM#4e9L2s8*|S}Ds1*P zIjZeFC(6mirMe?gTe~NJWgtB=2mc0e7ukWe1eZGFM1{GuLt=RxoowfPgBu=cSz@XI zMh1v?9=~D*H>+76Eg1)^PWw9m<{jZV=&cM<+?n}w019)KDeDfJ??|E6I+Xyx8>Y*V zgN=9mka&KJAjR^nv4>w8YIp*QF;}6<0^KG;%lh6hAHbK@Px?j>FHz~F_Se7!+`4t^ zP71@`YDApT$=0;KN4V9aJ+8Ex#>L1NBA&Zh9g(bYzzd}(B_*9V2hXY|X@7WD`;0-v zxs^ggoe^rImMPUV7#(Y+D6AMuAC0b~hKE{aXJV3lv~Q0ysV6Io)H|l5rR7>g{&_3_ z!HOT3UIP-Qq@#Nt2Po>tug%GtV)B+BL5!!j_)3DF;?@LjVv5>%dy7@i0F{eT^b|&( zV|=FME4yITR&p>%OhL4-IQnp*mkJpcd$cj$*i4=-=Jxeiuw-KdAk=f-`9H|0scB@C zV%L*=_wL>GSK%#EAMM8p-#isq#MljCbKP4~7>!WDpnd)N zHU9RN0+#FPuBCEDHg+4xP)(bC423g`xtH-eLI^sfYVF6%fyHqcQi3=)tn%sQH$J08 zsfh>*YSS4j1}M0@*V^6LwZ=y*3yiT5HFBc9fRGC2v(9IwmMqY9+)c~|qZPD?Y6~qJ zr9hH~hdb6rMlY^p`s=CH-@W%oBDql|165;pF-9y1WfG?sb;+Z|`3BUhoSfXK<;xFV z2R6L~%HA+Y1`J{&{B@j+FG9%+9dVc>~}kM}4HJ?MF+TgX3g% zm)Hjr5oK$bx?|*?Q8%np)4nNLhaH7o44*X?&H`LCjd$D)g9L%>>DSk#PAT8t{nWGL z{*`7RkIPSS)-5c2xidCV{~~wmU1JIY*5@^pqfm^aA3P)&>8eBZn%8_dh!I7ndvePv z9A;1`Akz_ui5(Igte~jK#>$GZJh6d`i%UCa0M;a;t0;zr!Y;H(yP&~Luz_6=TI)FL z4dhb!Oo^89be)rMVOW))1MdX29IST@L}EIe1E|+lVCs!1A#e!Eqn&Vib_t1@n=f3W zW1x7ShdW>)(t6vZo5$zfvQ=OX)1xh}(g@wf@CVonb4uikiB*o*E>tpK$^h1j$#&g{(7Upl&J?ikQQL z^DclPy*|197|Mj&-{1f9Sflk8Z5Tn3hk?wS8|=Sz{MX~D0S)x1>XJIz0|RJ!D2?IO zj>36OiTnT$dyUN^yH^o2%!uKNE^mEpfJIGBZNO(#@exG!1oRRHAMfbZInIK7CMN}| zl#FnmsI;Uv%|UAVTI7$!zkCVc?Pk348{_FW;raU0v7?BH2#`YtMP>vtTVsMGSnH}t z(FTT3NO-(GMeO1=61J{nWqFfeGCUr_czAtv0T_v<1!p`wPT+8L=<#Ir6a8`%uj|WC zmH8ccG$vmmf~z2Mo<&GIndj?C!J~olizCb+_M@*Jkwl7l#MDp@snZAzCVAm4HzMiM@_d&y?D_(Nz(If=JR0EPFA| z7YvAEG|gxpT*SHuHa#gEU;)Hsdq>CUfh4T#S9G>^E50z?SjLGm<2tp} z`$ep@cwUa0nlLMkk8=>Ar-O{sxI<`2ZsS_{?6QoVz&|PcH)A?gE+;q?ehZ}ibHJRU zGrRR;QAW^Ne+H|1?No#gA;73}ns2Vv*Pzk2fu-Gh_PXGz-DUTB&F^ow zXNjbcaX1Y}#W{)sAz1}iGyowGZL7o(;LV8W#PU*{OZWF(WKghDZW(7#eSbz?kpRXX z3{8W9d|OEdw4H?K?n3uNoT_EO<^vg$#`AaE^X!@$dq|EiOO}ZIoyd|Au;dgenpT`g&3bF=7pFxVqF8 z3G((o3v+zO>8C3?au6hqnlC#$JJFheKYeyuM&kO%h+-4b7m}XGdS4uKe`Pw-tOB+_ zM%Gw}I8Gpl5xEAocQH;K`O^x*##hoX#xU+YWjSY1zGk1rK)-^owb6ruszr>e_wh_% z)}2aLZf*iY{@p+f>TpZB5S0S;bocEUL)-g-52+&&lImP6vHo8#AfZGI+q1q%fa?Uf zxQ`Tn3;`Ydxn5(?6q+-VB<0)C)+gopFaVkyp(w0^jg5WLa&Jc-{Ga<1Cd|ytcNAh% zVV74Y^Ht<;5)&6&IXE~TcSo%)oZ{L2l06ScD?7jwi-?RQ;9*}_2K%e%ubo^80xp1S z`fByuIv;JExcx6rt0;GNeZ9 zB?qvlD*nDZ@RlIiAfK}>F99x?#)n>PY5EwG4g(#Tn3xD2ZM10?w`uyLy*|PM`qM%D z2<&!_+E*uzqo|U*yu9Z1&e_{@nBOua8=Fpv>~U|YD=XvNKLc(H44f|E1zz;{^==vwgH+ZdJt7jkqB{dH^lT#>2~b)Q(b0)M4IzPX zQR8Q7kFJApXLB{;!Gr+u1Ugzd+h%h>#O39&jLK`rQA4IbJo`Bm>V%vvTmlS;dZ@ln zY&R!6s95OHD*tk=JNh>C+lLPyMn*;eR`VECT7dpmn7)o3VU2>!BB;@IPk+kVX3)$k zQCx+lP6MN^%NfY_7z&{Fk;_IvIm{V!#sns+syL?n=Cv7RV7i3&iA=v5d%SqoG~M*# z2^NB-hw8bsM=|vV2!-TA@U=pRhCsS|=N$()Bk6{VlJx<+LZ=FW*?vnQr>6%k8jqTj zW%yK_OL6`UPb+|&0xVC3`Y&BwpXi&RWIA`QccAe-RLFpf6e0Vx%z#+%Bv#IgSVo{{ z^^kt_p(HK@Dhmw(^6=^GO#t@eR^S-$LW=b;jBQ{jz6$`f#pLf$0)z*SyXzRo86a&` zmQt+VgRe4MLx9)?@017!hyT=bU}0fFOj?P~9{=Y3tDP9~W)L2D@cUFn2nYxOA)Nr? zToW9<2|=1)kEl3%|(I{*?6R|IN=K?AZ{*z<;ob{^9ekEX{~XnH}LBqiqs znSL%CPav&iqvs&c9jpKXVSDN$9tV6Zk=i<4rV`*NJH=%SBPZolZ(pCFr!gTayYh+; z!7jrXmORzqRx(!i4Kd~j9QibBJU)12*{`h0YT1JWVEqFHM;l_gW9T@2VtoQ+VX}4s zyQ58Or9h|w7S>i9f!$vO5##M;j`_IWtDzrk45+WQhdb$UaGcp{H38<8` zzydx=FDQ3D1l^Xiy2A@%rjJ=!7Trl;HZz6UxoTEyLtPcY@5z{ zs6A#jYVm6VT{yTq|3|Qi`91k*4kCijHzx&G4!l7JH(M@LuLGkdl9XP;)7i?3$urNG zW@Tk1C|ge(%Wl3|a*2Gp*=|5*OgELm?-cP4FD@8M8T|=X8(c1={<=1^y1JT}`N66l zIaQkDYy)UnPnlhf=qf1OLau-?rk0lnCpeDpCn3twDpW~XdGAKuIXsRZv@M5AKewKN zmJnP(*Vyj~Ap*8Lm@Nmo#F&ohCKLP2%ajfC1V4Z-$e2pyDob!z7WEpi&;r+pqW4I_g(8b z91Em;mAfxnj;f5Zr1kI!xpBqeXXwF07Xnr#>rF%0D6Wz)zY8*7h)L(n}le}$U90Er@}emR7>a&w59 zO`ODL50C?(%m#fkO-G9vT~0TgiZQJ;I41@~WOB0hBu=QljP!7H&P`02eq9=p`R-4#Z9!|P=@R;y8 ze`!M*1We5T+9y-WH;VG^k|o%}pwCcfdxc*g3)3+$H1Gu|3b3uz)F4YKQ{02+{<9+p z^QT(~C~>t_R|E*8nQdUAN4EqUVi7W%aEt=AR90om%Rjbu-|>@OvJ6ZxP)PHl*^A5c zlK9EbB>u;bw6%lH(H<4R_CVJN%fA1ZdzfmqvXalVc8n||vJC4z_HoJQ&!5G3eb8c( zLY9}UH_yU!f=cNX5i+3PH1++68DyhfQ4*A{?~;fFu42rVB}=KuKdgHNjnZZJ_uR(bm+DCTClQ!O8#&o_O> z6Oo#o$+KQM+6#=uN-P;!i@@2x<1$3{bX0N~RHEHc_VhfBgUiMJ5KkKdXKzk>*G*Ab zcko3-sL04r4wTvf&5R!bm0lI=mh1`&yzA*vTBrb}Kfp|$8>*WP0`V*aEb!&=reRbF z;*B2Okv2f3Kl{Ct;YjEq!w7-J(tBytqXnSH*~Q&AC#r0|(Uh{5a^z)B=v|q_d?-Iy z;#ikI`o#p{h8w{IsW>}{a-dbHFj*6L!>%9`!Cr{#{l>OdMPOrE8wT&neT0X7@&~xK zJRCH{uz8Gz{&g9^Ud~jzBibXzz$p-6=O2SS1Io;}1`smf1Z-I#t)o~NqM307s0giN zZr^vPY(fhZRaa7S-rz)!tIpWf~%Lm9%KMMM^;!`>hLW^NZoO5SA9aQYCg+h{3~`U z69jP?j7Qlld=5Si2c@8(C^x8*rMn-8>NNw6ZJ-uL!*NlSo@e_{92`m$C{I7b`1oHP zJM<>bvSK4FnC{&>1Udvu$f#8o2G(~JoQ^R0M6uZbn**3Q@-;Sv7GnJ2DKQ0wo{^Cq zOg|0_@*)7B!8M^{%#k|4q5gq^v9ikD)3mfS(A}sjKM|)aE(Yu^;k7THo?|W~3MHiC zTL7olux{az^}ysA0W%R9>u0@yH4XrzeBo$R@%BDAt+I#l5M%*EW#=XD(f@QkLB@$a zCA0wAkifFrSX-|)vsf2>{(K39ENGpUf9_tNFk;Qnmb2dbrEpGWWKm_r1c!WoES98NdiEtgQEWu*=r^`}^OV+!7Rc z+>L+aQw8eNY1Xd9@5BtqG{+o3uDF6TVo&f94lczwu}9mp;yUL*+wM%(6o~o!#5oYR z8z}^?a@`Duio>BXVQ$w3WIW1Cz(o6ExhJ0e_+xp{=vyoZXvJ!Y(tyu2x)~NpedP>w zB0lo3=PF( z{G6c}wG5yVpje>Wd-W+DHQ?Oik~U{PxpsyTJI==Oz9Uy>%u0wvvw_j_S;%;KdD+<& zPrhnemSq5x5XvXzvK+SO)|&DoAnSc~wE&BG*rrp<)!1fVM6X1tL%hO0P*E7A{+=zK6r~87&+f zxC9(G9MV;8BV~9!Z-8k4A=|ynR31JE_zgHqI1JWPMy5u;^9o9UBn*Q)JT3ejqhTataEw7C+yOu`KQ5IYCEjM7MLHBf!>-P`%S3222l7U6A+kqlddCP^^$CF7tBIfh~ISc414Fi?| zMid3nq-(t+qyRN)x52zWTmJ&n-ntEWtA`3UN;qVp+dR==^F1T_wDWC{v&)^IA)wvV z-Wz}qpt*~hkXB@Hlknk7(nNC|oe@CoAmEGswe$5oIfH1h#mCCGgWwa*-ybU)RaxEg zmU^+&3GN6?9%Ucq#=QHpYzfE{ztctRm$#J}yffnB(2k#2TZ8l}Yan*Z9?fv=2Y?A6 zc;51w6QaSH&M#fXzZhz!d<{lDSR3Z3*|QDM*S`=Ybg`q?ZVj z>hA1ZKvij?%S+RB3Y4FHGG>RCntqMs3!Ve#BY`88mNHe41I}JQFqaXUDroA9JDPa-*9Nd z;+3YV8sf&7eo`HVYA!Mpwm!V1?1th{=1}zgksa*ACcRsxWIehB=fG%SP$aRvv;~Lp zR>MY_p-plHj3b{xlPl~LaE!&hekqmNSQC_mnxxl0ge@u^|7nMJ$7UAqsJ6`!p?@b$ zw>A7h8TZs|`6$JsJ|OIG_C#ON$ZD~keg1KdQn&okv`T~9Ju_p%@^B0TffEcr3cqt1 z&|(qW_Qo=&O)bkB$T6__6be3bhJ(9&1eyT!c|uh&@N9u_T_rv@uBlmI7K@Z-GL?}G zbhOG|R5e-47m8hfXS9zCf)uVV%h~1V#4z+hKwDLQGO9a#J4KZ^E#61C){yElddty3 zwN*0BCAR_)Q(tvUgSnE*5f^zo+ID$)`Ht6|f}Gr|4vHyn_P8lOjs+t!><)_HE1`X1 zl~^3QkPiXmkjjJv9rBAg#`kiF~K~Zku(5JvyWqhQ6iSjDD>ZY93!P^rU_|6lsOf z3{C6ZB;)`Z`sn%mNdKrt>t}H`{Ir1bH;Jx<`=@o~qa{6hm4F2xi*@lMZb9##+usW0 zr8FORbOBNBB%7g71GJ5mOEzv_7RC50(d0We4&p1G5H@tI{}87wgcR5MC_#k@!ofR2 z%V=<*gM;v3>D@1$@e@siy)=z>v6LG8>kBr1vV*sT#=gS;@h@G!ks#HNCR2r#l)%}4 zvsDcuL?)g#DvMc&2_63DX5ZkXr5v(@CKDPo9%FxQNsQDfv?&L{JA0ce`*iF zGX+Bt#l2y$yw5PWk`Og9@(m?8=g-Rg_Y;p={}~>9H6;rAzdr~-elD=TbhH6;1RnS< z>!1O7>;Ym`twkn1U~4_hE5K(TewqG6S&&j7j{sMG5pn}Z9YLHw3Q-vS0qRt@bafT* z83>+xjw>qZk#m(C_Y)V~&x*m9%tfDE_>*t^wR3(1aR; zHyDA-m5uAzV@!wxzNG_(638JrczE<$?uGqd`sNN*&3;!mrD&bdEtISk_zo6m$_nc8 J<+2vY{|2J^UuOUS literal 0 HcmV?d00001 From b9d9bd3ad08986997db7d0e75f2b50f5d3ef9535 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 26 Jul 2019 15:25:44 +1000 Subject: [PATCH 010/125] Add scprep logo and philosophy --- README.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index dede2bcb..98327f24 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,4 @@ -============= -scprep -============= +![scprep logo](logo.png) .. image:: https://img.shields.io/pypi/v/scprep.svg :target: https://pypi.org/project/scprep/ @@ -24,8 +22,14 @@ scprep :target: https://github.com/KrishnaswamyLab/scprep/ :alt: GitHub stars +`scprep` provides an all-in-one framework for loading, preprocessing, and plotting matrices in Python, with a focus on single-cell genomics. -Tools for loading and preprocessing biological matrices in Python. +The philosophy of `scprep`: + +* Data shouldn't be hidden in a complex and bespoke class object. `scprep` works with `numpy` arrays, `pandas` data frames, and `scipy` sparse matrices, all of which are popular data formats in Python and accepted as input to most common algorithms. +* Your analysis pipeline shouldn't have to change based on data format. Changing from a `numpy` array to a `pandas` data frame introduces endless technical differences (e.g. in indexing matrices). `scprep` provides data-agnostic methods that work the same way on all formats. +* Simple analysis should mean simple code. `scprep` takes care of annoying edge cases and sets nice defaults so you don't have to. +* Using a framework shouldn't be limiting. Because nothing is hidden from you, you have access to the power of `numpy`, `scipy`, `pandas` and `matplotlib` just as you would if you used them directly. Installation ------------ @@ -72,4 +76,4 @@ Examples Help ---- -If you have any questions or require assistance using scprep, please read the documentation at https://scprep.readthedocs.io/ or contact us at https://krishnaswamylab.org/get-help \ No newline at end of file +If you have any questions or require assistance using scprep, please read the documentation at https://scprep.readthedocs.io/ or contact us at https://krishnaswamylab.org/get-help From 2184e000db4d044b7f21b938eee578cf31ea61ba Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 26 Jul 2019 15:26:51 +1000 Subject: [PATCH 011/125] fix logo with rst not md --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 98327f24..57dd93ac 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,5 @@ -![scprep logo](logo.png) +.. image:: logo.png + :alt: scprep logo .. image:: https://img.shields.io/pypi/v/scprep.svg :target: https://pypi.org/project/scprep/ From 5df7f74e2136218a632612335798fede22aecd6f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 26 Jul 2019 15:49:31 +1000 Subject: [PATCH 012/125] add logo to docs page --- doc/source/index.rst | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index a19d46f1..0bcf85dc 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2,6 +2,11 @@ scprep =========================================================================== +.. raw:: html + + scprep logo + + .. raw:: html Latest PyPi version @@ -26,7 +31,14 @@ scprep GitHub stars -Tools for building and manipulating graphs in Python. +`scprep` provides an all-in-one framework for loading, preprocessing, and plotting matrices in Python, with a focus on single-cell genomics. + +The philosophy of `scprep`: + +* Data shouldn't be hidden in a complex and bespoke class object. `scprep` works with `numpy` arrays, `pandas` data frames, and `scipy` sparse matrices, all of which are popular data formats in Python and accepted as input to most common algorithms. +* Your analysis pipeline shouldn't have to change based on data format. Changing from a `numpy` array to a `pandas` data frame introduces endless technical differences (e.g. in indexing matrices). `scprep` provides data-agnostic methods that work the same way on all formats. +* Simple analysis should mean simple code. `scprep` takes care of annoying edge cases and sets nice defaults so you don't have to. +* Using a framework shouldn't be limiting. Because nothing is hidden from you, you have access to the power of `numpy`, `scipy`, `pandas` and `matplotlib` just as you would if you used them directly. .. toctree:: :maxdepth: 2 @@ -63,4 +75,4 @@ You can use `scprep` with your single cell data as follows:: Help ==== -If you have any questions or require assistance using scprep, please contact us at https://krishnaswamylab.org/get-help \ No newline at end of file +If you have any questions or require assistance using scprep, please contact us at https://krishnaswamylab.org/get-help From 54c2de78f103459ba9016083be9d6359e805faae Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 26 Jul 2019 15:50:47 +1000 Subject: [PATCH 013/125] remove title and add line break --- doc/source/index.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index 0bcf85dc..bf1c246d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,11 +1,6 @@ -=========================================================================== -scprep -=========================================================================== - .. raw:: html - scprep logo - + scprep logo
.. raw:: html From 3f278818469bf7ef15db523778db886c15f18189 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 12:58:44 -0400 Subject: [PATCH 014/125] use pip to install --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 87946602..ec599d97 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,7 +25,7 @@ - $HOME/R/Library install: - - python setup.py install + - pip install -U . before_script: - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 From b644a3f360bf92272f424a2994c261fba33ee13e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:12:42 -0400 Subject: [PATCH 015/125] require pandas 0.25 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0b05aa83..c8461542 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy>=1.10.0 scipy>=0.18.0 scikit-learn>=0.19.1 -pandas>=0.24 +pandas>=0.25 decorator>=4.3.0 diff --git a/setup.py b/setup.py index cddfb591..4dd39b51 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ 'scipy>=0.18.1', 'scikit-learn>=0.19.1', 'decorator>=4.3.0', - 'pandas>=0.24', + 'pandas>=0.25', ] test_requires = [ From 26211543c94b47d59e476060028e6091e4101bca Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:13:13 -0400 Subject: [PATCH 016/125] fix mistaken test --- test/test_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e8fb4c5a..740ec074 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -367,33 +367,33 @@ def test_matrix_elementwise_multiply_invalid_axis(): def test_deprecated(): X = data.load_10X() - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.select_cols` is deprecated. Use " "`scprep.select.select_cols` instead.", - scprep.select.select_cols, + scprep.utils.select_cols, X, [1, 2, 3]) - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.select_rows` is deprecated. Use " "`scprep.select.select_rows` instead.", - scprep.select.select_rows, + scprep.utils.select_rows, X, [1, 2, 3]) - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.get_gene_set` is deprecated. Use " "`scprep.select.get_gene_set` instead.", - scprep.select.get_gene_set, + scprep.utils.get_gene_set, X, starts_with="D") - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.get_cell_set` is deprecated. Use " "`scprep.select.get_cell_set` instead.", - scprep.select.get_cell_set, + scprep.utils.get_cell_set, X, starts_with="A") - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.subsample` is deprecated. Use " "`scprep.select.subsample` instead.", - scprep.select.subsample, + scprep.utils.subsample, X, n=10) From ae576e02914e2ad51daf868580e7bc5e1ccd2d62 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:14:15 -0400 Subject: [PATCH 017/125] accidental column select --- scprep/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/select.py b/scprep/select.py index 093c6fda..65dcef43 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -449,7 +449,7 @@ def select_rows(data, *extra_data, idx=None, if isinstance(data, (pd.DataFrame, pd.Series)): try: if isinstance(idx, (numbers.Integral, str)): - data = data.loc[:, idx] + data = data.loc[idx] else: if np.issubdtype(idx.dtype, np.dtype(bool).type): # temporary workaround for pandas error From b68aacd3a712a6f60cefd58daa7f59a4cfd29d50 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:16:26 -0400 Subject: [PATCH 018/125] simplify to_spmatrix_or_array and fix to_numpy replacement on dok --- scprep/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 658de0e1..90360c3f 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -137,12 +137,12 @@ def to_array_or_spmatrix(x): """ if isinstance(x, pd.SparseDataFrame): x = x.to_coo() - elif isinstance(x, pd.SparseSeries): - x = x.to_dense().to_numpy() elif isinstance(x, (pd.DataFrame, pd.Series)): - x = x.to_numpy() - elif isinstance(x, np.matrix): - x = np.array(x) + try: + x = x.sparse.to_coo() + except AttributeError: + # not sparse + x = toarray(x) elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)): pass elif isinstance(x, list): @@ -329,7 +329,7 @@ def matrix_min(data): elif isinstance(data, sparse.lil_matrix): data = [np.min(d) for d in data.data] + [0] elif isinstance(data, sparse.dok_matrix): - data = list(data.to_numpy()()) + [0] + data = list(data.values()) + [0] elif isinstance(data, sparse.dia_matrix): data = [np.min(data.data), 0] return np.min(data) From 9fa0149456e600474fc9cb3fa2dda07d3f4b28a7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:16:44 -0400 Subject: [PATCH 019/125] don't need the same line three times --- test/tools/matrix.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/tools/matrix.py b/test/tools/matrix.py index ad76cbfb..ec741ddd 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -17,10 +17,6 @@ def _no_warning_dia_matrix(*args, **kwargs): SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) -SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) - -SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) - _scipy_matrix_types = [ sparse.csr_matrix, sparse.csc_matrix, From 6c85d92a849dbe6027251f2028f36bb72146e143 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 8 Aug 2019 13:38:52 -0400 Subject: [PATCH 020/125] add sparse dataframe base functions --- scprep/utils.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 90360c3f..3b83f9ac 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -137,12 +137,8 @@ def to_array_or_spmatrix(x): """ if isinstance(x, pd.SparseDataFrame): x = x.to_coo() - elif isinstance(x, (pd.DataFrame, pd.Series)): - try: - x = x.sparse.to_coo() - except AttributeError: - # not sparse - x = toarray(x) + elif is_sparse_dataframe(x) or is_sparse_series(x): + x = x.sparse.to_coo() elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)): pass elif isinstance(x, list): @@ -160,6 +156,30 @@ def to_array_or_spmatrix(x): return x +def is_sparse_dataframe(x): + if isinstance(x, pd.DataFrame): + try: + x.sparse + return True + except AttributeError: + pass + return False + + +def is_sparse_series(x): + if isinstance(x, pd.Series): + try: + x.sparse + return True + except AttributeError: + pass + return False + + +def dataframe_to_sparse(x, fill_value=0): + return x.astype(pd.SparseDtype(float, fill_value=fill_value)) + + def matrix_transform(data, fun, *args, **kwargs): """Perform a numerical transformation to data From 1fcef5adad2c62be6e61b0e63f430d377c0cd23a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 17:46:17 -0400 Subject: [PATCH 021/125] add differential expression functionality --- scprep/stats.py | 154 ++++++++++++++++++++++++++++++++++++++++++++- scprep/utils.py | 63 +++++++++++++------ test/test_stats.py | 72 +++++++++++++++++++++ 3 files changed, 267 insertions(+), 22 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index 263fb01f..f60ab922 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -3,9 +3,11 @@ import numbers import numpy as np +import pandas as pd from scipy import stats, sparse from sklearn import neighbors, metrics -from . import plot, utils +import joblib +from . import plot, utils, select import warnings from ._lazyload import matplotlib @@ -363,6 +365,154 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh, plot.utils.show(fig) +def mean_difference(X, Y): + """Calculate the mean difference in genes between two datasets + + In the case where the data has been log normalized, + this is equivalent to fold change. + + Parameters + ---------- + X : array-like, shape=[n_cells, n_genes] + Y : array-like, shape=[m_cells, n_genes] + + Returns + ------- + difference : list-like, shape=[n_genes] + """ + if not X.shape[1] == Y.shape[1]: + raise ValueError("Expected X and Y to have the same number of columns. " + "Got shapes {}, {}".format(X.shape, Y.shape)) + X = utils.to_array_or_spmatrix(X) + Y = utils.to_array_or_spmatrix(Y) + X = utils.toarray(X.mean(axis=0)).flatten() + Y = utils.toarray(Y.mean(axis=0)).flatten() + return X - Y + + +def differential_expression(X, Y, + test='difference', + direction='up', + gene_names=None, + n_jobs=-2): + """Calculate the most significant genes between two datasets + + Parameters + ---------- + X : array-like, shape=[n_cells, n_genes] + Y : array-like, shape=[m_cells, n_genes] + test : {'difference', 'emd'}, optional (default: 'difference') + The statistical test to be used to rank genes + direction : {'up', 'down', 'both'}, optional (default: 'up') + The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. + gene_names : list-like or `None`, optional (default: `None`) + List of gene names associated with the columns of X and Y + n_jobs : int, optional (default: -2) + Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + + Returns + ------- + result : pd.DataFrame + Ordered DataFrame with a column "gene" and a column named `test`. + """ + if not direction in ['up', 'down', 'both']: + raise ValueError("Expected `direction` in ['up', 'down', 'both']. " + "Got {}".format(test)) + if not test in ['difference', 'emd']: + raise ValueError("Expected `test` in ['difference', 'emd']. " + "Got {}".format(test)) + if not (len(X.shape) == 2 and len(Y.shape) == 2): + raise ValueError("Expected `X` and `Y` to be matrices. " + "Got shapes {}, {}".format(X.shape, Y.shape)) + [X, Y] = utils.check_consistent_columns([X, Y]) + if gene_names is not None: + if isinstance(X, pd.DataFrame): + X = select.select_cols(X, idx=gene_names) + gene_names = X.columns + if isinstance(Y, pd.DataFrame): + Y = select.select_cols(Y, idx=gene_names) + gene_names = Y.columns + if not len(gene_names) == X.shape[1]: + raise ValueError("Expected gene_names to have length {}. " + "Got {}".format(X.shape[1], len(gene_names))) + else: + if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame): + gene_names = X.columns + else: + gene_names = np.arange(X.shape[1]) + X = utils.to_array_or_spmatrix(X) + Y = utils.to_array_or_spmatrix(Y) + # inconsistent behaviour from csr and csc + if sparse.issparse(X): + X = X.tocsr() + if sparse.issparse(Y): + Y = Y.tocsr() + if test == 'difference': + difference = mean_difference(X, Y) + elif test == 'emd': + difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)( + select.select_cols(X, idx=i), + select.select_cols(Y, idx=i)) + for i in range(X.shape[1])) + difference = np.array(difference) * np.sign(mean_difference(X, Y)) + result = pd.DataFrame({'gene' : gene_names, test : difference}) + if direction == 'up': + result = result.sort_values([test, 'gene'], ascending=False) + elif direction == 'down': + result = result.sort_values([test, 'gene'], ascending=True) + elif direction == 'both': + result['test_abs'] = np.abs(difference) + result = result.sort_values(['test_abs', 'gene'], ascending=False) + del result['test_abs'] + result.index = np.arange(result.shape[0]) + return result + + +def differential_expression_by_cluster(data, clusters, + test='difference', + direction='up', + gene_names=None, + n_jobs=-2): + """Calculate the most significant genes for each cluster in a dataset + + Tests are run for each cluster against the rest of the dataset. + + Parameters + ---------- + data : array-like, shape=[n_cells, n_genes] + clusters : list-like, shape=[n_cells] + test : {'difference', 'emd'}, optional (default: 'difference') + The statistical test to be used to rank genes + direction : {'up', 'down', 'both'}, optional (default: 'up') + The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. + gene_names : list-like or `None`, optional (default: `None`) + List of gene names associated with the columns of X and Y + n_jobs : int, optional (default: -2) + Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + + Returns + ------- + result : dict(pd.DataFrame) + Dictionary containing an ordered DataFrame with a column "gene" and a column named `test` for each cluster. + """ + if gene_names is not None and isinstance(data, pd.DataFrame): + data = select.select_cols(data, idx=gene_names) + gene_names = data.columns + if gene_names is None: + if isinstance(data, pd.DataFrame): + gene_names = data.columns + elif not len(gene_names) == data.shape[1]: + raise ValueError("Expected gene_names to have length {}. " + "Got {}".format(data.shape[1], len(gene_names))) + data = utils.to_array_or_spmatrix(data) + result = {cluster : differential_expression( + select.select_rows(data, idx=clusters==cluster), + select.select_rows(data, idx=clusters!=cluster), + test = test, direction = direction, + gene_names = gene_names, n_jobs = n_jobs) + for cluster in np.unique(clusters)} + return result + def _vector_coerce_dense(x): x = utils.toarray(x) x_1d = x.flatten() @@ -381,5 +531,5 @@ def _vector_coerce_two_dense(x, y): raise ValueError("Expected x and y to be 1D arrays. " "Got shapes x {}, y {}".format(x.shape, y.shape)) else: - raise + raise e return x, y diff --git a/scprep/utils.py b/scprep/utils.py index bf02c252..b7c247cc 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -364,6 +364,48 @@ def matrix_any(condition): return np.sum(np.sum(condition)) > 0 +def check_consistent_columns(data): + """Ensure that a set of data matrices have consistent columns + + Parameters + ---------- + data : list of array-likes + List of matrices to be checked + + Returns + ------- + data : list of array-likes + List of matrices with consistent columns, subsetted if necessary + + Raises + ------ + ValueError + Raised if data has inconsistent number of columns and does not + have column names for subsetting + """ + matrix_type = type(data[0]) + matrix_shape = data[0].shape[1] + if issubclass(matrix_type, pd.DataFrame): + if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and + np.all([data[0].columns == d.columns for d in data])): + common_genes = data[0].columns.values + for d in data[1:]: + common_genes = common_genes[np.isin(common_genes, + d.columns.values)] + for i in range(len(data)): + data[i] = data[i][common_genes] + warnings.warn("Input data has inconsistent column names. " + "Subsetting to {} common columns.".format( + len(common_genes)), UserWarning) + else: + for d in data[1:]: + if not d.shape[1] == matrix_shape: + shapes = ", ".join([str(d.shape[1]) for d in data]) + raise ValueError("Expected data all with the same number of " + "columns. Got {}".format(shapes)) + return data + + def combine_batches(data, batch_labels, append_to_cell_names=None): """Combine data matrices from multiple batches and store a batch label @@ -405,26 +447,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): raise TypeError("Expected data all of the same class. " "Got {}".format(types)) - # check consistent columns - matrix_shape = data[0].shape[1] - if issubclass(matrix_type, pd.DataFrame): - if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and - np.all([data[0].columns == d.columns for d in data])): - common_genes = data[0].columns.values - for d in data[1:]: - common_genes = common_genes[np.isin(common_genes, - d.columns.values)] - for i in range(len(data)): - data[i] = data[i][common_genes] - warnings.warn("Input data has inconsistent column names. " - "Subsetting to {} common columns.".format( - len(common_genes)), UserWarning) - else: - for d in data[1:]: - if not d.shape[1] == matrix_shape: - shapes = ", ".join([str(d.shape[1]) for d in data]) - raise ValueError("Expected data all with the same number of " - "columns. Got {}".format(shapes)) + data = check_consistent_columns(data) # check append_to_cell_names if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame): diff --git a/test/test_stats.py b/test/test_stats.py index fdb7eff1..88927702 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -6,6 +6,7 @@ import scprep from functools import partial import warnings +from parameterized import parameterized def _test_fun_2d(X, fun, **kwargs): @@ -121,3 +122,74 @@ def test_knnDREMI(): "Attempting to calculate kNN-DREMI on a constant array. " "Returning `0`", scprep.stats.knnDREMI, X[:, 0], np.zeros_like(X[:, 1])) + + +def test_mean_difference(): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100]) + assert np.allclose(np.max(Y), 16.8125) + assert np.allclose(np.min(Y), -0.5625) + def test_fun(X, **kwargs): + return scprep.stats.mean_difference( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=Y, + transform=test_fun, + check=utils.assert_all_close) + + +@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), + ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) +def test_differential_expression(test, direction): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], + test=test, direction=direction) + expected_results = {('difference', 'up') : ('Gstm5', 16.8125), + ('difference', 'down') : ('Slc2a3', -0.5625), + ('difference', 'both') : ('Gstm5', 16.8125), + ('emd', 'up') : ('Gstm5', 17.5625), + ('emd', 'down') : ('Slc2a3', -0.6875), + ('emd', 'both') : ('Gstm5', 17.5625)} + assert result['gene'][0] == expected_results[(test, direction)][0], result['gene'][0] + assert np.allclose(result[test][0], + expected_results[(test, direction)][1]) + def test_fun(X, **kwargs): + return scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + def check_fun(Y1, Y2): + if direction == 'both': + Y1[test] = np.abs(Y1[test]) + Y2[test] = np.abs(Y2[test]) + np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + Y1 = Y1.sort_values('gene') + Y2 = Y2.sort_values('gene') + np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=result, + transform=test_fun, + check=check_fun, + gene_names=X.columns, + test=test, direction=direction) + + +@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), + ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) +def test_differential_expression_by_cluster(test, direction): + X = data.load_10X() + np.random.seed(42) + clusters = np.random.choice(4, X.shape[0], replace=True) + result = scprep.stats.differential_expression_by_cluster( + X, clusters, + test=test, direction=direction) + for cluster in range(4): + r = scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=clusters==cluster), + scprep.select.select_rows(X, idx=clusters!=cluster), + test=test, direction=direction) + assert np.all(result[cluster] == r) From 90528466efaffdb2f642350d5ea764864e8969a3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 18:07:57 -0400 Subject: [PATCH 022/125] rename to and add tests --- scprep/stats.py | 48 +++++++++++++++++----------------- test/test_stats.py | 65 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index f60ab922..574c2e8a 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -391,7 +391,7 @@ def mean_difference(X, Y): def differential_expression(X, Y, - test='difference', + measure='difference', direction='up', gene_names=None, n_jobs=-2): @@ -401,26 +401,26 @@ def differential_expression(X, Y, ---------- X : array-like, shape=[n_cells, n_genes] Y : array-like, shape=[m_cells, n_genes] - test : {'difference', 'emd'}, optional (default: 'difference') - The statistical test to be used to rank genes + measure : {'difference', 'emd'}, optional (default: 'difference') + The measurement to be used to rank genes direction : {'up', 'down', 'both'}, optional (default: 'up') The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. gene_names : list-like or `None`, optional (default: `None`) List of gene names associated with the columns of X and Y n_jobs : int, optional (default: -2) - Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. Returns ------- result : pd.DataFrame - Ordered DataFrame with a column "gene" and a column named `test`. + Ordered DataFrame with a column "gene" and a column named `measure`. """ if not direction in ['up', 'down', 'both']: raise ValueError("Expected `direction` in ['up', 'down', 'both']. " - "Got {}".format(test)) - if not test in ['difference', 'emd']: - raise ValueError("Expected `test` in ['difference', 'emd']. " - "Got {}".format(test)) + "Got {}".format(direction)) + if not measure in ['difference', 'emd']: + raise ValueError("Expected `measure` in ['difference', 'emd']. " + "Got {}".format(measure)) if not (len(X.shape) == 2 and len(Y.shape) == 2): raise ValueError("Expected `X` and `Y` to be matrices. " "Got shapes {}, {}".format(X.shape, Y.shape)) @@ -447,53 +447,53 @@ def differential_expression(X, Y, X = X.tocsr() if sparse.issparse(Y): Y = Y.tocsr() - if test == 'difference': + if measure == 'difference': difference = mean_difference(X, Y) - elif test == 'emd': + elif measure == 'emd': difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)( select.select_cols(X, idx=i), select.select_cols(Y, idx=i)) for i in range(X.shape[1])) difference = np.array(difference) * np.sign(mean_difference(X, Y)) - result = pd.DataFrame({'gene' : gene_names, test : difference}) + result = pd.DataFrame({'gene' : gene_names, measure : difference}) if direction == 'up': - result = result.sort_values([test, 'gene'], ascending=False) + result = result.sort_values([measure, 'gene'], ascending=False) elif direction == 'down': - result = result.sort_values([test, 'gene'], ascending=True) + result = result.sort_values([measure, 'gene'], ascending=True) elif direction == 'both': - result['test_abs'] = np.abs(difference) - result = result.sort_values(['test_abs', 'gene'], ascending=False) - del result['test_abs'] + result['measure_abs'] = np.abs(difference) + result = result.sort_values(['measure_abs', 'gene'], ascending=False) + del result['measure_abs'] result.index = np.arange(result.shape[0]) return result def differential_expression_by_cluster(data, clusters, - test='difference', + measure='difference', direction='up', gene_names=None, n_jobs=-2): """Calculate the most significant genes for each cluster in a dataset - Tests are run for each cluster against the rest of the dataset. + Measurements are run for each cluster against the rest of the dataset. Parameters ---------- data : array-like, shape=[n_cells, n_genes] clusters : list-like, shape=[n_cells] - test : {'difference', 'emd'}, optional (default: 'difference') - The statistical test to be used to rank genes + measure : {'difference', 'emd'}, optional (default: 'difference') + The measurement to be used to rank genes direction : {'up', 'down', 'both'}, optional (default: 'up') The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. gene_names : list-like or `None`, optional (default: `None`) List of gene names associated with the columns of X and Y n_jobs : int, optional (default: -2) - Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. Returns ------- result : dict(pd.DataFrame) - Dictionary containing an ordered DataFrame with a column "gene" and a column named `test` for each cluster. + Dictionary containing an ordered DataFrame with a column "gene" and a column named `measure` for each cluster. """ if gene_names is not None and isinstance(data, pd.DataFrame): data = select.select_cols(data, idx=gene_names) @@ -508,7 +508,7 @@ def differential_expression_by_cluster(data, clusters, result = {cluster : differential_expression( select.select_rows(data, idx=clusters==cluster), select.select_rows(data, idx=clusters!=cluster), - test = test, direction = direction, + measure = measure, direction = direction, gene_names = gene_names, n_jobs = n_jobs) for cluster in np.unique(clusters)} return result diff --git a/test/test_stats.py b/test/test_stats.py index 88927702..cafc914b 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -139,24 +139,30 @@ def test_fun(X, **kwargs): X, utils.assert_transform_equals, Y=Y, transform=test_fun, check=utils.assert_all_close) + assert_raise_message( + ValueError, + "Expected X and Y to have the same number of columns. " + "Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape), + scprep.stats.mean_difference, + X, X.iloc[:,:10]) @parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) -def test_differential_expression(test, direction): +def test_differential_expression(measure, direction): X = data.load_10X() X = scprep.filter.filter_empty_genes(X) result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], - test=test, direction=direction) + measure=measure, direction=direction) expected_results = {('difference', 'up') : ('Gstm5', 16.8125), ('difference', 'down') : ('Slc2a3', -0.5625), ('difference', 'both') : ('Gstm5', 16.8125), ('emd', 'up') : ('Gstm5', 17.5625), ('emd', 'down') : ('Slc2a3', -0.6875), ('emd', 'both') : ('Gstm5', 17.5625)} - assert result['gene'][0] == expected_results[(test, direction)][0], result['gene'][0] - assert np.allclose(result[test][0], - expected_results[(test, direction)][1]) + assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0] + assert np.allclose(result[measure][0], + expected_results[(measure, direction)][1]) def test_fun(X, **kwargs): return scprep.stats.differential_expression( scprep.select.select_rows(X, idx=np.arange(20)), @@ -164,32 +170,65 @@ def test_fun(X, **kwargs): **kwargs) def check_fun(Y1, Y2): if direction == 'both': - Y1[test] = np.abs(Y1[test]) - Y2[test] = np.abs(Y2[test]) - np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + Y1[measure] = np.abs(Y1[measure]) + Y2[measure] = np.abs(Y2[measure]) + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) Y1 = Y1.sort_values('gene') Y2 = Y2.sort_values('gene') - np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) matrix.test_all_matrix_types( X, utils.assert_transform_equals, Y=result, transform=test_fun, check=check_fun, gene_names=X.columns, - test=test, direction=direction) + measure=measure, direction=direction) + + +def test_differential_expression_error(): + X = data.load_10X() + assert_raise_message( + ValueError, "Expected `direction` in ['up', 'down', 'both']. " + "Got invalid", scprep.stats.differential_expression, + X, X, direction='invalid') + assert_raise_message( + ValueError, "Expected `measure` in ['difference', 'emd']. " + "Got invalid", scprep.stats.differential_expression, + X, X, measure='invalid') + assert_raise_message( + ValueError, "Expected `X` and `Y` to be matrices. " + "Got shapes {}, {}".format(X.shape, X.iloc[0].shape), + scprep.stats.differential_expression, + X, X.iloc[0]) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression, + X.to_coo(), X.to_coo(), gene_names=np.arange(X.shape[0]//2)) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression_by_cluster, + X.to_coo(), np.random.choice(2, X.shape[0], replace=True), + gene_names=np.arange(X.shape[0]//2)) + assert_warns_message( + UserWarning, "Input data has inconsistent column names. " + "Subsetting to 20 common columns.", + scprep.stats.differential_expression, + X, X.iloc[:,:20]) @parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) -def test_differential_expression_by_cluster(test, direction): +def test_differential_expression_by_cluster(measure, direction): X = data.load_10X() np.random.seed(42) clusters = np.random.choice(4, X.shape[0], replace=True) result = scprep.stats.differential_expression_by_cluster( X, clusters, - test=test, direction=direction) + measure=measure, direction=direction) for cluster in range(4): r = scprep.stats.differential_expression( scprep.select.select_rows(X, idx=clusters==cluster), scprep.select.select_rows(X, idx=clusters!=cluster), - test=test, direction=direction) + measure=measure, direction=direction) assert np.all(result[cluster] == r) From 0be7b6cede414ee7d2f902e8384c62e839a783be Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 19:02:36 -0400 Subject: [PATCH 023/125] require parameterized --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3871b98b..a4d820d6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ 'h5py', 'rpy2>=3.0', 'coverage', - 'coveralls' + 'coveralls', + 'parameterized', ] doc_requires = [ From f7f285c1b8622d6c8c468332a5d46f8eaca6a813 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 19:19:06 -0400 Subject: [PATCH 024/125] test without gene names --- test/test_stats.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_stats.py b/test/test_stats.py index cafc914b..caff585a 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -163,6 +163,10 @@ def test_differential_expression(measure, direction): assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0] assert np.allclose(result[measure][0], expected_results[(measure, direction)][1]) + result_unnamed = scprep.stats.differential_expression(X.iloc[:20].to_coo(), X.iloc[20:100].to_coo(), + measure=measure, direction=direction) + unique_results = ~np.isin(result[measure], result[measure][result[measure].duplicated()]) + assert np.all(X.columns[result_unnamed['gene']][unique_results] == result['gene'][unique_results]) def test_fun(X, **kwargs): return scprep.stats.differential_expression( scprep.select.select_rows(X, idx=np.arange(20)), From 89980d27114be29b64949000d3bbf6d7f5320f5c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 19:21:44 -0400 Subject: [PATCH 025/125] test subsetting by cluster --- test/test_stats.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/test/test_stats.py b/test/test_stats.py index caff585a..76297fbe 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -221,9 +221,9 @@ def test_differential_expression_error(): X, X.iloc[:,:20]) -@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), - ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) -def test_differential_expression_by_cluster(measure, direction): +def test_differential_expression_by_cluster(): + measure = 'difference' + direction = 'up' X = data.load_10X() np.random.seed(42) clusters = np.random.choice(4, X.shape[0], replace=True) @@ -236,3 +236,21 @@ def test_differential_expression_by_cluster(measure, direction): scprep.select.select_rows(X, idx=clusters!=cluster), measure=measure, direction=direction) assert np.all(result[cluster] == r) + + +def test_differential_expression_by_cluster_subset(): + measure = 'difference' + direction = 'up' + X = data.load_10X() + np.random.seed(42) + clusters = np.random.choice(4, X.shape[0], replace=True) + result = scprep.stats.differential_expression_by_cluster( + X, clusters, + measure=measure, direction=direction, gene_names=X.columns[:X.shape[0]//2]) + for cluster in range(4): + r = scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=clusters==cluster), + scprep.select.select_rows(X, idx=clusters!=cluster), + measure=measure, direction=direction, + gene_names=X.columns[:X.shape[0]//2]) + assert np.all(result[cluster] == r) From 3070a9eccfdd0211820422c9330f6a782a710a07 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 19:32:56 -0400 Subject: [PATCH 026/125] exclude rpy2==3.1 for python3.5 for now --- setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index a4d820d6..4cf47d9b 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ 'fcsparser', 'tables', 'h5py', - 'rpy2>=3.0', 'coverage', 'coveralls', 'parameterized', @@ -33,9 +32,9 @@ if sys.version_info[:2] < (3, 5): raise RuntimeError("Python version >=3.5 required.") elif sys.version_info[:2] < (3, 6): - test_requires += ['matplotlib>=3.0,<3.1'] + test_requires += ['matplotlib>=3.0,<3.1', 'rpy2>=3.0,<3.1'] else: - test_requires += ['matplotlib>=3.0'] + test_requires += ['matplotlib>=3.0', 'rpy2>=3.0'] version_py = os.path.join(os.path.dirname( __file__), 'scprep', 'version.py') @@ -47,7 +46,7 @@ setup(name='scprep', version=version, description='scprep', - author='Jay Stanley, Scott Gigante, and Daniel Burkhardt, Krishnaswamy Lab, Yale University', + author='Scott Gigante, Daniel Burkhardt and Jay Stanley, Yale University', author_email='krishnaswamylab@gmail.com', packages=find_packages(), license='GNU General Public License Version 2', From 66df7f341dbafac0717ec2b554da71aa288e9f00 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 20:47:17 -0400 Subject: [PATCH 027/125] fix unique results for direction='both' --- test/test_stats.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_stats.py b/test/test_stats.py index 76297fbe..cd4c4e62 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -165,13 +165,19 @@ def test_differential_expression(measure, direction): expected_results[(measure, direction)][1]) result_unnamed = scprep.stats.differential_expression(X.iloc[:20].to_coo(), X.iloc[20:100].to_coo(), measure=measure, direction=direction) - unique_results = ~np.isin(result[measure], result[measure][result[measure].duplicated()]) - assert np.all(X.columns[result_unnamed['gene']][unique_results] == result['gene'][unique_results]) + if direction != 'both': + values = result[measure] + else: + values = np.abs(result[measure]) + + unique_values = ~np.isin(values, values[values.duplicated()]) + assert np.all(X.columns[result_unnamed['gene']][unique_values] == result['gene'][unique_values]) def test_fun(X, **kwargs): return scprep.stats.differential_expression( scprep.select.select_rows(X, idx=np.arange(20)), scprep.select.select_rows(X, idx=np.arange(20, 100)), **kwargs) + def check_fun(Y1, Y2): if direction == 'both': Y1[measure] = np.abs(Y1[measure]) @@ -180,6 +186,7 @@ def check_fun(Y1, Y2): Y1 = Y1.sort_values('gene') Y2 = Y2.sort_values('gene') np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) + matrix.test_all_matrix_types( X, utils.assert_transform_equals, Y=result, transform=test_fun, From 2a05a0681ed24b3b6f4a6320fd41f98b131af845 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 21:06:25 -0400 Subject: [PATCH 028/125] test.png should never have existed --- test/test.png | Bin 88464 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/test.png diff --git a/test/test.png b/test/test.png deleted file mode 100644 index 01c74e2d7e564d313d1d078688c4bdc0e477405a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88464 zcmeFZcRZJG|3Cbej6|q}5E+q_RQ8sVQBepPS*7f~DH=pX8I_Tl5T%qY3MnBo$|!qf zWY7C~c3t1!{k{LZ|G59VJ+ATj$T-jAcpb0Ta~%b1o>8Hr*-b+tk?2%UDW4;esHjP# zO?1?h_>AD?1CjW@&CW`y+SK^)p|%LZzi)9krSD84F*6eXCHuVZuRJ~};c`OH<-Gl6 z7k5)9OOmari-V25i_KLtE;maj=d1R1M-Pb{IxNIx<>KNXB`o}Z|M?+%rz^tw6L*+N zBrcMwvZA&}+<2F};raFr`Kb%{4`y7yD^#I%rK|XEuhOeOQe&GgT4(7R<{M=FwsG_w z&3vI#QF46coWjyKgCJHGbVouC*2#7!W{HB&m!KPF_mt;2FjFGeXi5MO*3UZk3nIudc04 ze-SR<>Eav|B#`Lg<)v1igdZorvHJRqWd`M;FVP1tW{mQ13#us{KhD<^zP`Rr(UrQ% zd8CQ&mA~2Wf?`*p>fB`uiK|HLnYz2Xrxq4!x{BTDI`?reA5Ke8|J`5v;QL!! zHr;H!8->n8X0?%gdgXqyl^;Gt#Kjplo)&u3Z@h4R?aadTw8*K8Gbv&8tUdB;zqo!^ zHWWN%Q&8qrv9c1lTpzcEnz;EyDP4x8j2h7z&V(>#vC%a3{iX`YiFKKS3pr~kX z7jKst92^udDmo^pn;mrW-tHvp_MG9C=Q~0}B^p9#1O4@h_0~@PSXE`(UE;}UPeRWFo>o&MI}7hxYTR7r(Av;htxxB z2L@78b2H=a-FJn{7B}f+YFFH4I&dF*efGsc<2=e_i>?fhQ)kXphcF8}%#Z7@&i}Aq zoNYd^Y-msF3rQ_RjfDvJxhPK-m6W1etr&)j!Rr)Jlx!KK0ZEo*PKjndarSY z-@RM8JU1rdJ#WLy%S$5t=_wCa45a+^DS~IILz!D}Q}iLDyVHF&JUl$deKe9ZpW1a5 zSv9BAv2$>kUb%AToqf;msu1P^k7;g{NknAiCQ?vXSoQF?ly7;K>Mgk@a{KOx~InXj}$1`a~Xuh@4FY#O3 z&C1G3yyKgz&9p_i_I~Xu%F45H?Ck7QYYROo$M0;j>MF`-;g#{46~ON0HI$kkg)H^|5-FU^m4WEd9aRrhV*zWt@`53#y9$sOUG^7Y~AehxF{`&h0#VWJ}= zBfCZ=%g4v}3r`&@X1n{4QPF0SreGAy5ew?w;&!2eI+^{>>LbUiNqe;zBOX7#+t{cn zE$y_T2D&rymbna*PHAQOC*q?>U<&cOrN@pXMvG6K`yP?6Um)@Z0gR z;Gm%H#H_3XMXqDF^R3%Ak;H7iUk~2F|GO_-ew_;kZTUX&42#o!9<#%Y#$`TRtgWrJ zGc?Km{Q2|p-MfZ`dF*EI&q8Ojjhc=QR@=@uOI-6~*56<5-FGHVKtIwBH7-R#9i_&1 zCSK~+%EBa%pdih@efy?nW}3XVm3S{4s!xy&86LJuO-+63vow=oTq^Zg)cW>zKDGWi zKGm0#YO{Xp>nrxPes1aM>E8<+d6s8KQs%x#@~NABe`(5c%!*;~>ytYjudlGZOjE9U zaO)ODk;_N`s`cslXJ^dbi;Idfy}r^g_*AFq0J;qgBPSYE@6@No^LTIQ%Yi(q@tuG+8VR4@y*0U&M;2-1fKFok60#_YUmg7#Q|JwZQtcdHV_W1W#@?XL^!r@-*oouS;G=(tD|3{g8Mw-MRJB-UQ%dlFU_iGluw*cooR>_Ic|GhlVP(>cL^(&wHhs} z^=(aC+b*7yq3`MCS4jsBWXY87^PMvazWe&)M`hYQGAwFo=X`9M(@zg~6~8+(5c~LX zWls<1*C*l)uauMbUA=l$!gX}#pRVFNC;~k_z4zVSd+>*Mo;-PSW^QP(?$OZ#&%gVU zlanu5THeak$*OE_-hp1{DR}m|_V16uI|^LKcA1%*o7&rl<3v2w$951Lw}4cbps5`B~(;RoJ*h;oQ0HYEgo>W(K~b zp#6ERt$aRv`SNAGQZLcW3%Q35QTX(}C&Pxsug?>G=}2cfIf5zrX+Xjq%0BcWpmB#rEzcOG--enCahwz8grxB>2>^uR6o1_^t70 z{zsd+j}*LruRwO)V=FDK_7nTz#w4>n^z`(54;?zQ*#F=xZ_Tx$gC=E$*3x3xfqIQJzt)!hi0ACQ+|MWC_BVGOG_dkH zI4U=J^XBG?A3rY6ul)M`oBL;h<8kX1x(;_f!Fs=?0p{9`M-_ooY$f#iUH4?oTYZOZd&R6{!Z+PJQfCun7;n?Ea--@pUZ`MoFm1P2R#sX>JsXkwm1p^7 z^OY-C@?9SuxNv3>z1=d3bH@%Ij4dUrp0d(vu`6BIS~B)-rlh>Kb}xcYcruaQDVP3A zdx4{Qt>eOEPmE0NAMtQ5Y9%YHjK&gA=Xk#-#M=#bmlj_#GfSQ?Jtii$tE9B_)|VLJ zFV~8TtT&^gDl@cFHXk@}Am8t&bat4Cwm5CF3iM#jj6V__GMn45aagvJ*4pHJ__*kEYb+T*_=yS_3hADAy* zj^Vz{u$hV~Wc7~Y~8?(o2c9@KrnVFcUd-v{@@L7B*A?Wul`2@b8M!0-6d10Pi z+KtwEsQ!e3!S1;D_;0VToNdX_BFFD5aO|TLv-=qoBW$vXWcRb+#^j@|!QqZTa`+#t6?dvCWWPMqYQpM`f(5IHt}R@ zrjBK4baE4Fb^M_k-!5Tep>!siTNcl?j!QTX(#oyPo}XUPd4TN&jM_tBsJ2`a`W-uV z#E&ik7!kR?y-rlhzk3b1S_3=Q$55|vtez-pn z6?zFhk8~Zg)O_Jj*KliAjACb8&m4xbtFTGg&D`8vj~lh0K2c-aYQ%edcxuHRe=Eny z`R+!`+1{{KJz7Zd*_n73DdA5d)@?$>n2tS6hC03V=+UG8B~JfUMh+>L_W++{B%u8o zv}nyN-Gi!S*%P#R!mbA2m-#zlmW52pd~%jJWxZ~qGAcfQrgrLBq^I3V- zfWXsF#HzwLq@89D;_*XXnb-V&*W*V{D)E}50m7^T^0hn|p{u=|qkOBYHR}RFDnz=H zj#Fvp7~4=gVMT+Xrl+sL`S=HkzZbb4hWRHpN(nA#r=^GIl9-qE!Pa)krm!>ulE0)l8os9hB zD!@Y}CiJnv6L0tbsIRYA($EO4UIo;VjDG#?*sY(M>Y-)!OUlX*{ofoE*^LpZZ;*la zt$cL!iqpaKU6F|a_X}MfhlGSI8qSTjiXLK}EmIHOod*z@to&ztN{{bCJa5m&`a-$k zmr{@E$YvtI_oHPK6O9cOl{Smd-uN1#g_T5sEu~MwS7`ua(ldZJJSUe zeZ(r~7HaCldPW!~N|=+pb8Cxz;YtPu24N*Ao}I)ftSxpLP`hV&Wn*3R$dT=JU%%eE z$0nZn)gN@SqPqHI-DGd?{jqZ?8>@fMh7GU)c*XLV7#KW0Z25WTD>|o~l`R3QW0xkK z6Cy6EV8Jfg+Y5UN`>)Mi$oJg9D5Hd-(7{7VqVdkt)CZ01Wd1Ce~>L^XB{ z>|kWHtbNS4I6rZb^oYkCAS4((t^c`dt)Yxu&*CO-@EFt*E#-p(Vd; zlXAK0P}T=yA{~DT6v3X9BP=E+wzxEu==}A3%1OF6c3lPc5Y;@zUxdeQc8$H&Jnes6hxp07uwP)jh1pk?4g z8S_iG>DbiDHd2z4u?NYoN+!!z&Xm}jn%?N`y#&@RuObIrQTIfg9(1AcDr-%p#ro=! z$Mo-8vBqz#BC0#0fc$F!_)JmJa$=Ya`g7>8ENpCj*h!${z)qQXDc8t{?qJY{SBB0$ zKDGt(>(;YpS8`r|>-zcAVYqQ`ZC)Dy;e*ycmA%ja3fv|h$GW+=+{9^W>swx!4Daz< zp^J`=Hu#I}2tTE!w!N8Xz6}Yot%GaOD)`i2K1)wGo9VAL7$b45%Wt8jZP8oD%;hVm z2hSk-bFEA8WrmV?B1y}H*p9a>bQYuG~zv9EXl1en)}Su@|!I;$ZtH< zkppmwW;&qz)TG?+cax&)SXr!K)|GNGdDY#OgoNAVqO-#cnOoxAEdrTo+i-@>*jE*qrp`Z?p{OVL->^j=)U1d#i zYgIcPC(2bI($B{0qJn~Id8zi8f{nmShT$VW3y*p&{5hD8ZXOEZ;7evM=7d4Rh>71) zng!_thD4S1mQ;VpGfd|&u^b#68Xq?_HJzI|&I|sey98M(QcJ#2i{r#H#$)a7VM6m- zny%p#@tU=IMLr0Q{_^Ev{UHfkFk;@6Es3(;>CJL1w@QgVD_}htE9A65pQ-&k(xSY4 ze4s?wmYMIhgrsEf!od8CnI}p~_kZI1G}AR$fpq#7di-MyXKh+C7~8V-d9n;!dkyT4 z963UuYLTv2<|_J%mq^1+X(1ja&$S0*zol*?&Rdd{2l*%3GPcSlw^tB;q~%d^-r+M0C-$op=CCB4hF$b9>!?02?t zMA8cGMqOq10o(C)Ul5Fv@R)j<-IqsoLsV<6d||Zp0QiPln`5dpA)f4S-H|Mo`Q;7h z;U}qd2s(LR8WI-TUR=xz=i^TPy`P7N{NLuhnYxnbdmBI{324GA>lwsI2X}YoW@uWN zFs&DtD2#~K2Gqnc?MHBPNKzd&nZ_kL2UR%x=W}P8-U)^Z9Xu#PKrac$O>J#$27fPJ zyqE%kDDSi1^6Y+q-zpa%GK+WWb4Mxred{r&Mrtr~d9SukO8b5uQ-}cegH~|fT=Ma- zGbw2F>8)_NV;p9GVfcGd|Cr}qD{l?y#u*x9=+zh(9o8P9i9VYV^7Q*!=WO9 z`G)~_sdZY}*@Zz3`m`*|-Gt}xVFt9X4*?WgEz17@h}7oDd;4p~G)S(5QcC)&&l?5L@~m9CvAF0F*z!Pj zDzRO0b3{Z$WCitqS0(_QO;5M*x*a(yO~@*Hqq(*9R?rlE%P}cUe&}a7d^MmDjmamr z63QPzmKZ(082g(V@hNaXYcMQuP;{F*<63^gR$IOt7_4?ZZYRyH{*9oxa&nNoj-${P z;`ux!dutq={?Mu`R|Q~}j+b=A1RQP&KLwM(A@gJr>)M=txp7BGf{Lc=a^AWw=o04bSuE&&Q8yiU%qDaMv)Vyl%DI5DRjsgNlDA1yL26 z54bQ~$N`jOJZF@L@*X^R@V8~3uOJY6*!xlle`+y@=J=oE#9ObHxx z_>`~w+UyRshXQ0I0(cYB^0~8$tt!ge+8Ogq{dKFFX`PdR!=?@|?|<&*Y#DH_SMQ1R zy}-TpX^)Th;#892=(l!-h;hpSs@BQ-3#@UPJBPg$&iY+^Wkvy9oF*n=Wot{PqM`y8 zpns&#IO-?UciR12TqFqFkw50f+8sw*_S^h;O9!Z3T3?~aHTi$uXXc#T03G>F<*J8= zp5)`qoN5Amketb|)6Cv#B)pq}QR|zCZRarz^|Km11&-wCxRtnue(B{2OmVw(oI2cdEEj-V9|eM*t~tQ@oBZqnsQyu&X7gw`d>n`rbv z6T~}s%#T0n9-CQMU~W)ROWXc{|7@E#2ZcV-wlK9tr~M2K+4so%39WoTa3Lq-;2j!f zXho_Ulh6?J!gba{$vhKZm!bTsqVC+e!|OXctQFXjvOD(d>C;tEzbrVP#3*iFe)vCy zN4y#Iru)&+lw8ibozUMI7#K+I?hEYdZkP9~d2bu!|If>y|M*vt$>06}(O-lKK1@rC zw!SAlhZJwOyQ5?6DP`awk{*DxNI+08msl z3+;vwz#(Id=I=pWEG{htk6;RM3khlDy6&_f&Y2=Am2U*dO}|DKk`H92cD*yEDV_3U zcQ0_JDF<)2`dRQQ7BHpUvV5JKYFp3(ny6Eq)j5?O-wVmVGcg|`WnXVB4RHnv9ew>7bf4aXn<3esin>jRA`%n9r^fMEC00*ny zXNd)HouZujv?L7~HF#XX*1{r{SD0#*WgAhJ+SmxU@$X5ZJtz%_nSND~u2OIJ{3U{r zbT@qZ^d8ngel!OQi$F@+BkgCAV@H-l;mzHF3~LH`!b}S4!Ja*Pq+FE;7V$1bw@Bwd z{W!IKW@{0EK;@qYXXPS%gyBI<=FKnZ7doZ!n#3G3qHU7%tu1TOaU4r=ndsCh;Y9VV ztt?o}?vs+*4LNOlGh{dcoy>=AeY0Iqh;j(k-EY-HY9gbcOk!o!a2FKV~U&+nx zgIYE9r%O72pnnjkJUU8vW#c)}=qIT#ml~o#=0DKObBT%R_WgxtwFu3Lk7MrFhdWtc ze>_zr2kN>zBEP=K(_RFLH=ICkZWn=}Kd#_F=@f4ffh1xFuzb)D6wI-w6I0;BhY#d1 z3z{x+q^ZYLL+8&Nklabjz=u~8XMaR0wf{}(EmcQiI}{H_Jy%ao^$hE9xvW>Jv^Kq zBgNx9of?Dhr`zGJ2N{={~B zXe}%<9xpBWKRs0p)O=@Wls}-audnKN%JydgBcAH~lZDD#TdjV)JqS65-89qsJ! zk>&E-Z2SXLu~rcb@&Nm?CCc)R_2R{~jj3P9p=o^D6MvuvB608Ns9<`cynoKSysgJ^ zT5olwsy^UVmxc-Rj%kVQQ(C_*&ScO5C7RMU>rH!v{!aGNf2>;P)K236`R zNB{$v?}WX}cVj~+lbnQUeH)U|kU=-ZH9k)pTib8t{{Hcusq~=diV!5LW})l`KouME z3VlmP>uEWBL*U=x?@@Ohxpo?rYjJGI`tF?%TO7VjbPhG)AqZv1QcyXF0403;dr z>>MgocFwASD_4ulM=*`o6O1IFfCUPXpzi%63or}_wOgHNf;x^u8qb@N}Jkpj6#4oP2t7ujwxOJ9U{#8nwt5b!A;0j+%Vu( zy0N7x@Q^%~X3`eFL-^o*{qS!y;ONH{G&n-p8!wFE135rl)Z~`03$CnaFfWNPq-AUy z9{K(%#bX1oqYB^GGN^v))O)CGA;6d{{YRHIVgFI>+O^B9$bVyf@o$5C>!7U16xZ_n z#DVnFr%zkvGQ`ZW(sD@cgfe*k$rY>!2eFyc*LFM^O?p z7&0CUe>jL&0usplItCAbM_ZKSRQC(zwdX^|9334gzP<6$N><)N2=T8QFdI z3aXfbZOa=IQ6^v&M2_8r$+>06J}ztp9uV^RdtfsyyfwOUB`Vs@&CO$F{-AM*$BnJb zhq#U%)85Y2^s%lEY~F_O7^*Cf!|8r?wRzv{NHbNGpblPc)0srMd$8ivqJzi3rBIy= zWifr(o^u)-LU5B1^@DOAQzVS$lLWEP)H#5>MIRI%#Ac{qtdMj4U29qJ3)3gp%A?e? zpi_2qb-k6JpI^Wo0vQ1)XVZX{U1OQ ziOde!C)#Yc$!-qJY?SNm(o$J7GcyOwH5f#v!sw`}@5`E(TYiZ?EGBmH)G2DhyaMk* z-%LT_K)<5c#x8Lkd4u33CK?LliYlCoZipzH(hi-&I>C~@_21+`AHhCo*(DgTyfwYTa|SZ>UU9?A2*oj7#ab44%?Uz^iFo)h z^*r)Z+7h&^M>bbiR}+B{NDU0EtXp98uL6%DM6v801)OvfQ(dy>vR%JBMg+y)6*>8i z1(nY^5X}kOoLD9R!OXR;98H0#TK>dK@Tem@tWGEb82w`eaQVIG$1AYt3_ErNf-vi^ zNcd8tp*l|WD%y0u+2TA{_pM-OBFT9>MlM7;$Vm{UZsIkc zf8D^!nZx>_Wf(L=LuS}amU8mm_MYaxn(!>yp-qq0dQODwQr#D9Wsc_uGpeBa+ChGf&vT_0ZxcBpA5()-$g_hdM3KQVLN7Z9U{_yGcd4HZ$?Bg z8X6il<`_vsq#icBj5rSTEc!TE?_Ge6N;Wn^gb#(2m;{d}(ly1ki?$mE5F{@mp5$uB z%*e>9Yg}(m?3~=js`I^uF|P#8fr*J1eytzhlFf}p%{+d$$`k2 z$KuppBv>e59!p3_5FRCyp!WSP%0k}*;^OaQeEGi|ln;6S{CR!N@ThP7>B4DKdu}>jw5(SA8=FyMVjEA-O8@m;YWDB-u3Zi%Z5)`ei{{H^8 z##R*1K_dtoif|5r9gwRKRM`U~PXo#>?0ykF=^1MPH1@`3sHza(OySg+kwlB7Tt_eA zSL7{}9Q;F69+B*Tt)b}PaJZ?d$*F(8R`TYOl9DNmjG?Q!J;+^fp@m|pWT95^*B{nx z1p*ej_*P^7t@f2JU*my?bUwz_#y%+cAc({j<^c{ew5tSl@Pt2$Qyq%4W%}l8*OeX5BTR)(jz|KRQ z20m9Hc!iA6Z6KuXQM}80qRBEb-6|-M&@(WoM3xQ!1*)v*C5w-*CJ`}QM_MBcGiEQm zHNaL7tQVyT=uX&-EhL>H%#OSt%r=!A>+G77{WQl79P#_eCacK?1a(sECHN znN$g1`adb$D&tF+o_g2~Hdu2f|FqnwuPd5B;AbcglZL|C`7gUA9fy>*?K&0xo==}I ze1{iI^n6AlS5xMWL|!G7-hDE_WG0=E40#CMUqC-EwR8S^XD1CR908&2GlnW&UPUiq zI3*#bd2Q@dQPlqPClQD!zaJZ$QFGyxGR5iULhdodc6pJ-p$44YajiY)pwIHiGenMd z+{7%WhB@C-SO)geZ@@{asH{{%`jGjtn5`h;?@BoIUPlZyXTj@D7kbvSS3a%QxAos) z`9@a#r~|mrPE>8z3gfMj#}kycH+4c>xZ=wfb)vC#wi{xdOz}%8?Nv^myzKy84n6G@ z3v6p5w?q7{rs(_k8A@ARVX2zFzfKkagi>8DdP`~VF{?%eK$$znqp6j_sL50OM4kfd zm_)L(vzwZp-v6DFkugF5J+iE|Oy zerOd9j|elv<*|D?Ph^D>edk$GIXNO{F?x_u2?c}egPlcW*Uq>CzEB*$Ok zUT~S7Zw7RDbH?jNH~y0 zL_`EG^9oY?&9P`3`FAm5Qz(PUYuQ|FML;a5&Rc~JOuHz z;Zj-m$?g-twNvm(reDGPOh!x%by%1@^^67{I|I4Njp=FFpBB6m$1Q&IHpNRk?lN7e*4(hG{72p^0+k1(w3A`?a>?p8r zD3Ml%1`Vv3d}dBkN@|eB1*a+~HkJiAC?p(oiA038FlfGEgmRcfEqlHBHkVIQ0DDG) zV57F)f2y~_kSf4O0Lva6!J~3_h!HInh&dU{h{McFWURnSxeguLMqqz9-2E=_HG#~gCdTc9FK?l}uNAD=ypAUok?DO_%> zLYO$wXJ!jfqKlW&$RdT%fd0!2GOm*l_9yaEsU&2@#SA z_L$a)G8|p9{1PjuTQ|6SsG*5#(VqmmDRATpi=&!br2qWW~#X`Pq^4h&dT* zwBvz*L83QPz4lpO8j2OQj*j(19u_nyseOL@XW`{s)lysORn^+!n!BK)t@m+%A71{6-3w!Y2X5<0h(=`aza~L z;0@B?C!EyQZt-rnynLBbSXh`y6m}K4?%+I&_&(O6r`X5$8UoaU7jkGx5HI-VCwqj) z3IMkpqgZhSQ0J`K_b9Hy4+;toXKqG@=rbOXgha>qk9YJ0zGlywR8%DzC`1(pgx)}6 zwh>(r4}BLWb0lpQN8fz2i>*AFq+GLKZ|*0}Nz0u4>epTL))I{GFMp#CU2bptmrKZ5n$hDDyLtjd<;f}1WIMkJ`t*u-W2TtMPK48XHxj+|j zHsWrpuBJSCxsIWkNJ?RP%yp(CLv0GGW!3of#`vRzqQ$`{_D{71$w;XB;I_XKOq_;h z4>K5u_y9p6(5Z0$gn&bk;V1JA)li(oph3-WBW4ZYM1)tNu@RYN2$aXea}~;fl8Dho z^k0lMpwCO@=Hzf--q>_(fBW_=Mbjbp(Slh#UC+<+JKk2%RWX;W7k7)aj*k0w6Cx1gmfD@;P z%QJ*+-IA%Z6+%dpdU0@&06!{mcC4M=e|4JbdY%4FoWuILSoEyBN57&#ZX+$nS`xox z3%B;SFG7$Wo}L4bdvMJM*dX@km2JfLps_YD8-Jc&U2;b1&_DyKHrv0z&iVfY*wqXJ zCNaLz@tQrpop*g}fu2ynd4#c)83s9#swDalS}2*tA5LbFhs$3dkEDe`^h|~TTHw1P zcD;@-3M$0b*2o%k&R7w1N`zN2%x}Uk0(0#53cbeVh)U(`pNYiegCEJ*OvDbn>hNnt zpi4Ps5q|iv=&;QxP@IpbrjN5{<=1$(*qxsyE~D(7LcGHa!0q?tKbe4`hp)8=Xgm?4 ztFE3||5Anng4R@x@j|c{j8t87W7w%go)bb`%OcMp=KVF{uszal8hu+Vk&U6Ip^21n z139(nD*ER|Lm{h*IMA_wCk=6r1qVz8+LY+q_@JQKF<+@B6zFjU1(I&A@xec?5nU>r zzJyZw*&rFDhzegxPJ)gOacd_OC1TZyK@G*n?zEUnpf2S}BAZ>+R()Ndp9&@wA)@Q|3)M3SxqVjy}5`_ zm_gp4{XD{>zygtLr1H?~qz-KgVaCa$<)87*?{O94%xoLt&iUn?a3EFukeQI4Z>AuFXgueges zeWV7m>jRwF9T=!Ec&in>k4SGcRq%0hfA}(bDvO4i`qNx2HoGq+T#ncoQA#SZUd+Ko zU|!;W9H_}iOAc1>JzOfFJ#99O3aU4=qpj*=W$Fld_J4{XuAsE_a^077HUmDz&5*X| zUzZOfR6Lywww7n1Dr*ZR?IYI#622{@LXlhW;bU?8L-j}eLDB&%vqUDb@DBZ-7)g)3 z`yf0YGwF;?1i%qe#VuoD9i6|<&HXt9)tv%G z$!r;F&!(=FEW(Y1Nv^v>W0TDF27b%1xLNmADH@oZP$T`_AI37R(Idww+}3m7@SW z3;x9YIUHP)MZL0S*N~Eq-IJf29;w&AvHG?Nao?MeKY;9~`)hZw%X(=WmYrnSOq_Z6 zy`24@F2P5=hKYb?b6estUlxy1vT`uH=W!*a#x!+i-hKP-hWoBg*9;@^g$Vj)cnZYL z9AGmQFhbHLSU;FYxUldfdmeE>T9(5kM6d&tlCHTJv!+IxwN8db+D+(Z zSC`g5%Z(6AV9@VuhM}Mg>5#YpfoX%QFT~9Q?AgQ6j02=!pqb17t+F{>v+jaumJK#*T!7oAe%&VYWK~X2B<8FX@#%0gpSus#&M~ zFjaa72low{0Cxh&grdQ6K}v%3d;*B35z`Dsr|tABF-V{oNuMMD?ceVs3Fd&o7P+?N z?^HK?enENZUxbUHWjgLGI(4?~S@L%eK6i*H@i| zp01aqKX1_2!-!K`zNtTP8o(7AD1ki;%l$YAQIL4J@o`!x-`D~L`!Cbyei8kitPle= z{cPf)bXyynm&kgtr|= zbA}hA54T$g=6vj0%c+@Fs0}aOCJY?ugR;|3VOewDmJr5xNza?B=f``>IYGwS^)~P` zUg1g{M7gWBSyIp`D^S@ag(C(>aRxC|4Z=4@vvNO@eqlB`AmPeBc|+Oj)^&n_I$O*7CwzlkP^~F8rIM|6~5Ls8@&q`iimhoF< z!41=D6zDIoAUxMK+I>41$sWBf%Uc#D&2Hz?7L3ATK9JexE;&pD*l*(j%z(N`M;4ZK zn7MvJ>U$(avZUITf9?UHRDQ5phx;x>wt(c`1)7$lZP4nm;sdXorWlaVk_kLs!5$8_aI^Ak6tz<9m^E=t z5ptMcU!U6E4Gn!*x7zmXXY|o8Wv6c*Vjk3vi97?|tEQ%wKvj@o9e&6br%DmG*{inJ z>u+3sR@4e7O|)!jkge8F6)!7bQYQ1euaA+LnP1Wl2gGA_Q3R$oZ?TcGc8SGTwfab3 zeN7_>iChR(5tlo0%cQCXh8D>4Q)f?4{d1DmxlTre#7E0u2Tumq0&b+GrSXQ3&Ll61 zcpu)szxLA5_w1S0Z|#!18hwlpXyn=R&!OKD5}lJAL=43KC;+|rF5Vyr6y(bf;Ksef z!?E_Z>8n+;h?pv9Ywv`a#+8nG&z)1LK%$xO)@ub#Vni!XYiJNeTErKG%5Wjvi?irW z7CaA@!7}BWjR>9Twodrg1iQ6W=8zs~@z)(r z24-d|OjD%0e&OVM%#AV;t_8>wbQAr&0KC;^MB;PXOQoMX5P1m_?Bo#U36iJxCRVUq zo6X_H#l;rdBTz!<&V}bdxf1Rc7P5tof9Q>Cs{5ax`|EE)(yRofvWObPHOnQx3tfk0 z#ho@8BJ&YZcT)SE`=$AaJdEo|9e+hOIiG^XPU0dg*soO`)E$e>4w%aMf9N!nwONpy zVFjrs-IwfJfvEc1*}l7^0tm0l#jnl|^8gopC<_PP>I9B9T~QdMl`70@*wrHTRC!zL z{BTqB_{6uCU9DTQ^^Y>|7PARL0)}wx&z(fz=M`Ey5$^NRVAzZsKVm8hkAkQb!kA9v zHE^9D!pjx7PbT#5VrAX`$cVzC-jAM!h7ibSTEx&^wV$6m@$BL}SCnDq(COAPrFj=T zu@;!COl#6s7_Ncw<`jyJ5?}IO#+%a}^kz6ZW-oz6Zu#m=hy_Cq0^F3eEZ6ZgwSFPs z+wBFuR7}1^W}Vb}`uo?fU%wT(TI<(9U#aDq5P`jnmRse5gc}l_QGtPh9vhGWz`^#v zZ)0F&67H00%1pV;CVDOPJW0(}m|D_Xy0EB7t$xd6mkS&+E3NZd&-3!sPcbw|v2Jra6ElHX@Zg!GJy>8*2i z((@;KI}jX#yRFufRc3=Oe>0BNDRN%@HYf}?(Q!IpU(Tc>5;eC8cc8E znxp|PZoxf7#^3{P+T+#A!(gpYgU&4S48#|IB>H zw!&JuCwsmBwT;fmQrX9yX12BmRlPkulR(-HzJLK|^BV1>TxHN&|KfL0wLSrV5zb!t z^AO|-B2W0KxXnQR$dmg(9-#MEMki3hudb$payxDd(M3$@}y*FPWc|?M8-b+sHo@(%sq<%x~*G}qbdkL z8+IuzDwv?P44j;~t|LrF(!NfX;E)J%IJmi011A%zw_&Ll6XxhcQRzm_37B-13Z(%+#JTsw*+qGfk*g@6JA!%d_x;1q{| zmhQoc`*ObPcH@hS; zd~sXslPBu)f*TB$lYeU^FI?7=^?wo@+po86!uLmt`bDTlyn(JQMyYkiawJ5qPwFl~ zfdbPoTLzfU`*BjaN~KMsP$=_&Rjnn2Z6X7OohKxGuleyUT+U0@_V$zPt~+%z$!)Nr znBMzYuOG!*V56rv;;FfZmqls_)lPUH3DGk3l{s6i*NfE$4rdq{Qo0ftt!ak-5IBf4w;@U)}sK8($ zsr1nzl5}G+>V)>sZF!cD7$5Rozs|+|IqcKrha{>F4P|PT8@!t-snwNuOIs$x>t|bL zT-;sVPkTxI^)q5-F4gXGZ9gyCJW`mI@S{1HKS9sS@2hD0O|RYRO~QWDAx#vbVSP4J z?j5A>hkYlkO%BMGjb>&kF)OpXeK+m2dHwZqM4Q)+aVlQb+>#l!->xW>fu=r~NOE&N z5OH#R5NFE6|Ou#FK|Pk&;!`?105SMeT=H+!zH4**Tj$@W;r8123r zQ)}UZM~d=uFJ0*NmRcT8IbJ@woawvJ^=?*r9=V`aVgDudwLk9V2TD0tb6zE3u;>kM zYVndcPn(sy;(h*DjqTfxebW@F5Xk6dhn(F$^y`>xR_F{kQD(r_*_p1sb#!#JiFng% zQe&P8XQW1WIs6uXEzdra=UCpvbj)XQ>apBTYu75K3&VI$8e4l99i8qyzWIG&(gwLU zzEc%c$9?%oq_MO4!cA=UC5~6`Mtl|fp#jNR-@Y)-H07RH6R!CYglUA{e=%@<>W;is zMF52tuAZqUCUw~rPl(A*f81eE;E*IY|0-N`e_t|W34QtX>2Mu^oo~Qmx;%&CrTXt5 zSDU2T2$D0cy!kywieyD$vOW_d3h42|xYPhQXRL&}ad@&D`+^3cM(`&qFET?FF zdF;jHy*;v0vp#^Goz4xiS=FoUFzD6D&M{wX%#l-SFPq#Z^GI;?mE$b(Jt=%5!&eIm)G|X(3w#^ z#}zt1AQ*FYwuc4jX1C-LpC0G7~qWA9KW*kZf2V!knQt8Lq)$2E74r6#_5Yq=EHby%cIpmDSGp91rdY1r3^ z9(SY5*CyKn}P0z|L{5QSoZ8J`Wv9m76l_GM#Bmd%X z*_YfOic}mA)#OYTE18x0{1B<-GZ2n#xctO69b-7^9;b~bW6Afl$nuS~$s~EwyGm_) zQd=VW(rCceL)4}7U+w*lw%W!y*Gs0=e!dg=_b!5c#S@9be6j!uCbH)#y;xq z-(|=SuT6*hYrHXb=!|m0Uzw)s#a`O{N#?&<|>>8&{a|lP;~D zT0SAQnC67e@yghPEdU|Jk?hy?g0c+*PF~Yh%xrgASIMnPebq3S8wV7L^zl0D6i}OU25<*1{^ELb>;KRlQ(yq(aV*TnQc5i<6C}V-FrCcj-cI2 z<8k_fvgqD1S?kkB&>NtZ9FpDmUL)7$kodw5%6BUom#&xJmxJ$a{`vXZ6TV^orHX*2 zzvT2XFUNa#4j0X9<;=w# z9=$g@(c{z`gbR8D+&QGS#qZ35>m5!%k73V9eaCM94c{&diL{{~cEv9ssYZF(S;^7dRM?@M zdtYNlapzj)g)>v1n^K*R!%UF=<`k*4amfRPw&TT)(oRuTW1@hsE*~SkU)K7=@n8+xtA)%Tx9ID-BxzO70%;G#;*hoRw3ghWd9F=vL*A*Y=MjHLCnX zv-LtgMt$QRi%4?E{ukmsi`C*v{v~mu*1A;en5;m3tO6p8Qgi7~kG^|e{ShjHInK3wF~gJo z8{VzoF@=R?%U1prXPr^?8cB<(_ByV;ef8Wrl*Un*Fxf@*IR4CDel;BK+J@`+Ov@Jj zIL(E4CeroJ*5wmka0rrwPh3gxOh-`0uvyHvryW=E$J7hX;6nYW*^bs1#o{O4^$yv{ z1|+8VuZ@|Eg*i8=@<*ln`r@}KxqJS3RkP>dH+piCW;dE<7Ah<{L4IZYwbWp&d3K_y zc(+Fi@k&j90UJ*KU!=WvSX0}+Hp)`5fSoGU0wTTlDk>PNQluwBKsrd1(1HzJbPZh~ zfHaXNEp%8QBvK^wNLNDdp@a^11lQjCtaHD6p6@(&{t*JqlsV=YzxuxK0JAV>%A@I$ z`iywWw$|)62uYp@j)QtjGFZ=P1Fm)JnZmOrDf>>%TcFk33$Dw|7uj!OI}6Yj+Jt7!ZmPrXwW)JTLTY@&UvRe{Y5at0JY=w?b_R! zG6Iyp_SsV9^qYQXY9=Vca+U;&DiUtvO+Uh1^++CjZz7bJpcAds{OAT^o7a3Cu9 z5!4g2z#MwLWE>shtz!PPfm#k&Z@T71#+q?!klD)3=asDQF9*Unx4!`>v$*F>=Y!)v zIrnGO2c64DBq;GqF$fPsfXkLqB>#X1dmN+cU3zsvsS`vrOo9MY4-7{>+Ha|{a~C)b z82ErHU|u%{9Sdi{cy)oogQRT-Tfihg?EQR|4Z#fKl;Bc1$>`@gvubi=HkQOhUJG8VcA9=>V@u&(c9nW@AT(2ZIWwB7c zU-t+Zzyo||6_+?pJU{mpWTb`a96omsck2)Nb7Z)GfI!JMU?UgS2MAk*V4chrR4;Xd z$!czD3o1mEH`H4WwGQh7z~(|wQMUm7jp*gw53(LvE)~B{Qmo$~u!JUF(c2+K(|DLvPARCM_Kry zfz(~y9D6vG>5z2&wsyeb?%0J|G6=$Ez=!sM`SIXcEr6}hfY0d#lYPjsva%o-#G&?} zz=jGbDK-ZyaEDx8Z{am^G)7}Rtv;Y8RR$uv-Wm>HN*}|>!YhUnQf${1f|%C`e6vD| zg8}YZpW;=(BFvjFs21`p9|0em2ef-K(0K6OK7&tbh64SM%{%zEWN{|*r!PQ6@uiqV z5h7R)SoX_YTLu1z0H(kv%DAQMFW8oKsj(h~ph(KH1#{iZUXkGC)uSzKA1a9X<$yqmW>7sq)tG^$8%Z7u3=HQ~hMGAQ1p!Z*1#G@{;C35~i@|jo39Q^Y zSdMYwyJdv|+R8g!k?Ua0@1u}AJt=b_VkBHrXG;UO_5jQj7laAsUj)Jd39F_*JL5XRq?Pg9-8ZpK&IEZ#8!9BI8DM!} zl(?f5=u_DNGK(qTBXR_+KO8~sE`oUGfE&N&Nm-@^uc@hev&!r518S&2{lP||B%sn$ zk{sXZWN*)5H{9$(3BDh^C>w*3at1YG)9PpWj|$9lKm!n}tGE%32Cm`9de3|hKn{3z z$WG6Ic%=*2t~?9x5kpFZUDnKK_H8pj5a_Cbx&Ppka`b#v)y&6GmQgaucRo?3Ro${9 zD87N{w1z?^0%1c?$Y&#Sw|jjYyc9bC?{Ra*3_K~~prd-;UkU>6c`)G#FHUh7@90!e z&~o|>=QtFM3FaQ*z$*;#Z!c63BX&lXk=aKcb2(90X*&b$zY3XO1$Y0|QU&q6GYH^< z$6ZUlOI3ku^#a+GBS=h88QExG4RBQk177FBcLkT=#UQM8taE1+c#-bqWCsO&59s`O z&gEn~)bEq$C)-0Jln)RAo7H22Bk#R{gR!IJWgr9PD<}u`ktz(qZs<5YB~GK{clO!I zSx!>9s%fcX2juddMb|(_3nX}-o^#T}4?eVTYcJPsqM=|pd~S=8w?z$?`GQ;I0BqS# z%HcLAF!+=li*lI-l~XM9tBEdEIY=gsO?2M77E0PFLGVyLu}WQx{tV_+(YdY-!5Nm)6dv)eDhFyKY7$fisn z3kwTWi2h5pY~_96$r*f+z>hkB(C>r1cR|K@-6%K@Ov-925>TI{+{l_=_QO=E^=6!~ z46sbSDK-?gH_o)tNGp7wk4#;)+R9ItKenJeqX^cEFOZ$#7g~EC>3^4C;hp>pgvW2n zH!JZ01)h^nyx*^b_1kUcTFYSS0y^-9a8M?O0A?YrwzvMW;Ww)1b@rgdLdjN_fJ7_AbiK}*?GHarIA+Bec~JRyDd<`*_nqOQf$F2NU1Q;r=`FL&VzWf6IfK( z8oiVJVrdkZ#wNfZjP)2ASs0SXL_y9e2fn0N??ZLT+H$(*8+hh)pvYRJa=$qYSh576|P^81v?q4@g*VBTebTA9f- zJ~qV21MP;rA~zYW&&BID>;i5q`)5uhp3NA?M*=d13=O(odA*?}kki5?d5Ct7_QR8;vqDfoGO;6VvMYso+`~Zz2t>3C<{bbaXs3xs z>5_v;eVt8-2_01J-I3aSJ^UrC?UiJP|6 zN9UN{-eH&g13X=?qL>K-Hi`|ZTa77{kiBtqLMI=ntAJ;MQ()Qa0es%+_ zMgKlSj)6y{Vo* zYNk&IKUHW4KUolS@3>Cp!oraLZ7po{3u-Z zGKUxtCk;n09;fQOrPTK2m)Lh5s#>8%_%O}1R+VhlmG4GE&30w;-qejt{C)to-aDZM z*3M_ZKFk>!1Jk{?uP^h(K}751>bd4s1tXraWqwGN)#$zYJIJ8GiZGKv#=4~L(*sPE~Fbj zHLC9sL9JXQc3mX%UswQ>kT?nDvCFQlVZs_HIoq*E9yPbXi@mUx zs0^PH*M~x8LkDW*^ET$%iEbHO_Dc(iF>O0~jvfwmuPq+fVmdY?adT7O3^8l=hM-LOY_x>LzEsRZUn& zlH?tfX9IPdsSu(xf8Hpp99RK~Mb#)uwQPjbfIlz`%srDjd+!yJ$(y@8Y@ z+XKW9lDi67f0Wqws) z@KzW3XeP5}9T;6d_4l-f^YVJqFCgZ}Dhg6yY%w9VI_~)grX|s-HKq(FH8M;qb19 z-Dj4PV^9}Cixz~)9XWC$B7dk!uGu2 zkK3BDcZ*L)__>|k_bTF#G?bkjkAYdJJG(F%+nHI$2ae(t3Akq!QRW&h(fuOHoK?%9?&E;T{Y=Z#CA$7-uX z_SabEa~S4anS^ry|5fTxxK%0Y^2?c$jPN~SDm z{N?%e|Jd)#=_)_2%gM{%X}%5d1<^+Dz(5Yg1@ZDzsfhbPiTSTziK3c3ob;KbE*5^! zAYE>MW1_E+=rHt&zVLz3o3hCnp7@(0Y=x`zMRh4$Xvcsg15XybI|;Rr$hCcDoI8>M z*3ni`go6-c=)z}rTMi&DiQV0sw5`^@{TGxXgqR|k#v(IF1~-`!tw}&>gOsNqR>1no zK{Q}KIyGk8<`_wgG$=#@^uR|0UjV2Eg8FUM9mD}4W`A#5IkFZB+CTHQ z%DX0XV#l5zVyY5Ykwhbj`($UM;L`AkbROznV)zR zI%aoqx(<#pSHAjk4`8)EXAvkqGqt$%Fckz5&sTR2>T7S!ND3_;GoR}gip@lZ?be;M zPvO0Y`4gn|GjCX3PHT(HJ!xkKw=Xjo_~s9@Oze?-aV5I5anaMVa?j?Lf{6)yhs>J_ z>Nu?tLzE{RN0)|x9KK)R=M3>)@5P%E2k4#j*JE0icOr1Aq`V&LLCmkWYw`sFNnEpo zKD2Wowr!*>vLY9R(C(Kp1vft(2J^0=8`_NJvQS)Z&{I(UMgNI}D9~{jGeUVb*X#nL zJ0>h$Yi;0Zu7K~Gu&3jOC@1>q@97bMPH0p>`DEzxPAyypSz%?1hkv~QX4}*YQiXA#h4emdd#wT z4|thJAqjUgMNeZjYwG!6=53R!tv0Ou7h^+>TwioD7QR20CWu~Sd#cs=($RXvTq;H5 z_GccbkXu8Z`3sR-W_N1bsT+5QhUInRlJ3YMD>Yo@C;t~BY{ElNUwrxVcObQH3sx#c zFh>G3lPGb}-^X)CHHuSYoNRU65|We_v9YTqz0`kul6a*ut@@NQ=Z+uN1+}Wp5(ltOEj0HIl?v35&Mf@^c^k zE{)n6byUGo>-TZQ$~4@_GSlTzC`)oyzo^TMV#!hg4?)Z(dw=h2WXTytD3kDPeqk(Y zf!0it5VmCU$pzc$p3XDc>JDM>6lD3EB)IG#R%o{L6LQOT@LRs+POtm2Od2j+D)~YI z%zt=hq)KPP%|9@F62VoQ`=w5yN_u$z@Gn1X@aDN9_AwfshjXq1p4L84X#2qv!qke7Ue)FhZa>o`jeReQ+z#TeF$}TQ`dP$}qb~{N+uHu& zh&gT-#_H?jAp6Z`H9VOR3pe|c$EvXD>+Wv-68A$UZR^FkW}*pFX^}Kew54Oqy^2G; zz+JgAdAXUtI{ByU4Woz_E>pY|5@oept@H!ii%y~24y=4GIaP}$Q`|FEr#I>Q|K~vb zH}`%D*m^-eNThP&)0DpkeNUPasr~k-Ipp0{1%KoPbO{VUP@qFB@FAR|^6M>bD4vU2 zFOSMlO^r)DGx_1nq(Q-$i9sn-g4xY1K`1Jnp}puwRG~keqS;t=2x;QcY>nOQKGVPj zIyo$Q&x_=YD}R@I2or%8O@9o?W^>F#)ZJXK@UIbTY1)b%eTzvsXND|W=Pnh97g6?u zHL5lFvPkhm8%dhUZsUrFPDFEk@W;Zn3Au-qtDIThdh(x{uzrL79~lX+8pszO77(_) zlp2^fu(aBua|z#wKcmrSrrw54GKCYq>LQZ$pXO`qCVDQv>M?&5GR>0|A|_XBT^Wx- zK(d%Oj)~Ur;VOO_;VWYi{mysE>E;S!?sZ%PFv6kQRU*@r0^DNSr5Y9cP5T zP*#9No1@sO*b7907)e0ibTG?pWB;XC0=n>Y18xHVp%&netw>I7108i5a8pU#>mFI} zL-hQ2QA$R6izb@(a_` z?^sy4n(SJU0ck-BNYfZ&>QbnvxZ(%^HJ!Lx8-^F_%_fIFAxN9S7O=;%6kk}5rCb!< z5Y;kk#WjiA!~Ek9M0JMCm9F8e^+L;}c0W7=tuyvXx@$_FCVRgL+oWaC_tje=G>?~oMi9VsZ zSdYEd!%yyt>36aMuDjK!#z1cIFBS&S;2lV8>;rWrT(em|y8J1cQqGUd;$bd_1_DnS zxROyK{?hM&bMu|E<%UQxIxoc=!7?=O$ySU!L zx)HyKQ;2TJ;;sg&Al|#)UDl6nR|4f$U@P(Oq)Q4N?yTj{FyT{BCmMj_n$7lbF_Nm5 zdRXJG|I+u0(W3bIIIi!B0ineMPQ?hDtZ(aL|IDUBjC91uO*X$*)%$=XNJoDE1f-jr z2@e`#7~=bt>E`7GE6=GbQQQ#F4oWEQVxvO`j)0WPHc2*!zYm;ERYLu@Q}8 zIt^;;wAkDd6=NyrING5$5w_yg{G#K@5L^8M6=nlXt$A6Nfk@zUDiF~b!j?jb zjup&(z+uv#epPIqd1)a-MGnE1G@u)&eWPkwaAa!9ARn#y^1fiU>wIwCJjKadl3a>@K?wK*C8 z-)BI-P2^xyAx8t+-yX+Lp2%e@bik{`MA}{^AJS}|&CAcWY%o|NF}35rz1|5M*kQV9 zsk*xmi?N+gw`HxLL7wn%_gB{mx`WCPh9Nk`Gi#g_mr!hJ50Kb%>sz6lUHn@l=ez1x zWo|~8%i45AXQ<=OO#CvC`P^T`-y<$EOz4a$3QJx@nE5f7`ks`FJ@5?5P#(mf-)xYp zX`Alzqw@*yL8oo*5tq#>!_7)8tjvDHO<+I7WhI_cXQDFY96S53WlgEp{G&eeyP);^ zZtqhgfv)4*`H1QlcfHxo{{2}--<`ZPshEQ-V`&!K0XatF?g7mg2Y6_|WY9*gSF zhg%Q|k&E%H##j=&?0h$W(}hCwHc?P4i%)^^Dz{0|52&rgGe){q@{bEl>vEM_ou%`2 znuvBkJNZPBQ!sHO)Q8%`Gt9=KEysGOK(s*XaRA)>N*l7`rVxl7_CE z+ea6L-6kPUV08;J%s+bZA66J0m=sT;p8CL(J`jXW;yg(4Tc|u`1IkD|M?U`_Dq14V zw}8)V-C#*!5n@gsMKtg-^Ea31ikkW)mQC4y7SMB47(>Cj_w$VeMZ>w3mEHG^6woCu zBfe`Md6D@&_NH6!?VV~>!hRCBCgNId{^d1{&a5JGCvTNKHtCK{S*-bj3tbDifuB$4 z?p_-%)(^n;3zm!+D=5#>qhr@_(nlT!o5d84{ltw{tXCC4yKZX`&9Y8mT)qn5ikdZ1 zjv`&wbvCtU?TQcA;D>s5G&2vV^&CxwQW4@VuSA>uHIVwCG5}FZQkITNCD(y$3o$cG ztg5P#0HEtECr+x)9nf;??c2ckE01~v)DYG7a8cXDcuXPg*_D_*J)N|1@bGyYh9x9iS=>38nR- z?vzFK`?yy7uydZMC)}}4I&WM1AWXw-hq~4?)1p(eI-V!ftTT#>a8^sy)FTkTzF0y_ zMHTHemJ}C*mz28neE!O!bU=k9v_bB+)dqqtJF*n3>C63*E}_w5Z{3+2QlGM>(YGJ2 z1LFC&z&mwYM@-un|Z61t3Cr+9bq_+^+~cW1m&`@rO6BJw&zIS{pH(Tr0*Ta zwf~X$aMGQcVb8i~VH<)gU%m#ZdNO*W83D;oEk#HA_@Hy61~+54EJT-)HYKDSj>Vpc z!SZ>Z0-Xe8D%WhWtT$hxrj%EPYw&dL)qhp|A_mI&;&KD`s@8HD@lqDq&S$H#nUj%o z4^bJiaR=RUNQjrNs2s5r&IKtEu7m=ZuQtXdXzl91OMG=Rd^RQFBbjj^rwW(~%)VG? z8`AZHRWkZA^uM4~&`1i~K3$^#@k#%n6iZAUc61Y_3A;5f% z1)mQe^)6RcFcZQMT5FeE&Wt$Q)~5?`2*(Gp`hr+CFrH|HMwjwD#%~y=%|4jS+1rn7 zkxdg_7_cR{Tf}V$Ro|zHa6J~uzCE&Q&%U!I^$`Sa@R{iE%@BHF5p>;+?C5$M3;2*) z{24`|h%Cq1;CJii@6qk}@M23WENv5?Nyr5_dDrRuv~7FO4;S$MXrjZN-5wbVAMS)l zt#n8Y$v&P?8@I{xu4JJ*+<#nsqlmfZj3TBPc>2-HcxIknVk0w7u0sj~cL|RxeXAD4 zx%qf3?1L30r;NCSoSEjYmA`Xj(DUE^4|eLGmfiO#Wc#SNjdE{YpU9PlKFCC1ke+>2 zw@!L^eN;S7H^s8xGyP`#-X_NVu(8|s1vJPdwLWFKS}a!SO&K3k$z!J?XE@7J3FD}( z`axCgvps7<@IAkiC)a4Pn%(uC7=x9XTVywZXbZx#Z?y04OuQMlnaI65*6k>0a;ovE zs{*dVJ?vpaKWArgLB4RB7i%msrE4E$fSI-^#ihV!FWA9{F8I}iaWj}HEAKvgRmjB2 z!^)(eb-^Jo+w!>L@wN?aCxcJ6sl#sq%in;+;Iiw8PhS+QOb0`m4bRUp{4Wc7_xx7- zn9}_t>o+dpi#E3S*Xo#N3yi+xnSF7>pIcv|*@T_lBo&iuN{Tn13Xtn^1NJ`3Cf<+W z+c+DB!)$KRZaUMF&kB&Y*4hdjiE0Hrv>OTQwP=puesh6~$7US6`nCy|D07XMb%q&XTV;ve>E*WB5{#CMM&`od8A)QbRK(rH730D2?O601NMTxDXPs~?{8hlY`QYi@*n^OH z?C0sjqv1FR`Cm>9wnEae;SpY`sLe>H+Ij#+>^feHDA1b#&ptZu!?oiJ%W!( zxTUK05%%yOn6{6w+o0;<6w)4AUv3pL6$D>jrA!KQLg)!l8Z70TU z#w%ki%2fmJ3HNx6TZ@GUPPX=5D(v=or>1jsdHqlP$5shp=K!ZAk;@)v`D%BZpz8em zVc!H~Pgw;8LfXl4WxPYDs7mW1BF@(il<83`SC5vDfPVs=1WgU-JK)BIC{S$aNxv&8 zHv9L8OgZE_PoC2fv9y1%M!6&H58D&!lp8+6cgW2FHz)V>l(8r_o)I<+agj1(&b<9> z08uvRIl%rX#|u%u5;*=k`H6w%sl7%$v*B{P$mB-hOZ03w9Oc_3x_JHRm=e~WX)VK( z))zIAA@k<@f#Ch4;!WA(4jKmc7@lKa0!&Wpq|YGKh*=l3*$O{^(%q_u)>D=T_M7%* znkbIs-71<&^yGrR+@n#bNWJ>?Z3cn)o5AIq>w$4xiZZr7x6sZzw};{)XWS!fOY!TG z5Y&OlwBaOwm?bL70&iD8`;{BA*}kpb46@mHM9uLmXJJScdueF2hW$nZ;(VIMnBdR=b;Ew*WJ(Kn#D%&M7J@tPFM~Y zDZ7qDfTZnTb;aoKlU8l?)-dV1k@4uv!;!j3uG!B%7wnvckm|oM z#Q#@w48JdTuu-V*8g2cLwTyTAx1fsDjT+m1`d9NL(Qu#d81r)f4x_q8=Kc>JWt1V? zAr81h0t3I^d-roT3eOOYAEfo|--e}wuFAp z;nu}B#K}bx_<1v!l9k1CK~Y7*Q%q3lxbOUlsaucQg$JY@f)fOwLJw;+s8u1qs~D*% z>7efd(ys6ubhk!jytK4_o_N&4tRD2&nnqC4D#`cQ-L(=S*Or0Tqszbe{Bf2~%|@JL zPtgW9Y>a%{55=6hPGgAeJ=5%AZ9Nw2yzSl=1h2}BhtR5Z73Z=BXR}1+((yJ`A$1k- z-ZuTUuEJbc^}f5yR}Dna1UfvXKKAP+i&o&8*VA?)dNx51G8%IywD@gH;nbiB;-2Gr7oI z_-il3CkXeY%idp=G||-2nBm>>3mO;!KdXGdd*E|X zqH5G|mXKd39(jj96iLcOEWBJcieN%4Xo$r!|60V2d8;mODv-l~5KGPDU|WG%>hepG zgf-RMq!{hK{Y8_p@P+(EK7k>`-8fpDr*;3N4NMvh1G0h*2@;5~VI*-b>5?i;AXMQ{d6R)zmOU;y#G4_B>52kBKLyKJ56v?cJc9=+w0R>NCW z7>~+!DJ{FR;cU&szUvirH;k|oL|Vgxu3K*5H>AG=7PJ`S@K~vn7?OA_N%KRBfx`03 zNcO0Og)fsvLRmR$@ms}V0Wpg)(2@o9&8?VG#})CTM7Fc))Wal^<^@P)49J(j$&n(Dz*g{e@1TM}6z^QD#6nD^+Oy>cZE5 zwtEIL!YKbbIy&bnUVXV>1|&hW3y)Ln#M$Bh@->*dZS0cx5ShAZo{4@TmGy?>b=RI# zTJFXRkL&ZN;6HqY27S5Sj4k`HE!x|`7fDP$ygzh1f@|oQ0tE+iJI^%F&t9>M%E(S* z4I%W9@cpj9MOuWC`8@YkBJYj~ZUhJz?__h&ax*Wm&Ti?D_Datt3&Q-yzWsz}SA^M| zjv{jla0H>f&3#(=nP9E49tm}?bwG+r@D&=OZ(T3f?Cue<=e*53Op9k$j|dK=CIy(e zVi9>SUFVl=XE{yN4x#=g((6CC7Itmh@REniD0B`C5pm$)yuxOyf{gHzBkUy%r9KB)bEt=N>!$)=g$a8^Q|H1AgUhe`faV?B#BN_?zQMxncrLUTt4&9@ox+$dV~M$fS^HfQ zU)YPdGfiz=W7AtwH0N+fm*m78wrsSkKlmmS^wUs)u>J9-?J+7l!|w+I_KuGFpw|Wf zV}lf!hA9j{_5hZToq@)Yu!|WQ94tkUtXlN^PyQp{%*-UwUN_U+8L4w(QUVF*$ZoLK zpz@8IBcc+uZd!UBqKj)c7x%iAusSS8U)3v~JpN2dqQ^bmczc+wyCCt|J`g@}9+cdE zAEw%z!sLwPlqmNU!vh4d)1V$FAeo+j-N$K&qi2!2uIrmrGR|@G07a4$T#7O zCamew5Fsf^I}aUed}UrB@SdKA;nT*&=j$$i$6shF(P^;rk(6QhNQ4~9dTBzsVNQvEy13yef!2?A}hXivm04>zQ|-LJr@#*V{gvw?oXHzESPV-M*LXAgWR6sH{> z>dZ>5ER6T?G~bnhbeF9$-ip)26f8QJz^HxD>BOPUcU%_Y%a`)YCT#WXHTn)+#?(=s zeKn`@wi%)$achh4Igq}hF)3WL(;mzEQkF<+<+|`CM7Eal1C-}Ll6*{?i8}=0zwb2! z5Hb2s{2!uvWp^>g?Gq&5tP{>DZWUbiqYGxyZG)ScgHkU^!=8t~BPu&liZir(7?i{! zbn#@{@h(I--$aILS^bdNZu`dRscH4ql1;mkn*RGp%m&M!wrz!a1BJ{U7Jmp?zW8De ztF%(6r^na1ERt3Sur#l4Z>uQyAvZwE`2;n^8gBH3iOmssXAgl}aoUV{=+(3JvHW!4 zXIQ9^kq}xN_0dQaT2o=DNomHVd@{ZnBMe-)k>iUmrtMACHf_uk#XdH445#@cd4J<2 zX2yOO#m(Up5;Alz&g8*dYaYM2$&43QL#8&`@<$9*ytvwch!9Pi1}Vbwsd82UnI~cC zs<-N`&z$5jce_qB5>y^)V(n5sd~5YNJNq^KjEmstJ!tVrru~phC1Tt7eOghJ&>#>l z7+$luQeXYkuBB7X3$i6CeGwm%J^tL7Z!9*=BhR~cW1$|Q3x(%3#ZcW^X1P%l!FvSwO;lf(!lRRvpOUJ4J;EH<~fwstNjnGC|>@ zI^%WM3MThRhQ`uQ=2t}uHZ~u3kHJQ&ZI6kDyAkVIFJnD4NR7J-#x%nZ`;!DoN2_X| z?*Cu05qG~Xr&ue7`R13pxU0~FTm6Uv(KJK>_p^q``l%DKxsvrSooLscxR~*xzoxOp z1>A}Z{zJK>qGwtP(1v2DCofMaUMO|sg4|uSlZVhpIBYt%c4ai z^U4%uIs{GvQHy?07drhxYlOshLl%2R!FzJB!$vtp0L{hoF69P6jx1GR?p2@Xh7Y&!SNW1=`H`XG??eUT^|kVF7H1GH}n&QT;D6*tqR)Vvgqv42ZQ*YL~vIYd3+w?sWqAVBl1?sKW^P?bxJW{j%hIrf%9Obw`8 zH*y5`Z6Yng9$cv7K#TURm2C9ot_(Blw+@gar5ts?esj6&A&^HKzeaSdE;4f3zA_IV z>MCD$-R5}R(BXhN2!Z%40apj(5R(JR?)&$6mh)49OOQl?-zf8z*dBm1?6?)B^jTMkkXGI1(547 z!G~A<0c0280r?=^^?*VmEz6o=4-hddKy@GOGnLToC_Vi26{CQ$%KLx!NmH5D2kFxR zzz|Rf%g!tBJzXqr6WYNa?&-4ouQYrRw~YWy02BQ@AiirxVYmTATQ#(m@q0HArRF=Y z(G6m%bg--KfARfaEazk7PXiSjdv>>`OFNV=(m4+(3V9T`C1BO~$N}~!KvB0hiSB>) z_>{&V2sEO9V?>TtQfGWG?P~x+=^yo5&IibSUA>wwwqad>gGynP%q~?g_XC1BEs&y? zSd{3z4&V!hJ@Zu0ITFHD71ym4b6QW6GsRv6o;l0>pcV&{EW?(NKk>)GZVrog_7}DH z6WT6FOw(Z3t`nzsFlg_X@=Fm+9eB)qCExZv`1utr-cyc3v4oEV1KAgd0z!{R`)E$V z4zSN+X!s(naclD`Q~`=l^+6lWZQGg=28jiYTRzaD+%(rkTf1PJ?Q3FCU4Ohs8`eXx za8gT%{6SdV<#CEQI)4p*PH{Wz24P+{79ICKaiyy2WoBe*#)FzaMpY_f%@_7*o!ewr z!RK6&J+mvxxVI(6B~0&{D*Yg0oahv8+cFTx88Yn?LDTP&L?#+UFv%2L5vH4un&Al} zD4yJCS`#D^0-!!N9Y?5Fl|#$DXxHz%?Y2tk6utwN`3ii!Ss3{cXnw?qs^JwklUx8@ zv=*>z#IS(50eFL|fRhMoe%T4c4|==0uEE1reTFz`P(XVoic8S6^i!J5{kQq2D6(xc zw@;_)I{u$a=j;86%Y3k{r?sxtzoA5r@`H@1s2W#U6n#DQ>*7czv9Yg>DqTdnm&rC- zRnw^tZ}uVhJRVxrvi0!OUC$C236RQ(w5-C(Ebf_$i}WFmA`<(EUw((a(3BtcZ$YWsCo~(y?q9V`<-Gyi=v-`t_ORhfFb~(o;rXH0;&K7OX5j;yhP7! zTA)7l4$S^kpnwSG++DEuLpK1{-#Y(yA`qYf6Q0S0J=ctKKADWoticD=r4q_RT~XG0 zbx;3t>t_EtGg_uk zPp?zDiz=O1eOlw&%Oc!DpMHbK^%}%?+t&5%c%OW;oi|e0oUsEhm92{6RU68pC+-Wn zE3hP3tt)5w8=%&YVzw%e93_iv`jpWTQig81D>uZ~e`!7wROH`%(H5=REzReK1b$en zqQNB3b#VWpPxsRj){#E!Od@EIz&m+1t}k*?Itd}Gvh`XV7+y7E+Z8YQd`#SWe(16Et z8d+En1wHi?sYqIwe-tQ)*qNYo7~H8<+Z-{+EFV{z5+?;oA=Dqxt8SuCieIK{mwt=G zGicx{`n{Z2K}jWc55)*cZ<}mwDw&!#f&tiW@S0;@=4}I^xX9+bG8JRpqms$$Z}_T8 zee4{ekwPE9G6yVqq-d7zQMiH>%>f`b@R7oaqELze1Nxmy-S!i}vElvROGWkLCnDlK zXNC6n9QdP`KP}!~fwwYF`|?A}sySXQ`$eekI`V5y3I=)zpU_ZiK)7%zkLk4+X`j7Z z^-AQO?oCvRVGJk?%VkYhZn$W+-TWvy(8*e7mqR{Q>_da#`?6>#(2e)^No(dE)UKb4y^O9wCRH(ly z{dBytj%Vn-K)H)T{mUd31)TT9kC2|&uP$S>>MlRojnJz_CA}SGT86~jt2`@J@ZrBq z=%58qm&Q8&7|gE|NIweRc*e(JK)Q`Ib5Of8|h}_7f@NaswF8up7Gn;yX7b*ML0oE4aU4Ql& z36~nY4_@Jy<828jm_P+g63A^!=>I#9rrIep@Vf{cX9q>ZV*@Z3<^kXCEl?pa>Dfup zQY#;J4~2Pc{4NKA8A zK0U?%#Wz$gXq01{>S`ZvH!m`El93i0N|q=smhcEAZ}Ag&x)|k_aYUc@=;saB+`@By zY<7_!dEQUHk7OWn&xB4zpLwk8<<#~Dwsu^7IB%ZCP8rprLw=0E3)-{+Cb|;yj84i_ zH;28uOb9%pt=hwJBEzswlTD3oFJfV{-^r;`sk)?%w?kq=Sr_NZZ6{iRb|Gy0t}K(E z%t<2mVdkW9bx2jwzV>Ln2uI-;7v;uU{wQ{B$G}cw{9o93uiCTWMd3%RlT>BM#`#7H zWq*AO5OVYR-vHv_uQ#loWJ`QVpCJ!?izDzi;-Hgan*tJHY(y0sPgjrvm?;2C4PHg~$MQ**Q)ng}!*8ldB=g z0Zh-!up8TkQ^!OMt>XVW1{pVmADbtz;?GUchfT|UmfO7DAU3O_v|^gMUzYEf_%*dt zMPc{S8DZwIS(X;q<}hYHA{ZCyB3<#PtroAk84}}6Z#8sw%2tE47JG%$>CVr)*sB%V z1Q{_bEDq-HOt_4Xbk5}Y$ght55JW#Fp7U5%sllIJws~=zX8M<~EX>-dM+e>1vd=cOlpi9YsUrwrjH5y$t zy;ka7XOoDsc8I^lU6B`@oL1(had-hM5xiP-u$Qlijkk;V1z#{mVGealBAF^5PS%8pn^qAi^_I2FOvQSIj__h=xn5+q8-_ToHo-?VfWMHmcj_Rum?kr_p6` z77Fz>tM0Wb7gX^@>OQovBkODXgJE5HLK zYU71%5q!s&?%v3V$|#>QiZn8eA{r)MHb9s51vewQ zNV(o2U6wF@teM;7(CTrLlYZDBv_$)ioWHy1lxEy!laqBw7kjE;|n(-7jVN4Zdx+nqWJm z&=}qu0eHge*&zrDsXWa9kOlk3#{J(DwxC;l-!{Omk-HUKv*b$QTPcnUk$aGF?qsV{ z`O&n+bb#4ND3@_lNL=Doi*y<@L3ys#*n$F5R(I(H%1Da*q41>wzU#k0En3`*X37YEn*Qj4^F4Yws6z87#4=S*7rP3zFs#Fi@a#;QAXo_5< zG!&jciqQ|Cu5f4K>0>QSdeW4e*>O48RxpD5R(uQ;&ueg1i>*+#qahyC%o1Xxep>?L zufu2ER_JKKW3MnDD_DC7NJkG8U_p7V;WomnhESoDwHSOX|3xgU&5T7{jj$Mz5bI8I z+GcPHzrp&at%b6C(;5hI2CSP~XR7pSW@7m1gmY%D7-}l`{n4R!ONY6q--C2$ySzkP zxYf2x3S8Pndq`=$DvkOoWLqY9*fQNKTJ^-%g^L)}@v7>u5C zu%11anMw|`Wx!u=9|^pS?VekG+)CSi;r@f;O(!p_qR$FgdgI%N>yS^rP+n0oPx-Zin?^_SJ3>#tZ`W*AQ@Er+B zveFN#2;w(Pu)e?Z%1G{!;P7LY_xJ8JsMeKsV@w6O)UuWI%0A_X;>5?3-z!)-N6EfD zotB)aXLhd0rt%A|{fS}Y`_rZV2Xk#P2%ff~Cj*f#R+`&_Xn`4stVXkZPGnHQzH+Y5 z@6E8H3tJRl2>yH1E6S6}5Ng%y|K<2-|N8&+2O!i6hUHV|G2X=!jx=%u@_9d%I3`NV z_*lXQH=TQVLX#M1NNE53r>kw3@uzFMyM0eGiml&Y4=Q0X5ga|PUypJ$nxzrW@0$5JDQq0^LIKgM~Kx6e?erl%k0-1~XT4TP6O`LXTuKoq8+cTuQj zMRgd6WKlg2JlqV#0e#}pSbuPzZ_2YT`|!L&e+^!*aqQg5ySn~~?&Ka072>SiPkz)v zCL!DjMHA-%%0bVYdtxrUKf35PQqQ%}nx`D=L!ZR)S`RPA3(cdxW{@XrEeX8{yL2&; z3+wvoK2k^b&bIG;#cS!i;iXLmpBZ6uw`XkU0KpuW>dyZ9P7J4Bl&_A%bTtbLqrEyS zPrl(&TK08V_Z>UV?&C)FR*!mee%_IOM;T^LL>Y$1M$+5O9-;7-?C zDtzeFN)+jJ%9TED=z>xhXD9xF6cKv!GAzih&JE`{3h&NF%_jV3Tn z`5#cRd6E{)iR?S?{fXZX?dov;?cK*c!Z%~vrY2aH+1t){fQUb-W}&PSD{nJk2_A{7 zH^BeVZH!Ya%foFE3B4q$pL?0E-z7(z4rVOkDwF?{&U^tmcKe`%Q9tMy7eObLqf@cMJaMsF!`H>MfuYXi1%`EAS zzZFB4nI3guD+m_JZH{lu@o%H%+ju|H&uq~yiGCDhPK<$1`D@?Y_N%n3au2r^IZxo4 z-W36R1Ah1H(Q)Rzhz2Sc6`;KK1#E~s4CG^sL4$nCxe(}fBGEGw%6RsbaCfV(&f?tUbWmCrw%6-i2u=~hGwS@vi zRp7Rhg?df=9DxO#Dc(P+r*=&=1R6Cag&)c@BV*?N|lr;I6{z+T6b?mtw4VbTAL1~fj#);yNPYU17 z_}r)QN5_7R`RFsvSf6a;6zBk;6uK7zzq7fmZtaHmuQ z<#U!{8OjN3#s3doZvj=++I6Dg6L_j=r^F8N!-{0%|ALH9&T&_b7d&}O>e%4%T%{8ZG$Cs(J38-pc?>GNw)rXMz z;nl80f327hsN)!u;FK=QErpa_MwLAG$eR*Xw;mGR$cj-eyoJv!eV*?*EVp6ACM7dm zL9U9C|L$Jb$_Ln>SwZdGDMVqf>~$GqN@>5G2u{q{ubEf*>_g3P_HHHC2zt6aX%YI) z=#T;pV<{YRA>-9{q_`w-O zVP>d_kUcEfe+1K4qk1LhQ6WX3LD<11o+n$jN6GDt)Q@l1Y#BSFu@hc+##W?iRpo5G zr1p{d_ax+5s)CF(zmelL&U!aN zHRX;xeSsFck>t!5q}*IU21UtNiK=slhjGS-rdgvW)y)KROBeFJmJeGvn++HP+`|dB z(6DT{OPSWzZ?FP(avVGNdGNhd?%3H~?BmD}CC8w;l2KEez5?MRy>l)1e=Vk~GHhO{Hx3 zk8SZkXP}h?PkIK*?{z0VrHAA~M9eyXfUF;QHI>R# zM39{hLvg5p$%*jS7@AVhmF4rK=^;ZsSwgm${U2(RFbs=Ne242sTz=SvwMKOG28(6n zC8VJiw%lzJy4lZubZU|oe%FM=P|;I;6gnSD|AOouN{PYo!_352wHES9c71AK@DNYM zn3UnzbOmP~hJm9caDQKsL&30GuF;Kja}T^i1al>hD^{vsHPocIhq4NhX2`-kyPT%I z<=0p}EK$XgI*Zk6YFM!zhUIIXw)7?WFC^P^9_$nNP0BbOl#5@JU=L&KcdojQ$W>!u z`ESv&SEzAD!Ga%GHWtC|tvRnED-%NB{)+DRFsDO?<)qq5eu28~%ZC%>M};|2Vt2o< z9^H8oD{_pcAl13&h9D@>Ws>$R=A)~WWM5EIy#Fr(vP-ie57@IRN=x!O&~eF2yx^2( z##lU-H7@h{_3I6!6-XotWeZA6^TL%6c0Yo@Lixsn;%xGVOM?5O6E&bmzCC+|t+32$ zy9l7U0FZ1C#QAp(PYT`ysn#vG|>9vK0INA&(&d= ztkJaXKEcrgv$?CM3|}7#4Kn?+IgGy5qFQ{cmp3n-$sRe9Jg-mwDUoB>NpNPE`o6NN z>s4WYT`ng_S52eSKpmdR{?+riiX=BAHemnBo5C~Nr@~NMvtNySy==q6W?d`$$3rgN}=`wuDRpD|QD|c+f)PD1{ zclf9+2!+)aQaD&!c7GYyJ@65aW0vXHgdys;?IEqh+E-Nc%tqZQxDu{pBRm}9_z?SJ zofJvDmfH|Nsk9F4evkDI81`xP}iUXl*KXEXc1Pgw`A2yXz zY1THbnBeAF>YBHo@u!s*!Q58+=YNa+?0v;_DZ94 z8oiDfa#xGxwPDuFt!VEE)=*W)APBX;eM&rgA*RfOvaQ>EO>25_QG{PxM{>CsnPIug*2<$NJ|l~i3{|(lbIZu zQ~AqV-@xYUsmc-B-N(qX+lis+e%90EB_${nY4v_DweOA z<~e?^ULGYu!{4n~*!mE!R!fZM9-Bs=X|gO|g{Iu7!N}0?*7hboD-0IB*L8CX8m^&S zWfQFuT+RQn;+EP-FRPV*bMOjt^{A5xMB?^w`GlA1bl_8N@w|<6n9U011DWb6HD*zY zisWfL{#m-RV9yT9c-b7bWIFQlxo2D<>z&>LW%1mbQ925f)d-So|DfKR}f$P>Q(T4(GmLXSmi_|v~X zL5p8y8=`KN_`ebJbqT0OsDjX?M2FSE*$GTL%1k?Mg@uJlF#Cfc6biu4u>AtE!2ceW zyHj!K%dM>vnp?m=ZdF$N?0#|c*=v;RF;KEYY4a?(+LHg4i{TU$3~Fu`vzabMLwQx$ z*+IdiCWA0LZDO~u5+wW#3!e2cs_N(XSa^cqfgG@06dxHU?AffB|&ZQkwV>63{Nz?+0(ojzR=>M;m< zj!Vff+)P|7cOf`-PZFm;l)-iI{Ryo_Jcc>iv*&C#3Ew1edSWLHhB-FJz2RRz=o)uQ zd8nx(!kolQ{nXm38@okVc{^zh%h@y&2<^{|Y|2exB zN(JV5peT3(IAO!VD#6)#z@SzR2;zYs?@#CZFTlO8EEHW2#y3#UaBksN zGxD|y$h~IA{rLy}`Mk1rv}4FN1ri7rVz?{fmRwa6=?C7o3(oM+#X zOUovn+!L-gX8b0HD-~=Xd@=3$Z7^J6w=KcYU`Vx2l!VVj^G&pb%^u^yh*R}(rSDgX z1SGe?vZ$3-Wn^0XEF)JPr?u)65K4%n)cEU*(b5^ys*6S%pzBwWM>JY1@VF|*6qN&B zEj1~Tyv5Wn0V6}V$C(98CNGrMQ*;={8pdcmG1hFyy|O=j;bQcfwb)tK6^;0LEikI_ zZ~NhhTQgn4l9ietXyKas@_f<-}f{ zzXqj*|vork|GF;tk+?9j~ zZyY1~F5&TZBGLPc2N-!HnzJmnhxOHQYDNh4dg&+~U34$I=q+GyKsV5n;+@(Y zs4o)@S3fy0;i(9?k}U3V2%<{_4z~PAh)2RW%N|6ROVk37r3;`d-3|Cs0l=Xnqxu5y zQa-m9X2*e5$Xk^J5DkLP^P*)$MYTB&bh7;ylTO_rt?emdl)>?!ga*THzuv7BfIt@Z zsJG}qL*|GkOqJ{G$F(Z~WU1)5h};RceT1p-d>@E#%3hruN3F(YGgDMyrm4ezUBlTS zOI6orhL0BP69%Aoaj`3+Sa*MqoqtD%2)8s4<+pA0egfB|5%f|y4Yji?;Cpn(?-Wx} z>bgoI0f>|M0|~>c--F4Au=Yz@4uWh_(UXre5buqa=87X%m+!E$Ud|ZIfon^lmb(JG za+eo|KP8!t*7m(IsG&Uu*Td^DvrS3eaM!oH0Fg4DRfh__`X-Bovx6q8h>H#>d|q|8 z8BaBlS6X;sByP27XKzom_IrWBeYw*t8*-9XDR=B$-FKXjB@$L+Z)!W~+oy(lrqG9M zVLA4FYo$=rfWCCer^HD zgJmFQIw)GBt>mA_06Y;<8!c_3Tm1x9r6Z30OtlkE@Efn@Pp@xw2QyA$`ojsGFW+yr zZpzk1va|_Ot0`i%e>}(us?c8|EIj(!!8sK8)93fwA8O2}0f(oWx z&3E5q;vKYDatViU8E3Rej#qN=AKE8J~?kNhFjEN<~gdsK~ zN|EAuCFUa8_&D|~WP&p0SCqCiL=SH)(lR~WeT_6l zE0g%a_BkdpU^P@o(OJ!c6al7u^Qbz5`GpvOBD2er7Pr?fmfg*v!NP0&!%g zh{(d5nVSm*#^BPp?F$PFar?ma?*ESTyb{~t@K@~jSh7&Xy_F!kh*qFeABpR+bOQz? znT3XC%;P8Lc3F*2u~9^#ApZ{S5I<<+Z*(B==LbwFD+x;q3UEMkFxoSd?yo3(ze)$x zFtR=h?nYX2hvC~iDhgaw_1q&Uv!4d>0+=xgR7CDxmgBY6Y%;<2qfSwH)-BKs62ZH2($Y)`knYXU^`q{Bk05nyDnnmAXxP7y z8oxb8@0aWHe8y{+~%|rezRBnF}6{8i}q+{=f1-4r0jPco)624-Zn<9 zjyxde(;ndLy_mmp__YPYt@5?MdW017O^Y@Qy!~2~F3Smes2o;fTcKc@yEi*dTs0Z4 z;U-lpO)W{^9ClD$$SpPA*Qr0mB13OLC8Hgic-#QgC>LCiSy1BX0_Jc3>KtwWROTa+wCK(A)irY_SL+7v0P#_G%@y`*; ze`hu01_m;CN5{uEAPd071yv%lb~{IDJy3LIzHB-8ftH_un-8u>~UgL zR_^rAgRH+#!`lS*fRi>{Pdo)?w&(SgS?J4@F7rM>Cz?us>oyb*eY5E@v)z<+-23Pc zY8V~oERNk&{0j#LS-OFYffK6z6O%hQljxp7@G3ER3ed5>0G?&ag;Rb1?-iBv(3jt? z5R(Iw+cFq(#V{M1ybXRqrw9J>(!7+UMD4lVxWjLWWj@YrmLAPc9lq;)y4sO@c6_u+ zs$#Qs&QQiynSZ1|W!%n#Cc=oY@C~S&++&(_dbiKR5VnRXt|`c*yWyNX`ml2Uca&Xp z(Wh@0^lACB^F6GRkEb#Su{RWg)We>4j7*pfaiYDYzor?O78b5XIJv5+d5>)#E0N}R zmdofUvreewE4;zR?S-Fe>#7Gaa@}p#S*P+E_STB&v{4cFm%M;aIKuP&kMnHP0ByG( zCQFOwcm36-wQ)$E-My}qTc=~~!<7=|=R!(< z9-Vs&vpNk01;tIs+js9&>=_8?iguS5v<0KF(=IPBNl8hkw<7+g0$8Ri1*0#6@*)-_ z0ThmJT8uR|%5<*t!8cz{PEE!CXs87ZjDy3&ox>hf$M-+S1VgbTA%C{%=L%50`uU#r&8G(uK=LO_327VaaiKa?MRbFVYfuG^+5Mh zAB!<0(5a;`;Dg%_`Q)PoqWvN<6N(8O3x6jd{E$FM8MiTzgvk=Qqsh z>2vgv7sC=lB*77#nlCoQkiWDYp{;M{NQcO)HuyXM`wa|G1hIEQ-Q+hnlmT{Th2vPQcCU%&}B zbiL}p0-;MYKng8@HH`{!e5FAgOyWUn7+Z!2q3cPHUi3NFzif=T>|V{35OX5yE= zHPTwYyW*-hO!g-}?G(jODt=X=38(P@E%wxyc2J85ql*-AJjjeedF+B79Oe>`*za~< zJU3#~Ev1RS08k$d0=zw5av;g{%3fdqTY&t|Juv+dL5HQe_O+tyTlVV(>50U5R~+(l z*TC(w^zG)@?HO^a_nji*q1IUr6J}g>ku_{nkvc(^T4=-^hJp{TIaseb+_Cl7k7pve&ew>o8+DPv z?USzAJg9na3#PCAqbb;G+xRJ>L&fgeaFr-(7v2PYwLzw8;W^LQz`4<)w>QKk(d@PH z%IhT$`S5q~Y<9v|9f@p+g?*j@;Fzu5-XDs8}NX@6loGj=1W zwpZ7iEHVj{J+cjvF-;7AZO)b~58|0+=!I~#lJLRz0quzMJMHXCHVNhKBnE7EW&Ba; z(+W(g;DbB745!jD6Fj9$2aWqx-Oqd2Z$TQ*%Z#O0{~bI$2@8ITM*l z$Bp`4=wnl4$IX|v5O0^ul_`)zd(s>b5bW)DoVod^!`6oTD)0~H&&f%1X;9O z-ula!etQ5k6eBKYG^1d2l6%8!RzIZ(8f3udR}RF{A&r}hDL%!WZ2rBgl*B`AYBrzv zu;+$8VM3;Nhl5_jc#7VQimb#?M7u_nEqBak(>?rdv zGoHH64o!JxQta22?E3Ml$%inmTd->SQh8blE$rIO1WN?z;H&t?7@M%iU{ZW@r-3X5 zH}Aag97{R!;dEq>cGVv>i;vs=)T3?tMD1fXlz30^7kXON2%=rSAEX~tx^+ZpP+QjZ zkA|<*BaU#*m^!Cf7*&LEt|plS&#Nzj7QCkie~^&IKz{~= zFHYTi6`UnTr=(ZAQury`UuRG@wuRz}0V}dvU^9cHzD82(HoRj8O@1D`f*9Z39A3|v z&~P2J*K8RO#&Rb92kWEO<$7kHXyby8?s=L(a&S|)dIN2RL+I|B)$aJbZ

2#*Ccj zGj+1YVz2NcZZ8R478i~j*bWyY2b*`$9m1NUQ>(Dk2%adOy7smZT-Yrca}Yo9cyd&ER*|#uTgxD zRCXVUb$UF{jjoA z$+qUlkuWbd>3!`3KUcBLBt=u+gxZVxWxhj{wf#v_G%St(uAQw_J ze;lN+IhashR;u*4|L$9b0>#Y*96!E)H?i!1YGN!qpuO&aqCk9+JEePcO8x86gGlrk zUTk|{46obbDGKaZxL&u}(T2Rxnz9VhHTt_DpfFyIzKrrXK?PN{f&}Y0b&}DMf?$vZLUn?;wj-Y}`TxsF~1en;Bm)(0W&5b<{Fx)^R1= zJVf`uIEOZ|xpT&24`tP|jOy|hRwW2IF`&Gq8i8#~EdV>Ow`^uBZ+ii6pHim{l?yQU zK@Mn7yt>xQ=VD^WU{#axY~|@nE6vAa$=dlD5E3?BZ;-l4H|tP_6YC_;?xA|YQ_V+w ze`N!6vHYesnw@_w*wKTpargaz=Od4dj0}jHAUrq#6U?0A2H;h<3amNXfh(9lsL0$v z`8=VFbMAn$I^l&zg>>%Skepz&G z;_8BQ&dL3PIY<1tN0Utd{6gRxY$OaZLai1;61=lE_29A5qUsUmfj`2B(kSmpm{X-u z>bA_(s{AF6?8RD&g3MGr2166U=ZVLqj|?+lkE_2-vfoKqv5vtfh8m{@m zFJQ{r8bSHk9(XV>*iP=W1mOv~MJT1O6ZhjvoY7kwAy=1CE2Id_)DP7RVcW6hz374+2BRM12+c?Nv|* z_`ic73p9>^t=J2ou>&|lE-z0(P2b4<5+J!3ZN3;QN5dlfcf9wQI`S;-$k`{ATBJUf zMbh10e~5KQOV{fpJ45iJZcc_-t?3=(&$HH9Y%%3`r)sq6sOwS^9*y5J_C3mb# zbQRu&zVuvBgU>oFeo8dlf{>Z+Y(+$@9jw)B$#st(Qq04W{Y_!nR3%JF;&+@Gh)*Ri zNxxOBf5Pk5!$0Y@`WlW+(C^FXrg9@zqz6L7j!28%XssE&49IR?zC$3Pe%wDugK5UTudcZm^e_LAqv6JQUHfH-{8J&gU**aW4UYqTN zEZeTj$(3hEDZ9)6c6$iNKqn1|8T|%KaE1`-9x8X6okN%AeM*#9X0WY*&9pZ#XT(#; z>m{{U0FRZB{0V^TOOkqoBIzNwd#gR?-C4IWcNm+BV%#VHRB=jXqE-=5K5M+_; z8@WpeT5NPc?SrrkYsm7R};m}OmlB_mfp(MexCWHU&FoZ7yJ5^ImC#T+o6hi1$`RhB!Rm777P+J9&ou|eYK%1?s3VYTm z#14TU8oE~$Wmr<*DaiG0s_L-wER9grJcKdUoxI2#Ki2u*7yV64m5?|;>j|*bbT1wnL2ieK-GgYTq=a3McQn&G75NQVo*dHzp=a(T9xmSHdVhAUQ(q1u^$ zHFx0EP19BNOXVJ%y=2oKTA?v7C8Q1KnF0aF^_EDT!!+lPEjpWFt-aLM^T}`D!kxD~ zLw#LL-f-c(diACZ(yB>W*L@?Mr+e+;?jn(o@=z&vHSF~2*QAvyW<;CV0$N@7=RBim z>&9aH7QLNS~o(Jy>EBs#)eWy@Oq%>#qh`nC?5Ph1E;&#YEQMWV# zl9n=dIxLipBI=fsl99E0T~2u|PiU$Z!FPe5881iO8a|{IWjkUBEZ22QP0)_EXWoM* z1}#>ffFIW_l=tJ~F-o4i6S_}*8C~T^tTteMu4s&X{p#=I&|SU|k5&)49; zlVI9k%wIm5d2ipjsrbV>+&#kUF=T0&3iinHD)TC&B<`c?w=Jsk z_qwfA7`vh$p_(Q=DsTdb$?`YVCps_go%V{)wZ||M&~1_WFSeX>LRw9)tMiCe`|NZD zbFyg8VoLe28?JT@A*}G*b&e*5^7G|&QZyXUKw8ZTs?#2ID{@35p4=_+KO$7o*KW?C25A$$@yQJ18b0>@f~^Ot z%Q5Eb$KOz#9Leo~XoN{DBBm(f1rMTKFTMp0@;ifk3cTB8DxPYq`<=Z1CP#O{vOsUm zh!jgaYqIm){|q~yTZ}-JIDy^>+q|AC%9wWr35r(gz0zVZ!=0cDpSC zWJeoxw0+$;9dbVX@NYp5;(qUbN+RykVTjuL{o z#85&OY}16KJq9HqC~}{XC@M@XtIQ`U<+e?s1XFR`#8UZY>m;_De!kIdvYn}L#-bd5 zr+l()9SrY$!i-MD+ZVo@r*+7+*Scrbv@LtaaSQ%d{qu=de2C1idUF~tucSrfGW#SE?sx#ru_N&dCoJt`ah*9#TOMe}JP}nAn{Q`P zlz$jO9#Mn|@hMg6IZLn0$;@SosSa+mM*5k1HtHl+glMc^Lug#HA(nY%hHt4gK1)G6 z-sL6P;>@D!|7s!0w?RsUe&KRwaZ@SK+S3Sn{{5oVa|qJPB!PqEt}gYvqjEgTtj#nb-Ff1+HiwONRp$j_tx;%a(I?qH_Y>?=t z!-&Jwl_moJLn(STr+@3T>6^Av*nU9&f)tgl+_`h-pG2g}ngl@6-@ol&HhpVwS=DJd zBZ4deqtB-v48a)33%4=wQjr0%9!5|QM$VQufg8uW=g2~h`_}^; z)f$;ZUq<0&;EKBhoT>BKD`InuV}GyNSy(V~)c>Hh`BqOPOX*DL3EsjtD2WRVWQPqpUZ+%_Ow5NP@y9UP?A)VNI8Rz!#m z==;UUtfMT3Yga;8?SbZiN`929*#lR3X=(H4B{P(py*c3{=L(kC{ASl@9dz^?sy;)P zx9E_Wwl~rOjX0UEh17q!=%hSMHS7KW=)S-1+?e$nsb0`5NR1Z7e*&V<9M8xOVk_s)?j&(i zcPZmmA1t4dEmM)H2raNlE$fw@FgxQG%%>C=KC&^ubBs8?V#N}@q1^q>S_4;fhxdTh zGE>Zm1PixWzNf@T7gvh)2pZ!Ux00u;g1BYV&ERI$g#GO*P^MoLKMRpCW_8a5IOjZ5 zh?L$eliY;ilu`L$Fva*4b-~)X!z~*tse)(D@$B0kjJ{1vqX+pdH z7)0pk#dPSO#b8;Fns@3Ezv80u_$>*N`}W(6OF$i}3s`-kLArUbMmF!-!EW3WTEj!p zV>BXYZhAWn31L!FQbN3eKoyS@7l-vS)cU>#S!@l+(+!;l`B0s)4+-^eY2B~F&DZ|k z-|klxgM0>MNDJxfYDKNPsudG$RCGaYn<9r3nyXPp{h9aOJ32aQfx-umZA#VesefP# z8fdlfvHjvBgV%t*3!8o=7I2KuW4Er}SX^3K$`yLa7Z(-p`kgE4{LjR|zX%lzL@}|0 zktbE7?@PQcw>~d;f?XNnZ97-Jo;;ht`!fUOGp!z~q=)16)j1w&yug_+qhML_(R5ch$sKo{#`49SPX=Zh=#VrAn0MZWyo8%%G>zF3)CH?d^#2z5Lh>){reu90`IWvQaqb(z0OH@5uuK?WTW z+dhyKv>2j5(Ii{0FLw=QuSR95F(9CH`Or0+@6}jP6A->K1v;y>-xIAnZMz%3RD!Lr z5x8UruT!6Xk^PMtd(o9U&dbjdP_6lRFM+K`VN2HVx{k(>Pw{Stsw+F3aMG^aF`}tozlGB70b%(TN&c6yD5q7$|rBdfz>im_7gS zc&^+WM@9VdEr3TtEBvd^Un;PEXauiRFv3eQj;##@AcR$-Odjd%pYj?~=mX)-7qfB- z)-3QU=59|4?e_C6DD%WilRUB@wN+R+g&!;DfGN+O&@0e7icCJsHe5w(9k-xFST}}S zv~Wzp9uFxG>2nfy?8aM&SU$7&tc!H6?zJ?3(>r5mZ}MVBcgJZNO-Sa((e((JF&%7p zv@(o;MxA!mIO{Tw-Lv0-5dUz#=S{mLbMUs5E-qnEi zz+Myt^P~2B;HTbs+;R=yo~ghD!Jo4Ew?Fjd?9egXV}&3k7Y-8l7WYOCo0*?q3#_}^ z@ zWkG)p$WPP*HnaF|AKaz=lxq=`W4s}V-hyCrmc69$>WQEPL$0@}+tII{JG*wwiWzX_;&19 zCPA}|n0Svqg-^559bhIl)@XsTu84CZg+qGvo5&VA7lP$ckHaaH z{`h{R4&@9C8AqLJvel#MRz+8~^Lr$$vwTdhtXaGWnSpRYhYWeFp}~ft;ZHhxVOcb% zIb)^9Ug+E-*tp3aybzPo;S}H7KR0CEnCzs3HUdX;x}-9GPh2~JF+Wt^i8&}5PLhH@ z5q{jAB4QA_F}J4qxJnPn`rWoVoCiYT#UK5liDn7Mu`HPS^?R|q@WgNc#l_>Wr{W4vkY z%vda7cI9f!vC+bA=5|ZWd>NtYiq4Ajaa`naZ!;xUCanFz9Y29AJN8N)YN3DFcvw$V z;HCyv)L*TVt8#u9o9_eWUCDeH=ezPEdmRN$DV=zbgkH0;iVt2^#J0%dg_0Q zhhaG^H*&JuE&r{YlHr1&>V^pXY}fq2BFW*F$0AA~Ap(UI1a}+vOC46bXWhU^z5uGb z0E){H6pTLFX2!+Y1*IoYNSck%`W(mq!fsQxHn=e4Ikvvno%(qX56>+;=kq2^(`$(2XM-z7>W3&}MHMXqG5{ zTIfJ!vLghO(bb1^v0-jy{WsrDpr_prft$Y1kH2R#j?pet;a-cZ{E%7+&1?Bgsw#MI zlOng%?GW7`8}ckT2b3q25!ghKzTt&Mm9Sloeb#IF8m-Csm0m5gR}Tg9`iY@t#_cCI z9Fx#%0noVPkXyK++(jUVlzNCbsO~FgJMd9s3Q`#V&CBxRshPfC8SSg@wpmg+b_7q= z9Sl%RI#!0}f5}eca)71=vF~HfSpu4(9rW>rzG-O>3-u~)fr=U$B)4z1+alRn+ZJwc zDtjV!zWx5{%Jbl0O(4ew0f4pxP<%86_b+(2$hgIG0Qs469rGnLiA}HkCeS#2@-jqA zW&H0Nb)}27HD>4jusAhp-@jT@y}CI$`^5`_ZwxhRB*hgwp&0_#c$YmB1BkUA7pv4H zEytjYMfm=`nRWH&4W^dA(ud-vBmE?|pLtO$OO3Ak_CQS2K2hesRdw3*Cfzw1Jng2_ zLA!30e8wt>(_P)PQn*z?{%je~)g zJlv19t`AGty_fs8B1}mw&PGjC_l^zJ6s|T?--=>xiMbQ9)H>aZ*q#+AEBeHhWH_le zEM))Pk?ma**{5<|!n5^H8i159H7bOe-yE*NKc-oxtAr>pVLD}t9UU?Z8j1-?_SaZ* zZ~l-n5e;3K;^K5vkuBeFU>pw|Is~WB3gV3*8_|@5TAwq+WgeS09PAl5QGS4OG z>YKC`w%K$2Z-r{tx5i$+cKKNdla`sZn-Zx-wT8$dvf zH3e2G1Xbr;!6ZW{$&?A_sS@K9rhz|X&Qxui=n$7*!g}u5{Cy5ESgol;X}+-Yrlrsy zSn)5onBX?ZXyUyYa;e3B>yet|%eMy0s8#jNz5&dU;yyKu0AUHGuk0Iz*sj);&w`z^$+d5*b#lvD3NoIeb zA@gFrFE($>cBUKy2+-VsG=ddB#N+P}uT3)tY7MCQ5#!6BI#KH=oMe1`Xl!O_nOkks z9rgev6|uARFZW62vM%FvuMZi*S}Y+eE1Qk3zaSJYaHRAoeKXg-^$p>(yrZhbWyZ2KDCfyV&m~&^Xz|I1NnIlZT@cV+H@io zfOB}Ugq9pTJ6Bz-cAtLVHn1mctD^-Yhd&C);*`T!bciv#hrBqcjLnC?jPcq4K;n8$!dI(PkUxanvsi z>x}bBNX&b$Eo;JDsI>EJ?il5r>mQtNPl2;fMhxS z6qADUZ#@vMB!}A$B$PvcY;C>Qz-~_2OIvyL6_6zWtx6@ST$8M>27nmUh)+O3Kq>g6 zKC}hwilB^#nla@TV|b#``M*Z%T4t2UlrbVG}|xXj{Tj>|u9kUBqsHce7rF;KCY2SOGwP*kz* zj}ZD!>Tos~{BwOu$-&Mp0pY{~*2Q0YZ6X#MUNw^#I=8L#n)Jb*BKh3}23xnGR0CIZob?OF!`Kz?=3aglW76CW$SQQ6yC)NLdF!!4@1d{Ta0A}q z)MVt*ofxtr*K=G3AP;6;gc+vApK{6~!Z`CgYi*Y?E4n4}_|7j)_no6Vcb+`|g{hO~ zfAx;CE2@)wXUhH2Yj$V8f;HC8Xt!JWx+#2HoovT6Y(2>LmUfa5(mj8%sIq3WUsK(F zoUumc8o0c42Wnx*oNwK45o_sg5ioYKDu_~;_1qW!?Cj}_jkXpTSdl)+@ej4eM}`XK_Ow#3TX(kJ58m_PrpVp^xAESd=CmTr_y$9I3Ea7mq|9a)dBBpac} zxUiv63tb=A2;NvLHvJ&d-hHXDg=9G|=`w@w>8pr0Y$1*UOwI_28)Cc!?!giBYAcRHb8_#UfY$^dE~YjdgUqp>~g z`%!^7Q>spSxs!8f%d$~odU7zUPoGgHYO>U^sngidA|9!?DuptxePnKqTkX=@)Xg#< zqAyrT)Xpcq-S5}0>8V;@Q;dm(v&_4da~g=0wZ~Q)u_3guqB&~Rj04KO9aorDIpni! zKQkNkxj9G;5|MeoTd{(!TjWXCzfqx(Hez+{>JCjJRq%Mu*qB0IXx>Ts@B5WpDlzR_ z={pOi+DgG5C@oo>+;lM}bHj6;)wm4^hX^@*mKnDPEl6Sj`!v%@mbHFpD#Mt6a$SQA8}EJ zDcIK4MXuqhsMFDMUH9CU@Mmp2Xc-BIAZOVf z$g4`y>(o$-n%b=Gk;paj-MiF}X{KO39HLB9Fo=Eo`sT4R>q3`e6((2ASwu9SDpJrsG0jh1zno=brfGbcT8!SKbDK&urxvu`HpTr7bS`j_CsM{&_G zTWc4K?@eBX?_6FjEN&0|im;iv@`Ne9TZAtzf_bIw*Y8^P&*59KSLPJV*ZngCBPYz1A0ec&K(_&2KDU;9u4wV%2^A_&aT<`~LK% z^j()!nbtUw$7D3-m7Qd`klAFIU5H^^(6QV46mj=?4N{kXVfb)e8L2ZVXw}m&{K=#7 zJSozUo1RQ>bST{u-&va=U)p~~%7T*B{T-u#S4-5O834A(Ps1L!czAfy;J)elqFLfw z50x5|5QuKTyeM^CS5Qu3i`$s5v?8E#-{!L2Zib%&-lrOw6NBFnPupoZwciJvcyT>0 zznEntO$=ME-)l%M;HX6iptKr;q z(8y*k!jqF8w>@dg&WSxEM(8A>$MfF!r^6G1BleG&Y355a;qmAYj`MiqxRs~uT!}bt zc>1@D&b}!^Ajyk~OvBK0Ow$jIR&m^zbIqq3%!o&XM$Ch0 z>}pR|%qhP4`12zU!3uiG&`Y63pxE3Z$7}LSjMTbIs|G<8<%yGoAai zw6uin&z})*w#a#R7&erHp+7L?*m-pH?IcP2D!S+M&UA%5KKC8(jUS+kV|IOU?9AaZ zz-K8}O79<8)0V<-;wF5Gw|z0sKz~)^B#ab)a3z!KzfF9k@cc9P5%eV0KoG7?Tjq3+ zt>jgVJk7`0AB+h$oFRd6$;dhA5$YC{%!5}H3mgSmpQBhTBM5o}iR^fEF%j;!O{&FWlxt-8dF`Owp7;SWU!m^j*7thP55zn`=FWqI7do zm?g`C4*H`k)rEmei%Bk3f}^8~DC93WP25f*o+)*zc(v+&I?QOWW}|M28v2b|E<^g6 zU0Kt(p*a7b#U93=E!>VM*?_ZKWH6$lV&^)Cru|$Xv{a(84rfVd@GPr<`jun`gd=gg zrg%Yqu9V_@_3p5d>STof!YQ#rwGVq+^tkAc_e155t#|w*=!YR{XqVP4duW3>wjYsl zzBNV#>d#`$Q`k14eoV;084iCb(=CQ?Co($ zNlSZn<_TB;oi0m>^%U!Y7-M;3uB>1bp8;#5BCuQ6`nnR;c8@s~XN^fkW0wl0>xSu4QBTuT*Qux!IriIoId_ddP z2ng79m96srwKD$oMc@yy?+bdy6uTRA*9;qzWoa2$qM`PSZf$L~4x~Qy%t(!~(g|Z~ zaGg_VYW?tFsp}JMMwKzWBhT*&z@{Ylfc-F^dE8@J0lo6yw-5f1!yp}$7VDDww{syd zM=ow|)iM!A#K#<#Lp*&3@XbS@+%?WQ;6P z;58M-rlnf>-s!d}47J)^wFr`M4LvB-eiVx@f>DqzOz8{%AftngXH;Gh)8G}wX(B%| zKe%O`@YP1gj~I&kg3GGhuj3V5kU{bO`0p@-;$OpYT&-1_ZDqpnz#NwL=+Iq->@=uS z6ULzXcEV%BgnW=ag%YmXpo~**Ox?GyW0Kjgw@mSENs_;D%yB?tbG@f0)l6BkZ2gL9 zvgO!$a*~<@eLvXUmdr`aWI`gM%|G3v!v*cdbIG~}WppY=^u2ao%&AkQl=AicwaXaa zJ1;z_y3%a?IB8-AIZI2uM)o6D@fuJ@s{-%&5k*GL8@EIX^IM9%5OMyBWQ`N}Sj_pY za)u=Jp9ms~U-0YKj~-KSsxPJZ5nj zxIYkoO-+n_>|ybrAW-*9TC_FaaRbyu6%MBEqyd7KOG>-$%6zCO6pn zS#9dHQ$pn?6WmPur4g4}6Fc^@Am~)n`4iFpTHTBEb;TrT*$JMNKp0}X;(bQ|qr^)L`dr^*k1uFv%ehgyx;{ zX3l+W`_IB1uxG*<_tn@Llm}jHoYAE~ffFQV*=r_=J|^4XHMt_-A?$}jS;wwN!_tz^ zjR9X%QBiR%3q~Y)W}SshOLLeC?r@WksJrIlelSjlLR;2>glc%jof zHtmov%xE5S`jf{{Cx2G#>1Y;K?D%8M(W605^G#=NOp&>~QTB$l@Z6^NR@J4U;uI*f zBf@w&OYU|FD~=dZf;xmJvdo_3;RnPi4uwu>aT1i{zeQIlFoeR#qbjb~vbGY_%$X9( z_Z3X8>HJo>4vIJ2ZEDe(<+YeYwdqfMTT&<4qseNN zm*#2V=aPCFm&bQ+Q6Q~c-#kAx_y<|mjVvXOPz0#I5b6x^<_>m_ILZ4H6PG>wkxU-S z=n6_ACjQ+|?SQI?Kf0b8gRzPPX6sgR6;M*`a;602v!f%IjdOz_OGnFnNy3{1X2}7| z&Fj7|+W%dMPF^p3OR-zN+9Qfgpjd)0ipv=ahf_Z0Bjh3Ex10S4W_6SNzi~Ai112j8 zIr->?J2;4m2FVCdc@i<_4&kT2ntbnOS0Zpc1oiFbBNL4Udr?cd{@jt=Ln{tCp}F4+ zdf|q&PiX(Mp9HY$WiV_Z^nPo-DH4yUH8-{Wv?To}?tWA?uDwk{`AajTKE* zcBZMoU+K7Or6AJs=--D*lHsJDx+IwHlq4Bb}J`Q5-MBbxkIsJY!k55v8t##4SwaDKFnn;7EJ?0z@wNdJN7VJ~V@ zSL_6@`4p(Y2G!!cc95^{-DxUo5y47UxW8KHn6@0jh%5X$uG53Vs+KDj?;2Ne;Y_Bp zZ>zwH1U)d_SZ>a_MTc=wRA$a^1&1c6dAyW9rzja#Np-h38;(GB7ZRL*0S$EK4{RtE-@p5ShM(vI-6ejCxuos=@`! zxjI%2$6xK|MG;VWbaZrfCD(I17;JK)LY=akr?Rc{%=(BY*qlPe$$6UsA7r&-lnWYoWODQ2xnBk}ftr}X006m4 zvlDJS*{gbwt+S4-W+f7Pr|_o8`j=mxYdEDlfx623MP(VRHRyK1{?*=gW_xto`_bVg zPu1%(=ocL(m#ArR*m{2>e!VmD&00xG2vGjL@@_mPFYCCxxzy?;+nq0g5?a?;J~7g( zSI4?KxOZM#U}V&o$370~@Qc=AGp#}5B6c|3zMzWzZ1QW@en#ot>5PHT5SLH3v-!o(+5lId4WbhiVj6ZI!rL8P z*74M&fx^0O)b(do1Fpf?uT*(JVo~9*e;%`-v8!=6Fuz{IojxX^Me#N5PN)?nzFLO{ z5z+Vk>-uV)qL1uE5QBP{Y9W9<&K2Yf_v7fqd#rwR_twB}wfZD#B{_xq7unsy%G_kt z3-^iW7Ozuy&2DBD#DXr#Dc}Z`L5#x?Q1Wf8Wto$q^lP94DHJk>9^!6KoS^;WTi1kmuh==(!{IXk-6O zo^9A0_ISo0FEM&WqoGdj7e=lfYg>8aVhH1TxNiSNglFZNyeTExZ2UB+>0|hQA7XwZ z+A@pHgs#^z_COsDSX8?x{2Lkr^K$8G{VYExvwNfU()xPqT-}HGBr{5T!hLh+hs2VT zi8h(TD=o=gTU#dPF1$hMep~ai0jB2K=wWm#p77v|VH&^yhHuy*#;ud;GJRz2vh?zX z&C*=Shb6vdNoND@6A(!8`Hq)P)c!16Txm|+M&rS8$M+L=G4^(6|BLwCfNQKcRTs9q zxBI>=4=xMd-eSgOiZ898a5jH#G=8+QXh}Yg1cA+DHu`LIQTn@@Y#+36_tN+-l_$QJ zr%Lo&VCqAKe#zSP(e2iL5Ww@fU7<0~UV(;&A}Xa6tGi$*iCs5oO59sgXmKKi~MZtHeELIuFHHm{0wP&pZ1&rQ?}Te0pdo8-N4%WDl(-bO#74gOoIlfn+r=878dfCre|gxfq~A*VGW+nz+0(@ zy{bJail2&xZHCV^dSd^RA|~#tP^Y}08-A4Bk1%u4n+Jod!KP9S(aoCLhAa4cEVz9a z!H>K^vzro#wqT*LN!mA_vyGHeWcN)GO_33X4_59e{JSdhPhkaDG#lA?3_5*s8S;RT zt5sqYc71soe<9J05X!g3+m}2oe&R*3NZ6|y z_~KI-2b{ z;=jS&qc>f+`+oUu)t-i{yB%JFqEA&)x#$ClCk2D;IHBVQjQTwhSQt8mZ|*>{h}JKY zUkzy4lnckJX|X)$1$m&zm>;#GxbOtp8D7#?ZXI>GQwJA6vjwpnh;h=&T7S1e_o&K2 zZcJ&aLnhCti+n>sl(cD2>@{<}fYwwWhq+*>Mg5Pjm3B_?UtMmcpZq$08vm}McG;KR)J+4w11^8rEl!62iJ*T{vSUnkUYqy8t>zVOGl z9%z5IdW|2FNh(2mJ_zp$ii+t)MOfesCcUB$vOowF04OOSCR7*a3IfU)|=TXd+1Nc#Q$^oNDM*{z^H>XB~P|zJ^+I=++ELIcW!QO z-02dg9K!N@ppnapZVXt|ZIhD;M2x`#>pzVm$H&KY+Ma)QA{}L-{1}T$KL2X=yrbsV zg3NzzJ<=HzEpJp5W2V2q<2(W7NZBCZ*BEi*b5|Du63Cz9v%V6D_+Cw=UF)vvluWuT zcj1-|0{cD6WXKrEcfev+P*QqbBDw#pP-5~-YH5IRku6gWwT4TkGF4(^Yk%SqiH$6S z5#us!e36Z<6<-|a=?1u{M#g#@F6cs8lU;r zK3l}QH2vn&0%lJ+d;OEzasIF&v_UhSHOn7`p|AHy3 z`YijDll|VG_1fnQ(2(bu85+a8C29<&pMZ?E2~e~GS>Ysj9f%GE;E^Kt_xD!=?;1## zt|jDgSzb*IDd<@=xPnx%Gp-^IWKV#(XAHuj=%-R6?-#puLQkIpX58!F^No!Q)rYZe zy<2;P1FKEwD+f;mEvpFgmyj9VD+@W4FYujcpJ z6QaOthsgSQ;~y@-V)FbI17487vcOD@Gb4~(`gY#m*JljIa&>B)%q3~cJ{4A+oV*nV zbA4DCT59}15gOcpQ)pspx>!fhvdNq|Ei+CNAo&M;_^T>gZb2XrS)*3M=gZLuEpSQb zvmko;xX*w|{ow&s2LN`#9gFkgK7INWEdB-S!#JX29#PTa3NVTGAjse2m3FZ~(B*Dp z&sum%d!<^3{O7{(G)ozgOb-&Q1q78Uh|Zef#*an5+P@OIaQqT;7?PhmWU(S5}1@ zSQ@r>b;%g&x~f^cuHeZ(E>^GqFFJrDKH$lO!~ZN>EfeJZi3$uZe>IeNc>I6^jaqLL zIVGhX%q~|cRUGVQP0l8_uj9cQc!|N6s`lqSQmZH_i3f>M&n`g6s$=a?C&XBQpH>Hk5&Aoxsuh3F-Mbu2JBd)(X|YN`hJDDwO>E$m;FfTRK5CNcUzg2cgC&H zl(G*J^x$H>Xx>xM5hU_sn)orD@=Q#6!xPBpURsk}Afe3E4JIc!$y6(?>jW8yuDIJJ z4zEq5j7@71^&3^4s(+8TZY zM}{buTM^u#a0d`RZfi9GdN{3d#6-QgYnZV34pkr8&^PVQGOt(%zxw-)oG1xFvzAyZ z`25JN82&($9LV6+_y9pWDyq&OKM1C$r-#6d!T$b!zOD~gdPKdBaxF_>4L^P$4grZ@ z%W$T&K3w+so}Q(o&@$0Db5aa(zK)D`!M?^YtVA&8n#tFhNpE^`8w z86G)wk#6nJKc5uV>GhV6N54V+!N~TNjhW^!0)ZG*NgO{rQ{QEff8b0LfpMMH>)(T9 zGQi6~OhFa;;8|XenD?57tAc}f?8urWMm&FUE<4ph^^kH}2+1H5hJh=LlKjp>?+{9h zF;x?1Bubh)mHeZ4W+rykC6vnmRc827Y8t`755|Xla5FShb>#HH<1cbMHkjHfNd^lB zT58FblyV8`t~M@y+}YQ`wOHAhOoLY0Y_6$om8DhCM|(Q64p_F$FjY-XtD0hkrDpg&q;4KF!3WpApSr59D&VEQk(b< z06pTdGJw&gb7w2?KmY*O1}31;A(PdPv=^6`jsSnV=!qvvgu{zj_%(a<3K&oWf@DGN zSG}69qsSCpSpmY^EO_<_2?<}A^?_s8V5r`X3rBCpe;%s4rRa*i0J}?4Fy$^hZ-zW6 zf@oK)s*P+H!~RP4SSjzLEo7&0opx-APjgpoxqwxSq~2LVO1shCsPl5CLc5)8xBIjx zoNsQBzkz-nR^s(VWE|#<%Ug~>3(l@XBp2kw;~S5w=364ztsTU@I4)Ypb(V^*B;m)&P@up}m{yB7)DFTp8`UuK@0XU=rFYWo)DVufc=L=;d?@UdRfGU+J z?m93D!8Q95UGBdO<0y_)Pq-9$MtNve2EYG~1{`1FAvGMM6_(Z#&d$%rQS^toW0KIw$QM zDsP8%SqHCmO)rC9)1+gpA%H}_Bw*Yn*JNS;5ghD^yTswI&{flpG}>a%5xTpp?Cjx9 zbO)lSrg(O*dtjQdt-U8;f9J?WbZ)Tb2|ZB4orlCsz?YW?ZuIF= z0xNCqq4eZ_Qo1Rjr=BBhaUuC>_;2fWzem5VTkFe%aSlQty!(`R%XE!FLP2_}uZllE zBcW9zGLV+Oj}Ty7gS#tji2;3TWzS9ZSN0DYorR;j5siTtFJ2_T;T#q2yyu{lXe*01 ze49wZna`x5h;tCE09thbSH}4H;PY1%qTI6|U%X8;z>x~9-j?xwlarG@&80G9E`{|Q zyeiN`EMFUT2i&h`-&(7rKczO0@wR!KHIJr^3e|zK-a-NY5|`kaL6 zuLNGR5pnG3CtdGjdu8D4Znw&~il9O#>_b&s?6U`~SEvXMS$sK#0tScm_aBH3J`1{X z^u(q%#h+E4@Opo(YXtunexPFQR|QRT=o}9t@vHrl zn~7T_h|cM*nT*Umw{nt}B$n)WlAnKZ1sbm#PeGeF9=~U46$RQ!Ix}xCMZf>ML;en= z@n#)^>tR6Muj>To0H>0Xc$3~H`9}R}(k;c?D^nWy&>(K*C&FnTand2v9H`p_LFm1P z!Y@-WM$0AkSw57FQ@d^|6(J`%Q2sYIB0-Ejug#gHNmxCCx-lp}yG}sxn*65vn$;gX ze4!C0kyOsBZ|XADiIM2l`Bouh|MwS4f4g+#_2epo#p{C`tL5xL`UU;uBgui27!prQ zs7czX_lD+nf`GW8h86jVO>!cWCHBqU9CLN$Y{@i#Fr~jmOQgE!pTx?5ONBbrcQIlH zECM|uM6F508Mpge_{}E4c%kllu%e2O%wR(wb^+Gb4Eh8WWz7l=z-4z9`yoV* zUdY|u9S9#^bWu8~BBy6`Jr)%;Ql*aZ=hO4sxTSmlsab~^&L_f)}SJ+ z!hh*IHW(x=P#rIaHpHa_V30~-ESL^>k|00@W zc{e8${T7)|-lo85&4?Mk9Kh7ZE56IZmL?E*mn3Yqk_?7Vik(&E`#YYo1LFuSi17kT zE?2y?A3KQHgu_%{Mp#xmVzKp$%||5bmr&`v{y>Ix*<7h*esYJyX0HWP<5$2y_@6Le z^BEE5M^#wriBo7}JF|+Re`vF7#4H~3y zs!-eDY;HyJ8xs-XZW7rTfi`3#)K(|E(ife)n`#BZvc2m z&TrEM)PBf`(l+;33IeL-GT_z?Ue5K!%PmU=?hpQ*KX;u&P+IOW2fK%H2?+%c0w;LB zYii2AV(Yg1c0eFzvdm0+qo%43O7NV?W^QY|s9gpEYf;V1ZoJ<=zgN9Z!Dm!LLF#{rjG9OB ziU6q3fBW62*LxXo9XUTXrg@m)EJ^UOHzK0JYYmJ!T4OfcC!M*rs{l<}BBWcZ> zq#T=84VU*_^&j-_mqA<7PA{=ri%2EAJ_sRM9QjwbrKCoW=hlHm}Z$ zG}E9tvpA1paI4W=2OBfS^Q=cLNw*zLGou(DOW|c zJcDlDA)L_NLVr=q&Xbam1DMc2%4=VdE!TuWj7~LURUyzjOaaI&5#M=SYM;i1fa6#lH)4Z!?!TPvU6x3*(?@pD6-I@V@lwr1JKkyO7T5zSO zlH-iJg=bzpe~2>aH~;zN7SqLf^KSb(l1hs!b^jF6n)a(Bj)=S2woii%X;tPl3D=2_ zh+|gSRDUzJ|0x@sOww^P54TRG{)@()yGDI|eP`1)GaNyY?x#ELK#SB1F>AXRu?A`rkcNPBv!J#J zK{f<3Ai%xrUfYCq_j1UlrcRbl_&!2N3y3G}Em7T)bVg7Cw$x)F1{pX!JnTw~qtS~U z()!o(;|I%+qT&9(;fCctao^HAATe$~0c005ur*5AYMRtpV#PV)3#^k7F#kW?JJpW} zhRmAStt-HBG(&m0x$i&_f(n+Rhoos-Q?s+7KwYbLRU1pz0Yl9?Xtaio?dk7}trBS< z#$UhMB?#p>756jx^e4eb+=QABC@&Gu5*ypp!J*VttP>YgAA)fCJDVEx3LMStg)0JH z5lBMKvVJ!LE=J*z!1^0ReT^u>K}W(|@>4035$KD*(hUI>I0-3f5Zln$VlxAp9o`}R zC5sIufRMqX%bcJ{|1c-ma=uAurQ@@}P6SUKzW5RnXJtE(O&m`-Q;9;6;OR6h>9ex` z4S!LyeaY&JR-w;9oBBq(-aF?-lzLlL*Y7#jNQ@W}MFF}xQd7!9US{e1<6BR3TGm{( zy=>Tp!~9j@Y0i45L+WpjUjOcnON@e2L;6yN*5Y61d%cf5fDtFfVW<1lSm#pzHxU@$3qvd=pM9`2jD4S8RiwaaA86ljiFPcD#2?5QU&wU z(f>r&uJb!0sMc9$w1;3ZIb5CX*P0@vkJ*FXV-T@5A%-ddd%$CvmTFiSwSdMowW6wO zX>Xwwv4=!&_9MVS-+EfBUH36l*p!{pMs;Z*{&&Ia!_R-ci<&$2kN%_vh_B1S6n& zO!Xo2rCbj-1`qV>!|ti~lx~MqZjyg^YlyQ*h7{eOk0)M?L0L&DcpC_dNz2P)7)-Q> za5j9Y=LoV>6RZa=S+nlD^+}c{nJc)~xTKdC7khiH4-oCa>`EXim3p@KK`b;VC`hKk zIPBshVP`}{#0T_T$>1pyu#v$bNGRVl5N{(bE+cRh{BH(OBo6_q^|H-XLW%+dKuKsC zh!|EiS#sClj$>=-w?Lpwyh4;$<_e94LPsBIZk zw6(h9V1u%*HPUsL2F)!~POFLOD0MMT5&qP#>hZ}&$P-JnmeaM;Svbi zvT|ii?poe>vBBu+lVWp6De^6$THd7SLufejU`h;+oakF-lX(jL#u$u?Ip&Co1WqGo zEplsABUlm;7fDqZ4G{dE6Lmjr`BTEXE9bYd^Y2Rq#JU@mU-y2$2|kw{6ij*iFhP!n z{)~3o?mG75p$TQi{ghv>@m~#?MxQwztLKP|lcYKyoy*QdMxH1*tohUSl~iM~?g^u- ziV)LheTsTwwkOQ1vPbjdQg;7{Kl^HipXTyVb~o&!Rs@#Gz(6*v;;UX zEr9QUNE2T9HAlmlX&q24D6_L~4<=l!L^$stbTPn+BB7)lm#Ek4*$9Z0KC+nJo+|Hr z@}MMvNb@B!ddms`kUeZcG#v8p<;p-$s|bxC%ZJmAynm#~uH+|EL{*fY|fP z*|}z4c#P?v9IvJn!3_qO9t_9^AdHc~Kq7^h7e!|T2K|Jov4w?{O&T12BJu$JZCKoB zL5!6)T|Sexq78BeMtm50+So;E7~a>gI7NDYgX&r5J^Y)%YbUW(yRu^pWslN^pes4M z21}^D=e~IL6K`t(L~I%cdkmvpDTcafeiOI5+3XRTC_#G4IgevqCq3utS-9VD4ZX20 z+`rkrg=rV=V_#SGnufhwe$Q$KI{k@?wq+gV-u5W|i%p-<=8z(<-miS>l(TO%o&mr_ z!5cY3t7{ne@LV1J91Jq#xz!Hn3`H}Gf|ebP1o9;R@)k8p<#TKsbXj~rLXjzknVc#HJXN0ojz@R2uKZQRq^p1fcHOFUl{n5hqiPx1HY`p z-CAhwoKL^@fd_4&0iAZRwSQjp{*A@C6#twx^>FX^J9JdXQZV*h?P04JbxoG-A9Azp zVo61?l5+~e-5ZP4bLh6jL0w6Fx#zVE$VwWHi|i$Lk`>VDr(Q1uLC+4XC$m0`a2OKE zkj9MWfgkJ50p~3qAR_|QX{fD;De<+WW1-%OF?)tV!Xyl#V!4`ZUo7jxik=2cpJ1TB zv><@3r?0=iq3lft+E99IvcFl;b3ubSAt^~*Utiyf&4(A>|JR=D3h1?MJXXwN%RTZ~ zO%<=yRl42hay?SnaGc}cZx`8RZFAqf1-z_5hvN{brLnwV=bjq0S?BeLI}mTD_IX-! zH>X64gA1P(asFY>-kGjEx*gZ^w~(gXLKC_$C*gNeXN!-1tn>CAh^~BUiri7BR3YP86mDz@>7S`#dq=oDukXd2D;4GHD z`Lg3){!!<;;7Q->E^;DDuU>D!wErD{s}{_-uzbwdg)-j6U#P~g_~FF+KANV0@`0+E)ppU>ol^?*L8fG^qau)3U#pS~A{iPXQ-O(pG5PhaWmUjHrcxMIbJ6WQ7F#A1k@j0!U&NjK11CGlBcI!sSmchuv%`|&JUo+Ixm8^14 zHI|rxw%ysn`U4r7u_lz}(&;wdD1iLzRcSzpZ@9|4Z<7`bKJUD19YPy#0{}$PSc11G zJL9m>*TL&^=8f^EUUkBL?pvx9PAQ_NDRCIp%jebE4W_0eK!LCTc!-Nz`1tr|zb(bK ztliHRKJ34zx`J%D)V|Kqt5Ew41nL-|AEv>f|i+Kbbe^ zi7eo=ws&#SNXgaiE1>%rXugVtiJ7u*(;(HGgD1{mpcc1;kNR-Kh+yIVnua*h$0%P| zpmC{r!f*(2j=~HlPEt$BP8`nQ%ph+-mI*DOo3%l!P<)3R+TxF4eqSDuWF}_ll12I( zYErwPI2MnjTFS1(Ad9HuI4jV=RTp+d?1jh=*wTi_-%tJA>5BXh+CXZU4Vq)2&N|XLMSpLKDBY!fdJDa2PsxS5B$|wp%8 z@KLiz>79~`eytW@_!qm(xz8}+{Bu-ypG^27H zysD}y6HHYi)G8|)j$sdX<{JT>?|n58qbjh-4=~5(0IFA@AZxY1>ahRywU?uSNlk9* zt(xv2u;Dv8A*#!xqp^SW3Kdj1_~l5f<^pc*b#8L+;9>tf{(l!m{y&hwt$>7}>9(G7DlEAXhn@?JeXdCbwUI zk^cRZe9FVb>yz__`~PVVV-SP(Fb-dGA4uC13todZ?;>El+sjIXV01G72*{7iBZ^05 zC>reQ`dA5-WzL!4PQ8y%^Q0E1@#u9Mj}bezT<;6viIJJMoFcyR*ylWb=GclT~q- zf);$vtF!8QgO+_RsAL@cqlJ^W)>{;W#5#;3lVkOiI zN&IeL^5e@{9bOnocXEpbD3y49tw{8Ow7VPs*R;)(sFuY-%Z_S#sY`_5Q`c?V+84-D zu&K!+%hTb(I-kxhSX;k-oljO?Q9<}jP~8^crVJEPfiLr0sNfLrb#!_6ZbEpC8nAMj z|3~=0``L*fN?^j1FR)H1oPC|ARa;&y1r%7ct$ya@=9byc@FOM#vah@U8Wb2h`BSO= zvv^-Nv|_dhfiav`Fau9_u~!dn>iv=cqL?PSd}}+?v3otYn-0hobcMhx4@`W{7P%=8wQ(Z?02MfHS4^0? zxsidSISFb~pZ#=n<8e|_5|@a`wlydCz{$yL(y-^d?FtO7QR)A#_rd5*yb5*Ud%y?G zg}b3YnS;>+g2R^k_E7o@=i2SHG;ajGr=bS$QS&mMt#4By7(jdz221!GctOZEGuRKy z0SbwbfP#Se8ecHoaR&GlZ26ij_M3zUpn+3nIY^myHaLG;Uu&RFos`=}%yr2PA3`+cwymlmO@7g($e+0V0rifgJvl=Wu`kfS=7k<4qYT zOO;?IKlTY$CKHDd5Tr%0`;9m)E;aF1K?DDF&t`N_SFv6NqOeXs1oM_0Xpwj(+W?^9 zJeMDbbRQhP+M2}4#2+PrQp`wt zZK(QW2*BZdK@EMD7~|sLw3{FK7}R1F}mr~lp<$e>*E7BT+p`AJwrF5 zs*Yk%P4?GHiy7@g%QII`4Ws#*Rl?3=+vV$cv$(Z`LaTGftk4#$rzb%H(>k<<1uuR{NNlM>@=Rl2UP}0BOI4tv0(Xoa;Fw&Ym~rfT2)k+S~*0xXCG1L!`sD|`iRWM%;72F!yKTS~Fw+b^gPz(HzR zUk55kz~>6$1%z1Q{fhwe13`^MBp0Bk7TIqhiE9gr*EjunOig!iCIHOxqSv@kBAcxV zz%i=SZ0M1({xx58J=@t@0qF{H8a3Pmn*4Pn3f!8$&5+P}`X}A_yVU4P1T>FS{~%QW z9e!96aQ@nFpfqR$`E^JYgZ!<#8(7_pjBANRBSR3Ac_zo3W0Q^E-u;L9F{D*?KWmdb zPV>$01fG*v@Q_jnI!T1Op0C79gC*4_%XxXa+o|~Er~l5*&X8#PNVdu0|DMC&%Uy5= z#DnNkn^L#-W;;imDbfsjiu@sYFBn<_`C2oAQRrB|3sY z!Pj3B6-m5mhDxn5xwW$+1q{c)pa)yWU)%fMJ#kNx_aKb@>ITloU=RwZIe2X865J^` zXUB75$ZL21Pw?ytxC4&-PhZ>Edh_L}>z1SB9T0ByGxIn6N+|-kR(;^7w?NXcD_&%h z34zc)0JaAshk%d;+y=}hiVf3e_ANS4Cd`1Ub(cl?gHBT%KuE}vbk zi9v(;xVWg%-4TtAyAOnYKql}3-TQBd5;1!S$O-wu&a4BR06zdS_H0EFbag+rs$^tR zG+3tOKkZ#L37_-yy`1{cW3@4KL2?dwhck86M>;DaN~T)qEUMx0(0vPAo6YRBt$m{S z`Ui$H2X((%)_1k>a`AqK#`_CvEuW6deX@_?`*L)GLS1@xIXvu zLQ7KW`w9E61j<~=y@EHrQ-!Kt@Hco|4^i|5Dg`@7rv1DEedhIt-zYL^pL<}-%iIL` z?WS#$sTAMVEPFD!UddLzaW#k7O@%++CBwnaT>ro%$b9cfI<@ehcgK=NfbbpXBKiDTU`Y{RG6Ed2v-$n~c5a}I);j22 zd@DoF`1c3_y>}}$P`_zkiV!oxBdgPO2Gj}|5F9QAGe>H->FMcF#HI;42uniH;+)jp zJbJhbyHDx5j0;WHUPkQ|GN^Gvu=!?T3&CFvf<0&mO3W<+Sx#;)PTIZp%OuM15JD&*k$=@+=PDjdC)K5R1egFfN4V>(`%y6@kcJIBxY}C z5~+hv83e1k*X{~Z3(_RhPv(k;);(Oe#yFQ0ACG-v&9&<)vUZPyx9#X*UXkzsIc>sz zwv8tT0IR0`+j?2O0a|@c$dkZUl~Np|-aWEpEc}9m6>ohlY{|1Q3zLg^iHYk0RCq8u zjn?2^t+i7EL2;;gHHrtQfb*XDl}G=E;r{(l9x8BEg5>{wpwi@e=?c@D4-}c z8QgY>f?NdX?6&JkJq`u3K){1|enP7|8W=IS?gtxt^^+a86|KSQCtjK+hI(U`wgBMGd~{PKWh1`qx-)?fpr^ruTxh66@OLbH(($@Q#%S zg7@MnUDu8)6hEUYH`u=^-1oYfKjx_k+uQswjlik~cC~eWL28i;s_1WxU6;%7@|q3< zsPu^(vqZHPGsJAdC^*rH=PB&#jlPP6SG8vuCR6aGXFmZiBtGLZKQ$| zD|`D+z{c7N_6D5P6rnfhK@v!n5Bod67y%*?pJRR$#BH$+4bkKga9PKh0}|z0{roXZ z=pJ+1?v=T2+NmN zjTdvvI{T(a_wN;U`g}93PPiMGlQGD+IAhRVav?MEjKmY6&_sMpqdNaRZ4;b5O_VHu3vhGR# zw1~_|AXvShpp?sJ-v&?%Ng*QDfEc}i;?ffYC0IGW@iTl^H=xfh+1+pLHh@|f`szyd z6je>@{9csDntp|-o!+35T5pf8-g#Zi{JeH`(&dseQnypP0tx3<+Sa&idRxVsIIDf~ z*&_V(4&=h?v{QzmM@)jA@tDdlOg;Aqc5bj?GNAYi{yq&LR3|%zGP*0Q%I}6Hr&VG{ z8vE!vdc~MlXOVYYZ`1=|@;lFay4;1(>Gc9Qo1{jdrbif^_MC0sVO>RxV?~)-;uz^nV;CErz|lN< zV!8HJ<6x@`frYD|zGHX1Wx}fWcDetjp=-PZ@-i_ez;vA`JFO1 zNVJymM5x^I!(Swu&UhnV^6ClYi{WRl%Qd;-zVo$3SdhJC;_wzMg)X82IpPcP@8#Ep z{_2Nu zAc{!P()P^Ij70DqU@#JZlD_GuuNM8gmlFJYn?zCp-HrC1d;*bfy&Ioh?FPeqtpu)G z%j(e7R9=ylP%8E|mBHF!{VYTI%mAjzqM)FdjBcee^<%sX&P63PoO-nbk$2s6kqU88 z-T+wK&^_%2a*;%bPY3A!aW`faCTOGw&Cn4`S>14Z@1OpZ%4mo32?z5(Vm_T>xcQ|g z@F4rD@GwO$%6=4mvvB&%%}j3bA##LRZVP*?Y^&63EozOSiK$45dxTz3jl~6J1{&Zv z19x8tJ)yhXsLM9Nj^-ctc~$1g2Fhh3+>`5}<^{3D>*&Sr@WU1dg3!qCq>>Rr3&zOG zR6sZ51@6w8c2OD@YOTtVf9Nm%-;vu=q^!18ap$sd4;E^rX&=0CWi;nc;r6A90qr5L z(^)E}e3B!EI5SW;rqS)@=6BFTRz7B==Y}EC{1Y$@h5#qVhlyTlD?K|qJO9}vsJd+` zMV&JdvjmEYE0`e)4DUgD7y^d@C5@+`|Ek432ZSxS`3OzBGy*boR*yMn>s)yo4q{6* z*!%S&5(pS3|BT}pCTd0vRRE%^@Ie6w9g|82tWu+3?m1qxmjkz|6yg z0=P}Tu`ERiCUe{J1q|>p{$&{zlnT8I2sHrQ(g!HG99#rvqK) zp7gRs+5wHwN)>XhHLJszs>ehu&?zPQv+%^;WB%jXxqA>PJRFv&i_Tyv>jnBJVQ~i( z+2^Ew+)1vV8e4qw3=nnnyBxF~8^h++%BAVa*)sZc=R7H)qBQbc<%(Cfl|@yA-QS63 zr;fX(E^17^yXnszbGpry3YX{t`58B2xyfQLh%@|slCvU7u^-J3fRf3<1MNOX7O>Wy zcYC=;J%DewXity6y(16izvaFsnWD+PaJNG>=TJ169%&FjZ%|8bUTZk{Xe&hx{{bvt zKF`65(F(u3ymw&oEKueTlXFgTthFYn+pY_&f5; ziBW@RGg^wi&b#P8J zdjMs8T0&|9JVTHgL;OotFgWXQghbiO!L+yzK6yx@`!m5Adw|NFopp+Z1B@5|siq*0 zM_g#`YV`?XED^v@c~;&T1Du6FpOzpT$$%0OXA%3Kr04JIStbRsd&)I09CzpH5k`pM zfM$)m#suSy+@hl4z?80R)mBMKDI;65apo4Nk^H0b;eE5_Jx)=@gc(@0(jD!_fTYrO zhG8*IxTf0YhB-r|GBCHbxj8cK0Dv|?%xeWO*l45-qGA9<1*m>SXwzcdPUEc?uqu8? zaR1Mf`+HyWFqXExLJ(_Smg@rE5-6u@0ZUs^v3k35#$mVq6wLE7j3fXueNt6)_sdeV z&hY%}I5u6s3Fw4%u57mGDIhfi#M&P{7HELC>|PD*8iR`bNQMplz1oyA1kr8l0DI>b z=yC4{SxQGx770Aqt*q1NK{~!b8GpYluVa6%_NR^c_fC!!K=k3GIt^*TJt3?C5MJ&G zSq7@xvrZV0R5T4Te1NE!=IY&PPva$O5c4m&xw&*7ABwmU0NaW@Wr(ev9iqmEG$!x^ zl3!U_Sw1(AXh_UoG}c))s$lzn)qQC^)a~2%ltP7+>{&um%4E+@Lb9eLJ85haLb7iW zDNB}!vQ%T=_r@~16w;8L>}AM4vS%!FAJg@}ujjs=SNEIe#WOGRnUUZ8e&>0d$9bH` z@jbrZP80lR$U*2Z78qFxgg(D2#QH1Y<*i)XdOd>%(Z9@ZOZDH_Dzkg(J$FDm#xCQU zzzW>3$SDZ1I9Po4~F2EBD4e7lo)0i94wRG-s=ug5lD+Y2(-=Vi3DyKHvw zQg@BUnoh#!!??t_f!9nEfjPd^*H(M@pIQVpuA1ynXG*n)=-;-V^}2M*ouT>2>bc(G z105d*rIxg+42HbA8uZsrpCG9?1UxS46i>-MWlsAalbXEz__R9u%F26m0tHry2yy`) zjo;sCl-SSDk>me@{fE0?tGtIRa2?Hz-wDk6uG+<${hIEnFTyFMd8_x$1>{p*(3(D$ops;?~(+E zbvprE6-7{r+hl*m?TNL$?WN+E9&G9t`0Ssnmmh5%X(ar23&o$?oAe7|50wO`N9le0 zZocG`q^B*e!&Edcb)r^C5GBU$N8TPU_TDK&;h*EDD0MLC-9r4~SalwiJt_C_yaC;C zRsIGRv_)NKI!_hu!qMmWHmh2LRTwt{(3bYbzJKRdD0Mj|J^5p@F@KIUdP!*jikR3K zcF_n7Y|#4-{4ux7ZKHk~-ag?C@;}je+h+7xxabU+&xEhlKD#p4yVL?#8?$pC-1p7lv-lF3)NzPgx>4GHv2b& z)Z3Y9=81DJoDS$$o$xPN5D1I6B#<0nIRce33K0#Epucuez$|<$IbJZ?_Q$YfTGAB~JC3Ac zwvFT?bjL&=w@elAw)Bg?tz-96yQkMsx9K{TZatae(;S;TFkELGGUne#v%y;QT;WU` zds@LI^3HzuL(Mc+vW=N>rn+)%*feY&rufw&FY4-^)@srJ5@2xcHq}(R``D^{ z%$2h*TA4+tQfZyMy`m#dDT&;9McNx82l{Su3mAHA`0iBK`iivCy``e%zmBb+l2lV> zV-#hxY4XzXj_~-7iKGsRcv_q#kswIzt`hY)o6qqO2Ysx4fk4(Hj-!I4W@&q6HAc%Q zQHx7Kk1o&ZpsF>m28(8c`l_5+UWI+>p8}a<`4L6%#G@#%^Ld_qWEOy1$tuM zC9*pvvPT!Sl3r)}hmEGO(b($P_Le8G;|t!e5xL)w^l$Rb)ZbB0ZU}Bk@EtO)GP}`V zMlc>4vRyQmN%Y#o-?5Vek!^?N*Dvp4*ZTL2@%NS=;l@s`!+#Q-r%rq3`Mvy@<}>!v z0798xzc%p*SAoR?=XLK+yyIXFA_tFfA66nr-(7d}GUHL)ErQtMf)NSfhC*Do0RV-uGb+RLH< zDn0O0)aC(z(XJUZFVs=pqYA*<`bbJj8q;{M&|3t6uj|;VbGn6slUvnRkwy`)NsZDR z+YQ;Ip&Fb>LbjftA?%*k3RK|B7uYTp&?!k693}q_JdlHuYnd*TWF>EI1$S_L>bA5c z%i(hLmRYLJ2C_G#JeO#tfBM&mt$%1}%&9*gGaji}thd{`QH|84lN=+y%2)-7H6g5#lS3iM;U~xV9{`Zb9wK8|I8Cm@CU?-$?uMypzDbmHLTtPhqLUcg?y7ZOl!T(t~Di=Wvre!!T# zzkRN(9RUUMZdiCem1nGqrjO7w{Ct*Pe?XX;WsC>z;l50ihLF!M`mNtIYlYRhOl8ev=@U+m{&GpAH~&66iD01a3x6;(b7UZ# zrI=80}HU*^|Y=6@%PM&_OPjJxl&G{h}xSOU=kPHwRc{MDMi_7}$l(#ZF zGlg5iRy=-<8=m;Fhbu!S(@CZYEYfdzCe}yX55Lb>b=sWtp)ra7_|+cu+lCKaP4f)%hy>`L_<8_fUfuT!dF_N290A3zVSe-Alo zLPJYK&w<%^7`VSpB>kfGqgzF0xfH0+~4%67(;eK~Pt+<#{@AAQsC#m2Mg7UcF&^Fx>Q7p*$GkGS>JJvstY z`$ij%^xLi>yGsYU`^m2i*W!Cu2F+V@SQbY^5;IKxs}Jt^vF+N~F}xUf{Ny)7sB z=WGO5YV!K=*e7YJ{k!2|$>e&Kt~CEtxeJz$h6skZvgj+eg8_!#;xrmcSmRjC@`V;* z&D2D(;iV}q-7CJnwhU4`&b6;Gy(r^331?$&v9z}3CSgG}vgc+)?c_VTDHBDOswqLa z3*#yiMf^<$;xwKYhBGl!+c}F`8r@ZsSv5(6@)vuS2;mP~Oa1Js{Ie_kk=h&Qe+>3A zxeTjr>;jdJCvpp3 zI3x3G>eBJ}6u7Y>oLwz4Yf-Dh>Hg0TtbN=1cT)+eLNdOcn3O<-OH3 z%s$FEG^h4xe9MSS=Pm=I$q+}2TeQ6AlA+vkFXD~OAX9fu>usW!;kBF^yP(e42IO_E zmNGNUQTrM# zIG{HXbO1Qo)0M_I%|y4qPYoM&N}-*hj*bf!*Yp7dP9c?V(?UGU-7It5pvz-hp`ouy z=+2@9Em@WthN@{~{hERQdcNg)Jx!C@QOb2Gub__oRH6R*_F2ZyYi6mgg44hVH~{`l zts_O)_E5XuVfwZ~3k@5acOXnD>!Q43^)0`!bG}3&eO}=bTM^CQ7eOE{d0kZgMq|Ag z!rF}Jwv%KuTnGqw5&D-hB3&{&|Ay|6+iTtFY@BQ>2U*cSgyW2FOP7dIx2`TqQ=BQP^^6~>IIW3(7&wXM+*)I@G#;2C)Dy1oRsK#NzQ|`2 z(etT9)!%8pf3ItNw!hcIb)>ggF}D5H2~A7tijQSQ8BSDdYwNT{wm=@ti|su`I*&!R?dT zMb;kv;nkn)8_Se3oOtT5@U7Me@a4J-j^t=}tfW>c(8f29D_JI^X0?X1hpI5B#wvL< z&xT9&g^z<%vT>S)L^kPYCzKhQVw}g$l*b%v!Ie35PF>7wn=*)R#`R_wBBhF2Bd0mG z$HJ?`ws8@H`GJDJ0z#@3sG`J*D`OK%`tarBR?RX#Z<8yenJMV|z*)oTH>sM=rp?s{vQS95=Z2wpLKxo!Tq=Uamg{5SHfmCMn_ zXvig#?`_l8ByVP)?jahvNAb5*9sM3$1M^jGp^;>;q-i9UcePvS2Wc3axcY5FTYEfA z%vqqtWB<|{yJ60bX6}x0*5Ne2=I!E}1wq-B{rEBQiw<{3`MavZQ=KduF~XTH&yw%E z*INv82Uhv)?H!D=8~5zKJ0hC=yp$+yKbq8eyki*Mi5U*QqKV$i=zrqR`_%SdOYOvQ)4!rA{dCqaz=%uo(59pnCg-Z6*nyZq0aj7OiY zjqXDrI_3KE$whNlDh){IzJWc+M^5i8gI{m zRHb%zW*jDO9hrZwKJZYybk#sZqC&1s^%uD!|E;rz&_2XjP26NYt(2?F_3CC_k&`Ma z;py~ki(1Z2%bp_haIY5ECd|j07&$4PqPr$_d8f8_oIHro}=9mEqA+sYQB`*qJHh=Li{z&0ZZFw z(G@HH}lXTIb(d~{68m$oC zI+tFWChc&@J}^+H85<>f?e@^nSXSt=hJ=D-&wOp7JEzu_fo|36YztC0er+fFzNsmB zB#rD*cbqc%%BVZkr~a7+eU+CgvZ--b%>@j3vz({vL3o_2b__%`y#4qcYER9K#Dw@}_ zQhC{RP40an_R`kZ#u(s}jS@^gPq<(5luxS+1v7?)qYjrRi?^CsZo& zC(zdd>GC5^{kx z%hJ%$&=Go_TeoPAau)zpOHxWIadHO{v{H>85J|?o!$hzaoh&0R-R0ttcD~}JvPq;x zCSvO4cJP8nC2D1ljDNcknXxHzfZ4O=a@2In4nAX3{dYRKr1;Lab8{U^IgIr5^ryEh zq$uAlx8XSQ-^5}v9-qzv$PO^%Y=u&=22R9!FRI8rrdW|t1Fc1_3FyoZ)&@C zS;P0e|42Hmr{eV_lyDfX#0lBSx+^LX=g)NCa?qK2!L;ta;e-pTHSXubo;*@mP26r& z>4Ys~|G)rx9CttdC?ccmrlDaj4u`wn6yWxK1kQIzV>pCN6#{E?I#75Yk@t zNvVLOL?}m*Wr7Opz6?vJu{vx8Ry!{2B(q6d=Vq`h-$KwX?N^u=l^MYt(O~Q4*p|`A3ii8{-{P)P*Qa~lK&yLg)FUBCRGmy3XsGbpax?c`tV~h#uc4;rnDiHEj^VMK6WO1mJSlb} z#EjaCmoMoJ+oTwR>KK-;i8txZ$v-aZM~bLT7)3f%vzx-d14o_Xn>l`v1yQ%$`eM`z zRmQm@gt8ic(mFSm4na-rX1o-OM_5_${`p|)(Z0(!6VBeq%E4k~^hdA1Q(t#?x!4Jv z^+})Ttv>K4o#Jt04{}o$QB{O*o(eJx^ZSevVx?nz`}Q?Z7u2lxn`({H)7_7Xib|39 zF3ZwRehU|y0a?NBqXEA{3_On@1Y+@wbXd9%%IYGjjFj7F$I_`h$6xHEL}dzsIw=@X zt=0oj6kf>1!Qt%n7z9Fy0d7^F_!)%=a?=n|#ah7t`$$Mh zf_y56+{1#o`FTTg^TK2N>N5#9bh`jWs0spNr|q_{@SSRLoxBuX?Oh^}3i5gu-lPst zUFJj&-#-6MflA!A?bi@RcQtq(RK*RCu!>M3eRZI8w3o=U{unkT-vI!^O8YLpRM%#9 z(#W)8_Rx?Cm(R=_*&m@y4*<((4-}cfq$7wD{v?=>jFDJ8JQL-aC&e=E(%palO2$1a z_O3uOFyn92j2aUOXepumdw_8jiZUrmHc-*HmR&Dav|HGu;gY1VHvDMlx8Ui9#Z z+3JSfLL7*#o#T$tikq98lW-c6;PPMVayzL(-rH=TP!XMu*W86>_)frn78qIj8^VIF zZib{}xj=4J)h&=6tOH{&hST_Gws(-BnT1qOKa5{rUtcEl*sDYnW`IDzLOt@Tz}x&{ zoJrgo$ZubV{A0X;Imk-D8EGYT^`%Ypp+%nq0NOmaJENgwz!%$DV*T*}t5$R%A;ro@)bRxDACu|^#GiyDTCM3yRiwwplQ^L;a=a=Dlwvf5CF|@l0%kqXobe zhmD}NHzp43Wo4dY&=-TWYZET6>!z?;X4Th^lKkYkr5kppkUHk(sh&w4(AV$Qe8g>9 z=b_*F>Zn5L?Y2wNHWXVOPrU@qOzbg9BIa|IoMLyMctg5EN1l7Li9Byfb_A$@T@9rv zAp^Hb?YW5%l5{f;m@I1qbGjB~mdJxGxCl$wfIn$ULqs^zEG#&TJv2wCLw=P~(l9O= z13)ka*8SF(>GoIIP#p>f zgKXXy?$aI7_GEF5%F0SeO5)ytG}{I9nwzj7&i|IBetL!PjU4g4>gsgIrh?Ip8w4(> z4a_xjx~zRk_a(~cq{$C*tVW7|{Z-@&iXu=RU@-USGt7f9Bb{|0g6aUaKms0vk=xDK z;W-|`D?eG*&6~9El<+$3>CW-)CE_7cdBCoZCw9My>A(S6+we-4VBx)K!=8STFezbi+VS=9jx;Wz4bW>Y z0s{Z52Ah(KkgM|@&_v`QCdxnc)1qyOzJ4^@bHyXO#+D&f<8(xbF^ts!#ZH4@*)bFwgNmFq#M|Dr~_f z%;?37e}51vY%s@wn0gD=qqoQu5n6l??ub2Nd!f9epFc+aEu=(lK#Kns7)PY&m>wys zxoCbPS1CPHC4`!}pl4uSc(7O$!TZ>(-G{HAJX_f`k8jz-!jVdtiTFaq(t! zy5EW>jYOI|-3AJ;&oVYMF!W9C<7K6dgnT*6pdYQlu+4UW_wKukR{OS}x1 zTDc;kbt-9}vNvB+=5u^dH${?tH2+?eMTNGWzBdGa{{MR?xVf7z#2TZJYU?9R!vObTRQ`c>j$5j7ObI9uC4D! zJmOFC0Bcp=;``Un%od8ltj+dF+BDN?j<#;VqtL%T3ALkjuzA~rC@#5vxv>)K5P<$9 zlKi z-37bg;i`&?gubjV2>rvbKf#kiJOIJN7cSVNsOe6?bE>_E_`=Z0=!~Wb)-BWa>+4+D zfh6sA!5TD_wp#TheAes>jmrfIScn3=dPd`0F%(*Y`fXw@yGL9rh6QO+fETIN?y|oL z1%-p#Ua*n+v_QK7&;G(qs7%~j`VCSQm1D-M9f5l{EJ@u-6`;ASygL*1kwH!y9COm< z??hAQE(!+RA3S$IshO=FA_vk}Pux3r?wWC;?EavTLzEjvOhucigVjOQ17k|1QgELc zL8JZhqDqXy#!FD|%Y;b3(4yW12v0eb0_rHi6Xi>w5&u8?TU4av2d4m1Px>4vkHc05 z5RN_|-O8Em$wotSBgGAjllMHG#&PCxYal9a_KX!N;??r?F{?!mAZpa&D(*BL)`2gi zAbeF-LyYV;76wx!9O5>ASM=!i0Ky~}~m8WI{ zeauE?Z{Th1c>fY^;;HZ&gcCF1i;@HY*51%fS=A9tXXaqHz7E$1BFVKM;XIB&jQYmK z)(>j+XsFqbxFErJo)UZ4pI4ui3w_+d}VOk?Skv6RpWoH6779WJ>; zY-Ky~DZ*mpvJym1k}h@2VUqo@8KYqheXZ)U@7)2EkyO!q3T%AdQ>In-S_c~d>L=eDYQMg89r|8-fK00rhPxh0 z>_ask`HH)1H~RYf9ii(~7P|Z-VPsQ5|2IX-Ddl8KX>$#94pA1;y8JQ=@!Y({c S Date: Mon, 16 Sep 2019 21:18:07 -0400 Subject: [PATCH 029/125] fix io incompatibilities with pandas 0.25 --- scprep/io/csv.py | 13 ++++++----- scprep/io/mtx.py | 4 ++-- scprep/io/tenx.py | 6 ++--- scprep/io/utils.py | 21 ++++++++++-------- test/test_io.py | 55 +++++++++++++++++++++++----------------------- 5 files changed, 51 insertions(+), 48 deletions(-) diff --git a/scprep/io/csv.py b/scprep/io/csv.py index 6e65c51b..bf17bc55 100644 --- a/scprep/io/csv.py +++ b/scprep/io/csv.py @@ -4,13 +4,14 @@ import pandas as pd from .utils import _matrix_to_data_frame +from .. import utils def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs): - """Read a csv file into a pandas.SparseDataFrame + """Read a csv file into a pd.DataFrame[pd.SparseArray] """ chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs) - data = pd.concat(chunk.to_sparse(fill_value=fill_value) + data = pd.concat(utils.dataframe_to_sparse(chunk, fill_value=fill_value) for chunk in chunks) return data @@ -36,7 +37,7 @@ def load_csv(filename, cell_axis='row', delimiter=',', If `True`, we assume cell names are in the first row/column. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. **kwargs : optional arguments for `pd.read_csv`. @@ -44,7 +45,7 @@ def load_csv(filename, cell_axis='row', delimiter=',', ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ if cell_axis not in ['row', 'column', 'col']: @@ -113,7 +114,7 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', If `True`, we assume cell names are in the first row/column. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. **kwargs : optional arguments for `pd.read_csv`. @@ -121,7 +122,7 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter, diff --git a/scprep/io/mtx.py b/scprep/io/mtx.py index eb4d114e..598c3b42 100644 --- a/scprep/io/mtx.py +++ b/scprep/io/mtx.py @@ -21,14 +21,14 @@ def load_mtx(mtx_file, cell_axis='row', cell_names : `str`, array-like, or `None` (default: None) Expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. Returns ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ if cell_axis not in ['row', 'column', 'col']: diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py index 6da8f90b..6af20f47 100644 --- a/scprep/io/tenx.py +++ b/scprep/io/tenx.py @@ -95,7 +95,7 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', Returns ------- data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will + If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will be a pd.DataFrame. """ @@ -168,7 +168,7 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', Returns ------- data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will + If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will be a pd.DataFrame. """ @@ -247,7 +247,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', Returns ------- data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will + If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will be a pd.DataFrame. """ diff --git a/scprep/io/utils.py b/scprep/io/utils.py index 85cd9249..0574b742 100644 --- a/scprep/io/utils.py +++ b/scprep/io/utils.py @@ -6,6 +6,8 @@ import warnings import numpy as np +from .. import utils + def _parse_header(header, n_expected, header_type="gene_names"): """ @@ -93,7 +95,7 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): # dataframe with index and/or columns if sparse is None: # let the input data decide - sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data) + sparse = utils.is_sparse_dataframe(data) or sp.issparse(data) if sparse and gene_names is not None and \ len(np.unique(gene_names)) < len(gene_names): warnings.warn( @@ -101,18 +103,19 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): RuntimeWarning) sparse = False if sparse: - # return pandas.SparseDataFrame + # return pandas.DataFrame[SparseArray] if isinstance(data, pd.DataFrame): if gene_names is not None: data.columns = gene_names if cell_names is not None: data.index = cell_names - if not isinstance(data, pd.SparseDataFrame): - data = data.to_sparse(fill_value=0.0) + if not utils.is_sparse_dataframe(data): + data = utils.dataframe_to_sparse(data, fill_value=0.0) + elif sp.issparse(data): + data = pd.DataFrame.sparse.from_spmatrix(data, index=cell_names, columns=gene_names) else: - data = pd.SparseDataFrame(data, default_fill_value=0.0) - data.index = cell_names - data.columns = gene_names + data = pd.DataFrame(data, index=cell_names, columns=gene_names) + data = utils.dataframe_to_sparse(data, fill_value=0.0) else: # return pandas.DataFrame if isinstance(data, pd.DataFrame): @@ -120,8 +123,8 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): data.columns = gene_names if cell_names is not None: data.index = cell_names - if isinstance(data, pd.SparseDataFrame): - data = data.to_dense() + if utils.is_sparse_dataframe(data): + data = data.sparse.to_dense() else: if sp.issparse(data): data = data.toarray() diff --git a/test/test_io.py b/test/test_io.py index 302770a1..92c1af4a 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -1,7 +1,7 @@ from tools import data import scprep import scprep.io.utils -from sklearn.utils.testing import assert_warns_message, assert_raise_message +from sklearn.utils.testing import assert_warns_message, assert_raise_message, assert_raises import pandas as pd import numpy as np import os @@ -32,16 +32,16 @@ def test_10X_duplicate_gene_names(): def test_10X(): X = data.load_10X() assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b" X = data.load_10X(gene_labels='id', sparse=False) assert X.shape == (100, 100) assert isinstance(X, pd.DataFrame) - assert not isinstance(X, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "ENSMUSG00000030105" X = data.load_10X(gene_labels='both') assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b (ENSMUSG00000030105)" X_cellranger3 = scprep.io.load_10X( os.path.join(data.data_dir, "test_10X_cellranger3"), @@ -74,7 +74,7 @@ def test_10X_zip(): filename = os.path.join(data.data_dir, "test_10X.zip") X_zip = scprep.io.load_10X_zip( filename) - assert isinstance(X_zip, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) np.testing.assert_array_equal(X.index, X_zip.index) @@ -99,7 +99,7 @@ def test_10X_zip_url(): filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip" X_zip = scprep.io.load_10X_zip( filename) - assert isinstance(X_zip, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) np.testing.assert_array_equal(X.index, X_zip.index) @@ -114,9 +114,8 @@ def test_10X_zip_url_not_a_zip(): def test_10X_zip_url_not_a_real_website(): - assert_raise_message( + assert_raises( urllib.error.URLError, - "", scprep.io.load_10X_zip, 'http://invalid.not.a.url/scprep') @@ -142,19 +141,19 @@ def test_10X_HDF5(): h5_file = os.path.join(data.data_dir, "test_10X.h5") # automatic tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -162,7 +161,7 @@ def test_10X_HDF5(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -174,19 +173,19 @@ def test_10X_HDF5_cellranger3(): h5_file = os.path.join(data.data_dir, "test_10X_cellranger3.h5") # automatic tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -194,7 +193,7 @@ def test_10X_HDF5_cellranger3(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -276,7 +275,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X_csv.columns, X_csv4.index) np.testing.assert_array_equal(X_csv.index, X_csv4.columns) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=os.path.join( @@ -289,7 +288,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=X.columns, @@ -300,7 +299,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=None, @@ -309,7 +308,7 @@ def test_csv_and_tsv(): skiprows=1, usecols=range(1, 101)) assert np.sum(np.sum(X.to_numpy() != X_csv.to_numpy())) == 0 - assert isinstance(X_csv, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small_duplicate_gene_names.csv")) @@ -336,7 +335,7 @@ def test_mtx(): assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) - assert isinstance(X_mtx, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( filename, gene_names=X.columns, @@ -345,7 +344,7 @@ def test_mtx(): assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) - assert isinstance(X_mtx, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( filename, gene_names=None, @@ -360,14 +359,14 @@ def test_mtx(): "Expected 'row' or 'column'", scprep.io.load_mtx, filename, cell_axis='neither') - X = scprep.io.load_mtx( + X_mtx = scprep.io.load_mtx( filename, gene_names=np.arange(X.shape[1]).astype('str'), cell_names=np.arange(X.shape[0])) - assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) - assert X.columns[0] == "0" - assert X.index[0] == 0 + assert X_mtx.shape == (100, 100) + assert scprep.utils.is_sparse_dataframe(X_mtx) + assert X_mtx.columns[0] == "0" + assert X_mtx.index[0] == 0 def test_fcs(): @@ -383,7 +382,7 @@ def test_fcs(): assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal( - X.to_dense().to_numpy(), data[X.columns].to_numpy()) + X.sparse.to_dense().to_numpy(), data[X.columns].to_numpy()) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=False, override=True) assert set(meta.keys()) == set(X_meta.keys()) From 87b0d3da236ea25d99b4a6382fc294103fd773bb Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:05:56 -0400 Subject: [PATCH 030/125] fix scprep.select SparseDataFrame issues --- scprep/select.py | 6 ++++++ test/tools/matrix.py | 29 +++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/scprep/select.py b/scprep/select.py index 65dcef43..6973f5ff 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -467,6 +467,12 @@ def select_rows(data, *extra_data, idx=None, data = data.loc[np.array(data.index)[idx]] else: raise + # temporary workaround for https://github.com/pandas-dev/pandas/issues/27781 + if utils.is_sparse_dataframe(data): + for col in np.where(data.isna().any())[0]: + colname = data.columns[col] + if utils.is_sparse_series(data[colname]) and data[colname].isna().all(): + data[colname] = data[colname].fillna(data[colname].sparse.fill_value) elif _is_1d(data): if isinstance(data, list): # can't numpy index a list diff --git a/test/tools/matrix.py b/test/tools/matrix.py index ec741ddd..ad7113b3 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -15,7 +15,21 @@ def _no_warning_dia_matrix(*args, **kwargs): " diagonals is inefficient") return sparse.dia_matrix(*args, **kwargs) -SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) +def SparseDataFrame_deprecated(X, default_fill_value=0.0): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="SparseSeries is deprecated and will be removed in a future version.", + category=FutureWarning) + warnings.filterwarnings( + "ignore", + message="SparseDataFrame is deprecated and will be removed in a future version.", + category=FutureWarning) + return pd.SparseDataFrame(X, default_fill_value=default_fill_value) + +def SparseDataFrame(X, default_fill_value=0.0): + return pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) + _scipy_matrix_types = [ sparse.csr_matrix, @@ -36,22 +50,17 @@ def _no_warning_dia_matrix(*args, **kwargs): _pandas_sparse_matrix_types = [ SparseDataFrame, + SparseDataFrame_deprecated, ] -_pandas_matrix_types = [ - pd.DataFrame, - SparseDataFrame, -] +_pandas_matrix_types = _pandas_dense_matrix_types + _pandas_sparse_matrix_types _indexable_matrix_types = [ sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix, - sparse.dok_matrix, - np.array, - pd.DataFrame, - SparseDataFrame -] + sparse.dok_matrix +] + _numpy_matrix_types + _pandas_matrix_types def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): From be5eb2a012980b34e2df64dd84150c7f08afad9f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:07:21 -0400 Subject: [PATCH 031/125] fix scprep.utils tests for pandas 0.25 --- test/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 740ec074..6f9143b1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -161,14 +161,14 @@ def test_combine_batches_errors(): "append_to_cell_names only valid for pd.DataFrame input. " "Got coo_matrix", scprep.utils.combine_batches, - [X.to_coo(), X.iloc[:X.shape[0] // 2].to_coo()], + [X.sparse.to_coo(), X.iloc[:X.shape[0] // 2].sparse.to_coo()], batch_labels=[0, 1], append_to_cell_names=True) assert_raise_message( TypeError, - "Expected data all of the same class. Got SparseDataFrame, coo_matrix", + "Expected data all of the same class. Got DataFrame, coo_matrix", scprep.utils.combine_batches, - [X, X.iloc[:X.shape[0] // 2].to_coo()], + [X, X.iloc[:X.shape[0] // 2].sparse.to_coo()], batch_labels=[0, 1]) assert_raise_message( ValueError, From 7a5094039c79a190a488df8e0fe33f14a5bea4fe Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:09:18 -0400 Subject: [PATCH 032/125] refuse to batch mean center DataFrame[SparseArray] --- scprep/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/normalize.py b/scprep/normalize.py index 69501612..7c791c57 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -120,7 +120,7 @@ def batch_mean_center(data, sample_idx=None): data : array-like, shape=[n_samples, n_features] Batch mean-centered output data. """ - if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame): + if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame) or utils.is_sparse_dataframe(data): raise ValueError("Cannot mean center sparse data. " "Convert to dense matrix first.") if sample_idx is None: From c39b0ac708e85f904c87480ac490168dabc7b239 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:32:33 -0400 Subject: [PATCH 033/125] remove io.py --- scprep/io.py | 626 --------------------------------------------------- 1 file changed, 626 deletions(-) delete mode 100644 scprep/io.py diff --git a/scprep/io.py b/scprep/io.py deleted file mode 100644 index 94bb93c0..00000000 --- a/scprep/io.py +++ /dev/null @@ -1,626 +0,0 @@ -# author: Scott Gigante -# (C) 2018 Krishnaswamy Lab GPLv2 - -import pandas as pd -import scipy.io as sio -import scipy.sparse as sp -import warnings -import numpy as np -import os -import zipfile -import tempfile -import shutil -from decorator import decorator -from . import hdf5 - -try: - import fcsparser -except ImportError: - pass - -try: - FileNotFoundError -except NameError: - # py2 compatibility - FileNotFoundError = IOError - - -@decorator -def _with_fcsparser(fun, *args, **kwargs): - try: - fcsparser - except NameError: - raise ImportError( - "fcsparser not found. " - "Please install it with e.g. `pip install --user fcsparser`") - return fun(*args, **kwargs) - - -def _parse_header(header, n_expected, header_type="gene_names"): - """ - Parameters - ---------- - header : `str` filename, array-like or `None` - - n_expected : `int` - Expected header length - - header_type : argument name for error printing - - Returns - ------- - columns : list-like or `None` - Parsed column names. - """ - if header is None or header is False: - return None - elif isinstance(header, str): - # treat as a file - if header.endswith("tsv"): - delimiter = "\t" - else: - delimiter = "," - columns = pd.read_csv(header, delimiter=delimiter, - header=None).to_numpy().flatten().astype(str) - if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header, len(columns))) - else: - # treat as list - columns = header - if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header_type, len(columns))) - return columns - - -def _parse_gene_names(header, data): - return _parse_header(header, data.shape[1], - header_type="gene_names") - - -def _parse_cell_names(header, data): - return _parse_header(header, data.shape[0], - header_type="cell_names") - - -def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): - """Return the optimal data type given data, gene names and cell names. - - Parameters - ---------- - - data : array-like - - gene_names : `str`, array-like or `None` (default: None) - Either a filename or an array containing a list of gene symbols or ids. - - cell_names : `str`, array-like or `None` (default: None) - Either a filename or an array containing a list of cell barcodes. - - sparse : `bool` or `None` (default: None) - If not `None`, overrides default sparsity of the data. - """ - if gene_names is None and cell_names is None and \ - not isinstance(data, pd.DataFrame): - # just a matrix - if sparse is not None: - if sparse: - if not sp.issparse(data): - # return scipy.sparse.csr_matrix - data = sp.csr_matrix(data) - elif sp.issparse(data) and not sparse: - # return numpy.ndarray - data = data.toarray() - else: - # return data as is - pass - return data - else: - gene_names = _parse_gene_names(gene_names, data) - cell_names = _parse_cell_names(cell_names, data) - # dataframe with index and/or columns - if sparse is None: - # let the input data decide - sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data) - if sparse and gene_names is not None and \ - len(np.unique(gene_names)) < len(gene_names): - warnings.warn( - "Duplicate gene names detected! Forcing dense matrix", - RuntimeWarning) - sparse = False - if sparse: - # return pandas.SparseDataFrame - if isinstance(data, pd.DataFrame): - if gene_names is not None: - data.columns = gene_names - if cell_names is not None: - data.index = cell_names - if not isinstance(data, pd.SparseDataFrame): - data = data.to_sparse(fill_value=0.0) - else: - data = pd.SparseDataFrame(data, default_fill_value=0.0) - data.index = cell_names - data.columns = gene_names - else: - # return pandas.DataFrame - if isinstance(data, pd.DataFrame): - if gene_names is not None: - data.columns = gene_names - if cell_names is not None: - data.index = cell_names - if isinstance(data, pd.SparseDataFrame): - data = data.to_dense() - else: - if sp.issparse(data): - data = data.toarray() - data = pd.DataFrame(data, index=cell_names, columns=gene_names) - - # convert data to float - data = data.astype(float) - return data - - -def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs): - """Read a csv file into a pandas.SparseDataFrame - """ - chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs) - data = pd.concat(chunk.to_sparse(fill_value=fill_value) - for chunk in chunks) - return data - - -def load_csv(filename, cell_axis='row', delimiter=',', - gene_names=True, cell_names=True, - sparse=False, **kwargs): - """Load a csv file - - Parameters - ---------- - filename : str - The name of the csv file to be loaded - cell_axis : {'row', 'column'}, optional (default: 'row') - If your data has genes on the rows and cells on the columns, use - cell_axis='column' - delimiter : str, optional (default: ',') - Use '\\t' for tab separated values (tsv) - gene_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume gene names are in the first row/column. Otherwise - expects a filename or an array containing a list of gene symbols or ids - cell_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume cell names are in the first row/column. Otherwise - expects a filename or an array containing a list of cell barcodes. - sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. This uses less memory - but more CPU. - **kwargs : optional arguments for `pd.read_csv`. - - Returns - ------- - data : array-like, shape=[n_samples, n_features] - If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray - or scipy.sparse.spmatrix - """ - if cell_axis not in ['row', 'column', 'col']: - raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) - - if 'index_col' in kwargs: - # override - index_col = kwargs['index_col'] - cell_names = None - del kwargs['index_col'] - elif cell_names is True: - index_col = 0 - cell_names = None - else: - index_col = None - - if 'header' in kwargs: - # override - header = kwargs['header'] - del kwargs['header'] - gene_names = None - elif gene_names is True: - header = 0 - gene_names = None - else: - header = None - - # Read in csv file - if sparse: - read_fun = _read_csv_sparse - else: - read_fun = pd.read_csv - data = read_fun(filename, delimiter=delimiter, - header=header, index_col=index_col, - **kwargs) - - if cell_axis in ['column', 'col']: - data = data.T - - data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return data - - -def load_tsv(filename, cell_axis='row', delimiter='\t', - gene_names=True, cell_names=True, - sparse=False, **kwargs): - """Load a tsv file - - Parameters - ---------- - filename : str - The name of the csv file to be loaded - cell_axis : {'row', 'column'}, optional (default: 'row') - If your data has genes on the rows and cells on the columns, use - cell_axis='column' - delimiter : str, optional (default: '\\t') - Use ',' for comma separated values (csv) - gene_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume gene names are in the first row/column. Otherwise - expects a filename or an array containing a list of gene symbols or ids - cell_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume cell names are in the first row/column. Otherwise - expects a filename or an array containing a list of cell barcodes. - sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. This uses less memory - but more CPU. - **kwargs : optional arguments for `pd.read_csv`. - - Returns - ------- - data : array-like, shape=[n_samples, n_features] - If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray - or scipy.sparse.spmatrix - """ - return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter, - gene_names=gene_names, cell_names=cell_names, - sparse=sparse, **kwargs) - - -@_with_fcsparser -def load_fcs(filename, gene_names=True, cell_names=True, - sparse=None, - metadata_channels=['Time', 'Event_length', 'DNA1', 'DNA2', - 'Cisplatin', 'beadDist', 'bead1'], - reformat_meta=True, - **kwargs): - """Load a fcs file - - Parameters - ---------- - filename : str - The name of the fcs file to be loaded - gene_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume gene names are contained in the file. Otherwise - expects a filename or an array containing a list of gene symbols or ids - cell_names : `bool`, `str`, array-like, or `None` (default: True) - If `True`, we assume cell names are contained in the file. Otherwise - expects a filename or an array containing a list of cell barcodes. - sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory - but more CPU. - metadata_channels : list-like, optional, shape=[n_meta] (default: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1']) - Channels to be excluded from the data - reformat_meta : bool, optional (default: True) - If true, the meta data is reformatted with the channel information - organized into a DataFrame and moved into the '_channels_' key - **kwargs : optional arguments for `fcsparser.parse`. - - Returns - ------- - channel_metadata : dict - FCS metadata - cell_metadata : array-like, shape=[n_samples, n_meta] - Values from metadata channels - data : array-like, shape=[n_samples, n_features] - If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray - or scipy.sparse.spmatrix - """ - if cell_names is True: - cell_names = None - if gene_names is True: - gene_names = None - # Parse the fcs file - channel_metadata, data = fcsparser.parse( - filename, reformat_meta=reformat_meta, **kwargs) - metadata_channels = data.columns.intersection(metadata_channels) - data_channels = data.columns.difference(metadata_channels) - cell_metadata = data[metadata_channels] - data = data[data_channels] - data = _matrix_to_data_frame(data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return channel_metadata, cell_metadata, data - - -def load_mtx(mtx_file, cell_axis='row', - gene_names=None, cell_names=None, sparse=None): - """Load a mtx file - - Parameters - ---------- - filename : str - The name of the mtx file to be loaded - cell_axis : {'row', 'column'}, optional (default: 'row') - If your data has genes on the rows and cells on the columns, use - cell_axis='column' - gene_names : `str`, array-like, or `None` (default: None) - Expects a filename or an array containing a list of gene symbols or ids - cell_names : `str`, array-like, or `None` (default: None) - Expects a filename or an array containing a list of cell barcodes. - sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory - but more CPU. - - Returns - ------- - data : array-like, shape=[n_samples, n_features] - If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray - or scipy.sparse.spmatrix - """ - if cell_axis not in ['row', 'column', 'col']: - raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) - # Read in mtx file - data = sio.mmread(mtx_file) - if cell_axis in ['column', 'col']: - data = data.T - data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) - return data - - -def _combine_gene_id(symbols, ids): - """Creates gene labels of the form SYMBOL (ID) - - Parameters - ---------- - - genes: pandas.DataFrame with columns['symbol', 'id'] - - Returns - ------- - - pandas.Index with combined gene symbols and ids - """ - columns = np.core.defchararray.add( - np.array(symbols, dtype=str), ' (') - columns = np.core.defchararray.add( - columns, np.array(ids, dtype=str)) - columns = np.core.defchararray.add(columns, ')') - return columns - - -def _parse_10x_genes(symbols, ids, gene_labels='symbol', - allow_duplicates=True): - assert gene_labels in ['symbol', 'id', 'both'] - if gene_labels == 'both': - columns = _combine_gene_id(symbols, ids) - if gene_labels == 'symbol': - columns = symbols - if not allow_duplicates and len(np.unique(columns)) < len(columns): - warnings.warn( - "Duplicate gene names detected! Forcing `gene_labels='id'`. " - "Alternatively, try `gene_labels='both'`, " - "`allow_duplicates=True`, or load the matrix" - " with `sparse=False`", RuntimeWarning) - gene_labels = 'id' - if gene_labels == 'id': - columns = ids - return columns - - -def load_10X(data_dir, sparse=True, gene_labels='symbol', - allow_duplicates=None): - """Basic IO for 10X data produced from the 10X Cellranger pipeline. - - A default run of the `cellranger count` command will generate gene-barcode - matrices for secondary analysis. For both "raw" and "filtered" output, - directories are created containing three files: - 'matrix.mtx', 'barcodes.tsv', 'genes.tsv'. - Running `scprep.io.load_10X(data_dir)` will return a Pandas DataFrame with - genes as columns and cells as rows. - - Parameters - ---------- - data_dir: string - path to input data directory - expects 'matrix.mtx', 'genes.tsv', 'barcodes.tsv' to be present and - will raise an error otherwise - sparse: boolean - If True, a sparse Pandas DataFrame is returned. - gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol' - Whether the columns of the dataframe should contain gene ids or gene - symbols. If 'both', returns symbols followed by ids in parentheses. - allow_duplicates : bool, optional (default: None) - Whether or not to allow duplicate gene names. If None, duplicates are - allowed for dense input but not for sparse input. - - Returns - ------- - data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will - be a pd.DataFrame. - """ - - if gene_labels not in ['id', 'symbol', 'both']: - raise ValueError( - "gene_labels='{}' not recognized. " - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) - - if not os.path.isdir(data_dir): - raise FileNotFoundError( - "{} is not a directory".format(data_dir)) - - try: - m = sio.mmread(os.path.join(data_dir, "matrix.mtx")) - genes = pd.read_csv(os.path.join(data_dir, "genes.tsv"), - delimiter='\t', header=None) - if genes.shape[1] == 2: - # Cellranger < 3.0 - genes.columns = ['id', 'symbol'] - else: - # Cellranger >= 3.0 - genes.columns = ['id', 'symbol', 'measurement'] - barcodes = pd.read_csv(os.path.join(data_dir, "barcodes.tsv"), - delimiter='\t', header=None) - - except (FileNotFoundError, IOError): - raise FileNotFoundError( - "'matrix.mtx', 'genes.tsv', and 'barcodes.tsv' must be present " - "in {}".format(data_dir)) - - cell_names = barcodes[0] - if allow_duplicates is None: - allow_duplicates = not sparse - gene_names = _parse_10x_genes(genes['symbol'].to_numpy().astype(str), - genes['id'].to_numpy().astype(str), - gene_labels=gene_labels, - allow_duplicates=allow_duplicates) - - data = _matrix_to_data_frame(m.T, cell_names=cell_names, - gene_names=gene_names, - sparse=sparse) - return data - - -def load_10X_zip(filename, sparse=True, gene_labels='symbol', - allow_duplicates=None): - """Basic IO for zipped 10X data produced from the 10X Cellranger pipeline. - - Runs `load_10X` after unzipping the data contained in `filename` - - Parameters - ---------- - filename: string - path to zipped input data directory - expects 'matrix.mtx', 'genes.tsv', 'barcodes.tsv' to be present and - will raise an error otherwise - sparse: boolean - If True, a sparse Pandas DataFrame is returned. - gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol' - Whether the columns of the dataframe should contain gene ids or gene - symbols. If 'both', returns symbols followed by ids in parentheses. - allow_duplicates : bool, optional (default: None) - Whether or not to allow duplicate gene names. If None, duplicates are - allowed for dense input but not for sparse input. - - Returns - ------- - data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will - be a pd.DataFrame. - """ - - if gene_labels not in ['id', 'symbol', 'both']: - raise ValueError( - "gene_labels='{}' not recognized. " - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) - - tmpdir = tempfile.mkdtemp() - with zipfile.ZipFile(filename) as handle: - files = handle.namelist() - if len(files) != 4: - valid = False - else: - dirname = files[0].strip("/") - subdir_files = [f.split("/")[-1] for f in files] - valid = ("barcodes.tsv" in subdir_files and - "genes.tsv" in subdir_files and - "matrix.mtx" in subdir_files) - if not valid: - raise ValueError( - "Expected a single zipped folder containing 'matrix.mtx', " - "'genes.tsv', and 'barcodes.tsv'. Got {}".format(files)) - handle.extractall(path=tmpdir) - data = load_10X(os.path.join(tmpdir, dirname)) - shutil.rmtree(tmpdir) - return data - - -@hdf5.with_HDF5 -def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', - allow_duplicates=None, backend=None): - """Basic IO for HDF5 10X data produced from the 10X Cellranger pipeline. - - Equivalent to `load_10X` but for HDF5 format. - - Parameters - ---------- - filename: string - path to HDF5 input data - genome : str or None, optional (default: None) - Name of the genome to which CellRanger ran analysis. If None, selects - the first available genome, and prints all available genomes if more - than one is available. - sparse: boolean - If True, a sparse Pandas DataFrame is returned. - gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol' - Whether the columns of the dataframe should contain gene ids or gene - symbols. If 'both', returns symbols followed by ids in parentheses. - allow_duplicates : bool, optional (default: None) - Whether or not to allow duplicate gene names. If None, duplicates are - allowed for dense input but not for sparse input. - backend : string, {'tables', 'h5py' or None} optional, default: None - Selects the HDF5 backend. By default, selects whichever is available, - using tables if both are available. - - Returns - ------- - data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will - be a pd.DataFrame. - """ - - if gene_labels not in ['id', 'symbol', 'both']: - raise ValueError( - "gene_labels='{}' not recognized. " - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) - - with hdf5.open_file(filename, 'r', backend=backend) as f: - if genome is None: - genomes = hdf5.list_nodes(f) - print_genomes = ", ".join(genomes) - genome = genomes[0] - if len(genomes) > 1: - print("Available genomes: {}. Selecting {} by default".format( - print_genomes, genome)) - try: - group = hdf5.get_node(f, genome) - except (AttributeError, KeyError): - genomes = hdf5.list_nodes(f) - print_genomes = ", ".join(genomes) - raise ValueError( - "Genome {} not found in {}. " - "Available genomes: {}".format(genome, filename, - print_genomes)) - if allow_duplicates is None: - allow_duplicates = not sparse - gene_names = _parse_10x_genes( - symbols=[g.decode() for g in hdf5.get_values( - hdf5.get_node(group, 'gene_names'))], - ids=[g.decode() - for g in hdf5.get_values(hdf5.get_node(group, 'genes'))], - gene_labels=gene_labels, allow_duplicates=allow_duplicates) - cell_names = [b.decode() - for b in hdf5.get_values(hdf5.get_node(group, 'barcodes'))] - data = hdf5.get_values(hdf5.get_node(group, 'data')) - indices = hdf5.get_values(hdf5.get_node(group, 'indices')) - indptr = hdf5.get_values(hdf5.get_node(group, 'indptr')) - shape = hdf5.get_values(hdf5.get_node(group, 'shape')) - data = sp.csc_matrix((data, indices, indptr), shape=shape) - data = _matrix_to_data_frame(data.T, - gene_names=gene_names, - cell_names=cell_names, - sparse=sparse) - return data From 21b9f00601657c726b79c14cee04852b3732183f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:32:43 -0400 Subject: [PATCH 034/125] fix fcs docs --- scprep/io/fcs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py index dd398f67..c0edf180 100644 --- a/scprep/io/fcs.py +++ b/scprep/io/fcs.py @@ -242,7 +242,7 @@ def load_fcs(filename, gene_names=True, cell_names=True, If `True`, we assume cell names are contained in the file. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[SparseArray]. This uses less memory but more CPU. metadata_channels : list-like, optional, shape=[n_meta] (default: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1']) Channels to be excluded from the data @@ -273,7 +273,7 @@ def load_fcs(filename, gene_names=True, cell_names=True, Values from metadata channels data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ if cell_names is True: From 0735b74e37515625a05edbe6d0368b3c9b32bacd Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:33:18 -0400 Subject: [PATCH 035/125] avoid densifying sparse dataframe in unique --- scprep/filter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scprep/filter.py b/scprep/filter.py index 58051c37..61dd2d31 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -330,6 +330,8 @@ def _find_unique_cells(data): """ if isinstance(data, pd.SparseDataFrame): unique_idx = _find_unique_cells(data.to_coo()) + elif utils.is_sparse_dataframe(data): + unique_idx = _find_unique_cells(data.sparse.to_coo()) elif isinstance(data, pd.DataFrame): unique_idx = ~data.duplicated() elif isinstance(data, np.ndarray): From a4f538db2771f1145170fd6f5498850410ee5b4c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:33:44 -0400 Subject: [PATCH 036/125] fix normalize for pandas 0.25 --- scprep/normalize.py | 16 +++++++++------- scprep/utils.py | 16 +++++++++++++++- test/tools/matrix.py | 1 + test/tools/utils.py | 17 +++++++++++++++-- 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/scprep/normalize.py b/scprep/normalize.py index 7c791c57..bc333f1a 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -64,13 +64,15 @@ def library_size_normalize(data, rescale='median', """ # pandas support columns, index = None, None - if isinstance(data, pd.SparseDataFrame) or \ - pd.api.types.is_sparse(data): + if isinstance(data, pd.DataFrame): columns, index = data.columns, data.index - data = data.to_coo() - elif isinstance(data, pd.DataFrame): - columns, index = data.columns, data.index - data = data.values + if utils.is_sparse_dataframe(data): + data = data.sparse.to_coo() + elif isinstance(data, pd.SparseDataFrame): + data = data.to_coo() + else: + # dense data + data = data.to_numpy() calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31) @@ -91,7 +93,7 @@ def library_size_normalize(data, rescale='median', if columns is not None: # pandas dataframe if sparse.issparse(data_norm): - data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0) + data_norm = utils.SparseDataFrame(data_norm, default_fill_value=0.0) else: data_norm = pd.DataFrame(data_norm) data_norm.columns = columns diff --git a/scprep/utils.py b/scprep/utils.py index 8dbe1e28..447fed17 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -176,10 +176,24 @@ def is_sparse_series(x): return False -def dataframe_to_sparse(x, fill_value=0): +def dataframe_to_sparse(x, fill_value=0.0): return x.astype(pd.SparseDtype(float, fill_value=fill_value)) +def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0): + if sparse.issparse(X): + X = pd.DataFrame.sparse.from_spmatrix(X) + X.sparse.fill_value = default_fill_value + elif not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + X = dataframe_to_sparse(X, fill_value=default_fill_value) + if columns is not None: + X.columns = columns + if index is not None: + X.index = index + return X + + def matrix_transform(data, fun, *args, **kwargs): """Perform a numerical transformation to data diff --git a/test/tools/matrix.py b/test/tools/matrix.py index ad7113b3..7bb6f887 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -27,6 +27,7 @@ def SparseDataFrame_deprecated(X, default_fill_value=0.0): category=FutureWarning) return pd.SparseDataFrame(X, default_fill_value=default_fill_value) + def SparseDataFrame(X, default_fill_value=0.0): return pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) diff --git a/test/tools/utils.py b/test/tools/utils.py index 092a2cf9..73775cad 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -104,6 +104,18 @@ def assert_transform_raises(X, transform, exception=ValueError, **kwargs): assert_raises(exception, transform, X, **kwargs) +def _is_sparse_dataframe(X): + return isinstance(X, pd.SparseDataFrame) or \ + (isinstance(X, pd.DataFrame) and hasattr(X, "sparse")) + + +def _sparse_dataframe_density(X): + try: + return X.sparse.density + except AttributeError: + return X.density + + def assert_matrix_class_equivalent(X, Y): """Check the format of X and Y are the same @@ -117,11 +129,12 @@ def assert_matrix_class_equivalent(X, Y): if sparse.issparse(X): assert sparse.issparse(Y) assert X.tocoo().nnz == Y.tocoo().nnz + elif _is_sparse_dataframe(X): + assert _is_sparse_dataframe(Y) + assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) else: assert type(X) == type(Y) if isinstance(X, pd.DataFrame): assert np.all(X.columns == Y.columns) assert np.all(X.index == Y.index) - if isinstance(X, pd.SparseDataFrame) or isinstance(Y, pd.SparseDataFrame): - assert X.density == Y.density return True From a2f64dbbac9fb1bc4833c82265ca702eadf90e69 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 22:36:31 -0400 Subject: [PATCH 037/125] fix test_sanitize for new pandas sparse --- test/test_sanitize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_sanitize.py b/test/test_sanitize.py index 9ed4ef30..421719e2 100644 --- a/test/test_sanitize.py +++ b/test/test_sanitize.py @@ -20,7 +20,8 @@ def test_check_numeric_inplace(): utils.assert_transform_unchanged, matrix._scipy_matrix_types + matrix._numpy_matrix_types + - matrix._pandas_dense_matrix_types, + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame], transform=scprep.sanitize.check_numeric, copy=False) assert_raise_message( @@ -28,7 +29,7 @@ def test_check_numeric_inplace(): "pd.SparseDataFrame does not support " "copy=False. Please use copy=True.", scprep.sanitize.check_numeric, - data=X, copy=False) + data=matrix.SparseDataFrame_deprecated(X), copy=False) class TypeErrorClass(object): From aa461c829e015bef943276a6ee2c0ee98930ea47 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 23:07:14 -0400 Subject: [PATCH 038/125] fix ufunc densification from pandas 0.25; workaround for https://github.com/pandas-dev/pandas/issues/28476 --- scprep/utils.py | 2 +- test/tools/matrix.py | 9 ++++++++- test/tools/utils.py | 12 ++++-------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 447fed17..99b6209e 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -211,7 +211,7 @@ def matrix_transform(data, fun, *args, **kwargs): data : array-like, shape=[n_samples, n_features] Transformed output data """ - if isinstance(data, pd.SparseDataFrame): + if is_sparse_dataframe(data) or isinstance(data, pd.SparseDataFrame): data = data.copy() for col in data.columns: data[col] = fun(data[col], *args, **kwargs) diff --git a/test/tools/matrix.py b/test/tools/matrix.py index 7bb6f887..107305ad 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -64,6 +64,13 @@ def SparseDataFrame(X, default_fill_value=0.0): ] + _numpy_matrix_types + _pandas_matrix_types +def _typename(X): + if isinstance(X, pd.DataFrame) and hasattr(X, "sparse"): + return "DataFrame[SparseArray]" + else: + return type(X).__name__ + + def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): """Test a function across a range of matrix types @@ -81,7 +88,7 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): test_fun(Y, *args, **kwargs) except Exception as e: raise RuntimeError("{} with {} input to {}\n{}".format( - type(e).__name__, type(Y).__name__, test_fun.__name__, + type(e).__name__, _typename(Y), test_fun.__name__, str(e))) diff --git a/test/tools/utils.py b/test/tools/utils.py index 73775cad..883f70e9 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -3,6 +3,7 @@ import pandas as pd from nose.tools import assert_raises from scprep.utils import toarray +from . import matrix def assert_all_equal(X, Y): @@ -40,15 +41,10 @@ def assert_transform_equals(X, Y, transform, check=assert_all_equal, **kwargs): ------- Y2 : returned value of transform(X, **kwargs) """ - try: - Y2 = transform(X, **kwargs) - except Exception as e: - raise RuntimeError("{} with {} input to {}\n{}".format( - type(e).__name__, type(X).__name__, transform, - str(e))) + Y2 = transform(X, **kwargs) check(Y, Y2), "{} failed on {}".format( transform, - type(X).__name__) + matrix._typename(X)) return Y2 @@ -89,7 +85,7 @@ def assert_transform_equivalent(X, Y, transform, check=assert_all_equal, Y2 = assert_transform_equals(X, Y, transform, check=check, **kwargs) assert assert_matrix_class_equivalent(X, Y2), \ "{} produced inconsistent matrix output".format( - type(X).__name__) + _typename(X)) def assert_transform_raises(X, transform, exception=ValueError, **kwargs): From fcc1778ee9df8b3816ddc1346bef57486cb7dd83 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 23:15:33 -0400 Subject: [PATCH 039/125] catch new sparse data in reduce.pca --- scprep/reduce.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scprep/reduce.py b/scprep/reduce.py index 04e6d3e5..5b96a951 100644 --- a/scprep/reduce.py +++ b/scprep/reduce.py @@ -279,6 +279,8 @@ def pca(data, n_components=100, eps=0.3, # handle dataframes if isinstance(data, pd.SparseDataFrame): data = data.to_coo() + elif utils.is_sparse_dataframe(data): + data = data.sparse.to_coo() elif isinstance(data, pd.DataFrame): data = data.to_numpy() From b269ff422ebe26ecf735ef0b7e2faee50b03a15f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 23:40:35 -0400 Subject: [PATCH 040/125] fix row-wise matrix vector multiplication for pandas 0.24 --- scprep/utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 99b6209e..1b9f1ed3 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -316,14 +316,18 @@ def matrix_vector_elementwise_multiply(data, multiplier, axis=None): data.shape[1], multiplier.shape)) multiplier = multiplier.reshape(1, -1) - if isinstance(data, pd.SparseDataFrame): + if isinstance(data, pd.SparseDataFrame) or is_sparse_dataframe(data): data = data.copy() multiplier = multiplier.flatten() if axis == 0: - data = data.T - for col, mult in zip(data.columns, multiplier): - data[col] = data[col] * mult - data = data.T + for col in data.columns: + try: + mult_indices = data[col].values.sp_index.indices + except AttributeError: + mult_indices = data[col].values.sp_index.to_int_index().indices + new_data = data[col].values.sp_values * multiplier[mult_indices] + data[col].values.sp_values.put(np.arange(data[col].sparse.npoints), + new_data) else: for col, mult in zip(data.columns, multiplier): data[col] = data[col] * mult From 672da543db3f9cf151f6938020e0890d0e6188b3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 16 Sep 2019 23:59:43 -0400 Subject: [PATCH 041/125] fix to_coo in test_stats --- test/test_stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_stats.py b/test/test_stats.py index cd4c4e62..51023337 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -163,7 +163,7 @@ def test_differential_expression(measure, direction): assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0] assert np.allclose(result[measure][0], expected_results[(measure, direction)][1]) - result_unnamed = scprep.stats.differential_expression(X.iloc[:20].to_coo(), X.iloc[20:100].to_coo(), + result_unnamed = scprep.stats.differential_expression(X.iloc[:20].sparse.to_coo(), X.iloc[20:100].sparse.to_coo(), measure=measure, direction=direction) if direction != 'both': values = result[measure] @@ -214,12 +214,12 @@ def test_differential_expression_error(): ValueError, "Expected gene_names to have length {}. " "Got {}".format(X.shape[0], X.shape[0]//2), scprep.stats.differential_expression, - X.to_coo(), X.to_coo(), gene_names=np.arange(X.shape[0]//2)) + X.sparse.to_coo(), X.sparse.to_coo(), gene_names=np.arange(X.shape[0]//2)) assert_raise_message( ValueError, "Expected gene_names to have length {}. " "Got {}".format(X.shape[0], X.shape[0]//2), scprep.stats.differential_expression_by_cluster, - X.to_coo(), np.random.choice(2, X.shape[0], replace=True), + X.sparse.to_coo(), np.random.choice(2, X.shape[0], replace=True), gene_names=np.arange(X.shape[0]//2)) assert_warns_message( UserWarning, "Input data has inconsistent column names. " From accb71e582ef7acdfd193565231f917023341140 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 17 Sep 2019 00:05:44 -0400 Subject: [PATCH 042/125] set h5py file mode --- test/test_hdf5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_hdf5.py b/test/test_hdf5.py index 26831f3c..855fe847 100644 --- a/test/test_hdf5.py +++ b/test/test_hdf5.py @@ -26,7 +26,7 @@ def test_failed_import_h5py(): h5py = scprep.io.hdf5.h5py del scprep.io.hdf5.h5py assert hdf5_available() is True - with h5py.File(h5_file) as f: + with h5py.File(h5_file, 'r') as f: assert scprep.io.hdf5._is_h5py(f) is False scprep.io.hdf5.h5py = h5py From a913fcb62194105aed7c47d14c2c00aff8e12bd8 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 17 Sep 2019 00:32:43 -0400 Subject: [PATCH 043/125] quiet sparse warnings --- scprep/utils.py | 4 ++-- test/tools/matrix.py | 40 +++++++++++++++++++++++++++++----------- test/tools/utils.py | 5 +++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/scprep/utils.py b/scprep/utils.py index 1b9f1ed3..716d1d4c 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -157,7 +157,7 @@ def to_array_or_spmatrix(x): def is_sparse_dataframe(x): - if isinstance(x, pd.DataFrame): + if isinstance(x, pd.DataFrame) and not isinstance(x, pd.SparseDataFrame): try: x.sparse return True @@ -167,7 +167,7 @@ def is_sparse_dataframe(x): def is_sparse_series(x): - if isinstance(x, pd.Series): + if isinstance(x, pd.Series) and not isinstance(x, pd.SparseSeries): try: x.sparse return True diff --git a/test/tools/matrix.py b/test/tools/matrix.py index 107305ad..8b688f96 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -5,6 +5,28 @@ from functools import partial +def _ignore_pandas_sparse_warning(): + warnings.filterwarnings( + "ignore", + category=FutureWarning, + message="SparseSeries") + warnings.filterwarnings( + "ignore", + category=FutureWarning, + message="SparseDataFrame") + + +def _reset_pandas_sparse_warning(): + warnings.filterwarnings( + "default", + category=FutureWarning, + message="SparseSeries") + warnings.filterwarnings( + "default", + category=FutureWarning, + message="SparseDataFrame") + + def _no_warning_dia_matrix(*args, **kwargs): """Helper function to silently create diagonal matrix""" with warnings.catch_warnings(): @@ -16,16 +38,7 @@ def _no_warning_dia_matrix(*args, **kwargs): return sparse.dia_matrix(*args, **kwargs) def SparseDataFrame_deprecated(X, default_fill_value=0.0): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message="SparseSeries is deprecated and will be removed in a future version.", - category=FutureWarning) - warnings.filterwarnings( - "ignore", - message="SparseDataFrame is deprecated and will be removed in a future version.", - category=FutureWarning) - return pd.SparseDataFrame(X, default_fill_value=default_fill_value) + return pd.SparseDataFrame(X, default_fill_value=default_fill_value) def SparseDataFrame(X, default_fill_value=0.0): @@ -65,7 +78,7 @@ def SparseDataFrame(X, default_fill_value=0.0): def _typename(X): - if isinstance(X, pd.DataFrame) and hasattr(X, "sparse"): + if isinstance(X, pd.DataFrame) and not isinstance(X, pd.SparseDataFrame) and hasattr(X, "sparse"): return "DataFrame[SparseArray]" else: return type(X).__name__ @@ -83,6 +96,8 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): **kwargs : keyword arguments for test_fun """ for fun in matrix_types: + if fun is SparseDataFrame_deprecated: + _ignore_pandas_sparse_warning() Y = fun(X.copy()) try: test_fun(Y, *args, **kwargs) @@ -90,6 +105,9 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): raise RuntimeError("{} with {} input to {}\n{}".format( type(e).__name__, _typename(Y), test_fun.__name__, str(e))) + finally: + if fun is SparseDataFrame_deprecated: + _reset_pandas_sparse_warning() def test_dense_matrix_types(X, test_fun, *args, **kwargs): diff --git a/test/tools/utils.py b/test/tools/utils.py index 883f70e9..4f657045 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -125,11 +125,12 @@ def assert_matrix_class_equivalent(X, Y): if sparse.issparse(X): assert sparse.issparse(Y) assert X.tocoo().nnz == Y.tocoo().nnz - elif _is_sparse_dataframe(X): + elif isinstance(X, pd.SparseDataFrame): assert _is_sparse_dataframe(Y) - assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) else: assert type(X) == type(Y) + if _is_sparse_dataframe(X): + assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) if isinstance(X, pd.DataFrame): assert np.all(X.columns == Y.columns) assert np.all(X.index == Y.index) From 0d95a125cc666d3a618829a6cb192a472e8ea150 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 17 Sep 2019 16:16:58 -0400 Subject: [PATCH 044/125] don't allow warnings in tests --- scprep/select.py | 6 ++++++ scprep/transform.py | 4 +++- scprep/utils.py | 11 ++++++++++- test/test_filter.py | 6 ++++++ test/test_sanitize.py | 2 ++ test/test_select.py | 11 +++++++++++ test/test_transform.py | 18 +++++++----------- test/tools/__init__.py | 2 +- test/tools/matrix.py | 25 +++++++++++++++++++------ test/tools/utils.py | 1 + 10 files changed, 66 insertions(+), 20 deletions(-) diff --git a/scprep/select.py b/scprep/select.py index 6973f5ff..d23ee49a 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -330,6 +330,9 @@ def select_cols(data, *extra_data, idx=None, _check_idx_1d(idx) idx = idx.flatten() + if isinstance(data, pd.SparseDataFrame): + # evil deprecated dataframe; get rid of it + data = utils.SparseDataFrame(data) if isinstance(data, pd.DataFrame): try: if isinstance(idx, (numbers.Integral, str)): @@ -446,6 +449,9 @@ def select_rows(data, *extra_data, idx=None, _check_idx_1d(idx) idx = idx.flatten() + if isinstance(data, pd.SparseDataFrame): + # evil deprecated dataframe; get rid of it + data = utils.SparseDataFrame(data) if isinstance(data, (pd.DataFrame, pd.Series)): try: if isinstance(idx, (numbers.Integral, str)): diff --git a/scprep/transform.py b/scprep/transform.py index 0a55b39c..179fd026 100644 --- a/scprep/transform.py +++ b/scprep/transform.py @@ -60,7 +60,9 @@ def log(data, pseudocount=1, base=10): "Got pseudocount = {}".format(utils.matrix_min(data), pseudocount)) elif pseudocount != data_min + 1 and \ - (sparse.issparse(data) or isinstance(data, pd.SparseDataFrame)): + (sparse.issparse(data) or + isinstance(data, pd.SparseDataFrame) or + utils.is_sparse_dataframe(data)): req = "min(data) + 1 ({})".format(data_min + 1) if data_min != 0 else "1" warnings.warn("log transform on sparse data requires " diff --git a/scprep/utils.py b/scprep/utils.py index 716d1d4c..b3ffc194 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -184,7 +184,7 @@ def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0): if sparse.issparse(X): X = pd.DataFrame.sparse.from_spmatrix(X) X.sparse.fill_value = default_fill_value - elif not isinstance(X, pd.DataFrame): + elif isinstance(X, pd.SparseDataFrame) or not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) X = dataframe_to_sparse(X, fill_value=default_fill_value) if columns is not None: @@ -253,6 +253,13 @@ def matrix_sum(data, axis=None): index = data.index if axis == 1 else data.columns sums = pd.Series(np.array(data.to_coo().sum(axis)).flatten(), index=index) + elif is_sparse_dataframe(data): + if axis is None: + sums = data.sparse.to_coo().sum() + else: + index = data.index if axis == 1 else data.columns + sums = pd.Series(np.array(data.sparse.to_coo().sum(axis)).flatten(), + index=index) elif axis is None: sums = data.to_numpy().sum() else: @@ -479,6 +486,8 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): # check consistent type matrix_type = type(data[0]) + if matrix_type is pd.SparseDataFrame: + matrix_type = pd.DataFrame if not issubclass(matrix_type, (np.ndarray, pd.DataFrame, sparse.spmatrix)): diff --git a/test/test_filter.py b/test/test_filter.py index 4421855c..217d2d0c 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -266,7 +266,13 @@ def test_deprecated_sample_labels(self): def test_large_sparse_dataframe_library_size(): + matrix._ignore_pandas_sparse_warning() X = pd.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] + matrix._reset_warnings() + X = matrix.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), + default_fill_value=0.0) + cell_sums = scprep.measure.library_size(X) + assert cell_sums.shape[0] == X.shape[0] diff --git a/test/test_sanitize.py b/test/test_sanitize.py index 421719e2..9c46f611 100644 --- a/test/test_sanitize.py +++ b/test/test_sanitize.py @@ -24,12 +24,14 @@ def test_check_numeric_inplace(): [matrix.SparseDataFrame], transform=scprep.sanitize.check_numeric, copy=False) + matrix._ignore_pandas_sparse_warning() assert_raise_message( TypeError, "pd.SparseDataFrame does not support " "copy=False. Please use copy=True.", scprep.sanitize.check_numeric, data=matrix.SparseDataFrame_deprecated(X), copy=False) + matrix._reset_warnings() class TypeErrorClass(object): diff --git a/test/test_select.py b/test/test_select.py index 984b9739..f82af8bb 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -407,6 +407,17 @@ def test_subsample_n_too_large(self): "Expected n (101) <= n_samples (100)", scprep.select.subsample, self.X, n=self.X.shape[0] + 1) + def test_sparse_dataframe_fill_value(self): + def test_fun(X): + Y = scprep.select.select_rows(X, idx=np.arange(X.shape[0]//2)) + for col in Y.columns: + assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) + Y = scprep.select.select_cols(X, idx=np.arange(X.shape[1]//2)) + for col in Y.columns: + assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) + matrix.test_matrix_types( + self.X, test_fun, matrix._pandas_sparse_matrix_types) + def test_string_subset_exact_word(): np.testing.assert_array_equal(scprep.select._get_string_subset_mask( diff --git a/test/test_transform.py b/test/test_transform.py index f0d26f31..58168cbd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -33,17 +33,13 @@ def test_log_transform(): Y=Y, transform=scprep.transform.log, base=2) Y = np.log2(X + 5) - assert_warns_message( - RuntimeWarning, - "log transform on sparse data requires pseudocount = 1", - scprep.transform.log, - data=sparse.csr_matrix(X), base=2, pseudocount=5) - assert_warns_message( - RuntimeWarning, - "log transform on sparse data requires pseudocount = 1", - scprep.transform.log, - data=pd.SparseDataFrame(X, default_fill_value=0.0), - base=2, pseudocount=5) + def test_fun(X): + assert_warns_message( + RuntimeWarning, + "log transform on sparse data requires pseudocount = 1", + scprep.transform.log, + data=X, base=2, pseudocount=5) + matrix.test_sparse_matrix_types(X, test_fun) matrix.test_dense_matrix_types( X, utils.assert_transform_equivalent, Y=Y, transform=scprep.transform.log, diff --git a/test/tools/__init__.py b/test/tools/__init__.py index 9b2e90bd..610d72cb 100644 --- a/test/tools/__init__.py +++ b/test/tools/__init__.py @@ -1,2 +1,2 @@ import matplotlib as mpl -mpl.use("Agg") +mpl.use("agg") diff --git a/test/tools/matrix.py b/test/tools/matrix.py index 8b688f96..6749b21a 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -14,17 +14,26 @@ def _ignore_pandas_sparse_warning(): "ignore", category=FutureWarning, message="SparseDataFrame") + warnings.filterwarnings( + "error", + category=pd.errors.PerformanceWarning) -def _reset_pandas_sparse_warning(): +def _reset_warnings(): warnings.filterwarnings( - "default", + "error", category=FutureWarning, message="SparseSeries") warnings.filterwarnings( - "default", + "error", category=FutureWarning, message="SparseDataFrame") + warnings.filterwarnings( + "error", + category=pd.errors.PerformanceWarning) + + +_reset_warnings() def _no_warning_dia_matrix(*args, **kwargs): @@ -42,7 +51,12 @@ def SparseDataFrame_deprecated(X, default_fill_value=0.0): def SparseDataFrame(X, default_fill_value=0.0): - return pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) + if sparse.issparse(X): + X = pd.DataFrame.sparse.from_spmatrix(X) + X.sparse.fill_value = default_fill_value + elif isinstance(X, pd.SparseDataFrame) or not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + return X.astype(pd.SparseDtype(float, fill_value=default_fill_value)) _scipy_matrix_types = [ @@ -106,8 +120,7 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): type(e).__name__, _typename(Y), test_fun.__name__, str(e))) finally: - if fun is SparseDataFrame_deprecated: - _reset_pandas_sparse_warning() + _reset_warnings() def test_dense_matrix_types(X, test_fun, *args, **kwargs): diff --git a/test/tools/utils.py b/test/tools/utils.py index 4f657045..83146b0b 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -131,6 +131,7 @@ def assert_matrix_class_equivalent(X, Y): assert type(X) == type(Y) if _is_sparse_dataframe(X): assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) + assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) if isinstance(X, pd.DataFrame): assert np.all(X.columns == Y.columns) assert np.all(X.index == Y.index) From d313d701591716332939662d8eb26ed1c70b707a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 21 Sep 2019 13:34:59 -0400 Subject: [PATCH 045/125] bump test coverage --- scprep/__init__.py | 9 ------ test/test_io.py | 68 +++++++++++++++++++++++++++++++++++++++++++ test/test_utils.py | 69 ++++++++++++++++++++++++++++++++++++++++++++ test/tools/matrix.py | 14 ++++++++- test/tools/utils.py | 2 +- 5 files changed, 151 insertions(+), 11 deletions(-) diff --git a/scprep/__init__.py b/scprep/__init__.py index 2bbd9f2f..a915c9bf 100644 --- a/scprep/__init__.py +++ b/scprep/__init__.py @@ -14,12 +14,3 @@ import scprep.stats import scprep.reduce import scprep.run - -import pandas as _pd -if int(_pd.__version__.split(".")[1]) < 24: - import numpy as _np - - def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ - return self.dot(_np.transpose(other)) - _pd.core.series.Series.__rmatmul__ = __rmatmul__ diff --git a/test/test_io.py b/test/test_io.py index 92c1af4a..15d38d96 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -4,10 +4,78 @@ from sklearn.utils.testing import assert_warns_message, assert_raise_message, assert_raises import pandas as pd import numpy as np +from scipy import sparse import os import fcsparser import zipfile import urllib +import unittest + + +class TestMatrixToDataFrame(unittest.TestCase): + + def setUpClass(self): + self.X_dense = data.load_10X(sparse=False) + self.X_sparse = data.load_10X(sparse=True) + self.X_numpy = self.X_dense.to_numpy() + self.cell_names = self.X_dense.index + self.gene_names = self.X_dense.columns + + def test_matrix_to_dataframe_no_names_sparse(): + Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, sparse=True) + assert isinstance(Y, sparse.csr_matrix) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, sparse=True) + assert isinstance(Y, sparse.csr_matrix) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, sparse=True) + assert isinstance(Y, sparse.csr_matrix) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + + def test_matrix_to_dataframe_no_names_dense(): + Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, sparse=False) + assert isinstance(Y, np.ndarray) + assert np.all(Y == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, sparse=False) + assert isinstance(Y, np.ndarray) + assert np.all(Y == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, sparse=False) + assert isinstance(Y, np.ndarray) + assert np.all(Y == self.X_numpy) + + def test_matrix_to_dataframe_names_sparse(): + Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + + def test_matrix_to_dataframe_names_dense(): + Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) def test_10X_duplicate_gene_names(): diff --git a/test/test_utils.py b/test/test_utils.py index 6f9143b1..dd58b6f1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -217,12 +217,25 @@ def test_fun(X): matrix.test_all_matrix_types(X, test_fun) test_fun([X, np.matrix(X)]) + + +def test_toarray_string_error(): assert_raise_message(TypeError, "Expected array-like. Got ", scprep.utils.toarray, "hello") +def test_toarray_vector(): + X = data.generate_positive_sparse_matrix(shape=(50,)) + + def test_fun(X): + assert isinstance(scprep.utils.toarray(X), np.ndarray) + matrix.test_matrix_types(X, + test_fun, + matrix._pandas_vector_types) + + def test_toarray_list_of_strings(): X = ['hello', 'world', [1, 2, 3]] X = scprep.utils.toarray(X) @@ -397,3 +410,59 @@ def test_deprecated(): scprep.utils.subsample, X, n=10) + + +def test_is_sparse_dataframe(): + X = data.load_10X(sparse=False) + Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) + assert scprep.utils.is_sparse_dataframe(Y) + def test_fun(X): + assert not scprep.utils.is_sparse_dataframe(X) + matrix.test_matrix_types( + X, + test_fun, + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated] + ) + + +def test_SparseDataFrame(): + X = data.load_10X(sparse=False) + Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) + index = X.index + columns = X.columns + def test_fun(X): + X = scprep.utils.SparseDataFrame(X, index=index, columns=columns) + utils.assert_matrix_class_equivalent(X, Y) + matrix.test_all_matrix_types( + X, + test_fun + ) + matrix.test_pandas_matrix_types( + X, + utils.assert_transform_equivalent, + Y=Y, + transform=scprep.utils.SparseDataFrame + ) + + +def test_is_sparse_series(): + X = data.load_10X(sparse=True) + assert scprep.utils.is_sparse_series(X[X.columns[0]]) + def test_fun(X): + if isinstance(X, pd.SparseDataFrame): + x = X[X.columns[0]] + else: + x = scprep.select.select_cols(X, idx=0) + assert not scprep.utils.is_sparse_series(x) + matrix.test_matrix_types( + X.to_numpy(), + test_fun, + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated] + ) + \ No newline at end of file diff --git a/test/tools/matrix.py b/test/tools/matrix.py index 6749b21a..e2251f58 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -49,6 +49,12 @@ def _no_warning_dia_matrix(*args, **kwargs): def SparseDataFrame_deprecated(X, default_fill_value=0.0): return pd.SparseDataFrame(X, default_fill_value=default_fill_value) +def SparseSeries(X, default_fill_value=0.0): + return pd.Series(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) + +def SparseSeries_deprecated(X, default_fill_value=0.0): + return pd.SparseSeries(X, fill_value=default_fill_value) + def SparseDataFrame(X, default_fill_value=0.0): if sparse.issparse(X): @@ -81,6 +87,12 @@ def SparseDataFrame(X, default_fill_value=0.0): SparseDataFrame_deprecated, ] +_pandas_vector_types = [ + pd.Series, + SparseSeries, + SparseSeries_deprecated +] + _pandas_matrix_types = _pandas_dense_matrix_types + _pandas_sparse_matrix_types _indexable_matrix_types = [ @@ -110,7 +122,7 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): **kwargs : keyword arguments for test_fun """ for fun in matrix_types: - if fun is SparseDataFrame_deprecated: + if fun is SparseDataFrame_deprecated or fun is SparseSeries_deprecated: _ignore_pandas_sparse_warning() Y = fun(X.copy()) try: diff --git a/test/tools/utils.py b/test/tools/utils.py index 83146b0b..25671c2e 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -128,7 +128,7 @@ def assert_matrix_class_equivalent(X, Y): elif isinstance(X, pd.SparseDataFrame): assert _is_sparse_dataframe(Y) else: - assert type(X) == type(Y) + assert type(X) == type(Y), (type(X), type(Y)) if _is_sparse_dataframe(X): assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y) From bbaf6acbfc18600c6807ed1409e516cfccfe9ad2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 21 Sep 2019 13:52:15 -0400 Subject: [PATCH 046/125] fix matrix_to_data_frame tests --- test/test_io.py | 73 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/test/test_io.py b/test/test_io.py index 15d38d96..96dfc92e 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -1,4 +1,4 @@ -from tools import data +from tools import data, utils import scprep import scprep.io.utils from sklearn.utils.testing import assert_warns_message, assert_raise_message, assert_raises @@ -14,68 +14,99 @@ class TestMatrixToDataFrame(unittest.TestCase): + @classmethod def setUpClass(self): self.X_dense = data.load_10X(sparse=False) self.X_sparse = data.load_10X(sparse=True) self.X_numpy = self.X_dense.to_numpy() + self.X_coo = self.X_sparse.sparse.to_coo() self.cell_names = self.X_dense.index self.gene_names = self.X_dense.columns - def test_matrix_to_dataframe_no_names_sparse(): - Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, sparse=True) + def test_matrix_to_dataframe_no_names_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, sparse=True) assert isinstance(Y, sparse.csr_matrix) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, sparse=True) - assert isinstance(Y, sparse.csr_matrix) + Y = scprep.io.utils._matrix_to_data_frame(self.X_coo, sparse=True) + assert isinstance(Y, sparse.spmatrix) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, sparse=True) - assert isinstance(Y, sparse.csr_matrix) + + def test_matrix_to_dataframe_no_names_dataframe_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) - def test_matrix_to_dataframe_no_names_dense(): - Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, sparse=False) + def test_matrix_to_dataframe_no_names_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, sparse=False) assert isinstance(Y, np.ndarray) assert np.all(Y == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, sparse=False) - assert isinstance(Y, np.ndarray) - assert np.all(Y == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, sparse=False) + Y = scprep.io.utils._matrix_to_data_frame(self.X_coo, sparse=False) assert isinstance(Y, np.ndarray) assert np.all(Y == self.X_numpy) - def test_matrix_to_dataframe_names_sparse(): - Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, cell_names=self.cell_names, + def test_matrix_to_dataframe_no_names_dataframe_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + + def test_matrix_to_dataframe_names_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, gene_names=self.gene_names, sparse=True) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, cell_names=self.cell_names, + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, gene_names=self.gene_names, sparse=True) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, cell_names=self.cell_names, + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, gene_names=self.gene_names, sparse=True) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) - def test_matrix_to_dataframe_names_dense(): - Y = scprep.io.utils._matrix_to_dataframe(self.X_dense, cell_names=self.cell_names, + def test_matrix_to_dataframe_names_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, gene_names=self.gene_names, sparse=False) assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_sparse, cell_names=self.cell_names, + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, gene_names=self.gene_names, sparse=False) assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) - Y = scprep.io.utils._matrix_to_dataframe(self.X_numpy, cell_names=self.cell_names, + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, gene_names=self.gene_names, sparse=False) assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) def test_10X_duplicate_gene_names(): From 74b824e87a78183f199ad21dff99c86d758ce612 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 23 Sep 2019 11:26:28 -0400 Subject: [PATCH 047/125] escape input to regex, fixes #64 --- scprep/select.py | 2 +- test/test_select.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scprep/select.py b/scprep/select.py index 6d055be5..ec5df059 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -112,7 +112,7 @@ def _exact_word_regex(word): allowed_chars = ['\\(', '\\)', '\\[', '\\]', '\\.', ',', '!', '\\?', ' ', '^', '$'] wildcard = "(" + "|".join(allowed_chars) + ")+" - return "{wildcard}{word}{wildcard}".format(wildcard=wildcard, word=word) + return "{wildcard}{word}{wildcard}".format(wildcard=wildcard, word=re.escape(word)) def _get_string_subset_mask(data, starts_with=None, ends_with=None, diff --git a/test/test_select.py b/test/test_select.py index e05738e3..8d53f81b 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -427,6 +427,8 @@ def test_string_subset_exact_word(): ['World, hello!', 'world'], exact_word='hello'), [True, False]) np.testing.assert_array_equal(scprep.select._get_string_subset_mask( ['helloooo!', 'world'], exact_word='hello'), [False, False]) + np.testing.assert_array_equal(scprep.select._get_string_subset_mask( + ['(hello) world', 'world'], exact_word='(hello) world'), [True, False]) def test_string_subset_list(): From af17f7306ed6fc85ed95fe774db51c820a896c29 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 23 Sep 2019 11:43:52 -0400 Subject: [PATCH 048/125] add file saving functionality to histogram, fixes #55 --- scprep/plot/histogram.py | 33 +++++++++++++++++++++++++++++++-- test/test_plot.py | 13 ++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 0dfb30df..48f515a5 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -18,6 +18,8 @@ def histogram(data, fontsize=None, histtype='stepfilled', alpha=None, + filename=None, + dpi=None, **kwargs): """Plot a histogram. @@ -55,6 +57,12 @@ def histogram(data, 'stepfilled' generates a lineplot that is by default filled. alpha : float, optional (default: 1 for a single dataset, 0.5 for multiple) Histogram transparency + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -102,6 +110,9 @@ def histogram(data, else: for c in cutoff: ax.axvline(c, color='red') + # save and show + if filename is not None: + fig.savefig(filename, dpi=dpi) if show_fig: show(fig) return ax @@ -115,6 +126,8 @@ def plot_library_size(data, xlabel='Library size', title=None, fontsize=None, + filename=None, + dpi=None, **kwargs): """Plot the library size histogram. @@ -144,6 +157,12 @@ def plot_library_size(data, Axis title. fontsize : float or None (default: None) Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -161,7 +180,8 @@ def plot_library_size(data, return histogram(libsize, cutoff=cutoff, percentile=percentile, bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, **kwargs) + xlabel=xlabel, title=title, fontsize=fontsize, + filename=filename, dpi=dpi, **kwargs) @utils._with_pkg(pkg="matplotlib", min_version=3) @@ -175,6 +195,8 @@ def plot_gene_set_expression(data, genes=None, xlabel='Gene expression', title=None, fontsize=None, + filename=None, + dpi=None, **kwargs): """Plot the histogram of the expression of a gene set. @@ -216,6 +238,12 @@ def plot_gene_set_expression(data, genes=None, Axis title. fontsize : float or None (default: None) Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -245,4 +273,5 @@ def plot_gene_set_expression(data, genes=None, return histogram(expression, cutoff=cutoff, percentile=percentile, bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, **kwargs) + xlabel=xlabel, title=title, fontsize=fontsize, + filename=filename, dpi=dpi, **kwargs) diff --git a/test/test_plot.py b/test/test_plot.py index 05ae7111..2f9388d9 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -710,7 +710,9 @@ def test_plot_library_size_multiple(self): scprep.plot.plot_library_size([ self.X, scprep.select.select_rows( self.X, idx=np.arange(self.X.shape[0] // 2))], - color=['r', 'b']) + color=['r', 'b'], + filename="test_library_size.png") + assert os.path.exists("test_library_size.png") def test_plot_gene_set_expression_multiple(self): scprep.plot.plot_gene_set_expression([ @@ -730,7 +732,9 @@ def test_gene_set_expression_array(self): def test_plot_gene_set_expression_single_gene(self): scprep.plot.plot_gene_set_expression( self.X, color=["red"], - genes="Arl8b") + genes="Arl8b", + filename="test_gene_expression.png") + assert os.path.exists("test_gene_expression.png") def test_histogram_single_gene_dataframe(self): scprep.plot.histogram( @@ -746,7 +750,10 @@ def test_histogram_custom_axis(self): fig, ax = plt.subplots() scprep.plot.plot_gene_set_expression( self.X, genes=scprep.select.get_gene_set(self.X, starts_with="D"), - percentile=90, log='y', ax=ax, title="histogram") + percentile=90, log='y', ax=ax, title="histogram", + filename="test_histogram.png") + assert os.path.exists("test_histogram.png") + assert ax.get_title() == 'histogram' def test_histogram_invalid_axis(self): assert_raise_message( From 89a1dbc0e7a96e9cfd8e92cfae0431c3462af3a6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 23 Sep 2019 11:50:41 -0400 Subject: [PATCH 049/125] remove extraneous test files --- test/test_plot.py | 3 +++ test/test_stats.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/test/test_plot.py b/test/test_plot.py index 2f9388d9..bafee91d 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -680,6 +680,9 @@ def tearDownClass(self): try_remove("test.gif") try_remove("test.mp4") try_remove("test_jitter.png") + try_remove("test_histogram.png") + try_remove("test_library_size.png") + try_remove("test_gene_expression.png") def tearDown(self): plt.close('all') diff --git a/test/test_stats.py b/test/test_stats.py index cd4c4e62..50c6077d 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -6,6 +6,7 @@ import scprep from functools import partial import warnings +import os from parameterized import parameterized @@ -98,6 +99,8 @@ def test_knnDREMI(): Y2, drevi = scprep.stats.knnDREMI(X[:, 0], X[:, 1], plot=True, filename="test.png", return_drevi=True) + assert os.path.isfile("test.png") + os.remove("test.png") assert Y2 == Y assert drevi.shape == (20, 20) matrix.test_all_matrix_types( From 30d3417f3975a958c348854b7a73937991af33fc Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 23 Sep 2019 12:08:31 -0400 Subject: [PATCH 050/125] better handling of negative and constant data in log='x' for histogram, fixes #54 --- scprep/plot/histogram.py | 17 ++++++++++++++-- test/test_plot.py | 43 ++++++++++++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 48f515a5..8102de66 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -1,5 +1,6 @@ import numpy as np import numbers +import warnings from .. import measure, utils from .utils import (_get_figure, show, @@ -86,8 +87,20 @@ def histogram(data, if alpha is None: alpha = 1 if log == 'x' or log is True: - bins = np.logspace(np.log10(max(xmin, 1)), - np.log10(xmax), + if xmax < np.finfo('float').eps: + raise ValueError("Expected positive data for log = {}. " + "Got max(data) = {:.2f}".format(log, xmax)) + elif xmin < np.finfo('float').eps: + warnings.warn("Expected positive data for log = {}. " + "Got min(data) = {:.2f}".format(log, xmin), UserWarning) + xmin = np.finfo('float').eps + xmin = np.log10(xmin) + xmax = np.log10(xmax) + xrange = max(xmax - xmin, 1) + xmin = xmin - xrange * 0.1 + xmax = xmax + xrange * 0.1 + bins = np.logspace(xmin, + xmax, bins) ax.hist(data, bins=bins, histtype=histtype, alpha=alpha, **kwargs) diff --git a/test/test_plot.py b/test/test_plot.py index bafee91d..97fcc72f 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -212,6 +212,32 @@ def test_is_color_array_none(): assert not scprep.plot.utils._is_color_array(None) +def test_histogram_log_negative_min(): + assert_warns_message( + UserWarning, + "Expected positive data for log = x. Got min(data) = -1.00", + scprep.plot.histogram, + [-1, 1, 1, 1], log='x') + assert_warns_message( + UserWarning, + "Expected positive data for log = True. Got min(data) = -1.00", + scprep.plot.histogram, + [-1, 1, 1, 1], log=True) + + +def test_histogram_log_negative_max(): + assert_raise_message( + ValueError, + "Expected positive data for log = x. Got max(data) = -1.00", + scprep.plot.histogram, + [-1, -1, -1, -2], log='x') + assert_raise_message( + ValueError, + "Expected positive data for log = True. Got max(data) = -1.00", + scprep.plot.histogram, + [-1, -1, -1, -2], log=True) + + class TestScatterParams(unittest.TestCase): @classmethod @@ -671,6 +697,7 @@ class Test10X(unittest.TestCase): @classmethod def setUpClass(self): self.X = data.load_10X(sparse=False) + self.X_filt = scprep.filter.filter_empty_cells(self.X) self.X_pca, self.S = scprep.reduce.pca(self.X, n_components=10, return_singular_values=True) @@ -688,15 +715,15 @@ def tearDown(self): plt.close('all') def test_histogram(self): - scprep.plot.plot_library_size(self.X, cutoff=1000, log=True) - scprep.plot.plot_library_size(self.X, cutoff=1000, log=True, + scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True) + scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True, xlabel="x label", ylabel="y label") def test_histogram_list_of_lists(self): - scprep.plot.plot_library_size(scprep.utils.toarray(self.X).tolist()) + scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt).tolist()) def test_histogram_array(self): - scprep.plot.plot_library_size(scprep.utils.toarray(self.X)) + scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt)) def test_histogram_multiple(self): scprep.plot.histogram([scprep.select.select_rows(self.X, idx=0), @@ -704,15 +731,15 @@ def test_histogram_multiple(self): color=['r', 'b']) def test_histogram_multiple_cutoff(self): - scprep.plot.plot_library_size(self.X, cutoff=[500, 1000], log=True) + scprep.plot.plot_library_size(self.X_filt, cutoff=[500, 1000], log=True) def test_histogram_multiple_percentile(self): - scprep.plot.plot_library_size(self.X, percentile=[10, 90], log=True) + scprep.plot.plot_library_size(self.X_filt, percentile=[10, 90], log=True) def test_plot_library_size_multiple(self): scprep.plot.plot_library_size([ - self.X, scprep.select.select_rows( - self.X, idx=np.arange(self.X.shape[0] // 2))], + self.X_filt, scprep.select.select_rows( + self.X_filt, idx=np.arange(self.X_filt.shape[0] // 2))], color=['r', 'b'], filename="test_library_size.png") assert os.path.exists("test_library_size.png") From 26def77fdd996d3a8c7e5a7f7a458ac2f796df4f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 24 Sep 2019 11:16:45 -0400 Subject: [PATCH 051/125] move hdf5 inside IO in docs --- doc/source/reference.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/source/reference.rst b/doc/source/reference.rst index 90e274a8..21814d24 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -11,6 +11,15 @@ Data Input/Output :imported-members: :show-inheritance: +HDF5 +~~~~ + +.. automodule:: scprep.io.hdf5 + :autosummary: + :members: + :inherited-members: + :show-inheritance: + Filtering --------- @@ -102,12 +111,3 @@ External Tools :inherited-members: :imported-members: :show-inheritance: - -HDF5 ----- - -.. automodule:: scprep.io.hdf5 - :autosummary: - :members: - :inherited-members: - :show-inheritance: From 8df3734215edde7cc91ddac77a56a5b4ca6359f0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 17:34:46 -0400 Subject: [PATCH 052/125] don't need data as an argument to filter_idx --- scprep/filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scprep/filter.py b/scprep/filter.py index 1737cb14..b6c49012 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -120,7 +120,7 @@ def filter_empty_cells(data, *extra_data, sample_labels=None): return data -def _get_filter_idx(data, values, +def _get_filter_idx(values, cutoff, percentile, keep_cells): cutoff = measure._get_percentile_cutoff( @@ -188,7 +188,7 @@ def filter_values(data, *extra_data, values=None, "Filtering as a single sample.", DeprecationWarning) assert values is not None - keep_cells_idx = _get_filter_idx(data, values, + keep_cells_idx = _get_filter_idx(values, cutoff, percentile, keep_cells) if return_values: From 1a1a5a96c16de63557dc46872f6b302c56c599cf Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 18:36:18 -0400 Subject: [PATCH 053/125] add variable genes functionality --- scprep/_lazyload.py | 3 +- scprep/filter.py | 38 +++++++++++++++++++ scprep/measure.py | 32 ++++++++++++++++ scprep/plot/__init__.py | 2 +- scprep/plot/histogram.py | 80 ++++++++++++++++++++++++++++++++++++++++ scprep/utils.py | 53 +++++++++++++++++++++++++- test/test_filter.py | 11 ++++++ test/test_plot.py | 18 +++++++++ test/test_utils.py | 33 +++++++++++++++++ 9 files changed, 267 insertions(+), 3 deletions(-) diff --git a/scprep/_lazyload.py b/scprep/_lazyload.py index 130a88b9..b9357058 100644 --- a/scprep/_lazyload.py +++ b/scprep/_lazyload.py @@ -14,7 +14,8 @@ 'rinterface', {'rinterface_lib': ['callbacks']}], 'h5py': [], - 'tables': [] + 'tables': [], + 'statsmodels': [{'nonparametric': ['smoothers_lowess']}], } diff --git a/scprep/filter.py b/scprep/filter.py index b6c49012..e56d06b2 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -369,3 +369,41 @@ def filter_duplicates(data, *extra_data, sample_labels=None): unique_idx = _find_unique_cells(data) data = select.select_rows(data, *extra_data, idx=unique_idx) return data + + +def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, + cutoff=None, percentile=80): + """Filter all genes with low variability + + Variability is computed as the deviation from a loess fit of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + extra_data : array-like, shape=[any, n_features], optional + Optional additional data objects from which to select the same rows + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + cutoff : float, optional (default: None) + Variability above which expression is deemed significant + percentile : int, optional (Default: 80) + Percentile above or below which to remove genes. + Must be an integer between 0 and 100. Only one of `cutoff` + and `percentile` should be specified. + + Returns + ------- + data : array-like, shape=[n_samples, m_features] + Filtered output data, where m_features <= n_features + extra_data : array-like, shape=[any, m_features] + Filtered extra data, if passed. + """ + var_genes = measure.variable_genes(data, span=span, interpolate=interpolate) + keep_cells_idx = _get_filter_idx(var_genes, + cutoff, percentile, + keep_cells='above') + return select.select_cols(data, *extra_data, idx=keep_cells_idx) diff --git a/scprep/measure.py b/scprep/measure.py index 27a18f8b..5ee9f5ab 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -3,6 +3,7 @@ import numbers from . import utils, select +from ._lazyload import statsmodels def library_size(data): @@ -62,6 +63,37 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, return gene_set_expression +@utils._with_pkg(pkg="statsmodels") +def variable_genes(data, span=0.7, interpolate=0.2): + """Measure the variability of each gene in a dataset + + Variability is computed as the deviation from a loess fit of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + + Returns + ------- + variability : list-like, shape=[n_samples] + Variability for each gene + """ + data = utils.to_array_or_spmatrix(data) + data_mean = utils.toarray(np.mean(data, axis=0)).flatten() + data_std = utils.matrix_std(data, axis=0) ** 2 + delta = np.std(data_std) * interpolate + lowess = statsmodels.nonparametric.smoothers_lowess.lowess( + data_std, data_mean, + delta=delta, frac=span, return_sorted=False) + return data_std - lowess + + def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): """Get a cutoff for a dataset diff --git a/scprep/plot/__init__.py b/scprep/plot/__init__.py index 1a7ca181..92129998 100644 --- a/scprep/plot/__init__.py +++ b/scprep/plot/__init__.py @@ -1,5 +1,5 @@ from .scatter import scatter, scatter2d, scatter3d, rotate_scatter3d -from .histogram import histogram, plot_library_size, plot_gene_set_expression +from .histogram import histogram, plot_library_size, plot_gene_set_expression, plot_variable_genes from .marker import marker_plot from .scree import scree_plot from .jitter import jitter diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 8102de66..c2fc61d2 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -288,3 +288,83 @@ def plot_gene_set_expression(data, genes=None, bins=bins, log=log, ax=ax, figsize=figsize, xlabel=xlabel, title=title, fontsize=fontsize, filename=filename, dpi=dpi, **kwargs) + + +@utils._with_pkg(pkg="matplotlib", min_version=3) +def plot_variable_genes(data, span=0.7, interpolate=0.2, + bins=100, log=False, + cutoff=None, percentile=None, + ax=None, figsize=None, + xlabel='Gene variability', + ylabel='Number of genes', + title=None, + fontsize=None, + filename=None, + dpi=None, **kwargs): + """Plot the histogram of gene variability + + Variability is computed as the deviation from a loess fit of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data. Multiple datasets may be given as a list of array-likes. + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + bins : int, optional (default: 100) + Number of bins to draw in the histogram + log : bool, or {'x', 'y'}, optional (default: False) + If True, plot both axes on a log scale. If 'x' or 'y', + only plot the given axis on a log scale. If False, + plot both axes on a linear scale. + cutoff : float or `None`, optional (default: `None`) + Absolute cutoff at which to draw a vertical line. + Only one of `cutoff` and `percentile` may be given. + percentile : float or `None`, optional (default: `None`) + Percentile between 0 and 100 at which to draw a vertical line. + Only one of `cutoff` and `percentile` may be given. + library_size_normalize : bool, optional (default: False) + Divide gene set expression by library size + ax : `matplotlib.Axes` or None, optional (default: None) + Axis to plot on. If None, a new axis will be created. + figsize : tuple or None, optional (default: None) + If not None, sets the figure size (width, height) + [x,y]label : str, optional + Labels to display on the x and y axis. + title : str or None, optional (default: None) + Axis title. + fontsize : float or None (default: None) + Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. + **kwargs : additional arguments for `matplotlib.pyplot.hist` + + Returns + ------- + ax : `matplotlib.Axes` + axis on which plot was drawn + """ + if hasattr(data, 'shape') and len(data.shape) == 2: + var_genes = measure.variable_genes( + data, span=span, interpolate=interpolate) + else: + data_array = utils.to_array_or_spmatrix(data) + if len(data_array.shape) == 2 and data_array.dtype.type is not np.object_: + var_genes = measure.variable_genes( + data_array, span=span, interpolate=interpolate) + else: + var_genes = [measure.variable_genes( + d, span=span, interpolate=interpolate) + for d in data] + return histogram(var_genes, + cutoff=cutoff, percentile=percentile, + bins=bins, log=log, ax=ax, figsize=figsize, + xlabel=xlabel, title=title, fontsize=fontsize, + filename=filename, dpi=dpi, **kwargs) diff --git a/scprep/utils.py b/scprep/utils.py index b7c247cc..f1db32e0 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -99,7 +99,7 @@ def toarray(x): elif isinstance(x, sparse.spmatrix): x = x.toarray() elif isinstance(x, np.matrix): - x = np.array(x) + x = x.A elif isinstance(x, list): x_out = [] for xi in x: @@ -224,6 +224,57 @@ def matrix_sum(data, axis=None): return sums +def matrix_std(data, axis=None): + """Get the column-wise, row-wise, or total standard deviation of a matrix + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + axis : int or None, optional (default: None) + Axis across which to calculate standard deviation. + axis=0 gives column standard deviation, + axis=1 gives row standard deviation. + None gives the total standard deviation. + + Returns + ------- + std : array-like or float + Standard deviation along desired axis. + """ + if axis not in [0, 1, None]: + raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis)) + data = to_array_or_spmatrix(data) + if sparse.issparse(data): + if axis is None: + if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)): + data = data.tocoo() + data_sq = data.copy() + data_sq.data = data_sq.data ** 2 + variance = data_sq.mean() - data.mean() ** 2 + std = np.sqrt(variance) + else: + if axis == 0: + data = data.tocsc() + next_fn = data.getcol + N = data.shape[1] + elif axis == 1: + data = data.tocsr() + next_fn = data.getrow + N = data.shape[0] + std = [] + for i in range(N): + col = next_fn(i) + col_sq = col.copy() + col_sq.data = col_sq.data ** 2 + variance = col_sq.mean() - col.mean() ** 2 + std.append(np.sqrt(variance)) + std = np.array(std) + else: + std = np.std(data, axis=axis) + return std + + def matrix_vector_elementwise_multiply(data, multiplier, axis=None): """Elementwise multiply a matrix by a vector diff --git a/test/test_filter.py b/test/test_filter.py index 24e30c97..31d62520 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -85,6 +85,17 @@ def test_filter_rare_genes(self): self.X_dense, utils.assert_transform_equals, Y=X_filtered, transform=scprep.filter.filter_rare_genes) + def test_filter_variable_genes(self): + X_filtered = scprep.filter.filter_variable_genes(self.X_dense, percentile=70) + assert X_filtered.shape[0] == self.X_dense.shape[0] + assert X_filtered.shape[1] <= 30 + assert X_filtered.shape[1] >= 20 + assert self.X_dense.columns[np.argmax(self.X_dense.values.std(axis=0))] in X_filtered.columns + matrix.test_all_matrix_types( + self.X_dense, utils.assert_transform_equals, + Y=X_filtered, transform=scprep.filter.filter_variable_genes, percentile=70) + + def test_library_size_filter(self): X_filtered = scprep.filter.filter_library_size( self.X_sparse, cutoff=100) diff --git a/test/test_plot.py b/test/test_plot.py index 97fcc72f..5c2738f6 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -709,6 +709,7 @@ def tearDownClass(self): try_remove("test_jitter.png") try_remove("test_histogram.png") try_remove("test_library_size.png") + try_remove("test_variable_genes.png") try_remove("test_gene_expression.png") def tearDown(self): @@ -766,6 +767,23 @@ def test_plot_gene_set_expression_single_gene(self): filename="test_gene_expression.png") assert os.path.exists("test_gene_expression.png") + def test_plot_variable_genes(self): + scprep.plot.plot_variable_genes( + self.X, + color='r') + + def test_plot_variable_genes_multiple(self): + scprep.plot.plot_variable_genes([ + self.X, scprep.select.select_rows( + self.X, idx=np.arange(self.X.shape[0] // 2))], + filename="test_variable_genes.png", + color=['r', 'b']) + assert os.path.exists("test_variable_genes.png") + + def test_variable_genes_list_of_lists(self): + scprep.plot.plot_variable_genes( + scprep.utils.toarray(self.X).tolist()) + def test_histogram_single_gene_dataframe(self): scprep.plot.histogram( scprep.select.select_cols(self.X, idx=['Arl8b']), diff --git a/test/test_utils.py b/test/test_utils.py index 9a044cdf..2bbfb6f7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -271,6 +271,39 @@ def test_matrix_sum(): 5) +def test_matrix_std(): + X = data.generate_positive_sparse_matrix(shape=(50, 100)) + stds = np.array(X.std(0)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=0, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=0, + check=utils.assert_all_close) + + stds = np.array(X.std(1)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=1, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=1, + check=utils.assert_all_close) + + stds = np.array(X.std(None)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=None, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=None, + check=utils.assert_all_close) + + assert_raise_message(ValueError, + "Expected axis in [0, 1, None]. Got 5", + scprep.utils.matrix_std, + data, + 5) + + def test_matrix_elementwise_multiply_row(): X = data.generate_positive_sparse_matrix(shape=(50, 100)) x = X[:, 0] + 1 From 9823ebcc2b20ff2a1606da3e0e69194fc8ac3e38 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 18:59:01 -0400 Subject: [PATCH 054/125] change default fallback from 10X labels to 'both', resolves #57 --- scprep/io/tenx.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py index 6da8f90b..e102de08 100644 --- a/scprep/io/tenx.py +++ b/scprep/io/tenx.py @@ -40,18 +40,18 @@ def _combine_gene_id(symbols, ids): def _parse_10x_genes(symbols, ids, gene_labels='symbol', allow_duplicates=True): assert gene_labels in ['symbol', 'id', 'both'] - if gene_labels == 'both': - columns = _combine_gene_id(symbols, ids) if gene_labels == 'symbol': columns = symbols if not allow_duplicates and len(np.unique(columns)) < len(columns): warnings.warn( - "Duplicate gene names detected! Forcing `gene_labels='id'`. " - "Alternatively, try `gene_labels='both'`, " + "Duplicate gene names detected! Forcing `gene_labels='both'`. " + "Alternatively, try `gene_labels='id'`, " "`allow_duplicates=True`, or load the matrix" " with `sparse=False`", RuntimeWarning) - gene_labels = 'id' - if gene_labels == 'id': + gene_labels = 'both' + if gene_labels == 'both': + columns = _combine_gene_id(symbols, ids) + elif gene_labels == 'id': columns = ids return columns From 2cba5d18f6639d0a4584a7c117d04a6763cd704f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 19:01:39 -0400 Subject: [PATCH 055/125] fix tests to match new warning message for 9823ebcc2b20ff2a1606da3e0e69194fc8ac3e38 --- test/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_io.py b/test/test_io.py index 401188e9..bb82c8d5 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -13,8 +13,8 @@ def test_10X_duplicate_gene_names(): assert_warns_message( RuntimeWarning, - "Duplicate gene names detected! Forcing `gene_labels='id'`. " - "Alternatively, try `gene_labels='both'`, `allow_duplicates=True`, or " + "Duplicate gene names detected! Forcing `gene_labels='both'`. " + "Alternatively, try `gene_labels='id'`, `allow_duplicates=True`, or " "load the matrix with `sparse=False`", scprep.io.load_10X, os.path.join(data.data_dir, "test_10X_duplicate_gene_names"), From 2a6d71c6cc79d457687d07f36fcccfc5cfb5f81b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 19:29:31 -0400 Subject: [PATCH 056/125] automatically set axis labels if pd.Series is given, resolves #70 --- scprep/plot/jitter.py | 14 +++---- scprep/plot/scatter.py | 94 +++++++++++++++++++++++++++++++----------- test/test_plot.py | 52 +++++++++++++++++++++++ 3 files changed, 128 insertions(+), 32 deletions(-) diff --git a/scprep/plot/jitter.py b/scprep/plot/jitter.py index 3b75f422..ccf96558 100644 --- a/scprep/plot/jitter.py +++ b/scprep/plot/jitter.py @@ -113,13 +113,8 @@ def jitter(labels, values, sigma=0.1, If a list, sets custom axis tick labels {x,y}ticklabels : True, False, or list-like (default: None) If set, overrides `ticklabels` - label_prefix : str or None (default: None) - Prefix for all axis labels. Axes will be labelled `label_prefix`1, - `label_prefix`2, etc. Can be overriden by setting `xlabel`, - `ylabel`, and `zlabel`. {x,y}label : str or None (default : None) - Axis labels. Overrides the automatic label given by - label_prefix. If None and label_prefix is None, no label is set. + Axis labels. If None, no label is set. title : str or None (default: None) axis title. If None, no title is set. fontsize : float or None (default: None) @@ -157,7 +152,8 @@ def jitter(labels, values, sigma=0.1, labels, values, c=c, discrete=discrete, cmap=cmap, cmap_scale=cmap_scale, vmin=vmin, vmax=vmax, s=s, - legend=legend, colorbar=colorbar) + legend=legend, colorbar=colorbar, + xlabel=xlabel, ylabel=ylabel) fig, ax, show_fig = _get_figure( ax, figsize, subplot_kw=params.subplot_kw) @@ -190,9 +186,9 @@ def jitter(labels, values, sigma=0.1, xticklabels = params.x_labels # label axes - label_axis(ax.xaxis, xticks, xticklabels, xlabel) + label_axis(ax.xaxis, xticks, xticklabels, params.xlabel) label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), ylabel) + _with_default(yticklabels, ticklabels), params.ylabel) # manually set x limits xmin = np.min(params.x_coords) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index ded4241f..bffed12c 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -29,10 +29,11 @@ class _ScatterParams(object): def __init__(self, x, y, z=None, c=None, discrete=None, cmap=None, cmap_scale=None, vmin=None, vmax=None, s=None, legend=None, colorbar=None, - shuffle=True): - self._x = _squeeze_array(x) - self._y = _squeeze_array(y) - self._z = _squeeze_array(z) if z is not None else None + xlabel=None, ylabel=None, zlabel=None, + label_prefix=None, shuffle=True): + self._x = x + self._y = y + self._z = z if z is not None else None self._c = c self._discrete = discrete self._cmap = cmap @@ -44,6 +45,10 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self._colorbar = colorbar self._labels = None self._c_discrete = None + self._label_prefix = label_prefix + self._xlabel = xlabel + self._ylabel = ylabel + self._zlabel = zlabel self.shuffle = shuffle self.check_size() self.check_c() @@ -54,9 +59,25 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self.check_cmap_scale() self.check_vmin_vmax() + @property + def x_array(self): + return _squeeze_array(self._x) + + @property + def y_array(self): + return _squeeze_array(self._y) + + @property + def z_array(self): + return _squeeze_array(self._z) if self._z is not None else None + @property def size(self): - return len(self._x) + try: + return self._size + except AttributeError: + self._size = len(self.x_array) + return self._size @property def plot_idx(self): @@ -71,15 +92,15 @@ def plot_idx(self): @property def x(self): - return self._x[self.plot_idx] + return self.x_array[self.plot_idx] @property def y(self): - return self._y[self.plot_idx] + return self.y_array[self.plot_idx] @property def z(self): - return self._z[self.plot_idx] if self._z is not None else None + return self.z_array[self.plot_idx] if self._z is not None else None @property def data(self): @@ -91,9 +112,9 @@ def data(self): @property def _data(self): if self._z is not None: - return [self._x, self._y, self._z] + return [self.x_array, self.y_array, self.z_array] else: - return [self._x, self._y] + return [self.x_array, self.y_array] @property def s(self): @@ -420,6 +441,41 @@ def check_cmap_scale(self): UserWarning) self._cmap_scale = 'linear' + @property + def xlabel(self): + if self._xlabel is not None: + return self._xlabel + elif self._label_prefix is not None: + return self._label_prefix + "1" + elif isinstance(self._x, pd.Series): + return self._x.name + else: + return None + + @property + def ylabel(self): + if self._ylabel is not None: + return self._ylabel + elif self._label_prefix is not None: + return self._label_prefix + "2" + elif isinstance(self._y, pd.Series): + return self._y.name + else: + return None + + @property + def zlabel(self): + if self._z is None: + return None + elif self._zlabel is not None: + return self._zlabel + elif self._label_prefix is not None: + return self._label_prefix + "3" + elif isinstance(self._z, pd.Series): + return self._z.name + else: + return None + @utils._with_pkg(pkg="matplotlib", min_version=3) def scatter(x, y, z=None, @@ -570,7 +626,8 @@ def scatter(x, y, z=None, cmap=cmap, cmap_scale=cmap_scale, vmin=vmin, vmax=vmax, s=s, legend=legend, colorbar=colorbar, - shuffle=shuffle) + xlabel=xlabel, ylabel=ylabel, zlabel=zlabel, + label_prefix=label_prefix, shuffle=shuffle) fig, ax, show_fig = _get_figure( ax, figsize, subplot_kw=params.subplot_kw) @@ -581,23 +638,14 @@ def scatter(x, y, z=None, c=params.c, cmap=params.cmap, norm=params.norm, s=params.s, vmin=params.vmin, vmax=params.vmax, **plot_kwargs) - # automatic axis labels - if label_prefix is not None: - if xlabel is None: - xlabel = label_prefix + "1" - if ylabel is None: - ylabel = label_prefix + "2" - if zlabel is None: - zlabel = label_prefix + "3" - # label axes label_axis(ax.xaxis, _with_default(xticks, ticks), - _with_default(xticklabels, ticklabels), xlabel) + _with_default(xticklabels, ticklabels), params.xlabel) label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), ylabel) + _with_default(yticklabels, ticklabels), params.ylabel) if z is not None: label_axis(ax.zaxis, _with_default(zticks, ticks), - _with_default(zticklabels, ticklabels), zlabel) + _with_default(zticklabels, ticklabels), params.zlabel) if title is not None: ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) diff --git a/test/test_plot.py b/test/test_plot.py index 97fcc72f..6c069a02 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -2,6 +2,7 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np +import pandas as pd import os from sklearn.utils.testing import assert_raise_message, assert_warns_message import unittest @@ -685,6 +686,38 @@ def test_check_cmap_scale(self): c=np.where(self.c > 0, '+', '-'), ) + def test_series_labels(self): + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c) + assert params.xlabel == 'x' + assert params.ylabel is None + assert params.zlabel is None + params = _ScatterParams(x=self.x, y=pd.Series(self.y, name='y'), c=self.c) + assert params.xlabel is None + assert params.ylabel == 'y' + assert params.zlabel is None + params = _ScatterParams(x=self.x, y=self.y, z=pd.Series(self.y, name='z'), c=self.c) + assert params.xlabel is None + assert params.ylabel is None + assert params.zlabel == 'z' + # xlabel overrides series + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, + xlabel='y') + assert params.xlabel == 'y' + assert params.ylabel is None + assert params.zlabel is None + # label_prefix overrides series + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, + label_prefix='y') + assert params.xlabel == 'y1' + assert params.ylabel == 'y2' + assert params.zlabel is None + # xlabel overrides label_prefix + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, z=self.y, c=self.c, + label_prefix='y', xlabel='test') + assert params.xlabel == 'test' + assert params.ylabel == 'y2' + assert params.zlabel == 'y3' + def test_jitter_x(self): params = _JitterParams(x=np.where(self.x > 0, '+', '-'), y=self.y) np.testing.assert_array_equal(params.x_labels, ['+', '-']) @@ -839,6 +872,18 @@ def test_jitter_continuous(self): assert ax.get_xlim() == (-0.5, 1.5) assert [t.get_text() for t in ax.get_xticklabels()] == ['+', '-'] + def test_jitter_axis_labels(self): + ax = scprep.plot.jitter(np.where(self.X_pca[:, 0] > 0, '+', '-'), + self.X_pca[:, 1], + xlabel="test") + assert ax.get_xlabel() == "test" + assert ax.get_ylabel() == '' + ax = scprep.plot.jitter( + pd.Series(np.where(self.X_pca[:, 0] > 0, '+', '-'), name='x'), + pd.Series(self.X_pca[:, 1], name='y'), ylabel="override") + assert ax.get_xlabel() == "x" + assert ax.get_ylabel() == "override" + def test_scatter_dict(self): scprep.plot.scatter2d(self.X_pca, c=np.random.choice( ['hello', 'world'], self.X_pca.shape[0], replace=True), @@ -934,6 +979,13 @@ def test_scatter_axis_labels(self): self.X_pca, label_prefix="test", xlabel="override") assert ax.get_xlabel() == "override" assert ax.get_ylabel() == "test2" + ax = scprep.plot.scatter( + x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), + z=pd.Series(self.X_pca[:,2], name='z'), + ylabel='override') + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == "override" + assert ax.get_zlabel() == "z" def test_scatter_axis_savefig(self): scprep.plot.scatter2d( From 27e574b5ce4df2aeca0cabea27ecca1209ef7a2a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 25 Sep 2019 19:33:12 -0400 Subject: [PATCH 057/125] require statsmodels for testing --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 4cf47d9b..2cb91997 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'coverage', 'coveralls', 'parameterized', + 'statsmodels', ] doc_requires = [ From 635eefe123a58798ce221c527b89ea0beda79016 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 27 Sep 2019 15:34:27 -0400 Subject: [PATCH 058/125] return sample_labels as pd.Series; name library_size and gene expression series; use rangeindex if rangeindex is passed to utils.combine_batches. resolves #56 and resolves #32 --- scprep/measure.py | 5 +++++ scprep/utils.py | 11 ++++++++++- test/test_measure.py | 16 ++++++++++++++++ test/test_utils.py | 13 +++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/scprep/measure.py b/scprep/measure.py index 27a18f8b..bc7d849a 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import warnings import numbers @@ -19,6 +20,8 @@ def library_size(data): Sum over all genes for each cell """ library_size = utils.matrix_sum(data, axis=1) + if isinstance(library_size, pd.Series): + library_size.name = 'library_size' return library_size @@ -59,6 +62,8 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, gene_set_expression = library_size(gene_data) else: gene_set_expression = gene_data + if isinstance(gene_set_expression, pd.Series): + gene_set_expression.name = 'expression' return gene_set_expression diff --git a/scprep/utils.py b/scprep/utils.py index b7c247cc..a6a3f4ab 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -455,7 +455,11 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): " Got {}".format(matrix_type.__name__), UserWarning) elif append_to_cell_names is None: if issubclass(matrix_type, pd.DataFrame): - append_to_cell_names = True + if all([isinstance(d.index, pd.RangeIndex) for d in data]): + # rangeindex should still be a rangeindex + append_to_cell_names = False + else: + append_to_cell_names = True else: append_to_cell_names = False @@ -472,6 +476,11 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): "_" + str(batch_labels[i])) for i, d in enumerate(data)]) data_combined.index = index + elif all([isinstance(d.index, pd.RangeIndex) for d in data]): + # rangeindex should still be a rangeindex + data_combined = data_combined.reset_index(drop=True) + sample_labels = pd.Series(sample_labels, index=data_combined.index, + name='sample_labels') elif issubclass(matrix_type, sparse.spmatrix): data_combined = sparse.vstack(data) elif issubclass(matrix_type, np.ndarray): diff --git a/test/test_measure.py b/test/test_measure.py index 1f54fd69..0f286e02 100644 --- a/test/test_measure.py +++ b/test/test_measure.py @@ -51,3 +51,19 @@ def test_array_all(self): self.X_dense, utils.assert_transform_equals, Y=self.Y, transform=scprep.measure.gene_set_expression, genes=[0]) + + def test_library_size(self): + def test_fun(X): + x = scprep.measure.library_size(X) + assert x.name == 'library_size' + assert np.all(x.index == self.X_dense.index) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) + + def test_library_size(self): + def test_fun(X): + x = scprep.measure.gene_set_expression(X, genes=[0, 1]) + assert x.name == 'expression' + assert np.all(x.index == self.X_dense.index) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) diff --git a/test/test_utils.py b/test/test_utils.py index 9a044cdf..da7934b8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -122,6 +122,8 @@ def test_combine_batches(): assert np.all(Y.index == Y2.index) assert np.all(sample_labels == np.concatenate( [np.repeat(0, X.shape[0]), np.repeat(1, X.shape[0] // 2)])) + assert np.all(sample_labels.index == Y2.index) + assert sample_labels.name == 'sample_labels' Y2, sample_labels = scprep.utils.combine_batches( [X, scprep.select.select_rows( X, idx=np.arange(X.shape[0] // 2))], @@ -131,6 +133,8 @@ def test_combine_batches(): assert np.all(np.core.defchararray.add( "_", sample_labels.astype(str)) == np.array( [i[-2:] for i in Y2.index], dtype=str)) + assert np.all(sample_labels.index == Y2.index) + assert sample_labels.name == 'sample_labels' transform = lambda X: scprep.utils.combine_batches( [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], batch_labels=[0, 1])[0] @@ -141,6 +145,15 @@ def test_combine_batches(): Y=Y, transform=transform, check=utils.assert_all_equal) + def test_fun(X): + Y, sample_labels = scprep.utils.combine_batches( + [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], + batch_labels=[0, 1]) + assert np.all(sample_labels.index == Y.index) + assert sample_labels.name == 'sample_labels' + matrix.test_pandas_matrix_types( + X, + test_fun) def test_combine_batches_uncommon_genes(): From f85181fc3dc7f89193023915310d155db3cbc51a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 27 Sep 2019 15:52:34 -0400 Subject: [PATCH 059/125] return variability and std as series where appropriate --- scprep/measure.py | 8 ++++++-- scprep/utils.py | 8 ++++++++ test/test_measure.py | 10 +++++++++- test/test_utils.py | 11 +++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/scprep/measure.py b/scprep/measure.py index d4eb5359..7c9adcf2 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -89,14 +89,18 @@ def variable_genes(data, span=0.7, interpolate=0.2): variability : list-like, shape=[n_samples] Variability for each gene """ + columns = data.columns if isinstance(data, pd.DataFrame) else None data = utils.to_array_or_spmatrix(data) - data_mean = utils.toarray(np.mean(data, axis=0)).flatten() data_std = utils.matrix_std(data, axis=0) ** 2 + data_mean = utils.toarray(np.mean(data, axis=0)).flatten() delta = np.std(data_std) * interpolate lowess = statsmodels.nonparametric.smoothers_lowess.lowess( data_std, data_mean, delta=delta, frac=span, return_sorted=False) - return data_std - lowess + variability = data_std - lowess + if columns is not None: + variability = pd.Series(variability, index=columns, name='variability') + return variability def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): diff --git a/scprep/utils.py b/scprep/utils.py index c5a17793..16994fb2 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -244,6 +244,12 @@ def matrix_std(data, axis=None): """ if axis not in [0, 1, None]: raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis)) + index = None + if isinstance(data, pd.DataFrame) and axis is not None: + if axis == 1: + index = data.index + elif axis == 0: + index = data.columns data = to_array_or_spmatrix(data) if sparse.issparse(data): if axis is None: @@ -272,6 +278,8 @@ def matrix_std(data, axis=None): std = np.array(std) else: std = np.std(data, axis=axis) + if index is not None: + std = pd.Series(std, index=index, name='std') return std diff --git a/test/test_measure.py b/test/test_measure.py index 0f286e02..15be58a3 100644 --- a/test/test_measure.py +++ b/test/test_measure.py @@ -60,10 +60,18 @@ def test_fun(X): matrix.test_pandas_matrix_types( self.X_dense, test_fun) - def test_library_size(self): + def test_gene_set_expression(self): def test_fun(X): x = scprep.measure.gene_set_expression(X, genes=[0, 1]) assert x.name == 'expression' assert np.all(x.index == self.X_dense.index) matrix.test_pandas_matrix_types( self.X_dense, test_fun) + + def test_variable_genes(self): + def test_fun(X): + x = scprep.measure.variable_genes(X) + assert x.name == 'variability' + assert np.all(x.index == self.X_dense.columns) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) diff --git a/test/test_utils.py b/test/test_utils.py index 4581c224..7c5155eb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -310,6 +310,17 @@ def test_matrix_std(): transform=scprep.utils.matrix_std, axis=None, check=utils.assert_all_close) + X_df = pd.DataFrame(X, index=np.arange(X.shape[0]).astype(str), + columns=np.arange(X.shape[1]).astype(str)) + def test_fun(X): + x = scprep.utils.matrix_std(X, axis=0) + assert x.name == 'std' + assert np.all(x.index == X_df.columns) + x = scprep.utils.matrix_std(X, axis=1) + assert x.name == 'std' + assert np.all(x.index == X_df.index) + matrix.test_pandas_matrix_types( + X_df, test_fun) assert_raise_message(ValueError, "Expected axis in [0, 1, None]. Got 5", scprep.utils.matrix_std, From 5a810fa488e531ec8953d246efd4d98b3953b9b9 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 27 Sep 2019 22:05:08 -0400 Subject: [PATCH 060/125] run tight_layout in non-gui settings, fixes #58 --- scprep/plot/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/plot/utils.py b/scprep/plot/utils.py index 900aa3fb..f0d77e92 100644 --- a/scprep/plot/utils.py +++ b/scprep/plot/utils.py @@ -82,8 +82,8 @@ def show(fig): fig : matplotlib.Figure Figure to show """ + fig.tight_layout() if _mpl_is_gui_backend(): - fig.tight_layout() if platform.system() == "Windows": plt.show(block=True) else: From 95c1fb1622cef050c4817a64a4f7c12000d2e1b2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 27 Sep 2019 22:05:11 -0400 Subject: [PATCH 061/125] use bar plot for scree plot --- scprep/plot/scree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index fa98b2f6..23062f00 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -50,8 +50,8 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, if cumulative: explained_variance = np.cumsum(explained_variance) fig, ax, show_fig = _get_figure(ax, figsize) - ax.plot(np.arange(len(explained_variance)), - explained_variance, **kwargs) + ax.bar(np.arange(len(explained_variance)), + explained_variance, **kwargs) label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) if show_fig: From ecf70bcca55ba8f5aa2c1b19500c0812ceca1aee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 13:18:30 -0400 Subject: [PATCH 062/125] swap order of save and show, additional fix for #58 --- scprep/plot/histogram.py | 4 ++-- scprep/plot/jitter.py | 4 ++-- scprep/plot/scatter.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 8102de66..d770f491 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -124,10 +124,10 @@ def histogram(data, for c in cutoff: ax.axvline(c, color='red') # save and show - if filename is not None: - fig.savefig(filename, dpi=dpi) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax diff --git a/scprep/plot/jitter.py b/scprep/plot/jitter.py index ccf96558..0807f192 100644 --- a/scprep/plot/jitter.py +++ b/scprep/plot/jitter.py @@ -212,8 +212,8 @@ def jitter(labels, values, sigma=0.1, scale=sc.norm) # save and show - if filename is not None: - fig.savefig(filename, dpi=dpi) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index bffed12c..532e76ba 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -668,10 +668,10 @@ def scatter(x, y, z=None, ax.view_init(elev=elev, azim=azim) # save and show - if filename is not None: - fig.savefig(filename, dpi=dpi) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax From 1468061035974a055b314c64d4c4a8e32ae5dfef Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 13:59:21 -0400 Subject: [PATCH 063/125] explained variance should be a percentage --- scprep/plot/scree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index 23062f00..4708d057 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -46,7 +46,7 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, """ with temp_fontsize(fontsize): explained_variance = singular_values ** 2 - explained_variance = explained_variance / explained_variance.sum() + explained_variance = explained_variance / explained_variance.sum() * 100 if cumulative: explained_variance = np.cumsum(explained_variance) fig, ax, show_fig = _get_figure(ax, figsize) From 491502d64a7ea03a71345bc48ecbd4cec462a9f3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 14:01:29 -0400 Subject: [PATCH 064/125] add filename to scree_plot and display variance as percentage --- scprep/plot/scree.py | 10 +++++++++- test/test_plot.py | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index 4708d057..f9021dd7 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -10,7 +10,7 @@ @utils._with_pkg(pkg="matplotlib", min_version=3) def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, xlabel='Principal Component', ylabel='Explained Variance (%)', - fontsize=None, + fontsize=None, filename=None, dpi=None, **kwargs): """Plot the explained variance of each principal component @@ -28,6 +28,12 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, Labels to display on the x and y axis. fontsize : float or None (default: None) Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.plot` Returns @@ -56,4 +62,6 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, label_axis(ax.yaxis, label=ylabel) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax diff --git a/test/test_plot.py b/test/test_plot.py index 6c069a02..dd3c9746 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -743,6 +743,7 @@ def tearDownClass(self): try_remove("test_histogram.png") try_remove("test_library_size.png") try_remove("test_gene_expression.png") + try_remove("test_scree.png") def tearDown(self): plt.close('all') @@ -828,7 +829,8 @@ def test_histogram_invalid_axis(self): def test_scree(self): scprep.plot.scree_plot(self.S) scprep.plot.scree_plot(self.S, cumulative=True, - xlabel="x label", ylabel="y label") + xlabel="x label", ylabel="y label", filename="test_scree.png") + assert os.path.isfile("test_scree.png") def test_scree_custom_axis(self): fig, ax = plt.subplots() From 2fcbe7200346300b54040539e19a2d9cb77a32ee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 14:11:29 -0400 Subject: [PATCH 065/125] fix scree plot x ticks --- scprep/plot/scree.py | 4 +++- test/test_plot.py | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index f9021dd7..9bc0a177 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -1,6 +1,7 @@ import numpy as np from .. import utils +from .._lazyload import matplotlib as mpl from .utils import (_get_figure, show, temp_fontsize) @@ -56,10 +57,11 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, if cumulative: explained_variance = np.cumsum(explained_variance) fig, ax, show_fig = _get_figure(ax, figsize) - ax.bar(np.arange(len(explained_variance)), + ax.bar(np.arange(len(explained_variance)) + 1, explained_variance, **kwargs) label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) + ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True)) if show_fig: show(fig) if filename is not None: diff --git a/test/test_plot.py b/test/test_plot.py index dd3c9746..9db1aa63 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import os +import numbers from sklearn.utils.testing import assert_raise_message, assert_warns_message import unittest import scprep @@ -827,14 +828,17 @@ def test_histogram_invalid_axis(self): self.X, ax="invalid") def test_scree(self): - scprep.plot.scree_plot(self.S) - scprep.plot.scree_plot(self.S, cumulative=True, + ax = scprep.plot.scree_plot(self.S) + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() + ax = scprep.plot.scree_plot(self.S, cumulative=True, xlabel="x label", ylabel="y label", filename="test_scree.png") + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() assert os.path.isfile("test_scree.png") def test_scree_custom_axis(self): fig, ax = plt.subplots() scprep.plot.scree_plot(self.S, ax=ax) + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() def test_scree_invalid_axis(self): assert_raise_message( From d52b3794ad6ac09c517776b5e9051cf3ebde03ff Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 14:14:43 -0400 Subject: [PATCH 066/125] fix scree plot x limits --- scprep/plot/scree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index 9bc0a177..e32e73b7 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -62,6 +62,7 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True)) + ax.set_xlim(0.3, len(explained_variance) + 0.3) if show_fig: show(fig) if filename is not None: From 9fa7d40c523b401775d3582b85c85cc0595c978c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 28 Sep 2019 14:17:34 -0400 Subject: [PATCH 067/125] fix scree plot x limits properly this time --- scprep/plot/scree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index e32e73b7..fc693798 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -62,7 +62,7 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True)) - ax.set_xlim(0.3, len(explained_variance) + 0.3) + ax.set_xlim(0.3, len(explained_variance) + 0.7) if show_fig: show(fig) if filename is not None: From 034cbe66f90b692abb6fda6e7883ba047cebc949 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 12:24:15 -0400 Subject: [PATCH 068/125] better error message for scatter3d, fixes #67 --- scprep/plot/scatter.py | 10 +++++++--- test/test_plot.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 532e76ba..92322f34 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -964,9 +964,13 @@ def scatter3d(data, >>> data[colors == 'a'] += 5 >>> scprep.plot.scatter3d(data, c=colors, cmap={'a' : [1,0,0,1], 'b' : 'xkcd:sky blue'}) """ - return scatter(x=select.select_cols(data, idx=0), - y=select.select_cols(data, idx=1), - z=select.select_cols(data, idx=2), + try: + x = select.select_cols(data, idx=0) + y = select.select_cols(data, idx=1) + z = select.select_cols(data, idx=2) + except IndexError: + raise ValueError("Expected data.shape[1] >= 3. Got {}".format(data.shape[1])) + return scatter(x=x, y=y, z=z, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, ax=ax, legend=legend, colorbar=colorbar, shuffle=shuffle, figsize=figsize, diff --git a/test/test_plot.py b/test/test_plot.py index 9db1aa63..674f6d40 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -1004,6 +1004,20 @@ def test_scatter_viewinit(self): assert ax.elev == 80 assert ax.azim == 270 + def test_scatter3d_data_2d(self): + assert_raise_message( + ValueError, + "Expected data.shape[1] >= 3. Got 2", + scprep.plot.scatter3d, + self.X_pca[:,:2]) + + def test_scatter3d_data_2d_list(self): + assert_raise_message( + ValueError, + "Expected data.shape[1] >= 3. Got 2", + scprep.plot.scatter3d, + self.X_pca[:,:2].tolist()) + def test_scatter_rotate_gif(self): scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, filename="test.gif") From 6abc2fb0bd32e4b4682e48166cf097885c58eb28 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 14:25:35 -0400 Subject: [PATCH 069/125] add slingshot to scprep.run --- scprep/run/__init__.py | 3 +- scprep/run/r_function.py | 37 ++++++++ scprep/run/slingshot.py | 189 +++++++++++++++++++++++++++++++++++++++ scprep/run/splatter.py | 23 ++++- 4 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 scprep/run/slingshot.py diff --git a/scprep/run/__init__.py b/scprep/run/__init__.py index 220af9f1..0a07c96e 100644 --- a/scprep/run/__init__.py +++ b/scprep/run/__init__.py @@ -1,2 +1,3 @@ -from .r_function import RFunction +from .r_function import RFunction, install_bioconductor from .splatter import SplatSimulate +from .slingshot import Slingshot diff --git a/scprep/run/r_function.py b/scprep/run/r_function.py index ff644200..359d2486 100644 --- a/scprep/run/r_function.py +++ b/scprep/run/r_function.py @@ -138,3 +138,40 @@ def __call__(self, *args, rpy_verbose=None, **kwargs): robject = self.convert(robject) self.verbose = default_verbose return robject + + +_install_bioconductor = RFunction( + args="package = character(), site_repository = character(), update = FALSE, version = BiocManager::version()", + body=""" + if (!require('BiocManager')) install.packages("BiocManager") + ask <- !update + BiocManager::install(package, site_repository=site_repository, + update=update, ask=ask, version=version) + """) + +def install_bioconductor(package = None, site_repository = None, update = False, version = None, verbose = True): + """Install a Bioconductor package + + Parameters + ---------- + site_repository : string, optional (default: None) + additional repository in which to look for packages to install. + This repository will be prepended to the default repositories + update : boolean, optional (default: False) + When False, don't attempt to update old packages. + When True, update old packages automatically. + version : string, optional (default: None) + Bioconductor version to install, e.g., version = "3.8". + The special symbol version = "devel" installs the current 'development' version. + If None, installs from the current version. + verbose : boolean, optional (default: True) + Install script verbosity. + """ + kwargs = {'update': update, 'rpy_verbose': verbose} + if package is not None: + kwargs['package'] = package + if site_repository is not None: + kwargs['site_repository'] = site_repository + if version is not None: + kwargs['version'] = version + _install_bioconductor(**kwargs) \ No newline at end of file diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py new file mode 100644 index 00000000..9db61e88 --- /dev/null +++ b/scprep/run/slingshot.py @@ -0,0 +1,189 @@ +import numpy as np + +from .r_function import RFunction, install_bioconductor + + +def install(site_repository = None, update = False, version = None): + """Install the required R packages to run Slingshot + + Parameters + ---------- + site_repository : string, optional (default: None) + additional repository in which to look for packages to install. + This repository will be prepended to the default repositories + update : boolean, optional (default: False) + When False, don't attempt to update old packages. + When True, update old packages automatically. + version : string, optional (default: None) + Bioconductor version to install, e.g., version = "3.8". + The special symbol version = "devel" installs the current 'development' version. + If None, installs from the current version. + """ + install_bioconductor('slingshot', site_repository=site_repository, + update=update, version=version) + + +_Slingshot = RFunction( + setup=""" + library(slingshot) + library(SingleCellExperiment) + """, + args=""" + data, cluster_labels, reduced_dim, + start_cluster = NULL, end_cluster = NULL, + distance = NULL, omega = NULL, lineages = list(), shrink = TRUE, + extend = "y", reweight = TRUE, reassign = TRUE, thresh = 0.001, + max_iter = 15, stretch = 2, + smoother = "smooth.spline", shrink_method = "cosine", + allow_breaks = TRUE, seed = NULL + """, + body=""" + set.seed(seed) + data <- t(as.matrix(data)) + reduced_dim <- as.matrix(reduced_dim) + cluster_labels <- as.factor(cluster_labels) + print(str(data)) + print(str(reduced_dim)) + print(str(cluster_labels)) + + # Create SingleCellExperiment + sce <- SingleCellExperiment(assays = List(counts = data)) + + # Add dim red data and clusters to SCE + reducedDims(sce) <- SimpleList(reduced_dim = reduced_dim) + colData(sce)$cluster_labels <- cluster_labels + + # Run Slingshot + sce <- slingshot(sce, clusterLabels = 'cluster_labels', + reducedDim = 'reduced_dim', start.clus = start_cluster, end.clus = end_cluster, + dist.fun = distance, omega = omega, lineages = lineages, shrink = shrink, + extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, + maxit = max_iter, stretch = stretch, + smoother = smoother, shrink.method = shrink_method, + allow.breaks = allow_breaks) + list(pseudotime = slingPseudotime(sce), + curves = lapply(SlingshotDataSet(sce)@curves, function(curve) curve$s[curve$ord,])) + """) + + +def Slingshot( + data, cluster_labels, reduced_dim, + start_cluster = None, end_cluster = None, + distance = None, omega = None, shrink = True, + extend = "y", reweight = True, reassign = True, thresh = 0.001, + max_iter = 15, stretch = 2, + smoother = "smooth.spline", shrink_method = "cosine", + allow_breaks = True, + seed=None, verbose=1): + """Perform lineage inference with Slingshot + + Given a reduced-dimensional data matrix n by p and a vector of cluster labels + (or matrix of soft cluster assignments, potentially including a -1 label for "unclustered"), + this function performs lineage inference using a cluster-based minimum spanning tree and + constructing simulatenous principal curves for branching paths through the tree. + + For more details, read about Slingshot on [GitHub](https://github.com/kstreet13/slingshot) + and [Bioconductor](https://bioconductor.org/packages/release/bioc/html/slingshot.html). + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + a data object containing the matrix of coordinates to be used for lineage inference. + cluster_labels : list-like, shape=[n_samples] + a vector of cluster labels, optionally including -1's for "unclustered." + reduced_dim : array-like, shape=[n_samples, n_dimensions] + dimensionality reduction on which to display lineage + start_cluster : string, optional (default: None) + indicates the cluster(s) of origin. + Lineages will be represented by paths coming out of this cluster. + end_cluster : string, optional (default: None) + indicates the cluster(s) which will be forced leaf nodes. + This introduces a constraint on the MST algorithm. + distance : callable, optional (default: None) + method for calculating distances between clusters. + Must take two matrices as input, corresponding to subsets of reduced_dim. + If the minimum cluster size is larger than the number dimensions, + the default is to use the joint covariance matrix to find squared distance + between cluster centers. If not, the default is to use the diagonal of the + joint covariance matrix. Not currently implemented + omega : float, optional (default: None) + this granularity parameter determines the distance between every + real cluster and the artificial cluster. + It is parameterized such that this distance is omega / 2, + making omega the maximum distance between two connected clusters. + By default, omega = Inf. + shrink : boolean or float, optional (default: True) + boolean or numeric between 0 and 1, determines whether and how much to shrink + branching lineages toward their average prior to the split. + extend : {'y', 'n', 'pc1'}, optional (default: "y") + how to handle root and leaf clusters of lineages when + constructing the initial, piece-wise linear curve. + reweight : boolean, optional (default: True) + whether to allow cells shared between lineages to be reweighted during curve-fitting. + If True, cells shared between lineages will be iteratively + reweighted based on the quantiles of their projection distances to each curve. + reassign : boolean, optional (default: True) + whether to reassign cells to lineages at each iteration. + If True, cells will be added to a lineage when their + projection distance to the curve is less than the median + distance for all cells currently assigned to the lineage. + Additionally, shared cells will be removed from a lineage if + their projection distance to the curve is above the 90th + percentile and their weight along the curve is less than 0.1. + thresh : float, optional (default: 0.001) + determines the convergence criterion. Percent change in the + total distance from cells to their projections along curves + must be less than thresh. + max_iter : int, optional (default: 15) + maximum number of iterations + stretch : int, optional (default: 2) + factor between 0 and 2 by which curves can be extrapolated beyond endpoints + smoother : {"smooth.spline", "lowess", "periodic_lowess"}, optional (default: "smooth.spline") + choice of smoother. "periodic_lowess" allows one to fit closed + curves. Beware, you may want to use iter = 0 with "lowess". + shrink_method : string, optional (default: "cosine") + how to determine the appropriate amount of shrinkage for a + branching lineage. Accepted values: "gaussian", "rectangular", + "triangular", "epanechnikov", "biweight", "triweight", + "cosine", "optcosine", "density". + allow_breaks : boolean, optional (default: True) + determines whether curves that branch very close to the origin + should be allowed to have different starting points. + seed : int or None, optional (default: None) + Seed to use for generating random numbers. + verbose : int, optional (default: 1) + Logging verbosity between 0 and 2. + + Returns + ------- + pseudotime : array-like, shape=[n_samples, n_curves] + Pseudotime projection of each cell onto each principal curve. + Value is `np.nan` if the cell does not lie on the curve + curves : array_like, shape=[n_curves, n_samples, n_dimensions] + Coordinates of each principle curve in the reduced dimension + """ + if seed is None: + seed = np.random.randint(2**16 - 1) + if distance is not None: + raise NotImplementedError("distance argument not currently implemented") + np.random.seed(seed) + + kwargs = {} + if start_cluster is not None: + kwargs['start_cluster'] = start_cluster + if end_cluster is not None: + kwargs['end_cluster'] = end_cluster + if distance is not None: + kwargs['distance'] = distance + if omega is not None: + kwargs['omega'] = omega + + slingshot = _Slingshot( + data=data, cluster_labels=cluster_labels, reduced_dim=reduced_dim, shrink = shrink, + extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, + max_iter = max_iter, stretch = stretch, + smoother = smoother, shrink_method = shrink_method, + allow_breaks = allow_breaks, **kwargs, + seed=seed, rpy_verbose=verbose) + slingshot['curves'] = np.array(list(slingshot['curves'].values())) + return slingshot['pseudotime'], slingshot['curves'] diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index 2bcaa187..b563fe20 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -1,6 +1,27 @@ import numpy as np -from .r_function import RFunction +from .r_function import RFunction, install_bioconductor + + +def install(site_repository = None, update = False, version = None): + """Install the required R packages to run Splatter + + Parameters + ---------- + site_repository : string, optional (default: None) + additional repository in which to look for packages to install. + This repository will be prepended to the default repositories + update : boolean, optional (default: False) + When False, don't attempt to update old packages. + When True, update old packages automatically. + version : string, optional (default: None) + Bioconductor version to install, e.g., version = "3.8". + The special symbol version = "devel" installs the current 'development' version. + If None, installs from the current version. + """ + install_bioconductor('splatter', site_repository=site_repository, + update=update, version=version) + _SplatSimulate = RFunction( setup=""" From c45c0465b44778043b2ba583081a9aefc2d2888d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 14:43:06 -0400 Subject: [PATCH 070/125] add scprep.io.download --- doc/source/reference.rst | 9 ++++++++ scprep/io/__init__.py | 2 ++ scprep/io/download.py | 44 ++++++++++++++++++++++++++++++++++++++++ test/test_io.py | 10 +++++++++ 4 files changed, 65 insertions(+) create mode 100644 scprep/io/download.py diff --git a/doc/source/reference.rst b/doc/source/reference.rst index 21814d24..386b740d 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -20,6 +20,15 @@ HDF5 :inherited-members: :show-inheritance: +Download +~~~~ + +.. automodule:: scprep.io.download + :autosummary: + :members: + :inherited-members: + :show-inheritance: + Filtering --------- diff --git a/scprep/io/__init__.py b/scprep/io/__init__.py index 18446deb..f1307bd3 100644 --- a/scprep/io/__init__.py +++ b/scprep/io/__init__.py @@ -5,3 +5,5 @@ from .tenx import load_10X, load_10X_zip, load_10X_HDF5 from .fcs import load_fcs from .mtx import load_mtx + +from . import download, hdf5 diff --git a/scprep/io/download.py b/scprep/io/download.py new file mode 100644 index 00000000..b70e3472 --- /dev/null +++ b/scprep/io/download.py @@ -0,0 +1,44 @@ +import requests + +_CHUNK_SIZE = 32768 +_GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download" + +def _save_response_content(response, destination): + global _CHUNK_SIZE + with open(destination, "wb") as f: + for chunk in response.iter_content(_CHUNK_SIZE): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + +def _google_drive_confirm_token(response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + return None + + +def download_google_drive(id, destination): + """Download a file from Google Drive + + Requires the file to be available to view by anyone with the URL. + + Parameters + ---------- + id : string + Google Drive ID string. You can access this by clicking 'Get Shareable Link', + which will give a URL of the form + https://drive.google.com/file/d/**your_file_id**/view?usp=sharing + destination : string + Filename to which to save the downloaded file + """ + global _GOOGLE_DRIVE_URL + + with requests.Session() as session: + response = session.get(_GOOGLE_DRIVE_URL, params = { 'id' : id }, stream = True) + token = _google_drive_confirm_token(response) + + if token: + params = { 'id' : id, 'confirm' : token } + response = session.get(URL, params = params, stream = True) + + _save_response_content(response, destination) diff --git a/test/test_io.py b/test/test_io.py index bb82c8d5..29757b6c 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -482,3 +482,13 @@ def test_parse_header(): ValueError, "Expected 50 entries in {}. Got 100".format(os.path.abspath(header2)), scprep.io.utils._parse_header, header2, 50) + +def test_download_google_drive(): + id = "1_T5bRqbid5mtuDYnyusoGvujc6fW1UKv" + dest = "test.txt" + scprep.io.download.download_google_drive(id, dest) + assert os.path.isfile(dest) + with open(dest, 'r') as f: + data = f.read() + assert data == 'test\n', data + os.remove(dest) From 406af4d890fcf9bf6f927b2325010004726fec4f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 14:50:07 -0400 Subject: [PATCH 071/125] quiet h5py warning --- test/test_hdf5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_hdf5.py b/test/test_hdf5.py index 26831f3c..78c3f319 100644 --- a/test/test_hdf5.py +++ b/test/test_hdf5.py @@ -14,7 +14,7 @@ def test_failed_import_tables(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables assert hdf5_available() is True - with tables.File(h5_file) as f: + with tables.File(h5_file, 'r') as f: assert scprep.io.hdf5._is_tables(f) is False with scprep.io.hdf5.open_file(h5_file) as f: assert scprep.io.hdf5._is_h5py(f) @@ -26,7 +26,7 @@ def test_failed_import_h5py(): h5py = scprep.io.hdf5.h5py del scprep.io.hdf5.h5py assert hdf5_available() is True - with h5py.File(h5_file) as f: + with h5py.File(h5_file, 'r') as f: assert scprep.io.hdf5._is_h5py(f) is False scprep.io.hdf5.h5py = h5py From 198457759c8ca32266a5fcaa7e29b22a78ed2a3a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 14:52:22 -0400 Subject: [PATCH 072/125] fix list of lists scatter3d warning --- scprep/plot/scatter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 92322f34..3f9c7407 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -969,6 +969,8 @@ def scatter3d(data, y = select.select_cols(data, idx=1) z = select.select_cols(data, idx=2) except IndexError: + if isinstance(data, list): + data = utils.toarray(data) raise ValueError("Expected data.shape[1] >= 3. Got {}".format(data.shape[1])) return scatter(x=x, y=y, z=z, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, From e733e26107cad304630be6756e7b0b3db5488fee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 16:17:25 -0400 Subject: [PATCH 073/125] handle list of lists data better --- scprep/plot/scatter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 3f9c7407..44cf58fc 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -803,6 +803,8 @@ def scatter2d(data, >>> data[colors == 'a'] += 10 >>> scprep.plot.scatter2d(data, c=colors, cmap={'a' : [1,0,0,1], 'b' : 'xkcd:sky blue'}) """ + if isinstance(data, list): + data = utils.toarray(data) return scatter(x=select.select_cols(data, idx=0), y=select.select_cols(data, idx=1), c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, @@ -964,13 +966,13 @@ def scatter3d(data, >>> data[colors == 'a'] += 5 >>> scprep.plot.scatter3d(data, c=colors, cmap={'a' : [1,0,0,1], 'b' : 'xkcd:sky blue'}) """ + if isinstance(data, list): + data = utils.toarray(data) try: x = select.select_cols(data, idx=0) y = select.select_cols(data, idx=1) z = select.select_cols(data, idx=2) except IndexError: - if isinstance(data, list): - data = utils.toarray(data) raise ValueError("Expected data.shape[1] >= 3. Got {}".format(data.shape[1])) return scatter(x=x, y=y, z=z, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, From f358e42b6b109bee9ced8289bfdfc9c8f50881c6 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 20:58:50 -0400 Subject: [PATCH 074/125] bump coverage --- test/test_plot.py | 4 ++-- test/test_utils.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/test/test_plot.py b/test/test_plot.py index 674f6d40..2eb3a44c 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -977,12 +977,12 @@ def test_scatter_custom_ticklabels(self): def test_scatter_axis_labels(self): ax = scprep.plot.scatter3d( - self.X_pca, label_prefix="test") + self.X_pca.tolist(), label_prefix="test") assert ax.get_xlabel() == "test1" assert ax.get_ylabel() == "test2" assert ax.get_zlabel() == "test3" ax = scprep.plot.scatter2d( - self.X_pca, label_prefix="test", xlabel="override") + self.X_pca, label_prefix="test", zlabel="override") assert ax.get_xlabel() == "override" assert ax.get_ylabel() == "test2" ax = scprep.plot.scatter( diff --git a/test/test_utils.py b/test/test_utils.py index da7934b8..9ea93f5d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -156,6 +156,16 @@ def test_fun(X): test_fun) +def test_combine_batches_rangeindex(): + X = data.load_10X() + X = X.reset_index(drop=True) + Y = X.iloc[:X.shape[0] // 2] + data_combined, labels = scprep.utils.combine_batches( + [X, Y], ['x', 'y']) + assert isinstance(data_combined.index, pd.RangeIndex) + assert np.all(data_combined.columns == X.columns) + + def test_combine_batches_uncommon_genes(): X = data.load_10X() Y = X.iloc[:, :X.shape[1] // 2] From 2bd45d67701118b93993f2b865fcc21858145349 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 21:31:48 -0400 Subject: [PATCH 075/125] fix label tests --- test/test_plot.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_plot.py b/test/test_plot.py index 2eb3a44c..85f322df 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -982,7 +982,7 @@ def test_scatter_axis_labels(self): assert ax.get_ylabel() == "test2" assert ax.get_zlabel() == "test3" ax = scprep.plot.scatter2d( - self.X_pca, label_prefix="test", zlabel="override") + self.X_pca, label_prefix="test", xlabel="override") assert ax.get_xlabel() == "override" assert ax.get_ylabel() == "test2" ax = scprep.plot.scatter( @@ -992,6 +992,13 @@ def test_scatter_axis_labels(self): assert ax.get_xlabel() == '' assert ax.get_ylabel() == "override" assert ax.get_zlabel() == "z" + ax = scprep.plot.scatter( + x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), + z=pd.Series(self.X_pca[:,2], name='z'), + zlabel='override') + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == "y" + assert ax.get_zlabel() == "override" def test_scatter_axis_savefig(self): scprep.plot.scatter2d( From e5a1e94f40cb651aa3cc6e6f3607eb35f87a36ab Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 21:38:11 -0400 Subject: [PATCH 076/125] extend download underline --- doc/source/reference.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reference.rst b/doc/source/reference.rst index 386b740d..dd91a80f 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -21,7 +21,7 @@ HDF5 :show-inheritance: Download -~~~~ +~~~~~~~~ .. automodule:: scprep.io.download :autosummary: From 4bd8aa4389cdbfdc1f5c898e6705f66834a0421c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 22:36:19 -0400 Subject: [PATCH 077/125] handle pandas input to slingshot, give example, fix vmin/vmax with nan data --- scprep/plot/scatter.py | 4 +-- scprep/run/slingshot.py | 64 +++++++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 92322f34..cd8ae349 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -246,7 +246,7 @@ def vmin(self): if self.constant_c() or self.array_c() or self.discrete: return None else: - return np.min(self.c) + return np.nanmin(self.c) @property def vmax(self): @@ -256,7 +256,7 @@ def vmax(self): if self.constant_c() or self.array_c() or self.discrete: return None else: - return np.max(self.c) + return np.nanmax(self.c) def list_cmap(self): """Is the colormap a list?""" diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index 9db61e88..cc6f7771 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -1,6 +1,8 @@ import numpy as np +import pandas as pd from .r_function import RFunction, install_bioconductor +from .. import utils def install(site_repository = None, update = False, version = None): @@ -26,10 +28,9 @@ def install(site_repository = None, update = False, version = None): _Slingshot = RFunction( setup=""" library(slingshot) - library(SingleCellExperiment) """, args=""" - data, cluster_labels, reduced_dim, + data, cluster_labels, start_cluster = NULL, end_cluster = NULL, distance = NULL, omega = NULL, lineages = list(), shrink = TRUE, extend = "y", reweight = TRUE, reassign = TRUE, thresh = 0.001, @@ -39,23 +40,12 @@ def install(site_repository = None, update = False, version = None): """, body=""" set.seed(seed) - data <- t(as.matrix(data)) - reduced_dim <- as.matrix(reduced_dim) + data <- as.matrix(data) cluster_labels <- as.factor(cluster_labels) - print(str(data)) - print(str(reduced_dim)) - print(str(cluster_labels)) - - # Create SingleCellExperiment - sce <- SingleCellExperiment(assays = List(counts = data)) - - # Add dim red data and clusters to SCE - reducedDims(sce) <- SimpleList(reduced_dim = reduced_dim) - colData(sce)$cluster_labels <- cluster_labels # Run Slingshot - sce <- slingshot(sce, clusterLabels = 'cluster_labels', - reducedDim = 'reduced_dim', start.clus = start_cluster, end.clus = end_cluster, + sce <- slingshot(data, clusterLabels = cluster_labels, + start.clus = start_cluster, end.clus = end_cluster, dist.fun = distance, omega = omega, lineages = lineages, shrink = shrink, extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, maxit = max_iter, stretch = stretch, @@ -67,7 +57,7 @@ def install(site_repository = None, update = False, version = None): def Slingshot( - data, cluster_labels, reduced_dim, + data, cluster_labels, start_cluster = None, end_cluster = None, distance = None, omega = None, shrink = True, extend = "y", reweight = True, reassign = True, thresh = 0.001, @@ -87,12 +77,11 @@ def Slingshot( Parameters ---------- - data : array-like, shape=[n_samples, n_features] - a data object containing the matrix of coordinates to be used for lineage inference. + data : array-like, shape=[n_samples, n_dimensions] + matrix of (reduced dimension) coordinates + to be used for lineage inference. cluster_labels : list-like, shape=[n_samples] a vector of cluster labels, optionally including -1's for "unclustered." - reduced_dim : array-like, shape=[n_samples, n_dimensions] - dimensionality reduction on which to display lineage start_cluster : string, optional (default: None) indicates the cluster(s) of origin. Lineages will be represented by paths coming out of this cluster. @@ -161,29 +150,56 @@ def Slingshot( Value is `np.nan` if the cell does not lie on the curve curves : array_like, shape=[n_curves, n_samples, n_dimensions] Coordinates of each principle curve in the reduced dimension + + Examples + -------- + >>> import scprep + >>> import phate + >>> phate_op = phate.PHATE() + >>> data_phate = phate_op.fit_transform(data) + >>> clusters = phate.cluster.kmeans(phate_op, 6) + >>> pseudotime, curves = scprep.run.Slingshot(data_phate, clusters) + >>> ax = scprep.plot.scatter2d(data_phate, c=pseudotime[:,0], cmap='magma', legend_title='Branch 1') + >>> scprep.plot.scatter2d(data_phate, c=pseudotime[:,1], cmap='viridis', ax=ax, + ... ticks=False, label_prefix='PHATE', legend_title='Branch 2') + >>> for curve in curves: + ... ax.plot(curve[:,0], curve[:,1], c='black') """ if seed is None: seed = np.random.randint(2**16 - 1) if distance is not None: raise NotImplementedError("distance argument not currently implemented") np.random.seed(seed) + + index = data.index if isinstance(data, pd.DataFrame) else None + data = utils.toarray(data) + if data.shape[1] > 3: + warnings.warn("Expected data to be low-dimensional. " + "Got data.shape[1] = {}".format(data.shape[1])) + cluster_labels = utils.toarray(cluster_labels).flatten() + if not cluster_labels.shape[0] == data.shape[0]: + raise ValueError("Expected len(cluster_labels) ({}) to equal " + "data.shape[0] ({})".format(data.shape[0], cluster_labels.shape[0])) + kwargs = {} if start_cluster is not None: kwargs['start_cluster'] = start_cluster if end_cluster is not None: kwargs['end_cluster'] = end_cluster - if distance is not None: - kwargs['distance'] = distance if omega is not None: kwargs['omega'] = omega slingshot = _Slingshot( - data=data, cluster_labels=cluster_labels, reduced_dim=reduced_dim, shrink = shrink, + data=data, cluster_labels=cluster_labels, + shrink = shrink, extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, max_iter = max_iter, stretch = stretch, smoother = smoother, shrink_method = shrink_method, allow_breaks = allow_breaks, **kwargs, seed=seed, rpy_verbose=verbose) slingshot['curves'] = np.array(list(slingshot['curves'].values())) + + if index is not None: + slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index) return slingshot['pseudotime'], slingshot['curves'] From 3afe040ef37a6e8927939105394c435fe6c98c82 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:09:03 -0400 Subject: [PATCH 078/125] add slingshot tests --- scprep/run/r_function.py | 11 ++++-- scprep/run/slingshot.py | 14 +++++--- scprep/run/splatter.py | 6 ++-- test/test_run.py | 74 +++++++++++++++++++++++++++++++++++++++- travis_setup.R | 3 -- 5 files changed, 95 insertions(+), 13 deletions(-) diff --git a/scprep/run/r_function.py b/scprep/run/r_function.py index 359d2486..04cdc2af 100644 --- a/scprep/run/r_function.py +++ b/scprep/run/r_function.py @@ -145,8 +145,15 @@ def __call__(self, *args, rpy_verbose=None, **kwargs): body=""" if (!require('BiocManager')) install.packages("BiocManager") ask <- !update - BiocManager::install(package, site_repository=site_repository, - update=update, ask=ask, version=version) + if (length(package) == 0) { + BiocManager::install(site_repository=site_repository, + update=update, ask=ask, version=version) + } else { + for (pkg in package) { + if (!require(pkg, character.only = TRUE)) BiocManager::install(pkg, site_repository=site_repository, + update=update, ask=ask, version=version) + } + } """) def install_bioconductor(package = None, site_repository = None, update = False, version = None, verbose = True): diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index cc6f7771..ef4a91b3 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -1,11 +1,12 @@ import numpy as np import pandas as pd +import warnings from .r_function import RFunction, install_bioconductor from .. import utils -def install(site_repository = None, update = False, version = None): +def install(site_repository = None, update = False, version = None, verbose = True): """Install the required R packages to run Slingshot Parameters @@ -20,9 +21,11 @@ def install(site_repository = None, update = False, version = None): Bioconductor version to install, e.g., version = "3.8". The special symbol version = "devel" installs the current 'development' version. If None, installs from the current version. + verbose : boolean, optional (default: True) + Install script verbosity. """ install_bioconductor('slingshot', site_repository=site_repository, - update=update, version=version) + update=update, version=version, verbose=verbose) _Slingshot = RFunction( @@ -155,9 +158,9 @@ def Slingshot( -------- >>> import scprep >>> import phate + >>> data, clusters = phate.tree.gen_dla(n_branch=4, n_dim=200, branch_length=200) >>> phate_op = phate.PHATE() >>> data_phate = phate_op.fit_transform(data) - >>> clusters = phate.cluster.kmeans(phate_op, 6) >>> pseudotime, curves = scprep.run.Slingshot(data_phate, clusters) >>> ax = scprep.plot.scatter2d(data_phate, c=pseudotime[:,0], cmap='magma', legend_title='Branch 1') >>> scprep.plot.scatter2d(data_phate, c=pseudotime[:,1], cmap='viridis', ax=ax, @@ -176,11 +179,12 @@ def Slingshot( data = utils.toarray(data) if data.shape[1] > 3: warnings.warn("Expected data to be low-dimensional. " - "Got data.shape[1] = {}".format(data.shape[1])) + "Got data.shape[1] = {}".format(data.shape[1]), + UserWarning) cluster_labels = utils.toarray(cluster_labels).flatten() if not cluster_labels.shape[0] == data.shape[0]: raise ValueError("Expected len(cluster_labels) ({}) to equal " - "data.shape[0] ({})".format(data.shape[0], cluster_labels.shape[0])) + "data.shape[0] ({})".format(cluster_labels.shape[0], data.shape[0])) kwargs = {} if start_cluster is not None: diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index b563fe20..37a50cd0 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -3,7 +3,7 @@ from .r_function import RFunction, install_bioconductor -def install(site_repository = None, update = False, version = None): +def install(site_repository = None, update = False, version = None, verbose = True): """Install the required R packages to run Splatter Parameters @@ -18,9 +18,11 @@ def install(site_repository = None, update = False, version = None): Bioconductor version to install, e.g., version = "3.8". The special symbol version = "devel" installs the current 'development' version. If None, installs from the current version. + verbose : boolean, optional (default: True) + Install script verbosity. """ install_bioconductor('splatter', site_repository=site_repository, - update=update, version=version) + update=update, version=version, verbose=verbose) _SplatSimulate = RFunction( diff --git a/test/test_run.py b/test/test_run.py index f0f2a66a..36742a13 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -1,9 +1,12 @@ from tools import utils, matrix, data import numpy as np +import pandas as pd import scprep import scprep.run.r_function import unittest +import sklearn.cluster import rpy2.rinterface_lib.callbacks +from sklearn.utils.testing import assert_raise_message, assert_warns_message builtin_warning = rpy2.rinterface_lib.callbacks.consolewrite_warnerror @@ -15,7 +18,11 @@ def test_verbose(): assert np.all(fun() == np.array([[1], [2], [3]])) -class TestRFunctions(unittest.TestCase): +class TestSplatter(unittest.TestCase): + + @classmethod + def setUpClass(self): + scprep.run.splatter.install(verbose=False) def test_splatter_default(self): sim = scprep.run.SplatSimulate( @@ -176,3 +183,68 @@ def test_splatter_warning(self): scprep.run.r_function._ConsoleWarning.set_builtin() assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ builtin_warning + + +class TestSlingshot(unittest.TestCase): + + @classmethod + def setUpClass(self): + scprep.run.slingshot.install(verbose=False) + self.X = data.load_10X() + self.X_pca = scprep.reduce.pca(self.X) + self.clusters = sklearn.cluster.KMeans(6).fit_predict(self.X_pca) + + def test_slingshot(self): + pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + + def test_slingshot_pandas(self): + pseudotime, curves = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), + self.clusters, verbose=False) + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + assert np.all(pseudotime.index == self.X.index) + + def test_slingshot_distance(self): + assert_raise_message( + NotImplementedError, + "distance argument not currently implemented", + scprep.run.Slingshot, + self.X_pca, self.clusters, distance=lambda X, Y : np.sum(X-Y)) + + def test_slingshot_optional_args(self): + pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + start_cluster=4, omega=0.1, verbose=False) + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + end_cluster=0, verbose=False) + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + + def test_slingshot_errors(self): + assert_warns_message( + UserWarning, + "Expected data to be low-dimensional. " + "Got data.shape[1] = 4", + scprep.run.Slingshot, + self.X_pca[:, :4], self.clusters, verbose=False) + assert_raise_message( + ValueError, + "Expected len(cluster_labels) ({}) to equal " + "data.shape[0] ({})".format( + self.X.shape[0]//2, self.X.shape[0]), + scprep.run.Slingshot, + self.X_pca[:, :2], self.clusters[:self.X.shape[0]//2], verbose=False) diff --git a/travis_setup.R b/travis_setup.R index 373614b1..2c8f4061 100644 --- a/travis_setup.R +++ b/travis_setup.R @@ -1,6 +1,3 @@ chooseCRANmirror(ind=1) if (!require("remotes")) install.packages("remotes", quietly=TRUE) remotes::update_packages(upgrade="always") -if (!require("BiocManager")) install.packages("BiocManager", quietly=TRUE) -BiocManager::install(update=TRUE, ask=FALSE) -if (!require("splatter")) BiocManager::install("splatter", quietly=TRUE) From a5274a41e47ba8b54c9dbb759332046ed0a76c6d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:26:23 -0400 Subject: [PATCH 079/125] return branch assignments --- scprep/run/slingshot.py | 9 ++++++++- test/test_run.py | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index ef4a91b3..9f0d53d7 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -151,6 +151,8 @@ def Slingshot( pseudotime : array-like, shape=[n_samples, n_curves] Pseudotime projection of each cell onto each principal curve. Value is `np.nan` if the cell does not lie on the curve + branch : list-like, shape=[n_samples] + Branch assignment for each cell curves : array_like, shape=[n_curves, n_samples, n_dimensions] Coordinates of each principle curve in the reduced dimension @@ -204,6 +206,11 @@ def Slingshot( seed=seed, rpy_verbose=verbose) slingshot['curves'] = np.array(list(slingshot['curves'].values())) + membership = (~np.isnan(slingshot['pseudotime'])).astype(int) + branch = np.sum(membership * (2**np.arange(membership.shape[1])), axis=1) + branch, _ = pd.factorize(branch) + if index is not None: slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index) - return slingshot['pseudotime'], slingshot['curves'] + branch = pd.Series(branch, name='branch', index=index) + return slingshot['pseudotime'], branch, slingshot['curves'] diff --git a/test/test_run.py b/test/test_run.py index 36742a13..dd09a113 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -195,22 +195,32 @@ def setUpClass(self): self.clusters = sklearn.cluster.KMeans(6).fit_predict(self.X_pca) def test_slingshot(self): - pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) + pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) def test_slingshot_pandas(self): - pseudotime, curves = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), + pseudotime, branch, curves = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), self.clusters, verbose=False) + assert np.all(pseudotime.index == self.X.index) + assert np.all(branch.index == self.X.index) + assert branch.name == 'branch' assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime.loc[branch==i]) + assert np.all(branch_membership == branch_membership.iloc[0]) assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) - assert np.all(pseudotime.index == self.X.index) def test_slingshot_distance(self): assert_raise_message( @@ -220,16 +230,24 @@ def test_slingshot_distance(self): self.X_pca, self.clusters, distance=lambda X, Y : np.sum(X-Y)) def test_slingshot_optional_args(self): - pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, start_cluster=4, omega=0.1, verbose=False) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 - pseudotime, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, end_cluster=0, verbose=False) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) From 2c219d3537557e62ea7c423105d83b121e9466b2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:31:56 -0400 Subject: [PATCH 080/125] add rgl packages --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index ec599d97..83b65678 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,6 +17,9 @@ - gfortran - libblas-dev - liblapack-dev + - libglu1-mesa-dev + - freeglut3-dev + - mesa-common-dev cache: - pip From c06d95dcc163dc180ca1c3bd9ed55923a4c29d61 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:32:09 -0400 Subject: [PATCH 081/125] update even if available when update=TRUE --- scprep/run/r_function.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scprep/run/r_function.py b/scprep/run/r_function.py index 04cdc2af..12df8154 100644 --- a/scprep/run/r_function.py +++ b/scprep/run/r_function.py @@ -150,8 +150,10 @@ def __call__(self, *args, rpy_verbose=None, **kwargs): update=update, ask=ask, version=version) } else { for (pkg in package) { - if (!require(pkg, character.only = TRUE)) BiocManager::install(pkg, site_repository=site_repository, - update=update, ask=ask, version=version) + if (update || !require(pkg, character.only = TRUE)) { + BiocManager::install(pkg, site_repository=site_repository, + update=update, ask=ask, version=version) + } } } """) From ccd1a119af8ce5cc09f9fa19c322069ee1ee9e24 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:50:58 -0400 Subject: [PATCH 082/125] install libgsl --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 83b65678..96ed5e04 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,6 +20,7 @@ - libglu1-mesa-dev - freeglut3-dev - mesa-common-dev + - libgsl-dev cache: - pip From ebbcdfbc29bda5f355c8a21715434de5791b7566 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Oct 2019 23:58:15 -0400 Subject: [PATCH 083/125] order branches by average pseudotime --- scprep/run/slingshot.py | 20 ++++++++++++++++++-- test/test_run.py | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index 9f0d53d7..e15c38e6 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -163,12 +163,16 @@ def Slingshot( >>> data, clusters = phate.tree.gen_dla(n_branch=4, n_dim=200, branch_length=200) >>> phate_op = phate.PHATE() >>> data_phate = phate_op.fit_transform(data) - >>> pseudotime, curves = scprep.run.Slingshot(data_phate, clusters) + >>> pseudotime, branch, curves = scprep.run.Slingshot(data_phate, clusters) >>> ax = scprep.plot.scatter2d(data_phate, c=pseudotime[:,0], cmap='magma', legend_title='Branch 1') >>> scprep.plot.scatter2d(data_phate, c=pseudotime[:,1], cmap='viridis', ax=ax, ... ticks=False, label_prefix='PHATE', legend_title='Branch 2') >>> for curve in curves: ... ax.plot(curve[:,0], curve[:,1], c='black') + >>> ax = scprep.plot.scatter2d(data_phate, c=branch, legend_title='Branch', + ... ticks=False, label_prefix='PHATE') + >>> for curve in curves: + ... ax.plot(curve[:,0], curve[:,1], c='black') """ if seed is None: seed = np.random.randint(2**16 - 1) @@ -208,7 +212,19 @@ def Slingshot( membership = (~np.isnan(slingshot['pseudotime'])).astype(int) branch = np.sum(membership * (2**np.arange(membership.shape[1])), axis=1) - branch, _ = pd.factorize(branch) + # reorder based on pseudotime + branch_ids = np.unique(branch) + branch_means = [np.nanmean(slingshot['pseudotime'][branch==id]) + if not np.all(np.isnan(slingshot['pseudotime'][branch==id])) else np.nan + for id in branch_ids] + branch_order = np.argsort(branch_means) + branch_old = branch.copy() + for i in range(len(branch_order)): + j = branch_order[i] + if np.isnan(branch_means[j]): + branch[branch_old == branch_ids[j]] = -1 + else: + branch[branch_old == branch_ids[j]] = i if index is not None: slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index) diff --git a/test/test_run.py b/test/test_run.py index dd09a113..3bc81ca1 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -199,9 +199,13 @@ def test_slingshot(self): assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 for i in np.unique(branch): branch_membership = np.isnan(pseudotime[branch==i]) assert np.all(branch_membership == branch_membership[0]) + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) @@ -215,9 +219,13 @@ def test_slingshot_pandas(self): assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 for i in np.unique(branch): branch_membership = np.isnan(pseudotime.loc[branch==i]) assert np.all(branch_membership == branch_membership.iloc[0]) + new_pseudotime = np.nanmean(np.nanmean(pseudotime.loc[branch==i])) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) @@ -235,9 +243,16 @@ def test_slingshot_optional_args(self): assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 for i in np.unique(branch): branch_membership = np.isnan(pseudotime[branch==i]) assert np.all(branch_membership == branch_membership[0]) + if np.all(np.isnan(pseudotime[branch==i])): + assert i == -1 + else: + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, @@ -245,9 +260,13 @@ def test_slingshot_optional_args(self): assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 for i in np.unique(branch): branch_membership = np.isnan(pseudotime[branch==i]) assert np.all(branch_membership == branch_membership[0]) + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 assert np.all(np.any(~np.isnan(pseudotime), axis=1)) From 37b9dd71b0f61a336b6ccab212a3d8d43e0d920f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 00:10:39 -0400 Subject: [PATCH 084/125] fix redundant sce conversion --- scprep/run/slingshot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index e15c38e6..a317bb88 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -47,15 +47,15 @@ def install(site_repository = None, update = False, version = None, verbose = Tr cluster_labels <- as.factor(cluster_labels) # Run Slingshot - sce <- slingshot(data, clusterLabels = cluster_labels, + sling <- slingshot(data, clusterLabels = cluster_labels, start.clus = start_cluster, end.clus = end_cluster, dist.fun = distance, omega = omega, lineages = lineages, shrink = shrink, extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, maxit = max_iter, stretch = stretch, smoother = smoother, shrink.method = shrink_method, allow.breaks = allow_breaks) - list(pseudotime = slingPseudotime(sce), - curves = lapply(SlingshotDataSet(sce)@curves, function(curve) curve$s[curve$ord,])) + list(pseudotime = slingPseudotime(sling), + curves = lapply(sling@curves, function(curve) curve$s[curve$ord,])) """) From 0861a2f6a94c78c01cc516c2fe80fbfd26f0d1c7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 11:22:24 -0400 Subject: [PATCH 085/125] use median variability to fit loess --- scprep/filter.py | 5 ++++- scprep/measure.py | 17 +++++++++++++---- scprep/plot/histogram.py | 5 ++++- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/scprep/filter.py b/scprep/filter.py index e56d06b2..c1aa0dbf 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -371,7 +371,7 @@ def filter_duplicates(data, *extra_data, sample_labels=None): return data -def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, +def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_size=0.05, cutoff=None, percentile=80): """Filter all genes with low variability @@ -388,6 +388,9 @@ def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, interpolate : float, optional (default: 0.2) Multiple of the standard deviation of variances at which to interpolate linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float, the width is given by + kernel_size * data.shape[1] cutoff : float, optional (default: None) Variability above which expression is deemed significant percentile : int, optional (Default: 80) diff --git a/scprep/measure.py b/scprep/measure.py index 7c9adcf2..31a881a3 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -2,6 +2,7 @@ import pandas as pd import warnings import numbers +import scipy.signal from . import utils, select from ._lazyload import statsmodels @@ -69,10 +70,11 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, @utils._with_pkg(pkg="statsmodels") -def variable_genes(data, span=0.7, interpolate=0.2): +def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05): """Measure the variability of each gene in a dataset - Variability is computed as the deviation from a loess fit of the mean-variance curve + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve Parameters ---------- @@ -83,6 +85,9 @@ def variable_genes(data, span=0.7, interpolate=0.2): interpolate : float, optional (default: 0.2) Multiple of the standard deviation of variances at which to interpolate linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float, the width is given by + kernel_size * data.shape[1] Returns ------- @@ -92,10 +97,14 @@ def variable_genes(data, span=0.7, interpolate=0.2): columns = data.columns if isinstance(data, pd.DataFrame) else None data = utils.to_array_or_spmatrix(data) data_std = utils.matrix_std(data, axis=0) ** 2 + kernel_size = 2*(int(kernel_size * len(data_std))//2)+1 + order = np.argsort(data_std) + data_std_med = np.empty_like(data_std) + data_std_med[order] = scipy.signal.medfilt(data_std[order], kernel_size=kernel_size) data_mean = utils.toarray(np.mean(data, axis=0)).flatten() - delta = np.std(data_std) * interpolate + delta = np.std(data_std_med) * interpolate lowess = statsmodels.nonparametric.smoothers_lowess.lowess( - data_std, data_mean, + data_std_med, data_mean, delta=delta, frac=span, return_sorted=False) variability = data_std - lowess if columns is not None: diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index c2fc61d2..81544b24 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -291,7 +291,7 @@ def plot_gene_set_expression(data, genes=None, @utils._with_pkg(pkg="matplotlib", min_version=3) -def plot_variable_genes(data, span=0.7, interpolate=0.2, +def plot_variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, bins=100, log=False, cutoff=None, percentile=None, ax=None, figsize=None, @@ -314,6 +314,9 @@ def plot_variable_genes(data, span=0.7, interpolate=0.2, interpolate : float, optional (default: 0.2) Multiple of the standard deviation of variances at which to interpolate linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float, the width is given by + kernel_size * data.shape[1] bins : int, optional (default: 100) Number of bins to draw in the histogram log : bool, or {'x', 'y'}, optional (default: False) From d6af6170721c2e5509c778ea77a064843a8e0521 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 12:02:16 -0400 Subject: [PATCH 086/125] add rolling median filter to variable_genes, switch plot to scatterplot --- scprep/filter.py | 6 ++- scprep/measure.py | 13 ++++-- scprep/plot/__init__.py | 3 +- scprep/plot/histogram.py | 83 ----------------------------------- scprep/plot/variable_genes.py | 71 ++++++++++++++++++++++++++++++ test/test_plot.py | 9 +--- 6 files changed, 87 insertions(+), 98 deletions(-) create mode 100644 scprep/plot/variable_genes.py diff --git a/scprep/filter.py b/scprep/filter.py index c1aa0dbf..d3ff7366 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -375,7 +375,8 @@ def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_s cutoff=None, percentile=80): """Filter all genes with low variability - Variability is computed as the deviation from a loess fit of the mean-variance curve + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve Parameters ---------- @@ -405,7 +406,8 @@ def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_s extra_data : array-like, shape=[any, m_features] Filtered extra data, if passed. """ - var_genes = measure.variable_genes(data, span=span, interpolate=interpolate) + var_genes = measure.variable_genes(data, span=span, interpolate=interpolate, + kernel_size=kernel_size) keep_cells_idx = _get_filter_idx(var_genes, cutoff, percentile, keep_cells='above') diff --git a/scprep/measure.py b/scprep/measure.py index 31a881a3..dbcc71f2 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -70,7 +70,7 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, @utils._with_pkg(pkg="statsmodels") -def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05): +def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False): """Measure the variability of each gene in a dataset Variability is computed as the deviation from a loess fit @@ -88,6 +88,8 @@ def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05): kernel_size : float or int, optional (default: 0.05) Width of rolling median window. If a float, the width is given by kernel_size * data.shape[1] + return_means : boolean, optional (default: False) + If True, return the gene means Returns ------- @@ -106,10 +108,13 @@ def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05): lowess = statsmodels.nonparametric.smoothers_lowess.lowess( data_std_med, data_mean, delta=delta, frac=span, return_sorted=False) - variability = data_std - lowess + result = data_std - lowess if columns is not None: - variability = pd.Series(variability, index=columns, name='variability') - return variability + result = pd.Series(result, index=columns, name='variability') + data_mean = pd.Series(data_mean, index=columns, name='mean') + if return_means: + result = result, data_mean + return result def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): diff --git a/scprep/plot/__init__.py b/scprep/plot/__init__.py index 92129998..57deaa6b 100644 --- a/scprep/plot/__init__.py +++ b/scprep/plot/__init__.py @@ -1,6 +1,7 @@ from .scatter import scatter, scatter2d, scatter3d, rotate_scatter3d -from .histogram import histogram, plot_library_size, plot_gene_set_expression, plot_variable_genes +from .histogram import histogram, plot_library_size, plot_gene_set_expression from .marker import marker_plot from .scree import scree_plot from .jitter import jitter +from .variable_genes import plot_variable_genes from . import tools, colors diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 81544b24..8102de66 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -288,86 +288,3 @@ def plot_gene_set_expression(data, genes=None, bins=bins, log=log, ax=ax, figsize=figsize, xlabel=xlabel, title=title, fontsize=fontsize, filename=filename, dpi=dpi, **kwargs) - - -@utils._with_pkg(pkg="matplotlib", min_version=3) -def plot_variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, - bins=100, log=False, - cutoff=None, percentile=None, - ax=None, figsize=None, - xlabel='Gene variability', - ylabel='Number of genes', - title=None, - fontsize=None, - filename=None, - dpi=None, **kwargs): - """Plot the histogram of gene variability - - Variability is computed as the deviation from a loess fit of the mean-variance curve - - Parameters - ---------- - data : array-like, shape=[n_samples, n_features] - Input data. Multiple datasets may be given as a list of array-likes. - span : float, optional (default: 0.7) - Fraction of genes to use when computing the loess estimate at each point - interpolate : float, optional (default: 0.2) - Multiple of the standard deviation of variances at which to interpolate - linearly in order to reduce computation time. - kernel_size : float or int, optional (default: 0.05) - Width of rolling median window. If a float, the width is given by - kernel_size * data.shape[1] - bins : int, optional (default: 100) - Number of bins to draw in the histogram - log : bool, or {'x', 'y'}, optional (default: False) - If True, plot both axes on a log scale. If 'x' or 'y', - only plot the given axis on a log scale. If False, - plot both axes on a linear scale. - cutoff : float or `None`, optional (default: `None`) - Absolute cutoff at which to draw a vertical line. - Only one of `cutoff` and `percentile` may be given. - percentile : float or `None`, optional (default: `None`) - Percentile between 0 and 100 at which to draw a vertical line. - Only one of `cutoff` and `percentile` may be given. - library_size_normalize : bool, optional (default: False) - Divide gene set expression by library size - ax : `matplotlib.Axes` or None, optional (default: None) - Axis to plot on. If None, a new axis will be created. - figsize : tuple or None, optional (default: None) - If not None, sets the figure size (width, height) - [x,y]label : str, optional - Labels to display on the x and y axis. - title : str or None, optional (default: None) - Axis title. - fontsize : float or None (default: None) - Base font size. - filename : str or None (default: None) - file to which the output is saved - dpi : int or None, optional (default: None) - The resolution in dots per inch. If None it will default to the value - savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi - to be the value of the figure. Only used if filename is not None. - **kwargs : additional arguments for `matplotlib.pyplot.hist` - - Returns - ------- - ax : `matplotlib.Axes` - axis on which plot was drawn - """ - if hasattr(data, 'shape') and len(data.shape) == 2: - var_genes = measure.variable_genes( - data, span=span, interpolate=interpolate) - else: - data_array = utils.to_array_or_spmatrix(data) - if len(data_array.shape) == 2 and data_array.dtype.type is not np.object_: - var_genes = measure.variable_genes( - data_array, span=span, interpolate=interpolate) - else: - var_genes = [measure.variable_genes( - d, span=span, interpolate=interpolate) - for d in data] - return histogram(var_genes, - cutoff=cutoff, percentile=percentile, - bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, - filename=filename, dpi=dpi, **kwargs) diff --git a/scprep/plot/variable_genes.py b/scprep/plot/variable_genes.py new file mode 100644 index 00000000..ae4429e6 --- /dev/null +++ b/scprep/plot/variable_genes.py @@ -0,0 +1,71 @@ +from .scatter import scatter +from .. import utils, measure +from ..filter import _get_filter_idx + + +@utils._with_pkg(pkg="matplotlib", min_version=3) +def plot_variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, + cutoff=None, percentile=90, + ax=None, figsize=None, + xlabel='Gene mean', + ylabel='Standardized variance', + title=None, + fontsize=None, + filename=None, + dpi=None, **kwargs): + """Plot the histogram of gene variability + + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data. Multiple datasets may be given as a list of array-likes. + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float, the width is given by + kernel_size * data.shape[1] + cutoff : float or `None`, optional (default: `None`) + Absolute cutoff at which to draw a vertical line. + Only one of `cutoff` and `percentile` may be given. + percentile : float or `None`, optional (default: 90) + Percentile between 0 and 100 at which to draw a vertical line. + Only one of `cutoff` and `percentile` may be given. + ax : `matplotlib.Axes` or None, optional (default: None) + Axis to plot on. If None, a new axis will be created. + figsize : tuple or None, optional (default: None) + If not None, sets the figure size (width, height) + [x,y]label : str, optional + Labels to display on the x and y axis. + title : str or None, optional (default: None) + Axis title. + fontsize : float or None (default: None) + Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. + **kwargs : additional arguments for `matplotlib.pyplot.hist` + + Returns + ------- + ax : `matplotlib.Axes` + axis on which plot was drawn + """ + variability, means = measure.variable_genes(data, span=span, interpolate=interpolate, + kernel_size=kernel_size, return_means=True) + keep_cells_idx = _get_filter_idx(variability, + cutoff, percentile, + keep_cells='above') + return scatter(means, variability, c=keep_cells_idx, + cmap={True : 'red', False : 'black'}, + xlabel=xlabel, ylabel=ylabel, title=title, + fontsize=fontsize, filename=filename, dpi=dpi, + **kwargs) diff --git a/test/test_plot.py b/test/test_plot.py index b5b8438f..a20c6bdf 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -803,14 +803,7 @@ def test_plot_gene_set_expression_single_gene(self): def test_plot_variable_genes(self): scprep.plot.plot_variable_genes( self.X, - color='r') - - def test_plot_variable_genes_multiple(self): - scprep.plot.plot_variable_genes([ - self.X, scprep.select.select_rows( - self.X, idx=np.arange(self.X.shape[0] // 2))], - filename="test_variable_genes.png", - color=['r', 'b']) + filename="test_variable_genes.png") assert os.path.exists("test_variable_genes.png") def test_variable_genes_list_of_lists(self): From 5174a4a8d83b974624c88d59b585dbf9ff55a350 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 12:24:58 -0400 Subject: [PATCH 087/125] rework highly variable genes api --- scprep/filter.py | 64 ++--------------------------------- scprep/measure.py | 43 +---------------------- scprep/plot/__init__.py | 2 +- scprep/plot/histogram.py | 2 +- scprep/plot/variable_genes.py | 29 ++++++++-------- scprep/select.py | 44 ++++++++++++++++++++++++ scprep/utils.py | 57 +++++++++++++++++++++++++++++++ test/test_filter.py | 10 ------ test/test_measure.py | 2 +- test/test_plot.py | 4 +-- test/test_select.py | 10 ++++++ 11 files changed, 134 insertions(+), 133 deletions(-) diff --git a/scprep/filter.py b/scprep/filter.py index d3ff7366..e5e12a30 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -120,21 +120,6 @@ def filter_empty_cells(data, *extra_data, sample_labels=None): return data -def _get_filter_idx(values, - cutoff, percentile, - keep_cells): - cutoff = measure._get_percentile_cutoff( - values, cutoff, percentile, required=True) - if keep_cells == 'above': - keep_cells_idx = values > cutoff - elif keep_cells == 'below': - keep_cells_idx = values < cutoff - else: - raise ValueError("Expected `keep_cells` in ['above', 'below']. " - "Got {}".format(keep_cells)) - return keep_cells_idx - - def filter_values(data, *extra_data, values=None, cutoff=None, percentile=None, keep_cells='above', @@ -188,9 +173,9 @@ def filter_values(data, *extra_data, values=None, "Filtering as a single sample.", DeprecationWarning) assert values is not None - keep_cells_idx = _get_filter_idx(values, - cutoff, percentile, - keep_cells) + keep_cells_idx = utils._get_filter_idx(values, + cutoff, percentile, + keep_cells) if return_values: extra_data = [values] + list(extra_data) data = select.select_rows(data, *extra_data, idx=keep_cells_idx) @@ -369,46 +354,3 @@ def filter_duplicates(data, *extra_data, sample_labels=None): unique_idx = _find_unique_cells(data) data = select.select_rows(data, *extra_data, idx=unique_idx) return data - - -def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_size=0.05, - cutoff=None, percentile=80): - """Filter all genes with low variability - - Variability is computed as the deviation from a loess fit - to the rolling median of the mean-variance curve - - Parameters - ---------- - data : array-like, shape=[n_samples, n_features] - Input data - extra_data : array-like, shape=[any, n_features], optional - Optional additional data objects from which to select the same rows - span : float, optional (default: 0.7) - Fraction of genes to use when computing the loess estimate at each point - interpolate : float, optional (default: 0.2) - Multiple of the standard deviation of variances at which to interpolate - linearly in order to reduce computation time. - kernel_size : float or int, optional (default: 0.05) - Width of rolling median window. If a float, the width is given by - kernel_size * data.shape[1] - cutoff : float, optional (default: None) - Variability above which expression is deemed significant - percentile : int, optional (Default: 80) - Percentile above or below which to remove genes. - Must be an integer between 0 and 100. Only one of `cutoff` - and `percentile` should be specified. - - Returns - ------- - data : array-like, shape=[n_samples, m_features] - Filtered output data, where m_features <= n_features - extra_data : array-like, shape=[any, m_features] - Filtered extra data, if passed. - """ - var_genes = measure.variable_genes(data, span=span, interpolate=interpolate, - kernel_size=kernel_size) - keep_cells_idx = _get_filter_idx(var_genes, - cutoff, percentile, - keep_cells='above') - return select.select_cols(data, *extra_data, idx=keep_cells_idx) diff --git a/scprep/measure.py b/scprep/measure.py index dbcc71f2..8b79530c 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -70,7 +70,7 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, @utils._with_pkg(pkg="statsmodels") -def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False): +def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False): """Measure the variability of each gene in a dataset Variability is computed as the deviation from a loess fit @@ -115,44 +115,3 @@ def variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_mea if return_means: result = result, data_mean return result - - -def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): - """Get a cutoff for a dataset - - Parameters - ---------- - data : array-like - cutoff : float or None, optional (default: None) - Absolute cutoff value. Only one of cutoff and percentile may be given - percentile : float or None, optional (default: None) - Percentile cutoff value between 0 and 100. - Only one of cutoff and percentile may be given - required : bool, optional (default: False) - If True, one of cutoff and percentile must be given. - - Returns - ------- - cutoff : float or None - Absolute cutoff value. Can only be None if required is False and - cutoff and percentile are both None. - """ - if percentile is not None: - if cutoff is not None: - raise ValueError( - "Only one of `cutoff` and `percentile` should be given." - "Got cutoff={}, percentile={}".format(cutoff, percentile)) - if not isinstance(percentile, numbers.Number): - return [_get_percentile_cutoff(data, percentile=p) - for p in percentile] - if percentile < 1: - warnings.warn( - "`percentile` expects values between 0 and 100." - "Got {}. Did you mean {}?".format(percentile, - percentile * 100), - UserWarning) - cutoff = np.percentile(np.array(data).reshape(-1), percentile) - elif cutoff is None and required: - raise ValueError( - "One of either `cutoff` or `percentile` must be given.") - return cutoff diff --git a/scprep/plot/__init__.py b/scprep/plot/__init__.py index 57deaa6b..582a8a0d 100644 --- a/scprep/plot/__init__.py +++ b/scprep/plot/__init__.py @@ -3,5 +3,5 @@ from .marker import marker_plot from .scree import scree_plot from .jitter import jitter -from .variable_genes import plot_variable_genes +from .variable_genes import plot_gene_variability from . import tools, colors diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 8102de66..0330b4c2 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -115,7 +115,7 @@ def histogram(data, if title is not None: ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) - cutoff = measure._get_percentile_cutoff( + cutoff = utils._get_percentile_cutoff( data, cutoff, percentile, required=False) if cutoff is not None: if isinstance(cutoff, numbers.Number): diff --git a/scprep/plot/variable_genes.py b/scprep/plot/variable_genes.py index ae4429e6..c5fc6b43 100644 --- a/scprep/plot/variable_genes.py +++ b/scprep/plot/variable_genes.py @@ -1,18 +1,17 @@ from .scatter import scatter from .. import utils, measure -from ..filter import _get_filter_idx @utils._with_pkg(pkg="matplotlib", min_version=3) -def plot_variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, - cutoff=None, percentile=90, - ax=None, figsize=None, - xlabel='Gene mean', - ylabel='Standardized variance', - title=None, - fontsize=None, - filename=None, - dpi=None, **kwargs): +def plot_gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, + cutoff=None, percentile=90, + ax=None, figsize=None, + xlabel='Gene mean', + ylabel='Standardized variance', + title=None, + fontsize=None, + filename=None, + dpi=None, **kwargs): """Plot the histogram of gene variability Variability is computed as the deviation from a loess fit @@ -59,11 +58,11 @@ def plot_variable_genes(data, span=0.7, interpolate=0.2, kernel_size=0.05, ax : `matplotlib.Axes` axis on which plot was drawn """ - variability, means = measure.variable_genes(data, span=span, interpolate=interpolate, - kernel_size=kernel_size, return_means=True) - keep_cells_idx = _get_filter_idx(variability, - cutoff, percentile, - keep_cells='above') + variability, means = measure.gene_variability(data, span=span, interpolate=interpolate, + kernel_size=kernel_size, return_means=True) + keep_cells_idx = utils._get_filter_idx(variability, + cutoff, percentile, + keep_cells='above') return scatter(means, variability, c=keep_cells_idx, cmap={True : 'red', False : 'black'}, xlabel=xlabel, ylabel=ylabel, title=title, diff --git a/scprep/select.py b/scprep/select.py index ec5df059..edf5b1de 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -497,3 +497,47 @@ def subsample(*data, n=10000, seed=None): select_idx = np.random.choice(N, n, replace=False) data = [select_rows(d, idx=select_idx) for d in data] return tuple(data) if len(data) > 1 else data[0] + + +def highly_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_size=0.05, + cutoff=None, percentile=80): + """Filter all genes with low variability + + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + extra_data : array-like, shape=[any, n_features], optional + Optional additional data objects from which to select the same rows + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float, the width is given by + kernel_size * data.shape[1] + cutoff : float, optional (default: None) + Variability above which expression is deemed significant + percentile : int, optional (Default: 80) + Percentile above or below which to remove genes. + Must be an integer between 0 and 100. Only one of `cutoff` + and `percentile` should be specified. + + Returns + ------- + data : array-like, shape=[n_samples, m_features] + Filtered output data, where m_features <= n_features + extra_data : array-like, shape=[any, m_features] + Filtered extra data, if passed. + """ + from . import measure + var_genes = measure.gene_variability(data, span=span, interpolate=interpolate, + kernel_size=kernel_size) + keep_cells_idx = utils._get_filter_idx(var_genes, + cutoff, percentile, + keep_cells='above') + return select_cols(data, *extra_data, idx=keep_cells_idx) diff --git a/scprep/utils.py b/scprep/utils.py index 16994fb2..4099128b 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -78,6 +78,63 @@ def _with_pkg(fun, pkg=None, min_version=None, *args, **kwargs): return fun(*args, **kwargs) +def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): + """Get a cutoff for a dataset + + Parameters + ---------- + data : array-like + cutoff : float or None, optional (default: None) + Absolute cutoff value. Only one of cutoff and percentile may be given + percentile : float or None, optional (default: None) + Percentile cutoff value between 0 and 100. + Only one of cutoff and percentile may be given + required : bool, optional (default: False) + If True, one of cutoff and percentile must be given. + + Returns + ------- + cutoff : float or None + Absolute cutoff value. Can only be None if required is False and + cutoff and percentile are both None. + """ + if percentile is not None: + if cutoff is not None: + raise ValueError( + "Only one of `cutoff` and `percentile` should be given." + "Got cutoff={}, percentile={}".format(cutoff, percentile)) + if not isinstance(percentile, numbers.Number): + return [_get_percentile_cutoff(data, percentile=p) + for p in percentile] + if percentile < 1: + warnings.warn( + "`percentile` expects values between 0 and 100." + "Got {}. Did you mean {}?".format(percentile, + percentile * 100), + UserWarning) + cutoff = np.percentile(np.array(data).reshape(-1), percentile) + elif cutoff is None and required: + raise ValueError( + "One of either `cutoff` or `percentile` must be given.") + return cutoff + + + +def _get_filter_idx(values, + cutoff, percentile, + keep_cells): + cutoff = _get_percentile_cutoff( + values, cutoff, percentile, required=True) + if keep_cells == 'above': + keep_cells_idx = values > cutoff + elif keep_cells == 'below': + keep_cells_idx = values < cutoff + else: + raise ValueError("Expected `keep_cells` in ['above', 'below']. " + "Got {}".format(keep_cells)) + return keep_cells_idx + + def toarray(x): """Convert an array-like to a np.ndarray diff --git a/test/test_filter.py b/test/test_filter.py index 31d62520..27c99bd3 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -85,16 +85,6 @@ def test_filter_rare_genes(self): self.X_dense, utils.assert_transform_equals, Y=X_filtered, transform=scprep.filter.filter_rare_genes) - def test_filter_variable_genes(self): - X_filtered = scprep.filter.filter_variable_genes(self.X_dense, percentile=70) - assert X_filtered.shape[0] == self.X_dense.shape[0] - assert X_filtered.shape[1] <= 30 - assert X_filtered.shape[1] >= 20 - assert self.X_dense.columns[np.argmax(self.X_dense.values.std(axis=0))] in X_filtered.columns - matrix.test_all_matrix_types( - self.X_dense, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_variable_genes, percentile=70) - def test_library_size_filter(self): X_filtered = scprep.filter.filter_library_size( diff --git a/test/test_measure.py b/test/test_measure.py index 15be58a3..e54e61f1 100644 --- a/test/test_measure.py +++ b/test/test_measure.py @@ -70,7 +70,7 @@ def test_fun(X): def test_variable_genes(self): def test_fun(X): - x = scprep.measure.variable_genes(X) + x = scprep.measure.gene_variability(X) assert x.name == 'variability' assert np.all(x.index == self.X_dense.columns) matrix.test_pandas_matrix_types( diff --git a/test/test_plot.py b/test/test_plot.py index a20c6bdf..c6b5ccef 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -801,13 +801,13 @@ def test_plot_gene_set_expression_single_gene(self): assert os.path.exists("test_gene_expression.png") def test_plot_variable_genes(self): - scprep.plot.plot_variable_genes( + scprep.plot.plot_gene_variability( self.X, filename="test_variable_genes.png") assert os.path.exists("test_variable_genes.png") def test_variable_genes_list_of_lists(self): - scprep.plot.plot_variable_genes( + scprep.plot.plot_gene_variability( scprep.utils.toarray(self.X).tolist()) def test_histogram_single_gene_dataframe(self): diff --git a/test/test_select.py b/test/test_select.py index 8d53f81b..0a691d05 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -407,6 +407,16 @@ def test_subsample_n_too_large(self): "Expected n (101) <= n_samples (100)", scprep.select.subsample, self.X, n=self.X.shape[0] + 1) + def test_select_variable_genes(self): + X_filtered = scprep.select.highly_variable_genes(self.X, percentile=70) + assert X_filtered.shape[0] == self.X.shape[0] + assert X_filtered.shape[1] <= 30 + assert X_filtered.shape[1] >= 20 + assert self.X.columns[np.argmax(self.X.values.std(axis=0))] in X_filtered.columns + matrix.test_all_matrix_types( + self.X, utils.assert_transform_equals, + Y=X_filtered, transform=scprep.select.highly_variable_genes, percentile=70) + def test_string_subset_exact_word(): np.testing.assert_array_equal(scprep.select._get_string_subset_mask( From 3a9efb14d03e53a6487dc6b5fa342a00b6e59c6e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 13:12:03 -0400 Subject: [PATCH 088/125] allow kernel_size > 1 --- scprep/measure.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scprep/measure.py b/scprep/measure.py index 8b79530c..fda2347c 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -86,8 +86,8 @@ def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_m Multiple of the standard deviation of variances at which to interpolate linearly in order to reduce computation time. kernel_size : float or int, optional (default: 0.05) - Width of rolling median window. If a float, the width is given by - kernel_size * data.shape[1] + Width of rolling median window. If a float between 0 and 1, the width is given by + kernel_size * data.shape[1]. Otherwise should be an odd integer return_means : boolean, optional (default: False) If True, return the gene means @@ -99,7 +99,8 @@ def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_m columns = data.columns if isinstance(data, pd.DataFrame) else None data = utils.to_array_or_spmatrix(data) data_std = utils.matrix_std(data, axis=0) ** 2 - kernel_size = 2*(int(kernel_size * len(data_std))//2)+1 + if kernel_size < 1: + kernel_size = 2*(int(kernel_size * len(data_std))//2)+1 order = np.argsort(data_std) data_std_med = np.empty_like(data_std) data_std_med[order] = scipy.signal.medfilt(data_std[order], kernel_size=kernel_size) From 3ba22b9c941a1da6ffb59ba8bc60d10a83a55eee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 13:28:29 -0400 Subject: [PATCH 089/125] patch pandas sparse dataframe loc, monkey patches https://github.com/pandas-dev/pandas/issues/27781 --- scprep/__init__.py | 11 +++++++++++ test/test_patch.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/scprep/__init__.py b/scprep/__init__.py index a915c9bf..a7a8ffdd 100644 --- a/scprep/__init__.py +++ b/scprep/__init__.py @@ -14,3 +14,14 @@ import scprep.stats import scprep.reduce import scprep.run + +import pandas as pd +if int(pd.__version__.split('.')[1]) < 26: + def fill_value(self): + # Used in reindex_indexer + try: + return self.values.dtype.fill_value + except AttributeError: + return self.values.dtype.na_value + from pandas.core.internals.blocks import ExtensionBlock + setattr(ExtensionBlock, 'fill_value', property(fill_value)) diff --git a/test/test_patch.py b/test/test_patch.py index 86f99543..d408895e 100644 --- a/test/test_patch.py +++ b/test/test_patch.py @@ -9,3 +9,7 @@ def test_pandas_series_rmatmul(): df = pd.DataFrame(mat) ser = pd.Series(arr) np.testing.assert_array_equal(mat @ ser, (df @ ser).values) + +def test_pandas_sparse_iloc(): + X = pd.DataFrame([[0,1,1], [0,0,1], [0,0,0]]).astype(pd.SparseDtype(float, fill_value=0.0)) + assert np.all(~np.isnan(X.iloc[[0,1]].to_numpy())) \ No newline at end of file From 526f46582ae5875cf1f41830420ac85ba90e491f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 13:37:34 -0400 Subject: [PATCH 090/125] test 2d list of lists --- test/test_plot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_plot.py b/test/test_plot.py index 85f322df..49f070d9 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -976,6 +976,10 @@ def test_scatter_custom_ticklabels(self): assert np.all(xticklabels == np.array(['a', 'b', 'c'])) def test_scatter_axis_labels(self): + ax = scprep.plot.scatter2d( + self.X_pca.tolist(), label_prefix="test") + assert ax.get_xlabel() == "test1" + assert ax.get_ylabel() == "test2" ax = scprep.plot.scatter3d( self.X_pca.tolist(), label_prefix="test") assert ax.get_xlabel() == "test1" From 6b83dfcbd31f39c271f2711bcdb4152d03ca3d44 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 13:51:08 -0400 Subject: [PATCH 091/125] add splatter and slingshot docs --- doc/source/reference.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/source/reference.rst b/doc/source/reference.rst index dd91a80f..ac1e425d 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -120,3 +120,23 @@ External Tools :inherited-members: :imported-members: :show-inheritance: + +Splatter +~~~~~~~~ + +.. automodule:: scprep.run.splatter + :autosummary: + :members: + :inherited-members: + :imported-members: + :show-inheritance: + +Slingshot +~~~~~~~~~ + +.. automodule:: scprep.run.slingshot + :autosummary: + :members: + :inherited-members: + :imported-members: + :show-inheritance: From 5a63d0f2a4346df6564b415f30dd7973f17c2b27 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 14:57:20 -0400 Subject: [PATCH 092/125] add download_url and download_and_extract_zip, resolves #76 --- scprep/io/download.py | 59 ++++++++++++++++++++++++++++++++++++++++--- test/test_io.py | 17 +++++++++++++ 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/scprep/io/download.py b/scprep/io/download.py index b70e3472..68b5099b 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -1,14 +1,24 @@ import requests +import zipfile +import tempfile +import os +import urllib.request _CHUNK_SIZE = 32768 _GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download" +_FAKE_HEADERS = [('User-Agent', 'Mozilla/5.0')] + def _save_response_content(response, destination): global _CHUNK_SIZE - with open(destination, "wb") as f: + if isinstance(destination, str): + with open(destination, 'wb') as handle: + _save_response_content(response, handle) + else: for chunk in response.iter_content(_CHUNK_SIZE): if chunk: # filter out keep-alive new chunks - f.write(chunk) + destination.write(chunk) + def _google_drive_confirm_token(response): for key, value in response.cookies.items(): @@ -28,8 +38,8 @@ def download_google_drive(id, destination): Google Drive ID string. You can access this by clicking 'Get Shareable Link', which will give a URL of the form https://drive.google.com/file/d/**your_file_id**/view?usp=sharing - destination : string - Filename to which to save the downloaded file + destination : string or file + File to which to save the downloaded data """ global _GOOGLE_DRIVE_URL @@ -42,3 +52,44 @@ def download_google_drive(id, destination): response = session.get(URL, params = params, stream = True) _save_response_content(response, destination) + + +def download_url(url, destination): + """Download a file from a URL + + Parameters + ---------- + url : string + URL of file to be downloaded + destination : string or file + File to which to save the downloaded data + """ + if isinstance(destination, str): + with open(destination, 'wb') as handle: + download_url(url, handle) + else: + opener = urllib.request.build_opener() + opener.addheaders = _FAKE_HEADERS + urllib.request.install_opener(opener) + with urllib.request.urlopen(url) as handle: + destination.write(handle.read()) + + +def download_and_extract_zip(url, destination): + """Download a .zip file from a URL and extract it + + Parameters + ---------- + url : string + URL of file to be downloaded + destination : string + Directory in which to extract the downloaded zip + """ + if not os.path.isdir(destination): + os.mkdir(destination) + zip_handle = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) + download_url(url, zip_handle) + zip_handle.close() + with zipfile.ZipFile(zip_handle.name, 'r') as handle: + handle.extractall(destination) + os.unlink(zip_handle.name) diff --git a/test/test_io.py b/test/test_io.py index 29757b6c..c082ed76 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -8,6 +8,7 @@ import fcsparser import zipfile import urllib +import shutil def test_10X_duplicate_gene_names(): @@ -492,3 +493,19 @@ def test_download_google_drive(): data = f.read() assert data == 'test\n', data os.remove(dest) + +def test_download_url(): + X = data.load_10X() + scprep.io.download.download_url("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz", "url_test.mtx.gz") + Y = scprep.io.load_mtx("url_test.mtx.gz").T + assert (X.to_coo() - Y).nnz == 0 + os.remove("url_test.mtx.gz") + +def test_download_zip(): + X = data.load_10X() + scprep.io.download.download_and_extract_zip("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip", "zip_test") + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") From d7650f3ba0e081f8e41073bf56372f77412ebf85 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 15:11:06 -0400 Subject: [PATCH 093/125] convert np.matrix properly --- scprep/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scprep/utils.py b/scprep/utils.py index 50d473a5..de2172e4 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -196,7 +196,8 @@ def to_array_or_spmatrix(x): x = x.to_coo() elif is_sparse_dataframe(x) or is_sparse_series(x): x = x.sparse.to_coo() - elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)): + elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)) and \ + not isinstance(x, np.matrix): pass elif isinstance(x, list): x_out = [] From 74c69bd303a69f01f4989a415cb0c41e0a25ef96 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 15:19:01 -0400 Subject: [PATCH 094/125] test version kwarg --- test/test_run.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_run.py b/test/test_run.py index 3bc81ca1..1de8e2a7 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -6,6 +6,7 @@ import unittest import sklearn.cluster import rpy2.rinterface_lib.callbacks +import rpy2.rinterface_lib.embedded from sklearn.utils.testing import assert_raise_message, assert_warns_message builtin_warning = rpy2.rinterface_lib.callbacks.consolewrite_warnerror @@ -18,6 +19,15 @@ def test_verbose(): assert np.all(fun() == np.array([[1], [2], [3]])) +def test_install_bioc(): + assert_raise_message( + rpy2.rinterface_lib.embedded.RRuntimeError, + "Error: Bioconductor version '3.7' requires R version '3.5'; see", + scprep.run.install_bioconductor, + version='3.7', site_repository='https://bioconductor.org/packages/3.7/bioc', + verbose=False) + + class TestSplatter(unittest.TestCase): @classmethod From cadbfff8e56cba078beca6038d048c6477b8c31b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 15:42:18 -0400 Subject: [PATCH 095/125] use a definitely out of date version of bioc --- test/test_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_run.py b/test/test_run.py index 1de8e2a7..bf60553e 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -22,9 +22,9 @@ def test_verbose(): def test_install_bioc(): assert_raise_message( rpy2.rinterface_lib.embedded.RRuntimeError, - "Error: Bioconductor version '3.7' requires R version '3.5'; see", + "Error: Bioconductor version '3.1' requires R version '3.2'; see", scprep.run.install_bioconductor, - version='3.7', site_repository='https://bioconductor.org/packages/3.7/bioc', + version='3.1', site_repository='https://bioconductor.org/packages/3.1/bioc', verbose=False) From 7f9023169aa19aad289a5bd81ae22272845801af Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 15:45:01 -0400 Subject: [PATCH 096/125] fix sparse.to_coo reference --- test/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_io.py b/test/test_io.py index 3142bcbb..357d1036 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -596,7 +596,7 @@ def test_download_url(): X = data.load_10X() scprep.io.download.download_url("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz", "url_test.mtx.gz") Y = scprep.io.load_mtx("url_test.mtx.gz").T - assert (X.to_coo() - Y).nnz == 0 + assert (X.sparse.to_coo() - Y).nnz == 0 os.remove("url_test.mtx.gz") def test_download_zip(): From 4d1f393c5c3e72e47628079f5663dc9a102491ad Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 16:15:30 -0400 Subject: [PATCH 097/125] remove temporary workaround --- scprep/select.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scprep/select.py b/scprep/select.py index faafe03c..1d1df322 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -473,12 +473,6 @@ def select_rows(data, *extra_data, idx=None, data = data.loc[np.array(data.index)[idx]] else: raise - # temporary workaround for https://github.com/pandas-dev/pandas/issues/27781 - if utils.is_sparse_dataframe(data): - for col in np.where(data.isna().any())[0]: - colname = data.columns[col] - if utils.is_sparse_series(data[colname]) and data[colname].isna().all(): - data[colname] = data[colname].fillna(data[colname].sparse.fill_value) elif _is_1d(data): if isinstance(data, list): # can't numpy index a list From 2e8310e734d74fac5c79db04de07e9835a66e988 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 16:19:34 -0400 Subject: [PATCH 098/125] upgrade bioc packages --- travis_setup.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/travis_setup.R b/travis_setup.R index 2c8f4061..a71289e6 100644 --- a/travis_setup.R +++ b/travis_setup.R @@ -1,3 +1,5 @@ chooseCRANmirror(ind=1) if (!require("remotes")) install.packages("remotes", quietly=TRUE) remotes::update_packages(upgrade="always") +if (!require("BiocManager")) install.packages("BiocManager", quietly=TRUE) +BiocManager::install(update=TRUE, ask=FALSE) From 2fd038d52b198f3c4b634921a2b7c1171d6b2c03 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 22:05:19 -0400 Subject: [PATCH 099/125] update docs --- doc/source/requirements.txt | 2 +- scprep/run/slingshot.py | 9 +++++---- scprep/run/splatter.py | 9 +++++---- scprep/select.py | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt index 58906e83..f3770f7f 100644 --- a/doc/source/requirements.txt +++ b/doc/source/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.10.0 scipy>=0.18.0 scikit-learn>=0.19.1 -pandas>=0.19.0,<0.24 +pandas>=0.25 decorator matplotlib sphinx<=1.8.5 diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index a317bb88..b771986f 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -2,7 +2,7 @@ import pandas as pd import warnings -from .r_function import RFunction, install_bioconductor +from . import r_function from .. import utils @@ -24,11 +24,12 @@ def install(site_repository = None, update = False, version = None, verbose = Tr verbose : boolean, optional (default: True) Install script verbosity. """ - install_bioconductor('slingshot', site_repository=site_repository, - update=update, version=version, verbose=verbose) + r_function.install_bioconductor( + 'slingshot', site_repository=site_repository, + update=update, version=version, verbose=verbose) -_Slingshot = RFunction( +_Slingshot = r_function.RFunction( setup=""" library(slingshot) """, diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index 37a50cd0..4bbc18cb 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -1,6 +1,6 @@ import numpy as np -from .r_function import RFunction, install_bioconductor +from . import r_function def install(site_repository = None, update = False, version = None, verbose = True): @@ -21,11 +21,12 @@ def install(site_repository = None, update = False, version = None, verbose = Tr verbose : boolean, optional (default: True) Install script verbosity. """ - install_bioconductor('splatter', site_repository=site_repository, - update=update, version=version, verbose=verbose) + r_function.install_bioconductor( + 'splatter', site_repository=site_repository, + update=update, version=version, verbose=verbose) -_SplatSimulate = RFunction( +_SplatSimulate = r_function.RFunction( setup=""" library(splatter) """, diff --git a/scprep/select.py b/scprep/select.py index edf5b1de..2e546052 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -501,7 +501,7 @@ def subsample(*data, n=10000, seed=None): def highly_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_size=0.05, cutoff=None, percentile=80): - """Filter all genes with low variability + """Select genes with high variability Variability is computed as the deviation from a loess fit to the rolling median of the mean-variance curve From c338d9a5c8ff7912a072b414585d5e330dd2c472 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 22:09:28 -0400 Subject: [PATCH 100/125] don't show imported members in submodules --- doc/source/reference.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/reference.rst b/doc/source/reference.rst index ac1e425d..8122ae6d 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -128,7 +128,6 @@ Splatter :autosummary: :members: :inherited-members: - :imported-members: :show-inheritance: Slingshot @@ -138,5 +137,4 @@ Slingshot :autosummary: :members: :inherited-members: - :imported-members: :show-inheritance: From a4578acca6daa2ceae75be747feb43af7f44d89b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 22:16:41 -0400 Subject: [PATCH 101/125] nicer default, fixes #74 --- scprep/normalize.py | 4 ++-- scprep/plot/jitter.py | 4 ++-- test/test_normalize.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scprep/normalize.py b/scprep/normalize.py index bc333f1a..ef83852a 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -34,7 +34,7 @@ def _get_scaled_libsize(data, rescale='median', return_library_size=False): return rescale, libsize -def library_size_normalize(data, rescale='median', +def library_size_normalize(data, rescale=10000, return_library_size=False): """Performs L1 normalization on input data Performs L1 normalization on input data such that the sum of expression @@ -46,7 +46,7 @@ def library_size_normalize(data, rescale='median', ---------- data : array-like, shape=[n_samples, n_features] Input data - rescale : {'mean', 'median'}, float or `None`, optional (default: 'median') + rescale : {'mean', 'median'}, float or `None`, optional (default: 10000) Rescaling strategy. If 'mean' or 'median', normalized cells are scaled back up to the mean or median expression value. If a float, normalized cells are scaled up to the given value. If `None`, no diff --git a/scprep/plot/jitter.py b/scprep/plot/jitter.py index 0807f192..3e58ccd7 100644 --- a/scprep/plot/jitter.py +++ b/scprep/plot/jitter.py @@ -30,7 +30,7 @@ def x_coords(self): def jitter(labels, values, sigma=0.1, c=None, cmap=None, cmap_scale='linear', s=None, - plot_means=True, means_s=100, means_c='xkcd:light lavender', + plot_means=True, means_s=100, means_c='lightgrey', discrete=None, ax=None, legend=None, colorbar=None, @@ -84,7 +84,7 @@ def jitter(labels, values, sigma=0.1, If True, plot the mean value for each label. means_s : float, optional (default: 100) Point size for mean values. - means_c : string, list-like or matplotlib color, optional (default: 'xkcd:light lavender') + means_c : string, list-like or matplotlib color, optional (default: 'lightgrey') Point color(s) for mean values. discrete : bool or None, optional (default: None) If True, the legend is categorical. If False, the legend is a colorbar. diff --git a/test/test_normalize.py b/test/test_normalize.py index 60c4c341..29e24d42 100644 --- a/test/test_normalize.py +++ b/test/test_normalize.py @@ -23,12 +23,13 @@ def test_libsize_norm_rescale_median(self): Y = self.X_norm * self.median utils.assert_all_close(Y.sum(1), np.median(np.sum(self.X, 1))) Y2, libsize2 = scprep.normalize.library_size_normalize( - self.X, return_library_size=True) + self.X, rescale='median', return_library_size=True) np.testing.assert_allclose(Y, Y2) np.testing.assert_allclose(self.libsize, libsize2) matrix.test_all_matrix_types( self.X, utils.assert_transform_equivalent, Y=Y, transform=scprep.normalize.library_size_normalize, + rescale='median', check=utils.assert_all_close) def test_libsize_norm_return_libsize(self): From ec953bb4dd900594b2054fb65cb8ce74ae3cb2d7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 4 Oct 2019 22:25:39 -0400 Subject: [PATCH 102/125] use dictionary as output for slingshot --- scprep/run/slingshot.py | 19 +++++++++++-------- test/test_run.py | 18 +++++++++++------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py index b771986f..f24a16e1 100644 --- a/scprep/run/slingshot.py +++ b/scprep/run/slingshot.py @@ -149,6 +149,8 @@ def Slingshot( Returns ------- + slingshot : dict + Contains the following keys: pseudotime : array-like, shape=[n_samples, n_curves] Pseudotime projection of each cell onto each principal curve. Value is `np.nan` if the cell does not lie on the curve @@ -164,15 +166,15 @@ def Slingshot( >>> data, clusters = phate.tree.gen_dla(n_branch=4, n_dim=200, branch_length=200) >>> phate_op = phate.PHATE() >>> data_phate = phate_op.fit_transform(data) - >>> pseudotime, branch, curves = scprep.run.Slingshot(data_phate, clusters) - >>> ax = scprep.plot.scatter2d(data_phate, c=pseudotime[:,0], cmap='magma', legend_title='Branch 1') - >>> scprep.plot.scatter2d(data_phate, c=pseudotime[:,1], cmap='viridis', ax=ax, + >>> slingshot = scprep.run.Slingshot(data_phate, clusters) + >>> ax = scprep.plot.scatter2d(data_phate, c=slingshot['pseudotime'][:,0], cmap='magma', legend_title='Branch 1') + >>> scprep.plot.scatter2d(data_phate, c=slingshot['pseudotime'][:,1], cmap='viridis', ax=ax, ... ticks=False, label_prefix='PHATE', legend_title='Branch 2') - >>> for curve in curves: + >>> for curve in slingshot['curves']: ... ax.plot(curve[:,0], curve[:,1], c='black') - >>> ax = scprep.plot.scatter2d(data_phate, c=branch, legend_title='Branch', + >>> ax = scprep.plot.scatter2d(data_phate, c=slingshot['branch'], legend_title='Branch', ... ticks=False, label_prefix='PHATE') - >>> for curve in curves: + >>> for curve in slingshot['curves']: ... ax.plot(curve[:,0], curve[:,1], c='black') """ if seed is None: @@ -226,8 +228,9 @@ def Slingshot( branch[branch_old == branch_ids[j]] = -1 else: branch[branch_old == branch_ids[j]] = i + slingshot['branch'] = branch if index is not None: slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index) - branch = pd.Series(branch, name='branch', index=index) - return slingshot['pseudotime'], branch, slingshot['curves'] + slingshot['branch'] = pd.Series(slingshot['branch'], name='branch', index=index) + return slingshot diff --git a/test/test_run.py b/test/test_run.py index bf60553e..c191d94d 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -205,7 +205,8 @@ def setUpClass(self): self.clusters = sklearn.cluster.KMeans(6).fit_predict(self.X_pca) def test_slingshot(self): - pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) + slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] @@ -221,8 +222,9 @@ def test_slingshot(self): assert np.all(np.any(~np.isnan(pseudotime), axis=1)) def test_slingshot_pandas(self): - pseudotime, branch, curves = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), - self.clusters, verbose=False) + slingshot = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), + self.clusters, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] assert np.all(pseudotime.index == self.X.index) assert np.all(branch.index == self.X.index) assert branch.name == 'branch' @@ -248,8 +250,9 @@ def test_slingshot_distance(self): self.X_pca, self.clusters, distance=lambda X, Y : np.sum(X-Y)) def test_slingshot_optional_args(self): - pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, - start_cluster=4, omega=0.1, verbose=False) + slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + start_cluster=4, omega=0.1, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] @@ -265,8 +268,9 @@ def test_slingshot_optional_args(self): current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 - pseudotime, branch, curves = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, - end_cluster=0, verbose=False) + slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + end_cluster=0, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] From 6831393b13ae64ef0c93cd51e3f68da74e271b63 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 5 Oct 2019 15:43:09 -0400 Subject: [PATCH 103/125] scprep.reduce.pca should return a dataframe when passed, fixes #78 --- scprep/reduce.py | 15 +++++++++------ test/test_reduce.py | 22 ++++++++++++++++++++-- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/scprep/reduce.py b/scprep/reduce.py index 5b96a951..ca3f4b1d 100644 --- a/scprep/reduce.py +++ b/scprep/reduce.py @@ -277,12 +277,11 @@ def pca(data, n_components=100, eps=0.3, n_components, min(data.shape))) # handle dataframes - if isinstance(data, pd.SparseDataFrame): - data = data.to_coo() - elif utils.is_sparse_dataframe(data): - data = data.sparse.to_coo() - elif isinstance(data, pd.DataFrame): - data = data.to_numpy() + if isinstance(data, pd.DataFrame): + index = data.index + else: + index = None + data = utils.to_array_or_spmatrix(data) # handle sparsity if sparse.issparse(data): @@ -301,6 +300,10 @@ def pca(data, n_components=100, eps=0.3, pca_op = decomposition.PCA(n_components, random_state=seed) data = pca_op.fit_transform(data) + if index is not None: + data = pd.DataFrame(data, index=index, + columns=["PC{}".format(i+1) for i in range(n_components)]) + if return_singular_values: data = (data, pca_op.singular_values_) return data diff --git a/test/test_reduce.py b/test/test_reduce.py index c5620904..033eaa45 100644 --- a/test/test_reduce.py +++ b/test/test_reduce.py @@ -1,6 +1,8 @@ from tools import utils, matrix, data import scprep from scipy import sparse +import numpy as np +import pandas as pd from sklearn import decomposition from sklearn.utils.testing import assert_raise_message, assert_warns_message from functools import partial @@ -33,11 +35,25 @@ def test_sparse_svd(self): check=partial(utils.assert_all_close, rtol=1e-3, atol=1e-5), n_components=50, eps=0.3, seed=42, method='svd') + def test_pandas(self): + X = pd.DataFrame(self.X, index=np.arange(self.X.shape[0]).astype(str), + columns=np.arange(self.X.shape[1]).astype(float)) + def test_fun(X_pd): + Y = scprep.reduce.pca(X_pd, n_components=100, seed=42) + assert isinstance(Y, pd.DataFrame) + assert np.all(Y.index == X.index) + assert np.all(Y.columns == np.array(['PC{}'.format(i+1) + for i in range(Y.shape[1])])) + matrix.test_pandas_matrix_types( + X, test_fun) + def test_sparse_orth_rproj(self): + def test_fn(*args, **kwargs): + return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) matrix.test_sparse_matrix_types( self.X, utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=scprep.reduce.pca, + Y=self.Y_full, transform=test_fn, n_components=50, eps=0.3, seed=42, method='orth_rproj') def test_singular_values_dense(self): @@ -53,10 +69,12 @@ def test_singular_values_sparse(self): eps=0.3, seed=42, return_singular_values=True)[1], atol=1e-14) def test_sparse_rproj(self): + def test_fn(*args, **kwargs): + return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) matrix.test_sparse_matrix_types( self.X, utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=scprep.reduce.pca, + Y=self.Y_full, transform=test_fn, n_components=50, eps=0.3, seed=42, method='rproj') def test_eps_too_low(self): From 8384d0bc4dac271134d0573f5c8afb5364f00b23 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Sun, 6 Oct 2019 11:50:20 -0400 Subject: [PATCH 104/125] Closes #80 --- scprep/io/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/io/download.py b/scprep/io/download.py index 68b5099b..9fb380fe 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -49,7 +49,7 @@ def download_google_drive(id, destination): if token: params = { 'id' : id, 'confirm' : token } - response = session.get(URL, params = params, stream = True) + response = session.get(_GOOGLE_DRIVE_URL, params = params, stream = True) _save_response_content(response, destination) From e82a29e71fb25540e17595335bccdfda2bd38c68 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 09:05:15 -0400 Subject: [PATCH 105/125] don't need to coerce data to float anymore --- scprep/io/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scprep/io/utils.py b/scprep/io/utils.py index 0574b742..53eb9f70 100644 --- a/scprep/io/utils.py +++ b/scprep/io/utils.py @@ -129,6 +129,4 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): if sp.issparse(data): data = data.toarray() data = pd.DataFrame(data, index=cell_names, columns=gene_names) - # convert data to float - data = data.astype(float) return data From 4c6633618e79e25edd80e629f605e6deed482a92 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 10:32:31 -0400 Subject: [PATCH 106/125] allow scprep.reduce.pca to coerce to dense --- scprep/reduce.py | 12 ++++++++---- test/test_reduce.py | 5 +++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/scprep/reduce.py b/scprep/reduce.py index ca3f4b1d..089eaa33 100644 --- a/scprep/reduce.py +++ b/scprep/reduce.py @@ -238,11 +238,12 @@ def pca(data, n_components=100, eps=0.3, Parameter to control the quality of the embedding of sparse input. Smaller values lead to more accurate embeddings but higher computational and memory costs - method : {'svd', 'orth_rproj', 'rproj'}, optional (default: 'svd') + method : {'svd', 'orth_rproj', 'rproj', 'dense'}, optional (default: 'svd') Dimensionality reduction method applied prior to mean centering of sparse input. The method choice affects accuracy - (`svd` > `orth_rproj` > `rproj`) comes with increased - computational cost (but not memory.) + (`svd` > `orth_rproj` > `rproj`) and comes with increased + computational cost (but not memory.) On the other hand, + `method='dense'` adds a memory cost but is faster. seed : int, RandomState or None, optional (default: None) Random state. return_singular_values : bool, optional (default: False) @@ -281,7 +282,10 @@ def pca(data, n_components=100, eps=0.3, index = data.index else: index = None - data = utils.to_array_or_spmatrix(data) + if method == 'dense': + data = utils.toarray(data) + else: + data = utils.to_array_or_spmatrix(data) # handle sparsity if sparse.issparse(data): diff --git a/test/test_reduce.py b/test/test_reduce.py index 033eaa45..94179d16 100644 --- a/test/test_reduce.py +++ b/test/test_reduce.py @@ -27,6 +27,11 @@ def test_dense(self): self.X, utils.assert_transform_equals, Y=self.Y_random, transform=scprep.reduce.pca, n_components=100, seed=42) + matrix.test_all_matrix_types( + self.X, utils.assert_transform_equals, + Y=self.Y_random, transform=scprep.reduce.pca, + n_components=100, seed=42, method='dense', + check=partial(utils.assert_all_close, atol=1e-10)) def test_sparse_svd(self): matrix.test_sparse_matrix_types( From ffef8dedc341ad23df5ae0a5beb1fe48e870de64 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 11:55:20 -0400 Subject: [PATCH 107/125] add scatter mask, fix pca testing error. closes #59 --- scprep/plot/scatter.py | 23 +++++++++++++++++------ test/test_plot.py | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 6d33704a..182fc1c0 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -26,7 +26,8 @@ def _squeeze_array(x): class _ScatterParams(object): - def __init__(self, x, y, z=None, c=None, discrete=None, + def __init__(self, x, y, z=None, c=None, mask=None, + discrete=None, cmap=None, cmap_scale=None, vmin=None, vmax=None, s=None, legend=None, colorbar=None, xlabel=None, ylabel=None, zlabel=None, @@ -35,6 +36,7 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self._y = y self._z = z if z is not None else None self._c = c + self._mask = mask self._discrete = discrete self._cmap = cmap self._cmap_scale = cmap_scale @@ -52,6 +54,7 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self.shuffle = shuffle self.check_size() self.check_c() + self.check_mask() self.check_s() self.check_discrete() self.check_legend() @@ -84,10 +87,11 @@ def plot_idx(self): try: return self._plot_idx except AttributeError: + self._plot_idx = np.arange(self.size) + if self._mask is not None: + self._plot_idx = self._plot_idx[self._mask] if self.shuffle: - self._plot_idx = np.random.permutation(self.size) - else: - self._plot_idx = np.arange(self.size) + self._plot_idx = np.random.permutation(self._plot_idx) return self._plot_idx @property @@ -381,14 +385,21 @@ def check_size(self): def check_c(self): if not self.constant_c(): - self._c = utils.toarray(self._c).squeeze() + self._c = _squeeze_array(self._c) if not len(self._c) == self.size: raise ValueError("Expected c of length {} or 1. Got {}".format( self.size, len(self._c))) + def check_mask(self): + if self._mask is not None: + self._mask = _squeeze_array(self._mask) + if not len(self._mask) == self.size: + raise ValueError("Expected mask of length {}. Got {}".format( + self.size, len(self._mask))) + def check_s(self): if self._s is not None and not isinstance(self._s, numbers.Number): - self._s = utils.toarray(self._s).squeeze() + self._s = _squeeze_array(self._s) if not len(self._s) == self.size: raise ValueError("Expected s of length {} or 1. Got {}".format( self.size, len(self._s))) diff --git a/test/test_plot.py b/test/test_plot.py index 3ea4a886..b98a1daf 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -278,6 +278,22 @@ def test_plot_idx_no_shuffle(self): np.testing.assert_equal(params.c, self.c) np.testing.assert_equal(params.s, np.abs(self.x)) + def test_plot_idx_mask(self): + params = _ScatterParams(x=self.x, y=self.y, + z=self.z, c=self.c, + mask=self.x > 0, shuffle=False) + np.testing.assert_equal(params.plot_idx, np.arange(params.size)[self.x > 0]) + np.testing.assert_equal(params.x, self.x[self.x > 0]) + np.testing.assert_equal(params.y, self.y[self.x > 0]) + np.testing.assert_equal(params.z, self.z[self.x > 0]) + np.testing.assert_equal(params.c, self.c[self.x > 0]) + + def test_plot_idx_mask_shuffle(self): + params = _ScatterParams(x=self.x, y=self.y, + mask=self.x > 0, shuffle=True) + np.testing.assert_equal(np.sort(params.plot_idx), np.arange(params.size)[self.x > 0]) + assert np.all(params.x > 0) + def test_data_int(self): params = _ScatterParams(x=1, y=2) np.testing.assert_equal(params._data, [np.array([1]), np.array([2])]) @@ -732,7 +748,8 @@ class Test10X(unittest.TestCase): def setUpClass(self): self.X = data.load_10X(sparse=False) self.X_filt = scprep.filter.filter_empty_cells(self.X) - self.X_pca, self.S = scprep.reduce.pca(self.X, n_components=10, + self.X_pca, self.S = scprep.reduce.pca(scprep.utils.toarray(self.X), + n_components=10, return_singular_values=True) @classmethod From 8f9e247a1d097ea00bb553a2ffc4f361bd3be823 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 11:56:28 -0400 Subject: [PATCH 108/125] test invalid mask --- test/test_plot.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_plot.py b/test/test_plot.py index b98a1daf..4f56d919 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -1100,6 +1100,13 @@ def test_scatter_invalid_s(self): scprep.plot.scatter2d, self.X_pca, s=self.X_pca[0, :]) + def test_scatter_invalid_mask(self): + assert_raise_message( + ValueError, "Expected mask of length {}. Got {}".format( + self.X_pca.shape[0], self.X_pca.shape[1]), + scprep.plot.scatter2d, self.X_pca, + mask=self.X_pca[0, :] > 0) + def test_scatter_invalid_discrete(self): assert_raise_message( ValueError, "Cannot treat non-numeric data as continuous", From 3c292af82226ae61d6a4e45f20cc3eefbea92841 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 12:12:01 -0400 Subject: [PATCH 109/125] fix test_select int bug --- test/test_select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_select.py b/test/test_select.py index aee86491..ac5389b7 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -416,7 +416,7 @@ def test_fun(X): for col in Y.columns: assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) matrix.test_matrix_types( - self.X, test_fun, matrix._pandas_sparse_matrix_types) + self.X.astype(float), test_fun, matrix._pandas_sparse_matrix_types) def test_select_variable_genes(self): X_filtered = scprep.select.highly_variable_genes(self.X, percentile=70) From 462c26c4751453e2fe8d9d7c29fda8e07f7d0418 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 12:56:10 -0400 Subject: [PATCH 110/125] add mask to scatter api --- scprep/plot/scatter.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 182fc1c0..26e9a3ea 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -489,8 +489,9 @@ def zlabel(self): @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter(x, y, z=None, - c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, +def scatter(x, y, z=None, mask=None, + c=None, cmap=None, cmap_scale='linear', s=None, + discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, @@ -531,6 +532,8 @@ def scatter(x, y, z=None, data for y axis z : list-like, optional (default: None) data for z axis + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. Can be a single color value (RGB, RGBA, or named matplotlib colors), an array of these of length n_samples, or a list of @@ -633,7 +636,7 @@ def scatter(x, y, z=None, """ with temp_fontsize(fontsize): params = _ScatterParams( - x, y, z, c=c, discrete=discrete, + x, y, z, c=c, mask=mask, discrete=discrete, cmap=cmap, cmap_scale=cmap_scale, vmin=vmin, vmax=vmax, s=s, legend=legend, colorbar=colorbar, @@ -687,7 +690,7 @@ def scatter(x, y, z=None, @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter2d(data, +def scatter2d(data, mask=None, c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, figsize=None, @@ -717,6 +720,8 @@ def scatter2d(data, ---------- data : array-like, shape=[n_samples, n_features] Input data. Only the first two components will be used. + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. Can be a single color value (RGB, RGBA, or named matplotlib colors), an array of these of length n_samples, or a list of @@ -818,6 +823,7 @@ def scatter2d(data, data = utils.toarray(data) return scatter(x=select.select_cols(data, idx=0), y=select.select_cols(data, idx=1), + mask=mask, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, ax=ax, legend=legend, colorbar=colorbar, shuffle=shuffle, figsize=figsize, @@ -841,7 +847,7 @@ def scatter2d(data, @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter3d(data, +def scatter3d(data, mask=None, c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, @@ -876,6 +882,8 @@ def scatter3d(data, ---------- data : array-like, shape=[n_samples, n_features] Input data. Only the first two components will be used. + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. Can be a single color value (RGB, RGBA, or named matplotlib colors), an array of these of length n_samples, or a list of @@ -985,7 +993,7 @@ def scatter3d(data, z = select.select_cols(data, idx=2) except IndexError: raise ValueError("Expected data.shape[1] >= 3. Got {}".format(data.shape[1])) - return scatter(x=x, y=y, z=z, + return scatter(x=x, y=y, z=z, mask=mask, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, ax=ax, legend=legend, colorbar=colorbar, shuffle=shuffle, figsize=figsize, @@ -1050,7 +1058,7 @@ def rotate_scatter3d(data, savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi to be the value of the figure. Only used if filename is not None. **kwargs : keyword arguments - See :~func:`phate.plot.scatter3d`. + See :~func:`scprep.plot.scatter3d`. Returns ------- From 6ddfd5a002b85a55571d2dd22c49de45aace3882 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 16:35:41 -0400 Subject: [PATCH 111/125] add save_mtx for #66 --- scprep/io/__init__.py | 2 +- scprep/io/mtx.py | 52 +++++++++++++++++++++++++++++++++++++++++++ test/test_io.py | 20 +++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/scprep/io/__init__.py b/scprep/io/__init__.py index f1307bd3..d34bd92c 100644 --- a/scprep/io/__init__.py +++ b/scprep/io/__init__.py @@ -4,6 +4,6 @@ from .csv import load_csv, load_tsv from .tenx import load_10X, load_10X_zip, load_10X_HDF5 from .fcs import load_fcs -from .mtx import load_mtx +from .mtx import load_mtx, save_mtx from . import download, hdf5 diff --git a/scprep/io/mtx.py b/scprep/io/mtx.py index 598c3b42..6f1bd2a8 100644 --- a/scprep/io/mtx.py +++ b/scprep/io/mtx.py @@ -2,7 +2,12 @@ # (C) 2018 Krishnaswamy Lab GPLv2 import scipy.io as sio +from scipy import sparse +import pandas as pd +import os + from .utils import _matrix_to_data_frame +from .. import utils def load_mtx(mtx_file, cell_axis='row', @@ -43,3 +48,50 @@ def load_mtx(mtx_file, cell_axis='row', data, gene_names=gene_names, cell_names=cell_names, sparse=sparse) return data + +def save_mtx(data, destination, cell_names=None, gene_names=None): + """Save a mtx file + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data, saved to destination/matrix.mtx + destination : str + Directory in which to save the data + cell_names : list-like, shape=[n_samples], optional (default: None) + Cell names associated with rows, saved to destination/cell_names.tsv. + If `data` is a pandas DataFrame and `cell_names` is None, + these are autopopulated from `data.index`. + gene_names : list-like, shape=[n_features], optional (default: None) + Cell names associated with rows, saved to destination/gene_names.tsv. + If `data` is a pandas DataFrame and `gene_names` is None, + these are autopopulated from `data.columns`. + + Examples + -------- + >>> import scprep + >>> scprep.io.save_mtx(data, destination="my_data") + >>> reload = scprep.io.load_mtx("my_data/matrix.mtx", + ... cell_names="my_data/cell_names.tsv", + ... gene_names="my_data/gene_names.tsv") + """ + if isinstance(data, pd.DataFrame): + if cell_names is None: + cell_names = data.index + if gene_names is None: + gene_names = data.columns + data = utils.to_array_or_spmatrix(data) + data = sparse.coo_matrix(data) + # handle ~/ and relative paths + destination = os.path.expanduser(destination) + if not os.path.isdir(destination): + os.mkdir(destination) + if cell_names is not None: + with open(os.path.join(destination, "cell_names.tsv"), 'w') as handle: + for name in cell_names: + handle.write("{}\n".format(name)) + if gene_names is not None: + with open(os.path.join(destination, "gene_names.tsv"), 'w') as handle: + for name in gene_names: + handle.write("{}\n".format(name)) + sio.mmwrite(os.path.join(destination, "matrix.mtx"), data) diff --git a/test/test_io.py b/test/test_io.py index 357d1036..09088db6 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -469,6 +469,26 @@ def test_mtx(): assert X_mtx.index[0] == 0 +def test_save_mtx(): + filename = os.path.join(data.data_dir, "test_10X", "matrix.mtx.gz") + X = scprep.io.load_mtx( + filename, + gene_names=os.path.join( + data.data_dir, "gene_symbols.csv"), + cell_names=os.path.join( + data.data_dir, "barcodes.tsv"), + cell_axis="column") + scprep.io.save_mtx(X, "test_mtx") + Y = scprep.io.load_mtx( + "test_mtx/matrix.mtx", + gene_names="test_mtx/gene_names.tsv", + cell_names="test_mtx/cell_names.tsv") + np.testing.assert_array_equal(X, Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("test_mtx") + + def test_fcs(): path = fcsparser.test_sample_path meta, data = fcsparser.parse(path) From c8fa3826e852de6551a19c15ce525faf7aa4aaf1 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 19:39:31 -0400 Subject: [PATCH 112/125] fix handling of masked arrays for c_unique --- scprep/plot/scatter.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 26e9a3ea..9bfa5738 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -145,13 +145,20 @@ def array_c(self): self._c) return self._array_c + @property + def _c_masked(self): + if self.constant_c() or self._mask is None: + return self._c + else: + return self._c[self._mask] + @property def c_unique(self): """Get unique values in c to avoid recomputing every time""" try: return self._c_unique except AttributeError: - self._c_unique = np.unique(self._c) + self._c_unique = np.unique(self._c_masked) return self._c_unique @property @@ -180,7 +187,7 @@ def discrete(self): else: if isinstance(self._cmap, dict) or not \ np.all([isinstance(x, numbers.Number) - for x in self._c]): + for x in self._c_masked]): # cmap dictionary or non-numeric values force discrete return True else: @@ -206,8 +213,9 @@ def c_discrete(self): for i, label in enumerate(self._labels): self._c_discrete[self._c == label] = i else: - self._c_discrete, self._labels = pd.factorize( - self._c, sort=True) + self._c_discrete = np.zeros_like(self._c, dtype=int) + self._c_discrete[self._mask], self._labels = pd.factorize( + self._c_masked, sort=True) return self._c_discrete @property From cf201ebb86d531721c6cf4b08266c31b0809c628 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 19:39:39 -0400 Subject: [PATCH 113/125] split out unzip into its own function --- scprep/io/download.py | 28 +++++++++++++++++++++++++--- test/test_io.py | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/scprep/io/download.py b/scprep/io/download.py index 9fb380fe..7f753832 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -75,6 +75,30 @@ def download_url(url, destination): destination.write(handle.read()) +def unzip(filename, destination=None, delete=True): + """Extract a .zip file and optionally remove the archived version + + Parameters + ---------- + filename : string + Path to the zip file + destination : string, optional (default: None) + Path to the folder in which to extract the zip. + If None, extracts to the same directory the archive is in. + delete : boolean, optional (default: True) + If True, deletes the zip file after extraction + """ + filename = os.path.expanduser(filename) + if destination is None: + destination = os.path.dirname(filename) + elif not os.path.isdir(destination): + os.mkdir(destination) + with zipfile.ZipFile(filename, 'r') as handle: + handle.extractall(destination) + if delete: + os.unlink(filename) + + def download_and_extract_zip(url, destination): """Download a .zip file from a URL and extract it @@ -90,6 +114,4 @@ def download_and_extract_zip(url, destination): zip_handle = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) download_url(url, zip_handle) zip_handle.close() - with zipfile.ZipFile(zip_handle.name, 'r') as handle: - handle.extractall(destination) - os.unlink(zip_handle.name) + unzip(zip_handle.name, destination) diff --git a/test/test_io.py b/test/test_io.py index 09088db6..cc9c4752 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -627,3 +627,30 @@ def test_download_zip(): assert np.all(X.index == Y.index) assert np.all(X.columns == Y.columns) shutil.rmtree("zip_test") + +def test_unzip_no_destination(): + X = data.load_10X() + filename = os.path.join(data.data_dir, "test_10X.zip") + tmp_filename = os.path.join("zip_test", "zip_extract_test.zip") + os.mkdir("zip_test") + shutil.copyfile(filename, tmp_filename) + scprep.io.download.unzip(tmp_filename, delete=False) + assert os.path.isfile(tmp_filename) + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") + +def test_unzip_no_destination(): + X = data.load_10X() + filename = os.path.join(data.data_dir, "test_10X.zip") + tmp_filename = "zip_extract_test.zip" + shutil.copyfile(filename, tmp_filename) + scprep.io.download.unzip(tmp_filename, destination="zip_test") + assert not os.path.isfile(tmp_filename) + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") \ No newline at end of file From 43ea24d755b134cdc71535684d330d9e512d3732 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 19:40:11 -0400 Subject: [PATCH 114/125] clean up gdrive url form --- scprep/io/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scprep/io/download.py b/scprep/io/download.py index 7f753832..9566d349 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -37,7 +37,7 @@ def download_google_drive(id, destination): id : string Google Drive ID string. You can access this by clicking 'Get Shareable Link', which will give a URL of the form - https://drive.google.com/file/d/**your_file_id**/view?usp=sharing + destination : string or file File to which to save the downloaded data """ From d88db6cdc205ec605e1102f07d7c4be9838bba8e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 21:41:02 -0400 Subject: [PATCH 115/125] fix select.subsample to avoid ambiguity on rangeindex --- scprep/select.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scprep/select.py b/scprep/select.py index 8a96d6da..2598cf05 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -74,7 +74,7 @@ def _check_rows_compatible(*data): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same index. Fix with " - "`scprep.select.select_rows(*extra_data, data.index)`") + "`scprep.select.select_rows(*extra_data, idx=data.index)`") def _convert_dataframe_1d(idx): @@ -523,7 +523,7 @@ def subsample(*data, n=10000, seed=None): if N < n: raise ValueError("Expected n ({}) <= n_samples ({})".format(n, N)) np.random.seed(seed) - select_idx = np.random.choice(N, n, replace=False) + select_idx = np.isin(np.arange(N), np.random.choice(N, n, replace=False)) data = [select_rows(d, idx=select_idx) for d in data] return tuple(data) if len(data) > 1 else data[0] From 6da8d980aaba190843a7952a38ac6fa8d19e6c02 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 22:42:55 -0400 Subject: [PATCH 116/125] bump version --- scprep/version.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scprep/version.py b/scprep/version.py index 1b6f8c4b..1b0953c5 100644 --- a/scprep/version.py +++ b/scprep/version.py @@ -1,5 +1,4 @@ # author: Scott Gigante # (C) 2018 Krishnaswamy Lab GPLv2 -__version__ = "1.0.0-alpha" - +__version__ = "1.0.0" From d653a9eba43843ae1373315fdbb72065ee52cf40 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 22:51:10 -0400 Subject: [PATCH 117/125] harmonize requirements.txt with setup.py --- doc/source/requirements.txt | 8 ++++---- requirements.txt | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt index f3770f7f..ae852b76 100644 --- a/doc/source/requirements.txt +++ b/doc/source/requirements.txt @@ -1,9 +1,9 @@ -numpy>=1.10.0 -scipy>=0.18.0 +numpy>=1.12.0 +scipy>=0.18.1 scikit-learn>=0.19.1 pandas>=0.25 -decorator -matplotlib +decorator>=4.3.0 +matplotlib>=3.0 sphinx<=1.8.5 sphinxcontrib-napoleon autodocsumm diff --git a/requirements.txt b/requirements.txt index c8461542..31d927e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.10.0 -scipy>=0.18.0 +numpy>=1.12.0 +scipy>=0.18.1 scikit-learn>=0.19.1 pandas>=0.25 decorator>=4.3.0 From 23b32dd96d71a178da627514a6f8724b638a1c04 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 22:59:09 -0400 Subject: [PATCH 118/125] add contributing.md --- CONTRIBUTING.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..416eb053 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ + +Contributing to scprep +============================ + +There are many ways to contribute to `scprep`, with the most common ones +being contribution of code or documentation to the project. Improving the +documentation is no less important than improving the library itself. If you +find a typo in the documentation, or have made improvements, do not hesitate to +submit a GitHub pull request. + +But there are many other ways to help. In particular answering queries on the +[issue tracker](https://github.com/KrishnaswamyLab/scprep/issues), +investigating bugs, and [reviewing other developers' pull +requests](https://github.com/KrishnaswamyLab/scprep/pulls) +are very valuable contributions that decrease the burden on the project +maintainers. + +Another way to contribute is to report issues you're facing, and give a "thumbs +up" on issues that others reported and that are relevant to you. It also helps +us if you spread the word: reference the project from your blog and articles, +link to it from your website, or simply star it in GitHub to say "I use it". + +Code of Conduct +--------------- + +We abide by the principles of openness, respect, and consideration of others +of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. + +This `CONTRIBUTING.md` was adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). \ No newline at end of file From 06fa90375e2b4e472d223290f4025abb982185f1 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 23:03:53 -0400 Subject: [PATCH 119/125] fix test_select tests and warnings --- scprep/select.py | 2 +- test/test_select.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scprep/select.py b/scprep/select.py index 2598cf05..6d22c699 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -58,7 +58,7 @@ def _check_columns_compatible(*data): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same column names. Fix with " - "`scprep.select.select_cols(*extra_data, data.columns)`") + "`scprep.select.select_cols(*extra_data, idx=data.columns)`") def _check_rows_compatible(*data): diff --git a/test/test_select.py b/test/test_select.py index ac5389b7..ff05f652 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -334,20 +334,20 @@ def test_select_cols_conflicting_data(self): ValueError, "Expected `data` and `extra_data` pandas inputs to have the same " "column names. Fix with " - "`scprep.select.select_cols(*extra_data, data.columns)`", + "`scprep.select.select_cols(*extra_data, idx=data.columns)`", scprep.select.select_cols, self.X, - scprep.select.subsample(self.X.T, n=self.X.shape[0]).T) + self.X.iloc[:,::-1]) def test_select_rows_conflicting_data(self): assert_raise_message( ValueError, "Expected `data` and `extra_data` pandas inputs to have the same " "index. Fix with " - "`scprep.select.select_rows(*extra_data, data.index)`", + "`scprep.select.select_rows(*extra_data, idx=data.index)`", scprep.select.select_rows, self.X, - scprep.select.subsample(self.X, n=self.X.shape[0])) + self.X.iloc[::-1]) def test_select_cols_get_gene_set_ndarray_data(self): assert_raise_message( From e3f1e3bd9652ab5dcc4de91e7b759200fdd3292b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Oct 2019 23:12:28 -0400 Subject: [PATCH 120/125] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 36 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 +++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..5d588928 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,36 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Standalone code to reproduce the behavior: + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Actual behavior** +Include full traceback if applicable. + +**scprep version** +Output of `scprep.__version__` + +**Output of `pd.show_versions()`** + +

+ +``` +paste pd.show_versions() here +``` + +
+ +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..4a05b865 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or code snippets related to the feature request here. From 3652671c2c13bedf37a6f4683b9825991ecab37a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Oct 2019 10:01:46 -0400 Subject: [PATCH 121/125] remove naming conflict --- test/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_io.py b/test/test_io.py index cc9c4752..7b729eb9 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -642,7 +642,7 @@ def test_unzip_no_destination(): assert np.all(X.columns == Y.columns) shutil.rmtree("zip_test") -def test_unzip_no_destination(): +def test_unzip_destination(): X = data.load_10X() filename = os.path.join(data.data_dir, "test_10X.zip") tmp_filename = "zip_extract_test.zip" From 2a5ced1cad4c7f8169df0a01e4e517d0daf1199b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Oct 2019 10:01:58 -0400 Subject: [PATCH 122/125] add CONTRIBUTING and CODE_OF_CONDUCT --- CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++ CONTRIBUTING.md | 3 ++ 2 files changed, 79 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..10ef1ee7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at krishnaswamylab@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 416eb053..4341d1ad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,4 +26,7 @@ Code of Conduct We abide by the principles of openness, respect, and consideration of others of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. +Attribution +--------------- + This `CONTRIBUTING.md` was adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). \ No newline at end of file From 30b833471044bb70af090a0673c7e8ddba6c1c89 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Oct 2019 10:41:44 -0400 Subject: [PATCH 123/125] test fill_value --- test/test_patch.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/test/test_patch.py b/test/test_patch.py index d408895e..f16dcc0d 100644 --- a/test/test_patch.py +++ b/test/test_patch.py @@ -1,6 +1,7 @@ import scprep import numpy as np import pandas as pd +from pandas.core.internals.blocks import ExtensionBlock def test_pandas_series_rmatmul(): @@ -12,4 +13,27 @@ def test_pandas_series_rmatmul(): def test_pandas_sparse_iloc(): X = pd.DataFrame([[0,1,1], [0,0,1], [0,0,0]]).astype(pd.SparseDtype(float, fill_value=0.0)) - assert np.all(~np.isnan(X.iloc[[0,1]].to_numpy())) \ No newline at end of file + assert np.all(~np.isnan(X.iloc[[0,1]].to_numpy())) + + +class CustomBlock(ExtensionBlock): + + _holder = np.ndarray + + def concat_same_type(self, to_concat, placement=None): + """ + Always concatenate disregarding self.ndim as the values are + always 1D in this custom Block + """ + values = np.concatenate([blk.values for blk in to_concat]) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1) + ) + +def test_fill_value(): + values = pd.Series(np.arange(3), dtype=pd.UInt16Dtype()) + custom_block = CustomBlock(values, placement=slice(1, 2)) + assert np.isnan(custom_block.fill_value) + values = pd.Series(np.arange(3), dtype=pd.SparseDtype(float, 0.0)) + custom_block = CustomBlock(values, placement=slice(1, 2)) + assert not np.isnan(custom_block.fill_value) \ No newline at end of file From 1597c69092e2adb0a1ba502ada1de66bd53f636f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Oct 2019 10:51:50 -0400 Subject: [PATCH 124/125] ensure that monkey patch hasn't broken pandas --- test/test_patch.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/test/test_patch.py b/test/test_patch.py index f16dcc0d..150a92c7 100644 --- a/test/test_patch.py +++ b/test/test_patch.py @@ -17,23 +17,12 @@ def test_pandas_sparse_iloc(): class CustomBlock(ExtensionBlock): - _holder = np.ndarray - def concat_same_type(self, to_concat, placement=None): - """ - Always concatenate disregarding self.ndim as the values are - always 1D in this custom Block - """ - values = np.concatenate([blk.values for blk in to_concat]) - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1) - ) - def test_fill_value(): values = pd.Series(np.arange(3), dtype=pd.UInt16Dtype()) custom_block = CustomBlock(values, placement=slice(1, 2)) assert np.isnan(custom_block.fill_value) values = pd.Series(np.arange(3), dtype=pd.SparseDtype(float, 0.0)) custom_block = CustomBlock(values, placement=slice(1, 2)) - assert not np.isnan(custom_block.fill_value) \ No newline at end of file + assert not np.isnan(custom_block.fill_value) From fce05c163bc8c9d7598c0e55f7b549652015630b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Oct 2019 10:52:03 -0400 Subject: [PATCH 125/125] test google drive large download --- scprep/io/download.py | 28 +++++++++++++++++----------- test/test_io.py | 6 ++++++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/scprep/io/download.py b/scprep/io/download.py index 9566d349..5072c31c 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -27,6 +27,20 @@ def _google_drive_confirm_token(response): return None +def _GET_google_drive(id): + """Post a GET request to Google Drive""" + global _GOOGLE_DRIVE_URL + + with requests.Session() as session: + response = session.get(_GOOGLE_DRIVE_URL, params = { 'id' : id }, stream = True) + token = _google_drive_confirm_token(response) + + if token: + params = { 'id' : id, 'confirm' : token } + response = session.get(_GOOGLE_DRIVE_URL, params = params, stream = True) + return response + + def download_google_drive(id, destination): """Download a file from Google Drive @@ -41,17 +55,8 @@ def download_google_drive(id, destination): destination : string or file File to which to save the downloaded data """ - global _GOOGLE_DRIVE_URL - - with requests.Session() as session: - response = session.get(_GOOGLE_DRIVE_URL, params = { 'id' : id }, stream = True) - token = _google_drive_confirm_token(response) - - if token: - params = { 'id' : id, 'confirm' : token } - response = session.get(_GOOGLE_DRIVE_URL, params = params, stream = True) - - _save_response_content(response, destination) + response = _GET_google_drive(id) + _save_response_content(response, destination) def download_url(url, destination): @@ -68,6 +73,7 @@ def download_url(url, destination): with open(destination, 'wb') as handle: download_url(url, handle) else: + # destination is File opener = urllib.request.build_opener() opener.addheaders = _FAKE_HEADERS urllib.request.install_opener(opener) diff --git a/test/test_io.py b/test/test_io.py index 7b729eb9..a3218a79 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -612,6 +612,12 @@ def test_download_google_drive(): assert data == 'test\n', data os.remove(dest) +def test_download_google_drive_large(): + id = "1FDDSWtSZcdQUVKpk-mPCZ8Ji1Fx8KSz9" + response = scprep.io.download._GET_google_drive(id) + assert response.status_code == 200 + response.close() + def test_download_url(): X = data.load_10X() scprep.io.download.download_url("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz", "url_test.mtx.gz")