diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000..5d588928
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,36 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Standalone code to reproduce the behavior:
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Actual behavior**
+Include full traceback if applicable.
+
+**scprep version**
+Output of `scprep.__version__`
+
+**Output of `pd.show_versions()`**
+
+<details>
+
+```
+paste pd.show_versions() here
+```
+
+</details>
+ +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..4a05b865 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or code snippets related to the feature request here. diff --git a/.travis.yml b/.travis.yml index 87946602..96ed5e04 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,6 +17,10 @@ - gfortran - libblas-dev - liblapack-dev + - libglu1-mesa-dev + - freeglut3-dev + - mesa-common-dev + - libgsl-dev cache: - pip @@ -25,7 +29,7 @@ - $HOME/R/Library install: - - python setup.py install + - pip install -U . before_script: - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..10ef1ee7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. 
+ +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at krishnaswamylab@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..4341d1ad --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ + +Contributing to scprep +============================ + +There are many ways to contribute to `scprep`, with the most common ones +being contribution of code or documentation to the project. Improving the +documentation is no less important than improving the library itself. If you +find a typo in the documentation, or have made improvements, do not hesitate to +submit a GitHub pull request. + +But there are many other ways to help. In particular answering queries on the +[issue tracker](https://github.com/KrishnaswamyLab/scprep/issues), +investigating bugs, and [reviewing other developers' pull +requests](https://github.com/KrishnaswamyLab/scprep/pulls) +are very valuable contributions that decrease the burden on the project +maintainers. + +Another way to contribute is to report issues you're facing, and give a "thumbs +up" on issues that others reported and that are relevant to you. It also helps +us if you spread the word: reference the project from your blog and articles, +link to it from your website, or simply star it in GitHub to say "I use it". + +Code of Conduct +--------------- + +We abide by the principles of openness, respect, and consideration of others +of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. + +Attribution +--------------- + +This `CONTRIBUTING.md` was adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). \ No newline at end of file diff --git a/README.rst b/README.rst index dede2bcb..57dd93ac 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,5 @@ -============= -scprep -============= +.. image:: logo.png + :alt: scprep logo .. 
image:: https://img.shields.io/pypi/v/scprep.svg :target: https://pypi.org/project/scprep/ @@ -24,8 +23,14 @@ scprep :target: https://github.com/KrishnaswamyLab/scprep/ :alt: GitHub stars +`scprep` provides an all-in-one framework for loading, preprocessing, and plotting matrices in Python, with a focus on single-cell genomics. -Tools for loading and preprocessing biological matrices in Python. +The philosophy of `scprep`: + +* Data shouldn't be hidden in a complex and bespoke class object. `scprep` works with `numpy` arrays, `pandas` data frames, and `scipy` sparse matrices, all of which are popular data formats in Python and accepted as input to most common algorithms. +* Your analysis pipeline shouldn't have to change based on data format. Changing from a `numpy` array to a `pandas` data frame introduces endless technical differences (e.g. in indexing matrices). `scprep` provides data-agnostic methods that work the same way on all formats. +* Simple analysis should mean simple code. `scprep` takes care of annoying edge cases and sets nice defaults so you don't have to. +* Using a framework shouldn't be limiting. Because nothing is hidden from you, you have access to the power of `numpy`, `scipy`, `pandas` and `matplotlib` just as you would if you used them directly. Installation ------------ @@ -72,4 +77,4 @@ Examples Help ---- -If you have any questions or require assistance using scprep, please read the documentation at https://scprep.readthedocs.io/ or contact us at https://krishnaswamylab.org/get-help \ No newline at end of file +If you have any questions or require assistance using scprep, please read the documentation at https://scprep.readthedocs.io/ or contact us at https://krishnaswamylab.org/get-help diff --git a/doc/source/index.rst b/doc/source/index.rst index a19d46f1..bf1c246d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,6 +1,6 @@ -=========================================================================== -scprep -=========================================================================== +.. raw:: html + + scprep logo
.. raw:: html @@ -26,7 +26,14 @@ scprep GitHub stars -Tools for building and manipulating graphs in Python. +`scprep` provides an all-in-one framework for loading, preprocessing, and plotting matrices in Python, with a focus on single-cell genomics. + +The philosophy of `scprep`: + +* Data shouldn't be hidden in a complex and bespoke class object. `scprep` works with `numpy` arrays, `pandas` data frames, and `scipy` sparse matrices, all of which are popular data formats in Python and accepted as input to most common algorithms. +* Your analysis pipeline shouldn't have to change based on data format. Changing from a `numpy` array to a `pandas` data frame introduces endless technical differences (e.g. in indexing matrices). `scprep` provides data-agnostic methods that work the same way on all formats. +* Simple analysis should mean simple code. `scprep` takes care of annoying edge cases and sets nice defaults so you don't have to. +* Using a framework shouldn't be limiting. Because nothing is hidden from you, you have access to the power of `numpy`, `scipy`, `pandas` and `matplotlib` just as you would if you used them directly. .. toctree:: :maxdepth: 2 @@ -63,4 +70,4 @@ You can use `scprep` with your single cell data as follows:: Help ==== -If you have any questions or require assistance using scprep, please contact us at https://krishnaswamylab.org/get-help \ No newline at end of file +If you have any questions or require assistance using scprep, please contact us at https://krishnaswamylab.org/get-help diff --git a/doc/source/reference.rst b/doc/source/reference.rst index 90e274a8..8122ae6d 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -11,6 +11,24 @@ Data Input/Output :imported-members: :show-inheritance: +HDF5 +~~~~ + +.. automodule:: scprep.io.hdf5 + :autosummary: + :members: + :inherited-members: + :show-inheritance: + +Download +~~~~~~~~ + +.. automodule:: scprep.io.download + :autosummary: + :members: + :inherited-members: + :show-inheritance: + Filtering --------- @@ -103,10 +121,19 @@ External Tools :imported-members: :show-inheritance: -HDF5 ----- +Splatter +~~~~~~~~ -.. automodule:: scprep.io.hdf5 +.. automodule:: scprep.run.splatter + :autosummary: + :members: + :inherited-members: + :show-inheritance: + +Slingshot +~~~~~~~~~ + +.. 
automodule:: scprep.run.slingshot :autosummary: :members: :inherited-members: diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt index 58906e83..ae852b76 100644 --- a/doc/source/requirements.txt +++ b/doc/source/requirements.txt @@ -1,9 +1,9 @@ -numpy>=1.10.0 -scipy>=0.18.0 +numpy>=1.12.0 +scipy>=0.18.1 scikit-learn>=0.19.1 -pandas>=0.19.0,<0.24 -decorator -matplotlib +pandas>=0.25 +decorator>=4.3.0 +matplotlib>=3.0 sphinx<=1.8.5 sphinxcontrib-napoleon autodocsumm diff --git a/logo.png b/logo.png new file mode 100644 index 00000000..0bb30d36 Binary files /dev/null and b/logo.png differ diff --git a/requirements.txt b/requirements.txt index b5819b87..31d927e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.10.0 -scipy>=0.18.0 +numpy>=1.12.0 +scipy>=0.18.1 scikit-learn>=0.19.1 -pandas>=0.19.0,<0.24 +pandas>=0.25 decorator>=4.3.0 diff --git a/scprep/__init__.py b/scprep/__init__.py index 2bbd9f2f..a7a8ffdd 100644 --- a/scprep/__init__.py +++ b/scprep/__init__.py @@ -15,11 +15,13 @@ import scprep.reduce import scprep.run -import pandas as _pd -if int(_pd.__version__.split(".")[1]) < 24: - import numpy as _np - - def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ - return self.dot(_np.transpose(other)) - _pd.core.series.Series.__rmatmul__ = __rmatmul__ +import pandas as pd +if int(pd.__version__.split('.')[1]) < 26: + def fill_value(self): + # Used in reindex_indexer + try: + return self.values.dtype.fill_value + except AttributeError: + return self.values.dtype.na_value + from pandas.core.internals.blocks import ExtensionBlock + setattr(ExtensionBlock, 'fill_value', property(fill_value)) diff --git a/scprep/_lazyload.py b/scprep/_lazyload.py index 130a88b9..b9357058 100644 --- a/scprep/_lazyload.py +++ b/scprep/_lazyload.py @@ -14,7 +14,8 @@ 'rinterface', {'rinterface_lib': ['callbacks']}], 'h5py': [], - 'tables': [] + 'tables': [], + 'statsmodels': [{'nonparametric': ['smoothers_lowess']}], } diff --git a/scprep/filter.py b/scprep/filter.py index 1737cb14..e816369d 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -28,14 +28,14 @@ def remove_empty_cells(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. " "Use `scprep.filter.filter_empty_cells` instead.", DeprecationWarning) - return filter_empty_cells(data, *extra_data) + return filter_empty_cells(data, *extra_data, sample_labels=sample_labels) def remove_duplicates(data, *extra_data, sample_labels=None): warnings.warn("`scprep.filter.remove_duplicates` is deprecated. " "Use `scprep.filter.filter_duplicates` instead.", DeprecationWarning) - return filter_duplicates(data, *extra_data) + return filter_duplicates(data, *extra_data, sample_labels=sample_labels) def filter_empty_genes(data, *extra_data): @@ -120,21 +120,6 @@ def filter_empty_cells(data, *extra_data, sample_labels=None): return data -def _get_filter_idx(data, values, - cutoff, percentile, - keep_cells): - cutoff = measure._get_percentile_cutoff( - values, cutoff, percentile, required=True) - if keep_cells == 'above': - keep_cells_idx = values > cutoff - elif keep_cells == 'below': - keep_cells_idx = values < cutoff - else: - raise ValueError("Expected `keep_cells` in ['above', 'below']. 
" - "Got {}".format(keep_cells)) - return keep_cells_idx - - def filter_values(data, *extra_data, values=None, cutoff=None, percentile=None, keep_cells='above', @@ -188,9 +173,9 @@ def filter_values(data, *extra_data, values=None, "Filtering as a single sample.", DeprecationWarning) assert values is not None - keep_cells_idx = _get_filter_idx(data, values, - cutoff, percentile, - keep_cells) + keep_cells_idx = utils._get_filter_idx(values, + cutoff, percentile, + keep_cells) if return_values: extra_data = [values] + list(extra_data) data = select.select_rows(data, *extra_data, idx=keep_cells_idx) @@ -303,7 +288,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None, Filtered extra data, if passed. """ cell_sums = measure.gene_set_expression( - data, genes, + data, genes=genes, starts_with=starts_with, ends_with=ends_with, exact_word=exact_word, regex=regex, library_size_normalize=library_size_normalize) @@ -330,6 +315,8 @@ def _find_unique_cells(data): """ if isinstance(data, pd.SparseDataFrame): unique_idx = _find_unique_cells(data.to_coo()) + elif utils.is_sparse_dataframe(data): + unique_idx = _find_unique_cells(data.sparse.to_coo()) elif isinstance(data, pd.DataFrame): unique_idx = ~data.duplicated() elif isinstance(data, np.ndarray): diff --git a/scprep/io/__init__.py b/scprep/io/__init__.py index 18446deb..d34bd92c 100644 --- a/scprep/io/__init__.py +++ b/scprep/io/__init__.py @@ -4,4 +4,6 @@ from .csv import load_csv, load_tsv from .tenx import load_10X, load_10X_zip, load_10X_HDF5 from .fcs import load_fcs -from .mtx import load_mtx +from .mtx import load_mtx, save_mtx + +from . import download, hdf5 diff --git a/scprep/io/csv.py b/scprep/io/csv.py index 6e65c51b..bf17bc55 100644 --- a/scprep/io/csv.py +++ b/scprep/io/csv.py @@ -4,13 +4,14 @@ import pandas as pd from .utils import _matrix_to_data_frame +from .. import utils def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs): - """Read a csv file into a pandas.SparseDataFrame + """Read a csv file into a pd.DataFrame[pd.SparseArray] """ chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs) - data = pd.concat(chunk.to_sparse(fill_value=fill_value) + data = pd.concat(utils.dataframe_to_sparse(chunk, fill_value=fill_value) for chunk in chunks) return data @@ -36,7 +37,7 @@ def load_csv(filename, cell_axis='row', delimiter=',', If `True`, we assume cell names are in the first row/column. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. **kwargs : optional arguments for `pd.read_csv`. @@ -44,7 +45,7 @@ def load_csv(filename, cell_axis='row', delimiter=',', ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ if cell_axis not in ['row', 'column', 'col']: @@ -113,7 +114,7 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', If `True`, we assume cell names are in the first row/column. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: False) - If True, loads the data as a pd.SparseDataFrame. 
This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. **kwargs : optional arguments for `pd.read_csv`. @@ -121,7 +122,7 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter, diff --git a/scprep/io/download.py b/scprep/io/download.py new file mode 100644 index 00000000..5072c31c --- /dev/null +++ b/scprep/io/download.py @@ -0,0 +1,123 @@ +import requests +import zipfile +import tempfile +import os +import urllib.request + +_CHUNK_SIZE = 32768 +_GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download" +_FAKE_HEADERS = [('User-Agent', 'Mozilla/5.0')] + + +def _save_response_content(response, destination): + global _CHUNK_SIZE + if isinstance(destination, str): + with open(destination, 'wb') as handle: + _save_response_content(response, handle) + else: + for chunk in response.iter_content(_CHUNK_SIZE): + if chunk: # filter out keep-alive new chunks + destination.write(chunk) + + +def _google_drive_confirm_token(response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + return None + + +def _GET_google_drive(id): + """Post a GET request to Google Drive""" + global _GOOGLE_DRIVE_URL + + with requests.Session() as session: + response = session.get(_GOOGLE_DRIVE_URL, params = { 'id' : id }, stream = True) + token = _google_drive_confirm_token(response) + + if token: + params = { 'id' : id, 'confirm' : token } + response = session.get(_GOOGLE_DRIVE_URL, params = params, stream = True) + return response + + +def download_google_drive(id, destination): + """Download a file from Google Drive + + Requires the file to be available to view by anyone with the URL. + + Parameters + ---------- + id : string + Google Drive ID string. You can access this by clicking 'Get Shareable Link', + which will give a URL of the form + + destination : string or file + File to which to save the downloaded data + """ + response = _GET_google_drive(id) + _save_response_content(response, destination) + + +def download_url(url, destination): + """Download a file from a URL + + Parameters + ---------- + url : string + URL of file to be downloaded + destination : string or file + File to which to save the downloaded data + """ + if isinstance(destination, str): + with open(destination, 'wb') as handle: + download_url(url, handle) + else: + # destination is File + opener = urllib.request.build_opener() + opener.addheaders = _FAKE_HEADERS + urllib.request.install_opener(opener) + with urllib.request.urlopen(url) as handle: + destination.write(handle.read()) + + +def unzip(filename, destination=None, delete=True): + """Extract a .zip file and optionally remove the archived version + + Parameters + ---------- + filename : string + Path to the zip file + destination : string, optional (default: None) + Path to the folder in which to extract the zip. + If None, extracts to the same directory the archive is in. 
+ delete : boolean, optional (default: True) + If True, deletes the zip file after extraction + """ + filename = os.path.expanduser(filename) + if destination is None: + destination = os.path.dirname(filename) + elif not os.path.isdir(destination): + os.mkdir(destination) + with zipfile.ZipFile(filename, 'r') as handle: + handle.extractall(destination) + if delete: + os.unlink(filename) + + +def download_and_extract_zip(url, destination): + """Download a .zip file from a URL and extract it + + Parameters + ---------- + url : string + URL of file to be downloaded + destination : string + Directory in which to extract the downloaded zip + """ + if not os.path.isdir(destination): + os.mkdir(destination) + zip_handle = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) + download_url(url, zip_handle) + zip_handle.close() + unzip(zip_handle.name, destination) diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py index dd398f67..c0edf180 100644 --- a/scprep/io/fcs.py +++ b/scprep/io/fcs.py @@ -242,7 +242,7 @@ def load_fcs(filename, gene_names=True, cell_names=True, If `True`, we assume cell names are contained in the file. Otherwise expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[SparseArray]. This uses less memory but more CPU. metadata_channels : list-like, optional, shape=[n_meta] (default: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1']) Channels to be excluded from the data @@ -273,7 +273,7 @@ def load_fcs(filename, gene_names=True, cell_names=True, Values from metadata channels data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ if cell_names is True: diff --git a/scprep/io/mtx.py b/scprep/io/mtx.py index eb4d114e..6f1bd2a8 100644 --- a/scprep/io/mtx.py +++ b/scprep/io/mtx.py @@ -2,7 +2,12 @@ # (C) 2018 Krishnaswamy Lab GPLv2 import scipy.io as sio +from scipy import sparse +import pandas as pd +import os + from .utils import _matrix_to_data_frame +from .. import utils def load_mtx(mtx_file, cell_axis='row', @@ -21,14 +26,14 @@ def load_mtx(mtx_file, cell_axis='row', cell_names : `str`, array-like, or `None` (default: None) Expects a filename or an array containing a list of cell barcodes. sparse : bool, optional (default: None) - If True, loads the data as a pd.SparseDataFrame. This uses less memory + If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory but more CPU. Returns ------- data : array-like, shape=[n_samples, n_features] If either gene or cell names are given, data will be a pd.DataFrame or - pd.SparseDataFrame. If no names are given, data will be a np.ndarray + pd.DataFrame[pd.SparseArray]. 
If no names are given, data will be a np.ndarray
         or scipy.sparse.spmatrix
     """
     if cell_axis not in ['row', 'column', 'col']:
@@ -43,3 +48,50 @@
         data, gene_names=gene_names,
         cell_names=cell_names, sparse=sparse)
     return data
+
+def save_mtx(data, destination, cell_names=None, gene_names=None):
+    """Save a mtx file
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data, saved to destination/matrix.mtx
+    destination : str
+        Directory in which to save the data
+    cell_names : list-like, shape=[n_samples], optional (default: None)
+        Cell names associated with rows, saved to destination/cell_names.tsv.
+        If `data` is a pandas DataFrame and `cell_names` is None,
+        these are autopopulated from `data.index`.
+    gene_names : list-like, shape=[n_features], optional (default: None)
+        Gene names associated with columns, saved to destination/gene_names.tsv.
+        If `data` is a pandas DataFrame and `gene_names` is None,
+        these are autopopulated from `data.columns`.
+
+    Examples
+    --------
+    >>> import scprep
+    >>> scprep.io.save_mtx(data, destination="my_data")
+    >>> reload = scprep.io.load_mtx("my_data/matrix.mtx",
+    ...                             cell_names="my_data/cell_names.tsv",
+    ...                             gene_names="my_data/gene_names.tsv")
+    """
+    if isinstance(data, pd.DataFrame):
+        if cell_names is None:
+            cell_names = data.index
+        if gene_names is None:
+            gene_names = data.columns
+    data = utils.to_array_or_spmatrix(data)
+    data = sparse.coo_matrix(data)
+    # handle ~/ and relative paths
+    destination = os.path.expanduser(destination)
+    if not os.path.isdir(destination):
+        os.mkdir(destination)
+    if cell_names is not None:
+        with open(os.path.join(destination, "cell_names.tsv"), 'w') as handle:
+            for name in cell_names:
+                handle.write("{}\n".format(name))
+    if gene_names is not None:
+        with open(os.path.join(destination, "gene_names.tsv"), 'w') as handle:
+            for name in gene_names:
+                handle.write("{}\n".format(name))
+    sio.mmwrite(os.path.join(destination, "matrix.mtx"), data)
diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py
index 6da8f90b..aaa8e8e8 100644
--- a/scprep/io/tenx.py
+++ b/scprep/io/tenx.py
@@ -40,18 +40,18 @@ def _combine_gene_id(symbols, ids):
 
 def _parse_10x_genes(symbols, ids, gene_labels='symbol',
                      allow_duplicates=True):
     assert gene_labels in ['symbol', 'id', 'both']
-    if gene_labels == 'both':
-        columns = _combine_gene_id(symbols, ids)
     if gene_labels == 'symbol':
         columns = symbols
         if not allow_duplicates and len(np.unique(columns)) < len(columns):
             warnings.warn(
-                "Duplicate gene names detected! Forcing `gene_labels='id'`. "
-                "Alternatively, try `gene_labels='both'`, "
+                "Duplicate gene names detected! Forcing `gene_labels='both'`. "
+                "Alternatively, try `gene_labels='id'`, "
                 "`allow_duplicates=True`, or load the matrix"
                 " with `sparse=False`", RuntimeWarning)
-            gene_labels = 'id'
-    if gene_labels == 'id':
+            gene_labels = 'both'
+    if gene_labels == 'both':
+        columns = _combine_gene_id(symbols, ids)
+    elif gene_labels == 'id':
         columns = ids
     return columns
@@ -95,7 +95,7 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol',
     Returns
     -------
     data: array-like, shape=[n_samples, n_features]
-        If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
+        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
         be a pd.DataFrame.
     """
@@ -168,7 +168,7 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol',
     Returns
     -------
     data: array-like, shape=[n_samples, n_features]
-        If sparse, data will be a pd.SparseDataFrame. 
Otherwise, data will + If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will be a pd.DataFrame. """ @@ -247,7 +247,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', Returns ------- data: array-like, shape=[n_samples, n_features] - If sparse, data will be a pd.SparseDataFrame. Otherwise, data will + If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will be a pd.DataFrame. """ diff --git a/scprep/io/utils.py b/scprep/io/utils.py index 30a6eafa..53eb9f70 100644 --- a/scprep/io/utils.py +++ b/scprep/io/utils.py @@ -6,6 +6,8 @@ import warnings import numpy as np +from .. import utils + def _parse_header(header, n_expected, header_type="gene_names"): """ @@ -93,7 +95,7 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): # dataframe with index and/or columns if sparse is None: # let the input data decide - sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data) + sparse = utils.is_sparse_dataframe(data) or sp.issparse(data) if sparse and gene_names is not None and \ len(np.unique(gene_names)) < len(gene_names): warnings.warn( @@ -101,18 +103,19 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): RuntimeWarning) sparse = False if sparse: - # return pandas.SparseDataFrame + # return pandas.DataFrame[SparseArray] if isinstance(data, pd.DataFrame): if gene_names is not None: data.columns = gene_names if cell_names is not None: data.index = cell_names - if not isinstance(data, pd.SparseDataFrame): - data = data.to_sparse(fill_value=0.0) + if not utils.is_sparse_dataframe(data): + data = utils.dataframe_to_sparse(data, fill_value=0.0) + elif sp.issparse(data): + data = pd.DataFrame.sparse.from_spmatrix(data, index=cell_names, columns=gene_names) else: - data = pd.SparseDataFrame(data, default_fill_value=0.0) - data.index = cell_names - data.columns = gene_names + data = pd.DataFrame(data, index=cell_names, columns=gene_names) + data = utils.dataframe_to_sparse(data, fill_value=0.0) else: # return pandas.DataFrame if isinstance(data, pd.DataFrame): @@ -120,8 +123,8 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): data.columns = gene_names if cell_names is not None: data.index = cell_names - if isinstance(data, pd.SparseDataFrame): - data = data.to_dense() + if utils.is_sparse_dataframe(data): + data = data.sparse.to_dense() else: if sp.issparse(data): data = data.toarray() diff --git a/scprep/measure.py b/scprep/measure.py index 27a18f8b..fda2347c 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -1,8 +1,11 @@ import numpy as np +import pandas as pd import warnings import numbers +import scipy.signal from . 
import utils, select +from ._lazyload import statsmodels def library_size(data): @@ -19,6 +22,8 @@ def library_size(data): Sum over all genes for each cell """ library_size = utils.matrix_sum(data, axis=1) + if isinstance(library_size, pd.Series): + library_size.name = 'library_size' return library_size @@ -59,45 +64,55 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, gene_set_expression = library_size(gene_data) else: gene_set_expression = gene_data + if isinstance(gene_set_expression, pd.Series): + gene_set_expression.name = 'expression' return gene_set_expression -def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): - """Get a cutoff for a dataset +@utils._with_pkg(pkg="statsmodels") +def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False): + """Measure the variability of each gene in a dataset + + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve Parameters ---------- - data : array-like - cutoff : float or None, optional (default: None) - Absolute cutoff value. Only one of cutoff and percentile may be given - percentile : float or None, optional (default: None) - Percentile cutoff value between 0 and 100. - Only one of cutoff and percentile may be given - required : bool, optional (default: False) - If True, one of cutoff and percentile must be given. + data : array-like, shape=[n_samples, n_features] + Input data + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce computation time. + kernel_size : float or int, optional (default: 0.05) + Width of rolling median window. If a float between 0 and 1, the width is given by + kernel_size * data.shape[1]. Otherwise should be an odd integer + return_means : boolean, optional (default: False) + If True, return the gene means Returns ------- - cutoff : float or None - Absolute cutoff value. Can only be None if required is False and - cutoff and percentile are both None. + variability : list-like, shape=[n_samples] + Variability for each gene """ - if percentile is not None: - if cutoff is not None: - raise ValueError( - "Only one of `cutoff` and `percentile` should be given." - "Got cutoff={}, percentile={}".format(cutoff, percentile)) - if not isinstance(percentile, numbers.Number): - return [_get_percentile_cutoff(data, percentile=p) - for p in percentile] - if percentile < 1: - warnings.warn( - "`percentile` expects values between 0 and 100." - "Got {}. 
Did you mean {}?".format(percentile, - percentile * 100), - UserWarning) - cutoff = np.percentile(np.array(data).reshape(-1), percentile) - elif cutoff is None and required: - raise ValueError( - "One of either `cutoff` or `percentile` must be given.") - return cutoff + columns = data.columns if isinstance(data, pd.DataFrame) else None + data = utils.to_array_or_spmatrix(data) + data_std = utils.matrix_std(data, axis=0) ** 2 + if kernel_size < 1: + kernel_size = 2*(int(kernel_size * len(data_std))//2)+1 + order = np.argsort(data_std) + data_std_med = np.empty_like(data_std) + data_std_med[order] = scipy.signal.medfilt(data_std[order], kernel_size=kernel_size) + data_mean = utils.toarray(np.mean(data, axis=0)).flatten() + delta = np.std(data_std_med) * interpolate + lowess = statsmodels.nonparametric.smoothers_lowess.lowess( + data_std_med, data_mean, + delta=delta, frac=span, return_sorted=False) + result = data_std - lowess + if columns is not None: + result = pd.Series(result, index=columns, name='variability') + data_mean = pd.Series(data_mean, index=columns, name='mean') + if return_means: + result = result, data_mean + return result diff --git a/scprep/normalize.py b/scprep/normalize.py index 69501612..ef83852a 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -34,7 +34,7 @@ def _get_scaled_libsize(data, rescale='median', return_library_size=False): return rescale, libsize -def library_size_normalize(data, rescale='median', +def library_size_normalize(data, rescale=10000, return_library_size=False): """Performs L1 normalization on input data Performs L1 normalization on input data such that the sum of expression @@ -46,7 +46,7 @@ def library_size_normalize(data, rescale='median', ---------- data : array-like, shape=[n_samples, n_features] Input data - rescale : {'mean', 'median'}, float or `None`, optional (default: 'median') + rescale : {'mean', 'median'}, float or `None`, optional (default: 10000) Rescaling strategy. If 'mean' or 'median', normalized cells are scaled back up to the mean or median expression value. If a float, normalized cells are scaled up to the given value. If `None`, no @@ -64,13 +64,15 @@ def library_size_normalize(data, rescale='median', """ # pandas support columns, index = None, None - if isinstance(data, pd.SparseDataFrame) or \ - pd.api.types.is_sparse(data): + if isinstance(data, pd.DataFrame): columns, index = data.columns, data.index - data = data.to_coo() - elif isinstance(data, pd.DataFrame): - columns, index = data.columns, data.index - data = data.values + if utils.is_sparse_dataframe(data): + data = data.sparse.to_coo() + elif isinstance(data, pd.SparseDataFrame): + data = data.to_coo() + else: + # dense data + data = data.to_numpy() calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31) @@ -91,7 +93,7 @@ def library_size_normalize(data, rescale='median', if columns is not None: # pandas dataframe if sparse.issparse(data_norm): - data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0) + data_norm = utils.SparseDataFrame(data_norm, default_fill_value=0.0) else: data_norm = pd.DataFrame(data_norm) data_norm.columns = columns @@ -120,7 +122,7 @@ def batch_mean_center(data, sample_idx=None): data : array-like, shape=[n_samples, n_features] Batch mean-centered output data. """ - if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame): + if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame) or utils.is_sparse_dataframe(data): raise ValueError("Cannot mean center sparse data. 
" "Convert to dense matrix first.") if sample_idx is None: diff --git a/scprep/plot/__init__.py b/scprep/plot/__init__.py index 1a7ca181..582a8a0d 100644 --- a/scprep/plot/__init__.py +++ b/scprep/plot/__init__.py @@ -3,4 +3,5 @@ from .marker import marker_plot from .scree import scree_plot from .jitter import jitter +from .variable_genes import plot_gene_variability from . import tools, colors diff --git a/scprep/plot/colors.py b/scprep/plot/colors.py index bdbc347b..4d688dd5 100644 --- a/scprep/plot/colors.py +++ b/scprep/plot/colors.py @@ -74,3 +74,48 @@ def tab40(): colors = np.vstack([mpl.cm.tab20c.colors, mpl.cm.tab20b.colors]) return mpl.colors.ListedColormap(colors) + + +def tab(n=10): + """A discrete colormap with an arbitrary number of colors + + This colormap chooses the best of the following, in order: + - `plt.cm.tab10` + - `plt.cm.tab20` + - `scprep.plot.colors.tab30` + - `scprep.plot.colors.tab40` + - `scprep.plot.colors.tab10_continuous` + + If the number of colors required is less than the number of colors + available, colors are selected specifically in order to reduce similarity + between selected colors. + + Parameters + ---------- + n : int, optional (default: 10) + Number of required colors. + + Returns + ------- + cmap : `matplotlib.colors.ListedColormap` + """ + if n < 1: + raise ValueError( + "Expected n >= 1. Got {}".format(n)) + n_shades = int(np.ceil(n / 10)) + if n_shades == 1: + cmap = mpl.cm.tab10 + elif n_shades == 2: + cmap = mpl.cm.tab20 + elif n_shades == 3: + cmap = tab30() + elif n_shades == 4: + cmap = tab40() + else: + cmap = tab10_continuous(n_colors=10, n_step=n_shades) + # restrict to n values + if n > 1 and n < cmap.N: + select_idx = np.tile(np.arange(10), n_shades) * \ + n_shades + np.repeat(np.arange(n_shades), 10) + cmap = mpl.colors.ListedColormap(np.array(cmap.colors)[select_idx[:n]]) + return cmap diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index 0dfb30df..e3c43dc8 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -1,5 +1,6 @@ import numpy as np import numbers +import warnings from .. import measure, utils from .utils import (_get_figure, show, @@ -18,6 +19,8 @@ def histogram(data, fontsize=None, histtype='stepfilled', alpha=None, + filename=None, + dpi=None, **kwargs): """Plot a histogram. @@ -55,6 +58,12 @@ def histogram(data, 'stepfilled' generates a lineplot that is by default filled. alpha : float, optional (default: 1 for a single dataset, 0.5 for multiple) Histogram transparency + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -78,8 +87,20 @@ def histogram(data, if alpha is None: alpha = 1 if log == 'x' or log is True: - bins = np.logspace(np.log10(max(xmin, 1)), - np.log10(xmax), + if xmax < np.finfo('float').eps: + raise ValueError("Expected positive data for log = {}. " + "Got max(data) = {:.2f}".format(log, xmax)) + elif xmin < np.finfo('float').eps: + warnings.warn("Expected positive data for log = {}. 
" + "Got min(data) = {:.2f}".format(log, xmin), UserWarning) + xmin = np.finfo('float').eps + xmin = np.log10(xmin) + xmax = np.log10(xmax) + xrange = max(xmax - xmin, 1) + xmin = xmin - xrange * 0.1 + xmax = xmax + xrange * 0.1 + bins = np.logspace(xmin, + xmax, bins) ax.hist(data, bins=bins, histtype=histtype, alpha=alpha, **kwargs) @@ -94,7 +115,7 @@ def histogram(data, if title is not None: ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) - cutoff = measure._get_percentile_cutoff( + cutoff = utils._get_percentile_cutoff( data, cutoff, percentile, required=False) if cutoff is not None: if isinstance(cutoff, numbers.Number): @@ -102,8 +123,11 @@ def histogram(data, else: for c in cutoff: ax.axvline(c, color='red') + # save and show if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax @@ -115,6 +139,8 @@ def plot_library_size(data, xlabel='Library size', title=None, fontsize=None, + filename=None, + dpi=None, **kwargs): """Plot the library size histogram. @@ -144,6 +170,12 @@ def plot_library_size(data, Axis title. fontsize : float or None (default: None) Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -161,7 +193,8 @@ def plot_library_size(data, return histogram(libsize, cutoff=cutoff, percentile=percentile, bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, **kwargs) + xlabel=xlabel, title=title, fontsize=fontsize, + filename=filename, dpi=dpi, **kwargs) @utils._with_pkg(pkg="matplotlib", min_version=3) @@ -175,6 +208,8 @@ def plot_gene_set_expression(data, genes=None, xlabel='Gene expression', title=None, fontsize=None, + filename=None, + dpi=None, **kwargs): """Plot the histogram of the expression of a gene set. @@ -216,6 +251,12 @@ def plot_gene_set_expression(data, genes=None, Axis title. fontsize : float or None (default: None) Base font size. + filename : str or None (default: None) + file to which the output is saved + dpi : int or None, optional (default: None) + The resolution in dots per inch. If None it will default to the value + savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi + to be the value of the figure. Only used if filename is not None. **kwargs : additional arguments for `matplotlib.pyplot.hist` Returns @@ -245,4 +286,5 @@ def plot_gene_set_expression(data, genes=None, return histogram(expression, cutoff=cutoff, percentile=percentile, bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, **kwargs) + xlabel=xlabel, title=title, fontsize=fontsize, + filename=filename, dpi=dpi, **kwargs) diff --git a/scprep/plot/jitter.py b/scprep/plot/jitter.py index 3b75f422..3e58ccd7 100644 --- a/scprep/plot/jitter.py +++ b/scprep/plot/jitter.py @@ -30,7 +30,7 @@ def x_coords(self): def jitter(labels, values, sigma=0.1, c=None, cmap=None, cmap_scale='linear', s=None, - plot_means=True, means_s=100, means_c='xkcd:light lavender', + plot_means=True, means_s=100, means_c='lightgrey', discrete=None, ax=None, legend=None, colorbar=None, @@ -84,7 +84,7 @@ def jitter(labels, values, sigma=0.1, If True, plot the mean value for each label. 
means_s : float, optional (default: 100) Point size for mean values. - means_c : string, list-like or matplotlib color, optional (default: 'xkcd:light lavender') + means_c : string, list-like or matplotlib color, optional (default: 'lightgrey') Point color(s) for mean values. discrete : bool or None, optional (default: None) If True, the legend is categorical. If False, the legend is a colorbar. @@ -113,13 +113,8 @@ def jitter(labels, values, sigma=0.1, If a list, sets custom axis tick labels {x,y}ticklabels : True, False, or list-like (default: None) If set, overrides `ticklabels` - label_prefix : str or None (default: None) - Prefix for all axis labels. Axes will be labelled `label_prefix`1, - `label_prefix`2, etc. Can be overriden by setting `xlabel`, - `ylabel`, and `zlabel`. {x,y}label : str or None (default : None) - Axis labels. Overrides the automatic label given by - label_prefix. If None and label_prefix is None, no label is set. + Axis labels. If None, no label is set. title : str or None (default: None) axis title. If None, no title is set. fontsize : float or None (default: None) @@ -157,7 +152,8 @@ def jitter(labels, values, sigma=0.1, labels, values, c=c, discrete=discrete, cmap=cmap, cmap_scale=cmap_scale, vmin=vmin, vmax=vmax, s=s, - legend=legend, colorbar=colorbar) + legend=legend, colorbar=colorbar, + xlabel=xlabel, ylabel=ylabel) fig, ax, show_fig = _get_figure( ax, figsize, subplot_kw=params.subplot_kw) @@ -190,9 +186,9 @@ def jitter(labels, values, sigma=0.1, xticklabels = params.x_labels # label axes - label_axis(ax.xaxis, xticks, xticklabels, xlabel) + label_axis(ax.xaxis, xticks, xticklabels, params.xlabel) label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), ylabel) + _with_default(yticklabels, ticklabels), params.ylabel) # manually set x limits xmin = np.min(params.x_coords) @@ -216,8 +212,8 @@ def jitter(labels, values, sigma=0.1, scale=sc.norm) # save and show - if filename is not None: - fig.savefig(filename, dpi=dpi) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 6821c049..9bfa5738 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -9,21 +9,34 @@ _with_default) from .tools import (create_colormap, create_normalize, label_axis, generate_colorbar, generate_legend) +from . 
import colors from .._lazyload import matplotlib as mpl plt = mpl.pyplot +def _squeeze_array(x): + x = utils.toarray([x]).squeeze() + try: + len(x) + except TypeError: + x = x[None] + return x + + class _ScatterParams(object): - def __init__(self, x, y, z=None, c=None, discrete=None, + def __init__(self, x, y, z=None, c=None, mask=None, + discrete=None, cmap=None, cmap_scale=None, vmin=None, vmax=None, s=None, legend=None, colorbar=None, - shuffle=True): - self._x = utils.toarray(x).squeeze() - self._y = utils.toarray(y).squeeze() - self._z = utils.toarray(z).squeeze() if z is not None else None + xlabel=None, ylabel=None, zlabel=None, + label_prefix=None, shuffle=True): + self._x = x + self._y = y + self._z = z if z is not None else None self._c = c + self._mask = mask self._discrete = discrete self._cmap = cmap self._cmap_scale = cmap_scale @@ -34,9 +47,14 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self._colorbar = colorbar self._labels = None self._c_discrete = None + self._label_prefix = label_prefix + self._xlabel = xlabel + self._ylabel = ylabel + self._zlabel = zlabel self.shuffle = shuffle self.check_size() self.check_c() + self.check_mask() self.check_s() self.check_discrete() self.check_legend() @@ -44,32 +62,49 @@ def __init__(self, x, y, z=None, c=None, discrete=None, self.check_cmap_scale() self.check_vmin_vmax() + @property + def x_array(self): + return _squeeze_array(self._x) + + @property + def y_array(self): + return _squeeze_array(self._y) + + @property + def z_array(self): + return _squeeze_array(self._z) if self._z is not None else None + @property def size(self): - return len(self._x) + try: + return self._size + except AttributeError: + self._size = len(self.x_array) + return self._size @property def plot_idx(self): try: return self._plot_idx except AttributeError: + self._plot_idx = np.arange(self.size) + if self._mask is not None: + self._plot_idx = self._plot_idx[self._mask] if self.shuffle: - self._plot_idx = np.random.permutation(self.size) - else: - self._plot_idx = np.arange(self.size) + self._plot_idx = np.random.permutation(self._plot_idx) return self._plot_idx @property def x(self): - return self._x[self.plot_idx] + return self.x_array[self.plot_idx] @property def y(self): - return self._y[self.plot_idx] + return self.y_array[self.plot_idx] @property def z(self): - return self._z[self.plot_idx] if self._z is not None else None + return self.z_array[self.plot_idx] if self._z is not None else None @property def data(self): @@ -81,9 +116,9 @@ def data(self): @property def _data(self): if self._z is not None: - return [self._x, self._y, self._z] + return [self.x_array, self.y_array, self.z_array] else: - return [self._x, self._y] + return [self.x_array, self.y_array] @property def s(self): @@ -110,13 +145,20 @@ def array_c(self): self._c) return self._array_c + @property + def _c_masked(self): + if self.constant_c() or self._mask is None: + return self._c + else: + return self._c[self._mask] + @property def c_unique(self): """Get unique values in c to avoid recomputing every time""" try: return self._c_unique except AttributeError: - self._c_unique = np.unique(self._c) + self._c_unique = np.unique(self._c_masked) return self._c_unique @property @@ -145,7 +187,7 @@ def discrete(self): else: if isinstance(self._cmap, dict) or not \ np.all([isinstance(x, numbers.Number) - for x in self._c]): + for x in self._c_masked]): # cmap dictionary or non-numeric values force discrete return True else: @@ -171,8 +213,9 @@ def c_discrete(self): for i, label in 
enumerate(self._labels): self._c_discrete[self._c == label] = i else: - self._c_discrete, self._labels = pd.factorize( - self._c, sort=True) + self._c_discrete = np.zeros_like(self._c, dtype=int) + self._c_discrete[self._mask], self._labels = pd.factorize( + self._c_masked, sort=True) return self._c_discrete @property @@ -215,7 +258,7 @@ def vmin(self): if self.constant_c() or self.array_c() or self.discrete: return None else: - return np.min(self.c) + return np.nanmin(self.c) @property def vmax(self): @@ -225,7 +268,7 @@ def vmax(self): if self.constant_c() or self.array_c() or self.discrete: return None else: - return np.max(self.c) + return np.nanmax(self.c) def list_cmap(self): """Is the colormap a list?""" @@ -257,11 +300,7 @@ def cmap(self): if self.constant_c() or self.array_c(): return None elif self.discrete: - n_unique_colors = self.n_c_unique - if n_unique_colors <= 10: - return self.process_string_cmap('tab10') - else: - return self.process_string_cmap('tab20') + return colors.tab(n=self.n_c_unique) else: return self.process_string_cmap('inferno') @@ -354,14 +393,21 @@ def check_size(self): def check_c(self): if not self.constant_c(): - self._c = utils.toarray(self._c).squeeze() + self._c = _squeeze_array(self._c) if not len(self._c) == self.size: raise ValueError("Expected c of length {} or 1. Got {}".format( self.size, len(self._c))) + def check_mask(self): + if self._mask is not None: + self._mask = _squeeze_array(self._mask) + if not len(self._mask) == self.size: + raise ValueError("Expected mask of length {}. Got {}".format( + self.size, len(self._mask))) + def check_s(self): if self._s is not None and not isinstance(self._s, numbers.Number): - self._s = utils.toarray(self._s).squeeze() + self._s = _squeeze_array(self._s) if not len(self._s) == self.size: raise ValueError("Expected s of length {} or 1. Got {}".format( self.size, len(self._s))) @@ -414,10 +460,46 @@ def check_cmap_scale(self): UserWarning) self._cmap_scale = 'linear' + @property + def xlabel(self): + if self._xlabel is not None: + return self._xlabel + elif self._label_prefix is not None: + return self._label_prefix + "1" + elif isinstance(self._x, pd.Series): + return self._x.name + else: + return None + + @property + def ylabel(self): + if self._ylabel is not None: + return self._ylabel + elif self._label_prefix is not None: + return self._label_prefix + "2" + elif isinstance(self._y, pd.Series): + return self._y.name + else: + return None + + @property + def zlabel(self): + if self._z is None: + return None + elif self._zlabel is not None: + return self._zlabel + elif self._label_prefix is not None: + return self._label_prefix + "3" + elif isinstance(self._z, pd.Series): + return self._z.name + else: + return None + @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter(x, y, z=None, - c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, +def scatter(x, y, z=None, mask=None, + c=None, cmap=None, cmap_scale='linear', s=None, + discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, @@ -458,6 +540,8 @@ def scatter(x, y, z=None, data for y axis z : list-like, optional (default: None) data for z axis + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. 
Can be a single color value (RGB, RGBA, or named matplotlib colors), an array of these of length n_samples, or a list of @@ -560,11 +644,12 @@ def scatter(x, y, z=None, """ with temp_fontsize(fontsize): params = _ScatterParams( - x, y, z, c=c, discrete=discrete, + x, y, z, c=c, mask=mask, discrete=discrete, cmap=cmap, cmap_scale=cmap_scale, vmin=vmin, vmax=vmax, s=s, legend=legend, colorbar=colorbar, - shuffle=shuffle) + xlabel=xlabel, ylabel=ylabel, zlabel=zlabel, + label_prefix=label_prefix, shuffle=shuffle) fig, ax, show_fig = _get_figure( ax, figsize, subplot_kw=params.subplot_kw) @@ -575,23 +660,14 @@ def scatter(x, y, z=None, c=params.c, cmap=params.cmap, norm=params.norm, s=params.s, vmin=params.vmin, vmax=params.vmax, **plot_kwargs) - # automatic axis labels - if label_prefix is not None: - if xlabel is None: - xlabel = label_prefix + "1" - if ylabel is None: - ylabel = label_prefix + "2" - if zlabel is None: - zlabel = label_prefix + "3" - # label axes label_axis(ax.xaxis, _with_default(xticks, ticks), - _with_default(xticklabels, ticklabels), xlabel) + _with_default(xticklabels, ticklabels), params.xlabel) label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), ylabel) + _with_default(yticklabels, ticklabels), params.ylabel) if z is not None: label_axis(ax.zaxis, _with_default(zticks, ticks), - _with_default(zticklabels, ticklabels), zlabel) + _with_default(zticklabels, ticklabels), params.zlabel) if title is not None: ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) @@ -614,15 +690,15 @@ def scatter(x, y, z=None, ax.view_init(elev=elev, azim=azim) # save and show - if filename is not None: - fig.savefig(filename, dpi=dpi) if show_fig: show(fig) + if filename is not None: + fig.savefig(filename, dpi=dpi) return ax @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter2d(data, +def scatter2d(data, mask=None, c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, figsize=None, @@ -652,6 +728,8 @@ def scatter2d(data, ---------- data : array-like, shape=[n_samples, n_features] Input data. Only the first two components will be used. + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. Can be a single color value (RGB, RGBA, or named matplotlib colors), an array of these of length n_samples, or a list of @@ -749,8 +827,11 @@ def scatter2d(data, >>> data[colors == 'a'] += 10 >>> scprep.plot.scatter2d(data, c=colors, cmap={'a' : [1,0,0,1], 'b' : 'xkcd:sky blue'}) """ + if isinstance(data, list): + data = utils.toarray(data) return scatter(x=select.select_cols(data, idx=0), y=select.select_cols(data, idx=1), + mask=mask, c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete, ax=ax, legend=legend, colorbar=colorbar, shuffle=shuffle, figsize=figsize, @@ -774,7 +855,7 @@ def scatter2d(data, @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter3d(data, +def scatter3d(data, mask=None, c=None, cmap=None, cmap_scale='linear', s=None, discrete=None, ax=None, legend=None, colorbar=None, shuffle=True, @@ -809,6 +890,8 @@ def scatter3d(data, ---------- data : array-like, shape=[n_samples, n_features] Input data. Only the first two components will be used. + mask : list-like, optional (default: None) + boolean mask to hide data points c : list-like or None, optional (default: None) Color vector. 
        Can be a single color value (RGB, RGBA, or named
        matplotlib colors), an array of these of length n_samples, or a list of
@@ -910,9 +993,15 @@ def scatter3d(data,
     >>> data[colors == 'a'] += 5
     >>> scprep.plot.scatter3d(data, c=colors, cmap={'a' : [1,0,0,1], 'b' : 'xkcd:sky blue'})
     """
-    return scatter(x=select.select_cols(data, idx=0),
-                   y=select.select_cols(data, idx=1),
-                   z=select.select_cols(data, idx=2),
+    if isinstance(data, list):
+        data = utils.toarray(data)
+    try:
+        x = select.select_cols(data, idx=0)
+        y = select.select_cols(data, idx=1)
+        z = select.select_cols(data, idx=2)
+    except IndexError:
+        raise ValueError("Expected data.shape[1] >= 3. Got {}".format(data.shape[1]))
+    return scatter(x=x, y=y, z=z, mask=mask,
                    c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, discrete=discrete,
                    ax=ax, legend=legend, colorbar=colorbar,
                    shuffle=shuffle, figsize=figsize,
@@ -977,7 +1066,7 @@ def rotate_scatter3d(data,
        savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi
        to be the value of the figure. Only used if filename is not None.
    **kwargs : keyword arguments
-        See :~func:`phate.plot.scatter3d`.
+        See :func:`~scprep.plot.scatter3d`.

    Returns
    -------
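Reviewer's note: the new `mask` argument threads through `scatter`, `scatter2d`, and `scatter3d`, and pairs with the `np.nanmin`/`np.nanmax` changes above so that hidden points cannot distort the color scale. A minimal sketch of the intended usage once this patch is applied (toy data; the variable names are illustrative):

```python
import numpy as np
import scprep

data = np.random.normal(size=(100, 2))  # toy 2D embedding
values = data[:, 0]                     # per-point value to color by

# `mask` takes a boolean vector of length n_samples; per the new docstring
# it hides data points (assuming True means "keep", only points with a
# positive value are drawn here)
scprep.plot.scatter2d(data, c=values, mask=values > 0,
                      label_prefix="PC", legend_title="value")
```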
diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py
index fa98b2f6..fc693798 100644
--- a/scprep/plot/scree.py
+++ b/scprep/plot/scree.py
@@ -1,6 +1,7 @@
 import numpy as np

 from .. import utils
+from .._lazyload import matplotlib as mpl
 from .utils import (_get_figure, show,
                     temp_fontsize)
@@ -10,7 +11,7 @@
 @utils._with_pkg(pkg="matplotlib", min_version=3)
 def scree_plot(singular_values, cumulative=False, ax=None, figsize=None,
                xlabel='Principal Component', ylabel='Explained Variance (%)',
-               fontsize=None,
+               fontsize=None, filename=None, dpi=None,
                **kwargs):
     """Plot the explained variance of each principal component
@@ -28,6 +29,12 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None,
         Labels to display on the x and y axis.
     fontsize : float or None (default: None)
         Base font size.
+    filename : str or None (default: None)
+        file to which the output is saved
+    dpi : int or None, optional (default: None)
+        The resolution in dots per inch. If None it will default to the value
+        savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi
+        to be the value of the figure. Only used if filename is not None.
-    **kwargs : additional arguments for `matplotlib.pyplot.plot`
+    **kwargs : additional arguments for `matplotlib.pyplot.bar`

     Returns
@@ -46,14 +53,18 @@
     """
     with temp_fontsize(fontsize):
         explained_variance = singular_values ** 2
-        explained_variance = explained_variance / explained_variance.sum()
+        explained_variance = explained_variance / explained_variance.sum() * 100
         if cumulative:
             explained_variance = np.cumsum(explained_variance)
         fig, ax, show_fig = _get_figure(ax, figsize)
-        ax.plot(np.arange(len(explained_variance)),
-                explained_variance, **kwargs)
+        ax.bar(np.arange(len(explained_variance)) + 1,
+               explained_variance, **kwargs)
         label_axis(ax.xaxis, label=xlabel)
         label_axis(ax.yaxis, label=ylabel)
+        ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
+        ax.set_xlim(0.3, len(explained_variance) + 0.7)
         if show_fig:
             show(fig)
+    if filename is not None:
+        fig.savefig(filename, dpi=dpi)
     return ax
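Since `scree_plot` now reports explained variance in percent and draws 1-indexed bars, a quick sketch of the updated call pattern, combined with `return_singular_values` from `scprep.reduce.pca` (toy data; shapes are illustrative):

```python
import numpy as np
import scprep

data = np.random.normal(size=(200, 50))  # toy expression matrix
data_pca, singular_values = scprep.reduce.pca(
    data, n_components=10, return_singular_values=True)

# y axis is now percent explained variance; one bar per component
scprep.plot.scree_plot(singular_values, cumulative=True,
                       filename="scree.png", dpi=150)
```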
diff --git a/scprep/plot/utils.py b/scprep/plot/utils.py
index 900aa3fb..f0d77e92 100644
--- a/scprep/plot/utils.py
+++ b/scprep/plot/utils.py
@@ -82,8 +82,8 @@ def show(fig):
     fig : matplotlib.Figure
         Figure to show
     """
+    fig.tight_layout()
     if _mpl_is_gui_backend():
-        fig.tight_layout()
         if platform.system() == "Windows":
             plt.show(block=True)
         else:
diff --git a/scprep/plot/variable_genes.py b/scprep/plot/variable_genes.py
new file mode 100644
index 00000000..c5fc6b43
--- /dev/null
+++ b/scprep/plot/variable_genes.py
@@ -0,0 +1,70 @@
+from .scatter import scatter
+from .. import utils, measure
+
+
+@utils._with_pkg(pkg="matplotlib", min_version=3)
+def plot_gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05,
+                          cutoff=None, percentile=90,
+                          ax=None, figsize=None,
+                          xlabel='Gene mean',
+                          ylabel='Standardized variance',
+                          title=None,
+                          fontsize=None,
+                          filename=None,
+                          dpi=None, **kwargs):
+    """Plot the variability of each gene against its mean expression
+
+    Variability is computed as the deviation from a loess fit
+    to the rolling median of the mean-variance curve
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data. Multiple datasets may be given as a list of array-likes.
+    span : float, optional (default: 0.7)
+        Fraction of genes to use when computing the loess estimate at each point
+    interpolate : float, optional (default: 0.2)
+        Multiple of the standard deviation of variances at which to interpolate
+        linearly in order to reduce computation time.
+    kernel_size : float or int, optional (default: 0.05)
+        Width of rolling median window. If a float, the width is given by
+        kernel_size * data.shape[1]
+    cutoff : float or `None`, optional (default: `None`)
+        Absolute variability cutoff above which genes are highlighted.
+        Only one of `cutoff` and `percentile` may be given.
+    percentile : float or `None`, optional (default: 90)
+        Percentile of variability between 0 and 100 above which genes are
+        highlighted. Only one of `cutoff` and `percentile` may be given.
+    ax : `matplotlib.Axes` or None, optional (default: None)
+        Axis to plot on. If None, a new axis will be created.
+    figsize : tuple or None, optional (default: None)
+        If not None, sets the figure size (width, height)
+    [x,y]label : str, optional
+        Labels to display on the x and y axis.
+    title : str or None, optional (default: None)
+        Axis title.
+    fontsize : float or None (default: None)
+        Base font size.
+    filename : str or None (default: None)
+        file to which the output is saved
+    dpi : int or None, optional (default: None)
+        The resolution in dots per inch. If None it will default to the value
+        savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi
+        to be the value of the figure. Only used if filename is not None.
+    **kwargs : additional arguments for `scprep.plot.scatter`
+
+    Returns
+    -------
+    ax : `matplotlib.Axes`
+        axis on which plot was drawn
+    """
+    variability, means = measure.gene_variability(data, span=span, interpolate=interpolate,
+                                                  kernel_size=kernel_size, return_means=True)
+    keep_cells_idx = utils._get_filter_idx(variability,
+                                           cutoff, percentile,
+                                           keep_cells='above')
+    return scatter(means, variability, c=keep_cells_idx,
+                   cmap={True : 'red', False : 'black'},
+                   xlabel=xlabel, ylabel=ylabel, title=title,
+                   fontsize=fontsize, filename=filename, dpi=dpi,
+                   **kwargs)
diff --git a/scprep/reduce.py b/scprep/reduce.py
index 028d36c6..089eaa33 100644
--- a/scprep/reduce.py
+++ b/scprep/reduce.py
@@ -238,11 +238,12 @@ def pca(data, n_components=100, eps=0.3,
         Parameter to control the quality of the embedding of sparse input.
         Smaller values lead to more accurate embeddings but higher
         computational and memory costs
-    method : {'svd', 'orth_rproj', 'rproj'}, optional (default: 'svd')
+    method : {'svd', 'orth_rproj', 'rproj', 'dense'}, optional (default: 'svd')
         Dimensionality reduction method applied prior to mean centering
         of sparse input. The method choice affects accuracy
-        (`svd` > `orth_rproj` > `rproj`) comes with increased
-        computational cost (but not memory.)
+        (`svd` > `orth_rproj` > `rproj`) and comes with increased
+        computational cost (but not memory). On the other hand,
+        `method='dense'` adds a memory cost but is faster.
     seed : int, RandomState or None, optional (default: None)
         Random state.
     return_singular_values : bool, optional (default: False)
@@ -277,10 +278,14 @@
                 n_components, min(data.shape)))

     # handle dataframes
-    if isinstance(data, pd.SparseDataFrame):
-        data = data.to_coo()
-    elif isinstance(data, pd.DataFrame):
-        data = data.values
+    if isinstance(data, pd.DataFrame):
+        index = data.index
+    else:
+        index = None
+    if method == 'dense':
+        data = utils.toarray(data)
+    else:
+        data = utils.to_array_or_spmatrix(data)

     # handle sparsity
     if sparse.issparse(data):
@@ -299,6 +304,10 @@
         pca_op = decomposition.PCA(n_components, random_state=seed)
         data = pca_op.fit_transform(data)

+    if index is not None:
+        data = pd.DataFrame(data, index=index,
+                            columns=["PC{}".format(i+1) for i in range(n_components)])
+
     if return_singular_values:
         data = (data, pca_op.singular_values_)
     return data
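Worth flagging for reviewers: `pca` now round-trips pandas input, returning a DataFrame with the original index and `PC1`...`PCn` column names. A minimal sketch (toy DataFrame; names are illustrative):

```python
import numpy as np
import pandas as pd
import scprep

df = pd.DataFrame(np.random.normal(size=(100, 20)),
                  index=["cell_{}".format(i) for i in range(100)])
df_pca = scprep.reduce.pca(df, n_components=5)

print(df_pca.columns.tolist())  # ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
print(df_pca.index[0])          # 'cell_0' -- the index is preserved
```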
diff --git a/scprep/run/__init__.py b/scprep/run/__init__.py
index 220af9f1..0a07c96e 100644
--- a/scprep/run/__init__.py
+++ b/scprep/run/__init__.py
@@ -1,2 +1,3 @@
-from .r_function import RFunction
+from .r_function import RFunction, install_bioconductor
 from .splatter import SplatSimulate
+from .slingshot import Slingshot
diff --git a/scprep/run/r_function.py b/scprep/run/r_function.py
index ff644200..12df8154 100644
--- a/scprep/run/r_function.py
+++ b/scprep/run/r_function.py
@@ -138,3 +138,52 @@ def __call__(self, *args, rpy_verbose=None, **kwargs):
         robject = self.convert(robject)
         self.verbose = default_verbose
         return robject
+
+
+_install_bioconductor = RFunction(
+    args="package = character(), site_repository = character(), update = FALSE, version = BiocManager::version()",
+    body="""
+    if (!require('BiocManager')) install.packages("BiocManager")
+    ask <- !update
+    if (length(package) == 0) {
+        BiocManager::install(site_repository=site_repository,
+                             update=update, ask=ask, version=version)
+    } else {
+        for (pkg in package) {
+            if (update || !require(pkg, character.only = TRUE)) {
+                BiocManager::install(pkg, site_repository=site_repository,
+                                     update=update, ask=ask, version=version)
+            }
+        }
+    }
+    """)
+
+def install_bioconductor(package = None, site_repository = None, update = False, version = None, verbose = True):
+    """Install a Bioconductor package
+
+    Parameters
+    ----------
+    package : string or None, optional (default: None)
+        name of the Bioconductor package to install. If None,
+        installs or updates the core Bioconductor packages instead.
+    site_repository : string, optional (default: None)
+        additional repository in which to look for packages to install.
+        This repository will be prepended to the default repositories
+    update : boolean, optional (default: False)
+        When False, don't attempt to update old packages.
+        When True, update old packages automatically.
+    version : string, optional (default: None)
+        Bioconductor version to install, e.g., version = "3.8".
+        The special symbol version = "devel" installs the current 'development' version.
+        If None, installs from the current version.
+    verbose : boolean, optional (default: True)
+        Install script verbosity.
+    """
+    kwargs = {'update': update, 'rpy_verbose': verbose}
+    if package is not None:
+        kwargs['package'] = package
+    if site_repository is not None:
+        kwargs['site_repository'] = site_repository
+    if version is not None:
+        kwargs['version'] = version
+    _install_bioconductor(**kwargs)
\ No newline at end of file
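For orientation, a sketch of the new installer hook as called from Python (requires a working R installation plus rpy2; the package names below are just examples):

```python
import scprep

# one-off setup: install a Bioconductor package into the local R library
scprep.run.install_bioconductor('splatter', verbose=True)

# or pin a specific Bioconductor release
scprep.run.install_bioconductor('slingshot', version='3.8')
```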
+ """ + r_function.install_bioconductor( + 'slingshot', site_repository=site_repository, + update=update, version=version, verbose=verbose) + + +_Slingshot = r_function.RFunction( + setup=""" + library(slingshot) + """, + args=""" + data, cluster_labels, + start_cluster = NULL, end_cluster = NULL, + distance = NULL, omega = NULL, lineages = list(), shrink = TRUE, + extend = "y", reweight = TRUE, reassign = TRUE, thresh = 0.001, + max_iter = 15, stretch = 2, + smoother = "smooth.spline", shrink_method = "cosine", + allow_breaks = TRUE, seed = NULL + """, + body=""" + set.seed(seed) + data <- as.matrix(data) + cluster_labels <- as.factor(cluster_labels) + + # Run Slingshot + sling <- slingshot(data, clusterLabels = cluster_labels, + start.clus = start_cluster, end.clus = end_cluster, + dist.fun = distance, omega = omega, lineages = lineages, shrink = shrink, + extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, + maxit = max_iter, stretch = stretch, + smoother = smoother, shrink.method = shrink_method, + allow.breaks = allow_breaks) + list(pseudotime = slingPseudotime(sling), + curves = lapply(sling@curves, function(curve) curve$s[curve$ord,])) + """) + + +def Slingshot( + data, cluster_labels, + start_cluster = None, end_cluster = None, + distance = None, omega = None, shrink = True, + extend = "y", reweight = True, reassign = True, thresh = 0.001, + max_iter = 15, stretch = 2, + smoother = "smooth.spline", shrink_method = "cosine", + allow_breaks = True, + seed=None, verbose=1): + """Perform lineage inference with Slingshot + + Given a reduced-dimensional data matrix n by p and a vector of cluster labels + (or matrix of soft cluster assignments, potentially including a -1 label for "unclustered"), + this function performs lineage inference using a cluster-based minimum spanning tree and + constructing simulatenous principal curves for branching paths through the tree. + + For more details, read about Slingshot on [GitHub](https://github.com/kstreet13/slingshot) + and [Bioconductor](https://bioconductor.org/packages/release/bioc/html/slingshot.html). + + Parameters + ---------- + data : array-like, shape=[n_samples, n_dimensions] + matrix of (reduced dimension) coordinates + to be used for lineage inference. + cluster_labels : list-like, shape=[n_samples] + a vector of cluster labels, optionally including -1's for "unclustered." + start_cluster : string, optional (default: None) + indicates the cluster(s) of origin. + Lineages will be represented by paths coming out of this cluster. + end_cluster : string, optional (default: None) + indicates the cluster(s) which will be forced leaf nodes. + This introduces a constraint on the MST algorithm. + distance : callable, optional (default: None) + method for calculating distances between clusters. + Must take two matrices as input, corresponding to subsets of reduced_dim. + If the minimum cluster size is larger than the number dimensions, + the default is to use the joint covariance matrix to find squared distance + between cluster centers. If not, the default is to use the diagonal of the + joint covariance matrix. Not currently implemented + omega : float, optional (default: None) + this granularity parameter determines the distance between every + real cluster and the artificial cluster. + It is parameterized such that this distance is omega / 2, + making omega the maximum distance between two connected clusters. + By default, omega = Inf. 
+
+
+def Slingshot(
+        data, cluster_labels,
+        start_cluster = None, end_cluster = None,
+        distance = None, omega = None, shrink = True,
+        extend = "y", reweight = True, reassign = True, thresh = 0.001,
+        max_iter = 15, stretch = 2,
+        smoother = "smooth.spline", shrink_method = "cosine",
+        allow_breaks = True,
+        seed=None, verbose=1):
+    """Perform lineage inference with Slingshot
+
+    Given a reduced-dimensional data matrix n by p and a vector of cluster labels
+    (or matrix of soft cluster assignments, potentially including a -1 label for "unclustered"),
+    this function performs lineage inference using a cluster-based minimum spanning tree,
+    constructing simultaneous principal curves for branching paths through the tree.
+
+    For more details, read about Slingshot on [GitHub](https://github.com/kstreet13/slingshot)
+    and [Bioconductor](https://bioconductor.org/packages/release/bioc/html/slingshot.html).
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_dimensions]
+        matrix of (reduced dimension) coordinates
+        to be used for lineage inference.
+    cluster_labels : list-like, shape=[n_samples]
+        a vector of cluster labels, optionally including -1's for "unclustered."
+    start_cluster : string, optional (default: None)
+        indicates the cluster(s) of origin.
+        Lineages will be represented by paths coming out of this cluster.
+    end_cluster : string, optional (default: None)
+        indicates the cluster(s) which will be forced leaf nodes.
+        This introduces a constraint on the MST algorithm.
+    distance : callable, optional (default: None)
+        method for calculating distances between clusters.
+        Must take two matrices as input, corresponding to subsets of reduced_dim.
+        If the minimum cluster size is larger than the number of dimensions,
+        the default is to use the joint covariance matrix to find squared distance
+        between cluster centers. If not, the default is to use the diagonal of the
+        joint covariance matrix. Not currently implemented.
+    omega : float, optional (default: None)
+        this granularity parameter determines the distance between every
+        real cluster and the artificial cluster.
+        It is parameterized such that this distance is omega / 2,
+        making omega the maximum distance between two connected clusters.
+        By default, omega = Inf.
+    shrink : boolean or float, optional (default: True)
+        boolean or numeric between 0 and 1, determines whether and how much to shrink
+        branching lineages toward their average prior to the split.
+    extend : {'y', 'n', 'pc1'}, optional (default: "y")
+        how to handle root and leaf clusters of lineages when
+        constructing the initial, piece-wise linear curve.
+    reweight : boolean, optional (default: True)
+        whether to allow cells shared between lineages to be reweighted during curve-fitting.
+        If True, cells shared between lineages will be iteratively
+        reweighted based on the quantiles of their projection distances to each curve.
+    reassign : boolean, optional (default: True)
+        whether to reassign cells to lineages at each iteration.
+        If True, cells will be added to a lineage when their
+        projection distance to the curve is less than the median
+        distance for all cells currently assigned to the lineage.
+        Additionally, shared cells will be removed from a lineage if
+        their projection distance to the curve is above the 90th
+        percentile and their weight along the curve is less than 0.1.
+    thresh : float, optional (default: 0.001)
+        determines the convergence criterion. Percent change in the
+        total distance from cells to their projections along curves
+        must be less than thresh.
+    max_iter : int, optional (default: 15)
+        maximum number of iterations
+    stretch : int, optional (default: 2)
+        factor between 0 and 2 by which curves can be extrapolated beyond endpoints
+    smoother : {"smooth.spline", "lowess", "periodic_lowess"}, optional (default: "smooth.spline")
+        choice of smoother. "periodic_lowess" allows one to fit closed
+        curves. Beware, you may want to use iter = 0 with "lowess".
+    shrink_method : string, optional (default: "cosine")
+        how to determine the appropriate amount of shrinkage for a
+        branching lineage. Accepted values: "gaussian", "rectangular",
+        "triangular", "epanechnikov", "biweight", "triweight",
+        "cosine", "optcosine", "density".
+    allow_breaks : boolean, optional (default: True)
+        determines whether curves that branch very close to the origin
+        should be allowed to have different starting points.
+    seed : int or None, optional (default: None)
+        Seed to use for generating random numbers.
+    verbose : int, optional (default: 1)
+        Logging verbosity between 0 and 2.
+
+    Returns
+    -------
+    slingshot : dict
+        Contains the following keys:
+        pseudotime : array-like, shape=[n_samples, n_curves]
+            Pseudotime projection of each cell onto each principal curve.
+            Value is `np.nan` if the cell does not lie on the curve
+        branch : list-like, shape=[n_samples]
+            Branch assignment for each cell
+        curves : array-like, shape=[n_curves, n_samples, n_dimensions]
+            Coordinates of each principal curve in the reduced dimension
+
+    Examples
+    --------
+    >>> import scprep
+    >>> import phate
+    >>> data, clusters = phate.tree.gen_dla(n_branch=4, n_dim=200, branch_length=200)
+    >>> phate_op = phate.PHATE()
+    >>> data_phate = phate_op.fit_transform(data)
+    >>> slingshot = scprep.run.Slingshot(data_phate, clusters)
+    >>> ax = scprep.plot.scatter2d(data_phate, c=slingshot['pseudotime'][:,0], cmap='magma', legend_title='Branch 1')
+    >>> scprep.plot.scatter2d(data_phate, c=slingshot['pseudotime'][:,1], cmap='viridis', ax=ax,
+    ...                       ticks=False, label_prefix='PHATE', legend_title='Branch 2')
+    >>> for curve in slingshot['curves']:
+    ...     ax.plot(curve[:,0], curve[:,1], c='black')
+    >>> ax = scprep.plot.scatter2d(data_phate, c=slingshot['branch'], legend_title='Branch',
+    ...
ticks=False, label_prefix='PHATE') + >>> for curve in slingshot['curves']: + ... ax.plot(curve[:,0], curve[:,1], c='black') + """ + if seed is None: + seed = np.random.randint(2**16 - 1) + if distance is not None: + raise NotImplementedError("distance argument not currently implemented") + np.random.seed(seed) + + index = data.index if isinstance(data, pd.DataFrame) else None + + data = utils.toarray(data) + if data.shape[1] > 3: + warnings.warn("Expected data to be low-dimensional. " + "Got data.shape[1] = {}".format(data.shape[1]), + UserWarning) + cluster_labels = utils.toarray(cluster_labels).flatten() + if not cluster_labels.shape[0] == data.shape[0]: + raise ValueError("Expected len(cluster_labels) ({}) to equal " + "data.shape[0] ({})".format(cluster_labels.shape[0], data.shape[0])) + + kwargs = {} + if start_cluster is not None: + kwargs['start_cluster'] = start_cluster + if end_cluster is not None: + kwargs['end_cluster'] = end_cluster + if omega is not None: + kwargs['omega'] = omega + + slingshot = _Slingshot( + data=data, cluster_labels=cluster_labels, + shrink = shrink, + extend = extend, reweight = reweight, reassign = reassign, thresh = thresh, + max_iter = max_iter, stretch = stretch, + smoother = smoother, shrink_method = shrink_method, + allow_breaks = allow_breaks, **kwargs, + seed=seed, rpy_verbose=verbose) + slingshot['curves'] = np.array(list(slingshot['curves'].values())) + + membership = (~np.isnan(slingshot['pseudotime'])).astype(int) + branch = np.sum(membership * (2**np.arange(membership.shape[1])), axis=1) + # reorder based on pseudotime + branch_ids = np.unique(branch) + branch_means = [np.nanmean(slingshot['pseudotime'][branch==id]) + if not np.all(np.isnan(slingshot['pseudotime'][branch==id])) else np.nan + for id in branch_ids] + branch_order = np.argsort(branch_means) + branch_old = branch.copy() + for i in range(len(branch_order)): + j = branch_order[i] + if np.isnan(branch_means[j]): + branch[branch_old == branch_ids[j]] = -1 + else: + branch[branch_old == branch_ids[j]] = i + slingshot['branch'] = branch + + if index is not None: + slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index) + slingshot['branch'] = pd.Series(slingshot['branch'], name='branch', index=index) + return slingshot diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py index 2bcaa187..4bbc18cb 100644 --- a/scprep/run/splatter.py +++ b/scprep/run/splatter.py @@ -1,8 +1,32 @@ import numpy as np -from .r_function import RFunction +from . import r_function -_SplatSimulate = RFunction( + +def install(site_repository = None, update = False, version = None, verbose = True): + """Install the required R packages to run Splatter + + Parameters + ---------- + site_repository : string, optional (default: None) + additional repository in which to look for packages to install. + This repository will be prepended to the default repositories + update : boolean, optional (default: False) + When False, don't attempt to update old packages. + When True, update old packages automatically. + version : string, optional (default: None) + Bioconductor version to install, e.g., version = "3.8". + The special symbol version = "devel" installs the current 'development' version. + If None, installs from the current version. + verbose : boolean, optional (default: True) + Install script verbosity. 
+ """ + r_function.install_bioconductor( + 'splatter', site_repository=site_repository, + update=update, version=version, verbose=verbose) + + +_SplatSimulate = r_function.RFunction( setup=""" library(splatter) """, diff --git a/scprep/select.py b/scprep/select.py index 6d055be5..6d22c699 100644 --- a/scprep/select.py +++ b/scprep/select.py @@ -5,6 +5,7 @@ import warnings import re import sys + from . import utils if int(sys.version.split(".")[1]) < 7: @@ -57,7 +58,7 @@ def _check_columns_compatible(*data): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same column names. Fix with " - "`scprep.select.select_cols(*extra_data, data.columns)`") + "`scprep.select.select_cols(*extra_data, idx=data.columns)`") def _check_rows_compatible(*data): @@ -73,7 +74,7 @@ def _check_rows_compatible(*data): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same index. Fix with " - "`scprep.select.select_rows(*extra_data, data.index)`") + "`scprep.select.select_rows(*extra_data, idx=data.index)`") def _convert_dataframe_1d(idx): @@ -112,7 +113,7 @@ def _exact_word_regex(word): allowed_chars = ['\\(', '\\)', '\\[', '\\]', '\\.', ',', '!', '\\?', ' ', '^', '$'] wildcard = "(" + "|".join(allowed_chars) + ")+" - return "{wildcard}{word}{wildcard}".format(wildcard=wildcard, word=word) + return "{wildcard}{word}{wildcard}".format(wildcard=wildcard, word=re.escape(word)) def _get_string_subset_mask(data, starts_with=None, ends_with=None, @@ -218,7 +219,7 @@ def get_gene_set(data, starts_with=None, ends_with=None, """ if not _is_1d(data): try: - data = data.columns.values + data = data.columns.to_numpy() except AttributeError: raise TypeError("data must be a list of gene names or a pandas " "DataFrame. Got {}".format(type(data).__name__)) @@ -255,7 +256,7 @@ def get_cell_set(data, starts_with=None, ends_with=None, """ if not _is_1d(data): try: - data = data.index.values + data = data.index.to_numpy() except AttributeError: raise TypeError("data must be a list of cell names or a pandas " "DataFrame. 
Got {}".format(type(data).__name__)) @@ -329,21 +330,37 @@ def select_cols(data, *extra_data, idx=None, _check_idx_1d(idx) idx = idx.flatten() + if isinstance(data, pd.SparseDataFrame): + # evil deprecated dataframe; get rid of it + data = utils.SparseDataFrame(data) if isinstance(data, pd.DataFrame): try: - data = data.loc[:, idx] + if isinstance(idx, (numbers.Integral, str)): + data = data.loc[:, idx] + else: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError + data = data.loc[:, idx] except (KeyError, TypeError): + if isinstance(idx, str): + raise if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[:, np.array(data.columns)[idx]] else: raise elif isinstance(data, pd.Series): try: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError data = data.loc[idx] except (KeyError, TypeError): if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): data = data.loc[np.array(data.index)[idx]] else: raise @@ -432,16 +449,28 @@ def select_rows(data, *extra_data, idx=None, _check_idx_1d(idx) idx = idx.flatten() + if isinstance(data, pd.SparseDataFrame): + # evil deprecated dataframe; get rid of it + data = utils.SparseDataFrame(data) if isinstance(data, (pd.DataFrame, pd.Series)): try: - with warnings.catch_warnings(): - warnings.filterwarnings( - "error", "Passing list-likes to .loc") + if isinstance(idx, (numbers.Integral, str)): data = data.loc[idx] + else: + if np.issubdtype(idx.dtype, np.dtype(bool).type): + # temporary workaround for pandas error + raise TypeError + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", "Passing list-likes to .loc") + data = data.loc[idx] except (KeyError, TypeError, FutureWarning): + if isinstance(idx, str): + raise if isinstance(idx, numbers.Integral) or \ - issubclass(np.array(idx).dtype.type, numbers.Integral): - data = data.iloc[idx] + np.issubdtype(idx.dtype, np.dtype(int)) or \ + np.issubdtype(idx.dtype, np.dtype(bool)): + data = data.loc[np.array(data.index)[idx]] else: raise elif _is_1d(data): @@ -494,6 +523,50 @@ def subsample(*data, n=10000, seed=None): if N < n: raise ValueError("Expected n ({}) <= n_samples ({})".format(n, N)) np.random.seed(seed) - select_idx = np.random.choice(N, n, replace=False) + select_idx = np.isin(np.arange(N), np.random.choice(N, n, replace=False)) data = [select_rows(d, idx=select_idx) for d in data] return tuple(data) if len(data) > 1 else data[0] + + +def highly_variable_genes(data, *extra_data, span=0.7, interpolate=0.2, kernel_size=0.05, + cutoff=None, percentile=80): + """Select genes with high variability + + Variability is computed as the deviation from a loess fit + to the rolling median of the mean-variance curve + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + extra_data : array-like, shape=[any, n_features], optional + Optional additional data objects from which to select the same rows + span : float, optional (default: 0.7) + Fraction of genes to use when computing the loess estimate at each point + interpolate : float, optional (default: 0.2) + Multiple of the standard deviation of variances at which to interpolate + linearly in order to reduce 
computation time.
+    kernel_size : float or int, optional (default: 0.05)
+        Width of rolling median window. If a float, the width is given by
+        kernel_size * data.shape[1]
+    cutoff : float, optional (default: None)
+        Variability above which expression is deemed significant
+    percentile : int, optional (default: 80)
+        Percentile of variability above which genes are kept.
+        Must be an integer between 0 and 100. Only one of `cutoff`
+        and `percentile` should be specified.
+
+    Returns
+    -------
+    data : array-like, shape=[n_samples, m_features]
+        Filtered output data, where m_features <= n_features
+    extra_data : array-like, shape=[any, m_features]
+        Filtered extra data, if passed.
+    """
+    from . import measure
+    var_genes = measure.gene_variability(data, span=span, interpolate=interpolate,
+                                         kernel_size=kernel_size)
+    keep_cells_idx = utils._get_filter_idx(var_genes,
+                                           cutoff, percentile,
+                                           keep_cells='above')
+    return select_cols(data, *extra_data, idx=keep_cells_idx)
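A sketch of the new gene-selection helper in use, assuming `statsmodels` is available for the loess fit (toy count matrix; the shapes are illustrative):

```python
import numpy as np
import scprep

# toy count matrix: 500 cells x 1000 genes
data = np.random.negative_binomial(1, 0.5, size=(500, 1000)).astype(float)

# keep genes whose variability is above the 80th percentile,
# i.e. roughly the top 20% most variable genes
data_hv = scprep.select.highly_variable_genes(data, percentile=80)
print(data_hv.shape)  # approximately (500, 200)
```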
" + "Got {}".format(measure)) + if not (len(X.shape) == 2 and len(Y.shape) == 2): + raise ValueError("Expected `X` and `Y` to be matrices. " + "Got shapes {}, {}".format(X.shape, Y.shape)) + [X, Y] = utils.check_consistent_columns([X, Y]) + if gene_names is not None: + if isinstance(X, pd.DataFrame): + X = select.select_cols(X, idx=gene_names) + gene_names = X.columns + if isinstance(Y, pd.DataFrame): + Y = select.select_cols(Y, idx=gene_names) + gene_names = Y.columns + if not len(gene_names) == X.shape[1]: + raise ValueError("Expected gene_names to have length {}. " + "Got {}".format(X.shape[1], len(gene_names))) + else: + if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame): + gene_names = X.columns + else: + gene_names = np.arange(X.shape[1]) + X = utils.to_array_or_spmatrix(X) + Y = utils.to_array_or_spmatrix(Y) + # inconsistent behaviour from csr and csc + if sparse.issparse(X): + X = X.tocsr() + if sparse.issparse(Y): + Y = Y.tocsr() + if measure == 'difference': + difference = mean_difference(X, Y) + elif measure == 'emd': + difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)( + select.select_cols(X, idx=i), + select.select_cols(Y, idx=i)) + for i in range(X.shape[1])) + difference = np.array(difference) * np.sign(mean_difference(X, Y)) + result = pd.DataFrame({'gene' : gene_names, measure : difference}) + if direction == 'up': + result = result.sort_values([measure, 'gene'], ascending=False) + elif direction == 'down': + result = result.sort_values([measure, 'gene'], ascending=True) + elif direction == 'both': + result['measure_abs'] = np.abs(difference) + result = result.sort_values(['measure_abs', 'gene'], ascending=False) + del result['measure_abs'] + result.index = np.arange(result.shape[0]) + return result + + +def differential_expression_by_cluster(data, clusters, + measure='difference', + direction='up', + gene_names=None, + n_jobs=-2): + """Calculate the most significant genes for each cluster in a dataset + + Measurements are run for each cluster against the rest of the dataset. + + Parameters + ---------- + data : array-like, shape=[n_cells, n_genes] + clusters : list-like, shape=[n_cells] + measure : {'difference', 'emd'}, optional (default: 'difference') + The measurement to be used to rank genes + direction : {'up', 'down', 'both'}, optional (default: 'up') + The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. + gene_names : list-like or `None`, optional (default: `None`) + List of gene names associated with the columns of X and Y + n_jobs : int, optional (default: -2) + Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + + Returns + ------- + result : dict(pd.DataFrame) + Dictionary containing an ordered DataFrame with a column "gene" and a column named `measure` for each cluster. + """ + if gene_names is not None and isinstance(data, pd.DataFrame): + data = select.select_cols(data, idx=gene_names) + gene_names = data.columns + if gene_names is None: + if isinstance(data, pd.DataFrame): + gene_names = data.columns + elif not len(gene_names) == data.shape[1]: + raise ValueError("Expected gene_names to have length {}. 
" + "Got {}".format(data.shape[1], len(gene_names))) + data = utils.to_array_or_spmatrix(data) + result = {cluster : differential_expression( + select.select_rows(data, idx=clusters==cluster), + select.select_rows(data, idx=clusters!=cluster), + measure = measure, direction = direction, + gene_names = gene_names, n_jobs = n_jobs) + for cluster in np.unique(clusters)} + return result + def _vector_coerce_dense(x): x = utils.toarray(x) x_1d = x.flatten() @@ -381,5 +531,5 @@ def _vector_coerce_two_dense(x, y): raise ValueError("Expected x and y to be 1D arrays. " "Got shapes x {}, y {}".format(x.shape, y.shape)) else: - raise + raise e return x, y diff --git a/scprep/transform.py b/scprep/transform.py index 0a55b39c..179fd026 100644 --- a/scprep/transform.py +++ b/scprep/transform.py @@ -60,7 +60,9 @@ def log(data, pseudocount=1, base=10): "Got pseudocount = {}".format(utils.matrix_min(data), pseudocount)) elif pseudocount != data_min + 1 and \ - (sparse.issparse(data) or isinstance(data, pd.SparseDataFrame)): + (sparse.issparse(data) or + isinstance(data, pd.SparseDataFrame) or + utils.is_sparse_dataframe(data)): req = "min(data) + 1 ({})".format(data_min + 1) if data_min != 0 else "1" warnings.warn("log transform on sparse data requires " diff --git a/scprep/utils.py b/scprep/utils.py index 6c57022d..de2172e4 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -63,7 +63,7 @@ def check_version(pkg, min_version=None): "Please install it with e.g. `pip install --user {0}`".format(pkg)) if not _version_check(module.__version__, min_version): raise ImportError( - "scprep requires {0}>={1} (installed: {2}). " + "{0}>={1} is required (installed: {2}). " "Please upgrade it with e.g." " `pip install --user --upgrade {0}`".format( pkg, min_version, module.__version__)) @@ -78,6 +78,63 @@ def _with_pkg(fun, pkg=None, min_version=None, *args, **kwargs): return fun(*args, **kwargs) +def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): + """Get a cutoff for a dataset + + Parameters + ---------- + data : array-like + cutoff : float or None, optional (default: None) + Absolute cutoff value. Only one of cutoff and percentile may be given + percentile : float or None, optional (default: None) + Percentile cutoff value between 0 and 100. + Only one of cutoff and percentile may be given + required : bool, optional (default: False) + If True, one of cutoff and percentile must be given. + + Returns + ------- + cutoff : float or None + Absolute cutoff value. Can only be None if required is False and + cutoff and percentile are both None. + """ + if percentile is not None: + if cutoff is not None: + raise ValueError( + "Only one of `cutoff` and `percentile` should be given." + "Got cutoff={}, percentile={}".format(cutoff, percentile)) + if not isinstance(percentile, numbers.Number): + return [_get_percentile_cutoff(data, percentile=p) + for p in percentile] + if percentile < 1: + warnings.warn( + "`percentile` expects values between 0 and 100." + "Got {}. 
Did you mean {}?".format(percentile, + percentile * 100), + UserWarning) + cutoff = np.percentile(np.array(data).reshape(-1), percentile) + elif cutoff is None and required: + raise ValueError( + "One of either `cutoff` or `percentile` must be given.") + return cutoff + + + +def _get_filter_idx(values, + cutoff, percentile, + keep_cells): + cutoff = _get_percentile_cutoff( + values, cutoff, percentile, required=True) + if keep_cells == 'above': + keep_cells_idx = values > cutoff + elif keep_cells == 'below': + keep_cells_idx = values < cutoff + else: + raise ValueError("Expected `keep_cells` in ['above', 'below']. " + "Got {}".format(keep_cells)) + return keep_cells_idx + + def toarray(x): """Convert an array-like to a np.ndarray @@ -93,13 +150,13 @@ def toarray(x): if isinstance(x, pd.SparseDataFrame): x = x.to_coo().toarray() elif isinstance(x, pd.SparseSeries): - x = x.to_dense().values + x = x.to_dense().to_numpy() elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)): - x = x.values + x = x.to_numpy() elif isinstance(x, sparse.spmatrix): x = x.toarray() elif isinstance(x, np.matrix): - x = np.array(x) + x = x.A elif isinstance(x, list): x_out = [] for xi in x: @@ -137,7 +194,10 @@ def to_array_or_spmatrix(x): """ if isinstance(x, pd.SparseDataFrame): x = x.to_coo() - elif isinstance(x, sparse.spmatrix): + elif is_sparse_dataframe(x) or is_sparse_series(x): + x = x.sparse.to_coo() + elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)) and \ + not isinstance(x, np.matrix): pass elif isinstance(x, list): x_out = [] @@ -154,6 +214,44 @@ def to_array_or_spmatrix(x): return x +def is_sparse_dataframe(x): + if isinstance(x, pd.DataFrame) and not isinstance(x, pd.SparseDataFrame): + try: + x.sparse + return True + except AttributeError: + pass + return False + + +def is_sparse_series(x): + if isinstance(x, pd.Series) and not isinstance(x, pd.SparseSeries): + try: + x.sparse + return True + except AttributeError: + pass + return False + + +def dataframe_to_sparse(x, fill_value=0.0): + return x.astype(pd.SparseDtype(float, fill_value=fill_value)) + + +def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0): + if sparse.issparse(X): + X = pd.DataFrame.sparse.from_spmatrix(X) + X.sparse.fill_value = default_fill_value + elif isinstance(X, pd.SparseDataFrame) or not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + X = dataframe_to_sparse(X, fill_value=default_fill_value) + if columns is not None: + X.columns = columns + if index is not None: + X.index = index + return X + + def matrix_transform(data, fun, *args, **kwargs): """Perform a numerical transformation to data @@ -171,7 +269,7 @@ def matrix_transform(data, fun, *args, **kwargs): data : array-like, shape=[n_samples, n_features] Transformed output data """ - if isinstance(data, pd.SparseDataFrame): + if is_sparse_dataframe(data) or isinstance(data, pd.SparseDataFrame): data = data.copy() for col in data.columns: data[col] = fun(data[col], *args, **kwargs) @@ -213,8 +311,15 @@ def matrix_sum(data, axis=None): index = data.index if axis == 1 else data.columns sums = pd.Series(np.array(data.to_coo().sum(axis)).flatten(), index=index) + elif is_sparse_dataframe(data): + if axis is None: + sums = data.sparse.to_coo().sum() + else: + index = data.index if axis == 1 else data.columns + sums = pd.Series(np.array(data.sparse.to_coo().sum(axis)).flatten(), + index=index) elif axis is None: - sums = data.values.sum() + sums = data.to_numpy().sum() else: sums = data.sum(axis) else: @@ -224,6 +329,65 @@ def 
matrix_sum(data, axis=None): return sums +def matrix_std(data, axis=None): + """Get the column-wise, row-wise, or total standard deviation of a matrix + + Parameters + ---------- + data : array-like, shape=[n_samples, n_features] + Input data + axis : int or None, optional (default: None) + Axis across which to calculate standard deviation. + axis=0 gives column standard deviation, + axis=1 gives row standard deviation. + None gives the total standard deviation. + + Returns + ------- + std : array-like or float + Standard deviation along desired axis. + """ + if axis not in [0, 1, None]: + raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis)) + index = None + if isinstance(data, pd.DataFrame) and axis is not None: + if axis == 1: + index = data.index + elif axis == 0: + index = data.columns + data = to_array_or_spmatrix(data) + if sparse.issparse(data): + if axis is None: + if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)): + data = data.tocoo() + data_sq = data.copy() + data_sq.data = data_sq.data ** 2 + variance = data_sq.mean() - data.mean() ** 2 + std = np.sqrt(variance) + else: + if axis == 0: + data = data.tocsc() + next_fn = data.getcol + N = data.shape[1] + elif axis == 1: + data = data.tocsr() + next_fn = data.getrow + N = data.shape[0] + std = [] + for i in range(N): + col = next_fn(i) + col_sq = col.copy() + col_sq.data = col_sq.data ** 2 + variance = col_sq.mean() - col.mean() ** 2 + std.append(np.sqrt(variance)) + std = np.array(std) + else: + std = np.std(data, axis=axis) + if index is not None: + std = pd.Series(std, index=index, name='std') + return std + + def matrix_vector_elementwise_multiply(data, multiplier, axis=None): """Elementwise multiply a matrix by a vector @@ -276,14 +440,18 @@ def matrix_vector_elementwise_multiply(data, multiplier, axis=None): data.shape[1], multiplier.shape)) multiplier = multiplier.reshape(1, -1) - if isinstance(data, pd.SparseDataFrame): + if isinstance(data, pd.SparseDataFrame) or is_sparse_dataframe(data): data = data.copy() multiplier = multiplier.flatten() if axis == 0: - data = data.T - for col, mult in zip(data.columns, multiplier): - data[col] = data[col] * mult - data = data.T + for col in data.columns: + try: + mult_indices = data[col].values.sp_index.indices + except AttributeError: + mult_indices = data[col].values.sp_index.to_int_index().indices + new_data = data[col].values.sp_values * multiplier[mult_indices] + data[col].values.sp_values.put(np.arange(data[col].sparse.npoints), + new_data) else: for col, mult in zip(data.columns, multiplier): data[col] = data[col] * mult @@ -364,6 +532,48 @@ def matrix_any(condition): return np.sum(np.sum(condition)) > 0 +def check_consistent_columns(data): + """Ensure that a set of data matrices have consistent columns + + Parameters + ---------- + data : list of array-likes + List of matrices to be checked + + Returns + ------- + data : list of array-likes + List of matrices with consistent columns, subsetted if necessary + + Raises + ------ + ValueError + Raised if data has inconsistent number of columns and does not + have column names for subsetting + """ + matrix_type = type(data[0]) + matrix_shape = data[0].shape[1] + if issubclass(matrix_type, pd.DataFrame): + if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and + np.all([data[0].columns == d.columns for d in data])): + common_genes = data[0].columns.values + for d in data[1:]: + common_genes = common_genes[np.isin(common_genes, + d.columns.values)] + for i in range(len(data)): + data[i] = 
data[i][common_genes] + warnings.warn("Input data has inconsistent column names. " + "Subsetting to {} common columns.".format( + len(common_genes)), UserWarning) + else: + for d in data[1:]: + if not d.shape[1] == matrix_shape: + shapes = ", ".join([str(d.shape[1]) for d in data]) + raise ValueError("Expected data all with the same number of " + "columns. Got {}".format(shapes)) + return data + + def combine_batches(data, batch_labels, append_to_cell_names=None): """Combine data matrices from multiple batches and store a batch label @@ -393,6 +603,8 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): # check consistent type matrix_type = type(data[0]) + if matrix_type is pd.SparseDataFrame: + matrix_type = pd.DataFrame if not issubclass(matrix_type, (np.ndarray, pd.DataFrame, sparse.spmatrix)): @@ -405,26 +617,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): raise TypeError("Expected data all of the same class. " "Got {}".format(types)) - # check consistent columns - matrix_shape = data[0].shape[1] - if issubclass(matrix_type, pd.DataFrame): - if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and - np.all([data[0].columns == d.columns for d in data])): - common_genes = data[0].columns.values - for d in data[1:]: - common_genes = common_genes[np.isin(common_genes, - d.columns.values)] - for i in range(len(data)): - data[i] = data[i][common_genes] - warnings.warn("Input data has inconsistent column names. " - "Subsetting to {} common columns.".format( - len(common_genes)), UserWarning) - else: - for d in data[1:]: - if not d.shape[1] == matrix_shape: - shapes = ", ".join([str(d.shape[1]) for d in data]) - raise ValueError("Expected data all with the same number of " - "columns. Got {}".format(shapes)) + data = check_consistent_columns(data) # check append_to_cell_names if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame): @@ -432,7 +625,11 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): " Got {}".format(matrix_type.__name__), UserWarning) elif append_to_cell_names is None: if issubclass(matrix_type, pd.DataFrame): - append_to_cell_names = True + if all([isinstance(d.index, pd.RangeIndex) for d in data]): + # rangeindex should still be a rangeindex + append_to_cell_names = False + else: + append_to_cell_names = True else: append_to_cell_names = False @@ -449,6 +646,11 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): "_" + str(batch_labels[i])) for i, d in enumerate(data)]) data_combined.index = index + elif all([isinstance(d.index, pd.RangeIndex) for d in data]): + # rangeindex should still be a rangeindex + data_combined = data_combined.reset_index(drop=True) + sample_labels = pd.Series(sample_labels, index=data_combined.index, + name='sample_labels') elif issubclass(matrix_type, sparse.spmatrix): data_combined = sparse.vstack(data) elif issubclass(matrix_type, np.ndarray): @@ -458,37 +660,25 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): def select_cols(data, idx): - warnings.warn("`scprep.utils.select_cols` is deprecated. Use " - "`scprep.select.select_cols` instead.", - FutureWarning) - return select.select_cols(data, idx=idx) + raise RuntimeError("`scprep.utils.select_cols` is deprecated. Use " + "`scprep.select.select_cols` instead.") def select_rows(data, idx): - warnings.warn("`scprep.utils.select_rows` is deprecated. 
Use " - "`scprep.select.select_rows` instead.", - FutureWarning) - return select.select_rows(data, idx=idx) + raise RuntimeError("`scprep.utils.select_rows` is deprecated. Use " + "`scprep.select.select_rows` instead.") def get_gene_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_gene_set` is deprecated. Use " - "`scprep.select.get_gene_set` instead.", - FutureWarning) - return select.get_gene_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_gene_set` is deprecated. Use " + "`scprep.select.get_gene_set` instead.") def get_cell_set(data, starts_with=None, ends_with=None, regex=None): - warnings.warn("`scprep.utils.get_cell_set` is deprecated. Use " - "`scprep.select.get_cell_set` instead.", - FutureWarning) - return select.get_cell_set(data, starts_with=starts_with, - ends_with=ends_with, regex=regex) + raise RuntimeError("`scprep.utils.get_cell_set` is deprecated. Use " + "`scprep.select.get_cell_set` instead.") def subsample(*data, n=10000, seed=None): - warnings.warn("`scprep.utils.subsample` is deprecated. Use " - "`scprep.select.subsample` instead.", - FutureWarning) - return select.subsample(*data, n=n, seed=seed) + raise RuntimeError("`scprep.utils.subsample` is deprecated. Use " + "`scprep.select.subsample` instead.") diff --git a/scprep/version.py b/scprep/version.py index 54306210..1b0953c5 100644 --- a/scprep/version.py +++ b/scprep/version.py @@ -1,4 +1,4 @@ # author: Scott Gigante # (C) 2018 Krishnaswamy Lab GPLv2 -__version__ = "0.12.2" +__version__ = "1.0.0" diff --git a/setup.py b/setup.py index 3871b98b..0c49f2df 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,11 @@ from setuptools import setup, find_packages install_requires = [ - 'numpy>=1.10.0', - 'scipy>=0.18.0', + 'numpy>=1.12.0', + 'scipy>=0.18.1', 'scikit-learn>=0.19.1', - 'pandas>=0.19.0,<0.24', - 'decorator>=4.3.0' + 'decorator>=4.3.0', + 'pandas>=0.25', ] test_requires = [ @@ -16,9 +16,10 @@ 'fcsparser', 'tables', 'h5py', - 'rpy2>=3.0', 'coverage', - 'coveralls' + 'coveralls', + 'parameterized', + 'statsmodels', ] doc_requires = [ @@ -32,9 +33,9 @@ if sys.version_info[:2] < (3, 5): raise RuntimeError("Python version >=3.5 required.") elif sys.version_info[:2] < (3, 6): - test_requires += ['matplotlib>=3.0,<3.1'] + test_requires += ['matplotlib>=3.0,<3.1', 'rpy2>=3.0,<3.1'] else: - test_requires += ['matplotlib>=3.0'] + test_requires += ['matplotlib>=3.0', 'rpy2>=3.0'] version_py = os.path.join(os.path.dirname( __file__), 'scprep', 'version.py') @@ -46,7 +47,7 @@ setup(name='scprep', version=version, description='scprep', - author='Jay Stanley, Scott Gigante, and Daniel Burkhardt, Krishnaswamy Lab, Yale University', + author='Scott Gigante, Daniel Burkhardt and Jay Stanley, Yale University', author_email='krishnaswamylab@gmail.com', packages=find_packages(), license='GNU General Public License Version 2', diff --git a/test.png b/test.png new file mode 100644 index 00000000..01c74e2d Binary files /dev/null and b/test.png differ diff --git a/test/test_filter.py b/test/test_filter.py index 24e30c97..d55b3489 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -85,6 +85,7 @@ def test_filter_rare_genes(self): self.X_dense, utils.assert_transform_equals, Y=X_filtered, transform=scprep.filter.filter_rare_genes) + def test_library_size_filter(self): X_filtered = scprep.filter.filter_library_size( self.X_sparse, cutoff=100) @@ -209,7 +210,7 @@ def test_gene_expression_filter_warning(self): self.X_sparse, 
genes=genes, cutoff=None, percentile=None) assert_raise_message( KeyError, - "the label [not_a_gene] is not in the [columns]", + "not_a_gene", scprep.filter.filter_gene_set_expression, self.X_sparse, genes=no_genes, percentile=90.0, keep_cells='below') @@ -266,7 +267,13 @@ def test_deprecated_sample_labels(self): def test_large_sparse_dataframe_library_size(): + matrix._ignore_pandas_sparse_warning() X = pd.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), default_fill_value=0.0) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] + matrix._reset_warnings() + X = matrix.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), + default_fill_value=0.0) + cell_sums = scprep.measure.library_size(X) + assert cell_sums.shape[0] == X.shape[0] diff --git a/test/test_hdf5.py b/test/test_hdf5.py index 26831f3c..78c3f319 100644 --- a/test/test_hdf5.py +++ b/test/test_hdf5.py @@ -14,7 +14,7 @@ def test_failed_import_tables(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables assert hdf5_available() is True - with tables.File(h5_file) as f: + with tables.File(h5_file, 'r') as f: assert scprep.io.hdf5._is_tables(f) is False with scprep.io.hdf5.open_file(h5_file) as f: assert scprep.io.hdf5._is_h5py(f) @@ -26,7 +26,7 @@ def test_failed_import_h5py(): h5py = scprep.io.hdf5.h5py del scprep.io.hdf5.h5py assert hdf5_available() is True - with h5py.File(h5_file) as f: + with h5py.File(h5_file, 'r') as f: assert scprep.io.hdf5._is_h5py(f) is False scprep.io.hdf5.h5py = h5py diff --git a/test/test_io.py b/test/test_io.py index 401188e9..a3218a79 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -1,20 +1,120 @@ -from tools import data +from tools import data, utils import scprep import scprep.io.utils -from sklearn.utils.testing import assert_warns_message, assert_raise_message +from sklearn.utils.testing import assert_warns_message, assert_raise_message, assert_raises import pandas as pd import numpy as np +from scipy import sparse import os +import shutil import fcsparser import zipfile import urllib +import unittest + + +class TestMatrixToDataFrame(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.X_dense = data.load_10X(sparse=False) + self.X_sparse = data.load_10X(sparse=True) + self.X_numpy = self.X_dense.to_numpy() + self.X_coo = self.X_sparse.sparse.to_coo() + self.cell_names = self.X_dense.index + self.gene_names = self.X_dense.columns + + def test_matrix_to_dataframe_no_names_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, sparse=True) + assert isinstance(Y, sparse.csr_matrix) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + Y = scprep.io.utils._matrix_to_data_frame(self.X_coo, sparse=True) + assert isinstance(Y, sparse.spmatrix) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + + def test_matrix_to_dataframe_no_names_dataframe_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + + def test_matrix_to_dataframe_no_names_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, 
sparse=False) + assert isinstance(Y, np.ndarray) + assert np.all(Y == self.X_numpy) + Y = scprep.io.utils._matrix_to_data_frame(self.X_coo, sparse=False) + assert isinstance(Y, np.ndarray) + assert np.all(Y == self.X_numpy) + + def test_matrix_to_dataframe_no_names_dataframe_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + + def test_matrix_to_dataframe_names_sparse(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=True) + assert scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_sparse) + + def test_matrix_to_dataframe_names_dense(self): + Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) + Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, + gene_names=self.gene_names, sparse=False) + assert isinstance(Y, pd.DataFrame) + assert not scprep.utils.is_sparse_dataframe(Y) + assert not isinstance(Y, pd.SparseDataFrame) + assert np.all(scprep.utils.toarray(Y) == self.X_numpy) + utils.assert_matrix_class_equivalent(Y, self.X_dense) def test_10X_duplicate_gene_names(): assert_warns_message( RuntimeWarning, - "Duplicate gene names detected! Forcing `gene_labels='id'`. " - "Alternatively, try `gene_labels='both'`, `allow_duplicates=True`, or " + "Duplicate gene names detected! Forcing `gene_labels='both'`. 
" + "Alternatively, try `gene_labels='id'`, `allow_duplicates=True`, or " "load the matrix with `sparse=False`", scprep.io.load_10X, os.path.join(data.data_dir, "test_10X_duplicate_gene_names"), @@ -32,16 +132,16 @@ def test_10X_duplicate_gene_names(): def test_10X(): X = data.load_10X() assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b" X = data.load_10X(gene_labels='id', sparse=False) assert X.shape == (100, 100) assert isinstance(X, pd.DataFrame) - assert not isinstance(X, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "ENSMUSG00000030105" X = data.load_10X(gene_labels='both') assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b (ENSMUSG00000030105)" X_cellranger3 = scprep.io.load_10X( os.path.join(data.data_dir, "test_10X_cellranger3"), @@ -74,7 +174,7 @@ def test_10X_zip(): filename = os.path.join(data.data_dir, "test_10X.zip") X_zip = scprep.io.load_10X_zip( filename) - assert isinstance(X_zip, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) np.testing.assert_array_equal(X.index, X_zip.index) @@ -99,7 +199,7 @@ def test_10X_zip_url(): filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip" X_zip = scprep.io.load_10X_zip( filename) - assert isinstance(X_zip, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) np.testing.assert_array_equal(X.index, X_zip.index) @@ -114,9 +214,8 @@ def test_10X_zip_url_not_a_zip(): def test_10X_zip_url_not_a_real_website(): - assert_raise_message( + assert_raises( urllib.error.URLError, - "", scprep.io.load_10X_zip, 'http://invalid.not.a.url/scprep') @@ -142,19 +241,19 @@ def test_10X_HDF5(): h5_file = os.path.join(data.data_dir, "test_10X.h5") # automatic tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -162,7 +261,7 @@ def test_10X_HDF5(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -174,19 +273,19 @@ def test_10X_HDF5_cellranger3(): h5_file = 
os.path.join(data.data_dir, "test_10X_cellranger3.h5") # automatic tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -194,7 +293,7 @@ def test_10X_HDF5_cellranger3(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables X_hdf5 = scprep.io.load_10X_HDF5(h5_file) - assert isinstance(X_hdf5, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) @@ -276,7 +375,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X_csv.columns, X_csv4.index) np.testing.assert_array_equal(X_csv.index, X_csv4.columns) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=os.path.join( @@ -289,7 +388,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=X.columns, @@ -300,7 +399,7 @@ def test_csv_and_tsv(): np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) assert isinstance(X_csv, pd.DataFrame) - assert not isinstance(X_csv, pd.SparseDataFrame) + assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), gene_names=None, @@ -308,8 +407,8 @@ def test_csv_and_tsv(): sparse=True, skiprows=1, usecols=range(1, 101)) - assert np.sum(np.sum(X.values != X_csv.values)) == 0 - assert isinstance(X_csv, pd.SparseDataFrame) + assert np.sum(np.sum(X.to_numpy() != X_csv.to_numpy())) == 0 + assert scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small_duplicate_gene_names.csv")) @@ -333,26 +432,26 @@ def test_mtx(): cell_names=os.path.join( data.data_dir, "barcodes.tsv"), cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) - assert isinstance(X_mtx, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( filename, gene_names=X.columns, cell_names=X.index, 
cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx.values)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) - assert isinstance(X_mtx, pd.SparseDataFrame) + assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( filename, gene_names=None, cell_names=None, sparse=False, cell_axis="column") - assert np.sum(np.sum(X.values != X_mtx)) == 0 + assert np.sum(np.sum(X.to_numpy() != X_mtx)) == 0 assert isinstance(X_mtx, np.ndarray) assert_raise_message( ValueError, @@ -360,14 +459,34 @@ def test_mtx(): "Expected 'row' or 'column'", scprep.io.load_mtx, filename, cell_axis='neither') - X = scprep.io.load_mtx( + X_mtx = scprep.io.load_mtx( filename, gene_names=np.arange(X.shape[1]).astype('str'), cell_names=np.arange(X.shape[0])) - assert X.shape == (100, 100) - assert isinstance(X, pd.SparseDataFrame) - assert X.columns[0] == "0" - assert X.index[0] == 0 + assert X_mtx.shape == (100, 100) + assert scprep.utils.is_sparse_dataframe(X_mtx) + assert X_mtx.columns[0] == "0" + assert X_mtx.index[0] == 0 + + +def test_save_mtx(): + filename = os.path.join(data.data_dir, "test_10X", "matrix.mtx.gz") + X = scprep.io.load_mtx( + filename, + gene_names=os.path.join( + data.data_dir, "gene_symbols.csv"), + cell_names=os.path.join( + data.data_dir, "barcodes.tsv"), + cell_axis="column") + scprep.io.save_mtx(X, "test_mtx") + Y = scprep.io.load_mtx( + "test_mtx/matrix.mtx", + gene_names="test_mtx/gene_names.tsv", + cell_names="test_mtx/cell_names.tsv") + np.testing.assert_array_equal(X, Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("test_mtx") def test_fcs(): @@ -377,13 +496,13 @@ def test_fcs(): assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) - np.testing.assert_array_equal(X.values, data[X.columns].values) + np.testing.assert_array_equal(X.to_numpy(), data[X.columns].to_numpy()) _, _, X = scprep.io.load_fcs(path, sparse=True) assert 'Time' not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal( - X.to_dense().values, data[X.columns].values) + X.sparse.to_dense().to_numpy(), data[X.columns].to_numpy()) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=False, override=True) assert set(meta.keys()) == set(X_meta.keys()) @@ -482,3 +601,62 @@ def test_parse_header(): ValueError, "Expected 50 entries in {}. 
Got 100".format(os.path.abspath(header2)), scprep.io.utils._parse_header, header2, 50) + +def test_download_google_drive(): + id = "1_T5bRqbid5mtuDYnyusoGvujc6fW1UKv" + dest = "test.txt" + scprep.io.download.download_google_drive(id, dest) + assert os.path.isfile(dest) + with open(dest, 'r') as f: + data = f.read() + assert data == 'test\n', data + os.remove(dest) + +def test_download_google_drive_large(): + id = "1FDDSWtSZcdQUVKpk-mPCZ8Ji1Fx8KSz9" + response = scprep.io.download._GET_google_drive(id) + assert response.status_code == 200 + response.close() + +def test_download_url(): + X = data.load_10X() + scprep.io.download.download_url("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz", "url_test.mtx.gz") + Y = scprep.io.load_mtx("url_test.mtx.gz").T + assert (X.sparse.to_coo() - Y).nnz == 0 + os.remove("url_test.mtx.gz") + +def test_download_zip(): + X = data.load_10X() + scprep.io.download.download_and_extract_zip("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip", "zip_test") + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") + +def test_unzip_no_destination(): + X = data.load_10X() + filename = os.path.join(data.data_dir, "test_10X.zip") + tmp_filename = os.path.join("zip_test", "zip_extract_test.zip") + os.mkdir("zip_test") + shutil.copyfile(filename, tmp_filename) + scprep.io.download.unzip(tmp_filename, delete=False) + assert os.path.isfile(tmp_filename) + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") + +def test_unzip_destination(): + X = data.load_10X() + filename = os.path.join(data.data_dir, "test_10X.zip") + tmp_filename = "zip_extract_test.zip" + shutil.copyfile(filename, tmp_filename) + scprep.io.download.unzip(tmp_filename, destination="zip_test") + assert not os.path.isfile(tmp_filename) + Y = scprep.io.load_10X("zip_test/test_10X") + assert np.all(X == Y) + assert np.all(X.index == Y.index) + assert np.all(X.columns == Y.columns) + shutil.rmtree("zip_test") \ No newline at end of file diff --git a/test/test_measure.py b/test/test_measure.py index 1f54fd69..e54e61f1 100644 --- a/test/test_measure.py +++ b/test/test_measure.py @@ -51,3 +51,27 @@ def test_array_all(self): self.X_dense, utils.assert_transform_equals, Y=self.Y, transform=scprep.measure.gene_set_expression, genes=[0]) + + def test_library_size(self): + def test_fun(X): + x = scprep.measure.library_size(X) + assert x.name == 'library_size' + assert np.all(x.index == self.X_dense.index) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) + + def test_gene_set_expression(self): + def test_fun(X): + x = scprep.measure.gene_set_expression(X, genes=[0, 1]) + assert x.name == 'expression' + assert np.all(x.index == self.X_dense.index) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) + + def test_variable_genes(self): + def test_fun(X): + x = scprep.measure.gene_variability(X) + assert x.name == 'variability' + assert np.all(x.index == self.X_dense.columns) + matrix.test_pandas_matrix_types( + self.X_dense, test_fun) diff --git a/test/test_normalize.py b/test/test_normalize.py index 60c4c341..29e24d42 100644 --- a/test/test_normalize.py +++ b/test/test_normalize.py @@ -23,12 +23,13 @@ def test_libsize_norm_rescale_median(self): Y = self.X_norm * self.median 
utils.assert_all_close(Y.sum(1), np.median(np.sum(self.X, 1))) Y2, libsize2 = scprep.normalize.library_size_normalize( - self.X, return_library_size=True) + self.X, rescale='median', return_library_size=True) np.testing.assert_allclose(Y, Y2) np.testing.assert_allclose(self.libsize, libsize2) matrix.test_all_matrix_types( self.X, utils.assert_transform_equivalent, Y=Y, transform=scprep.normalize.library_size_normalize, + rescale='median', check=utils.assert_all_close) def test_libsize_norm_return_libsize(self): diff --git a/test/test_patch.py b/test/test_patch.py index 86f99543..150a92c7 100644 --- a/test/test_patch.py +++ b/test/test_patch.py @@ -1,6 +1,7 @@ import scprep import numpy as np import pandas as pd +from pandas.core.internals.blocks import ExtensionBlock def test_pandas_series_rmatmul(): @@ -9,3 +10,19 @@ def test_pandas_series_rmatmul(): df = pd.DataFrame(mat) ser = pd.Series(arr) np.testing.assert_array_equal(mat @ ser, (df @ ser).values) + +def test_pandas_sparse_iloc(): + X = pd.DataFrame([[0,1,1], [0,0,1], [0,0,0]]).astype(pd.SparseDtype(float, fill_value=0.0)) + assert np.all(~np.isnan(X.iloc[[0,1]].to_numpy())) + + +class CustomBlock(ExtensionBlock): + _holder = np.ndarray + +def test_fill_value(): + values = pd.Series(np.arange(3), dtype=pd.UInt16Dtype()) + custom_block = CustomBlock(values, placement=slice(1, 2)) + assert np.isnan(custom_block.fill_value) + values = pd.Series(np.arange(3), dtype=pd.SparseDtype(float, 0.0)) + custom_block = CustomBlock(values, placement=slice(1, 2)) + assert not np.isnan(custom_block.fill_value) diff --git a/test/test_plot.py b/test/test_plot.py index 4b3c34be..4f56d919 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -2,7 +2,9 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np +import pandas as pd import os +import numbers from sklearn.utils.testing import assert_raise_message, assert_warns_message import unittest import scprep @@ -89,10 +91,6 @@ def test_tab30(): 10, 12, 13, 14, 16, 17, 18]]) -def test_is_color_array_none(): - assert not scprep.plot.utils._is_color_array(None) - - def test_tab40(): cmap = scprep.plot.colors.tab40() np.testing.assert_array_equal( @@ -142,6 +140,106 @@ def test_tab10_continuous_invalid_n_colors(): n_step=1) +def test_tab_exact(): + assert scprep.plot.colors.tab(1) is plt.cm.tab10 + np.testing.assert_array_equal( + scprep.plot.colors.tab(10).colors, plt.cm.tab10.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(20).colors, plt.cm.tab20.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(30).colors, scprep.plot.colors.tab30().colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(40).colors, scprep.plot.colors.tab40().colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(50).colors, + scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors) + + +def test_tab_first10(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(19).colors[:10], plt.cm.tab10.colors) + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[:10], + scprep.plot.colors.tab30().colors[::3]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[:10], + scprep.plot.colors.tab40().colors[::4]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(49).colors[:10], + scprep.plot.colors.tab10_continuous( + n_colors=10, n_step=5).colors[::5]) + + +def test_tab_first20(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[10:20], + scprep.plot.colors.tab30().colors[1::3]) + 
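+ # tab30 and tab40 appear to interleave 3 and 4 shades per hue respectively, so slicing [1::3] or [1::4] recovers the second shade of each of the 10 base hues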
np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[10:20], + scprep.plot.colors.tab40().colors[1::4]) + + +def test_tab_first30(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[20:30], + scprep.plot.colors.tab40().colors[2::4]) + + +def test_tab_overhang(): + np.testing.assert_array_equal( + scprep.plot.colors.tab(9).colors, plt.cm.tab10.colors[:9]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(19).colors[10:], plt.cm.tab20.colors[1:-1:2]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(29).colors[20:], + scprep.plot.colors.tab30().colors[2:-1:3]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(39).colors[30:], + scprep.plot.colors.tab40().colors[3:-1:4]) + np.testing.assert_array_equal( + scprep.plot.colors.tab(49).colors[40:], + scprep.plot.colors.tab10_continuous( + n_colors=10, n_step=5).colors[4:-1:5]) + + +def test_tab_invalid(): + assert_raise_message( + ValueError, + "Expected n >= 1. Got 0", + scprep.plot.colors.tab, + n=0) + + +def test_is_color_array_none(): + assert not scprep.plot.utils._is_color_array(None) + + +def test_histogram_log_negative_min(): + assert_warns_message( + UserWarning, + "Expected positive data for log = x. Got min(data) = -1.00", + scprep.plot.histogram, + [-1, 1, 1, 1], log='x') + assert_warns_message( + UserWarning, + "Expected positive data for log = True. Got min(data) = -1.00", + scprep.plot.histogram, + [-1, 1, 1, 1], log=True) + + +def test_histogram_log_negative_max(): + assert_raise_message( + ValueError, + "Expected positive data for log = x. Got max(data) = -1.00", + scprep.plot.histogram, + [-1, -1, -1, -2], log='x') + assert_raise_message( + ValueError, + "Expected positive data for log = True. Got max(data) = -1.00", + scprep.plot.histogram, + [-1, -1, -1, -2], log=True) + + class TestScatterParams(unittest.TestCase): @classmethod @@ -180,6 +278,27 @@ def test_plot_idx_no_shuffle(self): np.testing.assert_equal(params.c, self.c) np.testing.assert_equal(params.s, np.abs(self.x)) + def test_plot_idx_mask(self): + params = _ScatterParams(x=self.x, y=self.y, + z=self.z, c=self.c, + mask=self.x > 0, shuffle=False) + np.testing.assert_equal(params.plot_idx, np.arange(params.size)[self.x > 0]) + np.testing.assert_equal(params.x, self.x[self.x > 0]) + np.testing.assert_equal(params.y, self.y[self.x > 0]) + np.testing.assert_equal(params.z, self.z[self.x > 0]) + np.testing.assert_equal(params.c, self.c[self.x > 0]) + + def test_plot_idx_mask_shuffle(self): + params = _ScatterParams(x=self.x, y=self.y, + mask=self.x > 0, shuffle=True) + np.testing.assert_equal(np.sort(params.plot_idx), np.arange(params.size)[self.x > 0]) + assert np.all(params.x > 0) + + def test_data_int(self): + params = _ScatterParams(x=1, y=2) + np.testing.assert_equal(params._data, [np.array([1]), np.array([2])]) + assert params.subplot_kw == {} + def test_data_2d(self): params = _ScatterParams(x=self.x, y=self.y) np.testing.assert_equal(params._data, [self.x, @@ -301,8 +420,11 @@ def test_discrete_tab20(self): assert params.extend is None assert isinstance(params.cmap, matplotlib.colors.ListedColormap) np.testing.assert_equal( - params.cmap.colors, - plt.cm.tab20.colors[:len(np.unique(np.round(self.c % 1, 1)))]) + params.cmap.colors[:10], + plt.cm.tab10.colors) + np.testing.assert_equal( + params.cmap.colors[10:], + plt.cm.tab20.colors[1:1 + (len(params.cmap.colors) - 10) * 2:2]) def test_continuous_less_than_20(self): params = _ScatterParams(x=self.x, y=self.y, @@ -581,6 +703,38 @@ def 
test_check_cmap_scale(self): c=np.where(self.c > 0, '+', '-'), ) + def test_series_labels(self): + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c) + assert params.xlabel == 'x' + assert params.ylabel is None + assert params.zlabel is None + params = _ScatterParams(x=self.x, y=pd.Series(self.y, name='y'), c=self.c) + assert params.xlabel is None + assert params.ylabel == 'y' + assert params.zlabel is None + params = _ScatterParams(x=self.x, y=self.y, z=pd.Series(self.y, name='z'), c=self.c) + assert params.xlabel is None + assert params.ylabel is None + assert params.zlabel == 'z' + # xlabel overrides series + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, + xlabel='y') + assert params.xlabel == 'y' + assert params.ylabel is None + assert params.zlabel is None + # label_prefix overrides series + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, + label_prefix='y') + assert params.xlabel == 'y1' + assert params.ylabel == 'y2' + assert params.zlabel is None + # xlabel overrides label_prefix + params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, z=self.y, c=self.c, + label_prefix='y', xlabel='test') + assert params.xlabel == 'test' + assert params.ylabel == 'y2' + assert params.zlabel == 'y3' + def test_jitter_x(self): params = _JitterParams(x=np.where(self.x > 0, '+', '-'), y=self.y) np.testing.assert_array_equal(params.x_labels, ['+', '-']) @@ -593,7 +747,9 @@ class Test10X(unittest.TestCase): @classmethod def setUpClass(self): self.X = data.load_10X(sparse=False) - self.X_pca, self.S = scprep.reduce.pca(self.X, n_components=10, + self.X_filt = scprep.filter.filter_empty_cells(self.X) + self.X_pca, self.S = scprep.reduce.pca(scprep.utils.toarray(self.X), + n_components=10, return_singular_values=True) @classmethod @@ -602,20 +758,25 @@ def tearDownClass(self): try_remove("test.gif") try_remove("test.mp4") try_remove("test_jitter.png") + try_remove("test_histogram.png") + try_remove("test_library_size.png") + try_remove("test_variable_genes.png") + try_remove("test_gene_expression.png") + try_remove("test_scree.png") def tearDown(self): plt.close('all') def test_histogram(self): - scprep.plot.plot_library_size(self.X, cutoff=1000, log=True) - scprep.plot.plot_library_size(self.X, cutoff=1000, log=True, + scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True) + scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True, xlabel="x label", ylabel="y label") def test_histogram_list_of_lists(self): - scprep.plot.plot_library_size(scprep.utils.toarray(self.X).tolist()) + scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt).tolist()) def test_histogram_array(self): - scprep.plot.plot_library_size(scprep.utils.toarray(self.X)) + scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt)) def test_histogram_multiple(self): scprep.plot.histogram([scprep.select.select_rows(self.X, idx=0), @@ -623,16 +784,18 @@ def test_histogram_multiple(self): color=['r', 'b']) def test_histogram_multiple_cutoff(self): - scprep.plot.plot_library_size(self.X, cutoff=[500, 1000], log=True) + scprep.plot.plot_library_size(self.X_filt, cutoff=[500, 1000], log=True) def test_histogram_multiple_percentile(self): - scprep.plot.plot_library_size(self.X, percentile=[10, 90], log=True) + scprep.plot.plot_library_size(self.X_filt, percentile=[10, 90], log=True) def test_plot_library_size_multiple(self): scprep.plot.plot_library_size([ - self.X, scprep.select.select_rows( - self.X, 
idx=np.arange(self.X.shape[0] // 2))], - color=['r', 'b']) + self.X_filt, scprep.select.select_rows( + self.X_filt, idx=np.arange(self.X_filt.shape[0] // 2))], + color=['r', 'b'], + filename="test_library_size.png") + assert os.path.exists("test_library_size.png") def test_plot_gene_set_expression_multiple(self): scprep.plot.plot_gene_set_expression([ @@ -652,7 +815,19 @@ def test_gene_set_expression_array(self): def test_plot_gene_set_expression_single_gene(self): scprep.plot.plot_gene_set_expression( self.X, color=["red"], - genes="Arl8b") + genes="Arl8b", + filename="test_gene_expression.png") + assert os.path.exists("test_gene_expression.png") + + def test_plot_variable_genes(self): + scprep.plot.plot_gene_variability( + self.X, + filename="test_variable_genes.png") + assert os.path.exists("test_variable_genes.png") + + def test_variable_genes_list_of_lists(self): + scprep.plot.plot_gene_variability( + scprep.utils.toarray(self.X).tolist()) def test_histogram_single_gene_dataframe(self): scprep.plot.histogram( @@ -668,7 +843,10 @@ def test_histogram_custom_axis(self): fig, ax = plt.subplots() scprep.plot.plot_gene_set_expression( self.X, genes=scprep.select.get_gene_set(self.X, starts_with="D"), - percentile=90, log='y', ax=ax, title="histogram") + percentile=90, log='y', ax=ax, title="histogram", + filename="test_histogram.png") + assert os.path.exists("test_histogram.png") + assert ax.get_title() == 'histogram' def test_histogram_invalid_axis(self): assert_raise_message( @@ -678,13 +856,17 @@ def test_histogram_invalid_axis(self): self.X, ax="invalid") def test_scree(self): - scprep.plot.scree_plot(self.S) - scprep.plot.scree_plot(self.S, cumulative=True, - xlabel="x label", ylabel="y label") + ax = scprep.plot.scree_plot(self.S) + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() + ax = scprep.plot.scree_plot(self.S, cumulative=True, + xlabel="x label", ylabel="y label", filename="test_scree.png") + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() + assert os.path.isfile("test_scree.png") def test_scree_custom_axis(self): fig, ax = plt.subplots() scprep.plot.scree_plot(self.S, ax=ax) + assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() def test_scree_invalid_axis(self): assert_raise_message( @@ -724,6 +906,18 @@ def test_jitter_continuous(self): assert ax.get_xlim() == (-0.5, 1.5) assert [t.get_text() for t in ax.get_xticklabels()] == ['+', '-'] + def test_jitter_axis_labels(self): + ax = scprep.plot.jitter(np.where(self.X_pca[:, 0] > 0, '+', '-'), + self.X_pca[:, 1], + xlabel="test") + assert ax.get_xlabel() == "test" + assert ax.get_ylabel() == '' + ax = scprep.plot.jitter( + pd.Series(np.where(self.X_pca[:, 0] > 0, '+', '-'), name='x'), + pd.Series(self.X_pca[:, 1], name='y'), ylabel="override") + assert ax.get_xlabel() == "x" + assert ax.get_ylabel() == "override" + def test_scatter_dict(self): scprep.plot.scatter2d(self.X_pca, c=np.random.choice( ['hello', 'world'], self.X_pca.shape[0], replace=True), @@ -810,8 +1004,12 @@ def test_scatter_custom_ticklabels(self): assert np.all(xticklabels == np.array(['a', 'b', 'c'])) def test_scatter_axis_labels(self): + ax = scprep.plot.scatter2d( + self.X_pca.tolist(), label_prefix="test") + assert ax.get_xlabel() == "test1" + assert ax.get_ylabel() == "test2" ax = scprep.plot.scatter3d( - self.X_pca, label_prefix="test") + self.X_pca.tolist(), label_prefix="test") assert ax.get_xlabel() == "test1" assert ax.get_ylabel() == "test2" assert ax.get_zlabel() == "test3" @@ -819,6 +1017,20 @@ 
def test_scatter_axis_labels(self): self.X_pca, label_prefix="test", xlabel="override") assert ax.get_xlabel() == "override" assert ax.get_ylabel() == "test2" + ax = scprep.plot.scatter( + x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), + z=pd.Series(self.X_pca[:,2], name='z'), + ylabel='override') + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == "override" + assert ax.get_zlabel() == "z" + ax = scprep.plot.scatter( + x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), + z=pd.Series(self.X_pca[:,2], name='z'), + zlabel='override') + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == "y" + assert ax.get_zlabel() == "override" def test_scatter_axis_savefig(self): scprep.plot.scatter2d( @@ -831,6 +1043,20 @@ def test_scatter_viewinit(self): assert ax.elev == 80 assert ax.azim == 270 + def test_scatter3d_data_2d(self): + assert_raise_message( + ValueError, + "Expected data.shape[1] >= 3. Got 2", + scprep.plot.scatter3d, + self.X_pca[:,:2]) + + def test_scatter3d_data_2d_list(self): + assert_raise_message( + ValueError, + "Expected data.shape[1] >= 3. Got 2", + scprep.plot.scatter3d, + self.X_pca[:,:2].tolist()) + def test_scatter_rotate_gif(self): scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, filename="test.gif") @@ -874,6 +1100,13 @@ def test_scatter_invalid_s(self): scprep.plot.scatter2d, self.X_pca, s=self.X_pca[0, :]) + def test_scatter_invalid_mask(self): + assert_raise_message( + ValueError, "Expected mask of length {}. Got {}".format( + self.X_pca.shape[0], self.X_pca.shape[1]), + scprep.plot.scatter2d, self.X_pca, + mask=self.X_pca[0, :] > 0) + def test_scatter_invalid_discrete(self): assert_raise_message( ValueError, "Cannot treat non-numeric data as continuous", @@ -1080,7 +1313,7 @@ def test_marker_plot_no_gene_names(self): "be provided. 
" "Got gene_names=None, data as a ", scprep.plot.marker_plot, - data=self.X.values, + data=self.X.to_numpy(), clusters=np.random.choice( np.arange(10), replace=True, size=self.X.shape[0]), markers={'tissue': ['z']}) diff --git a/test/test_reduce.py b/test/test_reduce.py index c5620904..94179d16 100644 --- a/test/test_reduce.py +++ b/test/test_reduce.py @@ -1,6 +1,8 @@ from tools import utils, matrix, data import scprep from scipy import sparse +import numpy as np +import pandas as pd from sklearn import decomposition from sklearn.utils.testing import assert_raise_message, assert_warns_message from functools import partial @@ -25,6 +27,11 @@ def test_dense(self): self.X, utils.assert_transform_equals, Y=self.Y_random, transform=scprep.reduce.pca, n_components=100, seed=42) + matrix.test_all_matrix_types( + self.X, utils.assert_transform_equals, + Y=self.Y_random, transform=scprep.reduce.pca, + n_components=100, seed=42, method='dense', + check=partial(utils.assert_all_close, atol=1e-10)) def test_sparse_svd(self): matrix.test_sparse_matrix_types( @@ -33,11 +40,25 @@ def test_sparse_svd(self): check=partial(utils.assert_all_close, rtol=1e-3, atol=1e-5), n_components=50, eps=0.3, seed=42, method='svd') + def test_pandas(self): + X = pd.DataFrame(self.X, index=np.arange(self.X.shape[0]).astype(str), + columns=np.arange(self.X.shape[1]).astype(float)) + def test_fun(X_pd): + Y = scprep.reduce.pca(X_pd, n_components=100, seed=42) + assert isinstance(Y, pd.DataFrame) + assert np.all(Y.index == X.index) + assert np.all(Y.columns == np.array(['PC{}'.format(i+1) + for i in range(Y.shape[1])])) + matrix.test_pandas_matrix_types( + X, test_fun) + def test_sparse_orth_rproj(self): + def test_fn(*args, **kwargs): + return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) matrix.test_sparse_matrix_types( self.X, utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=scprep.reduce.pca, + Y=self.Y_full, transform=test_fn, n_components=50, eps=0.3, seed=42, method='orth_rproj') def test_singular_values_dense(self): @@ -53,10 +74,12 @@ def test_singular_values_sparse(self): eps=0.3, seed=42, return_singular_values=True)[1], atol=1e-14) def test_sparse_rproj(self): + def test_fn(*args, **kwargs): + return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) matrix.test_sparse_matrix_types( self.X, utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=scprep.reduce.pca, + Y=self.Y_full, transform=test_fn, n_components=50, eps=0.3, seed=42, method='rproj') def test_eps_too_low(self): diff --git a/test/test_run.py b/test/test_run.py index f0f2a66a..c191d94d 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -1,9 +1,13 @@ from tools import utils, matrix, data import numpy as np +import pandas as pd import scprep import scprep.run.r_function import unittest +import sklearn.cluster import rpy2.rinterface_lib.callbacks +import rpy2.rinterface_lib.embedded +from sklearn.utils.testing import assert_raise_message, assert_warns_message builtin_warning = rpy2.rinterface_lib.callbacks.consolewrite_warnerror @@ -15,7 +19,20 @@ def test_verbose(): assert np.all(fun() == np.array([[1], [2], [3]])) -class TestRFunctions(unittest.TestCase): +def test_install_bioc(): + assert_raise_message( + rpy2.rinterface_lib.embedded.RRuntimeError, + "Error: Bioconductor version '3.1' requires R version '3.2'; see", + scprep.run.install_bioconductor, + version='3.1', site_repository='https://bioconductor.org/packages/3.1/bioc', + 
verbose=False) + + +class TestSplatter(unittest.TestCase): + + @classmethod + def setUpClass(self): + scprep.run.splatter.install(verbose=False) def test_splatter_default(self): sim = scprep.run.SplatSimulate( @@ -176,3 +193,109 @@ def test_splatter_warning(self): scprep.run.r_function._ConsoleWarning.set_builtin() assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ builtin_warning + + +class TestSlingshot(unittest.TestCase): + + @classmethod + def setUpClass(self): + scprep.run.slingshot.install(verbose=False) + self.X = data.load_10X() + self.X_pca = scprep.reduce.pca(self.X) + self.clusters = sklearn.cluster.KMeans(6).fit_predict(self.X_pca) + + def test_slingshot(self): + slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + + def test_slingshot_pandas(self): + slingshot = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), + self.clusters, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + assert np.all(pseudotime.index == self.X.index) + assert np.all(branch.index == self.X.index) + assert branch.name == 'branch' + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime.loc[branch==i]) + assert np.all(branch_membership == branch_membership.iloc[0]) + new_pseudotime = np.nanmean(np.nanmean(pseudotime.loc[branch==i])) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + + def test_slingshot_distance(self): + assert_raise_message( + NotImplementedError, + "distance argument not currently implemented", + scprep.run.Slingshot, + self.X_pca, self.clusters, distance=lambda X, Y : np.sum(X-Y)) + + def test_slingshot_optional_args(self): + slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + start_cluster=4, omega=0.1, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) + if np.all(np.isnan(pseudotime[branch==i])): + assert i == -1 + else: + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + slingshot = 
scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, + end_cluster=0, verbose=False) + pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + assert pseudotime.shape[0] == self.X_pca.shape[0] + assert pseudotime.shape[1] == curves.shape[0] + assert branch.shape[0] == self.X_pca.shape[0] + current_pseudotime = -1 + for i in np.unique(branch): + branch_membership = np.isnan(pseudotime[branch==i]) + assert np.all(branch_membership == branch_membership[0]) + new_pseudotime = np.nanmean(pseudotime[branch==i]) + assert new_pseudotime > current_pseudotime + current_pseudotime = new_pseudotime + assert curves.shape[1] == self.X_pca.shape[0] + assert curves.shape[2] == 2 + assert np.all(np.any(~np.isnan(pseudotime), axis=1)) + + def test_slingshot_errors(self): + assert_warns_message( + UserWarning, + "Expected data to be low-dimensional. " + "Got data.shape[1] = 4", + scprep.run.Slingshot, + self.X_pca[:, :4], self.clusters, verbose=False) + assert_raise_message( + ValueError, + "Expected len(cluster_labels) ({}) to equal " + "data.shape[0] ({})".format( + self.X.shape[0]//2, self.X.shape[0]), + scprep.run.Slingshot, + self.X_pca[:, :2], self.clusters[:self.X.shape[0]//2], verbose=False) diff --git a/test/test_sanitize.py b/test/test_sanitize.py index 9ed4ef30..9c46f611 100644 --- a/test/test_sanitize.py +++ b/test/test_sanitize.py @@ -20,15 +20,18 @@ def test_check_numeric_inplace(): utils.assert_transform_unchanged, matrix._scipy_matrix_types + matrix._numpy_matrix_types + - matrix._pandas_dense_matrix_types, + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame], transform=scprep.sanitize.check_numeric, copy=False) + matrix._ignore_pandas_sparse_warning() assert_raise_message( TypeError, "pd.SparseDataFrame does not support " "copy=False. Please use copy=True.", scprep.sanitize.check_numeric, - data=X, copy=False) + data=matrix.SparseDataFrame_deprecated(X), copy=False) + matrix._reset_warnings() class TypeErrorClass(object): diff --git a/test/test_select.py b/test/test_select.py index e05738e3..ff05f652 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -41,7 +41,7 @@ def test_get_gene_set_ndarray(self): "data must be a list of gene names or a pandas " "DataFrame. Got ndarray", scprep.select.get_gene_set, - data=self.X.values, regex="8$") + data=self.X.to_numpy(), regex="8$") def test_get_gene_set_no_condition(self): assert_warns_message( @@ -75,7 +75,7 @@ def test_get_cell_set_ndarray(self): "data must be a list of cell names or a pandas " "DataFrame. 
Got ndarray", scprep.select.get_cell_set, - data=self.X.values, regex="G\\-1$") + data=self.X.to_numpy(), regex="G\\-1$") def test_get_cell_set_no_condition(self): assert_warns_message( @@ -106,7 +106,7 @@ def test_select_rows_integer_index(self): def test_select_rows_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.index.values, self.X.shape[0] // 2)) + idx=np.random.choice(self.X.index.to_numpy(), self.X.shape[0] // 2)) def test_select_rows_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -142,11 +142,11 @@ def test_select_rows_sparse_series_data_integer_index(self): def test_select_rows_1d_array_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, self.X.to_numpy()[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) def test_select_rows_list_data(self): scprep.select.select_rows( - self.X, self.X.values[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[0]])) def test_select_rows_get_cell_set(self): matrix.test_pandas_matrix_types( @@ -189,7 +189,7 @@ def test_select_cols_integer_index(self): def test_select_cols_string_array_index(self): matrix.test_pandas_matrix_types( self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.columns.values, self.X.shape[1] // 2)) + idx=np.random.choice(self.X.columns.to_numpy(), self.X.shape[1] // 2)) def test_select_cols_pandas_index_index(self): matrix.test_pandas_matrix_types( @@ -241,11 +241,11 @@ def test_select_cols_sparse_series_data_integer_index(self): def test_select_cols_1d_array_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_list_data(self): scprep.select.select_cols( - self.X, self.X.values[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, self.X.to_numpy()[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) def test_select_cols_get_gene_set(self): matrix.test_pandas_matrix_types( @@ -267,14 +267,14 @@ def test_select_cols_no_condition(self): def test_select_rows_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_cell] is not in the [index]", + "'not_a_cell'", scprep.select.select_rows, self.X, idx='not_a_cell') def test_select_cols_invalid_index(self): assert_raise_message(KeyError, - "the label [not_a_gene] is not in the [columns]", + "'not_a_gene'", scprep.select.select_cols, self.X, idx='not_a_gene') @@ -318,7 +318,7 @@ def test_select_cols_unequal_columns(self): "columns. Got [100, 50]", scprep.select.select_cols, self.X, - self.X.values[:, :50]) + self.X.to_numpy()[:, :50]) def test_select_rows_unequal_rows(self): assert_raise_message( @@ -327,34 +327,34 @@ def test_select_rows_unequal_rows(self): "rows. Got [100, 50]", scprep.select.select_rows, self.X, - self.X.values[:50, :]) + self.X.to_numpy()[:50, :]) def test_select_cols_conflicting_data(self): assert_raise_message( ValueError, "Expected `data` and `extra_data` pandas inputs to have the same " "column names. 
Fix with " - "`scprep.select.select_cols(*extra_data, data.columns)`", + "`scprep.select.select_cols(*extra_data, idx=data.columns)`", scprep.select.select_cols, self.X, - scprep.select.subsample(self.X.T, n=self.X.shape[0]).T) + self.X.iloc[:,::-1]) def test_select_rows_conflicting_data(self): assert_raise_message( ValueError, "Expected `data` and `extra_data` pandas inputs to have the same " "index. Fix with " - "`scprep.select.select_rows(*extra_data, data.index)`", + "`scprep.select.select_rows(*extra_data, idx=data.index)`", scprep.select.select_rows, self.X, - scprep.select.subsample(self.X, n=self.X.shape[0])) + self.X.iloc[::-1]) def test_select_cols_get_gene_set_ndarray_data(self): assert_raise_message( ValueError, "Can only select based on column names with DataFrame input. " "Please set `idx` to select specific columns.", - scprep.select.select_cols, self.X.values, starts_with="A" + scprep.select.select_cols, self.X.to_numpy(), starts_with="A" ) def test_select_rows_get_cell_set_ndarray_data(self): @@ -362,7 +362,7 @@ def test_select_rows_get_cell_set_ndarray_data(self): ValueError, "Can only select based on row names with DataFrame input. " "Please set `idx` to select specific rows.", - scprep.select.select_rows, self.X.values, starts_with="A" + scprep.select.select_rows, self.X.to_numpy(), starts_with="A" ) def test_subsample(self): @@ -407,6 +407,27 @@ def test_subsample_n_too_large(self): "Expected n (101) <= n_samples (100)", scprep.select.subsample, self.X, n=self.X.shape[0] + 1) + def test_sparse_dataframe_fill_value(self): + def test_fun(X): + Y = scprep.select.select_rows(X, idx=np.arange(X.shape[0]//2)) + for col in Y.columns: + assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) + Y = scprep.select.select_cols(X, idx=np.arange(X.shape[1]//2)) + for col in Y.columns: + assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) + matrix.test_matrix_types( + self.X.astype(float), test_fun, matrix._pandas_sparse_matrix_types) + + def test_select_variable_genes(self): + X_filtered = scprep.select.highly_variable_genes(self.X, percentile=70) + assert X_filtered.shape[0] == self.X.shape[0] + assert X_filtered.shape[1] <= 30 + assert X_filtered.shape[1] >= 20 + assert self.X.columns[np.argmax(self.X.values.std(axis=0))] in X_filtered.columns + matrix.test_all_matrix_types( + self.X, utils.assert_transform_equals, + Y=X_filtered, transform=scprep.select.highly_variable_genes, percentile=70) + def test_string_subset_exact_word(): np.testing.assert_array_equal(scprep.select._get_string_subset_mask( @@ -427,6 +448,8 @@ def test_string_subset_exact_word(): ['World, hello!', 'world'], exact_word='hello'), [True, False]) np.testing.assert_array_equal(scprep.select._get_string_subset_mask( ['helloooo!', 'world'], exact_word='hello'), [False, False]) + np.testing.assert_array_equal(scprep.select._get_string_subset_mask( + ['(hello) world', 'world'], exact_word='(hello) world'), [True, False]) def test_string_subset_list(): diff --git a/test/test_stats.py b/test/test_stats.py index fdb7eff1..7294b74b 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -6,6 +6,8 @@ import scprep from functools import partial import warnings +import os +from parameterized import parameterized def _test_fun_2d(X, fun, **kwargs): @@ -97,6 +99,8 @@ def test_knnDREMI(): Y2, drevi = scprep.stats.knnDREMI(X[:, 0], X[:, 1], plot=True, filename="test.png", return_drevi=True) + assert os.path.isfile("test.png") + os.remove("test.png") assert Y2 == Y assert drevi.shape == (20, 20) 
matrix.test_all_matrix_types( @@ -121,3 +125,142 @@ def test_knnDREMI(): "Attempting to calculate kNN-DREMI on a constant array. " "Returning `0`", scprep.stats.knnDREMI, X[:, 0], np.zeros_like(X[:, 1])) + + +def test_mean_difference(): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100]) + assert np.allclose(np.max(Y), 16.8125) + assert np.allclose(np.min(Y), -0.5625) + def test_fun(X, **kwargs): + return scprep.stats.mean_difference( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=Y, + transform=test_fun, + check=utils.assert_all_close) + assert_raise_message( + ValueError, + "Expected X and Y to have the same number of columns. " + "Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape), + scprep.stats.mean_difference, + X, X.iloc[:,:10]) + + +@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), + ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) +def test_differential_expression(measure, direction): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], + measure=measure, direction=direction) + expected_results = {('difference', 'up') : ('Gstm5', 16.8125), + ('difference', 'down') : ('Slc2a3', -0.5625), + ('difference', 'both') : ('Gstm5', 16.8125), + ('emd', 'up') : ('Gstm5', 17.5625), + ('emd', 'down') : ('Slc2a3', -0.6875), + ('emd', 'both') : ('Gstm5', 17.5625)} + assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0] + assert np.allclose(result[measure][0], + expected_results[(measure, direction)][1]) + result_unnamed = scprep.stats.differential_expression(X.iloc[:20].sparse.to_coo(), X.iloc[20:100].sparse.to_coo(), + measure=measure, direction=direction) + if direction != 'both': + values = result[measure] + else: + values = np.abs(result[measure]) + + unique_values = ~np.isin(values, values[values.duplicated()]) + assert np.all(X.columns[result_unnamed['gene']][unique_values] == result['gene'][unique_values]) + def test_fun(X, **kwargs): + return scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + + def check_fun(Y1, Y2): + if direction == 'both': + Y1[measure] = np.abs(Y1[measure]) + Y2[measure] = np.abs(Y2[measure]) + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) + Y1 = Y1.sort_values('gene') + Y2 = Y2.sort_values('gene') + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) + + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=result, + transform=test_fun, + check=check_fun, + gene_names=X.columns, + measure=measure, direction=direction) + + +def test_differential_expression_error(): + X = data.load_10X() + assert_raise_message( + ValueError, "Expected `direction` in ['up', 'down', 'both']. " + "Got invalid", scprep.stats.differential_expression, + X, X, direction='invalid') + assert_raise_message( + ValueError, "Expected `measure` in ['difference', 'emd']. " + "Got invalid", scprep.stats.differential_expression, + X, X, measure='invalid') + assert_raise_message( + ValueError, "Expected `X` and `Y` to be matrices. 
" + "Got shapes {}, {}".format(X.shape, X.iloc[0].shape), + scprep.stats.differential_expression, + X, X.iloc[0]) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression, + X.sparse.to_coo(), X.sparse.to_coo(), gene_names=np.arange(X.shape[0]//2)) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression_by_cluster, + X.sparse.to_coo(), np.random.choice(2, X.shape[0], replace=True), + gene_names=np.arange(X.shape[0]//2)) + assert_warns_message( + UserWarning, "Input data has inconsistent column names. " + "Subsetting to 20 common columns.", + scprep.stats.differential_expression, + X, X.iloc[:,:20]) + + +def test_differential_expression_by_cluster(): + measure = 'difference' + direction = 'up' + X = data.load_10X() + np.random.seed(42) + clusters = np.random.choice(4, X.shape[0], replace=True) + result = scprep.stats.differential_expression_by_cluster( + X, clusters, + measure=measure, direction=direction) + for cluster in range(4): + r = scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=clusters==cluster), + scprep.select.select_rows(X, idx=clusters!=cluster), + measure=measure, direction=direction) + assert np.all(result[cluster] == r) + + +def test_differential_expression_by_cluster_subset(): + measure = 'difference' + direction = 'up' + X = data.load_10X() + np.random.seed(42) + clusters = np.random.choice(4, X.shape[0], replace=True) + result = scprep.stats.differential_expression_by_cluster( + X, clusters, + measure=measure, direction=direction, gene_names=X.columns[:X.shape[0]//2]) + for cluster in range(4): + r = scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=clusters==cluster), + scprep.select.select_rows(X, idx=clusters!=cluster), + measure=measure, direction=direction, + gene_names=X.columns[:X.shape[0]//2]) + assert np.all(result[cluster] == r) diff --git a/test/test_transform.py b/test/test_transform.py index f0d26f31..58168cbd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -33,17 +33,13 @@ def test_log_transform(): Y=Y, transform=scprep.transform.log, base=2) Y = np.log2(X + 5) - assert_warns_message( - RuntimeWarning, - "log transform on sparse data requires pseudocount = 1", - scprep.transform.log, - data=sparse.csr_matrix(X), base=2, pseudocount=5) - assert_warns_message( - RuntimeWarning, - "log transform on sparse data requires pseudocount = 1", - scprep.transform.log, - data=pd.SparseDataFrame(X, default_fill_value=0.0), - base=2, pseudocount=5) + def test_fun(X): + assert_warns_message( + RuntimeWarning, + "log transform on sparse data requires pseudocount = 1", + scprep.transform.log, + data=X, base=2, pseudocount=5) + matrix.test_sparse_matrix_types(X, test_fun) matrix.test_dense_matrix_types( X, utils.assert_transform_equivalent, Y=Y, transform=scprep.transform.log, diff --git a/test/test_utils.py b/test/test_utils.py index d3ab0da3..e5c1d55c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -66,7 +66,7 @@ def test_with_pkg_version_fail_major(): def test(): return True assert_raise_message(ImportError, - "scprep requires numpy>={0} (installed: {1}). " + "numpy>={0} is required (installed: {1}). " "Please upgrade it with e.g." 
" `pip install --user --upgrade numpy".format( major + 1, np.__version__), @@ -80,7 +80,7 @@ def test_with_pkg_version_fail_minor(): def test(): return True assert_raise_message(ImportError, - "scprep requires numpy>={0}.{1} (installed: {2}). " + "numpy>={0}.{1} is required (installed: {2}). " "Please upgrade it with e.g." " `pip install --user --upgrade numpy".format( major, minor + 1, np.__version__), @@ -122,6 +122,8 @@ def test_combine_batches(): assert np.all(Y.index == Y2.index) assert np.all(sample_labels == np.concatenate( [np.repeat(0, X.shape[0]), np.repeat(1, X.shape[0] // 2)])) + assert np.all(sample_labels.index == Y2.index) + assert sample_labels.name == 'sample_labels' Y2, sample_labels = scprep.utils.combine_batches( [X, scprep.select.select_rows( X, idx=np.arange(X.shape[0] // 2))], @@ -131,6 +133,8 @@ def test_combine_batches(): assert np.all(np.core.defchararray.add( "_", sample_labels.astype(str)) == np.array( [i[-2:] for i in Y2.index], dtype=str)) + assert np.all(sample_labels.index == Y2.index) + assert sample_labels.name == 'sample_labels' transform = lambda X: scprep.utils.combine_batches( [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], batch_labels=[0, 1])[0] @@ -141,6 +145,25 @@ def test_combine_batches(): Y=Y, transform=transform, check=utils.assert_all_equal) + def test_fun(X): + Y, sample_labels = scprep.utils.combine_batches( + [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], + batch_labels=[0, 1]) + assert np.all(sample_labels.index == Y.index) + assert sample_labels.name == 'sample_labels' + matrix.test_pandas_matrix_types( + X, + test_fun) + + +def test_combine_batches_rangeindex(): + X = data.load_10X() + X = X.reset_index(drop=True) + Y = X.iloc[:X.shape[0] // 2] + data_combined, labels = scprep.utils.combine_batches( + [X, Y], ['x', 'y']) + assert isinstance(data_combined.index, pd.RangeIndex) + assert np.all(data_combined.columns == X.columns) def test_combine_batches_uncommon_genes(): @@ -161,14 +184,14 @@ def test_combine_batches_errors(): "append_to_cell_names only valid for pd.DataFrame input. " "Got coo_matrix", scprep.utils.combine_batches, - [X.to_coo(), X.iloc[:X.shape[0] // 2].to_coo()], + [X.sparse.to_coo(), X.iloc[:X.shape[0] // 2].sparse.to_coo()], batch_labels=[0, 1], append_to_cell_names=True) assert_raise_message( TypeError, - "Expected data all of the same class. Got SparseDataFrame, coo_matrix", + "Expected data all of the same class. Got DataFrame, coo_matrix", scprep.utils.combine_batches, - [X, X.iloc[:X.shape[0] // 2].to_coo()], + [X, X.iloc[:X.shape[0] // 2].sparse.to_coo()], batch_labels=[0, 1]) assert_raise_message( ValueError, @@ -217,12 +240,25 @@ def test_fun(X): matrix.test_all_matrix_types(X, test_fun) test_fun([X, np.matrix(X)]) + + +def test_toarray_string_error(): assert_raise_message(TypeError, "Expected array-like. 
Got ", scprep.utils.toarray, "hello") +def test_toarray_vector(): + X = data.generate_positive_sparse_matrix(shape=(50,)) + + def test_fun(X): + assert isinstance(scprep.utils.toarray(X), np.ndarray) + matrix.test_matrix_types(X, + test_fun, + matrix._pandas_vector_types) + + def test_toarray_list_of_strings(): X = ['hello', 'world', [1, 2, 3]] X = scprep.utils.toarray(X) @@ -271,6 +307,50 @@ def test_matrix_sum(): 5) +def test_matrix_std(): + X = data.generate_positive_sparse_matrix(shape=(50, 100)) + stds = np.array(X.std(0)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=0, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=0, + check=utils.assert_all_close) + + stds = np.array(X.std(1)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=1, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=1, + check=utils.assert_all_close) + + stds = np.array(X.std(None)).flatten() + matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=None, + check=utils.assert_all_close) + matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, + transform=scprep.utils.matrix_std, axis=None, + check=utils.assert_all_close) + + X_df = pd.DataFrame(X, index=np.arange(X.shape[0]).astype(str), + columns=np.arange(X.shape[1]).astype(str)) + def test_fun(X): + x = scprep.utils.matrix_std(X, axis=0) + assert x.name == 'std' + assert np.all(x.index == X_df.columns) + x = scprep.utils.matrix_std(X, axis=1) + assert x.name == 'std' + assert np.all(x.index == X_df.index) + matrix.test_pandas_matrix_types( + X_df, test_fun) + assert_raise_message(ValueError, + "Expected axis in [0, 1, None]. Got 5", + scprep.utils.matrix_std, + data, + 5) + + def test_matrix_elementwise_multiply_row(): X = data.generate_positive_sparse_matrix(shape=(50, 100)) x = X[:, 0] + 1 @@ -367,33 +447,89 @@ def test_matrix_elementwise_multiply_invalid_axis(): def test_deprecated(): X = data.load_10X() - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.select_cols` is deprecated. Use " "`scprep.select.select_cols` instead.", scprep.utils.select_cols, X, [1, 2, 3]) - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.select_rows` is deprecated. Use " "`scprep.select.select_rows` instead.", scprep.utils.select_rows, X, [1, 2, 3]) - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.get_gene_set` is deprecated. Use " "`scprep.select.get_gene_set` instead.", scprep.utils.get_gene_set, X, starts_with="D") - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.get_cell_set` is deprecated. Use " "`scprep.select.get_cell_set` instead.", scprep.utils.get_cell_set, X, starts_with="A") - assert_warns_message(FutureWarning, + assert_raise_message(RuntimeError, "`scprep.utils.subsample` is deprecated. 
Use " "`scprep.select.subsample` instead.", scprep.utils.subsample, X, n=10) + + +def test_is_sparse_dataframe(): + X = data.load_10X(sparse=False) + Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) + assert scprep.utils.is_sparse_dataframe(Y) + def test_fun(X): + assert not scprep.utils.is_sparse_dataframe(X) + matrix.test_matrix_types( + X, + test_fun, + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated] + ) + + +def test_SparseDataFrame(): + X = data.load_10X(sparse=False) + Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) + index = X.index + columns = X.columns + def test_fun(X): + X = scprep.utils.SparseDataFrame(X, index=index, columns=columns) + utils.assert_matrix_class_equivalent(X, Y) + matrix.test_all_matrix_types( + X, + test_fun + ) + matrix.test_pandas_matrix_types( + X, + utils.assert_transform_equivalent, + Y=Y, + transform=scprep.utils.SparseDataFrame + ) + + +def test_is_sparse_series(): + X = data.load_10X(sparse=True) + assert scprep.utils.is_sparse_series(X[X.columns[0]]) + def test_fun(X): + if isinstance(X, pd.SparseDataFrame): + x = X[X.columns[0]] + else: + x = scprep.select.select_cols(X, idx=0) + assert not scprep.utils.is_sparse_series(x) + matrix.test_matrix_types( + X.to_numpy(), + test_fun, + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated] + ) + \ No newline at end of file diff --git a/test/tools/__init__.py b/test/tools/__init__.py index 9b2e90bd..610d72cb 100644 --- a/test/tools/__init__.py +++ b/test/tools/__init__.py @@ -1,2 +1,2 @@ import matplotlib as mpl -mpl.use("Agg") +mpl.use("agg") diff --git a/test/tools/matrix.py b/test/tools/matrix.py index ec741ddd..e2251f58 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -5,6 +5,37 @@ from functools import partial +def _ignore_pandas_sparse_warning(): + warnings.filterwarnings( + "ignore", + category=FutureWarning, + message="SparseSeries") + warnings.filterwarnings( + "ignore", + category=FutureWarning, + message="SparseDataFrame") + warnings.filterwarnings( + "error", + category=pd.errors.PerformanceWarning) + + +def _reset_warnings(): + warnings.filterwarnings( + "error", + category=FutureWarning, + message="SparseSeries") + warnings.filterwarnings( + "error", + category=FutureWarning, + message="SparseDataFrame") + warnings.filterwarnings( + "error", + category=pd.errors.PerformanceWarning) + + +_reset_warnings() + + def _no_warning_dia_matrix(*args, **kwargs): """Helper function to silently create diagonal matrix""" with warnings.catch_warnings(): @@ -15,7 +46,24 @@ def _no_warning_dia_matrix(*args, **kwargs): " diagonals is inefficient") return sparse.dia_matrix(*args, **kwargs) -SparseDataFrame = partial(pd.SparseDataFrame, default_fill_value=0.0) +def SparseDataFrame_deprecated(X, default_fill_value=0.0): + return pd.SparseDataFrame(X, default_fill_value=default_fill_value) + +def SparseSeries(X, default_fill_value=0.0): + return pd.Series(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) + +def SparseSeries_deprecated(X, default_fill_value=0.0): + return pd.SparseSeries(X, fill_value=default_fill_value) + + +def SparseDataFrame(X, default_fill_value=0.0): + if sparse.issparse(X): + X = pd.DataFrame.sparse.from_spmatrix(X) + X.sparse.fill_value = default_fill_value + elif isinstance(X, pd.SparseDataFrame) or not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + return 
X.astype(pd.SparseDtype(float, fill_value=default_fill_value)) + _scipy_matrix_types = [ sparse.csr_matrix, @@ -36,22 +84,30 @@ def _no_warning_dia_matrix(*args, **kwargs): _pandas_sparse_matrix_types = [ SparseDataFrame, + SparseDataFrame_deprecated, ] -_pandas_matrix_types = [ - pd.DataFrame, - SparseDataFrame, +_pandas_vector_types = [ + pd.Series, + SparseSeries, + SparseSeries_deprecated ] +_pandas_matrix_types = _pandas_dense_matrix_types + _pandas_sparse_matrix_types + _indexable_matrix_types = [ sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix, - sparse.dok_matrix, - np.array, - pd.DataFrame, - SparseDataFrame -] + sparse.dok_matrix +] + _numpy_matrix_types + _pandas_matrix_types + + +def _typename(X): + if isinstance(X, pd.DataFrame) and not isinstance(X, pd.SparseDataFrame) and hasattr(X, "sparse"): + return "DataFrame[SparseArray]" + else: + return type(X).__name__ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): @@ -66,13 +122,17 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): **kwargs : keyword arguments for test_fun """ for fun in matrix_types: + if fun is SparseDataFrame_deprecated or fun is SparseSeries_deprecated: + _ignore_pandas_sparse_warning() Y = fun(X.copy()) try: test_fun(Y, *args, **kwargs) except Exception as e: raise RuntimeError("{} with {} input to {}\n{}".format( - type(e).__name__, type(Y).__name__, test_fun.__name__, + type(e).__name__, _typename(Y), test_fun.__name__, str(e))) + finally: + _reset_warnings() def test_dense_matrix_types(X, test_fun, *args, **kwargs): diff --git a/test/tools/utils.py b/test/tools/utils.py index 092a2cf9..25671c2e 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -3,6 +3,7 @@ import pandas as pd from nose.tools import assert_raises from scprep.utils import toarray +from . 
+from . import matrix
 
 
 def assert_all_equal(X, Y):
@@ -40,15 +41,10 @@ def assert_transform_equals(X, Y, transform, check=assert_all_equal, **kwargs):
     -------
     Y2 : returned value of transform(X, **kwargs)
     """
-    try:
-        Y2 = transform(X, **kwargs)
-    except Exception as e:
-        raise RuntimeError("{} with {} input to {}\n{}".format(
-            type(e).__name__, type(X).__name__, transform,
-            str(e)))
+    Y2 = transform(X, **kwargs)
     check(Y, Y2), "{} failed on {}".format(
         transform,
-        type(X).__name__)
+        matrix._typename(X))
     return Y2
 
 
@@ -89,7 +85,7 @@ def assert_transform_equivalent(X, Y, transform, check=assert_all_equal,
     Y2 = assert_transform_equals(X, Y, transform, check=check, **kwargs)
     assert assert_matrix_class_equivalent(X, Y2), \
         "{} produced inconsistent matrix output".format(
-            type(X).__name__)
+            matrix._typename(X))
 
 
 def assert_transform_raises(X, transform, exception=ValueError, **kwargs):
@@ -104,6 +100,18 @@ def assert_transform_raises(X, transform, exception=ValueError, **kwargs):
     assert_raises(exception, transform, X, **kwargs)
 
 
+def _is_sparse_dataframe(X):
+    return isinstance(X, pd.SparseDataFrame) or \
+        (isinstance(X, pd.DataFrame) and hasattr(X, "sparse"))
+
+
+def _sparse_dataframe_density(X):
+    try:
+        return X.sparse.density
+    except AttributeError:
+        return X.density
+
+
 def assert_matrix_class_equivalent(X, Y):
     """Check the format of X and Y are the same
@@ -117,11 +125,13 @@ def assert_matrix_class_equivalent(X, Y):
     if sparse.issparse(X):
         assert sparse.issparse(Y)
         assert X.tocoo().nnz == Y.tocoo().nnz
+    elif isinstance(X, pd.SparseDataFrame):
+        assert _is_sparse_dataframe(Y)
     else:
-        assert type(X) == type(Y)
+        assert type(X) == type(Y), (type(X), type(Y))
+    if _is_sparse_dataframe(X):
+        assert _sparse_dataframe_density(X) == _sparse_dataframe_density(Y)
     if isinstance(X, pd.DataFrame):
         assert np.all(X.columns == Y.columns)
         assert np.all(X.index == Y.index)
-    if isinstance(X, pd.SparseDataFrame) or isinstance(Y, pd.SparseDataFrame):
-        assert X.density == Y.density
     return True
diff --git a/travis_setup.R b/travis_setup.R
index 373614b1..a71289e6 100644
--- a/travis_setup.R
+++ b/travis_setup.R
@@ -3,4 +3,3 @@ if (!require("remotes")) install.packages("remotes", quietly=TRUE)
 remotes::update_packages(upgrade="always")
 if (!require("BiocManager")) install.packages("BiocManager", quietly=TRUE)
 BiocManager::install(update=TRUE, ask=FALSE)
-if (!require("splatter")) BiocManager::install("splatter", quietly=TRUE)
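Note on the sparse-matrix changes throughout this patch: they track the removal of `pd.SparseDataFrame` in pandas 1.0, replacing it with an ordinary `pd.DataFrame` holding SparseArray columns. A minimal standalone sketch of the replacement idioms adopted here (`DataFrame.sparse.from_spmatrix`, `SparseDtype`, and the `.sparse` accessor); the toy data is illustrative only, not from the test suite:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    # A scipy sparse matrix becomes a DataFrame of SparseArray columns.
    X = sparse.random(5, 3, density=0.4, format="csr")
    df = pd.DataFrame.sparse.from_spmatrix(X)

    # A dense DataFrame is converted column-wise with SparseDtype,
    # mirroring the new SparseDataFrame helper in test/tools/matrix.py.
    dense = pd.DataFrame(np.eye(3))
    df2 = dense.astype(pd.SparseDtype(float, fill_value=0.0))

    # The .sparse accessor replaces the old SparseDataFrame attributes,
    # as used by _sparse_dataframe_density in test/tools/utils.py.
    print(df2.sparse.density)  # fraction of stored (non-fill) entries
    coo = df2.sparse.to_coo()  # back to a scipy coo_matrix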