diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..677ea10 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,7 @@ +[report] +show_missing = True +omit = + claspy/_version.py + +[run] +branch = True diff --git a/.github/workflows/cibuild.yml b/.github/workflows/cibuild.yml new file mode 100644 index 0000000..7c348a2 --- /dev/null +++ b/.github/workflows/cibuild.yml @@ -0,0 +1,26 @@ +name: CI Build + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + max-parallel: 4 + matrix: + python-version: ["3.10", "3.11"] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: | + python -m pip install --upgrade pip + pip install . + claspy_db --dest=/home/runner/work/claspy/claspy/claspy/cellosaurus.json + - name: Test with pytest + run: make test + - name: Style check + run: make style diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..06776b7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.vscode/ +__pycache__/ +.coverage +sandbox/ +claspy.egg-info/ +claspy/cellosaurus.txt +claspy/cellosaurus.json +.DS_Store diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..d21b901 --- /dev/null +++ b/.pylintrc @@ -0,0 +1 @@ +good-names=fh,db diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..30042f6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,36 @@ +# Change Log + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ + +## Unreleased + +### Changed +- Improvements to loading profile and database objects (!9) +- Database search is now restricted based on species inferred from markers in the query profile, not by user-specified species (!8) +- Summary report is displayed in terminal, full report to a CSV file (!11, !12) + +### Fixed +- Added names of additional valid markers present in ForenSeq but not in Cellosaurus; includes four autosomal, seven X chromosome, and 21 Y chromosome STR markers (!8) +- Rank order of results with the same score but different numbers of shared alleles (!10) + + +## [0.0.2] 2023-05-25 + +### Fixed +- Divide by zero bug when query and reference have no shared alleles (!6) +- Marker name validation and standardization for human, mouse, and dog (!6) +- Rank order of results with the same score but different numbers of shared alleles (!6) +- Handling of string alleles, e.g. X and Y for Amelogenin (!7) +- Smart natural (not lexicographical) sorting of alleles for display (!7) + + +## [0.0.1] 2023-05-22 + +Initial release! Includes: + +- `claspy_db` for downloading and formatting the Cellosaurus database +- `claspy` for searching a profile against Cellosaurus and reporting the best results diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..866e2c3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2023, DHS; all rights reserved. + +This Software was prepared for the Department of Homeland Security +(DHS) by the Battelle National Biodefense Institute, LLC (BNBI) as +part of contract HSHQDC-15-C-00064 to manage and operate the National +Biodefense Analysis and Countermeasures Center (NBACC), a Federally +Funded Research and Development Center. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..a98abca
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include claspy/tests/data/ *
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2c9b2e9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+## #===== development tasks =====#
+
+## help: print this help message and exit
+help: Makefile
+	@sed -n 's/^## //p' Makefile
+
+## test: run automated test suite
+test:
+	pytest --cov=claspy claspy
+
+## format: autoformat Python code
+format:
+	black --line-length=99 setup.py claspy/*.py claspy/tests/*.py
+
+## style: check code style
+style:
+	black --line-length=99 --check setup.py claspy/*.py claspy/tests/*.py
+
+## hooks: deploy git pre-commit hooks for development
+hooks:
+	echo "set -eo pipefail" > .git/hooks/pre-commit
+	echo "make style" >> .git/hooks/pre-commit
+	chmod 755 .git/hooks/pre-commit
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f3a5928
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+# Claspy: cell line authentication with STRs in Python
+
+Documentation for Claspy is pending.
+In the meantime, see the following hints.
+
+```
+claspy_db # Run one time to install Cellosaurus database
+claspy query.csv # Run to find closest profile to the query in the database
+```
+
+STR profiles should be in tabular/CSV format and look something like this.
+ +```csv +Sample,Marker,Allele1,Allele2 +sample1,CSF1PO,12,13 +sample1,D13S317,12, +sample1,D16S539,9,11 +sample1,D18S51,12,15 +sample1,D19S433,13,15 +sample1,D21S11,29,32.2 +sample1,D2S1338,20,23 +sample1,D3S1358,16,17 +sample1,D5S818,10,11 +sample1,D7S820,10,11 +sample1,D8S1179,13,15 +sample1,FGA,18,24 +sample1,Penta D,9, +sample1,Penta E,17, +sample1,TH01,9,9.3 +sample1,TPOX,8, +sample1,vWA,18,19 +``` diff --git a/claspy/__init__.py b/claspy/__init__.py new file mode 100644 index 0000000..3d7b58a --- /dev/null +++ b/claspy/__init__.py @@ -0,0 +1,18 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from .markers import validate_names +from .str_profile import Profile +from . import db +from .cli import main, db_main +from ._version import get_versions + +__version__ = get_versions()["version"] +del get_versions diff --git a/claspy/_version.py b/claspy/_version.py new file mode 100644 index 0000000..1680332 --- /dev/null +++ b/claspy/_version.py @@ -0,0 +1,646 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. 
Generated by
+# versioneer-0.20 (https://github.com/python-versioneer/python-versioneer)
+
+"""Git implementation of _version.py."""
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+
+def get_keywords():
+    """Get the keywords needed to look up the version information."""
+    # these strings will be replaced by git during git-archive.
+    # setup.py/versioneer.py will grep for the variable names, so they must
+    # each be defined on a line of their own. _version.py will just call
+    # get_keywords().
+    git_refnames = "$Format:%d$"
+    git_full = "$Format:%H$"
+    git_date = "$Format:%ci$"
+    keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
+    return keywords
+
+
+class VersioneerConfig:  # pylint: disable=too-few-public-methods
+    """Container for Versioneer configuration parameters."""
+
+
+def get_config():
+    """Create, populate and return the VersioneerConfig() object."""
+    # these strings are filled in when 'setup.py versioneer' creates
+    # _version.py
+    cfg = VersioneerConfig()
+    cfg.VCS = "git"
+    cfg.style = "pep440"
+    cfg.tag_prefix = ""
+    cfg.parentdir_prefix = "claspy-"
+    cfg.versionfile_source = "claspy/_version.py"
+    cfg.verbose = False
+    return cfg
+
+
+class NotThisMethod(Exception):
+    """Exception raised if a method is not valid for the current scenario."""
+
+
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method):  # decorator
+    """Create decorator to mark a method as the handler of a VCS."""
+
+    def decorate(f):
+        """Store f in HANDLERS[vcs][method]."""
+        if vcs not in HANDLERS:
+            HANDLERS[vcs] = {}
+        HANDLERS[vcs][method] = f
+        return f
+
+    return decorate
+
+
+# pylint:disable=too-many-arguments,consider-using-with # noqa
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
+    """Call the given command(s)."""
+    assert isinstance(commands, list)
+    process = None
+    for command in commands:
+        try:
+            dispcmd = str([command] + args)
+            # remember shell=False, so use
git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. 
+ date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. 
+ + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for _ in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/claspy/cli.py b/claspy/cli.py new file mode 100644 index 0000000..05a6ba0 --- /dev/null +++ b/claspy/cli.py @@ -0,0 +1,131 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- + +from .db import CellosaurusDB +from .str_profile import Profile +from argparse import ArgumentParser +from claspy.db import CellosaurusDB +import claspy +import pandas as pd +import sys + + +def main(arglist=None): + if arglist: + arglist = map(str, arglist) + args = get_parser().parse_args(arglist) + db = CellosaurusDB.load(args.db) + all_summaries = list() + all_reports = list() + for query in Profile.load(args.query): + results = db.search( + query, + algorithm=args.algorithm, + mode=args.mode, + taxid=query.taxid, + amel=args.amel, + minscore=args.min_score, + maxhits=args.max_hits, + ) + all_summaries.append(results.summary) + all_reports.append(results.full_report) + pd.concat(all_summaries).to_markdown(sys.stdout, index=False, floatfmt=".3f") + print("") + if args.out: + pd.concat(all_reports).to_csv(args.out, index=False) + print(f"\nFull report written to {args.out}", file=sys.stderr) + + +def get_parser(): + parser = ArgumentParser(description="Claspy: cell line authentication with STRs in Python") + parser.add_argument("query", help="query STR profile") + parser.add_argument( + "-v", "--version", action="version", version=f"Claspy v{claspy.__version__}" + ) + parser.add_argument( + "-d", + "--db", + metavar="PATH", + default=CellosaurusDB.default_path(), + help=f"path to Cellosaurus database; default is {CellosaurusDB.default_path()}", + ) + parser.add_argument( + "-a", + "--algorithm", + metavar="A", + choices=("Tanabe", "reference", "query"), + default="Tanabe", + help="scoring algorithm; available options are Tanabe (2S/(Q+R)), query (S/Q), and reference (S/R); default is Tanabe", + ) + parser.add_argument( + "-m", + "--mode", + metavar="M", + choices=("intersect", "reference", "query"), + default="intersect", + help="mode for handling missing data; available options are query (all query markers), reference (all reference markers), and intersect 
(only shared markers); default is intersect", + ) + parser.add_argument( + "-s", + "--min-score", + type=float, + metavar="S", + default=0.0, + help="do not report candidate matches with a score < S; by default S=0 (filter disabled)", + ) + parser.add_argument( + "-x", + "--max-hits", + type=int, + metavar="X", + default=20, + help="do not report more than X candidate matches; by default X=20; set X<=0 to disable this filter", + ) + parser.add_argument( + "--amel", + action="store_true", + help="include the Amelogenin marker, if present, in scoring calculations; by default it is excluded", + ) + parser.add_argument( + "-o", + "--out", + metavar="FILE", + help="write a full report in CSV format to FILE; by default only a summary report is printed to the terminal", + ) + return parser + + +def db_main(arglist=None): + args = get_db_parser().parse_args(arglist) + if args.path is None: + records = CellosaurusDB.convert_from_download() + else: + records = CellosaurusDB.convert_from_path(args.path) + records.to_json(args.dest) + print(f"Database written to {args.dest}", file=sys.stderr) + + +def get_db_parser(): + parser = ArgumentParser( + description="Retrieve, format, and install the Cellosaurus database for Claspy" + ) + parser.add_argument( + "-p", + "--path", + help="install the Cellosaurus database from local file PATH rather than a remote URL", + ) + parser.add_argument( + "-d", + "--dest", + metavar="PATH", + default=CellosaurusDB.default_path(), + help=f"destination for the Cellosaurus database in JSON format; by default PATH={CellosaurusDB.default_path()}", + ) + return parser diff --git a/claspy/db.py b/claspy/db.py new file mode 100644 index 0000000..7d85c47 --- /dev/null +++ b/claspy/db.py @@ -0,0 +1,198 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. 
+# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from .str_profile import Profile +from .result import ProfileResult, SearchResult +from importlib.resources import files +import json +from pathlib import Path +import re +import sys +from tqdm import tqdm +from urllib.request import urlretrieve + + +class CellosaurusDB(list): + def search( + self, + query, + algorithm="Tanabe", + mode="intersect", + amel=False, + taxid=9606, + minscore=0.0, + maxhits=20, + ): + result = SearchResult(query, minscore=minscore, maxhits=maxhits) + for reference in self: + if taxid is not None and not reference.taxid_match(taxid): + continue + score, num_shared_alleles = Profile.score( + query, reference, algorithm=algorithm, mode=mode, amel=amel + ) + proresult = ProfileResult(query._meta["sample"], score, num_shared_alleles, reference) + result.add_profile_result(proresult) + return result + + @classmethod + def load(cls, path=None): + if path is None: + path = cls.default_path() + with open(path, "r") as instream: + return cls.from_json(instream) + + @staticmethod + def default_path(): + return files("claspy") / "cellosaurus.json" + + @classmethod + def from_json(cls, instream): + payload = json.load(instream) + if not isinstance(payload, dict) and not isinstance(payload, list): + raise ValueError(f"unexpected data type '{type(payload)}'") + if isinstance(payload, dict): + payload = [payload] + records = cls() + for profile in payload: + metadata = profile["meta"] + alleles = profile["alleles"] + records.append(Profile(alleles, metadata)) + 
return records + + @classmethod + def convert_from_download(cls, url=None): + if url is None: + url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt" + path = files("claspy") / "cellosaurus.txt" + with ProgressBar(unit="B", unit_scale=True, miniters=1, desc=Path(url).name) as pb: + urlretrieve(url, path, reporthook=pb.update_to) + return cls.convert_from_path(path) + + @classmethod + def convert_from_path(cls, path=None): + profiles = cls() + with open(path, "r") as instream: + parser = cls.parse_cellosaurus_records(instream) + for n, profile in enumerate(parser): + profiles.append(profile) + print(f"[CellosaurusDB] parsed {n+1} distinct cell line STR profiles", file=sys.stderr) + return profiles + + @staticmethod + def parse_cellosaurus_records(instream): + parser = CellosaurusDB.parse_cellosaurus_into_blocks(instream) + for n, block in enumerate(parser): + entry = CellosaurusEntry(block) + for alleles, meta in entry.profiles: + yield Profile(alleles, meta) + print(f"[CellosaurusDB] parsed {n+1} database records", file=sys.stderr) + + @staticmethod + def parse_cellosaurus_into_blocks(instream): + block = list() + for line in instream: + if line.startswith("ID"): + break + block.append(line.strip()) + for line in instream: + line = line.strip() + if line == "//": + yield block + block = list() + block.append(line) + + def to_json(self, output): + if isinstance(output, str) or isinstance(output, Path): + with open(output, "w") as outstream: + json.dump([profile.payload for profile in self], outstream, indent=4) + else: + json.dump([profile.payload for profile in self], output, indent=4) + + +class CellosaurusEntry: + ATTRIBUTES = { + "ID": "identifier", + "AC": "accession", + "SY": "synonyms", + } + + def __init__(self, data): + self._data = data + self.meta = dict() + self.alleles = dict() + for line in data: + self.parse_meta(line) + self.parse_sources(line) + self.parse_alleles(line) + + def parse_meta(self, line): + if line.startswith(("ID", "AC", 
"SY")): + key, value = re.split(r"\s+", line, 1) + assert key not in self.meta, key + self.meta[self.ATTRIBUTES[key]] = value + elif line.startswith("OX"): + match = re.match(r"OX NCBI_TaxID=(\d+); ! ([^\n]+)", line) + if not match: + raise ValueError(f"cannot parse species of origin: {line}") + taxid, organism = match.groups() + if "taxid" not in self.meta: + self.meta["taxid"] = list() + self.meta["organism"] = list() + self.meta["taxid"].append(int(taxid)) + self.meta["organism"].append(organism) + + def parse_sources(self, line): + if line.startswith("ST") and "Source" in line: + match = re.match(r"ST Source\(s\): ([^\n]+)", line) + if not match: + raise ValueError(f"could not parse sources: {line}") + source_string = match.group(1) + for source in source_string.split("; "): + self.alleles[source] = dict() + + def parse_alleles(self, line): + if line.startswith("ST") and "Source" not in line and "Not_detected" not in line: + match = re.match(r"^ST ([^:]+): ([\dXY,\. ]+)(.+)?", line) + if not match: + raise ValueError(f"could not parse STR profile data: {line}") + marker, allele_str, sources = match.groups() + if sources is None: + for marker_alleles in self.alleles.values(): + marker_alleles[marker] = allele_str.strip() + else: + sources = sources.replace("(", "").replace(")", "") + for source in sources.split("; "): + if source not in self.alleles: + print( + "[CellosaurusDB] WARNING:", + f"Source '{source}' not defined for cell line {self.meta['identifier']}", + file=sys.stderr, + ) + else: + self.alleles[source][marker] = allele_str.strip() + + @property + def profiles(self): + for source, marker_alleles in self.alleles.items(): + metadata = dict(self.meta) + if len(metadata["taxid"]) == 1: + metadata["taxid"] = metadata["taxid"][0] + metadata["organism"] = metadata["organism"][0] + metadata["source"] = source + yield marker_alleles, metadata + + +class ProgressBar(tqdm): + """Stolen shamelessly from https://stackoverflow.com/a/53877507/459780.""" + + def 
update_to(self, b=1, bsize=1, tsize=None): + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) diff --git a/claspy/markers.py b/claspy/markers.py new file mode 100644 index 0000000..ab5763d --- /dev/null +++ b/claspy/markers.py @@ -0,0 +1,158 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + + +valid_names = { + 9606: [ + "Amelogenin", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D17S1301", + "D18S51", + "D19S433", + "D1S1656", + "D20S482", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D4S2408", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "D9S1122", + "DXS10074", + "DXS101", + "DXS10103", + "DXS10135", + "DXS7132", + "DXS7423", + "DXS8378", + "DYF387S1", + "DYS19", + "DYS385a-b", + "DYS389I", + "DYS389II", + "DYS390", + "DYS391", + "DYS391", + "DYS392", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS460", + "DYS481", + "DYS505", + "DYS522", + "DYS533", + "DYS549", + "DYS570", + "DYS570", + "DYS576", + "DYS576", + "DYS612", + "DYS635", + "DYS643", + "F13A01", + "F13B", + "FESFPS", + "FGA", + "HPRTB", + "LPL", + "Penta C", + "Penta D", + "Penta E", + "SE33", + "TH01", + "TPOX", + "Y-GATA-H4", + "vWA", + ], + 10090: [ + "Mouse STR 1-1", + "Mouse STR 1-2", + "Mouse STR 2-1", + "Mouse STR 3-2", + "Mouse STR 4-2", + "Mouse STR 5-5", + "Mouse STR 6-4", + "Mouse STR 6-7", + "Mouse STR 7-1", + "Mouse STR 8-1", + 
"Mouse STR 9-2", + "Mouse STR 11-2", + "Mouse STR 12-1", + "Mouse STR 13-1", + "Mouse STR 15-3", + "Mouse STR 17-2", + "Mouse STR 18-3", + "Mouse STR 19-2", + "Mouse STR X-1", + ], + 9615: [ + "Dog FHC2010", + "Dog FHC2054", + "Dog FHC2079", + "Dog PEZ1", + "Dog PEZ3", + "Dog PEZ5", + "Dog PEZ6", + "Dog PEZ8", + "Dog PEZ12", + "Dog PEZ20", + ], +} + +species_by_taxid = { + 9606: "human", + 10090: "mouse", + 9615: "dog", +} + + +def validate_names(marker_names): + """Validate marker names + + For each given marker name, determine the standardized form. Determine the species associated + with this list of marker names. Raise an exception if any marker name cannot be validated, or + if the list contains marker names from multiple species. + """ + taxids = set() + valid = dict() + for name in marker_names: + valid[name], taxid = standardize_name(name) + taxids.add(taxid) + if None in taxids: + invalid = [name for name, valid_name in valid.items() if valid_name is None] + invalid = ", ".join(invalid) + raise ValueError(f"invalid marker name(s): {invalid}") + if len(taxids) > 1: + species = sorted([species_by_taxid[taxid] for taxid in taxids]) + species = ", ".join(species) + message = f"list of marker names includes markers from different species: {species}" + raise ValueError(message) + taxid = taxids.pop() + return valid, taxid + + +def standardize_name(name): + candidate = name.replace(" ", "").lower() + for taxid, species_names in valid_names.items(): + for species_name in species_names: + species_candidate = species_name.replace(" ", "").lower() + if candidate == species_candidate: + return species_name, taxid + return None, None diff --git a/claspy/result.py b/claspy/result.py new file mode 100644 index 0000000..c8c841d --- /dev/null +++ b/claspy/result.py @@ -0,0 +1,152 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. 
+# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from collections import defaultdict, namedtuple +import pandas as pd + + +class SearchResult: + """Result for database search of a single query profile + + The SearchResult includes a score for every profile of the relevant species in the database. + Distinct profiles for the same cell line are stored in a single CellLineResult object, + accessible by that cell line's identifier. + """ + + def __init__(self, query, minscore=0.0, maxhits=20): + self.query = query + self.minscore = minscore + self.maxhits = maxhits + self.results_by_cell_line = defaultdict(CellLineResult) + + def add_profile_result(self, result): + self.results_by_cell_line[result.reference.identifier].append(result) + + @property + def summary(self): + colnames = ["Sample", "CellLine", "Score", "SharedAlleles", "Source"] + summary = pd.DataFrame([result.summary for result in self], columns=colnames) + return summary + + @property + def full_report(self): + entries = list() + markers = self.all_markers + entry = ( + self.query._meta["sample"], + self.query._meta["sample"], + "query", + pd.NA, + pd.NA, + pd.NA, + *self.query.marker_alleles(markers), + ) + entries.append(entry) + for result in self: + for entry in result.full_report(markers): + entries.append(entry) + colnames = ["Sample", "CellLine", "Status", "Score", "SharedAlleles", "Source"] + markers + return pd.DataFrame(entries, columns=colnames) + + def __iter__(self): + for n, identifier in enumerate(self.ids_by_score): + if self.maxhits > 0 and n >= 
self.maxhits: + return + result = self.results_by_cell_line[identifier] + if result.top_score < self.minscore: + return + yield result + + @property + def ids_by_score(self): + sorted_results = sorted( + self.results_by_cell_line.values(), + key=lambda result: (result.top_score, result.top_score_shared_alleles), + reverse=True, + ) + for result in sorted_results: + yield result.identifier + + @property + def all_markers(self): + """Determine all markers to report + + This includes any marker for which allele data is present in the query or at least one of + the database profiles to be included in the final full report. + """ + markers = set() + for marker, allele in self.query.alleles(): + markers.add(marker) + for result in self: + for subresult in result: + for marker, allele in subresult.reference.alleles(): + markers.add(marker) + return sorted(markers) + + +class CellLineResult(list): + """A list of query search scores and database profiles from the same cell line + + This class is a list of ProfileResult objects, and essentially provides some convenience + functions for handling one or more scored profiles for a cell line from a database search. 
+ """ + + @property + def top_score(self): + return max([single_result.score for single_result in self]) + + @property + def top_score_shared_alleles(self): + return max( + [ + single_result.shared_alleles + for single_result in self + if single_result.score == self.top_score + ] + ) + + @property + def identifier(self): + ids = [single_result.reference.identifier for single_result in self] + assert len(set(ids)) == 1 + return ids[0] + + @property + def sample(self): + samples = [single_result.sample for single_result in self] + assert len(set(samples)) == 1 + return samples[0] + + @property + def summary(self): + results = sorted(self, reverse=True) + best = results[0] + best = (best.score, best.shared_alleles, best.reference.source) + return self.sample, self.identifier, *best + + def full_report(self, markers): + results = sorted(self, reverse=True) + best = results[0] + status = "best" if len(results) > 1 else "only" + yield self.sample, self.identifier, status, *best.full_report(markers) + if len(results) > 1: + worst = results[-1] + yield self.sample, self.identifier, "worst", *worst.full_report(markers) + + +class ProfileResult(namedtuple("ProfileResult", "sample score shared_alleles reference")): + """Score from comparing a query profile to a single database reference profile""" + + @property + def summary(self): + return self.score, self.shared_alleles, self.reference.source + + def full_report(self, markers): + return (*self.summary, *self.reference.marker_alleles(markers)) diff --git a/claspy/str_profile.py b/claspy/str_profile.py new file mode 100644 index 0000000..77ab6a8 --- /dev/null +++ b/claspy/str_profile.py @@ -0,0 +1,227 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. 
+# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from .markers import validate_names +import pandas as pd +import re + + +class Profile: + """Class for handling STR profiles + + Includes methods for loading, saving, and scoring genetic profiles based on short tandem repeat + (STR) markers. Each profile contains a set of STR alleles. In mammalian cell lines, these are + expected to be diploid; that is, there should be at most observed two alleles for each marker. + """ + + def __init__(self, alleles, meta): + self._meta = meta + valid_names, taxid = validate_names(alleles.keys()) + self.taxid = taxid + self._alleles = dict() + for marker, marker_alleles in alleles.items(): + marker = valid_names[marker] + marker_alleles = Profile.parse_allele_string(marker_alleles) + self._alleles[marker] = marker_alleles + + @classmethod + def load(cls, path): + types = {f"Allele{i+1}": str for i in range(10)} + data = pd.read_csv(path, sep=None, engine="python", dtype=types) + for column in ("Sample", "Marker", "Allele1"): + if column not in data.columns: + raise ValueError(f"expected column '{column}' missing") + for sample_name, sample_data in data.groupby("Sample"): + numalleles = Profile.num_alleles_from_table(sample_data) + metadata = {"sample": sample_data.Sample.iloc[0]} + alleles = dict() + for i, row in sample_data.iterrows(): + marker_alleles = list() + for n in range(numalleles): + allele = row[f"Allele{n+1}"] + if not pd.isna(allele): + marker_alleles.append(allele) + alleles[row.Marker] = ",".join(sorted(marker_alleles)) + 
yield Profile(alleles, metadata) + + @staticmethod + def num_alleles_from_table(table): + count = 1 + for column in table: + if column.startswith("Allele"): + try: + number = int(column[6:]) + except ValueError as verr: + raise ValueError(f"invalid table header '{column}'") from verr + if number > count: + count = number + if count > 10: + raise ValueError(f"found {count} allele columns, well above expected limit") + return count + + @staticmethod + def parse_allele_string(alleles): + return set(alleles.replace(" ", "").split(",")) + + @staticmethod + def allele_repr(allele_set): + alleles = [Profile.allele_transform(a) for a in allele_set] + alleles = sorted(alleles) + alleles = [str(a) for a in alleles] + return ",".join(alleles) + + @staticmethod + def allele_transform(allele): + if "." in allele: + return float(allele) + elif re.match(r"^\d+$", allele): + return int(allele) + elif allele in ("X", "Y"): + return allele + else: + raise ValueError(f"unexpected allele '{allele}'") + + @property + def table(self): + sample = self._meta["sample"] if "sample" in self._meta else "sample" + alleles = list() + for marker, marker_alleles in self._alleles.items(): + allele_repr = Profile.allele_repr(marker_alleles) + sorted_alleles = allele_repr.split(",") + entry = [sample, marker, *sorted_alleles] + while len(entry) < self.max_num_alleles + 2: + entry.append(None) + alleles.append(entry) + colnames = ["Sample", "Marker"] + [f"Allele{i+1}" for i in range(self.max_num_alleles)] + return pd.DataFrame(alleles, columns=colnames) + + @property + def max_num_alleles(self): + return max([len(allele_set) for allele_set in self._alleles.values()]) + + def marker_alleles(self, markers): + for marker in markers: + if marker not in self._alleles: + yield pd.NA + else: + yield Profile.allele_repr(self._alleles[marker]) + + def __iter__(self): + for marker, alleles in self._alleles.items(): + for allele in sorted(alleles): + yield marker, allele + + def __len__(self): + return 
len([allele for allele in self])
+
+    @staticmethod
+    def score(query, reference, algorithm="Tanabe", mode="intersect", amel=False):
+        """Compute a similarity score between two profiles
+
+        The score is based on the number of alleles shared between the query profile and the
+        reference profile. Three scoring algorithms are implemented as described below: "Tanabe" is
+        the default (Q=# query alleles, R=# reference alleles, S=# shared alleles).
+
+        - "Tanabe": 2S / (Q+R)
+        - "query": S / Q
+        - "reference": S / R
+
+        There are also three modes for handling missing allele data in one or both profiles: the
+        "intersect" mode is used by default.
+
+        - "intersect": consider alleles only at markers present in both profiles
+        - "query": consider alleles for markers present in the query profile, even if missing from
+        the reference profile
+        - "reference": consider alleles for markers present in the reference profile, even if
+        missing from the query profile
+
+        The Amelogenin marker (amel) is used for sex determination and is typically excluded from
+        scoring. Set `amel=True` to include. 
+ + The Tanabe algorithm is described in doi:10.11418/jtca1981.18.4_329, while the query and + reference algorithms are described in doi:10.1073/pnas.121616198 + """ + if algorithm not in ("Tanabe", "query", "reference"): + raise ValueError(f"unsupported scoring algorithm '{algorithm}'") + if mode not in ("intersect", "query", "reference"): + raise ValueError(f"unsupported scoring mode '{mode}'") + markers = Profile.markers_for_scoring(query, reference, mode=mode, amel=amel) + query_alleles = len(query.alleles(markers=markers)) + refr_alleles = len(reference.alleles(markers=markers)) + shared_alleles = len(query.alleles(markers=markers) & reference.alleles(markers=markers)) + score = 0.0 + if shared_alleles > 0: + if algorithm == "Tanabe": + score = (2 * shared_alleles) / (query_alleles + refr_alleles) + elif algorithm == "query": + score = shared_alleles / query_alleles + else: + score = shared_alleles / refr_alleles + return score, shared_alleles + + @staticmethod + def markers_for_scoring(query, reference, mode="intersect", amel=False): + if mode == "intersect": + markers = query.markers & reference.markers + elif mode == "query": + markers = query.markers + else: + markers = reference.markers + if not amel: + markers = [m for m in markers if m != "Amelogenin"] + return markers + + @property + def markers(self): + return set(self._alleles.keys()) + + def alleles(self, markers=None): + if markers is None: + markers = self.markers + return {(marker, allele) for marker, allele in self if marker in markers} + + def __str__(self): + return self.table.to_csv(index=False) + + def taxid_match(self, taxid): + if isinstance(self._meta["taxid"], list): + for testid in self._meta["taxid"]: + if int(testid) == int(taxid): + return True + return False + else: + return int(self._meta["taxid"]) == int(taxid) + + @property + def identifier(self): + return self._meta["identifier"] + + @property + def source(self): + return self._meta["source"] + + @property + def payload(self): + 
return {"meta": self._meta, "alleles": self.allele_dict} + + @property + def allele_dict(self): + return {marker: ",".join(sorted(alleles)) for marker, alleles in self._alleles.items()} + + @property + def slug(self): + return self.identifier, self._meta["accession"], self.source + + def __lt__(self, other): + return self.slug < other.slug + + @property + def payload(self): + return {"meta": self._meta, "alleles": self.allele_dict} diff --git a/claspy/tests/__init__.py b/claspy/tests/__init__.py new file mode 100644 index 0000000..dc66b71 --- /dev/null +++ b/claspy/tests/__init__.py @@ -0,0 +1,15 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- + +from importlib.resources import files + + +def data_file(path): + return files("claspy") / "tests" / "data" / path diff --git a/claspy/tests/data/db-cvcl-1085.csv b/claspy/tests/data/db-cvcl-1085.csv new file mode 100644 index 0000000..55d1b18 --- /dev/null +++ b/claspy/tests/data/db-cvcl-1085.csv @@ -0,0 +1,18 @@ +Sample,Marker,Allele1,Allele2,Allele3 +refr1,CSF1PO,12,13, +refr1,D13S317,12,, +refr1,D16S539,9,11, +refr1,D18S51,12,15, +refr1,D19S433,13,15, +refr1,D21S11,29,32.2, +refr1,D2S1338,20,23, +refr1,D3S1358,16,17, +refr1,D5S818,10,11, +refr1,D7S820,10,11, +refr1,D8S1179,13,14,15 +refr1,FGA,18,24, +refr1,Penta D,9,, +refr1,Penta E,10,17, +refr1,TH01,9,9.3, +refr1,TPOX,8,, +refr1,vWA,18,19, diff --git a/claspy/tests/data/examples.json b/claspy/tests/data/examples.json new file mode 100644 index 0000000..049300b --- /dev/null +++ b/claspy/tests/data/examples.json @@ -0,0 +1,32 @@ +[ + { + "meta": { + "description": "Example 1" + }, + "alleles": { + "CSF1PO": "13,14", + "D5S818": "13", + "D7S820": "8", + "D13S317": "12", + "FGA": "24", + "TH01": "8", + "TPOX": "11", + "vWA": "16" + } + }, + { + "meta": { + "description": "Example 2" + }, + "alleles": { + "CSF1PO": "13", + "D5S818": "13, 14", + "D7S820": "8, 19", + "D13S317": "11, 12", + "FGA": "24", + "TH01": "8", + "TPOX": "11", + "vWA": "15" + } + } +] diff --git a/claspy/tests/data/mock-cvcl-1085.csv b/claspy/tests/data/mock-cvcl-1085.csv new file mode 100644 index 0000000..5e7dc4f --- /dev/null +++ b/claspy/tests/data/mock-cvcl-1085.csv @@ -0,0 +1,18 @@ +Sample,Marker,Allele1,Allele2 +mock0,CSF1PO,12,13 +mock0,D13S317,12, +mock0,D16S539,9,11 +mock0,D18S51,12,15 +mock0,D19S433,13,15 +mock0,D21S11,29,32.2 +mock0,D2S1338,20,23 +mock0,D3S1358,16,17 +mock0,D5S818,10,11 +mock0,D7S820,10,11 +mock0,D8S1179,13,15 +mock0,FGA,18,24 +mock0,Penta D,9, +mock0,Penta E,17, +mock0,TH01,9,9.3 +mock0,TPOX,8, +mock0,vWA,18,19 
diff --git a/claspy/tests/data/mock-cvcl-1085.json b/claspy/tests/data/mock-cvcl-1085.json new file mode 100644 index 0000000..e228af2 --- /dev/null +++ b/claspy/tests/data/mock-cvcl-1085.json @@ -0,0 +1,24 @@ +{ + "meta": { + "id": 0 + }, + "alleles": { + "CSF1PO": "12,13", + "D13S317": "12", + "D16S539": "9,11", + "D18S51": "12,15", + "D19S433": "13,15", + "D21S11": "29,32.2", + "D2S1338": "20,23", + "D3S1358": "16,17", + "D5S818": "10,11", + "D7S820": "10,11", + "D8S1179": "13,15", + "FGA": "18,24", + "Penta D": "9", + "Penta E": "17", + "TH01": "9,9.3", + "TPOX": "8", + "vWA": "18,19" + } +} diff --git a/claspy/tests/data/mock-sk-hep-1-2samples.csv b/claspy/tests/data/mock-sk-hep-1-2samples.csv new file mode 100644 index 0000000..cb67524 --- /dev/null +++ b/claspy/tests/data/mock-sk-hep-1-2samples.csv @@ -0,0 +1,37 @@ +Sample,Marker,Allele1,Allele2 +mock_1,Amelogenin,X, +mock_1,CSF1PO,11,12 +mock_1,D12S391,18, +mock_1,D13S317,8,12 +mock_1,D16S539,12, +mock_1,D18S51,13,15 +mock_1,D21S11,29,31 +mock_1,D2S1338,20,23 +mock_1,D3S1358,16, +mock_1,D5S818,10,13 +mock_1,D6S1043,11, +mock_1,D7S820,8,11 +mock_1,D8S1179,14 +mock_1,FGA,17, +mock_1,Penta D,13,14 +mock_1,TH01,7,9 +mock_1,TPOX,9, +mock_1,vWA,14,17 +mock_2,Amelogenin,X, +mock_2,CSF1PO,11,12 +mock_2,D12S391,18, +mock_2,D13S317,8,12 +mock_2,D16S539,12, +mock_2,D18S51,13,15 +mock_2,D21S11,29,31 +mock_2,D2S1338,20,23 +mock_2,D3S1358,16, +mock_2,D5S818,10,13 +mock_2,D6S1043,11, +mock_2,D7S820,8,11 +mock_2,D8S1179,14 +mock_2,FGA,17, +mock_2,Penta D,13,14 +mock_2,TH01,7,9 +mock_2,TPOX,9, +mock_2,vWA,14,17 \ No newline at end of file diff --git a/claspy/tests/data/mock-sk-hep-1.csv b/claspy/tests/data/mock-sk-hep-1.csv new file mode 100644 index 0000000..3b9efab --- /dev/null +++ b/claspy/tests/data/mock-sk-hep-1.csv @@ -0,0 +1,19 @@ +Sample,Marker,Allele1,Allele2 +mock,Amelogenin,X, +mock,CSF1PO,11,12 +mock,D12S391,18, +mock,D13S317,8,12 +mock,D16S539,12, +mock,D18S51,13,15 +mock,D21S11,29,31 +mock,D2S1338,20,23 
+mock,D3S1358,16, +mock,D5S818,10,13 +mock,D6S1043,11, +mock,D7S820,8,11 +mock,D8S1179,14 +mock,FGA,17, +mock,Penta D,13,14 +mock,TH01,7,9 +mock,TPOX,9, +mock,vWA,14,17 diff --git a/claspy/tests/data/query-bad-allele-1.csv b/claspy/tests/data/query-bad-allele-1.csv new file mode 100644 index 0000000..89b2aa6 --- /dev/null +++ b/claspy/tests/data/query-bad-allele-1.csv @@ -0,0 +1,4 @@ +Sample,Marker,AlleleOne,AlleleTwo +query,Amelogenin,X,Y +query,CSF1PO,7.3,13 +query,D13S317,8,12 diff --git a/claspy/tests/data/query-bad-allele-2.csv b/claspy/tests/data/query-bad-allele-2.csv new file mode 100644 index 0000000..9911cb2 --- /dev/null +++ b/claspy/tests/data/query-bad-allele-2.csv @@ -0,0 +1,4 @@ +Sample,Marker,Allele1,AlleleTwo +query,Amelogenin,X,Y +query,CSF1PO,7.3,13 +query,D13S317,8,12 diff --git a/claspy/tests/data/query-multi-sample.csv b/claspy/tests/data/query-multi-sample.csv new file mode 100644 index 0000000..9e56aeb --- /dev/null +++ b/claspy/tests/data/query-multi-sample.csv @@ -0,0 +1,7 @@ +Sample,Marker,Allele1,Allele2 +query1,Amelogenin,X,Y +query1,CSF1PO,7.3,13 +query1,D13S317,8,12 +query2,Amelogenin,X,Y +query2,CSF1PO,7.3,13 +query2,D13S317,8,12 diff --git a/claspy/tests/data/query-wide.csv b/claspy/tests/data/query-wide.csv new file mode 100644 index 0000000..d1f0630 --- /dev/null +++ b/claspy/tests/data/query-wide.csv @@ -0,0 +1,4 @@ +Sample,Marker,Allele1,Allele2,Allele3,Allele4,Allele5,Allele6,Allele7,Allele8,Allele9,Allele10,Allele11 +query,Amelogenin,X,Y,,,,,,,,, +query,CSF1PO,6,7,7.3,8.3,9,9.3,10,10.1,10.3,11,14 +query,D13S317,8,12,,,,,,,,, diff --git a/claspy/tests/data/report-cvcl-1085.csv b/claspy/tests/data/report-cvcl-1085.csv new file mode 100644 index 0000000..4bec2b4 --- /dev/null +++ b/claspy/tests/data/report-cvcl-1085.csv @@ -0,0 +1,23 @@ +Sample,CellLine,Status,Score,SharedAlleles,Source,Amelogenin,CSF1PO,D13S317,D16S539,D18S51,D19S433,D21S11,D2S1338,D3S1358,D5S818,D7S820,D8S1179,FGA,Penta D,Penta E,TH01,TPOX,vWA 
+mock0,mock0,query,,,,,"12,13",12,"9,11","12,15","13,15","29,32.2","20,23","16,17","10,11","10,11","13,15","18,24",9,17,"9,9.3",8,"18,19" +mock0,BHT-101,best,0.9655172413793104,28,PubMed=18713817,X,,12,"9,11","12,15","13,15","29,32.2","20,23","16,17","10,11","10,11","13,14,15","18,24",9,"10,17","9,9.3",8,"18,19" +mock0,BHT-101,worst,0.9310344827586207,27,DSMZ,X,12,12,"9,11","12,15","13,15","29,32.2","20,23","16,17","10,11","10,11",15,"18,24",9,"10,17","9,9.3",8,19 +mock0,FGH,only,0.7586206896551724,11,BCRJ,X,"10,12",12,"9,11",,,,,,"10,11","10,11",,,,,"7,9","8,11","17,18" +mock0,KR-12,only,0.75,12,ATCC,"X,Y","10,12","11,12","9,11",,,,,,"10,11,13","9,10",,,,,"8,9,9.3",8,"16,18,19" +mock0,CCD-1059Sk,only,0.7407407407407407,10,ATCC,X,"11,12",12,"9,11",,,,,,11,"11,12",,,,,"9,9.3",8,"16,18" +mock0,AML14.3D10/CCCKR3 Clone 16,only,0.7142857142857143,10,ATCC,"X,Y",13,12,"9,12",,,,,,"10,11","10,14",,,,,"9,9.3","9,11","18,19" +mock0,CCD-1127Sk,only,0.7142857142857143,10,ATCC,"X,Y","12,13",12,"10,11",,,,,,11,"8,10",,,,,"9,9.3","8,10","17,18" +mock0,CG0161,only,0.7142857142857143,10,BCRC,X,"12,13",12,"9,11",,,,,,"9,11","8,11",,,,,9,"8,11","14,18" +mock0,340-RPE-11tv,only,0.6923076923076923,9,ATCC,X,"12,14","11,12",11,,,,,,11,"10,11",,,,,9,8,"17,18" +mock0,CCD-944Sk,only,0.6923076923076923,9,ATCC,X,12,12,12,,,,,,"10,11","7,10",,,,,"9,9.3",8,"16,18" +mock0,HuNS1,only,0.6923076923076923,9,ATCC,"X,Y","10,12",12,11,,,,,,"10,12",10,,,,,"9,9.3",8,"16,19" +mock0,HE31,only,0.6896551724137931,10,RCB,"X,Y","10,12",12,"9,11",,,,,,"10,11","8,10",,,,,"6,9","9,11","18,19" +mock0,OX-CO-3,only,0.6896551724137931,10,PubMed=25926053,X,12,"11,12",9,,,28,,,11,"10,11",,,,,"9,9.3",8,"16,18" +mock0,SS23,only,0.6896551724137931,10,JCRB,X,"12,13","8,12","10,11",,,,,,"10,11","10,11",,,,,"7,9","8,11",14 +mock0,TE 159.T,only,0.6896551724137931,10,ATCC,"X,Y","12,13","12,14","9,11",,,,,,"9,11","10,11",,,,,"6,9.3","8,11",17 
+mock0,Yub632,only,0.6896551724137931,10,JCRB,"X,Y",12,"8,10","9,11",,,,,,"10,11","8,10",,,,,"6,9","7,8","18,19" +mock0,NCC-DMM1-C1,only,0.6666666666666666,10,PubMed=35069873,"X,Y",12,12,"9,12",,,"29,32.2",,,"9,11",8,,,,,9,"8,11","18,19" +mock0,AIDHC-NMC8,only,0.6666666666666666,9,PubMed=28284873,"X,Y",11,"11,12","11,12",,,,,,11,"10,11",,,,,9.3,"8,9","18,19" +mock0,CA,only,0.6666666666666666,9,TKG,X,12,"10,12",11,,,,,,"11,15",11,,,,,"6,9","8,11","18,19" +mock0,CCD-1074Sk,only,0.6666666666666666,9,ATCC,X,"11,12",9,11,,,,,,"10,11","10,11",,,,,"9,9.3",8,"16,17" +mock0,CLC18,only,0.6666666666666666,9,PubMed=31378681,X,12,"10,11","9,11",,,,,,"10,11","11,12",,,,,9,8,"16,18" diff --git a/claspy/tests/data/skhep1-db.json b/claspy/tests/data/skhep1-db.json new file mode 100644 index 0000000..86150f7 --- /dev/null +++ b/claspy/tests/data/skhep1-db.json @@ -0,0 +1,611 @@ +[ + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "ATCC" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + 
"D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CLS" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "Cosmic-CLP" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "DSMZ" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + 
"D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "ECACC" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "KCLB" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "MSKCC" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + 
"D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "PubMed=11416159" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1", + "accession": "CVCL_0525", + "synonyms": "SK-Hep-1; SK HEP-1; SK HEP 01; SK-Hep1; Sk-Hep1; SK Hep1; SKHEP-1; SKHEP1; SKHep1; SK_HEP1", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "PubMed=25877200" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta D": "13,14", + "Penta E": "13", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-726", + "accession": "CVCL_XD84", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + 
"TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-727", + "accession": "CVCL_XD85", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-728", + "accession": "CVCL_XD86", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-729", + "accession": "CVCL_XD87", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-730", + "accession": "CVCL_XD88", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": 
"12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-731", + "accession": "CVCL_XD89", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-732", + "accession": "CVCL_XD90", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-733", + "accession": "CVCL_XD91", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": 
{ + "identifier": "SK-HEP-1-Cas9-734", + "accession": "CVCL_XD92", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + }, + { + "meta": { + "identifier": "SK-HEP-1-Cas9-735", + "accession": "CVCL_XD93", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "CCRID" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "11,12", + "D12S391": "18", + "D13S317": "12,8", + "D16S539": "12", + "D18S51": "13,15", + "D19S433": "12,15.2", + "D21S11": "29,31,32", + "D2S1338": "20,23", + "D3S1358": "16", + "D5S818": "10,13", + "D6S1043": "11", + "D7S820": "11,8", + "D8S1179": "13,14", + "FGA": "17", + "Penta E": "13,21", + "TH01": "7,9", + "TPOX": "9", + "vWA": "14,17" + } + } +] \ No newline at end of file diff --git a/claspy/tests/data/snu-db.json b/claspy/tests/data/snu-db.json new file mode 100644 index 0000000..0975578 --- /dev/null +++ b/claspy/tests/data/snu-db.json @@ -0,0 +1,96 @@ +[ + { + "meta": { + "identifier": "SNU-1033-1", + "accession": "CVCL_5002", + "synonyms": "SNU1033", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "Genomics_Center_BCF_Technion" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "10", + "D10S1248": "13,15", + "D12S391": "18,21", + "D13S317": "10", + "D16S539": "9", + "D18S51": "13", + "D19S433": "13,14", + "D1S1656": "13", + "D21S11": "29,30", + "D22S1045": "17", + "D2S1338": "18,26", + "D2S441": "10,12", + "D3S1358": "17", + "D5S818": "13", + "D7S820": "11,12", + "D8S1179": "12,13", + "FGA": "23", + "Penta D": "11,12", + "Penta E": "11", + "TH01": "9", + "TPOX": "11", + "vWA": "17,19" + 
} + }, + { + "meta": { + "identifier": "SNU-1033-2", + "accession": "CVCL_5002", + "synonyms": "SNU1033", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "PubMed=25926053" + }, + "alleles": { + "Amelogenin": "X", + "D10S1248": "13,15", + "D12S391": "18,21", + "D13S317": "10", + "D16S539": "9", + "D18S51": "13", + "D19S433": "13,14", + "D1S1656": "13", + "D21S11": "29,30", + "D22S1045": "17", + "D2S1338": "18,26", + "D2S441": "10,12", + "D5S818": "13", + "D7S820": "11,12", + "TH01": "9", + "TPOX": "11" + } + }, + { + "meta": { + "identifier": "SNU-1033-3", + "accession": "CVCL_5002", + "synonyms": "SNU1033", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "KCLB" + }, + "alleles": { + "Amelogenin": "X", + "CSF1PO": "10", + "D10S1248": "13,15", + "D12S391": "18,21", + "D13S317": "10", + "D19S433": "13,14", + "D1S1656": "13", + "D21S11": "29,30", + "D22S1045": "17", + "D2S1338": "18,26", + "D2S441": "10,12", + "D5S818": "13", + "D8S1179": "12,13", + "FGA": "23", + "Penta D": "11,12", + "Penta E": "11", + "TH01": "9", + "TPOX": "11", + "vWA": "17,19" + } + } +] diff --git a/claspy/tests/data/snu-query.csv b/claspy/tests/data/snu-query.csv new file mode 100644 index 0000000..fe9353f --- /dev/null +++ b/claspy/tests/data/snu-query.csv @@ -0,0 +1,22 @@ +Sample,Marker,Allele1,Allele2 +sample,Amelogenin,X, +sample,CSF1PO,10, +sample,D10S1248,13,15 +sample,D12S391,18,21 +sample,D13S317,10, +sample,D16S539,9, +sample,D18S51,13, +sample,D19S433,13,14 +sample,D1S1656,13, +sample,D21S11,29,30 +sample,D22S1045,17, +sample,D2S1338,18,26 +sample,D2S441,10,12 +sample,D5S818,13, +sample,D8S1179,12,13 +sample,FGA,23, +sample,Penta D,11,12 +sample,Penta E,11, +sample,TH01,9, +sample,TPOX,11, +sample,vWA,17,19 diff --git a/claspy/tests/data/upci-scc-077-db.json b/claspy/tests/data/upci-scc-077-db.json new file mode 100644 index 0000000..512cb9a --- /dev/null +++ b/claspy/tests/data/upci-scc-077-db.json @@ -0,0 +1,54 @@ +[ + { + "meta": { + "sample": 
"query" + }, + "alleles": { + "Amelogenin": "X,Y", + "CSF1PO": "10", + "D13S317": "11", + "D16S539": "11,13", + "D19S433": "12,14", + "D21S11": "27,29", + "D3S1358": "14", + "D5S818": "9", + "D7S820": "7", + "D8S1179": "14,15", + "FGA": "21", + "Penta D": "13", + "Penta E": "13,7", + "TH01": "9.3", + "TPOX": "11", + "vWA": "17,18" + } + }, + { + "meta": { + "identifier": "UPCI-SCC-077", + "accession": "CVCL_C043", + "synonyms": "UPCI:SCC077; SCC077", + "taxid": 9606, + "organism": "Homo sapiens (Human)", + "source": "DSMZ" + }, + "alleles": { + "Amelogenin": "X,Y", + "CSF1PO": "10", + "D16S539": "11,13", + "D18S51": "13", + "D19S433": "12,14", + "D21S11": "27,29", + "D2S1338": "17", + "D3S1358": "14", + "D5S818": "9", + "D7S820": "8", + "D8S1179": "14,15", + "FGA": "21", + "Penta D": "13", + "Penta E": "13,7", + "TH01": "9.3", + "TPOX": "11,12", + "vWA": "17,18" + } + } +] diff --git a/claspy/tests/test_cli.py b/claspy/tests/test_cli.py new file mode 100644 index 0000000..73f6211 --- /dev/null +++ b/claspy/tests/test_cli.py @@ -0,0 +1,24 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- + +import claspy +from claspy.tests import data_file +import pytest + + +def test_search_report_sorting(tmp_path): + report = tmp_path / "report.csv" + arglist = [data_file("mock-cvcl-1085.csv"), "--out", report] + claspy.cli.main(arglist=arglist) + assert report.is_file() + with open(report, "r") as fh1, open(data_file("report-cvcl-1085.csv"), "r") as fh2: + observed = fh1.read().strip() + expected = fh2.read().strip() + assert observed == expected diff --git a/claspy/tests/test_db.py b/claspy/tests/test_db.py new file mode 100644 index 0000000..7b2beae --- /dev/null +++ b/claspy/tests/test_db.py @@ -0,0 +1,37 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- +from claspy.db import CellosaurusDB +from claspy import Profile +from claspy.tests import data_file +from io import StringIO +import pytest + + +def test_search_report_sorting(): + db = CellosaurusDB.load(data_file("snu-db.json")) + query = next(Profile.load(data_file("snu-query.csv"))) + report = db.search(query).full_report + print(report.to_string()) + assert report.CellLine.to_list() == ["sample", "SNU-1033-1", "SNU-1033-3", "SNU-1033-2"] + for score in report.Score[1:]: + assert score == pytest.approx(1.0) + assert report.SharedAlleles[1:].to_list() == [29, 27, 20] + + +def test_db_round_trip(tmp_path): + db = CellosaurusDB.load() + db1 = CellosaurusDB([profile for profile in db if "SK-HEP-1" in profile.identifier]) + db1.to_json(tmp_path / "db.json") + db2 = CellosaurusDB.load(tmp_path / "db.json") + assert len(db1) == len(db2) + json1, json2 = StringIO(), StringIO() + db1.to_json(json1) + db2.to_json(json2) + assert json1.getvalue() == json2.getvalue() diff --git a/claspy/tests/test_markers.py b/claspy/tests/test_markers.py new file mode 100644 index 0000000..f3dfbc6 --- /dev/null +++ b/claspy/tests/test_markers.py @@ -0,0 +1,54 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- + +from claspy.markers import validate_names +import pytest + + +@pytest.mark.parametrize( + "input_names,expected_names,expected_taxid", + [ + ( + ("PENTAD", "AmeLoGenIN", "D21S11"), + {"PENTAD": "Penta D", "AmeLoGenIN": "Amelogenin", "D21S11": "D21S11"}, + 9606, + ), + ( + ("d8s1179", "PentaE", "Se33", "Tpox"), + {"d8s1179": "D8S1179", "PentaE": "Penta E", "Se33": "SE33", "Tpox": "TPOX"}, + 9606, + ), + ( + ("CSF1PO", "fgA", "D3S1358"), + {"CSF1PO": "CSF1PO", "fgA": "FGA", "D3S1358": "D3S1358"}, + 9606, + ), + ( + (" mousestr1-2 ", "MouseSTR8-1"), + {" mousestr1-2 ": "Mouse STR 1-2", "MouseSTR8-1": "Mouse STR 8-1"}, + 10090, + ), + ], +) +def test_validate_names_basic(input_names, expected_names, expected_taxid): + observed_names, observed_taxid = validate_names(input_names) + assert observed_names == expected_names + assert observed_taxid == expected_taxid + + +def test_validate_names_invalid_marker(): + with pytest.raises(ValueError, match=r"invalid marker name\(s\): Penta G"): + validate_names(("CSF1PO", "Penta G", "D2S1338")) + + +def test_validate_names_mixed_species(): + message = r"list of marker names includes markers from different species: dog, human" + with pytest.raises(ValueError, match=message): + validate_names(("vWA", "DogPEZ8")) diff --git a/claspy/tests/test_profile.py b/claspy/tests/test_profile.py new file mode 100644 index 0000000..4791122 --- /dev/null +++ b/claspy/tests/test_profile.py @@ -0,0 +1,164 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. 
+# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from claspy import Profile +from claspy.db import CellosaurusDB +from claspy.tests import data_file +import pytest + + +def test_profile_basic(capsys): + alleles = { + "CSF1PO": "13,14", + "D5S818": "13", + "D7S820": "8", + "D13S317": "12", + "FGA": "24", + "TH01": "8", + "TPOX": "11", + "vWA": "16", + } + meta = {"sample": "sample1"} + profile = Profile(alleles, meta) + assert len(profile) == 9 + assert next(iter(profile)) == ("CSF1PO", "13") + assert profile.taxid == 9606 + profile._meta["taxid"] = [9606, 10116] + assert profile.taxid_match(9606) is True + assert profile.taxid_match(10116) is True + assert profile.taxid_match(10090) is False + score, num_shared_alleles = Profile.score(profile, profile) + assert score == pytest.approx(1.0) + assert num_shared_alleles == 9 + print(profile) + terminal = capsys.readouterr() + observed = terminal.out + expected = """ +Sample,Marker,Allele1,Allele2 +sample1,CSF1PO,13,14 +sample1,D5S818,13, +sample1,D7S820,8, +sample1,D13S317,12, +sample1,FGA,24, +sample1,TH01,8, +sample1,TPOX,11, +sample1,vWA,16, +""" + assert observed.strip() == expected.strip() + + +@pytest.mark.parametrize( + "path,message", + [ + ("query-wide.csv", r"found 11 allele columns, well above expected limit"), + ("query-bad-allele-1.csv", r"expected column 'Allele1' missing"), + ("query-bad-allele-2.csv", r"invalid table header 'AlleleTwo'"), + ], +) +def test_load_failure_modes(path, message): + with pytest.raises(ValueError, match=message): + 
next(Profile.load(data_file(path))) + + +@pytest.mark.parametrize( + "input,expected", + [ + ("7", {"7"}), + ("7,8", {"7", "8"}), + ("7, 8", {"7", "8"}), + ("10.2,13", {"10.2", "13"}), + ("11.1", {"11.1"}), + ], +) +def test_parse_allele_string(input, expected): + assert Profile.parse_allele_string(input) == expected + + +@pytest.mark.parametrize( + "input,expected", + [ + ({"13", "7"}, "7,13"), + ({"13"}, "13"), + ({"19", "21", "9"}, "9,19,21"), + ], +) +def test_allele_representation_is_sorted(input, expected): + assert Profile.allele_repr(input) == expected + + +def test_allele_transform_failure_mode(): + with pytest.raises(ValueError, match=r"unexpected allele 'Z'"): + Profile.allele_transform("Z") + + +@pytest.mark.parametrize( + "algorithm,mode,amel,exp_alleles,exp_score", + [ + ("Tanabe", "intersect", True, 21, 0.933333), + ("Tanabe", "intersect", False, 19, 0.9268293), + ("Tanabe", "query", False, 19, 0.904762), + ("Tanabe", "reference", False, 19, 0.883721), + ("query", "intersect", False, 19, 0.95), + ("reference", "intersect", False, 19, 0.904762), + ], +) +def test_score_basic(algorithm, mode, amel, exp_alleles, exp_score): + profiles = CellosaurusDB.load(path=data_file("upci-scc-077-db.json")) + assert len(profiles) == 2 + query, reference = profiles + score, shared_alleles = Profile.score( + query, reference, algorithm=algorithm, mode=mode, amel=amel + ) + assert score == pytest.approx(exp_score) + assert shared_alleles == exp_alleles + + +@pytest.mark.parametrize( + "algorithm,mode,message", + [ + ("Tanabe", "lizard", r"unsupported scoring mode 'lizard'"), + ("AI", "intersect", r"unsupported scoring algorithm 'AI'"), + ], +) +def test_score_failure_modes(algorithm, mode, message): + query = next(Profile.load(data_file("mock-cvcl-1085.csv"))) + reference = next(Profile.load(data_file("db-cvcl-1085.csv"))) + with pytest.raises(ValueError, match=message): + Profile.score(query, reference, algorithm=algorithm, mode=mode) + + +def test_parse_and_score(): 
+ profiles = CellosaurusDB.load(path=data_file("examples.json")) + assert len(profiles) == 2 + score, num_shared_alleles = Profile.score(profiles[0], profiles[1]) + assert score == pytest.approx(0.7) + assert num_shared_alleles == 7 + + +def test_score(): + query = next(Profile.load(data_file("mock-cvcl-1085.csv"))) + reference = next(Profile.load(data_file("db-cvcl-1085.csv"))) + score, num_shared_alleles = Profile.score(query, reference) + assert score == pytest.approx(0.9677, abs=1e-4) + assert num_shared_alleles == 30 + + +@pytest.mark.parametrize( + "allele_set,expected", + [ + ({"9", "13"}, "9,13"), + ({"3", "21"}, "3,21"), + ({"7"}, "7"), + ({"Y", "X"}, "X,Y"), + ({"9.3", "13"}, "9.3,13"), + ], +) +def test_allele_repr(allele_set, expected): + assert Profile.allele_repr(allele_set) == expected diff --git a/claspy/tests/test_result.py b/claspy/tests/test_result.py new file mode 100644 index 0000000..ca26cf0 --- /dev/null +++ b/claspy/tests/test_result.py @@ -0,0 +1,109 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. 
+# ------------------------------------------------------------------------------------------------- +from claspy.db import CellosaurusDB +from claspy.str_profile import Profile +from claspy.tests import data_file +from io import StringIO +import pandas as pd +import pytest + + +@pytest.fixture(scope="session") +def skhep_result(): + db = CellosaurusDB.load(data_file("skhep1-db.json")) + query = next(Profile.load(data_file("mock-sk-hep-1.csv"))) + return db.search(query, maxhits=5, minscore=0.9) + + +@pytest.fixture(scope="session") +def skhep_result_multi_sample(): + db = CellosaurusDB.load(data_file("skhep1-db.json")) + all_results = list() + for query in Profile.load(data_file("mock-sk-hep-1-2samples.csv")): + all_results.append(db.search(query, maxhits=3, minscore=0.9)) + return all_results + + +def test_search_result_basic(skhep_result): + assert skhep_result.maxhits == 5 + assert skhep_result.minscore == pytest.approx(0.9) + assert len(skhep_result.results_by_cell_line) == 11 + + +def test_search_result_summary(skhep_result): + observed = skhep_result.summary + assert len(observed) == 5 + exp_data = StringIO( + """ +Sample,CellLine,Score,SharedAlleles,Source +mock,SK-HEP-1,0.981818,27,PubMed=25877200 +mock,SK-HEP-1-Cas9-727,0.980392,25,CCRID +mock,SK-HEP-1-Cas9-726,0.961538,25,CCRID +mock,SK-HEP-1-Cas9-728,0.961538,25,CCRID +mock,SK-HEP-1-Cas9-729,0.961538,25,CCRID""" + ) + expected = pd.read_csv(exp_data) + pd.testing.assert_frame_equal(observed, expected, check_exact=False, rtol=1e-6) + + +def test_search_result_full_report(skhep_result): + observed = skhep_result.full_report + assert len(observed) == 7 + observed = observed.to_csv(sep=";", index=False) + expected = """ +Sample;CellLine;Status;Score;SharedAlleles;Source;Amelogenin;CSF1PO;D12S391;D13S317;D16S539;D18S51;D19S433;D21S11;D2S1338;D3S1358;D5S818;D6S1043;D7S820;D8S1179;FGA;Penta D;Penta E;TH01;TPOX;vWA 
+mock;mock;query;;;;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;14;17;13,14;;7,9;9;14,17 +mock;SK-HEP-1;best;0.9818181818181818;27;PubMed=25877200;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;13;7,9;9;14,17 +mock;SK-HEP-1;worst;0.9818181818181818;27;ATCC;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;;7,9;9;14,17 +mock;SK-HEP-1-Cas9-727;only;0.9803921568627451;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock;SK-HEP-1-Cas9-726;only;0.9615384615384616;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31,32;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock;SK-HEP-1-Cas9-728;only;0.9615384615384616;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31,32;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock;SK-HEP-1-Cas9-729;only;0.9615384615384616;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31,32;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17""" + assert observed.strip() == expected.strip() + + +def test_search_result_summary_multisamples(skhep_result_multi_sample): + observed = pd.concat( + [result.summary for result in skhep_result_multi_sample], ignore_index=True + ) + assert len(observed) == 6 + exp_data = StringIO( + """ +Sample,CellLine,Score,SharedAlleles,Source +mock_1,SK-HEP-1,0.981818,27,PubMed=25877200 +mock_1,SK-HEP-1-Cas9-727,0.980392,25,CCRID +mock_1,SK-HEP-1-Cas9-726,0.961538,25,CCRID +mock_2,SK-HEP-1,0.981818,27,PubMed=25877200 +mock_2,SK-HEP-1-Cas9-727,0.980392,25,CCRID +mock_2,SK-HEP-1-Cas9-726,0.961538,25,CCRID""" + ) + expected = pd.read_csv(exp_data) + pd.testing.assert_frame_equal(observed, expected, check_exact=False, rtol=1e-6) + + +def test_search_result_full_report_multisamples(skhep_result_multi_sample): + observed = pd.concat([result.full_report for result in skhep_result_multi_sample]) + assert len(observed) == 10 + observed = observed.to_csv(sep=";", index=False) + expected = """ 
+Sample;CellLine;Status;Score;SharedAlleles;Source;Amelogenin;CSF1PO;D12S391;D13S317;D16S539;D18S51;D19S433;D21S11;D2S1338;D3S1358;D5S818;D6S1043;D7S820;D8S1179;FGA;Penta D;Penta E;TH01;TPOX;vWA +mock_1;mock_1;query;;;;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;14;17;13,14;;7,9;9;14,17 +mock_1;SK-HEP-1;best;0.9818181818181818;27;PubMed=25877200;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;13;7,9;9;14,17 +mock_1;SK-HEP-1;worst;0.9818181818181818;27;ATCC;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;;7,9;9;14,17 +mock_1;SK-HEP-1-Cas9-727;only;0.9803921568627451;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock_1;SK-HEP-1-Cas9-726;only;0.9615384615384616;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31,32;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock_2;mock_2;query;;;;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;14;17;13,14;;7,9;9;14,17 +mock_2;SK-HEP-1;best;0.9818181818181818;27;PubMed=25877200;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;13;7,9;9;14,17 +mock_2;SK-HEP-1;worst;0.9818181818181818;27;ATCC;X;11,12;18;8,12;12;13,15;;29,31;20,23;16;10,13;11;8,11;13,14;17;13,14;;7,9;9;14,17 +mock_2;SK-HEP-1-Cas9-727;only;0.9803921568627451;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +mock_2;SK-HEP-1-Cas9-726;only;0.9615384615384616;25;CCRID;X;11,12;18;8,12;12;13,15;12,15.2;29,31,32;20,23;16;10,13;11;8,11;13,14;17;;13,21;7,9;9;14,17 +""" + + assert observed.strip() == expected.strip() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..03ad3be --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = claspy/_version.py +versionfile_build = claspy/_version.py +tag_prefix = +parentdir_prefix = claspy- diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..12eb1ca --- 
/dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2023, DHS. +# This file is part of claspy: https://github.com/bioforensics/claspy +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +from setuptools import setup +import versioneer + + +with open("README.md", "r") as infile: + longdesc = infile.read() + +setup( + name="claspy", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + description="Clapsy: cell line authentication with STRs in Python", + long_description=longdesc, + long_description_content_type="text/markdown", + url="https://github.com/bioforensics/claspy", + packages=["claspy", "claspy.tests"], + package_data={"claspy": ["claspy/tests/data/*"]}, + include_package_data=True, + install_requires=[ + "black==24.3", + "pandas>=2.0", + "pytest>=6.0", + "pytest-cov>=3.0", + "tabulate>=0.9", + "tqdm>=3.0", + ], + entry_points={"console_scripts": ["claspy = claspy:main", "claspy_db = claspy:db_main"]}, + classifiers=[ + "Environment :: Console", + "Framework :: IPython", + "Framework :: Jupyter", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], + zip_safe=True, +) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..9713007 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,2064 @@ + +# Version: 0.20 + +"""The Versioneer - like a rocketeer, but for versions. 
+ +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. 
"myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. 
+ +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
+ +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/python-versioneer/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. 
+ +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . 
def get_root():
    """Get the project root directory.

    We require that all commands are run from the project root, i.e. the
    directory that contains setup.py, setup.cfg, and versioneer.py .
    """
    def has_project_files(candidate):
        # A plausible root holds setup.py or versioneer.py.
        return (os.path.exists(os.path.join(candidate, "setup.py"))
                or os.path.exists(os.path.join(candidate, "versioneer.py")))

    root = os.path.realpath(os.path.abspath(os.getcwd()))
    if not has_project_files(root):
        # allow 'python path/to/setup.py COMMAND'
        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
        if not has_project_files(root):
            err = ("Versioneer was unable to run the project root directory. "
                   "Versioneer requires setup.py to be executed from "
                   "its immediate directory (like 'python setup.py COMMAND'), "
                   "or in a way that lets it use sys.argv[0] to find the root "
                   "(like 'python path/to/setup.py COMMAND').")
            raise VersioneerBadRootError(err)
    versioneer_py = os.path.join(root, "versioneer.py")
    try:
        # Certain runtime workflows (setup.py install/develop in a setuptools
        # tree) execute all dependencies in a single python process, so
        # "versioneer" may be imported multiple times, and python's shared
        # module-import table will cache the first one.  So we can't rely on
        # os.path.dirname(__file__): it may point at whichever versioneer.py
        # was imported first, even in a different project.  Warn if so.
        my_path = os.path.realpath(os.path.abspath(__file__))
        me_dir = os.path.normcase(os.path.splitext(my_path)[0])
        vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
        if me_dir != vsr_dir:
            print("Warning: build in %s is using versioneer.py from %s"
                  % (os.path.dirname(my_path), versioneer_py))
    except NameError:
        pass
    return root
def get_config_from_root(root):
    """Read the project setup.cfg file to determine Versioneer config."""
    # This might raise EnvironmentError (if setup.cfg is missing), or
    # configparser.NoSectionError (if it lacks a [versioneer] section), or
    # configparser.NoOptionError (if it lacks "VCS=").  See the docstring at
    # the top of versioneer.py for instructions on writing your setup.cfg .
    parser = configparser.ConfigParser()
    with open(os.path.join(root, "setup.cfg"), "r") as cfg_file:
        parser.read_file(cfg_file)
    VCS = parser.get("versioneer", "VCS")  # mandatory

    # Dict-like interface for non-mandatory entries
    section = parser["versioneer"]

    # pylint:disable=attribute-defined-outside-init # noqa
    cfg = VersioneerConfig()
    cfg.VCS = VCS
    cfg.style = section.get("style", "")
    cfg.versionfile_source = section.get("versionfile_source")
    cfg.versionfile_build = section.get("versionfile_build")
    raw_prefix = section.get("tag_prefix")
    # An explicitly quoted empty prefix ('' or "") means "no tag prefix".
    cfg.tag_prefix = "" if raw_prefix in ("''", '""') else raw_prefix
    cfg.parentdir_prefix = section.get("parentdir_prefix")
    cfg.verbose = section.get("verbose")
    return cfg


class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""


# these dictionaries contain VCS-specific tools
LONG_VERSION_PY = {}
HANDLERS = {}


def register_vcs_handler(vcs, method):  # decorator
    """Create decorator to mark a method as the handler of a VCS."""
    def decorate(func):
        """Store func in HANDLERS[vcs][method]."""
        HANDLERS.setdefault(vcs, {})[method] = func
        return func
    return decorate


# pylint:disable=too-many-arguments,consider-using-with # noqa
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
                env=None):
    """Call the given command(s), returning (stdout, returncode).

    Each name in *commands* is tried in turn until one can be launched;
    (None, None) is returned if none of them could be started at all.
    """
    assert isinstance(commands, list)
    process = None
    dispcmd = None
    for command in commands:
        dispcmd = str([command] + args)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen(
                [command] + args, cwd=cwd, env=env,
                stdout=subprocess.PIPE,
                stderr=(subprocess.PIPE if hide_stderr else None))
            break
        except EnvironmentError:
            exc = sys.exc_info()[1]
            if exc.errno == errno.ENOENT:
                # this candidate isn't installed; try the next one
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(exc)
            return None, None
    else:
        # loop exhausted without a successful Popen
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
    stdout = process.communicate()[0].strip().decode()
    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, process.returncode
    return stdout, process.returncode
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: # pylint: disable=too-few-public-methods + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +# pylint:disable=too-many-arguments,consider-using-with # noqa +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" 
%% (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. 
+ date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. 
+ + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for _ in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. 
+ date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + my_path = __file__ + if my_path.endswith(".pyc") or my_path.endswith(".pyo"): + my_path = os.path.splitext(my_path)[0] + ".py" + versioneer_file = os.path.relpath(my_path) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + with open(".gitattributes", "r") as fobj: + for line in fobj: + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + break + except EnvironmentError: + pass + if not present: + with open(".gitattributes", "a+") as fobj: + fobj.write(f"{versionfile_source} export-subst\n") + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.20) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if 
"+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(cmdclass=None): + """Get the custom setuptools/distutils subclasses used by Versioneer. + + If the package uses a different cmdclass (e.g. one from numpy), it + should be provide as an argument. + """ + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if 'build_py' in cmds: + _build_py = cmds['build_py'] + elif "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if 'build_ext' in cmds: + _build_ext = cmds['build_ext'] + elif "setuptools" in sys.modules: + from setuptools.command.build_ext import build_ext as _build_ext + else: + from distutils.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_ext"] = cmd_build_ext + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. 
+ # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? + from py2exe.distutils_buildexe import py2exe as _py2exe + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if 'sdist' in cmds: + _sdist = cmds['sdist'] + elif "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + # pylint:disable=attribute-defined-outside-init # noqa + self._versioneer_generated_versions = versions + # 
unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +OLD_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + +INIT_PY_SNIPPET = """ +from . 
import {0} +__version__ = {0}.get_versions()['version'] +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] + snippet = INIT_PY_SNIPPET.format(module) + if OLD_SNIPPET in old: + print(" replacing boilerplate in %s" % ipy) + with open(ipy, "w") as f: + f.write(old.replace(OLD_SNIPPET, snippet)) + elif snippet not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(snippet) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. 
+ manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. + do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). 
Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)