diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index e72a5006..00522850 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -17,6 +17,10 @@ jobs: steps: - uses: actions/checkout@v2 + - name: install macOS-specific dependencies + if: startsWith(matrix.os, 'macOS') + run: brew install md5sha1sum + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: diff --git a/docs/changelog.md b/docs/changelog.md index 5b57e609..fc6179f3 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,18 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.9.1] - 2020-07-29 + +### Added +- `force_large` argument in the `pull` method, which can be used to handle large archive downloads +- `add` method + +### Changed +- `getseq` method returns the sequence string instead of printing it to the screen + +### Deprecated +- `get_remote_data_str` method. 
Use `listr` instead + ## [0.9.0] - 2020-07-01 ### Changed diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index 51557765..03e5679d 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -5,7 +5,8 @@ from .helpers import * from .refgenconf import * -__all__ = ["RefGenConf", "select_genome_config", "GenomeConfigFormatError", - "MissingAssetError", "MissingConfigDataError", "MissingGenomeError", - "RefgenconfError", "UnboundEnvironmentVariablesError"] + \ - ["DEFAULT_SERVER"] + CFG_KEY_NAMES +__all__ = ["RefGenConf", "select_genome_config", "get_dir_digest", + "GenomeConfigFormatError", "MissingAssetError", + "MissingConfigDataError", "MissingGenomeError", "RefgenconfError", + "UnboundEnvironmentVariablesError"] + ["DEFAULT_SERVER"] + \ + CFG_KEY_NAMES diff --git a/refgenconf/_version.py b/refgenconf/_version.py index 3e2f46a3..d69d16e9 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.9.0" +__version__ = "0.9.1" diff --git a/refgenconf/helpers.py b/refgenconf/helpers.py index a7251ba7..470819f2 100644 --- a/refgenconf/helpers.py +++ b/refgenconf/helpers.py @@ -1,11 +1,13 @@ """ Helper functions """ import os -import yacman -from .const import CFG_ENV_VARS +from yacman import select_config +from .const import CFG_ENV_VARS, BUILD_STATS_DIR +from re import sub +from ubiquerg import is_command_callable -__all__ = ["select_genome_config"] +__all__ = ["select_genome_config", "get_dir_digest"] def select_genome_config(filename=None, conf_env_vars=CFG_ENV_VARS, **kwargs): @@ -17,7 +19,7 @@ def select_genome_config(filename=None, conf_env_vars=CFG_ENV_VARS, **kwargs): consider; basically, a prioritized search list :return str: path to genome configuration file """ - return yacman.select_config(filename, conf_env_vars, **kwargs) + return select_config(filename, conf_env_vars, **kwargs) def unbound_env_vars(path): @@ -38,3 +40,32 @@ def unbound_env_vars(path): def asciify_json_dict(json_dict): from 
def get_dir_digest(path, pm=None):
    """
    Generate a MD5 digest that reflects just the contents of the
    files in the selected directory.

    Every regular file found under the directory, except for the ones
    matching the build stats directory prefix (BUILD_STATS_DIR), is hashed
    with md5sum. The per-file hashes are ordered by file path and hashed
    again to produce a single value.

    :param str path: path to the directory to digest
    :param pypiper.PipelineManager pm: a pipeline object, optional.
        The subprocess module will be used if not provided (or if the
        object has no callable 'checkprint' attribute)
    :return str | NoneType: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334;
        None if the shell command run with the subprocess module failed
    :raise OSError: if the md5sum command line tool is not callable
    """
    if not is_command_callable("md5sum"):
        raise OSError("md5sum command line tool is required for asset digest "
                      "calculation. \n"
                      "Install and try again, e.g on macOS: 'brew install "
                      "md5sha1sum'")
    from shlex import quote
    # Expand '~' and environment variables here, since quoting the path
    # (needed for paths with spaces or shell metacharacters) disables the
    # expansion that the shell would otherwise perform.
    target = quote(os.path.expanduser(os.path.expandvars(path)))
    # '&&' rather than ';': if 'cd' fails the pipeline must not go on to
    # digest whatever the current working directory happens to be.
    cmd = "cd {} && find . -type f -not -path './{}*' -exec md5sum {{}} \\; " \
          "| sort -k 2 | awk '{{print $1}}' | md5sum".\
        format(target, BUILD_STATS_DIR)
    # Look the method up explicitly instead of catching AttributeError, so
    # that an AttributeError raised inside the pipeline call is not mistaken
    # for "no pipeline manager provided".
    checkprint = getattr(pm, "checkprint", None)
    if callable(checkprint):
        output = checkprint(cmd)
    else:
        import logging
        from subprocess import CalledProcessError, check_output
        try:
            output = check_output(cmd, shell=True).decode("utf-8")
        except (OSError, CalledProcessError, UnicodeDecodeError) as e:
            # best effort: callers get None, but the reason is reported
            logging.getLogger(__name__).warning(
                "Could not calculate digest for '{}': {}".format(path, e))
            return None
    return str(sub(r'\W+', '', output))  # strips non-alphanumeric
def add(self, path, genome, asset, tag=None, seek_keys=None, force=False):
    """
    Add an external asset to the config

    :param str path: a path to the asset to add; must exist and be relative
        to the genome_folder
    :param str genome: genome name
    :param str asset: asset name
    :param str tag: tag name; the default tag for the genome/asset pair is
        used if not provided
    :param dict seek_keys: seek keys to add; a single seek key named after
        the asset and pointing to "." is used if not provided
    :param bool force: whether to force existing asset overwrite
    :return bool: True if the asset was added, False if the asset exists
        and the overwrite was not confirmed
    :raise OSError: if the path joined with the genome_folder does not
        exist or is not absolute
    """
    tag = tag or self.get_default_tag(genome, asset)
    abspath = os.path.join(self[CFG_FOLDER_KEY], path)
    if not os.path.exists(abspath) or not os.path.isabs(abspath):
        raise OSError("Provided path must exist and be relative to the"
                      " genome_folder: {}".format(self[CFG_FOLDER_KEY]))
    try:
        _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag)
    except Exception:
        # the assertion failed, so there is no record to overwrite
        remove = False
    else:
        if not force and not \
                query_yes_no("'{}/{}:{}' exists. Do you want to overwrite?".
                             format(genome, asset, tag)):
            _LOGGER.info("Aborted by a user, asset not added")
            return False
        remove = True
        _LOGGER.info("Will remove existing to overwrite")
    tag_data = {
        CFG_ASSET_PATH_KEY: path,
        # digest the validated location; the bare 'path' is relative to the
        # genome_folder and would be resolved against the working directory
        CFG_ASSET_CHECKSUM_KEY: get_dir_digest(abspath) or ""
    }
    msg = "Added asset: {}/{}:{} {}".format(
        genome, asset, tag, "" if not seek_keys else "with seek keys: {}".
        format(seek_keys))

    def _register(rgc):
        """ Write the asset record to the provided config object """
        if remove:
            rgc.cfg_remove_assets(genome, asset, tag)
        rgc.update_tags(genome, asset, tag, tag_data)
        rgc.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."})
        rgc.set_default_pointer(genome, asset, tag)

    if self.file_path:
        # file-backed object: perform the updates within its context manager
        # NOTE(review): presumably this locks and writes the file -- confirm
        with self as rgc:
            _register(rgc)
    else:
        _register(self)
    _LOGGER.info(msg)
    return True
+ format(self[CFG_FOLDER_KEY], genome_name, fullpath)) if check_exist(fullpath): return fullpath - elif strict_exists is None: - return fullpath - msg = "For genome '{}' the asset '{}.{}:{}' doesn't exist; " \ - "tried {} and {}".format(genome_name, asset_name, seek_key, - tag_name, path, fullpath) - extant = [] - for base, ext in itertools.product([path, fullpath], [".tar.gz", ".tar"]): - # Attempt to enrich message with extra guidance. - p_prime = base + ext - if check_exist(p_prime): - extant.append(p_prime) - if extant: - msg += ". These paths exist: {}".format(extant) + # option3: relative to the genome_folder (if option2 does not exist) + gf_relpath = os.path.join( + self[CFG_FOLDER_KEY], + _genome_asset_path(self[CFG_GENOMES_KEY], genome_name, asset_name, + tag_name, seek_key, enclosing_dir, no_tag=True) + ) + _LOGGER.debug("Trying path relative to genome_folder ({}): {}". + format(self[CFG_FOLDER_KEY], gf_relpath)) + if check_exist(gf_relpath): + return gf_relpath + + msg = "For genome '{}' the asset '{}.{}:{}' doesn't exist; tried: {}".\ + format(genome_name, asset_name, seek_key, tag_name, + ",".join([path, gf_relpath, fullpath])) + # return option2 if existence not enforced + if strict_exists is None: + _LOGGER.debug(msg) if strict_exists is True: raise OSError(msg) else: warnings.warn(msg, RuntimeWarning) - return path + return fullpath def get_default_tag(self, genome, asset, use_existing=True): """ @@ -412,26 +483,29 @@ def get_remote_data_str(self, genome=None, order=None, get_url=lambda server, id names for sort :return str, str: text reps of remotely available genomes and assets """ - url = get_url(self[CFG_SERVERS_KEY], API_ID_ASSETS) - _LOGGER.info("Querying available assets: {}".format(url)) - genomes, assets = _list_remote(url, genome, order) - return genomes, assets + warnings.warn( + "Please use listr method instead; get_remote_data_str will be " + "removed in the next release.", category=DeprecationWarning + ) + return self.listr(genome, 
def listr(self, genome=None, order=None, get_url=lambda server, id: construct_request_url(server, id), as_str=False):
    """
    List genomes and assets available remotely on all servers the object
    subscribes to

    :param list[str] | str genome: genomes that the assets should be found for
    :param function(str) -> object order: how to key genome IDs and asset
        names for sort
    :param function(str, str) -> str get_url: how to build the request URL;
        called with a subscribed server URL and the assets API operation ID
    :param bool as_str: forwarded to the remote listing helper as its
        'as_str' argument
    :return dict: remote listing results keyed by the request URL that was
        built for each subscribed server
    """
    # request URLs are built lazily, one per subscribed server
    request_urls = (get_url(server, API_ID_ASSETS)
                    for server in self[CFG_SERVERS_KEY])
    return {
        endpoint: _list_remote(endpoint, genome, order, as_str=as_str)
        for endpoint in request_urls
    }
def getseq(self, genome, locus, as_str=False):
    """
    Return the sequence found in a selected range and chromosome.
    Something like the refget protocol.

    The coordinates, if provided, are used as the start and stop of a
    Python-style slice of the selected pyfaidx record.

    :param str genome: name of the sequence identifier
    :param str locus: coordinates of desired sequence, e.g. 'chr1:1-10';
        a bare sequence name, e.g. 'chr1', selects the whole record
    :param bool as_str: whether to convert the returned object to string
        and return just the sequence
    :return str | pyfaidx.FastaRecord | pyfaidx.Sequence: selected sequence
    :raise ValueError: if the coordinates are not formatted as 'start-end'
        with integer values
    """
    import pyfaidx
    fa = pyfaidx.Fasta(self.seek(genome, "fasta", strict_exists=True))
    locus_split = locus.split(":")
    record = fa[locus_split[0]]
    if len(locus_split) == 1:
        return str(record) if as_str else record
    try:
        start, end = (int(coord) for coord in locus_split[1].split("-"))
    except ValueError:
        # covers both a wrong number of fields and non-integer values
        raise ValueError("Could not parse coordinates from locus '{}'; "
                         "expected format: 'chr1:1-10'".format(locus))
    _LOGGER.debug("chr: '{}', start: '{}', end: '{}'".
                  format(locus_split[0], start, end))
    selected = record[start:end]
    return str(selected) if as_str else selected
@@ -1322,7 +1414,7 @@ def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir): :return str: raw path value for a particular asset for a particular genome :raise MissingGenomeError: if the given key-value pair collection does not contain as a top-level key the given genome ID - :raise MissingAssetError: if the given key-value pair colelction does + :raise MissingAssetError: if the given key-value pair collection does contain the given genome ID, but that key's mapping doesn't contain the given asset name as a key :raise GenomeConfigFormatError: if it's discovered during the query that @@ -1332,19 +1424,25 @@ def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir): _assert_gat_exists(genomes, gname, aname, tname) asset_tag_data = genomes[gname][CFG_ASSETS_KEY][aname][CFG_ASSET_TAGS_KEY][tname] if enclosing_dir: + if no_tag: + return asset_tag_data[CFG_ASSET_PATH_KEY] return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname) if seek_key is None: if aname in asset_tag_data[CFG_SEEK_KEYS_KEY]: seek_key = aname else: + if no_tag: + return asset_tag_data[CFG_ASSET_PATH_KEY] return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname) try: seek_key_value = asset_tag_data[CFG_SEEK_KEYS_KEY][seek_key] - appendix = "" if seek_key_value == "." else seek_key_value - return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname, appendix) except KeyError: raise MissingSeekKeyError("genome/asset:tag bundle '{}/{}:{}' exists, but seek_key '{}' is missing". 
def _is_large_archive(size, cutoff=10):
    """
    Determines if the file is large based on a string formatted as follows: 15.4GB

    The size is converted to gigabytes (decimal units, 1GB = 1000MB) and
    compared to the cutoff. KB, MB, GB and TB units are recognized,
    case-insensitively; a size expressed in any other unit is never
    considered large.

    :param str size: size string
    :param float cutoff: size, in GB, that needs to be exceeded for the
        file to be considered large
    :return bool: the decision
    :raise ValueError: if the unit is recognized, but no number can be
        parsed from the size string
    """
    _LOGGER.debug("Checking archive size: '{}'".format(size))
    # power of 1000 that separates each recognized unit from a gigabyte
    gb_exponents = {"KB": -2, "MB": -1, "GB": 0, "TB": 1}
    text = size.strip()
    exponent = gb_exponents.get(text[-2:].upper())
    if exponent is None:
        return False
    # remove any letters from the size string and cast the remainder to float
    value = float("".join(c for c in text if c in '0123456789.'))
    if exponent < 0:
        size_gb = value / 1000 ** -exponent
    else:
        # terabytes are converted as well, so they are subject to the cutoff
        size_gb = value * 1000 ** exponent
    return size_gb > cutoff
- ["genome", "asset", "tag"], [("rCRSd", "fasta", "default"), ("rCRSd", "fasta", "default")]) -def test_no_unpack(rgc, genome, asset, tag): - """ Tarballs must be unpacked. """ - with pytest.raises(NotImplementedError): - rgc.pull(genome, asset, tag, unpack=False) - @pytest.mark.parametrize(["gname", "aname"], [("human_repeats", 1), ("mouse_chrM2x", None)]) def test_pull_asset_illegal_asset_name(rgc, gname, aname): @@ -36,15 +29,6 @@ def test_pull_asset_illegal_asset_name(rgc, gname, aname): with pytest.raises(TypeError): rgc.pull(gname, aname) -@pytest.mark.parametrize(["gname", "aname", "tname"], - [("human_repeats", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")]) -def test_negative_response_to_large_download_prompt(rgc, gname, aname, tname): - """ Test responsiveness to user abortion of pull request. """ - with mock.patch("refgenconf.refgenconf._is_large_archive", return_value=True), \ - mock.patch("refgenconf.refgenconf.query_yes_no", return_value=False): - gat, archive_dict, server_url = rgc.pull(gname, aname, tname) - assert gat == [gname, aname, tname] - @pytest.mark.parametrize(["gname", "aname", "tname"], [("human_repeats", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")]) @@ -115,7 +99,8 @@ def test_pull_asset_updates_genome_config(cfg_file, gname, aname, tname): @pytest.mark.parametrize(["gname", "aname", "tname", "state"], - [("human_repeats", "fasta", "default", True), + [("rCRSd", "fasta", "default", True), + ("human_repeats", "fasta", "default", True), ("mouse_chrM2x", "fasta", "default", False)]) def test_pull_asset_works_with_nonwritable_and_writable_rgc(cfg_file, gname, aname, tname, state): rgc = RefGenConf(filepath=cfg_file, writable=state) diff --git a/tests/test_add.py b/tests/test_add.py new file mode 100644 index 00000000..316f6505 --- /dev/null +++ b/tests/test_add.py @@ -0,0 +1,44 @@ + +""" Tests for RefGenConf.add. 
class TestAdd:
    """ Tests for adding external assets to the config with RefGenConf.add """

    # genome/asset/tag bundles shared by the tests that need an existing asset
    _GAT_NAMES = ["gname", "aname", "tname"]
    _GAT_CASES = [("human_repeats", "fasta", "default"),
                  ("rCRSd", "fasta", "default")]

    @pytest.mark.parametrize(
        ["pth", "gname", "aname", "tname"],
        [("bogus/path/file.txt", "rCRSd", "fasta", "default"),
         ("bogus/path/file.txt", "rCRSd", "fasta", "default")]
    )
    def test_nonexistent_file(self, cfg_file, pth, gname, aname, tname):
        """ Adding a path that does not exist raises an OSError """
        conf = RefGenConf(filepath=cfg_file)
        with pytest.raises(OSError):
            conf.add(pth, gname, aname, tname)

    @pytest.mark.parametrize(_GAT_NAMES, _GAT_CASES)
    def test_preexisting_asset_prompt(self, cfg_file, gname, aname, tname):
        """ Nothing is added when the overwrite prompt is answered 'no' """
        conf = RefGenConf(filepath=cfg_file)
        asset_path = conf.seek(
            genome_name=gname, asset_name=aname, tag_name=tname)
        decline = mock.patch(
            "refgenconf.refgenconf.query_yes_no", return_value=False)
        with decline:
            added = conf.add(asset_path, gname, aname, tname)
            assert not added

    @pytest.mark.parametrize(_GAT_NAMES, _GAT_CASES)
    def test_force_overwrite_asset(self, cfg_file, gname, aname, tname):
        """ An asset can be added and then overwritten with force """
        conf = RefGenConf(filepath=cfg_file)
        asset_dir = conf.seek(genome_name=gname, asset_name=aname,
                              tag_name=tname, enclosing_dir=True)
        new_gname = gname + "_new"
        assert conf.add(asset_dir, new_gname, aname, tname)
        assert conf.add(asset_dir, new_gname, aname, tname, force=True)

    @pytest.mark.parametrize(_GAT_NAMES, _GAT_CASES)
    def test_nofile(self, cfg_file, gname, aname, tname):
        """ An asset can be added to an object not bound to a file """
        asset_dir = RefGenConf(filepath=cfg_file).seek(
            gname, aname, tname, enclosing_dir=True)
        fileless_conf = RefGenConf()
        assert fileless_conf.add(asset_dir, gname, aname, tname,
                                 seek_keys={"file": "b"})
class TestGetSeq:
    """ Tests for the types and lengths of objects returned by getseq """

    # genome name paired with the name of a sequence it contains
    _CHR_CASES = [("rCRSd", "rCRSd"), ("human_repeats", "U14567.1")]
    _INTERVALS = [(1, 20), (2, 30), (1, 2), (2, 100)]

    @pytest.mark.parametrize(["gname", "chr"], _CHR_CASES)
    def test_qetseq_just_chr(self, ro_rgc, gname, chr):
        """ A bare sequence name yields the whole record """
        record = ro_rgc.getseq(genome=gname, locus=chr)
        assert isinstance(record, FastaRecord)

    @pytest.mark.parametrize(["gname", "chr"], _CHR_CASES)
    @pytest.mark.parametrize(["start", "end"], _INTERVALS)
    def test_qetseq_interval(self, ro_rgc, gname, chr, start, end):
        """ A locus with coordinates yields a sequence of the range length """
        locus = "{}:{}-{}".format(chr, start, end)
        seq = ro_rgc.getseq(genome=gname, locus=locus)
        assert isinstance(seq, Sequence)
        expected_length = end - start
        assert len(seq) == expected_length
def test_list_remote(rgc, tmpdir):
    """ Verify expected behavior of remote genome/asset listing. """
    entries = {
        CFG_FOLDER_KEY: tmpdir.strpath,
        CFG_SERVERS_KEY: [DEFAULT_SERVER],
        CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY],
    }
    remote = RefGenConf(entries=entries).listr()
    # the listing is keyed by the request URL built from the server URL
    first_endpoint = list(remote.keys())[0]
    assert first_endpoint.startswith(DEFAULT_SERVER)
    for listing in remote.values():
        assert isinstance(listing, OrderedDict)
        # one entry is expected per genome reported by the server
        remote_genomes = _download_json(DEFAULT_SERVER + "/genomes")
        assert len(listing) == len(remote_genomes)