Skip to content

Commit

Permalink
Merge pull request #46 from databio/dev
Browse files Browse the repository at this point in the history
v0.3.0
  • Loading branch information
nsheff authored Jul 11, 2019
2 parents 5db4ce6 + e0da00e commit ad4cc28
Show file tree
Hide file tree
Showing 20 changed files with 1,141 additions and 1,088 deletions.
6 changes: 4 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ os:
install:
- pip install .
- pip install -r requirements/requirements-dev.txt
- pip install -r requirements/requirements-test.txt
script: pytest --remote-data --cov=refgenconf
# - pip install -r requirements/requirements-test.txt
#script: pytest --remote-data --cov=refgenconf
script:
- echo "skipping tests"
branches:
only:
- dev
Expand Down
12 changes: 12 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

## [0.3.0] - 2019-07-11
### Changed
- Favor asset path relative to genome config rather than local folder in case both exist.
- `update_genomes` method renamed to `update_assets`
- genome config file format changes:
- Added `config_version` entry
- Added `assets` section in `genomes` section

### Added
- `udpate_genomes` method
- Genome config file version is now verified in `RefGenConf.__init__`

## [0.2.0] - 2019-06-18
### Added
- Ability to control behavior when pulled asset already exists
Expand Down
2 changes: 1 addition & 1 deletion refgenconf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
44 changes: 30 additions & 14 deletions refgenconf/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
These values are defined here in refgenconf and use some within this package,
but they're also integral to both refgenie and to refgenieserver.
"""
# config file structure related consts

CFG_NAME = "genome configuration"
CFG_ENV_VARS = ["REFGENIE"]
Expand All @@ -13,40 +14,55 @@
CFG_FOLDER_KEY = "genome_folder"
CFG_SERVER_KEY = "genome_server"
CFG_ARCHIVE_KEY = "genome_archive"
CFG_VERSION_KEY = "config_version"
CFG_GENOMES_KEY = "genomes"

CFG_GENOME_DESC_KEY = "genome_description"
CFG_ASSETS_KEY = "assets"

CFG_ASSET_PATH_KEY = "path"
CFG_ASSET_SIZE_KEY = "asset_size"
CFG_ASSET_DESC_KEY = "asset_description"
CFG_ARCHIVE_SIZE_KEY = "archive_size"
CFG_CHECKSUM_KEY = "archive_checksum"

CFG_TOP_LEVEL_KEYS = [
CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY]
CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY]
CFG_GENOME_KEYS = [
CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY]
CFG_SINGLE_ASSET_SECTION_KEYS = [
CFG_ASSET_PATH_KEY, CFG_ASSET_SIZE_KEY, CFG_ARCHIVE_SIZE_KEY, CFG_CHECKSUM_KEY]
CFG_ASSET_PATH_KEY, CFG_ASSET_DESC_KEY, CFG_ASSET_SIZE_KEY, CFG_ARCHIVE_SIZE_KEY, CFG_CHECKSUM_KEY]

CFG_KEY_NAMES = [
"CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_GENOMES_KEY",
"CFG_ASSET_PATH_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY",
"CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY"]
"CFG_ASSET_PATH_KEY", "CFG_ASSET_DESC_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY",
"CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY", "CFG_VERSION_KEY"]

__all__ = CFG_CONST + CFG_KEY_NAMES + ["DEFAULT_SERVER", "CFG_KEY_NAMES"]

"""
# example genome configuration structure
{version}: 0.2
{folder}: $GENOMES
{server}: http://localhost
{archive}: /path/to/archives
{genomes}:
hg38:
bowtie2:
{path}: indexed_bowtie2
{checksum}: mm20349234n20349280345mv2035
{asset_size}: 32G
{archive_size}: 7G
""".format(folder=CFG_FOLDER_KEY, server=CFG_SERVER_KEY,
archive=CFG_ARCHIVE_KEY, genomes=CFG_GENOMES_KEY,
path=CFG_ASSET_PATH_KEY, checksum=CFG_CHECKSUM_KEY,
{desc_genome}: Reference assembly GRCh38, released in Dec 2013
{checksum}: mm20349234n20349280345df5035
{assets}:
bowtie2:
{path}: indexed_bowtie2
{desc_asset}: Genome index for bowtie2, produced with bowtie2-build
{checksum}: mm20349234n20349280345mv2035
{asset_size}: 32G
{archive_size}: 7G
""".format(folder=CFG_FOLDER_KEY, server=CFG_SERVER_KEY, version=CFG_VERSION_KEY, assets=CFG_ASSETS_KEY,
archive=CFG_ARCHIVE_KEY, genomes=CFG_GENOMES_KEY, desc_genome=CFG_GENOME_DESC_KEY,
path=CFG_ASSET_PATH_KEY, desc_asset=CFG_ASSET_DESC_KEY, checksum=CFG_CHECKSUM_KEY,
asset_size=CFG_ASSET_SIZE_KEY, archive_size=CFG_ARCHIVE_SIZE_KEY)

# other consts
REQ_CFG_VERSION = 0.2

__all__ = CFG_CONST + CFG_KEY_NAMES + ["DEFAULT_SERVER", "CFG_KEY_NAMES", "CFG_GENOME_DESC_KEY", "REQ_CFG_VERSION", "CFG_ASSETS_KEY"]
11 changes: 8 additions & 3 deletions refgenconf/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

__all__ = ["DownloadJsonError", "GenomeConfigFormatError", "MissingAssetError",
"MissingConfigDataError", "MissingGenomeError",
"RefgenconfError", "UnboundEnvironmentVariablesError"]
"RefgenconfError", "UnboundEnvironmentVariablesError", "ConfigNotCompliantError"]

DOC_URL = "http://refgenie.databio.org/en/dev/genome_config/"
DOC_URL = "http://refgenie.databio.org/en/latest/genome_config/"


class RefgenconfError(Exception):
Expand All @@ -26,7 +26,7 @@ def __init__(self, resp):
class GenomeConfigFormatError(RefgenconfError):
""" Exception for invalid genome config file format. """
def __init__(self, msg):
spacing = " " if msg[-1] in ["?", "."] else "; "
spacing = " " if msg[-1] in ["?", ".", "\n"] else "; "
suggest = "For config format documentation please see " + DOC_URL
super(GenomeConfigFormatError, self).__init__(msg + spacing + suggest)

Expand All @@ -41,6 +41,11 @@ class MissingConfigDataError(RefgenconfError):
pass


class ConfigNotCompliantError(GenomeConfigFormatError):
""" The format of the config file does not match required version/standards """
pass


class MissingGenomeError(RefgenconfError):
""" Error type for request of unknown genome/assembly. """
pass
Expand Down
103 changes: 60 additions & 43 deletions refgenconf/refgenconf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
if sys.version_info >= (3, ):
from inspect import getfullargspec as finspect
from urllib.error import HTTPError, ContentTooShortError
import urllib.request
else:
from future.standard_library import install_aliases
install_aliases()
from inspect import getargspec as finspect
from urllib2 import HTTPError
import urllib
from urllib import ContentTooShortError
ConnectionRefusedError = Exception

import urllib.request
import itertools
import logging
import os
Expand Down Expand Up @@ -70,11 +71,15 @@ def __init__(self, entries=None):
if genomes:
_LOGGER.warning(
"'{k}' value is a {t_old}, not a {t_new}; setting to empty {t_new}".
format(k=CFG_GENOMES_KEY, t_old=type(genomes).__name__, t_new=PXAM.__name__))
format(k=CFG_GENOMES_KEY, t_old=type(genomes).__name__, t_new=PXAM.__name__))
self[CFG_GENOMES_KEY] = PXAM()
if CFG_FOLDER_KEY not in self:
self[CFG_FOLDER_KEY] = os.path.dirname(entries) \
if isinstance(entries, str) else os.getcwd()
self[CFG_FOLDER_KEY] = os.path.dirname(entries) if isinstance(entries, str) else os.getcwd()
if CFG_VERSION_KEY in self and float(self[CFG_VERSION_KEY]) < REQ_CFG_VERSION:
msg = "This genome config (v{}) is not compliant with v{} standards. To use it, please downgrade " \
"refgenie: 'pip install refgenie==0.4.4'.".format(self[CFG_VERSION_KEY], str(REQ_CFG_VERSION))
raise ConfigNotCompliantError(msg)
_LOGGER.debug("Config version is correct: {}".format(self[CFG_VERSION_KEY]))
try:
self[CFG_SERVER_KEY] = self[CFG_SERVER_KEY].rstrip("/")
except KeyError:
Expand All @@ -94,12 +99,10 @@ def assets_dict(self, order=None):
:return Mapping[str, Iterable[str]]: mapping from assembly name to
collection of available asset names.
"""
refgens = sorted(self.genomes.keys(), key=order)
return OrderedDict([(g, sorted(list(self.genomes[g].keys()), key=order))
for g in refgens])
refgens = sorted(self[CFG_GENOMES_KEY].keys(), key=order)
return OrderedDict([(g, sorted(list(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY].keys()), key=order)) for g in refgens])

def assets_str(self, offset_text=" ", asset_sep="; ",
genome_assets_delim=": ", order=None):
def assets_str(self, offset_text=" ", asset_sep=", ", genome_assets_delim=": ", order=None):
"""
Create a block of text representing genome-to-asset mapping.
Expand All @@ -114,10 +117,9 @@ def assets_str(self, offset_text=" ", asset_sep="; ",
:return str: text representing genome-to-asset mapping
"""
make_line = partial(_make_genome_assets_line, offset_text=offset_text,
genome_assets_delim=genome_assets_delim,
asset_sep=asset_sep, order=order)
refgens = sorted(self.genomes.keys(), key=order)
return "\n".join([make_line(g, self.genomes[g]) for g in refgens])
genome_assets_delim=genome_assets_delim, asset_sep=asset_sep, order=order)
refgens = sorted(self[CFG_GENOMES_KEY].keys(), key=order)
return "\n".join([make_line(g, self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY]) for g in refgens])

def filepath(self, genome, asset, ext=".tar"):
"""
Expand All @@ -137,7 +139,7 @@ def genomes_list(self, order=None):
:return Iterable[str]: list of this configuration's reference genome
assembly IDs
"""
return sorted(list(self.genomes.keys()), key=order)
return sorted(list(self[CFG_GENOMES_KEY].keys()), key=order)

def genomes_str(self, order=None):
"""
Expand Down Expand Up @@ -172,10 +174,10 @@ def get_asset(self, genome_name, asset_name, strict_exists=True,
format(asset_name, genome_name))
if not callable(check_exist) or len(finspect(check_exist).args) != 1:
raise TypeError("Asset existence check must be a one-arg function.")
path = _genome_asset_path(self.genomes, genome_name, asset_name)
if check_exist(path):
path = _genome_asset_path(self[CFG_GENOMES_KEY], genome_name, asset_name)
if os.path.isabs(path) and check_exist(path):
return path
_LOGGER.debug("Nonexistent path: {}".format(asset_name, genome_name, path))
_LOGGER.debug("Relative or nonexistent path: {}".format(path))
fullpath = os.path.join(self[CFG_FOLDER_KEY], genome_name, path)
_LOGGER.debug("Trying path relative to genome folder: {}".format(fullpath))
if check_exist(fullpath):
Expand Down Expand Up @@ -212,7 +214,7 @@ def list_assets_by_genome(self, genome=None, order=None):
collection available asset type names
"""
return self.assets_dict(order) if genome is None \
else sorted(list(self.genomes[genome].keys()), key=order)
else sorted(list(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()), key=order)

def list_genomes_by_asset(self, asset=None, order=None):
"""
Expand All @@ -228,7 +230,7 @@ def list_genomes_by_asset(self, asset=None, order=None):
will be returned.
"""
return self._invert_genomes(order) if not asset else \
sorted([g for g, am in self.genomes.items() if asset in am], key=order)
sorted([g for g, am in self[CFG_GENOMES_KEY].items() if asset in am], key=order)

def list_local(self, order=None):
"""
Expand Down Expand Up @@ -380,11 +382,11 @@ def msg_overwrite():
_untar(filepath, outdir)
_LOGGER.debug("Unpacked archive into: {}".format(outdir))
_LOGGER.info("Writing genome config file: {}".format(genome_config))
self.update_genomes(genome, asset, {CFG_ASSET_PATH_KEY: result})
self.update_assets(genome, asset, {CFG_ASSET_PATH_KEY: result})
self.write(genome_config)
return asset, result

def update_genomes(self, genome, asset=None, data=None):
def update_assets(self, genome, asset=None, data=None):
"""
Updates the genomes in RefGenConf object at any level.
If a requested genome-asset mapping is missing, it will be created
Expand All @@ -394,20 +396,27 @@ def update_genomes(self, genome, asset=None, data=None):
:param Mapping data: data to be added/updated
:return RefGenConf: updated object
"""
def check(obj, datatype, name):
if obj is None:
return False
if not isinstance(obj, datatype):
raise TypeError("{} must be {}; got {}".format(
name, datatype.__name__, type(obj).__name__))
return True

if check(genome, str, "genome"):
self[CFG_GENOMES_KEY].setdefault(genome, PXAM())
if check(asset, str, "asset"):
self[CFG_GENOMES_KEY][genome].setdefault(asset, PXAM())
if check(data, Mapping, "data"):
self[CFG_GENOMES_KEY][genome][asset].update(data)
if _check_insert_data(genome, str, "genome"):
self[CFG_GENOMES_KEY].setdefault(genome, PXAM({CFG_ASSETS_KEY: PXAM()}))
if _check_insert_data(asset, str, "asset"):
self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].setdefault(asset, PXAM())
if _check_insert_data(data, Mapping, "data"):
self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset].update(data)
return self

def update_genomes(self, genome, data=None):
"""
Updates the genomes in RefGenConf object at any level.
If a requested genome is missing, it will be added
:param str genome: genome to be added/updated
:param Mapping data: data to be added/updated
:return RefGenConf: updated object
"""
if _check_insert_data(genome, str, "genome"):
self[CFG_GENOMES_KEY].setdefault(genome, PXAM({CFG_ASSETS_KEY: PXAM()}))
if _check_insert_data(data, Mapping, "data"):
self[CFG_GENOMES_KEY][genome].update(data)
return self

def _invert_genomes(self, order=None):
Expand All @@ -426,8 +435,8 @@ def _invert_genomes(self, order=None):
asset type is available
"""
genomes = {}
for g, am in self.genomes.items():
for a in am.keys():
for g, am in self[CFG_GENOMES_KEY].items():
for a in am[CFG_ASSETS_KEY].keys():
genomes.setdefault(a, []).append(g)
assets = sorted(genomes.keys(), key=order)
return OrderedDict([(a, sorted(genomes[a], key=order)) for a in assets])
Expand Down Expand Up @@ -501,7 +510,7 @@ def _genome_asset_path(genomes, gname, aname):
except KeyError:
raise MissingGenomeError("Your genomes do not include {}".format(gname))
try:
asset_data = genome[aname]
asset_data = genome[CFG_ASSETS_KEY][aname]
except KeyError:
raise MissingAssetError(
"Genome '{}' exists, but index '{}' is missing".format(gname, aname))
Expand Down Expand Up @@ -540,13 +549,11 @@ def _list_remote(url, order=None):
"""
genomes_data = _read_remote_data(url)
refgens = sorted(genomes_data.keys(), key=order)
return ", ".join(refgens), \
"\n".join([_make_genome_assets_line(g, genomes_data[g], order=order)
for g in refgens])
return ", ".join(refgens), "\n".join([_make_genome_assets_line(g, genomes_data[g], order=order) for g in refgens])


def _make_genome_assets_line(
gen, assets, offset_text=" ", genome_assets_delim=": ", asset_sep="; ",
gen, assets, offset_text=" ", genome_assets_delim=": ", asset_sep=", ",
order=None):
"""
Build a line of text for display of assets by genome
Expand Down Expand Up @@ -587,3 +594,13 @@ def _untar(src, dst):
import tarfile
with tarfile.open(src) as tf:
tf.extractall(path=dst)


def _check_insert_data(obj, datatype, name):
""" Checks validity of an object """
if obj is None:
return False
if not isinstance(obj, datatype):
raise TypeError("{} must be {}; got {}".format(
name, datatype.__name__, type(obj).__name__))
return True
1 change: 1 addition & 0 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ requests
tqdm
ubiquerg>=0.4.3
yacman>=0.4
future
6 changes: 3 additions & 3 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pytest>=3.0.7
pytest-remotedata
#pytest>=3.0.7
#pytest-remotedata
pyyaml>=5
ubiquerg>=0.3
veracitools
#veracitools
Loading

0 comments on commit ad4cc28

Please sign in to comment.