Skip to content

Commit

Permalink
Merge pull request #47 from vinisalazar/dev
Browse files Browse the repository at this point in the history
Merge v0.1.23 from dev branch
  • Loading branch information
vinisalazar authored Apr 24, 2021
2 parents fe19342 + 761824d commit 10789b4
Show file tree
Hide file tree
Showing 30 changed files with 443 additions and 82 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/create-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v2
- run: |
lastversion=$(git describe --tags)
awk /"$lastversion"'/{flag=1; next} /###/{flag=0} flag' CHANGELOG.md >> release.md
- name: Create Release
id: create_release
uses: actions/create-release@v1
Expand All @@ -21,9 +24,6 @@ jobs:
with:
tag_name: ${{ github.ref }}
release_name: Release ${{ github.ref }}
body: |
Changes in this Release
- First Change
- Second Change
body_path: release.md
draft: false
prerelease: false
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ before_install:
# command to install dependencies
install:
- pip install .
- pip install pandas --force-reinstall
# command to run tests
script:
- pytest --cov=bioprov/
Expand Down
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,19 @@
* Create methods for Sample and Project
* .describe [ ]
* .write_paths_to_file, .copy_files_to_dir(), .link_files_to_dir() [ ]
* .total_duration [ ]
* Add logger calls when saving to JSON and uploading to ProvStore [ ]

### v0.1.23
* Patch PresetProgram SeqFile addition feature [x]
* Fix SeqFile deserializer [x]
* Add import_records arg to `bp.load_project()` [x]
* Improve reserved amino acid characters [x]
* Add `SeqFile.max_seq` and `.min_seq` properties [x]
* Patch `Project` deserializer to improve BioProvDocument creation [x]
* Make shorter Environment hashes [x]
* Improve Project `__repr__` [x]

### v0.1.22
* Simplify `bp.load_project()` function [x]
* Fix user and env PROV relationships [x]
Expand Down
2 changes: 1 addition & 1 deletion bioprov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Expand Down
2 changes: 1 addition & 1 deletion bioprov/bioprov.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"

"""
BioProv command-line application. This module holds the main executable.
Expand Down
2 changes: 1 addition & 1 deletion bioprov/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Expand Down
2 changes: 1 addition & 1 deletion bioprov/programs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


from .programs import (
Expand Down
3 changes: 1 addition & 2 deletions bioprov/programs/programs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Module for holding preset instances of the Program class.
Module for holding preset instances of the Program class.
"""

import logging
from os import path
from pathlib import Path

Expand Down
2 changes: 1 addition & 1 deletion bioprov/src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Expand Down
13 changes: 8 additions & 5 deletions bioprov/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Expand Down Expand Up @@ -270,14 +270,15 @@ def __init__(self):
Class constructor. All attributes are empty and are initialized with self.update()
"""
self.env_hash = None
self.env_hash_long = None
self.env_dict = None
self.user = None
self.env_namespace = None
self.update()
self._actedOnBehalfOf = False

def __repr__(self):
return self.env_hash
return f"BioProvEnvironment{self.env_hash}"

@property
def actedOnBehalfOf(self):
Expand All @@ -294,9 +295,10 @@ def update(self):
"""
env_dict = dict(os.environ.items())
env_hash = dict_to_sha256(env_dict)
if env_hash != self.env_hash:
if env_hash != self.env_hash_long:
self.env_dict = env_dict
self.env_hash = env_hash
self.env_hash = env_hash[:7]
self.env_hash_long = env_hash

# this is only to prevent build errors
try:
Expand All @@ -306,7 +308,8 @@ def update(self):
self.env_namespace = Namespace("envs", str(self))

def serializer(self):
return serializer(self)
keys = ("_actedOnBehalfOf",)
return serializer_filter(self, keys)


config = Config()
77 changes: 70 additions & 7 deletions bioprov/src/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"


"""
Expand Down Expand Up @@ -279,6 +279,8 @@ def __init__(
self._generator = None
self._seqstats = None
self._parser = parser

# TODO: add these attributes as properties. Calculate lazily (only if retrieving).
self.number_seqs: int
self.total_bps: int
self.mean_bp: float
Expand All @@ -287,6 +289,10 @@ def __init__(
self.N50: int
self.GC: float

# Sequence properties
self._max_seq = None
self._min_seq = None

if self.exists:
self._seqrecordgenerator()
else:
Expand All @@ -304,7 +310,7 @@ def _seqrecordgenerator(self):
"""
Runs _seqrecordgenerator with the format.
"""
self._generator = seqrecordgenerator(
self.generator = seqrecordgenerator(
self.path, format=self.format, parser=self._parser
)

Expand All @@ -328,9 +334,15 @@ def seqstats(self):
def seqstats(self, value):
self._seqstats = value

def import_records(self):
def import_records(self, **kwargs):
"""
:param kwargs: Parameters to pass to the SeqFile._calculate_seqstats() function.
:return: Import records into self.
"""
assert self.exists, "Cannot import, file does not exist."
self._seqrecordgenerator()
self.records = SeqIO.to_dict(self._generator)
self._calculate_seqstats(**kwargs)

def serializer(self):
keys = ("records",)
Expand All @@ -353,9 +365,14 @@ def _calculate_seqstats(
assert isinstance(self.records, dict), Warnings()["incorrect_type"](
self.records, dict
)
if len(self.records) < 1:
self.import_records()
assert (
len(self.records) > 0
), "Attribute 'records' is empty. Try importing records manually."

bp_array, GC = [], 0
aminoacids = "LMFWKQESPVIYHRND"
aminoacids = "LMFQESPI"

# We use enumerate to check the first item for amino acids.
for ix, (key, SeqRecord) in enumerate(self.records.items()):
Expand All @@ -371,6 +388,7 @@ def _calculate_seqstats(
if calculate_gc:
GC += SeqRecord.seq.upper().count("G")
GC += SeqRecord.seq.upper().count("C")
GC += SeqRecord.seq.upper().count("S")

# Convert to array
bp_array = np.array(bp_array)
Expand Down Expand Up @@ -400,6 +418,51 @@ def _calculate_seqstats(

return self._seqstats

@property
def max_seq(self):
    """
    Longest record held in self.records.

    The value is computed by self._find_max_seq() only while self._max_seq is
    None, then cached. A failed lookup returns None and is therefore retried
    on the next access. Assign None to force a recalculation (for example
    after the records have changed).
    :return: Cached result of self._find_max_seq(), or the value assigned through the setter.
    """
    # Only compute when nothing is cached, so an explicitly assigned value
    # is honoured instead of being overwritten on every read.
    if self._max_seq is None:
        self._max_seq = self._find_max_seq()
    return self._max_seq

@max_seq.setter
def max_seq(self, value):
    """
    :param value: Value to store as the cached max_seq (None clears the cache).
    """
    self._max_seq = value

def _find_max_seq(self):
try:
if len(self.records) < 1:
self.import_records()

max_seq, len_max_seq = None, 0
for id_, seq in self.records.items():
if len(seq) > len_max_seq:
len_max_seq = len(seq)
max_seq = seq
return max_seq
except:
print("Couldn't import data to determine max_seq.")
return None

@property
def min_seq(self):
    """
    Shortest record held in self.records.

    The value is computed by self._find_min_seq() only while self._min_seq is
    None, then cached. Assign None to force a recalculation (for example
    after the records have changed).
    :return: Cached result of self._find_min_seq(), or the value assigned through the setter.
    """
    # Only compute when nothing is cached, so an explicitly assigned value
    # is honoured instead of being overwritten on every read.
    if self._min_seq is None:
        self._min_seq = self._find_min_seq()
    return self._min_seq

@min_seq.setter
def min_seq(self, value):
    """
    :param value: Value to store as the cached min_seq (None clears the cache).
    """
    self._min_seq = value

def _find_min_seq(self):
if len(self.records) < 1:
self.import_records()

min_seq, len_min_seq = None, 10 ** 9
for id_, seq in self.records.items():
if len(seq) < len_min_seq:
len_min_seq = len(seq)
min_seq = seq

return min_seq


@dataclass
class SeqStats:
Expand Down Expand Up @@ -482,10 +545,8 @@ def deserialize_files_dict(files_dict):
# TODO: don't import records again (slow)
# Get them straight from the JSON file.
files_dict[tag] = SeqFile(
path=file["path"],
tag=file["tag"],
path=file["path"], tag=file["tag"], format=file["format"]
)
_ = files_dict[tag].generator
for seqstats_attr_ in SeqStats.__dataclass_fields__.keys():
if seqstats_attr_ in file.keys():
setattr(
Expand All @@ -502,6 +563,8 @@ def deserialize_files_dict(files_dict):
if attr_ not in ("path",):
try:
setattr(files_dict[tag], attr_, value_)
if attr_ == "_generator":
files_dict[tag]._seqrecordgenerator()
except AttributeError:
pass
return files_dict
21 changes: 16 additions & 5 deletions bioprov/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
__license__ = "MIT"
__maintainer__ = "Vini Salazar"
__url__ = "https://github.com/vinisalazar/bioprov"
__version__ = "0.1.22"
__version__ = "0.1.23"

"""
Expand Down Expand Up @@ -625,11 +625,11 @@ def _parse_output_files(self):
for key, value in self.output_files.items():
# Usually just specify tag and suffix
if len(value) == 2:
suffix, tag = value
tag, suffix = value
self.sample.add_files(File(preffix + suffix, tag=tag))
# But we can also specify a format
elif len(value) == 3:
suffix, tag, format = value
tag, suffix, format = value
self.sample.add_files(
SeqFile(preffix + suffix, tag=tag, format=format)
)
Expand Down Expand Up @@ -1120,7 +1120,7 @@ def __len__(self):
return len(self._samples)

def __repr__(self):
return f"Project '{self.tag}' with {len(self)} samples"
return f"BioProvProject_'{self.tag}'"

def __getitem__(self, item):
if isinstance(item, str):
Expand Down Expand Up @@ -1738,12 +1738,13 @@ def write_json(dict_, _path):
config.logger.info(f"Could not create JSON file for {_path}.")


def load_project(tag, db=None):
def load_project(tag, db=None, import_records=False):
"""
Loads Project from the BioProvDatabase set in the config.
:param tag: Tag of the Project to be loaded.
:param db: Path to BioProvDB file. Default is set in the config module. (use the `bioprov --show_db` command).
:param import_records: Whether to import the sequence records. Unnecessary if this data is already recorded in the Project.
:return: Instance of Project.
"""
if db is None:
Expand All @@ -1766,4 +1767,14 @@ def load_project(tag, db=None):
f.write(bytes(json.dumps(result), "utf-8"))
project = from_json(f.name)

if import_records:
for k, file in project.files.items():
if isinstance(file, SeqFile):
file.import_records()

for sample in project:
for k, file in sample.files.items():
if isinstance(file, SeqFile):
file.import_records()

return project
Loading

0 comments on commit 10789b4

Please sign in to comment.