Merge pull request #47 from vinisalazar/dev

Merge v0.1.23 from dev branch
vinisalazar · Apr 24, 2021 · 10789b4
2 parents fe19342 + 761824d
commit 10789b4
Show file tree

Hide file tree

Showing 30 changed files with 443 additions and 82 deletions.
diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml
@@ -13,6 +13,9 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
+      - run: |
+          lastversion=$(git describe --tags)
+          awk /"$lastversion"'/{flag=1; next} /###/{flag=0} flag' CHANGELOG.md >> release.md
       - name: Create Release
         id: create_release
         uses: actions/create-release@v1
@@ -21,9 +24,6 @@ jobs:
         with:
           tag_name: ${{ github.ref }}
           release_name: Release ${{ github.ref }}
-          body: |
-            Changes in this Release
-            - First Change
-            - Second Change
+          body_path: release.md
           draft: false
           prerelease: false
diff --git a/.travis.yml b/.travis.yml
@@ -8,6 +8,7 @@ before_install:
 # command to install dependencies
 install:
   - pip install .
+  - pip install pandas --force-reinstall
 # command to run tests
 script:
   - pytest --cov=bioprov/

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,8 +6,19 @@
 * Create methods for Sample and Project
     * .describe [ ]
     * .write_paths_to_file, .copy_files_to_dir(), .link_files_to_dir() [ ]
+    * .total_duration [ ]
 * Add logger calls when saving to JSON and uploading to ProvStore [ ]
 
+### v0.1.23
+* Patch PresetProgram SeqFile addition feature [x]
+* Fix SeqFile deserializer [x]
+* Add import_records arg to `bp.load_project()` [x]
+* Improve reserved amino acid characters [x]
+* Add `SeqFile.max_seq` and `.min_seq` properties [x]
+* Patch `Project` deserializer to improve BioProvDocument creation [x]
+* Make shorter Environment hashes [x]
+* Improve Project `__repr__` [x]
+
 ### v0.1.22
 * Simplify `bp.load_project()` function [x]
 * Fix user and env PROV relationships [x]

diff --git a/bioprov/__init__.py b/bioprov/__init__.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """

diff --git a/bioprov/bioprov.py b/bioprov/bioprov.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 """
 BioProv command-line application. This module holds the main executable.

diff --git a/bioprov/data/__init__.py b/bioprov/data/__init__.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """

diff --git a/bioprov/programs/__init__.py b/bioprov/programs/__init__.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 from .programs import (

diff --git a/bioprov/programs/programs.py b/bioprov/programs/programs.py
@@ -2,15 +2,14 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """
 Module for holding preset instances of the Program class.
 Module for holding preset instances of the Program class.
 """
 
-import logging
 from os import path
 from pathlib import Path
 

diff --git a/bioprov/src/__init__.py b/bioprov/src/__init__.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """

diff --git a/bioprov/src/config.py b/bioprov/src/config.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """
@@ -270,14 +270,15 @@ def __init__(self):
         Class constructor. All attributes are empty and are initialized with self.update()
         """
         self.env_hash = None
+        self.env_hash_long = None
         self.env_dict = None
         self.user = None
         self.env_namespace = None
         self.update()
         self._actedOnBehalfOf = False
 
     def __repr__(self):
-        return self.env_hash
+        return f"BioProvEnvironment{self.env_hash}"
 
     @property
     def actedOnBehalfOf(self):
@@ -294,9 +295,10 @@ def update(self):
         """
         env_dict = dict(os.environ.items())
         env_hash = dict_to_sha256(env_dict)
-        if env_hash != self.env_hash:
+        if env_hash != self.env_hash_long:
             self.env_dict = env_dict
-            self.env_hash = env_hash
+            self.env_hash = env_hash[:7]
+            self.env_hash_long = env_hash
 
             # this is only to prevent build errors
             try:
@@ -306,7 +308,8 @@ def update(self):
             self.env_namespace = Namespace("envs", str(self))
 
     def serializer(self):
-        return serializer(self)
+        keys = ("_actedOnBehalfOf",)
+        return serializer_filter(self, keys)
 
 
 config = Config()
diff --git a/bioprov/src/files.py b/bioprov/src/files.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 
 """
@@ -279,6 +279,8 @@ def __init__(
         self._generator = None
         self._seqstats = None
         self._parser = parser
+
+        # TODO: add these attributes as properties. Calculate lazily (only if retrieving).
         self.number_seqs: int
         self.total_bps: int
         self.mean_bp: float
@@ -287,6 +289,10 @@ def __init__(
         self.N50: int
         self.GC: float
 
+        # Sequence properties
+        self._max_seq = None
+        self._min_seq = None
+
         if self.exists:
             self._seqrecordgenerator()
         else:
@@ -304,7 +310,7 @@ def _seqrecordgenerator(self):
         """
         Runs _seqrecordgenerator with the format.
         """
-        self._generator = seqrecordgenerator(
+        self.generator = seqrecordgenerator(
             self.path, format=self.format, parser=self._parser
         )
 
@@ -328,9 +334,15 @@ def seqstats(self):
     def seqstats(self, value):
         self._seqstats = value
 
-    def import_records(self):
+    def import_records(self, **kwargs):
+        """
+        :param kwargs: Parameters to pass to the SeqFile._calculate_seqstats() function.
+        :return: Import records into self.
+        """
         assert self.exists, "Cannot import, file does not exist."
+        self._seqrecordgenerator()
         self.records = SeqIO.to_dict(self._generator)
+        self._calculate_seqstats(**kwargs)
 
     def serializer(self):
         keys = ("records",)
@@ -353,9 +365,14 @@ def _calculate_seqstats(
         assert isinstance(self.records, dict), Warnings()["incorrect_type"](
             self.records, dict
         )
+        if len(self.records) < 1:
+            self.import_records()
+        assert (
+            len(self.records) > 0
+        ), "Attribute 'records' is empty. Try importing records manually."
 
         bp_array, GC = [], 0
-        aminoacids = "LMFWKQESPVIYHRND"
+        aminoacids = "LMFQESPI"
 
         # We use enumerate to check the first item for amino acids.
         for ix, (key, SeqRecord) in enumerate(self.records.items()):
@@ -371,6 +388,7 @@ def _calculate_seqstats(
             if calculate_gc:
                 GC += SeqRecord.seq.upper().count("G")
                 GC += SeqRecord.seq.upper().count("C")
+                GC += SeqRecord.seq.upper().count("S")
 
         # Convert to array
         bp_array = np.array(bp_array)
@@ -400,6 +418,51 @@ def _calculate_seqstats(
 
         return self._seqstats
 
+    @property
+    def max_seq(self):
+        self.max_seq = self._find_max_seq()
+        return self._max_seq
+
+    @max_seq.setter
+    def max_seq(self, value):
+        self._max_seq = value
+
+    def _find_max_seq(self):
+        try:
+            if len(self.records) < 1:
+                self.import_records()
+
+            max_seq, len_max_seq = None, 0
+            for id_, seq in self.records.items():
+                if len(seq) > len_max_seq:
+                    len_max_seq = len(seq)
+                    max_seq = seq
+            return max_seq
+        except:
+            print("Couldn't import data to determine max_seq.")
+            return None
+
+    @property
+    def min_seq(self):
+        self.min_seq = self._find_min_seq()
+        return self._min_seq
+
+    @min_seq.setter
+    def min_seq(self, value):
+        self._min_seq = value
+
+    def _find_min_seq(self):
+        if len(self.records) < 1:
+            self.import_records()
+
+        min_seq, len_min_seq = None, 10 ** 9
+        for id_, seq in self.records.items():
+            if len(seq) < len_min_seq:
+                len_min_seq = len(seq)
+                min_seq = seq
+
+        return min_seq
+
 
 @dataclass
 class SeqStats:
@@ -482,10 +545,8 @@ def deserialize_files_dict(files_dict):
                     # TODO: don't import records again (slow)
                     # Get them straight from the JSON file.
                     files_dict[tag] = SeqFile(
-                        path=file["path"],
-                        tag=file["tag"],
+                        path=file["path"], tag=file["tag"], format=file["format"]
                     )
-                    _ = files_dict[tag].generator
                     for seqstats_attr_ in SeqStats.__dataclass_fields__.keys():
                         if seqstats_attr_ in file.keys():
                             setattr(
@@ -502,6 +563,8 @@ def deserialize_files_dict(files_dict):
                 if attr_ not in ("path",):
                     try:
                         setattr(files_dict[tag], attr_, value_)
+                        if attr_ == "_generator":
+                            files_dict[tag]._seqrecordgenerator()
                     except AttributeError:
                         pass
     return files_dict
diff --git a/bioprov/src/main.py b/bioprov/src/main.py
@@ -2,7 +2,7 @@
 __license__ = "MIT"
 __maintainer__ = "Vini Salazar"
 __url__ = "https://github.com/vinisalazar/bioprov"
-__version__ = "0.1.22"
+__version__ = "0.1.23"
 
 """
 
@@ -625,11 +625,11 @@ def _parse_output_files(self):
             for key, value in self.output_files.items():
                 # Usually just specify tag and suffix
                 if len(value) == 2:
-                    suffix, tag = value
+                    tag, suffix = value
                     self.sample.add_files(File(preffix + suffix, tag=tag))
                 # But we can also specify a format
                 elif len(value) == 3:
-                    suffix, tag, format = value
+                    tag, suffix, format = value
                     self.sample.add_files(
                         SeqFile(preffix + suffix, tag=tag, format=format)
                     )
@@ -1120,7 +1120,7 @@ def __len__(self):
         return len(self._samples)
 
     def __repr__(self):
-        return f"Project '{self.tag}' with {len(self)} samples"
+        return f"BioProvProject_'{self.tag}'"
 
     def __getitem__(self, item):
         if isinstance(item, str):
@@ -1738,12 +1738,13 @@ def write_json(dict_, _path):
         config.logger.info(f"Could not create JSON file for {_path}.")
 
 
-def load_project(tag, db=None):
+def load_project(tag, db=None, import_records=False):
     """
     Loads Project from the BioProvDatabase set in the config.
 
     :param tag: Tag of the Project to be loaded.
     :param db: Path to BioProvDB file. Default is set in the config module. (use the `bioprov --show_db` command).
+    :param import_records: Whether to import the sequence records. Unnecessary if this data is already recorded in the Project.
     :return: Instance of Project.
     """
     if db is None:
@@ -1766,4 +1767,14 @@ def load_project(tag, db=None):
         f.write(bytes(json.dumps(result), "utf-8"))
         project = from_json(f.name)
 
+    if import_records:
+        for k, file in project.files.items():
+            if isinstance(file, SeqFile):
+                file.import_records()
+
+        for sample in project:
+            for k, file in sample.files.items():
+                if isinstance(file, SeqFile):
+                    file.import_records()
+
     return project