lint, remove old stuff, renames

refgenie · Sep 13, 2023 · 0ed6ec7
1 parent 64c09a0
commit 0ed6ec7
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 226 deletions.
diff --git a/deprecated.py b/deprecated.py
@@ -19,198 +19,6 @@
 
 
 
-
-
-
-
-
-
-   @staticmethod
-    def compat_all_old(A, B):
-        all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys())))
-        result = {}
-        flipped_format = {
-            "a-in-b": {},
-            "b-in-a": {},
-            "a-total": {},
-            "b-total": {},
-            "a-duplicated": {},
-            "b-duplicated": {},
-            "order-match": [],
-            "only-in-a": [],
-            "only-in-b": [],
-        }
-        for k in all_keys:
-            _LOGGER.info(k)
-            if k not in A:
-                result[k] = {"flag": -1}
-                flipped_format["only-in-b"].append(k)
-            elif k not in B:
-                flipped_format["only-in-a"].append(k)
-            else:
-                v = SeqColClient.compat(A[k], B[k])
-                result[k] = v
-                if "a-in-b" in v:
-                    flipped_format["a-in-b"][k] = v['a-in-b']
-                if "b-in-a":
-                    flipped_format["b-in-a"][k] = v['b-in-a']
-                if "a-total" in v:
-                    flipped_format["a-total"][k] = v['a-total']
-                if "b-total" in v:
-                    flipped_format["b-total"][k] = v['b-total']
-                if "a-duplicated" in v:
-                    flipped_format["a-duplicated"][k] = v['a-duplicated']
-                if "b-duplicated" in v:
-                    flipped_format["b-duplicated"][k] = v['b-duplicated']
-                if "order-match" in v:
-                    flipped_format["order-match"].append(k)
-
-        # result = {
-        #     "any-elements-shared": any(ainb),
-        #     "all-a-in-b": all(ainb),
-        #     "all-b-in-a": all(bina),
-        #     "order-match": order,
-        #     "flag": flag
-        # }
-
-        return flipped_format
-
-
-    def compare_digests_old(self, digestA, digestB, explain=False):
-        """
-        Given two collection checksums in the database, provide some information
-        about how they are related.
-
-        :param str digestA: Digest for first sequence collection to compare.
-        :param str digestB: Digest for second sequence collection to compare.
-        :param bool explain: Print an explanation of the flag? [Default: False]
-        """
-        typeA = self.database[digestA + henge.ITEM_TYPE]
-        typeB = self.database[digestB + henge.ITEM_TYPE]
-
-        if typeA != typeB:
-            _LOGGER.error(
-                f"Can't compare objects of different types: " f"{typeA} vs {typeB}"
-            )
-
-        asdA = self.retrieve(digestA, reclimit=1)
-        asdB = self.retrieve(digestB, reclimit=1)
-        return self.compare_asds(asdA, asdB, explain=explain)
-
-
-   @staticmethod
-    def compare_asds(asdA, asdB, explain=False):
-        """
-        Compare Annotated Sequence Digests (ASDs) -- digested sequences and `data
-
-        :param str asdA: ASD for first sequence collection to compare.
-        :param str asdB: ASD for second sequence collection to compare.
-        :param bool explain: Print an explanation of the flag? [Default: False]
-        """
-
-        def _xp(prop, lst):
-            """Extract property from a list of dicts"""
-            return list(map(lambda x: x[prop], lst))
-
-        def _index(x, lst):
-            """Find an index of a sequence element in a list of dicts"""
-            try:
-                return _xp(SEQ_KEY, lst).index(x)
-            except:
-                return None
-
-        def _get_common_content(lstA, lstB):
-            """
-            Find the intersection between two list of dicts with sequences
-            """
-            return list(
-                filter(None.__ne__, [_index(x, lstB) for x in _xp(SEQ_KEY, lstA)])
-            )
-
-        # Not ideal, but we expect these to return lists, but if the item was
-        # singular only a dict is returned
-        if not isinstance(asdA, list):
-            asdA = [asdA]
-        if not isinstance(asdB, list):
-            asdB = [asdB]
-
-        ainb = [x in _xp(SEQ_KEY, asdB) for x in _xp(SEQ_KEY, asdA)]
-        bina = [x in _xp(SEQ_KEY, asdA) for x in _xp(SEQ_KEY, asdB)]
-
-        return_flag = 0  # initialize
-        if sum(ainb) > 1:
-            ordA = _get_common_content(asdA, asdB)
-            if ordA == sorted(ordA):
-                return_flag += CONTENT_A_ORDER
-        if sum(bina) > 1:
-            ordB = _get_common_content(asdB, asdA)
-            if ordB == sorted(ordB):
-                return_flag += CONTENT_B_ORDER
-
-        ainb_len = [x in _xp(LEN_KEY, asdB) for x in _xp(LEN_KEY, asdA)]
-        bina_len = [x in _xp(LEN_KEY, asdA) for x in _xp(LEN_KEY, asdB)]
-
-        ainb_name = [x in _xp(NAME_KEY, asdB) for x in _xp(NAME_KEY, asdA)]
-        bina_name = [x in _xp(NAME_KEY, asdA) for x in _xp(NAME_KEY, asdB)]
-
-        ainb_topo = [x in _xp(TOPO_KEY, asdB) for x in _xp(TOPO_KEY, asdA)]
-        bina_topo = [x in _xp(TOPO_KEY, asdA) for x in _xp(TOPO_KEY, asdB)]
-
-        if all(ainb):
-            return_flag += CONTENT_ALL_A_IN_B
-        if all(bina):
-            return_flag += CONTENT_ALL_B_IN_A
-
-        if all(ainb_name):
-            return_flag += NAMES_ALL_A_IN_B
-        if all(bina_name):
-            return_flag += NAMES_ALL_B_IN_A
-
-        if all(ainb_topo):
-            return_flag += TOPO_ALL_A_IN_B
-        if all(bina_topo):
-            return_flag += TOPO_ALL_B_IN_A
-
-        if all(ainb_len):
-            return_flag += LENGTHS_ALL_A_IN_B
-        if all(bina_len):
-            return_flag += LENGTHS_ALL_B_IN_A
-
-        if explain:
-            explain_flag(return_flag)
-        return return_flag
-
-
-    @staticmethod
-    def compat(A, B):
-        """
-        New compatibility function for array-based data model.
-        """
-
-        lenA = len(A)
-        lenB = len(B)
-        dupeA = lenA - len(dict.fromkeys(A))
-        dupeB = lenB - len(dict.fromkeys(B))
-        ainb = [x in B for x in A]
-        bina = [x in A for x in B]
-        sum_ainb = sum(ainb)
-        if sum_ainb > 1:
-            order = list(compress(B, bina)) == list(compress(A, ainb))
-        else:
-            order = False
-
-        result = {
-            "a-in-b": sum_ainb,
-            "b-in-a":  sum(bina),
-            "a-total": lenA,
-            "b-total": lenB,
-            "a-duplicated": dupeA,
-            "b-duplicated": dupeB,
-            "order-match": order
-        }
-        return result
-
-
 def compat(A, B):
     ainb = [x in B for x in A]
     bina = [x in A for x in B]

diff --git a/interactive_tests.py b/interactive_tests.py
@@ -38,6 +38,7 @@
 
 # Now a test of inherent attributes
 import seqcol
+
 scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"])
 scci
 scci.schemas
@@ -46,30 +47,31 @@
 fa_file = "demo_fasta/demo0.fa"
 fa_object = seqcol.parse_fasta(fa_file)
 
-array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author":"urkel"}
-array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author" :"nathan"}
+array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"}
+array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"}
 
 
 di = scci.insert(array_set_i, "SeqColArraySet")
 di = scci.insert(array_set_i2, "SeqColArraySet")
 di
 # scc.retrieve(di)
 scci.retrieve(di)
-fasta_path="demo_fasta"
+fasta_path = "demo_fasta"
 fasta1 = "demo2.fa"
 fasta2 = "demo3.fa"
 fasta5 = "demo5.fa.gz"
 fasta6 = "demo6.fa"
 
 import os
+
 d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1))
 d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
 d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
 d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5))
 d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6))
-scci.retrieve(d['digest'])
+scci.retrieve(d["digest"])
 
-scci.retrieve(d5['digest'])
+scci.retrieve(d5["digest"])
 
 fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1))
 SCAS = seqcol.fasta_to_csc(fa_object)
@@ -84,17 +86,25 @@
 
 
 json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]))
-print(json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]), separators=(",", ":"), ensure_ascii=False
-    ...: , allow_nan=False, sort_keys=True, indent=2))
+print(
+    json.dumps(
+        scci.compare(d5["SCAS"], d6["SCAS"]),
+        separators=(",", ":"),
+        ensure_ascii=False,
+        allow_nan=False,
+        sort_keys=True,
+        indent=2,
+    )
+)
 
 build_sorted_name_length_pairs(array_set_i)
 
-#reorder
+# reorder
 
 array_set_reordered = {}
-for k,v in array_set.items():
-	print(k,v)
-	array_set_reordered[k] = list(reversed(v))
+for k, v in array_set.items():
+    print(k, v)
+    array_set_reordered[k] = list(reversed(v))
 
 array_set
 array_set_reordered
@@ -106,8 +116,6 @@
 import henge
 
 
-
-
 from henge import md5
 
 names = []
@@ -142,21 +150,19 @@
 os.getcwd()
 
 
-
-
-
 ## standalone functions
 
 import seqcol
+
 fa_file = "demo_fasta/demo0.fa"
 fa_object = seqcol.parse_fasta(fa_file)
 
 # get a canonical seqcol object
 csc = seqcol.fasta_to_csc(fa_object)
 csc
 import json
+
 print(json.dumps(csc, indent=2))
 
 
 seqcol.seqcol_digest(csc)
-
diff --git a/seqcol/__init__.py b/seqcol/__init__.py
@@ -7,5 +7,5 @@
 __classes__ = ["SeqColHenge"]
 __all__ = (
     __classes__
-    + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"],
+    + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_file_to_digest"],
 )