From 8acce14edbc55e927feea3109c7353286c82e3b5 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 11 Sep 2023 14:40:08 -0400 Subject: [PATCH 01/11] add sha512t24u_digest --- seqcol/seqcol.py | 6 +++--- seqcol/seqcol_client.py | 4 ++-- seqcol/utilities.py | 7 ++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index 4c0ed28..9d589d3 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -48,7 +48,7 @@ def fasta_to_seqcol(fa_file_path: str) -> dict: def fasta_obj_to_seqcol( fa_object: pyfaidx.Fasta, verbose: bool = True, - digest_function: Callable[[str], str] = trunc512_digest, + digest_function: Callable[[str], str] = sha512t24u_digest, ) -> dict: """ Given a fasta object, return a CSC (Canonical Sequence Collection object) @@ -177,7 +177,7 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: seqcol_obj3 = {} for attribute in seqcol_obj2: - seqcol_obj3[attribute] = trunc512_digest(seqcol_obj2[attribute]) + seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result # Step 4: Apply RFC-8785 again to canonicalize the JSON @@ -186,5 +186,5 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: seqcol_obj4 = canonical_str(seqcol_obj3) # Step 5: Digest the final canonical representation again. - seqcol_digest = trunc512_digest(seqcol_obj4) + seqcol_digest = sha512t24u_digest(seqcol_obj4) return seqcol_digest diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index adeaf1f..816eb24 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -8,7 +8,7 @@ from .const import * from .seqcol import * -from .utilities import trunc512_digest +from .utilities import sha512t24u_digest _LOGGER = logging.getLogger(__name__) @@ -26,7 +26,7 @@ def __init__( database={}, schemas=None, henges=None, - checksum_function=trunc512_digest, + checksum_function=sha512t24u_digest, ): """ A user interface to insert and retrieve decomposable recursive unique diff --git a/seqcol/utilities.py b/seqcol/utilities.py index d8d5c97..9f18312 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -15,11 +15,16 @@ # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: - """GA4GH digest algorithm""" digest = hashlib.sha512(seq.encode()).digest() hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() +def sha512t24u_digest(seq: str, offset: int = 24) -> str: + """ GA4GH digest function """ + digest = hashlib.sha512(seq.encode()).digest() + tdigest_b64us = base64.urlsafe_b64encode(digest[:offset]) + return tdigest_b64us.decode("ascii") + def canonical_str(item: dict) -> str: """Convert a dict into a canonical string representation""" From e5953318ebb156228da11b176b4365f5544e626a Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 11 Sep 2023 18:19:24 -0400 Subject: [PATCH 02/11] minor structuring --- seqcol/__init__.py | 1 - seqcol/seqcol.py | 195 ---------------------------------------- seqcol/seqcol_client.py | 3 +- seqcol/utilities.py | 192 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 191 insertions(+), 200 deletions(-) delete mode 100644 seqcol/seqcol.py diff --git a/seqcol/__init__.py b/seqcol/__init__.py index e894ea0..2ef78f9 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -1,5 +1,4 @@ from .const import * -from .seqcol import * from .seqcol_client import * from .utilities import * from ._version import __version__ diff --git 
a/seqcol/seqcol.py b/seqcol/seqcol.py deleted file mode 100644 index 3cc4eaf..0000000 --- a/seqcol/seqcol.py +++ /dev/null @@ -1,195 +0,0 @@ -import henge -import logging -import pyfaidx - -from typing import Callable - -from .utilities import * -from .const import * - - -_LOGGER = logging.getLogger(__name__) -henge.ITEM_TYPE = "_item_type" - - -def explain_flag(flag): - """Explains a compare flag""" - print(f"Flag: {flag}\nBinary: {bin(flag)}\n") - for e in range(0, 13): - if flag & 2**e: - print(FLAGS[2**e]) - -def fasta_to_digest(fa_file_path: str) -> str: - """Given a fasta, return a digest""" - seqcol_obj = fasta_to_seqcol(fa_file_path) - return seqcol_digest(seqcol_obj) - - -def parse_fasta(fa_file) -> pyfaidx.Fasta: - """ - Read in a gzipped or not gzipped FASTA file - """ - try: - return pyfaidx.Fasta(fa_file) - except pyfaidx.UnsupportedCompressionFormat: - # pyfaidx can handle bgzip but not gzip; so we just hack it here and - # gunzip the file into a temporary one and read it in not to interfere - # with the original one. - from gzip import open as gzopen - from tempfile import NamedTemporaryFile - - with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out: - f_out.writelines(f_in.read()) - f_out.seek(0) - return pyfaidx.Fasta(f_out.name) - - -def fasta_to_seqcol(fa_file_path: str) -> dict: - """Given a fasta, return a canonical seqcol object""" - fa_obj = parse_fasta(fa_file_path) - return fasta_obj_to_seqcol(fa_obj) - - -def fasta_obj_to_seqcol( - fa_object: pyfaidx.Fasta, - verbose: bool = True, - digest_function: Callable[[str], str] = sha512t24u_digest, -) -> dict: - """ - Given a fasta object, return a CSC (Canonical Sequence Collection object) - """ - # CSC = SeqColArraySet - # Or maybe should be "Level 1 SC" - - CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} - seqs = fa_object.keys() - nseqs = len(seqs) - print(f"Found {nseqs} chromosomes") - i = 1 - for k in fa_object.keys(): - if verbose: - print(f"Processing ({i} of {nseqs}) {k}...") - seq = str(fa_object[k]) - seq_length = len(seq) - seq_name = fa_object[k].name - seq_digest = digest_function(seq.upper()) - snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs - snlp_digest = digest_function(canonical_str(snlp)) - CSC["lengths"].append(seq_length) - CSC["names"].append(seq_name) - CSC["sorted_name_length_pairs"].append(snlp_digest) - CSC["sequences"].append(seq_digest) - i += 1 - CSC["sorted_name_length_pairs"].sort() - return CSC - - -def build_sorted_name_length_pairs(obj: dict, digest_function): - """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system""" - sorted_name_length_pairs = [] - for i in range(len(obj["names"])): - sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) - nl_digests = [] - for i in range(len(sorted_name_length_pairs)): - nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) - - nl_digests.sort() - return nl_digests - - -def compare_seqcols(A: SeqCol, B: SeqCol): - """ - Workhorse comparison function - - @param A Sequence collection A - @param B Sequence collection B - @return dict Following formal seqcol specification comparison function return value - """ - validate_seqcol(A) # First ensure these are the right structure - validate_seqcol(B) - - all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) - result = {} - return_obj = { - "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, - 
"elements": { - "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, - "a-and-b": {}, - "a-and-b-same-order": {}, - }, - } - - for k in all_keys: - _LOGGER.info(k) - if k not in A: - result[k] = {"flag": -1} - return_obj["arrays"]["b-only"].append(k) - elif k not in B: - return_obj["arrays"]["a-only"].append(k) - else: - return_obj["arrays"]["a-and-b"].append(k) - res = _compare_elements(A[k], B[k]) - return_obj["elements"]["a-and-b"][k] = res["a-and-b"] - return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] - return return_obj - - -def _compare_elements(A: list, B: list): - """ - Compare elements between two arrays. Helper function for individual elements used by workhorse compare_seqcols function - """ - - A_filtered = list(filter(lambda x: x in B, A)) - B_filtered = list(filter(lambda x: x in A, B)) - A_count = len(A_filtered) - B_count = len(B_filtered) - overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates - - if A_count + B_count < 1: - # order match requires at least 2 matching elements - order = None - elif not (A_count == B_count == overlap): - # duplicated matches means order match is undefined - order = None - else: - order = A_filtered == B_filtered - return {"a-and-b": overlap, "a-and-b-same-order": order} - - -def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: - """ - Given a canonical sequence collection, compute its digest. - - :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object - :param dict schema: Schema defining the inherent attributes to digest - :return str: The sequence collection digest - """ - - validate_seqcol(seqcol_obj) - # Step 1a: Remove any non-inherent attributes, - # so that only the inherent attributes contribute to the digest. - seqcol_obj2 = {} - if schema: - for k in schema["inherent"]: - # Step 2: Apply RFC-8785 to canonicalize the value - # associated with each attribute individually. - seqcol_obj2[k] = canonical_str(seqcol_obj[k]) - else: # no schema provided, so assume all attributes are inherent - for k in seqcol_obj: - seqcol_obj2[k] = canonical_str(seqcol_obj[k]) - # Step 3: Digest each canonicalized attribute value - # using the GA4GH digest algorithm. - - seqcol_obj3 = {} - for attribute in seqcol_obj2: - seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) - # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result - - # Step 4: Apply RFC-8785 again to canonicalize the JSON - # of new seqcol object representation. - - seqcol_obj4 = canonical_str(seqcol_obj3) - - # Step 5: Digest the final canonical representation again. 
- seqcol_digest = sha512t24u_digest(seqcol_obj4) - return seqcol_digest diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index af35d29..01d0ec5 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -7,8 +7,7 @@ from itertools import compress from .const import * -from .seqcol import * -from .utilities import sha512t24u_digest +from .utilities import * _LOGGER = logging.getLogger(__name__) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index c403b7c..b74bfee 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -1,15 +1,19 @@ +import base64 import binascii import hashlib import json +import logging import os +import pyfaidx from jsonschema import Draft7Validator -from typing import Optional +from typing import Optional, Callable from yacman import load_yaml from .const import SeqCol from .exceptions import * +_LOGGER = logging.getLogger(__name__) # Refget digests from published seqcol v1.0 protocol # Retrieved July 2019 @@ -19,6 +23,7 @@ def trunc512_digest(seq, offset=24) -> str: hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() + def sha512t24u_digest(seq: str, offset: int = 24) -> str: """ GA4GH digest function """ digest = hashlib.sha512(seq.encode()).digest() @@ -69,4 +74,187 @@ def format_itemwise(csc: SeqCol) -> list: "length": csc["lengths"][i], "sequence": csc["sequences"][i], }) - return list_of_dicts \ No newline at end of file + return list_of_dicts + + +def explain_flag(flag): + """Explains a compare flag""" + print(f"Flag: {flag}\nBinary: {bin(flag)}\n") + for e in range(0, 13): + if flag & 2**e: + print(FLAGS[2**e]) + +def fasta_to_digest(fa_file_path: str) -> str: + """Given a fasta, return a digest""" + seqcol_obj = fasta_to_seqcol(fa_file_path) + return seqcol_digest(seqcol_obj) + + +def parse_fasta(fa_file) -> pyfaidx.Fasta: + """ + Read in a gzipped or not gzipped FASTA file + """ + try: + return pyfaidx.Fasta(fa_file) + except pyfaidx.UnsupportedCompressionFormat: + # pyfaidx can handle bgzip but not gzip; so we just hack it here and + # gunzip the file into a temporary one and read it in not to interfere + # with the original one. 
+ from gzip import open as gzopen + from tempfile import NamedTemporaryFile + + with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out: + f_out.writelines(f_in.read()) + f_out.seek(0) + return pyfaidx.Fasta(f_out.name) + + +def fasta_to_seqcol(fa_file_path: str) -> dict: + """Given a fasta, return a canonical seqcol object""" + fa_obj = parse_fasta(fa_file_path) + return fasta_obj_to_seqcol(fa_obj) + + +def fasta_obj_to_seqcol( + fa_object: pyfaidx.Fasta, + verbose: bool = True, + digest_function: Callable[[str], str] = sha512t24u_digest, +) -> dict: + """ + Given a fasta object, return a CSC (Canonical Sequence Collection object) + """ + # CSC = SeqColArraySet + # Or maybe should be "Level 1 SC" + + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + seqs = fa_object.keys() + nseqs = len(seqs) + print(f"Found {nseqs} chromosomes") + i = 1 + for k in fa_object.keys(): + if verbose: + print(f"Processing ({i} of {nseqs}) {k}...") + seq = str(fa_object[k]) + seq_length = len(seq) + seq_name = fa_object[k].name + seq_digest = digest_function(seq.upper()) + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(seq_length) + CSC["names"].append(seq_name) + CSC["sorted_name_length_pairs"].append(snlp_digest) + CSC["sequences"].append(seq_digest) + i += 1 + CSC["sorted_name_length_pairs"].sort() + return CSC + + +def build_sorted_name_length_pairs(obj: dict, digest_function): + """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system""" + sorted_name_length_pairs = [] + for i in range(len(obj["names"])): + sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) + nl_digests = [] + for i in range(len(sorted_name_length_pairs)): + nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) + + nl_digests.sort() + return nl_digests + + +def compare_seqcols(A: SeqCol, B: SeqCol): + """ + Workhorse comparison function + + @param A Sequence collection A + @param B Sequence collection B + @return dict Following formal seqcol specification comparison function return value + """ + validate_seqcol(A) # First ensure these are the right structure + validate_seqcol(B) + + all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) + result = {} + return_obj = { + "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, + "elements": { + "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, + "a-and-b": {}, + "a-and-b-same-order": {}, + }, + } + + for k in all_keys: + _LOGGER.info(k) + if k not in A: + result[k] = {"flag": -1} + return_obj["arrays"]["b-only"].append(k) + elif k not in B: + return_obj["arrays"]["a-only"].append(k) + else: + return_obj["arrays"]["a-and-b"].append(k) + res = _compare_elements(A[k], B[k]) + return_obj["elements"]["a-and-b"][k] = res["a-and-b"] + return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] + return return_obj + + +def _compare_elements(A: list, B: list): + """ + Compare elements between two arrays. 
Helper function for individual elements used by workhorse compare_seqcols function + """ + + A_filtered = list(filter(lambda x: x in B, A)) + B_filtered = list(filter(lambda x: x in A, B)) + A_count = len(A_filtered) + B_count = len(B_filtered) + overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates + + if A_count + B_count < 1: + # order match requires at least 2 matching elements + order = None + elif not (A_count == B_count == overlap): + # duplicated matches means order match is undefined + order = None + else: + order = A_filtered == B_filtered + return {"a-and-b": overlap, "a-and-b-same-order": order} + + +def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: + """ + Given a canonical sequence collection, compute its digest. + + :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object + :param dict schema: Schema defining the inherent attributes to digest + :return str: The sequence collection digest + """ + + validate_seqcol(seqcol_obj) + # Step 1a: Remove any non-inherent attributes, + # so that only the inherent attributes contribute to the digest. + seqcol_obj2 = {} + if schema: + for k in schema["inherent"]: + # Step 2: Apply RFC-8785 to canonicalize the value + # associated with each attribute individually. + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + else: # no schema provided, so assume all attributes are inherent + for k in seqcol_obj: + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + # Step 3: Digest each canonicalized attribute value + # using the GA4GH digest algorithm. + + seqcol_obj3 = {} + for attribute in seqcol_obj2: + seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) + # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result + + # Step 4: Apply RFC-8785 again to canonicalize the JSON + # of new seqcol object representation. + + seqcol_obj4 = canonical_str(seqcol_obj3) + + # Step 5: Digest the final canonical representation again. 
+ seqcol_digest = sha512t24u_digest(seqcol_obj4) + return seqcol_digest From 63aa628832ed455a817f989f4081458e8d749823 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:14:34 -0400 Subject: [PATCH 03/11] rename SeqColClient to SeqColHenge --- docs_jupyter/demo.ipynb | 14 +++++++------- docs_jupyter/seqcol.ipynb | 8 ++++---- interactive_tests.py | 6 +++--- seqcol/__init__.py | 2 +- seqcol/seqcol_client.py | 13 +++++++------ seqcol/utilities.py | 4 +++- tests/test_seqcol.py | 12 ++++++------ 7 files changed, 31 insertions(+), 28 deletions(-) diff --git a/docs_jupyter/demo.ipynb b/docs_jupyter/demo.ipynb index c9cf0b1..0806745 100644 --- a/docs_jupyter/demo.ipynb +++ b/docs_jupyter/demo.ipynb @@ -106,7 +106,7 @@ " trunc512_digest('TCGA'): \"TCGA\"\n", "}\n", "\n", - "scdb_local = seqcol.SeqColClient(local_lookup_dict)\n" + "scdb_local = seqcol.SeqColHenge(local_lookup_dict)\n" ] }, { @@ -616,7 +616,7 @@ "metadata": {}, "outputs": [], "source": [ - "rgdb = seqcol.SeqColClient(my_dict)" + "rgdb = seqcol.SeqColHenge(my_dict)" ] }, { @@ -947,7 +947,7 @@ "outputs": [], "source": [ "import henge \n", - "sc = seqcol.SeqColClient(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])" + "sc = seqcol.SeqColHenge(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])" ] }, { @@ -1039,16 +1039,16 @@ "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 ACGT\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_item_type sequence\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_digest_version md5\n", - "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e", - "4\u001e", + "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e\n", + "4\u001e\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36\n", "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_item_type ASD\n", "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_digest_version md5\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce TCGA\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_item_type sequence\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_digest_version md5\n", - "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e", - "4\u001e", + "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e\n", + "4\u001e\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce\n", "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_item_type ASD\n", "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_digest_version md5\n", diff --git a/docs_jupyter/seqcol.ipynb b/docs_jupyter/seqcol.ipynb index ea1888e..c16b011 100644 --- a/docs_jupyter/seqcol.ipynb +++ b/docs_jupyter/seqcol.ipynb @@ -38,7 +38,7 @@ } ], "source": [ - "sqdb = seqcol.SeqColClient({})" + "sqdb = seqcol.SeqColHenge({})" ] }, { @@ -129,7 +129,7 @@ } ], "source": [ - "sqdb2 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])" + "sqdb2 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])" ] }, { @@ -293,7 +293,7 @@ } ], "source": [ - "sqdb3 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])" + "sqdb3 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])" ] }, { @@ -474,7 +474,7 @@ } ], "source": [ - "sqdb4 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])" + "sqdb4 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])" ] }, { diff --git a/interactive_tests.py b/interactive_tests.py index 8d32f08..7f5d6a0 100644 --- a/interactive_tests.py +++ 
b/interactive_tests.py @@ -1,8 +1,8 @@ import seqcol -from seqcol import SeqColClient +from seqcol import SeqColHenge -scc = SeqColClient(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"]) +scc = SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"]) scc fa_file = "demo_fasta/demo0.fa" @@ -38,7 +38,7 @@ # Now a test of inherent attributes import seqcol -scci = seqcol.SeqColClient(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) +scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) scci scci.schemas diff --git a/seqcol/__init__.py b/seqcol/__init__.py index 2ef78f9..d5db6b6 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -4,5 +4,5 @@ from ._version import __version__ -__classes__ = ["SeqColClient"] +__classes__ = ["SeqColHenge"] __all__ = (__classes__ + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"],) diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index 01d0ec5..3133c3f 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -14,7 +14,7 @@ henge.ITEM_TYPE = "_item_type" -class SeqColClient(refget.RefGetClient): +class SeqColHenge(refget.RefGetClient): """ Extension of henge that accommodates collections of sequences. """ @@ -39,14 +39,14 @@ def __init__( handle the digest of the serialized items stored in this henge. """ - super(SeqColClient, self).__init__( + super(SeqColHenge, self).__init__( api_url_base=api_url_base, database=database, schemas=schemas or INTERNAL_SCHEMAS, henges=henges, checksum_function=checksum_function, ) - _LOGGER.info("Initializing SeqColClient") + _LOGGER.info("Initializing SeqColHenge") def load_fasta(self, fa_file, skip_seq=False, topology_default="linear"): """ @@ -122,7 +122,7 @@ def compare_digests(self, digestA, digestB): def retrieve(self, druid, reclimit=None, raw=False): try: - return super(SeqColClient, self).retrieve(druid, reclimit, raw) + return super(SeqColHenge, self).retrieve(druid, reclimit, raw) except henge.NotFoundException as e: _LOGGER.debug(e) try: @@ -138,7 +138,7 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): """ @param rgc RefGenConf object @param refgenie_key key of genome to load - @param scc SeqColClient object to load into + @param scc SeqColHenge object to load into """ filepath = rgc.seek(refgenie_key, "fasta") return self.load_fasta_from_filepath(filepath) @@ -165,7 +165,8 @@ def load_multiple_fastas(self, fasta_dict): @param fasta_list """ results = {} - for name, path in fasta_dict.items(): + for name in fasta_dict.keys(): + path = fasta_dict[name]["fasta"] print(f"Processing fasta '{name}'' at path '{path}'...") results[name] = self.load_fasta_from_filepath(path) return results diff --git a/seqcol/utilities.py b/seqcol/utilities.py index b74bfee..c374c53 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -67,6 +67,8 @@ def format_itemwise(csc: SeqCol) -> list: Format a SeqCol object into a list of dicts, one per sequence. 
""" list_of_dicts = [] + # TODO: handle all properties, not just these 3 + # TODO: handle non-collated attributes, somehow for i in range(len(csc["lengths"])): list_of_dicts.append( { @@ -74,7 +76,7 @@ def format_itemwise(csc: SeqCol) -> list: "length": csc["lengths"][i], "sequence": csc["sequences"][i], }) - return list_of_dicts + return {"sequences": list_of_dicts} def explain_flag(flag): diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py index cf45652..f74a9ce 100644 --- a/tests/test_seqcol.py +++ b/tests/test_seqcol.py @@ -3,7 +3,7 @@ import pytest import seqcol -# from seqcol import SeqColClient, validate_seqcol, compare +# from seqcol import SeqColHenge, validate_seqcol, compare # from seqcol.const import * DEMO_FILES = [ @@ -29,16 +29,16 @@ class TestGeneral: def test_no_schemas_required(self): """ - In contrast to the generic Henge object, SeqColClient does not + In contrast to the generic Henge object, SeqColHenge does not require schemas as input, they are predefined in the constructor """ - assert isinstance(seqcol.SeqColClient(database={}), seqcol.SeqColClient) + assert isinstance(seqcol.SeqColHenge(database={}), seqcol.SeqColHenge) class TestFastaInserting: @pytest.mark.parametrize("fasta_name", DEMO_FILES) def test_fasta_loading_works(self, fasta_name, fa_root): - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) f = os.path.join(fa_root, fasta_name) print("Fasta file to be loaded: {}".format(f)) res = scc.load_fasta(f) @@ -48,7 +48,7 @@ def test_fasta_loading_works(self, fasta_name, fa_root): class TestRetrieval: @pytest.mark.parametrize("fasta_name", DEMO_FILES) def test_retrieval_works(self, fasta_name, fa_root): - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) f = os.path.join(fa_root, fasta_name) print("Fasta file to be loaded: {}".format(f)) d, asds = scc.load_fasta(f) @@ -60,7 +60,7 @@ def test_retrieval_works(self, fasta_name, fa_root): def check_comparison(fasta1, fasta2, expected_comparison): print(f"Comparison: Fasta1: {fasta1} vs Fasta2: {fasta2}. 
Expected: {expected_comparison}") - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) d = scc.load_fasta_from_filepath(fasta1) d2 = scc.load_fasta_from_filepath(fasta2) with open(expected_comparison) as fp: From 48ba7d2d7e22e34b8c5fad3925c25205c648ccd2 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:15:12 -0400 Subject: [PATCH 04/11] lint --- seqcol/__init__.py | 5 ++++- seqcol/const.py | 1 + seqcol/utilities.py | 10 +++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/seqcol/__init__.py b/seqcol/__init__.py index d5db6b6..38f3d72 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -5,4 +5,7 @@ __classes__ = ["SeqColHenge"] -__all__ = (__classes__ + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"],) +__all__ = ( + __classes__ + + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"], +) diff --git a/seqcol/const.py b/seqcol/const.py index 086c129..987b644 100644 --- a/seqcol/const.py +++ b/seqcol/const.py @@ -1,5 +1,6 @@ import os + def _schema_path(name): return os.path.join(SCHEMA_FILEPATH, name) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index c374c53..e3cff6a 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -15,6 +15,7 @@ _LOGGER = logging.getLogger(__name__) + # Refget digests from published seqcol v1.0 protocol # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html @@ -25,7 +26,7 @@ def trunc512_digest(seq, offset=24) -> str: def sha512t24u_digest(seq: str, offset: int = 24) -> str: - """ GA4GH digest function """ + """GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() tdigest_b64us = base64.urlsafe_b64encode(digest[:offset]) return tdigest_b64us.decode("ascii") @@ -62,8 +63,9 @@ def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]: raise InvalidSeqColError("Validation failed", errors) return True + def format_itemwise(csc: SeqCol) -> list: - """ + """ Format a SeqCol object into a list of dicts, one per sequence. 
""" list_of_dicts = [] @@ -75,7 +77,8 @@ def format_itemwise(csc: SeqCol) -> list: "name": csc["names"][i], "length": csc["lengths"][i], "sequence": csc["sequences"][i], - }) + } + ) return {"sequences": list_of_dicts} @@ -86,6 +89,7 @@ def explain_flag(flag): if flag & 2**e: print(FLAGS[2**e]) + def fasta_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" seqcol_obj = fasta_to_seqcol(fa_file_path) From 4b09130f9de352afabd8175e1ceda566c5bbb42f Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:18:27 -0400 Subject: [PATCH 05/11] some cleanup --- seqcol/__init__.py | 2 +- seqcol/_version.py | 2 +- seqcol/{seqcol_client.py => seqcol.py} | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) rename seqcol/{seqcol_client.py => seqcol.py} (99%) diff --git a/seqcol/__init__.py b/seqcol/__init__.py index 38f3d72..b29eaad 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -1,5 +1,5 @@ from .const import * -from .seqcol_client import * +from .seqcol import * from .utilities import * from ._version import __version__ diff --git a/seqcol/_version.py b/seqcol/_version.py index 6892a3d..d89a3b2 100644 --- a/seqcol/_version.py +++ b/seqcol/_version.py @@ -1 +1 @@ -__version__ = "0.0.2-dev" +__version__ = "0.0.3-dev" diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol.py similarity index 99% rename from seqcol/seqcol_client.py rename to seqcol/seqcol.py index 3133c3f..e18ee3f 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol.py @@ -2,8 +2,6 @@ import logging import refget -from copy import copy -from functools import reduce from itertools import compress from .const import * From 64c09a08910a3206517a8a8070ac955ae278fc09 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:19:25 -0400 Subject: [PATCH 06/11] convert hyphens to underscores --- seqcol/utilities.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index e3cff6a..4db93e2 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -182,11 +182,11 @@ def compare_seqcols(A: SeqCol, B: SeqCol): all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) result = {} return_obj = { - "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, + "arrays": {"a_only": [], "b_only": [], "a_and_b": []}, "elements": { "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, - "a-and-b": {}, - "a-and-b-same-order": {}, + "a_and_b": {}, + "a_and_b_same_order": {}, }, } @@ -194,14 +194,14 @@ def compare_seqcols(A: SeqCol, B: SeqCol): _LOGGER.info(k) if k not in A: result[k] = {"flag": -1} - return_obj["arrays"]["b-only"].append(k) + return_obj["arrays"]["b_only"].append(k) elif k not in B: - return_obj["arrays"]["a-only"].append(k) + return_obj["arrays"]["a_only"].append(k) else: - return_obj["arrays"]["a-and-b"].append(k) + return_obj["arrays"]["a_and_b"].append(k) res = _compare_elements(A[k], B[k]) - return_obj["elements"]["a-and-b"][k] = res["a-and-b"] - return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] + return_obj["elements"]["a_and_b"][k] = res["a_and_b"] + return_obj["elements"]["a_and_b_same_order"][k] = res["a_and_b_same_order"] return return_obj From 0ed6ec75819e1f6b8a71cc3ddd722d84c66055cb Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:27:51 -0400 Subject: [PATCH 07/11] lint, remove old stuff, renames --- deprecated.py | 192 ------------------------------------------- interactive_tests.py | 40 +++++---- seqcol/__init__.py | 2 +- seqcol/utilities.py | 39 
+++++---- 4 files changed, 47 insertions(+), 226 deletions(-) diff --git a/deprecated.py b/deprecated.py index bca8b35..68e05b6 100644 --- a/deprecated.py +++ b/deprecated.py @@ -19,198 +19,6 @@ - - - - - - - @staticmethod - def compat_all_old(A, B): - all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) - result = {} - flipped_format = { - "a-in-b": {}, - "b-in-a": {}, - "a-total": {}, - "b-total": {}, - "a-duplicated": {}, - "b-duplicated": {}, - "order-match": [], - "only-in-a": [], - "only-in-b": [], - } - for k in all_keys: - _LOGGER.info(k) - if k not in A: - result[k] = {"flag": -1} - flipped_format["only-in-b"].append(k) - elif k not in B: - flipped_format["only-in-a"].append(k) - else: - v = SeqColClient.compat(A[k], B[k]) - result[k] = v - if "a-in-b" in v: - flipped_format["a-in-b"][k] = v['a-in-b'] - if "b-in-a": - flipped_format["b-in-a"][k] = v['b-in-a'] - if "a-total" in v: - flipped_format["a-total"][k] = v['a-total'] - if "b-total" in v: - flipped_format["b-total"][k] = v['b-total'] - if "a-duplicated" in v: - flipped_format["a-duplicated"][k] = v['a-duplicated'] - if "b-duplicated" in v: - flipped_format["b-duplicated"][k] = v['b-duplicated'] - if "order-match" in v: - flipped_format["order-match"].append(k) - - # result = { - # "any-elements-shared": any(ainb), - # "all-a-in-b": all(ainb), - # "all-b-in-a": all(bina), - # "order-match": order, - # "flag": flag - # } - - return flipped_format - - - def compare_digests_old(self, digestA, digestB, explain=False): - """ - Given two collection checksums in the database, provide some information - about how they are related. - - :param str digestA: Digest for first sequence collection to compare. - :param str digestB: Digest for second sequence collection to compare. - :param bool explain: Print an explanation of the flag? [Default: False] - """ - typeA = self.database[digestA + henge.ITEM_TYPE] - typeB = self.database[digestB + henge.ITEM_TYPE] - - if typeA != typeB: - _LOGGER.error( - f"Can't compare objects of different types: " f"{typeA} vs {typeB}" - ) - - asdA = self.retrieve(digestA, reclimit=1) - asdB = self.retrieve(digestB, reclimit=1) - return self.compare_asds(asdA, asdB, explain=explain) - - - @staticmethod - def compare_asds(asdA, asdB, explain=False): - """ - Compare Annotated Sequence Digests (ASDs) -- digested sequences and `data - - :param str asdA: ASD for first sequence collection to compare. - :param str asdB: ASD for second sequence collection to compare. - :param bool explain: Print an explanation of the flag? 
[Default: False] - """ - - def _xp(prop, lst): - """Extract property from a list of dicts""" - return list(map(lambda x: x[prop], lst)) - - def _index(x, lst): - """Find an index of a sequence element in a list of dicts""" - try: - return _xp(SEQ_KEY, lst).index(x) - except: - return None - - def _get_common_content(lstA, lstB): - """ - Find the intersection between two list of dicts with sequences - """ - return list( - filter(None.__ne__, [_index(x, lstB) for x in _xp(SEQ_KEY, lstA)]) - ) - - # Not ideal, but we expect these to return lists, but if the item was - # singular only a dict is returned - if not isinstance(asdA, list): - asdA = [asdA] - if not isinstance(asdB, list): - asdB = [asdB] - - ainb = [x in _xp(SEQ_KEY, asdB) for x in _xp(SEQ_KEY, asdA)] - bina = [x in _xp(SEQ_KEY, asdA) for x in _xp(SEQ_KEY, asdB)] - - return_flag = 0 # initialize - if sum(ainb) > 1: - ordA = _get_common_content(asdA, asdB) - if ordA == sorted(ordA): - return_flag += CONTENT_A_ORDER - if sum(bina) > 1: - ordB = _get_common_content(asdB, asdA) - if ordB == sorted(ordB): - return_flag += CONTENT_B_ORDER - - ainb_len = [x in _xp(LEN_KEY, asdB) for x in _xp(LEN_KEY, asdA)] - bina_len = [x in _xp(LEN_KEY, asdA) for x in _xp(LEN_KEY, asdB)] - - ainb_name = [x in _xp(NAME_KEY, asdB) for x in _xp(NAME_KEY, asdA)] - bina_name = [x in _xp(NAME_KEY, asdA) for x in _xp(NAME_KEY, asdB)] - - ainb_topo = [x in _xp(TOPO_KEY, asdB) for x in _xp(TOPO_KEY, asdA)] - bina_topo = [x in _xp(TOPO_KEY, asdA) for x in _xp(TOPO_KEY, asdB)] - - if all(ainb): - return_flag += CONTENT_ALL_A_IN_B - if all(bina): - return_flag += CONTENT_ALL_B_IN_A - - if all(ainb_name): - return_flag += NAMES_ALL_A_IN_B - if all(bina_name): - return_flag += NAMES_ALL_B_IN_A - - if all(ainb_topo): - return_flag += TOPO_ALL_A_IN_B - if all(bina_topo): - return_flag += TOPO_ALL_B_IN_A - - if all(ainb_len): - return_flag += LENGTHS_ALL_A_IN_B - if all(bina_len): - return_flag += LENGTHS_ALL_B_IN_A - - if explain: - explain_flag(return_flag) - return return_flag - - - @staticmethod - def compat(A, B): - """ - New compatibility function for array-based data model. 
- """ - - lenA = len(A) - lenB = len(B) - dupeA = lenA - len(dict.fromkeys(A)) - dupeB = lenB - len(dict.fromkeys(B)) - ainb = [x in B for x in A] - bina = [x in A for x in B] - sum_ainb = sum(ainb) - if sum_ainb > 1: - order = list(compress(B, bina)) == list(compress(A, ainb)) - else: - order = False - - result = { - "a-in-b": sum_ainb, - "b-in-a": sum(bina), - "a-total": lenA, - "b-total": lenB, - "a-duplicated": dupeA, - "b-duplicated": dupeB, - "order-match": order - } - return result - - def compat(A, B): ainb = [x in B for x in A] bina = [x in A for x in B] diff --git a/interactive_tests.py b/interactive_tests.py index 7f5d6a0..e16cdaa 100644 --- a/interactive_tests.py +++ b/interactive_tests.py @@ -38,6 +38,7 @@ # Now a test of inherent attributes import seqcol + scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) scci scci.schemas @@ -46,8 +47,8 @@ fa_file = "demo_fasta/demo0.fa" fa_object = seqcol.parse_fasta(fa_file) -array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author":"urkel"} -array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author" :"nathan"} +array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"} +array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"} di = scci.insert(array_set_i, "SeqColArraySet") @@ -55,21 +56,22 @@ di # scc.retrieve(di) scci.retrieve(di) -fasta_path="demo_fasta" +fasta_path = "demo_fasta" fasta1 = "demo2.fa" fasta2 = "demo3.fa" fasta5 = "demo5.fa.gz" fasta6 = "demo6.fa" import os + d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1)) d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5)) d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6)) -scci.retrieve(d['digest']) +scci.retrieve(d["digest"]) -scci.retrieve(d5['digest']) +scci.retrieve(d5["digest"]) fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1)) SCAS = seqcol.fasta_to_csc(fa_object) @@ -84,17 +86,25 @@ json.dumps(scci.compare(d5["SCAS"], d6["SCAS"])) -print(json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]), separators=(",", ":"), ensure_ascii=False - ...: , allow_nan=False, sort_keys=True, indent=2)) +print( + json.dumps( + scci.compare(d5["SCAS"], d6["SCAS"]), + separators=(",", ":"), + ensure_ascii=False, + allow_nan=False, + sort_keys=True, + indent=2, + ) +) build_sorted_name_length_pairs(array_set_i) -#reorder +# reorder array_set_reordered = {} -for k,v in array_set.items(): - print(k,v) - array_set_reordered[k] = list(reversed(v)) +for k, v in array_set.items(): + print(k, v) + array_set_reordered[k] = list(reversed(v)) array_set array_set_reordered @@ -106,8 +116,6 @@ import henge - - from henge import md5 names = [] @@ -142,12 +150,10 @@ os.getcwd() - - - ## standalone functions import seqcol + fa_file = "demo_fasta/demo0.fa" fa_object = seqcol.parse_fasta(fa_file) @@ -155,8 +161,8 @@ csc = seqcol.fasta_to_csc(fa_object) csc import json + print(json.dumps(csc, indent=2)) seqcol.seqcol_digest(csc) - diff --git a/seqcol/__init__.py b/seqcol/__init__.py index b29eaad..bb6f733 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -7,5 +7,5 @@ __classes__ = ["SeqColHenge"] __all__ = ( __classes__ - + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"], + + 
["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_file_to_digest"], ) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 4db93e2..80b21e8 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -20,6 +20,7 @@ # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: + """Deprecated GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() @@ -44,17 +45,23 @@ def print_csc(csc: dict) -> str: return print(json.dumps(csc, indent=2)) -# Simple true/false validation def validate_seqcol_bool(seqcol_obj: SeqCol, schema=None) -> bool: + """ + Validate a seqcol object against the seqcol schema. Returns True if valid, False if not. + + To enumerate the errors, use validate_seqcol instead. + """ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") schema = load_yaml(schema_path) validator = Draft7Validator(schema) return validator.is_valid(seqcol_obj) -# Get errors if invalid (use this one) -# Get the errors with exception.errors def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]: + """Validate a seqcol object against the seqcol schema. + Returns True if valid, raises InvalidSeqColError if not, which enumerates the errors. + Retrieve individual errors with exception.errors + """ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") schema = load_yaml(schema_path) validator = Draft7Validator(schema) @@ -82,17 +89,9 @@ def format_itemwise(csc: SeqCol) -> list: return {"sequences": list_of_dicts} -def explain_flag(flag): - """Explains a compare flag""" - print(f"Flag: {flag}\nBinary: {bin(flag)}\n") - for e in range(0, 13): - if flag & 2**e: - print(FLAGS[2**e]) - - -def fasta_to_digest(fa_file_path: str) -> str: +def fasta_file_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" - seqcol_obj = fasta_to_seqcol(fa_file_path) + seqcol_obj = fasta_file_to_seqcol(fa_file_path) return seqcol_digest(seqcol_obj) @@ -115,7 +114,7 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) -def fasta_to_seqcol(fa_file_path: str) -> dict: +def fasta_file_to_seqcol(fa_file_path: str) -> dict: """Given a fasta, return a canonical seqcol object""" fa_obj = parse_fasta(fa_file_path) return fasta_obj_to_seqcol(fa_obj) @@ -130,7 +129,7 @@ def fasta_obj_to_seqcol( Given a fasta object, return a CSC (Canonical Sequence Collection object) """ # CSC = SeqColArraySet - # Or maybe should be "Level 1 SC" + # Or equivalently, a "Level 1 SeqCol" CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} seqs = fa_object.keys() @@ -160,7 +159,7 @@ def build_sorted_name_length_pairs(obj: dict, digest_function): sorted_name_length_pairs = [] for i in range(len(obj["names"])): sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) - nl_digests = [] + nl_digests = [] # name-length digests for i in range(len(sorted_name_length_pairs)): nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) @@ -264,3 +263,11 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: # Step 5: Digest the final canonical representation again. 
seqcol_digest = sha512t24u_digest(seqcol_obj4) return seqcol_digest + + +def explain_flag(flag): + """Explains a compare flag""" + print(f"Flag: {flag}\nBinary: {bin(flag)}\n") + for e in range(0, 13): + if flag & 2**e: + print(FLAGS[2**e]) From 5a55798c3faf896bead6fdacd16d73dd0e2a8bc4 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 08:56:29 -0400 Subject: [PATCH 08/11] order --- seqcol/utilities.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 80b21e8..5133221 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -16,9 +16,6 @@ _LOGGER = logging.getLogger(__name__) -# Refget digests from published seqcol v1.0 protocol -# Retrieved July 2019 -# http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: """Deprecated GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() @@ -89,12 +86,6 @@ def format_itemwise(csc: SeqCol) -> list: return {"sequences": list_of_dicts} -def fasta_file_to_digest(fa_file_path: str) -> str: - """Given a fasta, return a digest""" - seqcol_obj = fasta_file_to_seqcol(fa_file_path) - return seqcol_digest(seqcol_obj) - - def parse_fasta(fa_file) -> pyfaidx.Fasta: """ Read in a gzipped or not gzipped FASTA file @@ -114,6 +105,12 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) +def fasta_file_to_digest(fa_file_path: str) -> str: + """Given a fasta, return a digest""" + seqcol_obj = fasta_file_to_seqcol(fa_file_path) + return seqcol_digest(seqcol_obj) + + def fasta_file_to_seqcol(fa_file_path: str) -> dict: """Given a fasta, return a canonical seqcol object""" fa_obj = parse_fasta(fa_file_path) From 7974c0147ec0a0a568e1857a415c56f5a7261439 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 09:19:12 -0400 Subject: [PATCH 09/11] hyphen to underscore --- demo_fasta/compare-0vs1.json | 10 +++++----- demo_fasta/compare-1vs1.json | 10 +++++----- demo_fasta/compare-5vs6.json | 10 +++++----- seqcol/utilities.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/demo_fasta/compare-0vs1.json b/demo_fasta/compare-0vs1.json index 719cd2f..d1bc8a2 100644 --- a/demo_fasta/compare-0vs1.json +++ b/demo_fasta/compare-0vs1.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", "sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":2, "names":2, "sorted_name_length_pairs":2, "sequences":0 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":true, "names":true, "sorted_name_length_pairs":true, diff --git a/demo_fasta/compare-1vs1.json b/demo_fasta/compare-1vs1.json index b944bc5..87b71b4 100644 --- a/demo_fasta/compare-1vs1.json +++ b/demo_fasta/compare-1vs1.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", "sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":2, "names":2, "sorted_name_length_pairs":2, "sequences":2 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":true, "names":true, "sorted_name_length_pairs":true, diff --git a/demo_fasta/compare-5vs6.json b/demo_fasta/compare-5vs6.json index 56114f8..2dfed9f 100644 --- a/demo_fasta/compare-5vs6.json +++ b/demo_fasta/compare-5vs6.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", 
"sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":3, "names":3, "sorted_name_length_pairs":3, "sequences":3 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":false, "names":false, "sorted_name_length_pairs":true, diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 5133221..a902722 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -220,7 +220,7 @@ def _compare_elements(A: list, B: list): order = None else: order = A_filtered == B_filtered - return {"a-and-b": overlap, "a-and-b-same-order": order} + return {"a_and_b": overlap, "a_and_b_same_order": order} def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: From 2d335c3bae24bbfc1060f56561f377e9c80460d6 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 16:39:05 -0400 Subject: [PATCH 10/11] update to refget2, seq prefix, remove refget extension --- seqcol/seqcol.py | 28 ++++++++++++++++++++++++---- seqcol/utilities.py | 2 +- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index e18ee3f..2b7e3e5 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -1,6 +1,6 @@ import henge import logging -import refget +import yacman from itertools import compress @@ -12,14 +12,34 @@ henge.ITEM_TYPE = "_item_type" -class SeqColHenge(refget.RefGetClient): +class SeqColConf(yacman.YAMLConfigManager): + """ + Simple configuration manager object for SeqColHenge. + """ + def __init__( + self, + entries={}, + filepath=None, + yamldata=None, + writable=False, + wait_max=60, + skip_read_lock=False, + ): + filepath = yacman.select_config( + config_filepath=filepath, + config_env_vars=["SEQCOLAPI_CONFIG"], + config_name="seqcol" + ) + super(SeqColConf, self).__init__(entries, filepath, yamldata, writable) + + +class SeqColHenge(henge.Henge): """ Extension of henge that accommodates collections of sequences. """ def __init__( self, - api_url_base=None, database={}, schemas=None, henges=None, @@ -38,7 +58,6 @@ def __init__( serialized items stored in this henge. 
""" super(SeqColHenge, self).__init__( - api_url_base=api_url_base, database=database, schemas=schemas or INTERNAL_SCHEMAS, henges=henges, @@ -123,6 +142,7 @@ def retrieve(self, druid, reclimit=None, raw=False): return super(SeqColHenge, self).retrieve(druid, reclimit, raw) except henge.NotFoundException as e: _LOGGER.debug(e) + raise e try: return self.refget(druid) except Exception as e: diff --git a/seqcol/utilities.py b/seqcol/utilities.py index a902722..12f32bf 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -139,7 +139,7 @@ def fasta_obj_to_seqcol( seq = str(fa_object[k]) seq_length = len(seq) seq_name = fa_object[k].name - seq_digest = digest_function(seq.upper()) + seq_digest = "SQ."+digest_function(seq.upper()) snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs snlp_digest = digest_function(canonical_str(snlp)) CSC["lengths"].append(seq_length) From 5c7fb31bb2af6818f9279203186e087f97624253 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 15 Sep 2023 17:04:00 -0400 Subject: [PATCH 11/11] add ability to load from chromsizes --- seqcol/seqcol.py | 16 ++++++++++++++-- seqcol/utilities.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index 2b7e3e5..634952d 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -156,7 +156,6 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): """ @param rgc RefGenConf object @param refgenie_key key of genome to load - @param scc SeqColHenge object to load into """ filepath = rgc.seek(refgenie_key, "fasta") return self.load_fasta_from_filepath(filepath) @@ -164,7 +163,6 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): def load_fasta_from_filepath(self, filepath): """ @param filepath Path to fasta file - @param sc """ fa_object = parse_fasta(filepath) SCAS = fasta_obj_to_seqcol(fa_object, digest_function=self.checksum_function) @@ -176,6 +174,20 @@ def load_fasta_from_filepath(self, filepath): "digest": digest, } + def load_from_chromsizes(self, chromsizes): + """ + @param chromsizes Path to chromsizes file + """ + SCAS = chrom_sizes_to_seqcol( + chromsizes, digest_function=self.checksum_function + ) + digest = self.insert(SCAS, "SeqColArraySet", reclimit=1) + return { + "chromsizes_file": chromsizes, + "SCAS": SCAS, + "digest": digest, + } + def load_multiple_fastas(self, fasta_dict): """ Wrapper for load_fasta_from_filepath diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 12f32bf..5e94b4c 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -105,6 +105,34 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) +def chrom_sizes_to_digest(chrom_sizes_file_path: str) -> str: + """Given a chrom.sizes file, return a digest""" + seqcol_obj = chrom_sizes_to_seqcol(chrom_sizes_file_path) + return seqcol_digest(seqcol_obj) + + +def chrom_sizes_to_seqcol( + chrom_sizes_file_path: str, + digest_function: Callable[[str], str] = sha512t24u_digest, + ) -> dict: + """Given a chrom.sizes file, return a canonical seqcol object""" + with open(chrom_sizes_file_path, "r") as f: + lines = f.readlines() + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + for line in lines: + line = line.strip() + if line == "": + continue + seq_name, seq_length, ga4gh_digest, md5_digest = line.split("\t") + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(int(seq_length)) + 
CSC["names"].append(seq_name) + CSC["sequences"].append(ga4gh_digest) + CSC["sorted_name_length_pairs"].append(snlp_digest) + return CSC + + def fasta_file_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" seqcol_obj = fasta_file_to_seqcol(fa_file_path)