From 8acce14edbc55e927feea3109c7353286c82e3b5 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 11 Sep 2023 14:40:08 -0400 Subject: [PATCH 01/11] add sha512t24u_digest --- seqcol/seqcol.py | 6 +++--- seqcol/seqcol_client.py | 4 ++-- seqcol/utilities.py | 7 ++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index 4c0ed28..9d589d3 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -48,7 +48,7 @@ def fasta_to_seqcol(fa_file_path: str) -> dict: def fasta_obj_to_seqcol( fa_object: pyfaidx.Fasta, verbose: bool = True, - digest_function: Callable[[str], str] = trunc512_digest, + digest_function: Callable[[str], str] = sha512t24u_digest, ) -> dict: """ Given a fasta object, return a CSC (Canonical Sequence Collection object) @@ -177,7 +177,7 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: seqcol_obj3 = {} for attribute in seqcol_obj2: - seqcol_obj3[attribute] = trunc512_digest(seqcol_obj2[attribute]) + seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result # Step 4: Apply RFC-8785 again to canonicalize the JSON @@ -186,5 +186,5 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: seqcol_obj4 = canonical_str(seqcol_obj3) # Step 5: Digest the final canonical representation again. - seqcol_digest = trunc512_digest(seqcol_obj4) + seqcol_digest = sha512t24u_digest(seqcol_obj4) return seqcol_digest diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index adeaf1f..816eb24 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -8,7 +8,7 @@ from .const import * from .seqcol import * -from .utilities import trunc512_digest +from .utilities import sha512t24u_digest _LOGGER = logging.getLogger(__name__) @@ -26,7 +26,7 @@ def __init__( database={}, schemas=None, henges=None, - checksum_function=trunc512_digest, + checksum_function=sha512t24u_digest, ): """ A user interface to insert and retrieve decomposable recursive unique diff --git a/seqcol/utilities.py b/seqcol/utilities.py index d8d5c97..9f18312 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -15,11 +15,16 @@ # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: - """GA4GH digest algorithm""" digest = hashlib.sha512(seq.encode()).digest() hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() +def sha512t24u_digest(seq: str, offset: int = 24) -> str: + """ GA4GH digest function """ + digest = hashlib.sha512(seq.encode()).digest() + tdigest_b64us = base64.urlsafe_b64encode(digest[:offset]) + return tdigest_b64us.decode("ascii") + def canonical_str(item: dict) -> str: """Convert a dict into a canonical string representation""" From e5953318ebb156228da11b176b4365f5544e626a Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 11 Sep 2023 18:19:24 -0400 Subject: [PATCH 02/11] minor structuring --- seqcol/__init__.py | 1 - seqcol/seqcol.py | 195 ---------------------------------------- seqcol/seqcol_client.py | 3 +- seqcol/utilities.py | 192 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 191 insertions(+), 200 deletions(-) delete mode 100644 seqcol/seqcol.py diff --git a/seqcol/__init__.py b/seqcol/__init__.py index e894ea0..2ef78f9 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -1,5 +1,4 @@ from .const import * -from .seqcol import * from .seqcol_client import * from .utilities import * from ._version import __version__ diff --git 
a/seqcol/seqcol.py b/seqcol/seqcol.py deleted file mode 100644 index 3cc4eaf..0000000 --- a/seqcol/seqcol.py +++ /dev/null @@ -1,195 +0,0 @@ -import henge -import logging -import pyfaidx - -from typing import Callable - -from .utilities import * -from .const import * - - -_LOGGER = logging.getLogger(__name__) -henge.ITEM_TYPE = "_item_type" - - -def explain_flag(flag): - """Explains a compare flag""" - print(f"Flag: {flag}\nBinary: {bin(flag)}\n") - for e in range(0, 13): - if flag & 2**e: - print(FLAGS[2**e]) - -def fasta_to_digest(fa_file_path: str) -> str: - """Given a fasta, return a digest""" - seqcol_obj = fasta_to_seqcol(fa_file_path) - return seqcol_digest(seqcol_obj) - - -def parse_fasta(fa_file) -> pyfaidx.Fasta: - """ - Read in a gzipped or not gzipped FASTA file - """ - try: - return pyfaidx.Fasta(fa_file) - except pyfaidx.UnsupportedCompressionFormat: - # pyfaidx can handle bgzip but not gzip; so we just hack it here and - # gunzip the file into a temporary one and read it in not to interfere - # with the original one. - from gzip import open as gzopen - from tempfile import NamedTemporaryFile - - with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out: - f_out.writelines(f_in.read()) - f_out.seek(0) - return pyfaidx.Fasta(f_out.name) - - -def fasta_to_seqcol(fa_file_path: str) -> dict: - """Given a fasta, return a canonical seqcol object""" - fa_obj = parse_fasta(fa_file_path) - return fasta_obj_to_seqcol(fa_obj) - - -def fasta_obj_to_seqcol( - fa_object: pyfaidx.Fasta, - verbose: bool = True, - digest_function: Callable[[str], str] = sha512t24u_digest, -) -> dict: - """ - Given a fasta object, return a CSC (Canonical Sequence Collection object) - """ - # CSC = SeqColArraySet - # Or maybe should be "Level 1 SC" - - CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} - seqs = fa_object.keys() - nseqs = len(seqs) - print(f"Found {nseqs} chromosomes") - i = 1 - for k in fa_object.keys(): - if verbose: - print(f"Processing ({i} of {nseqs}) {k}...") - seq = str(fa_object[k]) - seq_length = len(seq) - seq_name = fa_object[k].name - seq_digest = digest_function(seq.upper()) - snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs - snlp_digest = digest_function(canonical_str(snlp)) - CSC["lengths"].append(seq_length) - CSC["names"].append(seq_name) - CSC["sorted_name_length_pairs"].append(snlp_digest) - CSC["sequences"].append(seq_digest) - i += 1 - CSC["sorted_name_length_pairs"].sort() - return CSC - - -def build_sorted_name_length_pairs(obj: dict, digest_function): - """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system""" - sorted_name_length_pairs = [] - for i in range(len(obj["names"])): - sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) - nl_digests = [] - for i in range(len(sorted_name_length_pairs)): - nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) - - nl_digests.sort() - return nl_digests - - -def compare_seqcols(A: SeqCol, B: SeqCol): - """ - Workhorse comparison function - - @param A Sequence collection A - @param B Sequence collection B - @return dict Following formal seqcol specification comparison function return value - """ - validate_seqcol(A) # First ensure these are the right structure - validate_seqcol(B) - - all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) - result = {} - return_obj = { - "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, - 
"elements": { - "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, - "a-and-b": {}, - "a-and-b-same-order": {}, - }, - } - - for k in all_keys: - _LOGGER.info(k) - if k not in A: - result[k] = {"flag": -1} - return_obj["arrays"]["b-only"].append(k) - elif k not in B: - return_obj["arrays"]["a-only"].append(k) - else: - return_obj["arrays"]["a-and-b"].append(k) - res = _compare_elements(A[k], B[k]) - return_obj["elements"]["a-and-b"][k] = res["a-and-b"] - return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] - return return_obj - - -def _compare_elements(A: list, B: list): - """ - Compare elements between two arrays. Helper function for individual elements used by workhorse compare_seqcols function - """ - - A_filtered = list(filter(lambda x: x in B, A)) - B_filtered = list(filter(lambda x: x in A, B)) - A_count = len(A_filtered) - B_count = len(B_filtered) - overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates - - if A_count + B_count < 1: - # order match requires at least 2 matching elements - order = None - elif not (A_count == B_count == overlap): - # duplicated matches means order match is undefined - order = None - else: - order = A_filtered == B_filtered - return {"a-and-b": overlap, "a-and-b-same-order": order} - - -def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: - """ - Given a canonical sequence collection, compute its digest. - - :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object - :param dict schema: Schema defining the inherent attributes to digest - :return str: The sequence collection digest - """ - - validate_seqcol(seqcol_obj) - # Step 1a: Remove any non-inherent attributes, - # so that only the inherent attributes contribute to the digest. - seqcol_obj2 = {} - if schema: - for k in schema["inherent"]: - # Step 2: Apply RFC-8785 to canonicalize the value - # associated with each attribute individually. - seqcol_obj2[k] = canonical_str(seqcol_obj[k]) - else: # no schema provided, so assume all attributes are inherent - for k in seqcol_obj: - seqcol_obj2[k] = canonical_str(seqcol_obj[k]) - # Step 3: Digest each canonicalized attribute value - # using the GA4GH digest algorithm. - - seqcol_obj3 = {} - for attribute in seqcol_obj2: - seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) - # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result - - # Step 4: Apply RFC-8785 again to canonicalize the JSON - # of new seqcol object representation. - - seqcol_obj4 = canonical_str(seqcol_obj3) - - # Step 5: Digest the final canonical representation again. 
- seqcol_digest = sha512t24u_digest(seqcol_obj4) - return seqcol_digest diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index af35d29..01d0ec5 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -7,8 +7,7 @@ from itertools import compress from .const import * -from .seqcol import * -from .utilities import sha512t24u_digest +from .utilities import * _LOGGER = logging.getLogger(__name__) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index c403b7c..b74bfee 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -1,15 +1,19 @@ +import base64 import binascii import hashlib import json +import logging import os +import pyfaidx from jsonschema import Draft7Validator -from typing import Optional +from typing import Optional, Callable from yacman import load_yaml from .const import SeqCol from .exceptions import * +_LOGGER = logging.getLogger(__name__) # Refget digests from published seqcol v1.0 protocol # Retrieved July 2019 @@ -19,6 +23,7 @@ def trunc512_digest(seq, offset=24) -> str: hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() + def sha512t24u_digest(seq: str, offset: int = 24) -> str: """ GA4GH digest function """ digest = hashlib.sha512(seq.encode()).digest() @@ -69,4 +74,187 @@ def format_itemwise(csc: SeqCol) -> list: "length": csc["lengths"][i], "sequence": csc["sequences"][i], }) - return list_of_dicts \ No newline at end of file + return list_of_dicts + + +def explain_flag(flag): + """Explains a compare flag""" + print(f"Flag: {flag}\nBinary: {bin(flag)}\n") + for e in range(0, 13): + if flag & 2**e: + print(FLAGS[2**e]) + +def fasta_to_digest(fa_file_path: str) -> str: + """Given a fasta, return a digest""" + seqcol_obj = fasta_to_seqcol(fa_file_path) + return seqcol_digest(seqcol_obj) + + +def parse_fasta(fa_file) -> pyfaidx.Fasta: + """ + Read in a gzipped or not gzipped FASTA file + """ + try: + return pyfaidx.Fasta(fa_file) + except pyfaidx.UnsupportedCompressionFormat: + # pyfaidx can handle bgzip but not gzip; so we just hack it here and + # gunzip the file into a temporary one and read it in not to interfere + # with the original one. 
+ from gzip import open as gzopen + from tempfile import NamedTemporaryFile + + with gzopen(fa_file, "rt") as f_in, NamedTemporaryFile(mode="w+t", suffix=".fa") as f_out: + f_out.writelines(f_in.read()) + f_out.seek(0) + return pyfaidx.Fasta(f_out.name) + + +def fasta_to_seqcol(fa_file_path: str) -> dict: + """Given a fasta, return a canonical seqcol object""" + fa_obj = parse_fasta(fa_file_path) + return fasta_obj_to_seqcol(fa_obj) + + +def fasta_obj_to_seqcol( + fa_object: pyfaidx.Fasta, + verbose: bool = True, + digest_function: Callable[[str], str] = sha512t24u_digest, +) -> dict: + """ + Given a fasta object, return a CSC (Canonical Sequence Collection object) + """ + # CSC = SeqColArraySet + # Or maybe should be "Level 1 SC" + + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + seqs = fa_object.keys() + nseqs = len(seqs) + print(f"Found {nseqs} chromosomes") + i = 1 + for k in fa_object.keys(): + if verbose: + print(f"Processing ({i} of {nseqs}) {k}...") + seq = str(fa_object[k]) + seq_length = len(seq) + seq_name = fa_object[k].name + seq_digest = digest_function(seq.upper()) + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(seq_length) + CSC["names"].append(seq_name) + CSC["sorted_name_length_pairs"].append(snlp_digest) + CSC["sequences"].append(seq_digest) + i += 1 + CSC["sorted_name_length_pairs"].sort() + return CSC + + +def build_sorted_name_length_pairs(obj: dict, digest_function): + """Builds the sorted_name_length_pairs attribute, which corresponds to the coordinate system""" + sorted_name_length_pairs = [] + for i in range(len(obj["names"])): + sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) + nl_digests = [] + for i in range(len(sorted_name_length_pairs)): + nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) + + nl_digests.sort() + return nl_digests + + +def compare_seqcols(A: SeqCol, B: SeqCol): + """ + Workhorse comparison function + + @param A Sequence collection A + @param B Sequence collection B + @return dict Following formal seqcol specification comparison function return value + """ + validate_seqcol(A) # First ensure these are the right structure + validate_seqcol(B) + + all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) + result = {} + return_obj = { + "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, + "elements": { + "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, + "a-and-b": {}, + "a-and-b-same-order": {}, + }, + } + + for k in all_keys: + _LOGGER.info(k) + if k not in A: + result[k] = {"flag": -1} + return_obj["arrays"]["b-only"].append(k) + elif k not in B: + return_obj["arrays"]["a-only"].append(k) + else: + return_obj["arrays"]["a-and-b"].append(k) + res = _compare_elements(A[k], B[k]) + return_obj["elements"]["a-and-b"][k] = res["a-and-b"] + return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] + return return_obj + + +def _compare_elements(A: list, B: list): + """ + Compare elements between two arrays. 
Helper function for individual elements used by workhorse compare_seqcols function + """ + + A_filtered = list(filter(lambda x: x in B, A)) + B_filtered = list(filter(lambda x: x in A, B)) + A_count = len(A_filtered) + B_count = len(B_filtered) + overlap = min(len(A_filtered), len(B_filtered)) # counts duplicates + + if A_count + B_count < 1: + # order match requires at least 2 matching elements + order = None + elif not (A_count == B_count == overlap): + # duplicated matches means order match is undefined + order = None + else: + order = A_filtered == B_filtered + return {"a-and-b": overlap, "a-and-b-same-order": order} + + +def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: + """ + Given a canonical sequence collection, compute its digest. + + :param dict seqcol_obj: Dictionary representation of a canonical sequence collection object + :param dict schema: Schema defining the inherent attributes to digest + :return str: The sequence collection digest + """ + + validate_seqcol(seqcol_obj) + # Step 1a: Remove any non-inherent attributes, + # so that only the inherent attributes contribute to the digest. + seqcol_obj2 = {} + if schema: + for k in schema["inherent"]: + # Step 2: Apply RFC-8785 to canonicalize the value + # associated with each attribute individually. + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + else: # no schema provided, so assume all attributes are inherent + for k in seqcol_obj: + seqcol_obj2[k] = canonical_str(seqcol_obj[k]) + # Step 3: Digest each canonicalized attribute value + # using the GA4GH digest algorithm. + + seqcol_obj3 = {} + for attribute in seqcol_obj2: + seqcol_obj3[attribute] = sha512t24u_digest(seqcol_obj2[attribute]) + # print(json.dumps(seqcol_obj3, indent=2)) # visualize the result + + # Step 4: Apply RFC-8785 again to canonicalize the JSON + # of new seqcol object representation. + + seqcol_obj4 = canonical_str(seqcol_obj3) + + # Step 5: Digest the final canonical representation again. 
+ seqcol_digest = sha512t24u_digest(seqcol_obj4) + return seqcol_digest From 63aa628832ed455a817f989f4081458e8d749823 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:14:34 -0400 Subject: [PATCH 03/11] rename SeqColClient to SeqColHenge --- docs_jupyter/demo.ipynb | 14 +++++++------- docs_jupyter/seqcol.ipynb | 8 ++++---- interactive_tests.py | 6 +++--- seqcol/__init__.py | 2 +- seqcol/seqcol_client.py | 13 +++++++------ seqcol/utilities.py | 4 +++- tests/test_seqcol.py | 12 ++++++------ 7 files changed, 31 insertions(+), 28 deletions(-) diff --git a/docs_jupyter/demo.ipynb b/docs_jupyter/demo.ipynb index c9cf0b1..0806745 100644 --- a/docs_jupyter/demo.ipynb +++ b/docs_jupyter/demo.ipynb @@ -106,7 +106,7 @@ " trunc512_digest('TCGA'): \"TCGA\"\n", "}\n", "\n", - "scdb_local = seqcol.SeqColClient(local_lookup_dict)\n" + "scdb_local = seqcol.SeqColHenge(local_lookup_dict)\n" ] }, { @@ -616,7 +616,7 @@ "metadata": {}, "outputs": [], "source": [ - "rgdb = seqcol.SeqColClient(my_dict)" + "rgdb = seqcol.SeqColHenge(my_dict)" ] }, { @@ -947,7 +947,7 @@ "outputs": [], "source": [ "import henge \n", - "sc = seqcol.SeqColClient(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])" + "sc = seqcol.SeqColHenge(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])" ] }, { @@ -1039,16 +1039,16 @@ "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 ACGT\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_item_type sequence\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_digest_version md5\n", - "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e", - "4\u001e", + "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e\n", + "4\u001e\n", "68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36\n", "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_item_type ASD\n", "25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_digest_version md5\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce TCGA\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_item_type sequence\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_digest_version md5\n", - "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e", - "4\u001e", + "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e\n", + "4\u001e\n", "3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce\n", "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_item_type ASD\n", "7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_digest_version md5\n", diff --git a/docs_jupyter/seqcol.ipynb b/docs_jupyter/seqcol.ipynb index ea1888e..c16b011 100644 --- a/docs_jupyter/seqcol.ipynb +++ b/docs_jupyter/seqcol.ipynb @@ -38,7 +38,7 @@ } ], "source": [ - "sqdb = seqcol.SeqColClient({})" + "sqdb = seqcol.SeqColHenge({})" ] }, { @@ -129,7 +129,7 @@ } ], "source": [ - "sqdb2 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])" + "sqdb2 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])" ] }, { @@ -293,7 +293,7 @@ } ], "source": [ - "sqdb3 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])" + "sqdb3 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])" ] }, { @@ -474,7 +474,7 @@ } ], "source": [ - "sqdb4 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])" + "sqdb4 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])" ] }, { diff --git a/interactive_tests.py b/interactive_tests.py index 8d32f08..7f5d6a0 100644 --- a/interactive_tests.py +++ 
b/interactive_tests.py @@ -1,8 +1,8 @@ import seqcol -from seqcol import SeqColClient +from seqcol import SeqColHenge -scc = SeqColClient(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"]) +scc = SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySet.yaml"]) scc fa_file = "demo_fasta/demo0.fa" @@ -38,7 +38,7 @@ # Now a test of inherent attributes import seqcol -scci = seqcol.SeqColClient(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) +scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) scci scci.schemas diff --git a/seqcol/__init__.py b/seqcol/__init__.py index 2ef78f9..d5db6b6 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -4,5 +4,5 @@ from ._version import __version__ -__classes__ = ["SeqColClient"] +__classes__ = ["SeqColHenge"] __all__ = (__classes__ + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"],) diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol_client.py index 01d0ec5..3133c3f 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol_client.py @@ -14,7 +14,7 @@ henge.ITEM_TYPE = "_item_type" -class SeqColClient(refget.RefGetClient): +class SeqColHenge(refget.RefGetClient): """ Extension of henge that accommodates collections of sequences. """ @@ -39,14 +39,14 @@ def __init__( handle the digest of the serialized items stored in this henge. """ - super(SeqColClient, self).__init__( + super(SeqColHenge, self).__init__( api_url_base=api_url_base, database=database, schemas=schemas or INTERNAL_SCHEMAS, henges=henges, checksum_function=checksum_function, ) - _LOGGER.info("Initializing SeqColClient") + _LOGGER.info("Initializing SeqColHenge") def load_fasta(self, fa_file, skip_seq=False, topology_default="linear"): """ @@ -122,7 +122,7 @@ def compare_digests(self, digestA, digestB): def retrieve(self, druid, reclimit=None, raw=False): try: - return super(SeqColClient, self).retrieve(druid, reclimit, raw) + return super(SeqColHenge, self).retrieve(druid, reclimit, raw) except henge.NotFoundException as e: _LOGGER.debug(e) try: @@ -138,7 +138,7 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): """ @param rgc RefGenConf object @param refgenie_key key of genome to load - @param scc SeqColClient object to load into + @param scc SeqColHenge object to load into """ filepath = rgc.seek(refgenie_key, "fasta") return self.load_fasta_from_filepath(filepath) @@ -165,7 +165,8 @@ def load_multiple_fastas(self, fasta_dict): @param fasta_list """ results = {} - for name, path in fasta_dict.items(): + for name in fasta_dict.keys(): + path = fasta_dict[name]["fasta"] print(f"Processing fasta '{name}'' at path '{path}'...") results[name] = self.load_fasta_from_filepath(path) return results diff --git a/seqcol/utilities.py b/seqcol/utilities.py index b74bfee..c374c53 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -67,6 +67,8 @@ def format_itemwise(csc: SeqCol) -> list: Format a SeqCol object into a list of dicts, one per sequence. 
""" list_of_dicts = [] + # TODO: handle all properties, not just these 3 + # TODO: handle non-collated attributes, somehow for i in range(len(csc["lengths"])): list_of_dicts.append( { @@ -74,7 +76,7 @@ def format_itemwise(csc: SeqCol) -> list: "length": csc["lengths"][i], "sequence": csc["sequences"][i], }) - return list_of_dicts + return {"sequences": list_of_dicts} def explain_flag(flag): diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py index cf45652..f74a9ce 100644 --- a/tests/test_seqcol.py +++ b/tests/test_seqcol.py @@ -3,7 +3,7 @@ import pytest import seqcol -# from seqcol import SeqColClient, validate_seqcol, compare +# from seqcol import SeqColHenge, validate_seqcol, compare # from seqcol.const import * DEMO_FILES = [ @@ -29,16 +29,16 @@ class TestGeneral: def test_no_schemas_required(self): """ - In contrast to the generic Henge object, SeqColClient does not + In contrast to the generic Henge object, SeqColHenge does not require schemas as input, they are predefined in the constructor """ - assert isinstance(seqcol.SeqColClient(database={}), seqcol.SeqColClient) + assert isinstance(seqcol.SeqColHenge(database={}), seqcol.SeqColHenge) class TestFastaInserting: @pytest.mark.parametrize("fasta_name", DEMO_FILES) def test_fasta_loading_works(self, fasta_name, fa_root): - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) f = os.path.join(fa_root, fasta_name) print("Fasta file to be loaded: {}".format(f)) res = scc.load_fasta(f) @@ -48,7 +48,7 @@ def test_fasta_loading_works(self, fasta_name, fa_root): class TestRetrieval: @pytest.mark.parametrize("fasta_name", DEMO_FILES) def test_retrieval_works(self, fasta_name, fa_root): - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) f = os.path.join(fa_root, fasta_name) print("Fasta file to be loaded: {}".format(f)) d, asds = scc.load_fasta(f) @@ -60,7 +60,7 @@ def test_retrieval_works(self, fasta_name, fa_root): def check_comparison(fasta1, fasta2, expected_comparison): print(f"Comparison: Fasta1: {fasta1} vs Fasta2: {fasta2}. 
Expected: {expected_comparison}") - scc = seqcol.SeqColClient(database={}) + scc = seqcol.SeqColHenge(database={}) d = scc.load_fasta_from_filepath(fasta1) d2 = scc.load_fasta_from_filepath(fasta2) with open(expected_comparison) as fp: From 48ba7d2d7e22e34b8c5fad3925c25205c648ccd2 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:15:12 -0400 Subject: [PATCH 04/11] lint --- seqcol/__init__.py | 5 ++++- seqcol/const.py | 1 + seqcol/utilities.py | 10 +++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/seqcol/__init__.py b/seqcol/__init__.py index d5db6b6..38f3d72 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -5,4 +5,7 @@ __classes__ = ["SeqColHenge"] -__all__ = (__classes__ + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"],) +__all__ = ( + __classes__ + + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"], +) diff --git a/seqcol/const.py b/seqcol/const.py index 086c129..987b644 100644 --- a/seqcol/const.py +++ b/seqcol/const.py @@ -1,5 +1,6 @@ import os + def _schema_path(name): return os.path.join(SCHEMA_FILEPATH, name) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index c374c53..e3cff6a 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -15,6 +15,7 @@ _LOGGER = logging.getLogger(__name__) + # Refget digests from published seqcol v1.0 protocol # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html @@ -25,7 +26,7 @@ def trunc512_digest(seq, offset=24) -> str: def sha512t24u_digest(seq: str, offset: int = 24) -> str: - """ GA4GH digest function """ + """GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() tdigest_b64us = base64.urlsafe_b64encode(digest[:offset]) return tdigest_b64us.decode("ascii") @@ -62,8 +63,9 @@ def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]: raise InvalidSeqColError("Validation failed", errors) return True + def format_itemwise(csc: SeqCol) -> list: - """ + """ Format a SeqCol object into a list of dicts, one per sequence. 
""" list_of_dicts = [] @@ -75,7 +77,8 @@ def format_itemwise(csc: SeqCol) -> list: "name": csc["names"][i], "length": csc["lengths"][i], "sequence": csc["sequences"][i], - }) + } + ) return {"sequences": list_of_dicts} @@ -86,6 +89,7 @@ def explain_flag(flag): if flag & 2**e: print(FLAGS[2**e]) + def fasta_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" seqcol_obj = fasta_to_seqcol(fa_file_path) From 4b09130f9de352afabd8175e1ceda566c5bbb42f Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:18:27 -0400 Subject: [PATCH 05/11] some cleanup --- seqcol/__init__.py | 2 +- seqcol/_version.py | 2 +- seqcol/{seqcol_client.py => seqcol.py} | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) rename seqcol/{seqcol_client.py => seqcol.py} (99%) diff --git a/seqcol/__init__.py b/seqcol/__init__.py index 38f3d72..b29eaad 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -1,5 +1,5 @@ from .const import * -from .seqcol_client import * +from .seqcol import * from .utilities import * from ._version import __version__ diff --git a/seqcol/_version.py b/seqcol/_version.py index 6892a3d..d89a3b2 100644 --- a/seqcol/_version.py +++ b/seqcol/_version.py @@ -1 +1 @@ -__version__ = "0.0.2-dev" +__version__ = "0.0.3-dev" diff --git a/seqcol/seqcol_client.py b/seqcol/seqcol.py similarity index 99% rename from seqcol/seqcol_client.py rename to seqcol/seqcol.py index 3133c3f..e18ee3f 100644 --- a/seqcol/seqcol_client.py +++ b/seqcol/seqcol.py @@ -2,8 +2,6 @@ import logging import refget -from copy import copy -from functools import reduce from itertools import compress from .const import * From 64c09a08910a3206517a8a8070ac955ae278fc09 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:19:25 -0400 Subject: [PATCH 06/11] convert hyphens to underscores --- seqcol/utilities.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index e3cff6a..4db93e2 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -182,11 +182,11 @@ def compare_seqcols(A: SeqCol, B: SeqCol): all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) result = {} return_obj = { - "arrays": {"a-only": [], "b-only": [], "a-and-b": []}, + "arrays": {"a_only": [], "b_only": [], "a_and_b": []}, "elements": { "total": {"a": len(A["lengths"]), "b": len(B["lengths"])}, - "a-and-b": {}, - "a-and-b-same-order": {}, + "a_and_b": {}, + "a_and_b_same_order": {}, }, } @@ -194,14 +194,14 @@ def compare_seqcols(A: SeqCol, B: SeqCol): _LOGGER.info(k) if k not in A: result[k] = {"flag": -1} - return_obj["arrays"]["b-only"].append(k) + return_obj["arrays"]["b_only"].append(k) elif k not in B: - return_obj["arrays"]["a-only"].append(k) + return_obj["arrays"]["a_only"].append(k) else: - return_obj["arrays"]["a-and-b"].append(k) + return_obj["arrays"]["a_and_b"].append(k) res = _compare_elements(A[k], B[k]) - return_obj["elements"]["a-and-b"][k] = res["a-and-b"] - return_obj["elements"]["a-and-b-same-order"][k] = res["a-and-b-same-order"] + return_obj["elements"]["a_and_b"][k] = res["a_and_b"] + return_obj["elements"]["a_and_b_same_order"][k] = res["a_and_b_same_order"] return return_obj From 0ed6ec75819e1f6b8a71cc3ddd722d84c66055cb Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2023 19:27:51 -0400 Subject: [PATCH 07/11] lint, remove old stuff, renames --- deprecated.py | 192 ------------------------------------------- interactive_tests.py | 40 +++++---- seqcol/__init__.py | 2 +- seqcol/utilities.py | 39 
+++++---- 4 files changed, 47 insertions(+), 226 deletions(-) diff --git a/deprecated.py b/deprecated.py index bca8b35..68e05b6 100644 --- a/deprecated.py +++ b/deprecated.py @@ -19,198 +19,6 @@ - - - - - - - @staticmethod - def compat_all_old(A, B): - all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys()))) - result = {} - flipped_format = { - "a-in-b": {}, - "b-in-a": {}, - "a-total": {}, - "b-total": {}, - "a-duplicated": {}, - "b-duplicated": {}, - "order-match": [], - "only-in-a": [], - "only-in-b": [], - } - for k in all_keys: - _LOGGER.info(k) - if k not in A: - result[k] = {"flag": -1} - flipped_format["only-in-b"].append(k) - elif k not in B: - flipped_format["only-in-a"].append(k) - else: - v = SeqColClient.compat(A[k], B[k]) - result[k] = v - if "a-in-b" in v: - flipped_format["a-in-b"][k] = v['a-in-b'] - if "b-in-a": - flipped_format["b-in-a"][k] = v['b-in-a'] - if "a-total" in v: - flipped_format["a-total"][k] = v['a-total'] - if "b-total" in v: - flipped_format["b-total"][k] = v['b-total'] - if "a-duplicated" in v: - flipped_format["a-duplicated"][k] = v['a-duplicated'] - if "b-duplicated" in v: - flipped_format["b-duplicated"][k] = v['b-duplicated'] - if "order-match" in v: - flipped_format["order-match"].append(k) - - # result = { - # "any-elements-shared": any(ainb), - # "all-a-in-b": all(ainb), - # "all-b-in-a": all(bina), - # "order-match": order, - # "flag": flag - # } - - return flipped_format - - - def compare_digests_old(self, digestA, digestB, explain=False): - """ - Given two collection checksums in the database, provide some information - about how they are related. - - :param str digestA: Digest for first sequence collection to compare. - :param str digestB: Digest for second sequence collection to compare. - :param bool explain: Print an explanation of the flag? [Default: False] - """ - typeA = self.database[digestA + henge.ITEM_TYPE] - typeB = self.database[digestB + henge.ITEM_TYPE] - - if typeA != typeB: - _LOGGER.error( - f"Can't compare objects of different types: " f"{typeA} vs {typeB}" - ) - - asdA = self.retrieve(digestA, reclimit=1) - asdB = self.retrieve(digestB, reclimit=1) - return self.compare_asds(asdA, asdB, explain=explain) - - - @staticmethod - def compare_asds(asdA, asdB, explain=False): - """ - Compare Annotated Sequence Digests (ASDs) -- digested sequences and `data - - :param str asdA: ASD for first sequence collection to compare. - :param str asdB: ASD for second sequence collection to compare. - :param bool explain: Print an explanation of the flag? 
[Default: False] - """ - - def _xp(prop, lst): - """Extract property from a list of dicts""" - return list(map(lambda x: x[prop], lst)) - - def _index(x, lst): - """Find an index of a sequence element in a list of dicts""" - try: - return _xp(SEQ_KEY, lst).index(x) - except: - return None - - def _get_common_content(lstA, lstB): - """ - Find the intersection between two list of dicts with sequences - """ - return list( - filter(None.__ne__, [_index(x, lstB) for x in _xp(SEQ_KEY, lstA)]) - ) - - # Not ideal, but we expect these to return lists, but if the item was - # singular only a dict is returned - if not isinstance(asdA, list): - asdA = [asdA] - if not isinstance(asdB, list): - asdB = [asdB] - - ainb = [x in _xp(SEQ_KEY, asdB) for x in _xp(SEQ_KEY, asdA)] - bina = [x in _xp(SEQ_KEY, asdA) for x in _xp(SEQ_KEY, asdB)] - - return_flag = 0 # initialize - if sum(ainb) > 1: - ordA = _get_common_content(asdA, asdB) - if ordA == sorted(ordA): - return_flag += CONTENT_A_ORDER - if sum(bina) > 1: - ordB = _get_common_content(asdB, asdA) - if ordB == sorted(ordB): - return_flag += CONTENT_B_ORDER - - ainb_len = [x in _xp(LEN_KEY, asdB) for x in _xp(LEN_KEY, asdA)] - bina_len = [x in _xp(LEN_KEY, asdA) for x in _xp(LEN_KEY, asdB)] - - ainb_name = [x in _xp(NAME_KEY, asdB) for x in _xp(NAME_KEY, asdA)] - bina_name = [x in _xp(NAME_KEY, asdA) for x in _xp(NAME_KEY, asdB)] - - ainb_topo = [x in _xp(TOPO_KEY, asdB) for x in _xp(TOPO_KEY, asdA)] - bina_topo = [x in _xp(TOPO_KEY, asdA) for x in _xp(TOPO_KEY, asdB)] - - if all(ainb): - return_flag += CONTENT_ALL_A_IN_B - if all(bina): - return_flag += CONTENT_ALL_B_IN_A - - if all(ainb_name): - return_flag += NAMES_ALL_A_IN_B - if all(bina_name): - return_flag += NAMES_ALL_B_IN_A - - if all(ainb_topo): - return_flag += TOPO_ALL_A_IN_B - if all(bina_topo): - return_flag += TOPO_ALL_B_IN_A - - if all(ainb_len): - return_flag += LENGTHS_ALL_A_IN_B - if all(bina_len): - return_flag += LENGTHS_ALL_B_IN_A - - if explain: - explain_flag(return_flag) - return return_flag - - - @staticmethod - def compat(A, B): - """ - New compatibility function for array-based data model. 
- """ - - lenA = len(A) - lenB = len(B) - dupeA = lenA - len(dict.fromkeys(A)) - dupeB = lenB - len(dict.fromkeys(B)) - ainb = [x in B for x in A] - bina = [x in A for x in B] - sum_ainb = sum(ainb) - if sum_ainb > 1: - order = list(compress(B, bina)) == list(compress(A, ainb)) - else: - order = False - - result = { - "a-in-b": sum_ainb, - "b-in-a": sum(bina), - "a-total": lenA, - "b-total": lenB, - "a-duplicated": dupeA, - "b-duplicated": dupeB, - "order-match": order - } - return result - - def compat(A, B): ainb = [x in B for x in A] bina = [x in A for x in B] diff --git a/interactive_tests.py b/interactive_tests.py index 7f5d6a0..e16cdaa 100644 --- a/interactive_tests.py +++ b/interactive_tests.py @@ -38,6 +38,7 @@ # Now a test of inherent attributes import seqcol + scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"]) scci scci.schemas @@ -46,8 +47,8 @@ fa_file = "demo_fasta/demo0.fa" fa_object = seqcol.parse_fasta(fa_file) -array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author":"urkel"} -array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author" :"nathan"} +array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"} +array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"} di = scci.insert(array_set_i, "SeqColArraySet") @@ -55,21 +56,22 @@ di # scc.retrieve(di) scci.retrieve(di) -fasta_path="demo_fasta" +fasta_path = "demo_fasta" fasta1 = "demo2.fa" fasta2 = "demo3.fa" fasta5 = "demo5.fa.gz" fasta6 = "demo6.fa" import os + d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1)) d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2)) d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5)) d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6)) -scci.retrieve(d['digest']) +scci.retrieve(d["digest"]) -scci.retrieve(d5['digest']) +scci.retrieve(d5["digest"]) fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1)) SCAS = seqcol.fasta_to_csc(fa_object) @@ -84,17 +86,25 @@ json.dumps(scci.compare(d5["SCAS"], d6["SCAS"])) -print(json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]), separators=(",", ":"), ensure_ascii=False - ...: , allow_nan=False, sort_keys=True, indent=2)) +print( + json.dumps( + scci.compare(d5["SCAS"], d6["SCAS"]), + separators=(",", ":"), + ensure_ascii=False, + allow_nan=False, + sort_keys=True, + indent=2, + ) +) build_sorted_name_length_pairs(array_set_i) -#reorder +# reorder array_set_reordered = {} -for k,v in array_set.items(): - print(k,v) - array_set_reordered[k] = list(reversed(v)) +for k, v in array_set.items(): + print(k, v) + array_set_reordered[k] = list(reversed(v)) array_set array_set_reordered @@ -106,8 +116,6 @@ import henge - - from henge import md5 names = [] @@ -142,12 +150,10 @@ os.getcwd() - - - ## standalone functions import seqcol + fa_file = "demo_fasta/demo0.fa" fa_object = seqcol.parse_fasta(fa_file) @@ -155,8 +161,8 @@ csc = seqcol.fasta_to_csc(fa_object) csc import json + print(json.dumps(csc, indent=2)) seqcol.seqcol_digest(csc) - diff --git a/seqcol/__init__.py b/seqcol/__init__.py index b29eaad..bb6f733 100644 --- a/seqcol/__init__.py +++ b/seqcol/__init__.py @@ -7,5 +7,5 @@ __classes__ = ["SeqColHenge"] __all__ = ( __classes__ - + ["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_to_digest"], + + 
["build_sorted_name_length_pairs", "compare", "validate_seqcol", "fasta_file_to_digest"], ) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 4db93e2..80b21e8 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -20,6 +20,7 @@ # Retrieved July 2019 # http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: + """Deprecated GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() hex_digest = binascii.hexlify(digest[:offset]) return hex_digest.decode() @@ -44,17 +45,23 @@ def print_csc(csc: dict) -> str: return print(json.dumps(csc, indent=2)) -# Simple true/false validation def validate_seqcol_bool(seqcol_obj: SeqCol, schema=None) -> bool: + """ + Validate a seqcol object against the seqcol schema. Returns True if valid, False if not. + + To enumerate the errors, use validate_seqcol instead. + """ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") schema = load_yaml(schema_path) validator = Draft7Validator(schema) return validator.is_valid(seqcol_obj) -# Get errors if invalid (use this one) -# Get the errors with exception.errors def validate_seqcol(seqcol_obj: SeqCol, schema=None) -> Optional[dict]: + """Validate a seqcol object against the seqcol schema. + Returns True if valid, raises InvalidSeqColError if not, which enumerates the errors. + Retrieve individual errors with exception.errors + """ schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml") schema = load_yaml(schema_path) validator = Draft7Validator(schema) @@ -82,17 +89,9 @@ def format_itemwise(csc: SeqCol) -> list: return {"sequences": list_of_dicts} -def explain_flag(flag): - """Explains a compare flag""" - print(f"Flag: {flag}\nBinary: {bin(flag)}\n") - for e in range(0, 13): - if flag & 2**e: - print(FLAGS[2**e]) - - -def fasta_to_digest(fa_file_path: str) -> str: +def fasta_file_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" - seqcol_obj = fasta_to_seqcol(fa_file_path) + seqcol_obj = fasta_file_to_seqcol(fa_file_path) return seqcol_digest(seqcol_obj) @@ -115,7 +114,7 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) -def fasta_to_seqcol(fa_file_path: str) -> dict: +def fasta_file_to_seqcol(fa_file_path: str) -> dict: """Given a fasta, return a canonical seqcol object""" fa_obj = parse_fasta(fa_file_path) return fasta_obj_to_seqcol(fa_obj) @@ -130,7 +129,7 @@ def fasta_obj_to_seqcol( Given a fasta object, return a CSC (Canonical Sequence Collection object) """ # CSC = SeqColArraySet - # Or maybe should be "Level 1 SC" + # Or equivalently, a "Level 1 SeqCol" CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} seqs = fa_object.keys() @@ -160,7 +159,7 @@ def build_sorted_name_length_pairs(obj: dict, digest_function): sorted_name_length_pairs = [] for i in range(len(obj["names"])): sorted_name_length_pairs.append({"length": obj["lengths"][i], "name": obj["names"][i]}) - nl_digests = [] + nl_digests = [] # name-length digests for i in range(len(sorted_name_length_pairs)): nl_digests.append(digest_function(canonical_str(sorted_name_length_pairs[i]))) @@ -264,3 +263,11 @@ def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: # Step 5: Digest the final canonical representation again. 
seqcol_digest = sha512t24u_digest(seqcol_obj4) return seqcol_digest + + +def explain_flag(flag): + """Explains a compare flag""" + print(f"Flag: {flag}\nBinary: {bin(flag)}\n") + for e in range(0, 13): + if flag & 2**e: + print(FLAGS[2**e]) From 5a55798c3faf896bead6fdacd16d73dd0e2a8bc4 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 08:56:29 -0400 Subject: [PATCH 08/11] order --- seqcol/utilities.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 80b21e8..5133221 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -16,9 +16,6 @@ _LOGGER = logging.getLogger(__name__) -# Refget digests from published seqcol v1.0 protocol -# Retrieved July 2019 -# http://samtools.github.io/hts-specs/refget.html def trunc512_digest(seq, offset=24) -> str: """Deprecated GA4GH digest function""" digest = hashlib.sha512(seq.encode()).digest() @@ -89,12 +86,6 @@ def format_itemwise(csc: SeqCol) -> list: return {"sequences": list_of_dicts} -def fasta_file_to_digest(fa_file_path: str) -> str: - """Given a fasta, return a digest""" - seqcol_obj = fasta_file_to_seqcol(fa_file_path) - return seqcol_digest(seqcol_obj) - - def parse_fasta(fa_file) -> pyfaidx.Fasta: """ Read in a gzipped or not gzipped FASTA file @@ -114,6 +105,12 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) +def fasta_file_to_digest(fa_file_path: str) -> str: + """Given a fasta, return a digest""" + seqcol_obj = fasta_file_to_seqcol(fa_file_path) + return seqcol_digest(seqcol_obj) + + def fasta_file_to_seqcol(fa_file_path: str) -> dict: """Given a fasta, return a canonical seqcol object""" fa_obj = parse_fasta(fa_file_path) From 7974c0147ec0a0a568e1857a415c56f5a7261439 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 09:19:12 -0400 Subject: [PATCH 09/11] hyphen to underscore --- demo_fasta/compare-0vs1.json | 10 +++++----- demo_fasta/compare-1vs1.json | 10 +++++----- demo_fasta/compare-5vs6.json | 10 +++++----- seqcol/utilities.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/demo_fasta/compare-0vs1.json b/demo_fasta/compare-0vs1.json index 719cd2f..d1bc8a2 100644 --- a/demo_fasta/compare-0vs1.json +++ b/demo_fasta/compare-0vs1.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", "sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":2, "names":2, "sorted_name_length_pairs":2, "sequences":0 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":true, "names":true, "sorted_name_length_pairs":true, diff --git a/demo_fasta/compare-1vs1.json b/demo_fasta/compare-1vs1.json index b944bc5..87b71b4 100644 --- a/demo_fasta/compare-1vs1.json +++ b/demo_fasta/compare-1vs1.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", "sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":2, "names":2, "sorted_name_length_pairs":2, "sequences":2 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":true, "names":true, "sorted_name_length_pairs":true, diff --git a/demo_fasta/compare-5vs6.json b/demo_fasta/compare-5vs6.json index 56114f8..2dfed9f 100644 --- a/demo_fasta/compare-5vs6.json +++ b/demo_fasta/compare-5vs6.json @@ -1,22 +1,22 @@ { "arrays":{ - "a-and-b":[ + "a_and_b":[ "lengths", "names", "sequences", 
"sorted_name_length_pairs" ], - "a-only":[], - "b-only":[] + "a_only":[], + "b_only":[] }, "elements":{ - "a-and-b":{ + "a_and_b":{ "lengths":3, "names":3, "sorted_name_length_pairs":3, "sequences":3 }, - "a-and-b-same-order":{ + "a_and_b_same_order":{ "lengths":false, "names":false, "sorted_name_length_pairs":true, diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 5133221..a902722 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -220,7 +220,7 @@ def _compare_elements(A: list, B: list): order = None else: order = A_filtered == B_filtered - return {"a-and-b": overlap, "a-and-b-same-order": order} + return {"a_and_b": overlap, "a_and_b_same_order": order} def seqcol_digest(seqcol_obj: SeqCol, schema: dict = None) -> str: From 2d335c3bae24bbfc1060f56561f377e9c80460d6 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 14 Sep 2023 16:39:05 -0400 Subject: [PATCH 10/11] update to refget2, seq prefix, remove refget extension --- seqcol/seqcol.py | 28 ++++++++++++++++++++++++---- seqcol/utilities.py | 2 +- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index e18ee3f..2b7e3e5 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -1,6 +1,6 @@ import henge import logging -import refget +import yacman from itertools import compress @@ -12,14 +12,34 @@ henge.ITEM_TYPE = "_item_type" -class SeqColHenge(refget.RefGetClient): +class SeqColConf(yacman.YAMLConfigManager): + """ + Simple configuration manager object for SeqColHenge. + """ + def __init__( + self, + entries={}, + filepath=None, + yamldata=None, + writable=False, + wait_max=60, + skip_read_lock=False, + ): + filepath = yacman.select_config( + config_filepath=filepath, + config_env_vars=["SEQCOLAPI_CONFIG"], + config_name="seqcol" + ) + super(SeqColConf, self).__init__(entries, filepath, yamldata, writable) + + +class SeqColHenge(henge.Henge): """ Extension of henge that accommodates collections of sequences. """ def __init__( self, - api_url_base=None, database={}, schemas=None, henges=None, @@ -38,7 +58,6 @@ def __init__( serialized items stored in this henge. 
""" super(SeqColHenge, self).__init__( - api_url_base=api_url_base, database=database, schemas=schemas or INTERNAL_SCHEMAS, henges=henges, @@ -123,6 +142,7 @@ def retrieve(self, druid, reclimit=None, raw=False): return super(SeqColHenge, self).retrieve(druid, reclimit, raw) except henge.NotFoundException as e: _LOGGER.debug(e) + raise e try: return self.refget(druid) except Exception as e: diff --git a/seqcol/utilities.py b/seqcol/utilities.py index a902722..12f32bf 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -139,7 +139,7 @@ def fasta_obj_to_seqcol( seq = str(fa_object[k]) seq_length = len(seq) seq_name = fa_object[k].name - seq_digest = digest_function(seq.upper()) + seq_digest = "SQ."+digest_function(seq.upper()) snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs snlp_digest = digest_function(canonical_str(snlp)) CSC["lengths"].append(seq_length) From 5c7fb31bb2af6818f9279203186e087f97624253 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 15 Sep 2023 17:04:00 -0400 Subject: [PATCH 11/11] add ability to load from chromsizes --- seqcol/seqcol.py | 16 ++++++++++++++-- seqcol/utilities.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/seqcol/seqcol.py b/seqcol/seqcol.py index 2b7e3e5..634952d 100644 --- a/seqcol/seqcol.py +++ b/seqcol/seqcol.py @@ -156,7 +156,6 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): """ @param rgc RefGenConf object @param refgenie_key key of genome to load - @param scc SeqColHenge object to load into """ filepath = rgc.seek(refgenie_key, "fasta") return self.load_fasta_from_filepath(filepath) @@ -164,7 +163,6 @@ def load_fasta_from_refgenie(self, rgc, refgenie_key): def load_fasta_from_filepath(self, filepath): """ @param filepath Path to fasta file - @param sc """ fa_object = parse_fasta(filepath) SCAS = fasta_obj_to_seqcol(fa_object, digest_function=self.checksum_function) @@ -176,6 +174,20 @@ def load_fasta_from_filepath(self, filepath): "digest": digest, } + def load_from_chromsizes(self, chromsizes): + """ + @param chromsizes Path to chromsizes file + """ + SCAS = chrom_sizes_to_seqcol( + chromsizes, digest_function=self.checksum_function + ) + digest = self.insert(SCAS, "SeqColArraySet", reclimit=1) + return { + "chromsizes_file": chromsizes, + "SCAS": SCAS, + "digest": digest, + } + def load_multiple_fastas(self, fasta_dict): """ Wrapper for load_fasta_from_filepath diff --git a/seqcol/utilities.py b/seqcol/utilities.py index 12f32bf..5e94b4c 100644 --- a/seqcol/utilities.py +++ b/seqcol/utilities.py @@ -105,6 +105,34 @@ def parse_fasta(fa_file) -> pyfaidx.Fasta: return pyfaidx.Fasta(f_out.name) +def chrom_sizes_to_digest(chrom_sizes_file_path: str) -> str: + """Given a chrom.sizes file, return a digest""" + seqcol_obj = chrom_sizes_to_seqcol(chrom_sizes_file_path) + return seqcol_digest(seqcol_obj) + + +def chrom_sizes_to_seqcol( + chrom_sizes_file_path: str, + digest_function: Callable[[str], str] = sha512t24u_digest, + ) -> dict: + """Given a chrom.sizes file, return a canonical seqcol object""" + with open(chrom_sizes_file_path, "r") as f: + lines = f.readlines() + CSC = {"lengths": [], "names": [], "sequences": [], "sorted_name_length_pairs": []} + for line in lines: + line = line.strip() + if line == "": + continue + seq_name, seq_length, ga4gh_digest, md5_digest = line.split("\t") + snlp = {"length": seq_length, "name": seq_name} # sorted_name_length_pairs + snlp_digest = digest_function(canonical_str(snlp)) + CSC["lengths"].append(int(seq_length)) + 
CSC["names"].append(seq_name) + CSC["sequences"].append(ga4gh_digest) + CSC["sorted_name_length_pairs"].append(snlp_digest) + return CSC + + def fasta_file_to_digest(fa_file_path: str) -> str: """Given a fasta, return a digest""" seqcol_obj = fasta_file_to_seqcol(fa_file_path)