Skip to content
This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Commit

Permalink
lint, remove old stuff, renames
Browse files Browse the repository at this point in the history
  • Loading branch information
nsheff committed Sep 13, 2023
1 parent 64c09a0 commit 0ed6ec7
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 226 deletions.
192 changes: 0 additions & 192 deletions deprecated.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,198 +19,6 @@









@staticmethod
def compat_all_old(A, B):
    """
    Compare every attribute of two collections and group the results by metric.

    Attributes found in both ``A`` and ``B`` are compared with
    ``SeqColClient.compat``. The per-attribute results are then "flipped":
    the outer keys of the returned dict are metric names and the inner keys
    (or list items) are attribute names.

    :param dict A: First collection, mapping attribute name to its values.
    :param dict B: Second collection, mapping attribute name to its values.
    :return dict: The count metrics ("a-in-b", "b-in-a", "a-total",
        "b-total", "a-duplicated", "b-duplicated") each map attribute name
        to the value reported by ``SeqColClient.compat``; "order-match"
        lists the attributes whose comparison reported a true order match;
        "only-in-a" / "only-in-b" list attributes missing from the other
        collection.
    """
    count_metrics = (
        "a-in-b",
        "b-in-a",
        "a-total",
        "b-total",
        "a-duplicated",
        "b-duplicated",
    )
    flipped_format = {metric: {} for metric in count_metrics}
    flipped_format.update({"order-match": [], "only-in-a": [], "only-in-b": []})

    # Keys of A first, then the keys unique to B in B's own order, so the
    # output ordering is deterministic (a set difference is not).
    all_keys = list(A) + [k for k in B if k not in A]

    for k in all_keys:
        _LOGGER.info(k)
        if k not in A:
            flipped_format["only-in-b"].append(k)
        elif k not in B:
            flipped_format["only-in-a"].append(k)
        else:
            v = SeqColClient.compat(A[k], B[k])
            for metric in count_metrics:
                # Every metric is guarded the same way; the original tested
                # the bare string "b-in-a", which is always true.
                if metric in v:
                    flipped_format[metric][k] = v[metric]
            # Record the attribute only when the order actually matches,
            # not merely when the key is present in the result.
            if v.get("order-match"):
                flipped_format["order-match"].append(k)

    return flipped_format


def compare_digests_old(self, digestA, digestB, explain=False):
    """
    Compare two stored sequence collections identified by their digests.

    The item type of each digest is read from ``self.database`` under the
    digest suffixed with ``henge.ITEM_TYPE``. A type mismatch is only
    logged as an error; the comparison still proceeds. Both objects are
    then retrieved with ``reclimit=1`` and handed to ``compare_asds``.

    :param str digestA: Digest for first sequence collection to compare.
    :param str digestB: Digest for second sequence collection to compare.
    :param bool explain: Forwarded to ``compare_asds``. [Default: False]
    :return: Whatever ``compare_asds`` returns for the two retrieved objects.
    """
    typeA, typeB = (
        self.database[digest + henge.ITEM_TYPE] for digest in (digestA, digestB)
    )

    if not typeA == typeB:
        message = f"Can't compare objects of different types: {typeA} vs {typeB}"
        _LOGGER.error(message)

    retrieved = [self.retrieve(digest, reclimit=1) for digest in (digestA, digestB)]
    return self.compare_asds(*retrieved, explain=explain)


@staticmethod
def compare_asds(asdA, asdB, explain=False):
    """
    Compare two Annotated Sequence Digests (ASDs) and summarize them as a flag.

    The result is the sum of the module-level ``CONTENT_*``, ``NAMES_*``,
    ``TOPO_*`` and ``LENGTHS_*`` constants whose condition holds for the
    two inputs.

    :param list[dict] | dict asdA: ASD for first sequence collection to
        compare; a single dict is treated as a one-element list.
    :param list[dict] | dict asdB: ASD for second sequence collection to
        compare; a single dict is treated as a one-element list.
    :param bool explain: If True, pass the flag to ``explain_flag`` before
        returning it. [Default: False]
    :return: Sum of the flag constants for every relationship that holds.
    """

    def _xp(prop, lst):
        """Extract property from a list of dicts"""
        return [x[prop] for x in lst]

    def _get_common_content(values, others):
        """
        Positions in `others` of the `values` elements that also occur
        there, listed in the order of `values`
        """
        # Explicit membership test instead of `filter(None.__ne__, ...)`:
        # that idiom depends on NotImplemented being truthy, which raises
        # TypeError on Python 3.14+.
        return [others.index(x) for x in values if x in others]

    # Not ideal, but we expect these to return lists, but if the item was
    # singular only a dict is returned
    if not isinstance(asdA, list):
        asdA = [asdA]
    if not isinstance(asdB, list):
        asdB = [asdB]

    # Extract each property once per collection instead of once per element.
    seqA, seqB = _xp(SEQ_KEY, asdA), _xp(SEQ_KEY, asdB)
    lenA, lenB = _xp(LEN_KEY, asdA), _xp(LEN_KEY, asdB)
    nameA, nameB = _xp(NAME_KEY, asdA), _xp(NAME_KEY, asdB)
    topoA, topoB = _xp(TOPO_KEY, asdA), _xp(TOPO_KEY, asdB)

    ainb = [x in seqB for x in seqA]
    bina = [x in seqA for x in seqB]

    return_flag = 0  # initialize

    # Order flags are only considered when more than one sequence is shared.
    if sum(ainb) > 1:
        ordA = _get_common_content(seqA, seqB)
        if ordA == sorted(ordA):
            return_flag += CONTENT_A_ORDER
    if sum(bina) > 1:
        ordB = _get_common_content(seqB, seqA)
        if ordB == sorted(ordB):
            return_flag += CONTENT_B_ORDER

    if all(ainb):
        return_flag += CONTENT_ALL_A_IN_B
    if all(bina):
        return_flag += CONTENT_ALL_B_IN_A

    # (values of A, values of B, flag if all A in B, flag if all B in A)
    containment_checks = (
        (nameA, nameB, NAMES_ALL_A_IN_B, NAMES_ALL_B_IN_A),
        (topoA, topoB, TOPO_ALL_A_IN_B, TOPO_ALL_B_IN_A),
        (lenA, lenB, LENGTHS_ALL_A_IN_B, LENGTHS_ALL_B_IN_A),
    )
    for valuesA, valuesB, flag_a_in_b, flag_b_in_a in containment_checks:
        if all(x in valuesB for x in valuesA):
            return_flag += flag_a_in_b
        if all(x in valuesA for x in valuesB):
            return_flag += flag_b_in_a

    if explain:
        explain_flag(return_flag)
    return return_flag


@staticmethod
def compat(A, B):
"""
New compatibility function for array-based data model.
"""

lenA = len(A)
lenB = len(B)
dupeA = lenA - len(dict.fromkeys(A))
dupeB = lenB - len(dict.fromkeys(B))
ainb = [x in B for x in A]
bina = [x in A for x in B]
sum_ainb = sum(ainb)
if sum_ainb > 1:
order = list(compress(B, bina)) == list(compress(A, ainb))
else:
order = False

result = {
"a-in-b": sum_ainb,
"b-in-a": sum(bina),
"a-total": lenA,
"b-total": lenB,
"a-duplicated": dupeA,
"b-duplicated": dupeB,
"order-match": order
}
return result


def compat(A, B):
ainb = [x in B for x in A]
bina = [x in A for x in B]
Expand Down
40 changes: 23 additions & 17 deletions interactive_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

# Now a test of inherent attributes
import seqcol

scci = seqcol.SeqColHenge(database={}, schemas=["seqcol/schemas/SeqColArraySetInherent.yaml"])
scci
scci.schemas
Expand All @@ -46,30 +47,31 @@
fa_file = "demo_fasta/demo0.fa"
fa_object = seqcol.parse_fasta(fa_file)

array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author":"urkel"}
array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author" :"nathan"}
array_set_i = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "urkel"}
array_set_i2 = {"names": names, "lengths": lengthsi, "sequences": sequences, "author": "nathan"}


di = scci.insert(array_set_i, "SeqColArraySet")
di = scci.insert(array_set_i2, "SeqColArraySet")
di
# scc.retrieve(di)
scci.retrieve(di)
fasta_path="demo_fasta"
fasta_path = "demo_fasta"
fasta1 = "demo2.fa"
fasta2 = "demo3.fa"
fasta5 = "demo5.fa.gz"
fasta6 = "demo6.fa"

import os

d = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta1))
d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
d2 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta2))
d5 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta5))
d6 = scci.load_fasta_from_filepath(os.path.join(fasta_path, fasta6))
scci.retrieve(d['digest'])
scci.retrieve(d["digest"])

scci.retrieve(d5['digest'])
scci.retrieve(d5["digest"])

fa_object = seqcol.parse_fasta(os.path.join(fasta_path, fasta1))
SCAS = seqcol.fasta_to_csc(fa_object)
Expand All @@ -84,17 +86,25 @@


json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]))
print(json.dumps(scci.compare(d5["SCAS"], d6["SCAS"]), separators=(",", ":"), ensure_ascii=False
...: , allow_nan=False, sort_keys=True, indent=2))
print(
json.dumps(
scci.compare(d5["SCAS"], d6["SCAS"]),
separators=(",", ":"),
ensure_ascii=False,
allow_nan=False,
sort_keys=True,
indent=2,
)
)

build_sorted_name_length_pairs(array_set_i)

#reorder
# reorder

array_set_reordered = {}
for k,v in array_set.items():
print(k,v)
array_set_reordered[k] = list(reversed(v))
for k, v in array_set.items():
print(k, v)
array_set_reordered[k] = list(reversed(v))

array_set
array_set_reordered
Expand All @@ -106,8 +116,6 @@
import henge




from henge import md5

names = []
Expand Down Expand Up @@ -142,21 +150,19 @@
os.getcwd()





## standalone functions
# Interactive walkthrough: meant to be run statement by statement. Bare
# expressions such as `csc` only show a value when a REPL echoes them.

import seqcol

# Relative path: the working directory must contain demo_fasta/.
fa_file = "demo_fasta/demo0.fa"
fa_object = seqcol.parse_fasta(fa_file)

# get a canonical seqcol object
csc = seqcol.fasta_to_csc(fa_object)
csc
import json

# Pretty-print the object as JSON with a 2-space indent.
print(json.dumps(csc, indent=2))


# presumably computes the digest of the canonical object -- confirm against
# seqcol.seqcol_digest
seqcol.seqcol_digest(csc)

2 changes: 1 addition & 1 deletion seqcol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
__classes__ = ["SeqColHenge"]
# `__all__` must be a flat sequence of strings. Wrapping the expression in
# parentheses with a trailing comma would produce a one-element tuple
# containing the list, which makes `from seqcol import *` fail.
__all__ = __classes__ + [
    "build_sorted_name_length_pairs",
    "compare",
    "validate_seqcol",
    "fasta_file_to_digest",
]
Loading

0 comments on commit 0ed6ec7

Please sign in to comment.