Skip to content
This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Commit

Permalink
Merge pull request #6 from refgenie/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
nsheff authored Sep 18, 2023
2 parents 70e51ce + 5c7fb31 commit 3981852
Show file tree
Hide file tree
Showing 14 changed files with 493 additions and 610 deletions.
10 changes: 5 additions & 5 deletions demo_fasta/compare-0vs1.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"arrays":{
"a-and-b":[
"a_and_b":[
"lengths",
"names",
"sequences",
"sorted_name_length_pairs"
],
"a-only":[],
"b-only":[]
"a_only":[],
"b_only":[]
},
"elements":{
"a-and-b":{
"a_and_b":{
"lengths":2,
"names":2,
"sorted_name_length_pairs":2,
"sequences":0
},
"a-and-b-same-order":{
"a_and_b_same_order":{
"lengths":true,
"names":true,
"sorted_name_length_pairs":true,
Expand Down
10 changes: 5 additions & 5 deletions demo_fasta/compare-1vs1.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"arrays":{
"a-and-b":[
"a_and_b":[
"lengths",
"names",
"sequences",
"sorted_name_length_pairs"
],
"a-only":[],
"b-only":[]
"a_only":[],
"b_only":[]
},
"elements":{
"a-and-b":{
"a_and_b":{
"lengths":2,
"names":2,
"sorted_name_length_pairs":2,
"sequences":2
},
"a-and-b-same-order":{
"a_and_b_same_order":{
"lengths":true,
"names":true,
"sorted_name_length_pairs":true,
Expand Down
10 changes: 5 additions & 5 deletions demo_fasta/compare-5vs6.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"arrays":{
"a-and-b":[
"a_and_b":[
"lengths",
"names",
"sequences",
"sorted_name_length_pairs"
],
"a-only":[],
"b-only":[]
"a_only":[],
"b_only":[]
},
"elements":{
"a-and-b":{
"a_and_b":{
"lengths":3,
"names":3,
"sorted_name_length_pairs":3,
"sequences":3
},
"a-and-b-same-order":{
"a_and_b_same_order":{
"lengths":false,
"names":false,
"sorted_name_length_pairs":true,
Expand Down
192 changes: 0 additions & 192 deletions deprecated.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,198 +19,6 @@









@staticmethod
def compat_all_old(A, B):
all_keys = list(A.keys()) + list(set(B.keys()) - set(list(A.keys())))
result = {}
flipped_format = {
"a-in-b": {},
"b-in-a": {},
"a-total": {},
"b-total": {},
"a-duplicated": {},
"b-duplicated": {},
"order-match": [],
"only-in-a": [],
"only-in-b": [],
}
for k in all_keys:
_LOGGER.info(k)
if k not in A:
result[k] = {"flag": -1}
flipped_format["only-in-b"].append(k)
elif k not in B:
flipped_format["only-in-a"].append(k)
else:
v = SeqColClient.compat(A[k], B[k])
result[k] = v
if "a-in-b" in v:
flipped_format["a-in-b"][k] = v['a-in-b']
if "b-in-a":
flipped_format["b-in-a"][k] = v['b-in-a']
if "a-total" in v:
flipped_format["a-total"][k] = v['a-total']
if "b-total" in v:
flipped_format["b-total"][k] = v['b-total']
if "a-duplicated" in v:
flipped_format["a-duplicated"][k] = v['a-duplicated']
if "b-duplicated" in v:
flipped_format["b-duplicated"][k] = v['b-duplicated']
if "order-match" in v:
flipped_format["order-match"].append(k)

# result = {
# "any-elements-shared": any(ainb),
# "all-a-in-b": all(ainb),
# "all-b-in-a": all(bina),
# "order-match": order,
# "flag": flag
# }

return flipped_format


def compare_digests_old(self, digestA, digestB, explain=False):
    """
    Look up two stored collections by digest and report how they relate.

    The item type recorded for each digest is read from the database; a
    mismatch is logged as an error, but the comparison still proceeds.

    :param str digestA: Digest for first sequence collection to compare.
    :param str digestB: Digest for second sequence collection to compare.
    :param bool explain: Print an explanation of the flag? [Default: False]
    :return: Whatever ``self.compare_asds`` returns for the two retrieved items.
    """
    digests = (digestA, digestB)

    # The item type is stored under the digest suffixed with henge.ITEM_TYPE.
    first_type, second_type = [
        self.database[digest + henge.ITEM_TYPE] for digest in digests
    ]
    if first_type != second_type:
        _LOGGER.error(
            f"Can't compare objects of different types: {first_type} vs {second_type}"
        )

    first_asd, second_asd = [
        self.retrieve(digest, reclimit=1) for digest in digests
    ]
    return self.compare_asds(first_asd, second_asd, explain=explain)


@staticmethod
def compare_asds(asdA, asdB, explain=False):
    """
    Compare two Annotated Sequence Digests (ASDs) and encode the result as a flag.

    The flag starts at 0 and one constant is added for every relationship
    that holds: shared sequences appearing in the same relative order, and
    full containment (A in B, B in A) of sequences, names, topologies and
    lengths.

    :param list[dict] | dict asdA: ASD for first sequence collection to
        compare; a single dict is treated as a one-element list.
    :param list[dict] | dict asdB: ASD for second sequence collection to
        compare; a single dict is treated as a one-element list.
    :param bool explain: Print an explanation of the flag? [Default: False]
    :return int: Sum of the flag constants for every relationship that holds.
    """

    def _extract(prop, records):
        """Collect the value stored under `prop` in each record."""
        return [record[prop] for record in records]

    def _positions_in(items, reference):
        """Index in `reference` of every item that occurs there, in item order."""
        positions = []
        for item in items:
            try:
                positions.append(reference.index(item))
            except ValueError:
                # Item is absent from `reference`; it is not shared content.
                continue
        return positions

    def _all_in(items, reference):
        """Whether every item occurs in `reference` (True for no items)."""
        return all(item in reference for item in items)

    # Not ideal, but we expect these to be lists; if the item was singular
    # only a dict is returned.
    if not isinstance(asdA, list):
        asdA = [asdA]
    if not isinstance(asdB, list):
        asdB = [asdB]

    # Extract each property once instead of once per membership test.
    seqs_a = _extract(SEQ_KEY, asdA)
    seqs_b = _extract(SEQ_KEY, asdB)

    shared_from_a = sum(seq in seqs_b for seq in seqs_a)
    shared_from_b = sum(seq in seqs_a for seq in seqs_b)

    return_flag = 0  # initialize
    # Order is only meaningful when more than one element is shared.
    if shared_from_a > 1:
        positions = _positions_in(seqs_a, seqs_b)
        if positions == sorted(positions):
            return_flag += CONTENT_A_ORDER
    if shared_from_b > 1:
        positions = _positions_in(seqs_b, seqs_a)
        if positions == sorted(positions):
            return_flag += CONTENT_B_ORDER

    lens_a = _extract(LEN_KEY, asdA)
    lens_b = _extract(LEN_KEY, asdB)
    names_a = _extract(NAME_KEY, asdA)
    names_b = _extract(NAME_KEY, asdB)
    topos_a = _extract(TOPO_KEY, asdA)
    topos_b = _extract(TOPO_KEY, asdB)

    if _all_in(seqs_a, seqs_b):
        return_flag += CONTENT_ALL_A_IN_B
    if _all_in(seqs_b, seqs_a):
        return_flag += CONTENT_ALL_B_IN_A

    if _all_in(names_a, names_b):
        return_flag += NAMES_ALL_A_IN_B
    if _all_in(names_b, names_a):
        return_flag += NAMES_ALL_B_IN_A

    if _all_in(topos_a, topos_b):
        return_flag += TOPO_ALL_A_IN_B
    if _all_in(topos_b, topos_a):
        return_flag += TOPO_ALL_B_IN_A

    if _all_in(lens_a, lens_b):
        return_flag += LENGTHS_ALL_A_IN_B
    if _all_in(lens_b, lens_a):
        return_flag += LENGTHS_ALL_B_IN_A

    if explain:
        explain_flag(return_flag)
    return return_flag


@staticmethod
def compat(A, B):
"""
New compatibility function for array-based data model.
"""

lenA = len(A)
lenB = len(B)
dupeA = lenA - len(dict.fromkeys(A))
dupeB = lenB - len(dict.fromkeys(B))
ainb = [x in B for x in A]
bina = [x in A for x in B]
sum_ainb = sum(ainb)
if sum_ainb > 1:
order = list(compress(B, bina)) == list(compress(A, ainb))
else:
order = False

result = {
"a-in-b": sum_ainb,
"b-in-a": sum(bina),
"a-total": lenA,
"b-total": lenB,
"a-duplicated": dupeA,
"b-duplicated": dupeB,
"order-match": order
}
return result


def compat(A, B):
ainb = [x in B for x in A]
bina = [x in A for x in B]
Expand Down
14 changes: 7 additions & 7 deletions docs_jupyter/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
" trunc512_digest('TCGA'): \"TCGA\"\n",
"}\n",
"\n",
"scdb_local = seqcol.SeqColClient(local_lookup_dict)\n"
"scdb_local = seqcol.SeqColHenge(local_lookup_dict)\n"
]
},
{
Expand Down Expand Up @@ -616,7 +616,7 @@
"metadata": {},
"outputs": [],
"source": [
"rgdb = seqcol.SeqColClient(my_dict)"
"rgdb = seqcol.SeqColHenge(my_dict)"
]
},
{
Expand Down Expand Up @@ -947,7 +947,7 @@
"outputs": [],
"source": [
"import henge \n",
"sc = seqcol.SeqColClient(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])"
"sc = seqcol.SeqColHenge(database=mydict, schemas=[\"/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml\"])"
]
},
{
Expand Down Expand Up @@ -1039,16 +1039,16 @@
"68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 ACGT\n",
"68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_item_type sequence\n",
"68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36_digest_version md5\n",
"25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e",
"4\u001e",
"25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5 chr1\u001e\n",
"4\u001e\n",
"68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36\n",
"25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_item_type ASD\n",
"25ccf153f9ac4876a631270b6bb23328f8e5fed08087a9f5_digest_version md5\n",
"3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce TCGA\n",
"3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_item_type sequence\n",
"3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce_digest_version md5\n",
"7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e",
"4\u001e",
"7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b chr2\u001e\n",
"4\u001e\n",
"3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce\n",
"7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_item_type ASD\n",
"7ea134e2ee93733d2a0aa66150d9b4540ad7fafe5782715b_digest_version md5\n",
Expand Down
8 changes: 4 additions & 4 deletions docs_jupyter/seqcol.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
}
],
"source": [
"sqdb = seqcol.SeqColClient({})"
"sqdb = seqcol.SeqColHenge({})"
]
},
{
Expand Down Expand Up @@ -129,7 +129,7 @@
}
],
"source": [
"sqdb2 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])"
"sqdb2 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/RawSeqCol.yaml\"])"
]
},
{
Expand Down Expand Up @@ -293,7 +293,7 @@
}
],
"source": [
"sqdb3 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])"
"sqdb3 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/TASeqCol.yaml\"])"
]
},
{
Expand Down Expand Up @@ -474,7 +474,7 @@
}
],
"source": [
"sqdb4 = seqcol.SeqColClient({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])"
"sqdb4 = seqcol.SeqColHenge({}, schemas=[\"../seqcol/schemas/SeqColArraySet.yaml\"])"
]
},
{
Expand Down
Loading

0 comments on commit 3981852

Please sign in to comment.