diff --git a/bin/ISCNsegmenter.py b/bin/ISCNsegmenter.py
index c5c40f3..29d887a 100755
--- a/bin/ISCNsegmenter.py
+++ b/bin/ISCNsegmenter.py
@@ -38,7 +38,6 @@ def iscn_segmenter():
group_parameter = BYC_PARS.get("groupBy", "histological_diagnosis_id")
input_file = BYC_PARS.get("inputfile")
output_file = BYC_PARS.get("outputfile")
- dt_m = byc.get("datatable_mappings", {})
technique = "cCGH"
iscn_field = "iscn_ccgh"
@@ -85,7 +84,7 @@ def iscn_segmenter():
"callset_id": s.get("callset_id", "exp-"+n),
"individual_id": s.get("individual_id", "ind-"+n),
}
- update_bs = import_datatable_dict_line(dt_m, update_bs, fieldnames, s, "biosample")
+ update_bs = import_datatable_dict_line(update_bs, fieldnames, s, "biosample")
h_line = pgxseg_biosample_meta_line(byc, update_bs, group_parameter)
pgxseg.write( "{}\n".format(h_line) )
diff --git a/bin/analysesStatusmapsRefresher.py b/bin/analysesStatusmapsRefresher.py
old mode 100644
new mode 100755
diff --git a/bin/collationsCreator.py b/bin/collationsCreator.py
index 90a62f4..0b8038b 100755
--- a/bin/collationsCreator.py
+++ b/bin/collationsCreator.py
@@ -69,7 +69,6 @@ def collations_creator():
data_coll = data_db[ collection ]
onto_ids = _get_ids_for_prefix( data_coll, coll_defs )
- is_series = coll_defs.get("is_series", False)
onto_keys = list( set( onto_ids ) & hier.keys() )
# get the set of all parents for sample codes
@@ -78,10 +77,6 @@ def collations_creator():
if o_id in hier.keys():
onto_keys.update( hier[ o_id ][ "parent_terms" ] )
- if is_series is True:
- child_ids = _get_child_ids_for_prefix(data_coll, coll_defs)
- onto_keys.update(child_ids)
-
sel_hiers = [ ]
no = len(hier.keys())
matched = 0
@@ -90,13 +85,13 @@ def collations_creator():
for count, code in enumerate(hier.keys(), start=1):
if not BYC["TEST_MODE"]:
bar.next()
- children = list( set( hier[ code ][ "child_terms" ] ) & onto_keys )
- hier[ code ].update( { "child_terms": children } )
+ children = list(set(hier[ code ]["child_terms"]) & onto_keys)
+ hier[ code ].update( {"child_terms": children})
if len( children ) < 1:
if BYC["TEST_MODE"]:
print(code+" w/o children")
continue
- code_no = data_coll.count_documents( { db_key: code } )
+ code_no = data_coll.count_documents({db_key: code})
if code_no < 1:
code_no = 0
if len( children ) < 2:
@@ -106,7 +101,7 @@ def collations_creator():
if child_no > 0:
# sub_id = re.sub(pre, coll_type, code)
sub_id = code
- update_obj = hier[ code ].copy()
+ update_obj = hier[code].copy()
update_obj.update({
"id": sub_id,
"ft_type": coll_defs.get("ft_type", "ontologyTerm"),
@@ -115,6 +110,7 @@ def collations_creator():
"reference": "https://progenetix.org/services/ids/"+code,
"namespace_prefix": coll_defs.get("namespace_prefix", ""),
"scope": coll_defs.get("scope", ""),
+ "entity": coll_defs.get("entity", ""),
"code_matches": code_no,
"code": code,
"count": child_no,
@@ -131,8 +127,7 @@ def collations_creator():
if not BYC["TEST_MODE"]:
sel_hiers.append( update_obj )
else:
- print("{}:\t{} ({} deep) samples - {} / {} {}".format(sub_id, code_no, child_no, count, no, pre))
-
+ print(f'{sub_id}:\t{code_no} ({child_no} deep) samples - {count} / {no} {pre}')
# UPDATE
if not BYC["TEST_MODE"]:
bar.finish()
@@ -141,12 +136,12 @@ def collations_creator():
coll_coll.delete_many( { "collation_type": coll_type } )
coll_coll.insert_many( sel_hiers )
- print("===> Found {} of {} {} codes & added them to {}.collations <===".format(matched, no, coll_type, ds_id))
-
+ print(f'===> Found {matched} of {no} {coll_type} codes & added them to {ds_id}.collations <===')
+
+
################################################################################
def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
-
coll_defs = byc["filter_definitions"][coll_type]
hier = hierarchy_from_file(ds_id, coll_type, pre_h_f, byc)
no = len(hier.keys())
@@ -174,21 +169,18 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
"collation_type": coll_type,
"namespace_prefix": coll_defs.get("namespace_prefix", ""),
"scope": coll_defs.get("scope", ""),
+ "entity": coll_defs.get("entity", ""),
"db_key": coll_defs.get("db_key", ""),
"hierarchy_paths": [ { "order": no, "depth": 1, "path": [ "NCIT:C3262", "NCIT:C000000" ] } ]
}
} )
for o in onto_ids:
-
if o in hier.keys():
continue
-
added_no += 1
no += 1
-
l = _get_label_for_code(data_coll, coll_defs, o)
-
if coll_type == "NCIT":
hier.update( {
o: { "id": o, "label": l, "hierarchy_paths":
@@ -200,15 +192,13 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
o_p = { "order": int(no), "depth": 0, "path": [ o ] }
hier.update( { o: { "id": o, "label": l, "hierarchy_paths": [ o_p ] } } )
print("Added:\t{}\t{}".format(o, l))
-
if added_no > 0:
print("===> Added {} {} codes from {}.{} <===".format(added_no, coll_type, ds_id, coll_defs["scope"] ) )
- ############################################################################
+ #--------------------------------------------------------------------------#
no = len(hier)
bar = Bar(" parsing parents ", max = no, suffix='%(percent)d%%'+" of "+str(no) )
-
for c, h in hier.items():
bar.next()
all_parents = { }
@@ -219,29 +209,16 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
bar.finish()
- ############################################################################
+ #--------------------------------------------------------------------------#
bar = Bar(" parsing children ", max = no, suffix='%(percent)d%%'+" of "+str(no) )
-
for c, h in hier.items():
bar.next()
all_children = set()
for c_2, h_2 in hier.items():
if c in h_2["parent_terms"]:
all_children.add( c_2 )
- hier[ c ].update( { "child_terms": list( all_children ) } )
-
- if "series_pattern" in coll_defs:
- ch_re = re.compile( coll_defs["series_pattern"] )
- for c, h in hier.items():
- all_children = set( )
- for p in h["child_terms"]:
- gsms = data_coll.distinct( db_key, { db_key: p } )
- gsms = list(filter(lambda d: ch_re.match(d), gsms))
- all_children.update(gsms)
- all_children.add(p)
- h.update({ "child_terms": list(all_children) })
-
+ hier[c].update({"child_terms": list(all_children)})
bar.finish()
return hier
@@ -249,11 +226,10 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
################################################################################
def _make_dummy_publication_hierarchy(byc):
-
coll_type = "pubmed"
coll_defs = byc["filter_definitions"][coll_type]
data_db = "progenetix"
- data_coll = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))[ data_db ][ "publications" ]
+ data_coll = MongoClient(host=DB_MONGOHOST)[data_db]["publications"]
query = { "id": { "$regex": r'^PMID\:\d+?$' } }
no = data_coll.count_documents( query )
bar = Bar("Publications...", max = no, suffix='%(percent)d%%'+" of "+str(no) )
@@ -272,6 +248,7 @@ def _make_dummy_publication_hierarchy(byc):
"collation_type": coll_type,
"namespace_prefix": coll_defs.get("namespace_prefix", ""),
"scope": coll_defs.get("scope", ""),
+ "entity": coll_defs.get("entity", ""),
"db_key": coll_defs.get("db_key", ""),
"updated": datetime.datetime.now().isoformat(),
"hierarchy_paths": [ { "order": int(order), "depth": 0, "path": [ code ] } ],
@@ -279,48 +256,31 @@ def _make_dummy_publication_hierarchy(byc):
"child_terms": [ code ]
}
} )
-
bar.finish()
-
return hier
################################################################################
def _get_dummy_hierarchy(ds_id, coll_type, coll_defs, byc):
-
data_client = MongoClient(host=DB_MONGOHOST)
data_db = data_client[ ds_id ]
data_coll = data_db[ coll_defs["scope"] ]
data_pat = coll_defs["pattern"]
db_key = coll_defs["db_key"]
- is_series = coll_defs.get("is_series", False)
-
- if is_series is True:
- s_pat = coll_defs["series_pattern"]
- s_re = re.compile( s_pat )
-
- pre_ids = _get_ids_for_prefix( data_coll, coll_defs )
-
+ pre_ids = _get_ids_for_prefix(data_coll, coll_defs)
hier = { }
no = len(pre_ids)
bar = Bar(coll_type, max = no, suffix='%(percent)d%%'+" of "+str(no) )
for order, c in enumerate(sorted(pre_ids), start=1):
-
bar.next()
hier.update( { c: _get_hierarchy_item( data_coll, coll_defs, coll_type, c, order, 0, [ c ] ) } )
-
- if is_series is True:
-
- ser_ids = data_coll.distinct( db_key, { db_key: c } )
- ser_ids = list(filter(lambda d: s_re.match(d), ser_ids))
- hier[c].update( { "child_terms": list( set(ser_ids) | set(hier[c]["child_terms"]) ) } )
bar.finish()
-
return hier
+
################################################################################
def _get_hierarchy_item(data_coll, coll_defs, coll_type, code, order, depth, path):
@@ -332,6 +292,7 @@ def _get_hierarchy_item(data_coll, coll_defs, coll_type, code, order, depth, pat
"collation_type": coll_type,
"namespace_prefix": coll_defs.get("namespace_prefix", ""),
"scope": coll_defs.get("scope", ""),
+ "entity": coll_defs.get("entity", ""),
"db_key": coll_defs.get("db_key", ""),
"updated": datetime.datetime.now().isoformat(),
"hierarchy_paths": [ { "order": int(order), "depth": int(depth), "path": list(path) } ],
diff --git a/bin/collationsPlotter.py b/bin/collationsPlotter.py
deleted file mode 100755
index 61e3d27..0000000
--- a/bin/collationsPlotter.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-import argparse, datetime, re, sys
-from pymongo import MongoClient
-from humps import camelize
-from os import path, environ, pardir
-
-from bycon import *
-
-services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" )
-sys.path.append( services_lib_path )
-from bycon_bundler import *
-from bycon_plot import *
-from interval_utils import generate_genome_bins
-
-"""
-./bin/collationsPlotter.py -d "progenetix,cellz" --filters "pgx:icdom-85003,pgx:icdom-81703,pgx:icdom-87003,pgx:icdom-87203,pgx:icdom-94003,pgx:icdom-95003,pgx:icdom-81403" -o ./exports/multicollationtest.svg -p "plot_area_height=50&plot_axis_y_max=80&plot_histogram_frequency_labels=30,60"
-
-"""
-
-################################################################################
-################################################################################
-################################################################################
-
-def main():
- collations_plotter()
-
-################################################################################
-
-def collations_plotter():
- initialize_bycon_service(byc, "collations_plotter")
- run_beacon_init_stack(byc)
- generate_genome_bins(byc)
-
- BYC_PARS.update({"plot_type": "histoplot"})
- out_putfile = BYC_PARS.get("outputfile")
-
- if len(byc["dataset_ids"]) < 1:
- print("Please indicate one or more dataset ids using `-d`")
- exit()
- if len(BYC_PARS.get("filters", [])) < 1:
- print("Please indicate one or more collation ids using `--filters`")
- if not out_putfile:
- print("No output file specified (-o, --outputfile) => quitting ...")
- exit()
- svg_file = out_putfile
- if not ".svg" in svg_file.lower():
- print("The output file should be an `.svg` => quitting ...")
- exit()
-
- pdb = ByconBundler(byc).collationsPlotbundles()
- ByconPlot(byc, pdb).svg2file(svg_file)
-
-################################################################################
-################################################################################
-################################################################################
-
-
-if __name__ == '__main__':
- main()
diff --git a/bin/frequencymapsCreator.py b/bin/frequencymapsCreator.py
index b1390b8..755b375 100755
--- a/bin/frequencymapsCreator.py
+++ b/bin/frequencymapsCreator.py
@@ -13,6 +13,7 @@
services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" )
sys.path.append( services_lib_path )
+from bycon_bundler import ByconBundler
from interval_utils import generate_genome_bins, interval_cnv_arrays, interval_counts_from_callsets
from collation_utils import set_collation_types
@@ -53,44 +54,35 @@ def frequencymaps_creator():
coll_ids = _filter_coll_ids(coll_coll, byc)
coll_no = len(coll_ids)
- bar = Bar(f'{coll_no} {ds_id} fMaps', max = coll_no, suffix='%(percent)d%%'+f' of {coll_no}' )
+ if not BYC["TEST_MODE"]:
+ bar = Bar(f'{coll_no} {ds_id} fMaps', max = coll_no, suffix='%(percent)d%%'+f' of {coll_no}' )
coll_i = 0
for c_id in coll_ids:
-
- bar.next()
-
+ if not BYC["TEST_MODE"]:
+ bar.next()
coll = coll_coll.find_one({"id": c_id})
c_o_id = coll.get("_id")
if not coll:
- print("¡¡¡ some error - collation {} not found !!!".format(c_id))
+ print(f"¡¡¡ some error - collation {c_id} not found !!!")
continue
-
- pre, code = re.split("[:-]", c_id, 1)
- coll_type = coll.get("collation_type", "undefined")
- db_key = coll["db_key"]
-
- exclude_normals = True
- for normal in ("EFO:0009654", "oneKgenomes"):
- if normal in c_id:
- print(f'\n---> keeping normals for {c_id}')
- exclude_normals = False
-
coll_i += 1
- query = { db_key: { '$in': coll["child_terms"] } }
- bios_no, cs_cursor = _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, c_id, coll["scope"], query, exclude_normals)
- cs_no = len(list(cs_cursor))
+ byc.update({"filters":[{"id":c_id}, {"id": "EDAM:operation_3961"}]})
+ RSS = ByconResultSets(byc).datasetsResults()
+ pdb = ByconBundler(byc).resultsets_frequencies_bundles(RSS)
+ if_bundles = pdb.get("interval_frequencies_bundles")
+ if len(if_bundles) < 1:
+ prdbug(f'No interval_frequencies for {c_id}')
+ continue
- if cs_no < 1:
- coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": 0}})
+ cnv_cs_count = if_bundles[0].get("sample_count", 0)
+ coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": cnv_cs_count}})
+ if cnv_cs_count < 1:
continue
- i_t = coll_i % 100
start_time = time.time()
- # if i_t == 0 or cs_no > 1000:
- # print("{}: {} bios, {} cs\t{}/{}\t{:.1f}%".format(c_id, bios_no, cs_no, coll_i, coll_no, 100*coll_i/coll_no))
update_obj = {
"id": c_id,
@@ -101,22 +93,15 @@ def frequencymaps_creator():
"collation_type": coll["collation_type"],
"child_terms": coll["child_terms"],
"updated": datetime.datetime.now().isoformat(),
- "counts": {"biosamples": bios_no, "analyses": cs_no },
+ "counts": {"analyses": cnv_cs_count },
"frequencymap": {
"interval_count": byc["genomic_interval_count"],
"binning": BYC_PARS.get("genome_binning", ""),
- "biosample_count": bios_no
+ "intervals": if_bundles[0].get("interval_frequencies", []),
+ "analysis_count": cnv_cs_count
}
}
- intervals, cnv_cs_count = interval_counts_from_callsets(cs_cursor, byc)
- update_obj["frequencymap"].update({
- "intervals": intervals,
- "analysis_count": cnv_cs_count
- })
-
- coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": cnv_cs_count}})
-
proc_time = time.time() - start_time
# if cs_no > 1000:
# print(" => Processed in {:.2f}s: {:.4f}s per callset".format(proc_time, (proc_time/cs_no)))
@@ -125,29 +110,32 @@ def frequencymaps_creator():
fm_coll.delete_many( { "id": c_id } )
fm_coll.insert_one( update_obj )
- if coll["code_matches"] > 0:
- if int(cs_no) > int(coll["code_matches"]):
- query_cm = { db_key: c_id }
- bios_no_cm, cs_cursor_cm = _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, c_id, coll["scope"], query_cm)
- cs_no_cm = len(list(cs_cursor_cm))
- if cs_no_cm > 0:
- cm_obj = { "frequencymap_codematches": {
- "interval_count": len(byc["genomic_intervals"]),
- "binning": BYC_PARS.get("genome_binning", ""),
- "biosample_count": bios_no_cm
- }
+ if cnv_cs_count > coll.get("code_matches", cnv_cs_count):
+ byc.update({"filters":[{"id":c_id, "includeDescendantTerms": False}, {"id": "EDAM:operation_3961"}]})
+ CMRSS = ByconResultSets(byc).datasetsResults()
+ cmpdb = ByconBundler(byc).resultsets_frequencies_bundles(CMRSS)
+
+ cmif_bundles = cmpdb.get("interval_frequencies_bundles")
+ if len(cmif_bundles) < 1:
+ # print(f'No code match interval_frequencies for {c_id}')
+ continue
+
+ cnv_cmcs_count = cmif_bundles[0].get("sample_count", 0)
+ if cnv_cmcs_count > 0:
+ cm_obj = {"frequencymap_codematches":
+ {
+ "interval_count": len(byc["genomic_intervals"]),
+ "binning": BYC_PARS.get("genome_binning", ""),
+ "intervals": cmif_bundles[0].get("interval_frequencies", []),
+ "analysis_count": cnv_cmcs_count
}
+ }
+ prdbug(f'\n{c_id}: {cnv_cmcs_count} exact of {cnv_cs_count} total code matches ({coll["code_matches"]} indicated)')
+ if not BYC["TEST_MODE"]:
+ fm_coll.update_one( { "id": c_id }, { '$set': cm_obj }, upsert=False )
- intervals, cnv_cs_count = interval_counts_from_callsets(cs_cursor_cm, byc)
- cm_obj["frequencymap_codematches"].update({
- "intervals": intervals,
- "analysis_count": cs_no_cm
- })
- prdbug(f'\n{c_id}: {cs_no_cm} exact of {cs_no} total code matches ({coll["code_matches"]} indicated)')
- if not BYC["TEST_MODE"]:
- fm_coll.update_one( { "id": c_id }, { '$set': cm_obj }, upsert=False )
-
- bar.finish()
+ if not BYC["TEST_MODE"]:
+ bar.finish()
################################################################################
@@ -177,31 +165,6 @@ def _filter_coll_ids(coll_coll, byc):
return coll_ids
-################################################################################
-
-def _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, coll_id, scope, query, exclude_normals=True):
- if scope == "individuals":
- ind_ids = ind_coll.distinct( "id" , query )
- bios_ids = bios_coll.distinct( "id" , {"individual_id":{"$in": ind_ids } } )
- elif scope == "analyses":
- bios_ids = cs_coll.distinct( "biosample_id" , query )
- else:
- bios_ids = bios_coll.distinct( "id" , query )
-
- pre_b = len(bios_ids)
-
- # for most entities samples labeled as "normal" will be excluded for frequency calculations
- if exclude_normals:
- bios_ids = bios_coll.distinct( "id" , { "id": { "$in": bios_ids } , "biosample_status.id": {"$ne": "EFO:0009654" }} )
- bios_no = len(bios_ids)
-
- if pre_b > bios_no:
- prdbug(f'\nWARNING: {pre_b} samples for {coll_id}, while {bios_no} after excluding normals by EFO:0009654')
-
- cs_query = { "biosample_id": { "$in": bios_ids } , "variant_class": { "$ne": "SNV" } }
- cs_cursor = cs_coll.find(cs_query)
-
- return bios_no, cs_cursor
################################################################################
################################################################################
diff --git a/bin/lib/mongodb_utils.py b/bin/lib/mongodb_utils.py
index 8c2c18d..03fa9fd 100644
--- a/bin/lib/mongodb_utils.py
+++ b/bin/lib/mongodb_utils.py
@@ -2,12 +2,12 @@
from os import environ
from pymongo import MongoClient, GEOSPHERE
-from bycon import DB_MONGOHOST
+from bycon import BYC, DB_MONGOHOST
################################################################################
def mongodb_update_indexes(ds_id, byc):
- dt_m = byc["datatable_mappings"]
+ dt_m = BYC["datatable_mappings"]
b_rt_s = byc["service_config"]["indexed_response_types"]
mongo_client = MongoClient(host=DB_MONGOHOST)
data_db = mongo_client[ds_id]
diff --git a/bin/local/beacon_defaults.yaml b/bin/local/beacon_defaults.yaml
index e5c9482..55ef4d1 100644
--- a/bin/local/beacon_defaults.yaml
+++ b/bin/local/beacon_defaults.yaml
@@ -13,11 +13,12 @@ defaults:
# the aliases here are for non-standard speling or additional entry types
service_path_aliases:
- analyses: analyses
- filteringTerms: filteringTerms
- phenopackets: phenopackets
- variants: genomicVariations
- genomicVariations: genomicVariations
+ filteringTerms: filtering_terms # just for spelling variations
+ entryTypes: entry_types # just for spelling variations
+ variants: genomicVariations # just for spelling variations
+ genomicVariations: genomicVariations # just for spelling variations
+ phenopackets: phenopackets # Beacon+ specific example
+
################################################################################
# here you can map additional path values to the corresponding (additional)
@@ -25,16 +26,16 @@ service_path_aliases:
################################################################################
path_entry_type_mappings:
- phenopackets: phenopacket
+ phenopackets: phenopacket # Beacon+ specific example
################################################################################
# her you can add additional path ids to the data query aggregation pipeline
# that usually mapps/reduces queries against biosamples, genomicVariations,
-#individuals ...
+# individuals ...
################################################################################
data_pipeline_path_ids:
- - phenopackets
+ - phenopackets # Beacon+ specific example
################################################################################
# Beacon entry type defaults - please adjust esp. info and schema paths...
@@ -42,36 +43,7 @@ data_pipeline_path_ids:
# framework and might be disentangled further on ...
################################################################################
-# => snake_casing
-
-# standard examples
-
-# ################################################################################
-# filteringTerm:
-# is_entry_type: False
-# request_entity_path_id: filteringTerms
-# response_entity_id: filteringTerm
-# collection: collations
-# response_schema: beaconFilteringTermsResponse
-# beacon_schema:
-# entity_type: filteringTerm
-# schema: https://progenetix.org/services/schemas/filteringTermsSchema/
-# h->o_access_key: Null
-# ################################################################################
-# biosample:
-# is_entry_type: True
-# request_entity_path_id: biosamples
-# response_entity_id: biosample
-# collection: biosamples
-# response_schema: beaconResultsetsResponse
-# beacon_schema:
-# entity_type: biosample
-# schema: https://progenetix.org/services/schemas/biosample/
-# h->o_access_key: biosamples._id
-# ################################################################################
-
entity_defaults:
-
info:
is_entry_type: False
collection: Null
diff --git a/bin/local/instance_overrides.yaml b/bin/local/instance_overrides.yaml
new file mode 100644
index 0000000..8c8fc7a
--- /dev/null
+++ b/bin/local/instance_overrides.yaml
@@ -0,0 +1,62 @@
+progenetix:
+ domains:
+ - progenetix.org
+ - www.progenetix.org
+ - progenetix.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: progenetix
+ test_domains:
+ - progenetix.test
+
+beaconplus:
+ domains:
+ - beaconplus.progenetix.org
+ - beaconplus.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: examplez
+ test_domains:
+ - beaconplus.test
+
+cancercelllines:
+ domains:
+ - cancercelllines.org
+ - www.cancercelllines.org
+ - cancercelllines.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: cellz
+ test_domains:
+ - cancercelllines.test
+ entity_defaults:
+ info:
+ content:
+ beacon_id: org.cancercelllines
+ name: Cancer Cell Line Genomics Beacon+
+ id: org.cancercelllines.beacon
+ environment: prod
+ description: >-
+ The cancercelllines.org Beacon is a specific instance of the Progenetix
+ Beacon+ environment providing information about genomic variations in
+ cancer cell lines.
+ type:
+ group: org.ga4gh
+ artifact: beacon
+ version: v2.1.0-beaconplus
+ documentation_url: http://docs.cancercelllines.org
+ service_url: http://cancercelllines.org/beacon/
+ welcome_url: https://cancercelllines.org/biosamples/
+ alternative_url: https://cancercelllines.org
+ contact_url: mailto:contact@progenetix.org
+ created_at: 2023-07-01T00:00:00
+ updated_at: 2024-02-24T13:00:00
+ organization:
+ welcome_url: https://cancercelllines.org/
+ contact_url: mailto:contact@progenetix.org
+ logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png
+ info:
+ create_date_time: 2023-07-01T00:00:00
+ # duplicate key removed: update_date_time appeared twice (2024-02-24T12:45:00 superseded by value below)
+ update_date_time: 2024-02-24T13:00:00
+
diff --git a/bin/local/local_paths.yaml b/bin/local/local_paths.yaml
index 29dc172..a35deda 100644
--- a/bin/local/local_paths.yaml
+++ b/bin/local/local_paths.yaml
@@ -19,8 +19,3 @@ server_callsets_dir_loc:
- grch38
probefile_name: probes,cn.tsv
-
-test_domains:
- - progenetix.test
- - cancercelllines.test
- - beaconplus.test
diff --git a/bin/local/services_defaults.yaml b/bin/local/services_defaults.yaml
index e60ce82..6d50321 100644
--- a/bin/local/services_defaults.yaml
+++ b/bin/local/services_defaults.yaml
@@ -1,3 +1,5 @@
+# Definitions here in fact are treated like `beacon_defaults` and merged into
+# the global `beacon_defaults` dictionary
defaults: {}
################################################################################
@@ -35,6 +37,7 @@ service_path_aliases:
schemas: schemas
uploader: uploader
uploadplotter: uploadplotter
+ variantsbedfile: variantsbedfile
vcf: vcfvariants
vcfvariants: vcfvariants
diff --git a/bin/publicationsInserter.py b/bin/publicationsInserter.py
index e230e46..2608585 100755
--- a/bin/publicationsInserter.py
+++ b/bin/publicationsInserter.py
@@ -238,7 +238,7 @@ def get_ncit_tumor_types(n_p, pub):
##############################################################################
def get_empty_publication(byc):
- publication = object_instance_from_schema_name(byc, "Publication", "")
+ publication = object_instance_from_schema_name("Publication", "")
publication.update({
"updated": date_isoformat(datetime.datetime.now()),
"provenance": {
diff --git a/bin/templateTablesCreator.py b/bin/templateTablesCreator.py
old mode 100644
new mode 100755
index ae3b805..e8b01b7
--- a/bin/templateTablesCreator.py
+++ b/bin/templateTablesCreator.py
@@ -28,7 +28,7 @@ def main():
def templates_creator():
initialize_bycon_service(byc, "templates_creator")
- dt_m = byc["datatable_mappings"].get("definitions", {})
+ dt_m = BYC["datatable_mappings"].get("definitions", {})
rsrc_p = path.join(pkg_path, "rsrc", "templates")
all_cols = []
diff --git a/bin/variantsInserter.py b/bin/variantsInserter.py
index 0ff4c4e..7a7d641 100755
--- a/bin/variantsInserter.py
+++ b/bin/variantsInserter.py
@@ -35,7 +35,6 @@ def variants_inserter():
ds_id = byc["dataset_ids"][0]
input_file = BYC_PARS.get("inputfile")
- dt_m = byc.get("datatable_mappings", {})
if not input_file:
print("No input file file specified (-i, --inputfile) => quitting ...")
@@ -121,7 +120,7 @@ def variants_inserter():
"individual_id": v.get("individual_id", re.sub("pgxbs-", "pgxind-", bs_id))
})
- insert_v = import_datatable_dict_line(dt_m, insert_v, variants.fieldnames, v, "genomicVariant")
+ insert_v = import_datatable_dict_line(insert_v, variants.fieldnames, v, "genomicVariant")
prdbug(insert_v)
insert_v = ByconVariant(byc).pgxVariant(insert_v)
insert_v.update({"updated": datetime.datetime.now().isoformat()})
diff --git a/exports/multicollationtest-collationplots.svg b/exports/multicollationtest-collationplots.svg
new file mode 100644
index 0000000..da16af4
--- /dev/null
+++ b/exports/multicollationtest-collationplots.svg
@@ -0,0 +1,1715 @@
+
\ No newline at end of file
diff --git a/exports/multicollationtest.svg b/exports/multicollationtest.svg
index e4cdd6c..da16af4 100644
--- a/exports/multicollationtest.svg
+++ b/exports/multicollationtest.svg
@@ -4,7 +4,7 @@ xmlns:xlink="http://www.w3.org/1999/xlink"
version="1.1"
id="genomeplot"
width="1024px"
-height="785px"
+height="1435px"
style="margin: auto; font-family: Helvetica, sans-serif;">
-
+
@@ -918,694 +918,798 @@ style="margin: auto; font-family: Helvetica, sans-serif;">
-Adenocarcinoma, NOS
-pgx:icdom-81403 (progenetix, 18589 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Infiltrating duct carcinoma, NOS
-pgx:icdom-85003 (progenetix, 12621 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Hepatocellular carcinoma, NOS
-pgx:icdom-81703 (progenetix, 2024 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Pheochromocytoma, malignant
-pgx:icdom-87003 (progenetix, 56 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Neuroblastoma, NOS
-pgx:icdom-95003 (cellz, 112 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Astrocytoma, NOS
-pgx:icdom-94003 (progenetix, 556 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Neuroblastoma, NOS
-pgx:icdom-95003 (progenetix, 1982 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Hepatocellular carcinoma, NOS
-pgx:icdom-81703 (cellz, 72 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Infiltrating duct carcinoma, NOS
-pgx:icdom-85003 (cellz, 835 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Adenocarcinoma, NOS
-pgx:icdom-81403 (cellz, 951 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Astrocytoma, NOS
-pgx:icdom-94003 (cellz, 53 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Malignant melanoma, NOS
-pgx:icdom-87203 (progenetix, 2538 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Malignant melanoma, NOS
-pgx:icdom-87203 (cellz, 673 samples)
-
-
-
-30%
-
-30%
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+Hepatocellular carcinoma, NOS
+pgx:icdom-81703 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Neuroblastoma, NOS
+pgx:icdom-95003 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Astrocytoma, NOS
+pgx:icdom-94003 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Malignant melanoma, NOS
+pgx:icdom-87203 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Adenocarcinoma, NOS
+pgx:icdom-81403 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Infiltrating duct carcinoma, NOS
+pgx:icdom-85003 (progenetix, 200 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Pheochromocytoma, malignant
+pgx:icdom-87003 (progenetix, 56 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Neuroblastoma, NOS
+pgx:icdom-95003 (cellz, 112 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hepatocellular carcinoma, NOS
+pgx:icdom-81703 (cellz, 72 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Adenocarcinoma, NOS
+pgx:icdom-81403 (cellz, 951 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Infiltrating duct carcinoma, NOS
+pgx:icdom-85003 (cellz, 835 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Malignant melanoma, NOS
+pgx:icdom-87203 (cellz, 673 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Astrocytoma, NOS
+pgx:icdom-94003 (cellz, 53 samples)
+
+
+
+25%
+
+25%
+
+50%
+
+50%
+
+75%
+
+75%
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/local/beacon_defaults.yaml b/local/beacon_defaults.yaml
index e5c9482..55ef4d1 100644
--- a/local/beacon_defaults.yaml
+++ b/local/beacon_defaults.yaml
@@ -13,11 +13,12 @@ defaults:
# the aliases here are for non-standard speling or additional entry types
service_path_aliases:
- analyses: analyses
- filteringTerms: filteringTerms
- phenopackets: phenopackets
- variants: genomicVariations
- genomicVariations: genomicVariations
+  filteringTerms: filtering_terms # just for spelling variations
+  entryTypes: entry_types # just for spelling variations
+  variants: genomicVariations # just for spelling variations
+  genomicVariations: genomicVariations # just for spelling variations
+ phenopackets: phenopackets # Beacon+ specific example
+
################################################################################
# here you can map additional path values to the corresponding (additional)
@@ -25,16 +26,16 @@ service_path_aliases:
################################################################################
path_entry_type_mappings:
- phenopackets: phenopacket
+ phenopackets: phenopacket # Beacon+ specific example
################################################################################
# her you can add additional path ids to the data query aggregation pipeline
# that usually mapps/reduces queries against biosamples, genomicVariations,
-#individuals ...
+# individuals ...
################################################################################
data_pipeline_path_ids:
- - phenopackets
+ - phenopackets # Beacon+ specific example
################################################################################
# Beacon entry type defaults - please adjust esp. info and schema paths...
@@ -42,36 +43,7 @@ data_pipeline_path_ids:
# framework and might be disentangled further on ...
################################################################################
-# => snake_casing
-
-# standard examples
-
-# ################################################################################
-# filteringTerm:
-# is_entry_type: False
-# request_entity_path_id: filteringTerms
-# response_entity_id: filteringTerm
-# collection: collations
-# response_schema: beaconFilteringTermsResponse
-# beacon_schema:
-# entity_type: filteringTerm
-# schema: https://progenetix.org/services/schemas/filteringTermsSchema/
-# h->o_access_key: Null
-# ################################################################################
-# biosample:
-# is_entry_type: True
-# request_entity_path_id: biosamples
-# response_entity_id: biosample
-# collection: biosamples
-# response_schema: beaconResultsetsResponse
-# beacon_schema:
-# entity_type: biosample
-# schema: https://progenetix.org/services/schemas/biosample/
-# h->o_access_key: biosamples._id
-# ################################################################################
-
entity_defaults:
-
info:
is_entry_type: False
collection: Null
diff --git a/local/instance_overrides.yaml b/local/instance_overrides.yaml
new file mode 100644
index 0000000..8c8fc7a
--- /dev/null
+++ b/local/instance_overrides.yaml
@@ -0,0 +1,62 @@
+progenetix:
+ domains:
+ - progenetix.org
+ - www.progenetix.org
+ - progenetix.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: progenetix
+ test_domains:
+ - progenetix.test
+
+beaconplus:
+ domains:
+ - beaconplus.progenetix.org
+ - beaconplus.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: examplez
+ test_domains:
+ - beaconplus.test
+
+cancercelllines:
+ domains:
+ - cancercelllines.org
+ - www.cancercelllines.org
+ - cancercelllines.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: cellz
+ test_domains:
+ - cancercelllines.test
+ entity_defaults:
+ info:
+ content:
+ beacon_id: org.cancercelllines
+ name: Cancer Cell Line Genomics Beacon+
+ id: org.cancercelllines.beacon
+ environment: prod
+ description: >-
+ The cancercelllines.org Beacon is a specific instance of the Progenetix
+        Beacon+ environment providing information about genomic variations in
+ cancer cell lines.
+ type:
+ group: org.ga4gh
+ artifact: beacon
+ version: v2.1.0-beaconplus
+ documentation_url: http://docs.cancercelllines.org
+ service_url: http://cancercelllines.org/beacon/
+ welcome_url: https://cancercelllines.org/biosamples/
+ alternative_url: https://cancercelllines.org
+ contact_url: mailto:contact@progenetix.org
+ created_at: 2023-07-01T00:00:00
+ updated_at: 2024-02-24T13:00:00
+ organization:
+ welcome_url: https://cancercelllines.org/
+ contact_url: mailto:contact@progenetix.org
+ logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png
+ info:
+      # update_date_time: 2024-02-24T12:45:00  # duplicate key, superseded by entry below
+ create_date_time: 2023-07-01T00:00:00
+ update_date_time: 2024-02-24T13:00:00
+
diff --git a/local/local_paths.yaml b/local/local_paths.yaml
index 29dc172..a35deda 100644
--- a/local/local_paths.yaml
+++ b/local/local_paths.yaml
@@ -19,8 +19,3 @@ server_callsets_dir_loc:
- grch38
probefile_name: probes,cn.tsv
-
-test_domains:
- - progenetix.test
- - cancercelllines.test
- - beaconplus.test
diff --git a/local/services_defaults.yaml b/local/services_defaults.yaml
index e60ce82..6d50321 100644
--- a/local/services_defaults.yaml
+++ b/local/services_defaults.yaml
@@ -1,3 +1,5 @@
+# Definitions here in fact are treated like `beacon_defaults` and merged into
+# the global `beacon_defaults` dictionary
defaults: {}
################################################################################
@@ -35,6 +37,7 @@ service_path_aliases:
schemas: schemas
uploader: uploader
uploadplotter: uploadplotter
+ variantsbedfile: variantsbedfile
vcf: vcfvariants
vcfvariants: vcfvariants
diff --git a/rsrc/templates/analysis_template.tsv b/rsrc/templates/analysis_template.tsv
index 55fb41a..46eab2c 100644
--- a/rsrc/templates/analysis_template.tsv
+++ b/rsrc/templates/analysis_template.tsv
@@ -1 +1 @@
-analysis_id biosample_id individual_id legacy_ids variant_class experiment_id series_id platform_id platform_label data_provenance
+analysis_id biosample_id individual_id analysis_legacy_id legacy_ids analysis_operation_id analysis_operation_label experiment_id series_id platform_id platform_label data_provenance calling_pipeline
diff --git a/rsrc/templates/biosample_template.tsv b/rsrc/templates/biosample_template.tsv
index 35dd41b..7a1f5ed 100644
--- a/rsrc/templates/biosample_template.tsv
+++ b/rsrc/templates/biosample_template.tsv
@@ -1 +1 @@
-biosample_id group_id group_label individual_id callset_ids external_references_id___PMID external_references_label___PMID external_references_id___arrayexpress external_references_label___arrayexpress external_references_id___cbioportal external_references_label___cbioportal external_references_id___cellosaurus external_references_label___cellosaurus legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label
+biosample_id biosample_label biosample_legacy_id individual_id callset_ids group_id group_label pubmed_id pubmed_label cellosaurus_id cellosaurus_label cbioportal_id cbioportal_label external_references_id___arrayexpress external_references_label___arrayexpress cohort_ids legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label
diff --git a/rsrc/templates/genomicVariant_template.tsv b/rsrc/templates/genomicVariant_template.tsv
index 5fd7c5b..6ceabf4 100644
--- a/rsrc/templates/genomicVariant_template.tsv
+++ b/rsrc/templates/genomicVariant_template.tsv
@@ -1 +1 @@
-variant_id variant_internal_id callset_id biosample_id individual_id sequence_id reference_name start end variant_state_id variant_state_label reference_sequence sequence annotation_derived aminoacid_changes genomic_hgvs_id log2 variant_type reference_bases alternate_bases
+variant_id variant_internal_id callset_id biosample_id individual_id sequence_id reference_name start end variant_state_id variant_state_label reference_sequence sequence annotation_derived aminoacid_changes genomic_hgvs_id log2 variant_type
diff --git a/rsrc/templates/individual_template.tsv b/rsrc/templates/individual_template.tsv
index 3c75eaa..82be326 100644
--- a/rsrc/templates/individual_template.tsv
+++ b/rsrc/templates/individual_template.tsv
@@ -1 +1 @@
-individual_id legacy_ids sex_id sex_label age_iso age_days data_use_conditions_id data_use_conditions_label histological_diagnosis_id histological_diagnosis_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cell_line_donation_id cell_line_donation_label
+individual_id individual_legacy_id legacy_ids sex_id sex_label age_iso age_days data_use_conditions_id data_use_conditions_label histological_diagnosis_id histological_diagnosis_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cell_line_donation_id cell_line_donation_label
diff --git a/rsrc/templates/metadata_template.tsv b/rsrc/templates/metadata_template.tsv
index 039ec8d..7857c2c 100644
--- a/rsrc/templates/metadata_template.tsv
+++ b/rsrc/templates/metadata_template.tsv
@@ -1 +1 @@
-biosample_id group_id group_label individual_id callset_ids external_references_id___PMID external_references_label___PMID external_references_id___arrayexpress external_references_label___arrayexpress external_references_id___cbioportal external_references_label___cbioportal external_references_id___cellosaurus external_references_label___cellosaurus legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label age_days data_use_conditions_id data_use_conditions_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes cell_line_donation_id cell_line_donation_label analysis_id variant_class platform_label data_provenance
+biosample_id biosample_label biosample_legacy_id individual_id callset_ids group_id group_label pubmed_id pubmed_label cellosaurus_id cellosaurus_label cbioportal_id cbioportal_label external_references_id___arrayexpress external_references_label___arrayexpress cohort_ids legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label individual_legacy_id age_days data_use_conditions_id data_use_conditions_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes cell_line_donation_id cell_line_donation_label analysis_id analysis_legacy_id analysis_operation_id analysis_operation_label platform_label data_provenance calling_pipeline
diff --git a/services/collationplots.py b/services/collationplots.py
index a9b2341..c3c3126 100755
--- a/services/collationplots.py
+++ b/services/collationplots.py
@@ -4,12 +4,22 @@
import sys, datetime, argparse
from pymongo import MongoClient
-from bycon import *
+from bycon import (
+ BeaconErrorResponse,
+ byc,
+ initialize_bycon_service,
+ print_text_response,
+ rest_path_value,
+ run_beacon_init_stack,
+ BYC,
+ BYC_PARS
+)
services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
sys.path.append( services_lib_path )
from bycon_bundler import *
from bycon_plot import *
+from file_utils import ExportFile
from interval_utils import generate_genome_bins
from service_helpers import *
from service_response_generation import *
@@ -19,9 +29,9 @@
* https://progenetix.org/services/collationplots/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167,pgx:icdom-85003
* https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&output=histoplot
* https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&id=pgxcohort-TCGAcancers
-* https://progenetix.org/cgi/bycon/services/intervalFrequencies.py/?output=pgxseg&datasetIds=progenetix&filters=NCIT:C7376
* http://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT&filterPrecision=start&withSamples=20&collationTypes=NCIT&output=histoplot&plot_area_height=20&plot_labelcol_font_size=6&plot_axislab_y_width=2&plot_label_y_values=0&plot_axis_y_max=80&plot_region_gap_width=1&debug=
* http://progenetix.test/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&output=histoheatplot
+* http://progenetix.test/services/collationplots/?datasetIds=progenetix&collationTypes=NCIT&minNumber=500&plotType=histoheatplot&method=codematches
podmd"""
################################################################################
@@ -29,7 +39,6 @@
################################################################################
def main():
-
try:
collationplots()
except Exception:
@@ -42,23 +51,23 @@ def collationplots():
run_beacon_init_stack(byc)
generate_genome_bins(byc)
- plot_type = BYC_PARS.get("plot_type", "___none___")
- if plot_type not in ["histoplot", "histoheatplot", "histosparkplot"]:
- plot_type = "histoplot"
-
- BYC_PARS.update({"plot_type": plot_type})
- id_from_path = rest_path_value("collationplots")
- if id_from_path:
- byc[ "filters" ] = [ {"id": id_from_path } ]
+ if (id_from_path := rest_path_value("collationplots")):
+ byc["filters"] = [ {"id": id_from_path } ]
elif "id" in BYC_PARS:
- byc[ "filters" ] = [ {"id": BYC_PARS["id"]} ]
+ byc["filters"] = [ {"id": BYC_PARS["id"]} ]
+ if BYC_PARS.get("plot_type", "___none___") not in ["histoplot", "histoheatplot", "histosparkplot"]:
+ BYC_PARS.update({"plot_type": "histoplot"})
- if not "filters" in byc:
- BYC["ERRORS"].append("No value was provided for collation `id` or `filters`.")
+ svg_f = ExportFile("svg").checkOutputFile()
+ pdb = ByconBundler(byc).collationsPlotbundles()
+    if len(BYC["ERRORS"]) > 0:
BeaconErrorResponse(byc).response(422)
- pdb = ByconBundler(byc).collationsPlotbundles()
- ByconPlot(byc, pdb).svgResponse()
+ BP = ByconPlot(byc, pdb)
+ if svg_f:
+ BP.svg2file(svg_f)
+ else:
+ BP.svgResponse()
################################################################################
diff --git a/services/config/genespans.yaml b/services/config/genespans.yaml
index 185c72c..9b49087 100644
--- a/services/config/genespans.yaml
+++ b/services/config/genespans.yaml
@@ -1,17 +1,11 @@
defaults:
- query_fields:
- - symbol
- - ensembl_gene_ids
- - synonyms
response_entity_id: gene
meta:
received_request_summary:
- requested_granularity: record
assembly_id: GRCh38
requested_schemas:
- entity_type: gene
schema: https://progenetix.org/services/schemas/ProgenetixGene
- returned_granularity: record
info: >
The main genes payload can be accessed in `response.results`.
assembly_ids:
diff --git a/services/intervalFrequencies.py b/services/intervalFrequencies.py
index 6cd5a34..dfc57dc 100755
--- a/services/intervalFrequencies.py
+++ b/services/intervalFrequencies.py
@@ -47,22 +47,18 @@ def interval_frequencies():
run_beacon_init_stack(byc)
generate_genome_bins(byc)
- id_from_path = rest_path_value("intervalFrequencies")
- if id_from_path:
- byc[ "filters" ] = [ {"id": id_from_path } ]
+    if (id_from_path := rest_path_value("intervalFrequencies")):
+ byc["filters"] = [ {"id": id_from_path } ]
elif "id" in BYC_PARS:
- byc[ "filters" ] = [ {"id": BYC_PARS["id"]} ]
-
- if not "filters" in byc:
- BYC["ERRORS"].append("No value was provided for collation `id` or `filters`.")
+ byc["filters"] = [ {"id": BYC_PARS["id"]} ]
+ pdb = ByconBundler(byc).collationsPlotbundles()
+    if len(BYC["ERRORS"]) > 0:
BeaconErrorResponse(byc).response(422)
file_type = BYC_PARS.get("output", "___none___")
if file_type not in ["pgxfreq", "pgxmatrix", "pgxseg"]:
file_type = "pgxfreq"
output = file_type
- pdb = ByconBundler(byc).collationsPlotbundles()
-
if "pgxseg" in output or "pgxfreq" in output:
export_pgxseg_frequencies(byc, pdb["interval_frequencies_bundles"])
elif "matrix" in output:
diff --git a/services/lib/bycon_bundler.py b/services/lib/bycon_bundler.py
index 723344c..50ed29a 100644
--- a/services/lib/bycon_bundler.py
+++ b/services/lib/bycon_bundler.py
@@ -18,7 +18,6 @@
################################################################################
class ByconBundler:
-
"""
# The `ByconBundler` class
@@ -36,8 +35,8 @@ def __init__(self, byc):
self.local_paths = byc.get("local_paths", {})
self.datasets_results = None
self.dataset_ids = byc.get("dataset_ids", [])
- self.datatable_mappings = byc.get("datatable_mappings", {})
self.filters = byc.get("filters", [])
+ self.collation_types = BYC_PARS.get("collation_types", [])
self.min_number = BYC_PARS.get("min_number", 0)
self.delivery_method = BYC_PARS.get("method")
self.header = []
@@ -79,7 +78,6 @@ def __init__(self, byc):
#--------------------------------------------------------------------------#
def read_pgx_file(self, filepath):
-
self.filepath = filepath
h_lines = []
@@ -101,7 +99,6 @@ def read_pgx_file(self, filepath):
#--------------------------------------------------------------------------#
def read_probedata_file(self, filepath):
-
self.filepath = filepath
self.probedata = []
@@ -139,11 +136,9 @@ def read_probedata_file(self, filepath):
def pgxseg_to_keyed_bundle(self, filepath):
self.read_pgx_file(filepath)
-
if not "biosample_id" in self.fieldnames:
self.errors.append("¡¡¡ The `biosample_id` parameter is required for variant assignment !!!")
return
-
self.__deparse_pgxseg_samples_header()
self.__keyed_bundle_add_variants_from_lines()
@@ -153,33 +148,27 @@ def pgxseg_to_keyed_bundle(self, filepath):
#--------------------------------------------------------------------------#
def pgxseg_to_plotbundle(self, filepath):
-
self.pgxseg_to_keyed_bundle(filepath)
self.__flatten_keyed_bundle()
-
return {
"interval_frequencies_bundles": self.callsets_frequencies_bundles(),
"callsets_variants_bundles": self.callsets_variants_bundles()
}
+
#--------------------------------------------------------------------------#
def callsets_variants_bundles(self):
-
# TODO: This is similar to a keyed bundle component ...
-
bb = self.bundle
-
c_p_l = []
for p_o in bb.get("analyses", []):
cs_id = p_o.get("id")
p_o.update({
"variants": list(filter(lambda v: v.get("callset_id", "___none___") == cs_id, bb["variants"]))
})
- c_p_l.append(p_o)
-
+ c_p_l.append(p_o)
self.callsetVariantsBundles = c_p_l
-
return self.callsetVariantsBundles
@@ -240,7 +229,7 @@ def __deparse_pgxseg_samples_header(self):
continue
bios = {"id": bs_id}
- bios = import_datatable_dict_line(self.datatable_mappings, bios, fieldnames, bios_d, "biosample")
+ bios = import_datatable_dict_line(bios, fieldnames, bios_d, "biosample")
cs_id = bios.get("callset_id", re.sub("pgxbs", "pgxcs", bs_id) )
ind_id = bios.get("individual_id", re.sub("pgxbs", "pgxind", bs_id) )
ind = {"id": ind_id}
@@ -295,9 +284,6 @@ def __callsets_bundle_from_result_set(self, bundle_type="analyses"):
if cnv_chro_stats is False or cnv_statusmaps is False:
continue
- prdbug(f'dataset_id: {ds_id}')
- prdbug(f'label in bundler: {s.get("label")}')
-
p_o = {
"dataset_id": ds_id,
"callset_id": s.get(analysis_key, "NA"),
@@ -328,9 +314,6 @@ def __callsets_add_database_variants(self):
cs_id = p_o.get("callset_id", "___none___")
v_q = {"callset_id": cs_id}
p_o.update({"variants": list(var_coll.find(v_q))})
- # for v in var_coll.find(v_q):
- # p_o["variants"].append(ByconVariant(self.byc).byconVariant(v))
-
c_p_l.append(p_o)
self.callsetVariantsBundles = c_p_l
@@ -380,7 +363,7 @@ def __keyed_bundle_add_variants_from_lines(self):
"callset_id": cs_id,
}
- update_v = import_datatable_dict_line(self.datatable_mappings, update_v, fieldnames, v, "genomicVariant")
+ update_v = import_datatable_dict_line(update_v, fieldnames, v, "genomicVariant")
update_v = ByconVariant(self.byc).pgxVariant(update_v)
update_v.update({
@@ -403,6 +386,7 @@ def __keyed_bundle_add_variants_from_lines(self):
"variants_by_callset_id": vars_ided
})
+
#--------------------------------------------------------------------------#
def __flatten_keyed_bundle(self):
@@ -419,6 +403,7 @@ def __flatten_keyed_bundle(self):
"variants": [elem for sublist in ( v_cs_k.values() ) for elem in sublist]
})
+
#--------------------------------------------------------------------------#
def __callsetBundleCreateIsets(self, label=""):
@@ -426,10 +411,8 @@ def __callsetBundleCreateIsets(self, label=""):
for ds_id in self.dataset_ids:
dscs = list(filter(lambda cs: cs.get("dataset_id", "NA") == ds_id, self.bundle["analyses"]))
intervals, cnv_cs_count = interval_counts_from_callsets(self.bundle["analyses"], self.byc)
-
if cnv_cs_count < self.min_number:
continue
-
iset = {
"dataset_id": ds_id,
"group_id": ds_id,
@@ -446,44 +429,50 @@ def __callsetBundleCreateIsets(self, label=""):
def __isetBundlesFromCollationParameters(self):
if len(self.dataset_ids) < 1:
+            BYC["ERRORS"].append("¡¡¡ No `datasetIds` parameter !!!")
return
- if len(self.filters) < 1:
+ if len(self.filters) < 1 and len(self.collation_types) < 1:
+ BYC["ERRORS"].append("¡¡¡ No `filters` or `collationTypes` parameter !!!")
return
-
fmap_name = "frequencymap"
if "codematches" in str(self.delivery_method):
fmap_name = "frequencymap_codematches"
- mongo_client = MongoClient(host=DB_MONGOHOST)
+ id_q = {}
+ if len(self.filters) > 0:
+ fids = [x.get("id", "___none___") for x in self.filters]
+ id_q = {"id": {"$in": fids}}
+ elif len(self.collation_types) > 0:
+ id_q = {"collation_type": {"$in": self.collation_types}}
+ prdbug(f'... __isetBundlesFromCollationParameters query {id_q}')
+
+ mongo_client = MongoClient(host=DB_MONGOHOST)
for ds_id in self.dataset_ids:
coll_db = mongo_client[ds_id]
- for f in self.filters:
- f_val = f["id"]
+ coll_ids = coll_db[ "collations" ].distinct("id", id_q)
+ prdbug(f'prefetched coll ids: {coll_ids}')
+ for f_val in coll_ids:
f_q = { "id": f_val }
- collation_f = coll_db[ "frequencymaps" ].find_one( { "id": f_val } )
- collation_c = coll_db[ "collations" ].find_one( { "id": f_val } )
-
+ collation_f = coll_db[ "frequencymaps" ].find_one( f_q )
+ collation_c = coll_db[ "collations" ].find_one( f_q )
if not collation_f:
continue
if not collation_c:
continue
if not fmap_name in collation_f:
continue
-
fmap_count = collation_f[ fmap_name ].get("analysis_count", 0)
if fmap_count < self.min_number:
continue
-
r_o = {
"dataset_id": ds_id,
"group_id": f_val,
"label": re.sub(r';', ',', collation_c["label"]),
"sample_count": fmap_count,
- "interval_frequencies": collation_f[ fmap_name ]["intervals"] }
-
+ "interval_frequencies": collation_f[ fmap_name ]["intervals"] }
self.intervalFrequenciesBundles.append(r_o)
-
mongo_client.close( )
+
################################################################################
diff --git a/services/lib/cytoband_utils.py b/services/lib/cytoband_utils.py
index 3ab9de9..463d7cc 100644
--- a/services/lib/cytoband_utils.py
+++ b/services/lib/cytoband_utils.py
@@ -239,37 +239,26 @@ def deparse_ISCN_to_variants(iscn, byc):
errors = []
for cnv_t, cnv_defs in v_t_defs.items():
-
revish = cnv_defs.get("revish_label")
if not revish:
continue
iscn_re = re.compile(rf"^.*?{revish}\(([\w.,]+)\).*?$", re.IGNORECASE)
-
if iscn_re.match(iscn):
-
m = iscn_re.match(iscn).group(1)
-
- for i_v in re.split(",", m):
-
+ for i_v in re.split(",", m):
if not cb_pat.match(i_v):
continue
-
cytoBands, chro, start, end, error = bands_from_cytobands(i_v, c_b_d, a_d)
if len(error) > 0:
errors.append(error)
continue
-
v_l = end - start
t = cnv_defs.get("DUPDEL", "CNV")
-
cytostring = "{}({})".format(cnv_t, i_v).lower()
-
if "amp" in revish and v_l > i_d.get("cnv_amp_max_size", 3000000):
revish = "hldup"
-
- v_s = {}
-
+ v_s = {}
v = ({
"variant_state": cnv_defs.get("variant_state"),
"location": {
diff --git a/services/lib/datatable_utils.py b/services/lib/datatable_utils.py
index 44aede2..31105c1 100644
--- a/services/lib/datatable_utils.py
+++ b/services/lib/datatable_utils.py
@@ -3,7 +3,7 @@
from random import sample as randomSamples
# bycon
-from bycon import assign_nested_value, get_nested_value, prdbug, prjsonnice, BYC_PARS, ENV
+from bycon import assign_nested_value, get_nested_value, prdbug, prjsonnice, BYC, BYC_PARS, ENV
################################################################################
@@ -11,9 +11,7 @@ def export_datatable_download(results, byc):
# TODO: separate table generation from HTTP response
output = BYC_PARS.get("output", "___none___")
prdbug(f'... in export_datatable_download => {output}')
- dt_m = byc.get("datatable_mappings")
- if not dt_m:
- return
+ dt_m = BYC["datatable_mappings"]
r_t = byc.get("response_entity_id", "___none___")
if not r_t in dt_m["definitions"]:
return
@@ -62,8 +60,8 @@ def export_datatable_download(results, byc):
################################################################################
-def import_datatable_dict_line(datatable_mappings, parent, fieldnames, lineobj, primary_scope="biosample"):
- dt_m = datatable_mappings
+def import_datatable_dict_line(parent, fieldnames, lineobj, primary_scope="biosample"):
+ dt_m = BYC["datatable_mappings"]
if not primary_scope in dt_m["definitions"]:
return
io_params = dt_m["definitions"][ primary_scope ]["parameters"]
diff --git a/services/lib/export_file_generation.py b/services/lib/export_file_generation.py
index d6cdcf8..4123f60 100644
--- a/services/lib/export_file_generation.py
+++ b/services/lib/export_file_generation.py
@@ -1,7 +1,7 @@
-import pymongo
from os import path, environ
+from pymongo import MongoClient
-from bycon_helpers import get_nested_value, return_paginated_list
+from bycon_helpers import get_nested_value, return_paginated_list, select_this_server
from cgi_parsing import *
from config import *
from variant_mapping import ByconVariant
@@ -18,7 +18,7 @@ def stream_pgx_meta_header(ds_id, ds_results, byc):
ds_d = byc.get("dataset_definitions", {})
ds_ds_d = ds_d.get(ds_id, {})
- mongo_client = pymongo.MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
+ mongo_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
bs_coll = mongo_client[ ds_id ][ "biosamples" ]
open_text_streaming()
@@ -44,7 +44,7 @@ def stream_pgx_meta_header(ds_id, ds_results, byc):
################################################################################
def pgxseg_biosample_meta_line(byc, biosample, group_id_key="histological_diagnosis_id"):
- dt_m = byc["datatable_mappings"]
+ dt_m = BYC["datatable_mappings"]
io_params = dt_m["definitions"][ "biosample" ]["parameters"]
g_id_k = group_id_key
@@ -118,11 +118,11 @@ def print_filters_meta_line(byc):
################################################################################
def export_pgxseg_download(datasets_results, ds_id, byc):
- data_client = pymongo.MongoClient(host=DB_MONGOHOST)
+ data_client = MongoClient(host=DB_MONGOHOST)
v_coll = data_client[ ds_id ][ "variants" ]
ds_results = datasets_results.get(ds_id, {})
if not "variants._id" in ds_results:
- # TODO: error message here
+ BYC["ERRORS"].append("No variants found in the dataset results.")
return
v__ids = ds_results["variants._id"].get("target_values", [])
if test_truthy( BYC_PARS.get("paginate_results", True) ):
@@ -139,14 +139,113 @@ def export_pgxseg_download(datasets_results, ds_id, byc):
v_instances = list(sorted(v_instances, key=lambda x: (f'{x["reference_name"].replace("X", "XX").replace("Y", "YY").zfill(2)}', x['start'])))
for v in v_instances:
print_variant_pgxseg(v)
-
close_text_streaming()
+
+################################################################################
+
+def write_variants_bedfile(datasets_results, ds_id, byc):
+ """podmd
+ ##### Accepts
+
+ * a Bycon `byc` object
+ * a Bycon `h_o` handover object with its `target_values` representing `_id`
+ objects of a `variants` collection
+
+ The function creates a basic BED file and returns its local path. A standard
+ use would be to create a link to this file and submit it as `hgt.customText`
+ parameter to the UCSC browser.
+
+ ##### TODO
+
+ * The creation of the different variant types is still rudimentary and has to be
+ expanded in lockstep with improving Beacon documentation and examples. The
+ definition of the types and their match patterns should also be moved to a
+    separate configuration entry and subroutine.
+ * evaluate to use "bedDetails" format
+
+ podmd"""
+ local_paths = byc.get("local_paths")
+ if not local_paths:
+ return False
+ tmp_path = path.join( *local_paths[ "server_tmp_dir_loc" ])
+ if not path.isdir(tmp_path):
+ BYC["ERRORS"].append(f"Temporary directory `{tmp_path}` not found.")
+ return False
+ h_o_server = select_this_server(byc)
+ ext_url = f'http://genome.ucsc.edu/cgi-bin/hgTracks?org=human&db=hg38'
+ bed_url = f''
+
+ vs = { "DUP": [ ], "DEL": [ ], "LOH": [ ], "SNV": [ ]}
+ colors = {
+ "plot_DUP_color": (255, 198, 51),
+ "plot_AMP_color": (255,102,0),
+ "plot_DEL_color": (51, 160, 255),
+ "plot_HOMODEL_color": (0, 51, 204),
+ "plot_LOH_color": (102, 170, 153),
+ "plot_SNV_color": (255, 51, 204)
+ }
+
+ data_client = MongoClient(host=DB_MONGOHOST)
+ v_coll = data_client[ ds_id ][ "variants" ]
+ ds_results = datasets_results.get(ds_id, {})
+ if not "variants._id" in ds_results:
+ BYC["ERRORS"].append("No variants found in the dataset results.")
+ return [ext_url, bed_url]
+ v__ids = ds_results["variants._id"].get("target_values", [])
+ v_count = ds_results["variants._id"].get("target_count", 0)
+ accessid = ds_results["variants._id"].get("id", "___none___")
+ if test_truthy( BYC_PARS.get("paginate_results", True) ):
+ v__ids = return_paginated_list(v__ids, BYC_PARS.get("skip", 0), BYC_PARS.get("limit", 0))
+
+ bed_file_name = f'{accessid}.bed'
+ bed_file = path.join( tmp_path, bed_file_name )
+
+ for v__id in v__ids:
+ v = v_coll.find_one( { "_id": v__id }, { "_id": 0 } )
+ pv = ByconVariant(byc).byconVariant(v)
+ if (pvt := pv.get("variant_type", "___none___")) not in vs.keys():
+ continue
+ vs[pvt].append(pv)
+
+ b_f = open( bed_file, 'w' )
+ pos = set()
+ ucsc_chr = ""
+ for vt in vs.keys():
+ if len(vs[vt]) > 0:
+ try:
+ vs[vt] = sorted(vs[vt], key=lambda k: k['variant_length'], reverse=True)
+ except:
+ pass
+ col_key = f"plot_{vt}_color"
+ col_rgb = colors.get(col_key, (127, 127, 127))
+ # col_rgb = [127, 127, 127]
+ b_f.write(f'track name={vt} visibility=squish description=\"overall {v_count} variants matching the query; {len(vs[vt])} in this track\" color={col_rgb[0]},{col_rgb[1]},{col_rgb[2]}\n')
+ b_f.write("#chrom\tchromStart\tchromEnd\tbiosampleId\n")
+ for v in vs[vt]:
+ ucsc_chr = "chr"+v["reference_name"]
+ ucsc_min = int( v["start"] + 1 )
+ ucsc_max = int( v["end"] )
+ l = f'{ucsc_chr}\t{ucsc_min}\t{ucsc_max}\t{v.get("biosample_id", "___none___")}\n'
+ pos.add(ucsc_min)
+ pos.add(ucsc_max)
+ b_f.write( l )
+
+ b_f.close()
+ ucsc_range = sorted(pos)
+ ucsc_pos = "{}:{}-{}".format(ucsc_chr, ucsc_range[0], ucsc_range[-1])
+ ext_url = f'{ext_url}&position={ucsc_pos}&hgt.customText='
+ bed_url = f'{h_o_server}{local_paths.get("server_tmp_dir_web", "/tmp")}/{bed_file_name}'
+
+ return [ext_url, bed_url]
+
+
################################################################################
def print_variant_pgxseg(v_pgxseg):
print( pgxseg_variant_line(v_pgxseg) )
+
################################################################################
def print_pgxseg_header_line():
@@ -196,7 +295,7 @@ def export_callsets_matrix(datasets_results, ds_id, byc):
cs_r = datasets_results[ds_id].get("analyses._id")
if not cs_r:
return
- mongo_client = pymongo.MongoClient(host=DB_MONGOHOST)
+ mongo_client = MongoClient(host=DB_MONGOHOST)
bs_coll = mongo_client[ ds_id ][ "biosamples" ]
cs_coll = mongo_client[ ds_id ][ "analyses" ]
@@ -354,7 +453,7 @@ def export_vcf_download(datasets_results, ds_id, byc):
"INFO": ""
}
- data_client = pymongo.MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
+ data_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
v_coll = data_client[ ds_id ][ "variants" ]
ds_results = datasets_results.get(ds_id, {})
if not "variants._id" in ds_results:
diff --git a/services/lib/file_utils.py b/services/lib/file_utils.py
index f0113cd..f3d7b15 100644
--- a/services/lib/file_utils.py
+++ b/services/lib/file_utils.py
@@ -6,13 +6,46 @@
from copy import deepcopy
from random import sample as random_samples
-from bycon import ByconVariant, prjsonnice, return_paginated_list
+from bycon import (
+ ByconVariant,
+ BYC,
+ BYC_PARS,
+ ENV,
+ prdbug,
+ prjsonnice,
+ return_paginated_list
+)
-from datatable_utils import import_datatable_dict_line
from interval_utils import interval_cnv_arrays, interval_counts_from_callsets
################################################################################
+class ExportFile:
+
+ def __init__(self, file_type=None):
+ self.file_path = BYC_PARS.get("outputfile")
+ self.file_type = file_type
+
+ # -------------------------------------------------------------------------#
+ # ----------------------------- public ------------------------------------#
+ # -------------------------------------------------------------------------#
+
+ def checkOutputFile(self):
+ if not self.file_path:
+ if "local" in ENV:
+ BYC["ERRORS"].append("No output file specified (-o, --outputfile) => quitting ...")
+ return False
+ if self.file_type:
+ if not self.file_path.endswith(self.file_type):
+ if "local" in ENV:
+ BYC["ERRORS"].append(f"The output file should be an `{self.file_type}` => quitting ...")
+ return False
+ return self.file_path
+
+
+################################################################################
+
+
def read_tsv_to_dictlist(filepath, max_count=0):
dictlist = []
with open(filepath, newline='') as csvfile:
diff --git a/services/lib/interval_utils.py b/services/lib/interval_utils.py
index daf8fdc..c682d5b 100644
--- a/services/lib/interval_utils.py
+++ b/services/lib/interval_utils.py
@@ -64,6 +64,15 @@
################################################################################
################################################################################
+class GenomeBins:
+ def __init__(self, byc):
+ self.genomic_intervals = []
+
+ #--------------------------------------------------------------------------#
+ #----------------------------- public -------------------------------------#
+ #--------------------------------------------------------------------------#
+
+
def generate_genome_bins(byc):
parse_cytoband_file(byc)
__generate_cytoband_intervals(byc)
@@ -220,48 +229,33 @@ def interval_cnv_arrays(cs_vars, byc):
# the values_map collects all values for the given interval to retrieve
# the min and max values of each interval
values_map = [[] for i in range(int_no)]
-
digests = []
-
if type(cs_vars).__name__ == "Cursor":
cs_vars.rewind()
-
for v in cs_vars:
- if "variant_state" not in v:
- continue
-
- v_t_c = v["variant_state"].get("id", "__NA__")
+ v_t_c = v.get("variant_state", {}).get("id", "__NA__")
if v_t_c not in v_t_defs.keys():
continue
-
dup_del = v_t_defs[v_t_c].get("DUPDEL")
# skipping non-CNV vars
if dup_del is None:
continue
-
cov_lab = cov_labs[dup_del]
-
- if "reference_name" not in v:
- v.update({"reference_name": v["location"]["chromosome"]})
-
v_i_id = v.get("variant_internal_id", None)
v_cs_id = v.get("callset_id", None)
-
if v_i_id in digests:
if "local" in ENV:
print(f'\n¡¡¡ {v_i_id} already counted for {v_cs_id}')
+ continue
else:
digests.append(v_i_id)
for i, intv in enumerate(intervals):
-
if _has_overlap(intv, v):
-
ov_end = min(intv["end"], v["location"]["end"])
ov_start = max(intv["start"], v["location"]["start"])
ov = ov_end - ov_start
maps[cov_lab][i] += ov
-
try:
# print(type(v["info"]["cnv_value"]))
if type(v["info"]["cnv_value"]) == int or type(v["info"]["cnv_value"]) == float:
@@ -277,7 +271,7 @@ def interval_cnv_arrays(cs_vars, byc):
if maps[cov_lab][i] > 0:
cov = maps[cov_lab][i]
lab = f'{cov_lab}coverage'
- chro = str(intv["reference_name"])
+ chro = str(v["location"].get("chromosome"))
c_a = chro + intv["arm"]
cnv_stats[lab] += cov
chro_stats[chro][lab] += cov
@@ -335,7 +329,6 @@ def interval_counts_from_callsets(analyses, byc):
analyses with CNV statusmaps and return a list of standard genomic interval
objects with added per-interval quantitative data.
"""
-
min_f = byc["interval_definitions"]["interval_min_fraction"].get("value", 0.001)
int_fs = deepcopy(byc["genomic_intervals"])
int_no = len(int_fs)
@@ -346,32 +339,25 @@ def interval_counts_from_callsets(analyses, byc):
cs_no = len(list(analyses))
f_factor = 0
-
if cs_no > 0:
f_factor = 100 / cs_no
-
pars = {
"gain": {"cov_l": "dup", "val_l": "max"},
"loss": {"cov_l": "del", "val_l": "min"}
}
for t in pars.keys():
-
covs = np.zeros((cs_no, int_no))
vals = np.zeros((cs_no, int_no))
-
if type(analyses).__name__ == "Cursor":
analyses.rewind()
-
for i, cs in enumerate(analyses):
covs[i] = cs["cnv_statusmaps"][pars[t]["cov_l"]]
vals[i] = cs["cnv_statusmaps"][pars[t]["val_l"]]
-
counts = np.count_nonzero(covs >= min_f, axis=0)
frequencies = np.around(counts * f_factor, 3)
medians = np.around(np.ma.median(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3)
means = np.around(np.ma.mean(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3)
-
for i, interval in enumerate(int_fs):
int_fs[i].update({
t + "_frequency": frequencies[i],
diff --git a/services/lib/service_response_generation.py b/services/lib/service_response_generation.py
index a5d6d27..eb0adca 100644
--- a/services/lib/service_response_generation.py
+++ b/services/lib/service_response_generation.py
@@ -18,17 +18,15 @@ class ByconautServiceResponse:
def __init__(self, byc: dict, response_schema="byconautServiceResponse"):
self.byc = byc
- self.beacon_defaults = byc.get("beacon_defaults", {})
- self.services_defaults = byc.get("services_defaults", {})
- self.entity_defaults = self.beacon_defaults.get("entity_defaults", {"info":{}})
+ self.entity_defaults = BYC["beacon_defaults"].get("entity_defaults", {"info":{}})
self.service_config = self.byc.get("service_config", {})
self.response_schema = response_schema
self.requested_granularity = BYC_PARS.get("requested_granularity", "record")
# TBD for authentication?
- self.returned_granularity = self.requested_granularity
+ self.returned_granularity = byc.get("returned_granularity", "boolean")
self.beacon_schema = self.byc["response_entity"].get("beacon_schema", "___none___")
- self.data_response = object_instance_from_schema_name(byc, response_schema, "")
- self.error_response = object_instance_from_schema_name(byc, "beaconErrorResponse", "")
+ self.data_response = object_instance_from_schema_name(response_schema, "")
+ self.error_response = object_instance_from_schema_name("beaconErrorResponse", "")
self.__meta_add_received_request_summary_parameters()
self.__meta_add_parameters()
@@ -192,11 +190,9 @@ def __init__(self, byc: dict):
self.delivery_method = BYC_PARS.get("method", "___none___")
self.output = BYC_PARS.get("output", "___none___")
self.dataset_ids = byc.get("dataset_ids", [])
- self.beacon_defaults = byc.get("beacon_defaults", {})
self.service_config = byc.get("service_config", {})
- self.entity_defaults = self.beacon_defaults.get("entity_defaults", {"info":{}})
+ self.entity_defaults = BYC["beacon_defaults"].get("entity_defaults", {"info":{}})
self.filter_definitions = byc.get("filter_definitions", {})
- self.form_data = byc.get("form_data", {})
self.filters = byc.get("filters", [])
self.response_entity_id = byc.get("response_entity_id", "filteringTerm")
self.path_id_value = byc.get("request_entity_path_id_value", False)
@@ -221,18 +217,17 @@ def __return_collations(self):
f_coll = "collations"
d_k = set_selected_delivery_keys(self.service_config.get("method_keys"))
- c_id = self.form_data.get("id", "")
+ c_id = BYC_PARS.get("id", "")
# TODO: This should be derived from some entity definitions
# TODO: whole query generation in separate function ...
query = {}
if BYC["TEST_MODE"] is True:
- t_m_c = self.form_data.get("test_mode_count", 5)
+ t_m_c = BYC_PARS.get("test_mode_count", 5)
query = mongo_test_mode_query(self.dataset_ids[0], f_coll, t_m_c)
elif len(c_id) > 0:
query = { "id": c_id }
else:
-
q_list = []
ft_fs = []
for f in self.filters:
@@ -242,11 +237,9 @@ def __return_collations(self):
f_re = re.compile(r'^' + '|'.join(ft_fs))
else:
f_re = None
-
if f_re is not None:
q_list.append({"id": { "$regex": f_re}})
-
- q_types = self.form_data.get("collation_types", [])
+ q_types = BYC_PARS.get("collation_types", [])
if len(q_types) > 0:
q_list.append({"collation_type": {"$in": q_types }})
@@ -254,13 +247,14 @@ def __return_collations(self):
query = q_list[0]
elif len(q_list) > 1:
query = {"$and": q_list}
+
+ prdbug(f'Collation query: {query}')
# TODO
# if not query:
# warning = 'No limit (filters, collationTypes, id) on collation listing -> abortin...'
s_s = { }
-
for ds_id in self.dataset_ids:
fields = {"_id": 0}
f_s = mongo_result_list(ds_id, f_coll, query, fields)
@@ -268,14 +262,11 @@ def __return_collations(self):
if "codematches" in str(self.delivery_method):
if int(f.get("code_matches", 0)) < 1:
continue
-
i_d = f.get("id", "NA")
if i_d not in s_s:
s_s[ i_d ] = { }
-
if len(d_k) < 1:
- d_k = list(f.keys())
-
+ d_k = list(f.keys())
for k in d_k:
if k in self.service_config.get("integer_keys", []):
s_s[ i_d ].update({k: s_s[ i_d ].get(k, 0) + f.get(k, 0)})
diff --git a/services/local/beacon_defaults.yaml b/services/local/beacon_defaults.yaml
index e5c9482..55ef4d1 100644
--- a/services/local/beacon_defaults.yaml
+++ b/services/local/beacon_defaults.yaml
@@ -13,11 +13,12 @@ defaults:
# the aliases here are for non-standard speling or additional entry types
service_path_aliases:
- analyses: analyses
- filteringTerms: filteringTerms
- phenopackets: phenopackets
- variants: genomicVariations
- genomicVariations: genomicVariations
+  filteringTerms: filtering_terms # just for spelling variations
+  entryTypes: entry_types # just for spelling variations
+  variants: genomicVariations # just for spelling variations
+  genomicVariations: genomicVariations # just for spelling variations
+ phenopackets: phenopackets # Beacon+ specific example
+
################################################################################
# here you can map additional path values to the corresponding (additional)
@@ -25,16 +26,16 @@ service_path_aliases:
################################################################################
path_entry_type_mappings:
- phenopackets: phenopacket
+ phenopackets: phenopacket # Beacon+ specific example
################################################################################
# her you can add additional path ids to the data query aggregation pipeline
# that usually mapps/reduces queries against biosamples, genomicVariations,
-#individuals ...
+# individuals ...
################################################################################
data_pipeline_path_ids:
- - phenopackets
+ - phenopackets # Beacon+ specific example
################################################################################
# Beacon entry type defaults - please adjust esp. info and schema paths...
@@ -42,36 +43,7 @@ data_pipeline_path_ids:
# framework and might be disentangled further on ...
################################################################################
-# => snake_casing
-
-# standard examples
-
-# ################################################################################
-# filteringTerm:
-# is_entry_type: False
-# request_entity_path_id: filteringTerms
-# response_entity_id: filteringTerm
-# collection: collations
-# response_schema: beaconFilteringTermsResponse
-# beacon_schema:
-# entity_type: filteringTerm
-# schema: https://progenetix.org/services/schemas/filteringTermsSchema/
-# h->o_access_key: Null
-# ################################################################################
-# biosample:
-# is_entry_type: True
-# request_entity_path_id: biosamples
-# response_entity_id: biosample
-# collection: biosamples
-# response_schema: beaconResultsetsResponse
-# beacon_schema:
-# entity_type: biosample
-# schema: https://progenetix.org/services/schemas/biosample/
-# h->o_access_key: biosamples._id
-# ################################################################################
-
entity_defaults:
-
info:
is_entry_type: False
collection: Null
diff --git a/services/local/instance_overrides.yaml b/services/local/instance_overrides.yaml
new file mode 100644
index 0000000..8c8fc7a
--- /dev/null
+++ b/services/local/instance_overrides.yaml
@@ -0,0 +1,62 @@
+progenetix:
+ domains:
+ - progenetix.org
+ - www.progenetix.org
+ - progenetix.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: progenetix
+ test_domains:
+ - progenetix.test
+
+beaconplus:
+ domains:
+ - beaconplus.progenetix.org
+ - beaconplus.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: examplez
+ test_domains:
+ - beaconplus.test
+
+cancercelllines:
+ domains:
+ - cancercelllines.org
+ - www.cancercelllines.org
+ - cancercelllines.test
+ beacon_defaults:
+ defaults:
+ default_dataset_id: cellz
+ test_domains:
+ - cancercelllines.test
+ entity_defaults:
+ info:
+ content:
+ beacon_id: org.cancercelllines
+ name: Cancer Cell Line Genomics Beacon+
+ id: org.cancercelllines.beacon
+ environment: prod
+ description: >-
+ The cancercelllines.org Beacon is a specific instance of the Progenetix
+        Beacon+ environment providing information about genomic variations in
+ cancer cell lines.
+ type:
+ group: org.ga4gh
+ artifact: beacon
+ version: v2.1.0-beaconplus
+ documentation_url: http://docs.cancercelllines.org
+ service_url: http://cancercelllines.org/beacon/
+ welcome_url: https://cancercelllines.org/biosamples/
+ alternative_url: https://cancercelllines.org
+ contact_url: mailto:contact@progenetix.org
+ created_at: 2023-07-01T00:00:00
+ updated_at: 2024-02-24T13:00:00
+ organization:
+ welcome_url: https://cancercelllines.org/
+ contact_url: mailto:contact@progenetix.org
+ logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png
+ info:
+        # update_date_time: 2024-02-24T12:45:00  # duplicate key; superseded by the later update_date_time value
+ create_date_time: 2023-07-01T00:00:00
+ update_date_time: 2024-02-24T13:00:00
+
diff --git a/services/local/local_paths.yaml b/services/local/local_paths.yaml
index 29dc172..a35deda 100644
--- a/services/local/local_paths.yaml
+++ b/services/local/local_paths.yaml
@@ -19,8 +19,3 @@ server_callsets_dir_loc:
- grch38
probefile_name: probes,cn.tsv
-
-test_domains:
- - progenetix.test
- - cancercelllines.test
- - beaconplus.test
diff --git a/services/local/services_defaults.yaml b/services/local/services_defaults.yaml
index e60ce82..6d50321 100644
--- a/services/local/services_defaults.yaml
+++ b/services/local/services_defaults.yaml
@@ -1,3 +1,5 @@
+# Definitions here are in fact treated like `beacon_defaults` and merged into
+# the global `beacon_defaults` dictionary
defaults: {}
################################################################################
@@ -35,6 +37,7 @@ service_path_aliases:
schemas: schemas
uploader: uploader
uploadplotter: uploadplotter
+ variantsbedfile: variantsbedfile
vcf: vcfvariants
vcfvariants: vcfvariants
diff --git a/services/services.py b/services/services.py
index 220b32c..8848938 100755
--- a/services/services.py
+++ b/services/services.py
@@ -45,7 +45,7 @@ def services():
# for d_k, d_v in defaults.items():
# byc.update( { d_k: d_v } )
read_service_prefs(service, services_conf_path, byc)
- defs = byc.get("beacon_defaults", {})
+ defs = BYC["beacon_defaults"]
s_a_s = defs.get("service_path_aliases", {})
r_w = defs.get("rewrites", {})
diff --git a/services/variantsbedfile.py b/services/variantsbedfile.py
new file mode 100644
index 0000000..378d5d9
--- /dev/null
+++ b/services/variantsbedfile.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import sys
+from os import path, environ, pardir
+
+from bycon import *
+
+services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" )
+sys.path.append( services_lib_path )
+from export_file_generation import write_variants_bedfile
+
+"""
+The plot service uses the standard bycon data retrieval pipeline with `biosample`
+as entity type. Therefore, all standard Beacon query parameters work and also
+the path is interpreted for an biosample `id` value if there is an entry at
+`.../pgxsegvariants/{id}`
+
+* http://progenetix.org/services/pgxsegvariants/pgxbs-kftvjv8w
+
+"""
+
+################################################################################
+################################################################################
+################################################################################
+
+def main():
+ variantsbedfile()
+
+################################################################################
+
+def variantsbedfile():
+ initialize_bycon_service(byc, "g_variants")
+ run_beacon_init_stack(byc)
+ rss = ByconResultSets(byc).datasetsResults()
+ ds_id = list(rss.keys())[0]
+ ucsclink, bedfilelink = write_variants_bedfile(rss, ds_id, byc)
+ # TODO: Error
+ if "ucsc" in BYC_PARS.get("output", "bed"):
+ print_uri_rewrite_response(ucsclink, bedfilelink)
+ print_uri_rewrite_response(bedfilelink)
+
+
+################################################################################
+################################################################################
+################################################################################
+
+if __name__ == '__main__':
+ main()
diff --git a/tmp/aggregator.yaml b/tmp/aggregator.yaml
index 6162e6b..66ef692 100644
--- a/tmp/aggregator.yaml
+++ b/tmp/aggregator.yaml
@@ -3,7 +3,7 @@ description: >-
federated Beacon queries through translating / sending / retrieving / converting
Beacon queries in v2 format to the format of the respective Beacon instances.
Please be aware that `bycon` uses `snake_cased` keys for its internal parameters;
- _i.e._ a URL parameter `assemblyId` will be accessible as `byc["form_data"]["assembly_id"]`
+ _i.e._ a URL parameter `assemblyId` will be accessible as `BYC_PARS["assembly_id"]`
to the internal methods.
selected_beacons: