Skip to content

Commit

Permalink
2.0.5
Browse files Browse the repository at this point in the history
mainly examples & methods for this
  • Loading branch information
mbaudis committed Oct 28, 2024
1 parent a12e50b commit 114659a
Show file tree
Hide file tree
Showing 71 changed files with 636 additions and 14,642 deletions.
21 changes: 4 additions & 17 deletions bycon/byconServiceLibs/bycon_bundler.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,37 +441,24 @@ def __isetBundlesFromCollationParameters(self):
# return

fmap_name = "frequencymap"

query = CollationQuery().getQuery()

# id_q = {}
# if len(filters) > 0:
# fids = [x.get("id", "___none___") for x in filters]
# id_q = {"id": {"$in": fids}}
# elif len(self.collation_types) > 0:
# id_q = {"collation_type": {"$in": self.collation_types}}


prdbug(f'... __isetBundlesFromCollationParameters query {query}')

mongo_client = MongoClient(host=DB_MONGOHOST)
for ds_id in datset_ids:
coll_db = mongo_client[ds_id]
coll_ids = coll_db[ "collations" ].distinct("id", query)
for f_val in coll_ids:
f_q = { "id": f_val }
if not (collation_f := coll_db["frequencymaps"].find_one(f_q)):
continue
if not (collation_c := coll_db["collations"].find_one(f_q)):
continue
for collation_f in coll_db["frequencymaps" ].find(query):
if not fmap_name in collation_f:
continue
fmap_count = collation_f[ fmap_name ].get("cnv_analyses", 0)
if fmap_count < self.min_number:
continue
r_o = {
"dataset_id": ds_id,
"group_id": f_val,
"label": re.sub(r';', ',', collation_c["label"]),
"group_id": collation_f.get("id", ""),
"label": re.sub(r';', ',', collation_f.get("label", "")),
"sample_count": fmap_count,
"frequencymap_samples": collation_f[ fmap_name ].get("frequencymap_samples", fmap_count),
"interval_frequencies": collation_f[ fmap_name ]["intervals"] }
Expand Down
1 change: 0 additions & 1 deletion bycon/byconServiceLibs/datatable_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv, re, requests
# from attrdictionary import AttrDict
from random import sample as randomSamples

# bycon
from bycon import RefactoredValues, prdbug, prdlhead, prjsonnice, BYC, BYC_PARS, ENV
Expand Down
4 changes: 2 additions & 2 deletions bycon/byconServiceLibs/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def read_tsv_to_dictlist(filepath, max_count=0):
for l in data:
dictlist.append(dict(l))
if 0 < max_count < len(dictlist):
dictlist = random_samples(dictlist, k=max_count)
dictlist = random_samples(dictlist, max_count)

return dictlist, fieldnames

Expand All @@ -76,7 +76,7 @@ def read_www_tsv_to_dictlist(www, max_count=0):
dictlist.append(dict(l))

if 0 < max_count < len(dictlist):
dictlist = random_samples(dictlist, k=max_count)
dictlist = random_samples(dictlist, max_count)

return dictlist, fieldnames

Expand Down
4 changes: 2 additions & 2 deletions bycon/byconServiceLibs/ontology_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from random import sample
from random import sample as random_samples
from progress.bar import Bar

from pymongo import MongoClient
Expand Down Expand Up @@ -213,7 +213,7 @@ def __create_ontology_maps(self):
for k, v in keyed_maps.items():
examples = self.bios_coll.distinct("notes", v["local_query"])
s_no = min(10, len(examples))
e = sample(examples, s_no)
e = random_samples(examples, s_no)
e = [t for t in e if len(t) > 2]
v.update({"examples": e})
if len(v.get("errors", 0)) > 0:
Expand Down
5 changes: 3 additions & 2 deletions bycon/byconServiceLibs/service_helpers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import re, time, base36
import re, time, base36, datetime
from humps import decamelize
from os import path
from pathlib import Path

from bycon import load_yaml_empty_fallback, BYC, BYC_PARS, ENV
from bycon import load_yaml_empty_fallback, BYC, BYC_PARS, ENV, prdbug

################################################################################

Expand Down Expand Up @@ -68,6 +68,7 @@ def open_text_streaming(filename="data.pgxseg"):

def close_text_streaming():
    """
    Finish a streamed text response: emit a trailing newline, log the
    closing wall-clock time, and terminate the process.

    This helper never returns — `exit()` stops the script, so it must be
    the last call made after all streamed content has been written.
    """
    print()
    # NOTE(review): prdbug presumably emits only when debugging is enabled
    # — confirm against its definition in the bycon package.
    prdbug(f'... closing text streaming at {datetime.datetime.now().strftime("%H:%M:%S")}')
    exit()


Expand Down
4 changes: 2 additions & 2 deletions bycon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
LIB_PATH = path.join( pkg_path, "lib")

# path of the calling script is used to point to a local config directory
__caller_path = path.dirname( path.abspath(sys.argv[0]))
LOC_PATH = path.join(__caller_path, pardir, "local")
CALLER_PATH = path.dirname( path.abspath(sys.argv[0]))
LOC_PATH = path.join(CALLER_PATH, pardir, "local")

REQUEST_PATH_ROOT = "beacon"

Expand Down
8 changes: 0 additions & 8 deletions bycon/config/argument_definitions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -502,14 +502,6 @@ outputfile:
- --outputfile
description: output file where supported (cmd line)

randno:
type: integer
cmdFlags:
- -r
- --randno
description: random number to limit processing, where supported
default: 0

min_number:
type: integer
cmdFlags:
Expand Down
66 changes: 38 additions & 28 deletions bycon/config/datatable_mappings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ ordered_metadata_core:
- individual_id
- experiment_title
- biosample_name
- biosample_notes
- notes
- histological_diagnosis_id
- histological_diagnosis_label
- experiment_id
Expand All @@ -86,6 +86,8 @@ ordered_metadata_core:
- pathological_stage_label
- tumor_grade_id
- tumor_grade_label
- ethnicity_id
- ethnicity_label
- index_disease_followup_time
- index_disease_followup_state_id
- index_disease_followup_state_label
Expand All @@ -95,6 +97,21 @@ ordered_metadata_core:
- pipeline_ref
- analysis_operation_id
- analysis_operation_label
- geoprov_id


ordered_variants_core:
- analysis_id
- biosample_id
- individual_id
- reference_name
- start
- end
- variant_state_id
- variant_state_label
- reference_sequence
- sequence


definitions:

Expand Down Expand Up @@ -140,14 +157,6 @@ definitions:
pipeline_ref:
db_key: pipeline_ref
beacon_model_path: analyses.pipelineRef

# In Beacon but not used by us
# library_source_id:
# beacon_model_path: runs.librarySource.id
# default: "GENEPIO:0001966"
# library_source_label:
# beacon_model_path: runs.librarySource.label
# default: "genomic source"

# bycon & data management specials
analysis_operation_id:
Expand Down Expand Up @@ -184,23 +193,6 @@ definitions:
- T48_Xba_051011
data_provenance:
db_key: info.data_provenance
geoprov_city:
type: string
db_key: geo_location.properties.city
indexed: True
geoprov_country:
type: string
db_key: geo_location.properties.country
indexed: True
geoprov_iso_alpha3:
type: string
db_key: geo_location.properties.ISO3166alpha3
indexed: True
geoprov_long_lat:
type: array
items:
type: number
db_key: geo_location.geometry.coordinates


#------------------------------------------------------------------------------#
Expand Down Expand Up @@ -391,23 +383,31 @@ definitions:
- pgx:cohort-TCGA
- pgx:cohort-TCGAcancers
- pgx:cohort-arraymap
geoprov_id:
type: string
db_key: geo_location.properties.id
indexed: True
geoprov_city:
type: string
db_key: geo_location.properties.city
indexed: True
computed: True
geoprov_country:
type: string
db_key: geo_location.properties.country
indexed: True
computed: True
geoprov_iso_alpha3:
type: string
db_key: geo_location.properties.ISO3166alpha3
indexed: True
computed: True
geoprov_long_lat:
type: array
items:
type: number
type: number
db_key: geo_location.geometry.coordinates
computed: True

# special export labels
group_id:
Expand Down Expand Up @@ -505,7 +505,16 @@ definitions:
auxiliary_disease_notes:
type: string
db_key: auxiliary_disease.notes

ethnicity_id:
type: string
db_key: ethnicity.id
beacon_model_path: individuals.ethnicity.id
indexed: True
ethnicity_label:
type: string
db_key: ethnicity.label
beacon_model_path: individuals.ethnicity.label
indexed: True

#------------------------------------------------------------------------------#

Expand Down Expand Up @@ -547,6 +556,7 @@ definitions:
type: string
db_key: location.sequence_id
indexed: True
computed: True
reference_name:
type: string
db_key: location.chromosome
Expand Down
1 change: 0 additions & 1 deletion bycon/config/entity_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ info:
- "service-info"
- "service_info"
response_entity_id: info
collection: Null
response_schema: beaconInfoResponse
bycon_response_class: BeaconInfoResponse
beacon_schema:
Expand Down
28 changes: 25 additions & 3 deletions bycon/lib/beacon_response_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,6 @@ def __init__(self):
self.result_sets = list() # data rewrapped into the resultSets list
self.flattened_data = list() # data from all resultSets as flat list
self.entity_defaults = BYC.get("entity_defaults", {})
self.data_collection = BYC["response_entity"].get("collection", "biosamples")
self.response_entity_id = BYC.get("response_entity_id", "biosample")
self.limit = BYC_PARS.get("limit")
self.skip = BYC_PARS.get("skip")
Expand All @@ -713,6 +712,11 @@ def __init__(self):
# ----------------------------- public ------------------------------------#
# -------------------------------------------------------------------------#

def get_record_queries(self):
    # Accessor for the query objects assembled elsewhere in this class;
    # returned as-is, without copying.
    return self.record_queries


# -------------------------------------------------------------------------#
def get_populated_result_sets(self):
self.__retrieve_datasets_data()
self.__retrieve_variants_data()
Expand Down Expand Up @@ -740,6 +744,24 @@ def datasetsResults(self):
return self.datasets_results


# -------------------------------------------------------------------------#

def dataset_results_individual_ids(self, ds_id="___none___"):
    """
    Collect the unique `individual_id` values from the retrieved records
    of one dataset.

    Parameters:
        ds_id: id of the dataset to read; the sentinel default
               "___none___" fails the lookup and yields an empty result.

    Returns:
        List of distinct individual ids; empty list (after appending to
        BYC["ERRORS"]) when `ds_id` is missing from the retrieved data.
    """
    individual_ids = set()
    # force retrieval at the "individual" entity level before reading results
    self.response_entity_id = "individual"
    self.__retrieve_datasets_data()
    if ds_id not in self.datasets_data:
        # fix: message previously named `dataset_results_biosample_ids`
        # (copy-paste from the sibling method)
        BYC["ERRORS"].append("no correct dataset id provided to `dataset_results_individual_ids`")
        return []

    for record in self.datasets_data[ds_id]:
        # records without an individual_id (or with a falsy one) are skipped
        if (ind_id := record.get("individual_id")):
            individual_ids.add(ind_id)

    return list(individual_ids)


# -------------------------------------------------------------------------#
# ----------------------------- private -----------------------------------#
# -------------------------------------------------------------------------#
Expand Down Expand Up @@ -809,7 +831,7 @@ def __retrieve_datasets_results(self):
# -------------------------------------------------------------------------#

def __retrieve_datasets_data(self):
if "variants" in self.data_collection:
if "variant" in self.response_entity_id.lower():
return

e_d_s = BYC["entity_defaults"].get(self.response_entity_id, {})
Expand Down Expand Up @@ -846,7 +868,7 @@ def __retrieve_datasets_data(self):
# -------------------------------------------------------------------------#

def __retrieve_variants_data(self):
if not "variants" in self.data_collection:
if not "variant" in self.response_entity_id.lower():
return

ds_v_start = datetime.datetime.now()
Expand Down
2 changes: 1 addition & 1 deletion bycon/lib/query_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def __loop_multivars(self):
queries.append(q)
continue

prdbug(f'??? queries: {queries}')
prdbug(f'__loop_multivars queries: {queries}')

return queries

Expand Down
22 changes: 14 additions & 8 deletions bycon/lib/service_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ def set_entities():
This function evaluates the definitions for the entities and their selection
by path elements (including aliases) or parameters and updates the global
BYC definitions.
As approximation in a script one can override the original selection by providing
a `--responseEntityPathId analyses` (or "individuals" etc.) parameter or forcing
```
BYC_PARS.update({"response_entity_path_id":"analyses"})
set_entities()
```
"""
b_e_d = BYC.get("entity_defaults", {})
a_defs = BYC.get("argument_definitions", {})
Expand All @@ -61,16 +68,15 @@ def set_entities():
# it should only apply to special cases (e.g. overriding the standard
# biosample table export in services with individuals) or for command
# line testing
if (e_p_id := BYC_PARS.get("request_entity_path_id", "___none___")) in dealiased_path_ids.keys():
BYC.update({"request_entity_path_id": e_p_id})
if (e_p_id := BYC_PARS.get("response_entity_path_id", "___none___")) in dealiased_path_ids.keys():
BYC.update({"response_entity_path_id": e_p_id})
if (q_p_id := BYC_PARS.get("request_entity_path_id", "___none___")) in dealiased_path_ids.keys():
BYC.update({"request_entity_path_id": q_p_id})
if (p_p_id := BYC_PARS.get("response_entity_path_id", "___none___")) in dealiased_path_ids.keys():
BYC.update({"response_entity_path_id": p_p_id})

p_i_d = BYC.get("request_entity_path_id", "___none___")
if p_i_d not in dealiased_path_ids.keys():
if (p_i_d := BYC.get("request_entity_path_id", "___none___")) not in dealiased_path_ids.keys():
p_i_d = "info"
rp_i_d = BYC.get("response_entity_path_id", "___none___")
if rp_i_d not in dealiased_path_ids.keys():

if (rp_i_d := BYC.get("response_entity_path_id", "___none___")) not in dealiased_path_ids.keys():
rp_i_d = p_i_d

# after settling the paths we can get the entity ids
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
},
"examples": [
{
"id": "pgxcohort-arraymap",
"id": "pgx:cohort-arraymap",
"label": "arrayMap collection"
}
]
Expand Down
Loading

0 comments on commit 114659a

Please sign in to comment.