-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
removal of installers & housekeepers
... to bycon
- Loading branch information
Showing
60 changed files
with
359 additions
and
374,367 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/usr/local/bin/python3 | ||
|
||
import re, json, yaml | ||
from os import path, environ, pardir | ||
import sys, datetime | ||
import argparse | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
""" | ||
./ontologyTermLister.py -c rsrc/NCITstage/NCITstage-codes.tsv -l rsrc/NCITstage/NCITstage-labels.tsv -o rsrc/NCITstage/numbered_hierarchies.tsv | ||
""" | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
def _get_args(): | ||
|
||
parser = argparse.ArgumentParser() | ||
|
||
parser.add_argument("-c", "--codefile", help="") | ||
parser.add_argument("-l", "--labelfile",help="") | ||
parser.add_argument("-o", "--hierarchyfile", help="") | ||
|
||
return parser.parse_args() | ||
|
||
################################################################################ | ||
|
||
def main(): | ||
|
||
args = _get_args() | ||
|
||
codes_f = args.codefile | ||
labels_f = args.labelfile | ||
hierarchy_f = args.hierarchyfile | ||
|
||
f = open(labels_f, 'r+') | ||
labels = [line for line in f.readlines()] | ||
f.close() | ||
f = open(codes_f, 'r+') | ||
codes = [line for line in f.readlines()] | ||
f.close() | ||
|
||
n_h_f = open( hierarchy_f, 'w' ) | ||
|
||
i = 0 | ||
for c_l in codes: | ||
c_l = c_l.rstrip() | ||
if not "NCIT" in c_l: | ||
c_l = "NCIT:" + c_l | ||
else: | ||
c_l = c_l.replace("obo:NCIT", "NCIT") | ||
c_l = c_l.replace("_", ":") | ||
tab_n = c_l.count("\t") | ||
c = re.sub(r"\s+", "", c_l) | ||
l = labels[ i ].strip() | ||
|
||
# c_l = c_l.replace("NCIT", l+" (NCIT") + ")" | ||
# h_f.write( c_l + "\n") | ||
# l_d.update( { c: l } ) | ||
|
||
i += 1 | ||
|
||
n_h_f.write( "{}\t{}\t{}\t{}\n".format(c, l, tab_n, i) ) | ||
|
||
n_h_f.close() | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
#!/usr/local/bin/python3 | ||
|
||
import re, json, yaml | ||
from os import path, environ, pardir | ||
import sys, datetime | ||
from pymongo import MongoClient | ||
from progress.bar import Bar | ||
from pyexcel import get_sheet | ||
|
||
# bycon is supposed to be in the same parent directory | ||
dir_path = path.dirname( path.abspath(__file__) ) | ||
pkg_path = path.join( dir_path, pardir ) | ||
parent_path = path.join( pkg_path, pardir ) | ||
sys.path.append( parent_path ) | ||
|
||
from bycon import * | ||
from byconeer import * | ||
|
||
""" | ||
## `ontologymapsCreator` | ||
""" | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
def main(): | ||
|
||
ontologymaps_creator() | ||
|
||
################################################################################ | ||
|
||
def ontologymaps_creator(): | ||
|
||
initialize_bycon_service(byc) | ||
run_beacon_init_stack(byc) | ||
|
||
if len(byc["dataset_ids"]) > 1: | ||
print("Please give only one dataset using -d") | ||
exit() | ||
|
||
ds_id = byc["dataset_ids"][0] | ||
print("=> Using data values from {}".format(ds_id)) | ||
|
||
execute_bycon_queries( ds_id, byc ) | ||
ds_results = byc["dataset_results"][ds_id] | ||
|
||
mongo_client = MongoClient( ) | ||
data_db = mongo_client[ ds_id ] | ||
bios_coll = data_db[ "biosamples" ] | ||
|
||
if not "biosamples._id" in ds_results.keys(): | ||
bios_ids = bios_coll.distinct("_id", {}) | ||
print("¡¡¡ Using all {} biosamples from {} !!!".format(len(bios_ids), ds_id)) | ||
else: | ||
bios_ids = ds_results["biosamples._id"]["target_values"] | ||
|
||
no = len(bios_ids) | ||
|
||
o_m_all = { } | ||
|
||
for mt, mv in byc["service_config"]["map_types"].items(): | ||
|
||
o_m = { } | ||
|
||
o_l_max = len(mv["ontology_types"]) | ||
|
||
if not byc["test_mode"]: | ||
bar = Bar("Analyzing samples", max = no, suffix="%(percent)d%%"+" of "+str(no) ) | ||
|
||
for bios__id in bios_ids: | ||
|
||
s = bios_coll.find_one({"_id": bios__id}) | ||
o_l_c = 0 | ||
k_l = [ ] | ||
o_l = [ ] | ||
d = "" | ||
if "description" in s: | ||
d = s["description"].strip() | ||
|
||
o_t_l = len(mv["ontology_types"]) | ||
|
||
for o_t in mv["ontology_types"]: | ||
|
||
data_key = byc["filter_definitions"][ o_t ]["db_key"] | ||
parent_key = re.sub(".id", "", data_key) | ||
data_re = re.compile( byc["filter_definitions"][ o_t ]["pattern"] ) | ||
|
||
try: | ||
o_p = s[ parent_key ] | ||
except: | ||
continue | ||
|
||
if not o_p: | ||
continue | ||
|
||
if not "id" in o_p: | ||
continue | ||
|
||
if not data_re.match( o_p["id"] ): | ||
continue | ||
|
||
if not "label" in o_p: | ||
o_p.update({"label":""}) | ||
|
||
k_l.append( o_p["id"] ) | ||
o_l.append( o_p ) | ||
|
||
if len(k_l) < o_t_l: | ||
continue | ||
k = "::".join(k_l) | ||
|
||
if k in o_m: | ||
if len(o_m[k]["examples"]) < byc["service_config"]["example_length"]: | ||
o_m[k]["examples"].add(d) | ||
o_m_all[k]["examples"].add(d) | ||
|
||
else: | ||
if byc["test_mode"]: | ||
print(k) | ||
o_m[k] = { | ||
"id": k, | ||
"examples": set([d]), | ||
"code_group": o_l, | ||
"note": ds_id | ||
} | ||
|
||
o_m_all[k] = o_m[k].copy() | ||
o_m_all[k].pop("note") | ||
|
||
if byc["args"].filters: | ||
o_m[k].update({"note": byc["args"].filters}) | ||
if "TCGA" in byc["args"].filters: | ||
e_r_s = s.get("external_references", []) | ||
for e_r in e_r_s: | ||
if "pgx:TCGA" in e_r["id"]: | ||
if "project" in e_r["label"]: | ||
n = re.sub("pgx:", "", e_r["id"]) | ||
o_m[k].update({"note":n}) | ||
|
||
if not byc["test_mode"]: | ||
bar.next() | ||
if not byc["test_mode"]: | ||
bar.finish() | ||
|
||
mt_l = mt | ||
|
||
if byc["args"].filters: | ||
mt_l = "{}-{}".format(mt, byc["args"].filters) | ||
mt_l = re.sub(":", "", mt_l) | ||
|
||
print("{} code combinations for {}".format(len(o_m.keys()), mt)) | ||
_export_currentmaps(dir_path, mt_l, o_m) | ||
|
||
def_m = _read_mapping_defaults(dir_path, mt, **byc) | ||
|
||
for def_k in def_m: | ||
if not def_k in o_m.keys(): | ||
o_m.update( { def_k: def_m[def_k] } ) | ||
print("Now {} code combinations after defaults ...".format(len(o_m.keys()))) | ||
_export_ontologymaps(dir_path, mt_l, o_m) | ||
|
||
if not byc["test_mode"]: | ||
|
||
for om_k, om_v in o_m_all.items(): | ||
o_m_all[om_k].update({"examples": list(om_v["examples"]) }) | ||
|
||
|
||
if not byc["args"].filters: | ||
om_coll = mongo_client["progenetix"]["ontologymaps"] | ||
om_coll.drop() | ||
om_coll.insert_many( o_m_all.values() ) | ||
print("==> Rewrote {}.{} collection".format(byc["config"]["services_db"], byc["config"]["ontologymaps_coll"])) | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
def _export_currentmaps(dir_path, map_type, o_m): | ||
|
||
f_p = path.join(dir_path, "exports", "ontologymaps", "{}.tsv".format(map_type)) | ||
f = open(f_p, "w") | ||
for o_k in sorted(o_m.keys()): | ||
l = [o_m[o_k]["note"]] | ||
for g_m in o_m[o_k]["code_group"]: | ||
l.append(str(g_m["id"])) | ||
l.append(str(g_m["label"])) | ||
l.append(str(sorted(o_m[o_k]["examples"])[0])) | ||
f.write("\t".join(l)+"\n") | ||
f.close() | ||
|
||
################################################################################ | ||
|
||
def _export_ontologymaps(dir_path, map_type, o_m): | ||
for k, content in o_m.items(): | ||
if "examples" in content: | ||
content["examples"] = sorted(content["examples"]) | ||
export = [ ] | ||
for o_k in sorted(o_m.keys()): | ||
export.append(o_m[o_k]) | ||
yaml.dump(export, open(path.join(dir_path, "exports", "ontologymaps", "{}.yaml".format(map_type)),"w")) | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
def _read_mapping_defaults(dir_path, map_type, **byc): | ||
|
||
if not "mappingfile" in byc["service_config"]["map_types"][ map_type ]: | ||
return {} | ||
|
||
mf = path.join( dir_path, *byc["service_config"]["map_types"][ map_type ]["mappingfile"] ) | ||
o_m_r = { } | ||
equiv_keys = [ ] | ||
pre_fs = byc["service_config"]["map_types"][ map_type ]["ontology_types"] | ||
o_l_max = len(pre_fs) | ||
|
||
for o_t in pre_fs: | ||
equiv_keys.append( o_t+"::id" ) | ||
equiv_keys.append( o_t+"::label" ) | ||
|
||
sheet_name = "__".join(pre_fs)+"__matched" | ||
|
||
try: | ||
table = get_sheet(file_name=mf, sheet_name=sheet_name) | ||
except Exception as e: | ||
print(e) | ||
print("No matching mapping file could be found!") | ||
exit() | ||
|
||
header = table[0] | ||
col_inds = { } | ||
hi = 0 | ||
fi = 0 | ||
for col_name in header: | ||
if col_name in equiv_keys: | ||
col_inds[ col_name ] = hi | ||
|
||
hi += 1 | ||
|
||
for i in range(1, len(table)): | ||
id_s = [ ] | ||
bioc_s = [ ] | ||
bioc = { } | ||
col_match_count = 0 | ||
for col_name in equiv_keys: | ||
try: | ||
cell_val = table[ i, col_inds[ col_name ] ] | ||
if "id" in col_name: | ||
o_t, code = re.split("[:-]", cell_val) | ||
data_re = re.compile( byc["filter_definitions"][ o_t ]["pattern"] ) | ||
if data_re.match( cell_val ): | ||
bioc = { "id": cell_val } | ||
id_s.append( cell_val ) | ||
else: | ||
bioc.update( { "label": cell_val } ) | ||
bioc_s.append(bioc) | ||
if len(id_s) == o_l_max: | ||
o_k = "::".join(id_s) | ||
o_m_r.update( | ||
{ o_k: | ||
{ | ||
"id": o_k, | ||
"code_group": bioc_s | ||
} | ||
} | ||
) | ||
fi += 1 | ||
except: | ||
continue | ||
|
||
print("default mappings: "+str(fi)) | ||
return o_m_r | ||
|
||
################################################################################ | ||
################################################################################ | ||
################################################################################ | ||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.