Skip to content

Commit

Permalink
Merge pull request #570 from 4dn-dcic/ajs_upd_st_mismatch
Browse files Browse the repository at this point in the history
Refactor status mismatch check
  • Loading branch information
aschroed authored Apr 25, 2024
2 parents 8c607ce + c5268ac commit f2c412d
Show file tree
Hide file tree
Showing 7 changed files with 751 additions and 193 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ foursight
Change Log
----------

4.6.0
=====
* Fix calls to get_es_metadata in checks/audit_checks.py to work when ES_HOST_LOCAL is set.
* Refactor item_status_mismatch check so that it can run on fewer items when needed.

`PR 570: Fix status mismatch check <https://github.com/4dn-dcic/foursight/pull/570>`_

4.5.0
=====
* Update Tibanna


4.4.6
=====
* Add organism when ATAC-seq check calls stepper helper
Expand Down
91 changes: 71 additions & 20 deletions chalicelib_fourfront/checks/audit_checks.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
from dcicutils import ff_utils
import datetime
import os
import re
import requests
import datetime
from .helpers import wrangler_utils

from typing import Optional
from dcicutils.es_utils import create_es_client
from dcicutils import ff_utils
from chalicelib_fourfront.checks.helpers import wrangler_utils
from chalicelib_fourfront.checks.helpers.es_utils import get_es_metadata
# Use confchecks to import decorators object and its methods for each check module
# rather than importing check_function, action_function, CheckResult, ActionResult
# individually - they're now part of class Decorators in foursight-core::decorators
# that requires initialization with foursight prefix.
from .helpers.confchecks import *
from chalicelib_fourfront.checks.helpers.confchecks import *


STATUS_LEVEL = {
Expand Down Expand Up @@ -573,14 +576,53 @@ def check_search_urls(connection, **kwargs):
return check


@check_function(id_list=None)
# value of the tag on items where mismatch status should be ignored
TAG_TO_IGNORE = 'ignore_status_mismatch'


def get_items_with_ignore_tags(key):
    """Return the uuids of every item carrying the TAG_TO_IGNORE tag.

    Two searches are run with the supplied access key: one for all items
    with the tag, and one for tagged items whose status is deleted,
    replaced or obsolete (presumably because the plain search does not
    return items in those statuses -- confirm against the portal's search
    defaults). The uuids from the first search come first in the returned
    list, followed by those from the second.
    """
    tag_search = 'search/?type=Item&tags={}'.format(TAG_TO_IGNORE)
    removed_status_search = tag_search + '&status=deleted&status=replaced&status=obsolete'

    tagged_items = ff_utils.search_metadata(tag_search, key=key)
    tagged_items.extend(ff_utils.search_metadata(removed_status_search, key=key))

    uuids = []
    for tagged in tagged_items:
        uuids.append(tagged.get('uuid'))
    return uuids


@check_function(id_list=None, last_mod_date=None, run_for_all=False)
def check_status_mismatch(connection, **kwargs):
# embedded sub items should have an equal or greater level
# than that of the item in which they are embedded
check = CheckResult(connection, 'check_status_mismatch')
# if true will run on all replicate sets
run_for_all = kwargs['run_for_all']
# if values will run only on these ids
id_list = kwargs['id_list']
# if provided as a param will look for items modified more recently than
last_mod_date = kwargs['last_mod_date']

# limit the number of top level items to query (ExperimentSets) if id_list is not
# provided - if a passing result cannot be found will do what?
if not (run_for_all or id_list or last_mod_date):
last_result = check.get_primary_result()
days = 0
while last_result['status'] != 'PASS' or not last_result['kwargs'].get('primary'):
days += 1
try:
last_result = check.get_closest_result(diff_hours=days*24)
except Exception:
pass
if days > 20:
# no passing primary check in the past 20 days so use date from
# 'oldest' last_result
break
chk_uuid = last_result.get('uuid')
chk_uuid = chk_uuid.replace('T', ' ')
last_colon_idx = chk_uuid.rfind(':')
last_mod_date = chk_uuid[:last_colon_idx]


MIN_CHUNK_SIZE = 200
# embedded sub items should have an equal or greater level
# than that of the item in which they are embedded
id2links = {}
id2status = {}
id2item = {}
Expand All @@ -596,10 +638,19 @@ def check_status_mismatch(connection, **kwargs):
itemids = re.split(',|\s+', id_list)
itemids = [id for id in itemids if id]
else:
if last_mod_date:
item_search += '&last_modified.date_modified.from={}'.format(last_mod_date)
itemres = ff_utils.search_metadata(item_search, key=connection.ff_keys, page_limit=500)
itemids = [item.get('uuid') for item in itemres]
es_items = ff_utils.get_es_metadata(itemids, key=connection.ff_keys, chunk_size=200, is_generator=True)

tagged2ignore = []
checked_tags = False
es_items = get_es_metadata(itemids, key=connection.ff_keys, chunk_size=200, is_generator=True)

for es_item in es_items:
if not checked_tags:
tagged2ignore = get_items_with_ignore_tags(connection.ff_keys)
checked_tags = True # only do this once if at all
label = es_item.get('embedded').get('display_title')
desc = es_item.get('object').get('description')
lab = es_item.get('embedded').get('lab').get('display_title')
Expand All @@ -608,7 +659,7 @@ def check_status_mismatch(connection, **kwargs):
id2links[es_item.get('uuid')] = [li.get('uuid') for li in es_item.get('linked_uuids_embedded')]
id2status[es_item.get('uuid')] = STATUS_LEVEL.get(status)
id2item[es_item.get('uuid')] = {'label': label, 'status': status, 'lab': lab,
'description': desc, 'to_ignore': list(set(opfs))}
'description': desc, 'to_ignore': list(set(opfs)) + tagged2ignore}

mismatches = {}
linked2get = {}
Expand All @@ -629,8 +680,8 @@ def check_status_mismatch(connection, **kwargs):
mismatches.setdefault(iid, []).append(lid)

if len(linked2get) > MIN_CHUNK_SIZE or i + 1 == len(itemids): # only query es when we have more than a set number of ids (500)
linked2chk = ff_utils.get_es_metadata(list(linked2get.keys()), key=connection.ff_keys,
chunk_size=200, is_generator=True)
linked2chk = get_es_metadata(list(linked2get.keys()), key=connection.ff_keys,
chunk_size=200, is_generator=True)
for litem in linked2chk:
luuid = litem.get('uuid')
listatus = litem.get('properties').get('status', 'in review by lab')
Expand Down Expand Up @@ -704,17 +755,17 @@ def check_opf_status_mismatch(connection, **kwargs):
for case in exp['other_processed_files']:
files.extend([i['uuid'] for i in case['files']])
# get metadata for files, to collect status
resp = ff_utils.get_es_metadata(list(set(files)),
sources=['links.quality_metric', 'object.status', 'uuid'],
key=connection.ff_keys)
resp = get_es_metadata(list(set(files)),
sources=['links.quality_metric', 'object.status', 'uuid'],
key=connection.ff_keys)
opf_status_dict = {item['uuid']: item['object']['status'] for item in resp if item['uuid'] in files}
opf_linked_dict = {
item['uuid']: item.get('links', {}).get('quality_metric', []) for item in resp if item['uuid'] in files
}
quality_metrics = [uuid for item in resp for uuid in item.get('links', {}).get('quality_metric', [])]
qm_resp = ff_utils.get_es_metadata(list(set(quality_metrics)),
sources=['uuid', 'object.status'],
key=connection.ff_keys)
qm_resp = get_es_metadata(list(set(quality_metrics)),
sources=['uuid', 'object.status'],
key=connection.ff_keys)
opf_other_dict = {item['uuid']: item['object']['status'] for item in qm_resp if item not in files}
check.full_output = {}
for result in results:
Expand Down Expand Up @@ -885,8 +936,8 @@ def check_bio_feature_organism_name(connection, **kwargs):
assembly_in_dt = True
break
if not assembly_in_dt:
gr_res = ff_utils.get_es_metadata([genreg.get('uuid')],
key=connection.ff_keys, sources=['properties.genome_assembly'])
gr_res = get_es_metadata([genreg.get('uuid')],
key=connection.ff_keys, sources=['properties.genome_assembly'])
try:
gr_ass = gr_res[0].get('properties').get('genome_assembly')
except AttributeError:
Expand Down
14 changes: 14 additions & 0 deletions chalicelib_fourfront/checks/helpers/es_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os
from typing import Optional
from dcicutils.es_utils import create_es_client
from dcicutils import ff_utils

def get_es_metadata(*args, **kwargs):
    """Call ff_utils.get_es_metadata, supplying a local ES client when configured.

    All positional and keyword arguments are forwarded to
    ff_utils.get_es_metadata. If the caller did not provide an ``es_client``
    (the keyword is absent or None) and the ES_HOST_LOCAL environment
    variable is set, an ES client is created for that host and passed along
    as ``es_client``.

    Returns whatever ff_utils.get_es_metadata returns.
    """
    if kwargs.get("es_client") is None:
        es_host_local = _get_es_host_local()
        if es_host_local is not None:
            # Assign into kwargs instead of passing a second es_client=...
            # keyword next to **kwargs: a caller that passed es_client=None
            # explicitly would otherwise trigger
            # "TypeError: got multiple values for keyword argument 'es_client'".
            kwargs["es_client"] = create_es_client(es_host_local, use_aws_auth=True)
    return ff_utils.get_es_metadata(*args, **kwargs)


def _get_es_host_local() -> Optional[str]:
    """Return the value of the ES_HOST_LOCAL environment variable, or None if it is unset."""
    return os.getenv("ES_HOST_LOCAL")
4 changes: 2 additions & 2 deletions chalicelib_fourfront/checks/system_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
beanstalk_utils,
env_utils
)
from chalicelib_fourfront.checks.helpers.es_utils import get_es_metadata

# Use confchecks to import decorators object and its methods for each check module
# rather than importing check_function, action_function, CheckResult, ActionResult
Expand Down Expand Up @@ -833,8 +834,7 @@ def purge_download_tracking_items(connection, **kwargs):
client = es_utils.create_es_client(connection.ff_es, True)
# a bit convoluted, but we want the frame=raw, which does not include uuid
# use get_es_metadata to handle this. Use it as a generator
for to_purge in ff_utils.get_es_metadata(search_uuids, es_client=client, is_generator=True,
key=connection.ff_keys):
for to_purge in get_es_metadata(search_uuids, es_client=client, is_generator=True, key=connection.ff_keys):
if round(time.time() - t0, 2) > time_limit:
break
purge_properties = to_purge['properties']
Expand Down
3 changes: 2 additions & 1 deletion chalicelib_fourfront/checks/wrangler_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .check_utils import convert_table_to_ordered_dict
from collections import OrderedDict
import uuid
from chalicelib_fourfront.checks.helpers.es_utils import get_es_metadata

# Use confchecks to import decorators object and its methods for each check module
# rather than importing check_function, action_function, CheckResult, ActionResult
Expand Down Expand Up @@ -2123,7 +2124,7 @@ def check_opf_lab_different_than_experiment(connection, **kwargs):
exp_set_uuids_to_check.extend([uuid for uuid in opf['exp_set_uuids'] if uuid not in exp_set_uuids_to_check])

# get lab of Exp/ExpSet
result_exp_set = ff_utils.get_es_metadata(exp_set_uuids_to_check, sources=['uuid', 'properties.lab'], key=connection.ff_keys)
result_exp_set = get_es_metadata(exp_set_uuids_to_check, sources=['uuid', 'properties.lab'], key=connection.ff_keys)
es_uuid_2_lab = {} # map Exp/Set uuid to Exp/Set lab
for es in result_exp_set:
es_uuid_2_lab[es['uuid']] = es['properties']['lab']
Expand Down
Loading

0 comments on commit f2c412d

Please sign in to comment.