From f53f2ce345ffe42227be88a9c6981c0793cb158d Mon Sep 17 00:00:00 2001 From: Carol Bouchard Date: Wed, 13 Nov 2024 16:04:48 -0500 Subject: [PATCH] MemoryError seen on expire_distros execution When the beaker_expire_distros hourly cron job runs, a MemoryError is seen on the lab controller as well as on the Server. The same error is seen when executing the bkr CLI 'bkr distro-trees-list --limit=8000 --labcontroller=hostname | grep "ID:" | wc -l' As it turns out, expire_distros calls the same underlying get method. The error is sporadic. It is likely due to difficulty obtaining a contiguous memory chunk for a large amount of data. The solution is to fetch smaller chunks. Filters similar to those of the CLI distro-trees-list are now allowed through expire_distros.py, which passes them to the same get operation. These filters also allow for more variation in beaker_expire_distros. Stepping through architectures seemed to be the best choice of filter for the chunks. When --arch=all is given, the expire_distros.py code steps through a list of architectures to perform removal instead of trying to do it all at once. The cron job is therefore updated to include --arch=all. 
--- .../cron.hourly/beaker_expire_distros | 2 +- .../src/bkr/labcontroller/expire_distros.py | 73 +++++++++++++++---- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/LabController/cron.hourly/beaker_expire_distros b/LabController/cron.hourly/beaker_expire_distros index 2c1ce1485..af86c8815 100755 --- a/LabController/cron.hourly/beaker_expire_distros +++ b/LabController/cron.hourly/beaker_expire_distros @@ -1,2 +1,2 @@ #!/bin/sh -exec flock -n /var/run/beaker_expire_distros.cron.lock beaker-expire-distros +exec flock -n /var/run/beaker_expire_distros.cron.lock beaker-expire-distros --arch=all diff --git a/LabController/src/bkr/labcontroller/expire_distros.py b/LabController/src/bkr/labcontroller/expire_distros.py index bae155408..64e8e600c 100644 --- a/LabController/src/bkr/labcontroller/expire_distros.py +++ b/LabController/src/bkr/labcontroller/expire_distros.py @@ -78,10 +78,12 @@ def check_url(url): def check_all_trees(ignore_errors=False, dry_run=False, lab_controller='http://localhost:8000', - remove_all=False): + remove_all=False, + filter=None): + filter_on_arch = True if (filter is not None and filter and 'arch' in filter.keys()) else False proxy = xmlrpc_client.ServerProxy(lab_controller, allow_none=True) rdistro_trees = [] - distro_trees = proxy.get_distro_trees() + distro_trees = proxy.get_distro_trees(filter) if not remove_all: for distro_tree in distro_trees: accessible = False @@ -108,21 +110,29 @@ def check_all_trees(ignore_errors=False, else: rdistro_trees = distro_trees + print('INFO: expire_distros to remove %d entries for arch %s' % (len(rdistro_trees), + filter['arch'] if (filter_on_arch) else 'unset')) + # If all distro_trees are expired then something is wrong # Unless there is intention to remove all distro_trees - if len(distro_trees) != len(rdistro_trees) or remove_all: + if (len(distro_trees) != len(rdistro_trees)) or remove_all: for distro_tree in rdistro_trees: if dry_run: print('Distro marked for remove %s:%d' % 
(distro_tree['distro_name'], - distro_tree['distro_tree_id'])) + distro_tree['distro_tree_id'])) else: print('Removing distro %s:%d' % (distro_tree['distro_name'], distro_tree['distro_tree_id'])) proxy.remove_distro_trees([distro_tree['distro_tree_id']]) else: - sys.stderr.write('All distros are missing! Please check your server!\n') - sys.exit(1) - + if (len(distro_trees) == 0): + if (filter is None): + sys.stderr.write('All distros are missing! Please check your server!\n') + sys.exit(1) + else: + sys.stderr.write('Stopped removal of all distros for arch %s!! Please check your ' + 'server.\nYou can manually force removal using --remove-all.\n' % + (filter['arch'] if (filter_on_arch) else 'unset')) def main(): from optparse import OptionParser @@ -137,14 +147,49 @@ def main(): 'Defaults to http://localhost:8000.') parser.add_option('--remove-all', default=False, action='store_true', help='Remove all distros from lab controller.') + parser.add_option('--name', default=None, + help='Remove all distros with given name. Use "%" for wildcard.') + parser.add_option('--family', default=None, + help='Remove all distros for a given family.') + parser.add_option('--arch', default=None, + help='Remove all distros for a given architecture. 
When set to "all", ' + 'steps thru each available arch to reduce memory usage.') options, args = parser.parse_args() - try: - check_all_trees(options.ignore_errors, - options.dry_run, - options.lab_controller, - options.remove_all) - except KeyboardInterrupt: - pass + startmsg = str("INFO: expire_distros running with --lab-controller=" + options.lab_controller) + for i in range(1,len(sys.argv)): + startmsg += ' ' + sys.argv[i] + print('%s' % (startmsg)) + filter = {} + if options.name: + filter['name'] = options.name + if options.family: + filter['family'] = options.family + + arch_list = [] + if options.arch: + if options.arch == "all": + arch_list = [ "x86_64", "ppc", "ppc64le", "ppc64", "i386", "s390", "s390x", "aarch64", "ia64", "arm", "armhfp" ] + else: + arch_list = [ options.arch ] + for arch in arch_list: + filter['arch'] = arch + try: + check_all_trees(options.ignore_errors, + options.dry_run, + options.lab_controller, + options.remove_all, + filter) + except KeyboardInterrupt: + pass + else: + try: + check_all_trees(options.ignore_errors, + options.dry_run, + options.lab_controller, + options.remove_all, + filter) + except KeyboardInterrupt: + pass if __name__ == '__main__':