From 7853c861f5396db1d9b368cbc869121d36df356d Mon Sep 17 00:00:00 2001
From: Maria Telenczuk
Date: Thu, 26 Nov 2020 22:38:24 +0100
Subject: [PATCH] deleted all the files related to aws_train (used in old versions of ramp) (#481)

---
 ramp-engine/ramp_engine/aws/README.md         | 198 ----
 ramp-engine/ramp_engine/aws/aws_train.py      | 338 ------
 ramp-engine/ramp_engine/aws/ramp_aws_train.py | 101 ------
 .../ramp_engine/aws/ramp_aws_train_loop.py    |  43 ---
 4 files changed, 680 deletions(-)
 delete mode 100644 ramp-engine/ramp_engine/aws/README.md
 delete mode 100644 ramp-engine/ramp_engine/aws/aws_train.py
 delete mode 100644 ramp-engine/ramp_engine/aws/ramp_aws_train.py
 delete mode 100644 ramp-engine/ramp_engine/aws/ramp_aws_train_loop.py

diff --git a/ramp-engine/ramp_engine/aws/README.md b/ramp-engine/ramp_engine/aws/README.md
deleted file mode 100644
index 18a29c83e..000000000
--- a/ramp-engine/ramp_engine/aws/README.md
+++ /dev/null
@@ -1,198 +0,0 @@
-Backend support for Amazon EC2 instances.
-The goal of this module is to provide a set of
-helper functions and CLI commands
-to train submission(s) on EC2.
-
-The following is a tutorial showing the full set of steps,
-from creating the AMI to running submissions on EC2.
-
-# Step 1 : Creating the AMI
-
-In order to create the AMI for a new RAMP challenge, you need to launch an instance on EC2.
-Go to https://us-west-2.console.aws.amazon.com/ec2/v2/home, then launch a new instance.
-You will need to choose a base AMI. In most cases one of the default AMIs
-provided by Amazon, such as Ubuntu Server 16.04 LTS, will do.
-
-Once the instance is running, first install
-ramp-workflow (https://github.com/paris-saclay-cds/ramp-workflow).
-Then clone the ramp kit into any folder (the folder is specified later
-in the configuration file); by convention we put the
-kit in the folder ~/ramp-kits. Here is an example with the iris
-kit:
-
-```
-pip install git+https://github.com/paris-saclay-cds/ramp-workflow
-mkdir ~/ramp-kits
-cd ~/ramp-kits
-git clone https://github.com/ramp-kits/iris
-```
-
-The next step is to put the private data in the data folder of the kit.
-To do that, rsync the private data from the RAMP server.
-On the RAMP server, run:
-
-```
-rsync -avzP /mnt/ramp_data/frontend/ramp-data/iris/data ubuntu@ip:~/ramp-kits/iris
-```
-
-where ip is the public IP of the EC2 instance (check the EC2 console to get it).
-
-To make sure everything works, try:
-
-```
-ramp_test_submission
-```
-
-The last step is to install the remaining required packages.
-
-If you want to enable memory profiling, make sure you install memory_profiler from master:
-
-```
-pip install git+https://github.com/pythonprofilers/memory_profiler/
-```
-
-You can now install all the other packages needed for training the submissions.
-
-Now you can create the AMI in the EC2 console.
-Go to https://us-west-2.console.aws.amazon.com/ec2/v2/home.
-Select the instance, then Actions, Image, Create image.
-You can name it according to the ramp kit, e.g., "iris_backend",
-to follow the convention. To use the image, you will need to provide either
-the image ID or the image name, as we will see below in Step 2.
-
-# Step 2 : configuration file for ramp-backend
-
-The second step is to prepare a configuration file on the RAMP server.
-It can be anywhere; by convention it is /mnt/ramp_data/backend/<event_name>/config.yml,
-where for this tutorial event_name is iris.
-An example is given below; once its fields (explained afterwards) are filled in,
-the file can be sanity-checked by loading it with the same helpers the CLIs of
-Step 3 use, as in the following sketch.
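-This is only a sketch: `read_backend_config` and `validate_config` are the helpers
-imported by the CLIs of Step 3, and the exact behaviour of `validate_config` on an
-incomplete file is assumed here rather than documented.
-
-```
-from ramp_engine.config import read_backend_config
-from ramp_engine.aws.api import validate_config
-
-# Load the backend configuration the same way ramp_aws_train does.
-config = read_backend_config('/mnt/ramp_data/backend/iris/config.yml')
-# Assumed to complain if a required field of the aws section is missing.
-validate_config(config)
-print(config['ramp']['event_name'])  # -> 'iris'
-```
-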
-An example configuration file is the following:
-
-```
-sqlalchemy:
-  drivername: postgresql
-  username: username
-  password: *****
-  host: localhost
-  port: *****
-  database: *****
-ramp:
-  event_name : iris
-aws:
-  ami_image_id : ami-0bc19972    # or ami_image_name : iris_backend (one of the two, not both)
-  ami_user_name : ubuntu
-  instance_type : t2.micro
-  key_name: key
-  key_path: /home/user/.ssh/id_rsa
-  security_group : launch-wizard-1
-  remote_ramp_kit_folder : ~/ramp-kits/iris
-  local_predictions_folder : ./predictions
-  local_log_folder : ./logs
-  check_status_interval_secs : 60
-  check_finished_training_interval_secs : 60
-  train_loop_interval_secs : 60
-  memory_profiling : true
-  hooks :
-    after_sucessful_training: cd /mnt/ramp_data/frontend;fab compute_contributivity:iris;fab update_leaderboards:e=iris
-```
-
-The following is an explanation of `event_name` (from the ramp section) and of
-each field in the aws section.
-
-`event_name` is the event name. It is used by `ramp_aws_train_loop`
-(see below) to know the event for which to train new submissions.
-
-`ami_image_id` is the ID of the image to use for training the submissions
-(the one we created in Step 1). You can get the AMI image ID in the EC2
-console, in the AMI tab. It should start with 'ami-'.
-The AMI should contain a folder `remote_ramp_kit_folder` (see below)
-which contains the ramp kit. In Step 1 we chose `remote_ramp_kit_folder` to be ~/ramp-kits/iris.
-Alternatively, you can specify the image name rather than the image ID, especially if you
-modify the image a lot. To do that, use the field `ami_image_name`.
-
-`ami_image_name` is the name of the image to use for training the submissions.
-It is an alternative to `ami_image_id`: specify either `ami_image_id`
-or `ami_image_name`, not both at the same time.
-
-`ami_user_name` is the username used to connect remotely to the EC2 instances.
-
-`instance_type` is the instance type (check https://ec2instances.info/).
-
-`key_name` is the name of the key to connect with, so `key_name` should
-exist in Amazon (check Key Pairs in the EC2 console).
-
-`security_group` is the name of the security group to use.
-Security groups control which inbound and outbound ports are accepted or blocked.
-They can be created in the EC2 console. Use `default`
-to use the default one, or choose one from the Security Groups tab of the EC2 console.
-
-`remote_ramp_kit_folder` is the folder on the EC2 instance
-where the ramp kit resides. In Step 1 we chose to put it
-in ~/ramp-kits/iris. It should be possible to run
-`ramp_test_submission` successfully in that folder.
-
-`local_predictions_folder` is the local folder where the predictions are
-downloaded (from the EC2 instance).
-
-`local_log_folder` is the local folder where the logs are downloaded
-(from the EC2 instance). The logs contain the standard output and error
-obtained from running `ramp_test_submission` for a given submission.
-
-`check_status_interval_secs` is the number of seconds to wait before
-rechecking whether an EC2 instance is ready to be used.
-
-`check_finished_training_interval_secs` is the number of seconds to wait
-before rechecking whether the training of a submission on an EC2
-instance has finished.
-
-`train_loop_interval_secs` is the number of seconds to wait between two
-passes over new events in `train_loop`.
-
-`memory_profiling` turns memory profiling on (or off), recording how much
-memory a submission needed.
-
-`hooks` specifies local commands that are run after some event, such as when
-a submission has been trained successfully. A hook can also be given as a list
-of commands, which are joined with `;` before being run, as in the sketch below.
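-A sketch of the list form, equivalent to the single-line `after_sucessful_training`
-command used in the example configuration above (hook names must match those listed
-in the next section):
-
-```
-hooks :
-  after_sucessful_training:
-    - cd /mnt/ramp_data/frontend
-    - fab compute_contributivity:iris
-    - fab update_leaderboards:e=iris
-```
-
-When a hook runs, the training code sets environment variables such as
-RAMP_AWS_SUBMISSION_ID, RAMP_AWS_EVENT and RAMP_AWS_SUBMISSION_FOLDER
-(see `_run_hook` in aws_train.py below), so the command can refer to the
-submission being processed.
-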
-Hooks available are:
-
-### hooks
-
-after_sucessful_training: `command`. It runs the given command each time a submission is
-successfully trained.
-
-# Step 3: Using the CLI
-
-Before using the CLI, you need to provide your account details.
-To do that, please follow https://docs.aws.amazon.com/en_en/lambda/latest/dg/setup-awscli.html
-
-Two command line interfaces are provided, ramp_aws_train and
-ramp_aws_train_loop.
-
-## ramp_aws_train
-
-To train a single submission on AWS, use ramp_aws_train
-as follows:
-
-```
-ramp_aws_train config.yml --event=<event_name> --team=<team_name> --name=<submission_name>
-```
-
-By default, a new EC2 instance is created, the training is done there,
-and the instance is terminated after training.
-
-If you want to train on an existing (running) instance, just add the option
---instance-id like the following:
-
-```
-ramp_aws_train config.yml --event=<event_name> --team=<team_name> --name=<submission_name> --instance-id=<instance_id>
-```
-
-To find the instance ID, check the EC2 console.
-
-## ramp_aws_train_loop
-
-To launch a training loop that automatically listens for new submissions and runs them,
-use the following:
-
-```
-ramp_aws_train_loop config.yml
-```
diff --git a/ramp-engine/ramp_engine/aws/aws_train.py b/ramp-engine/ramp_engine/aws/aws_train.py
deleted file mode 100644
index f11fa3bd6..000000000
--- a/ramp-engine/ramp_engine/aws/aws_train.py
+++ /dev/null
@@ -1,338 +0,0 @@
-import logging
-import os
-from subprocess import call
-import time
-
-import botocore # amazon api
-
-from ramp_database.tools import set_predictions
-from ramp_database.tools import set_time
-from ramp_database.tools import set_scores
-from ramp_database.tools import set_submission_state
-from ramp_database.tools import get_submissions
-from ramp_database.tools import get_submission_state
-from ramp_database.tools import get_submission_by_id
-from ramp_database.tools import set_submission_max_ram
-from ramp_database.tools import score_submission
-from ramp_database.tools import set_submission_error_msg
-from ramp_database.tools import get_event_nb_folds
-
-from ..base import _get_traceback
-
-from .api import (
-    AWS_CONFIG_SECTION,
-    TRAIN_LOOP_INTERVAL_SECS_FIELD,
-    HOOKS_SECTION,
-    HOOK_SUCCESSFUL_TRAINING, HOOK_START_TRAINING, HOOK_FAILED_TRAINING,
-    CHECK_STATUS_INTERVAL_SECS_FIELD,
-    MEMORY_PROFILING_FIELD, LOCAL_LOG_FOLDER_FIELD,
-    launch_ec2_instances, terminate_ec2_instance,
-    _tag_instance_by_submission, _add_or_update_tag,
-    list_ec2_instance_ids, _is_ready, _get_tags,
-    upload_submission, launch_train, download_log,
-    _training_finished, _training_successful,
-    _get_submission_max_ram, download_mprof_data, download_predictions,
-    _get_log_content, _wait_until_train_finished)
-
-
-logger = logging.getLogger('ramp_aws')
-
-
-def train_loop(config, event_name):
-    """
-    This function starts a training loop for a given event.
-    The loop waits for any submission with the state 'new', then
-    creates an ec2 instance to train the submission on it.
- - Parameters - ---------- - - event_name : str - event name - """ - conf_aws = config[AWS_CONFIG_SECTION] - secs = conf_aws[TRAIN_LOOP_INTERVAL_SECS_FIELD] - while True: - # Launch new instances for new submissions - submissions = get_submissions(config, event_name, 'new') - for submission_id, _ in submissions: - submission = get_submission_by_id(config, submission_id) - submission_name = _get_submission_folder_name(submission_id) - if submission.is_sandbox: - continue - try: - instance, = launch_ec2_instances(conf_aws, nb=1) - except botocore.exceptions.ClientError as ex: - logger.info('Exception when launching a new instance : "{}"' - .format(ex)) - logger.info('Skipping...') - continue - nb_trials = 0 - while nb_trials < conf_aws.get('new_instance_nb_trials', 20): - if instance.state.get('name') == 'running': - break - nb_trials += 1 - time.sleep(conf_aws.get('new_instance_check_interval', 6)) - - _tag_instance_by_submission(conf_aws, instance.id, submission_name) - _add_or_update_tag(conf_aws, instance.id, 'train_loop', '1') - _add_or_update_tag(conf_aws, instance.id, 'event_name', event_name) - logger.info('Launched instance "{}" for submission "{}"'.format( - instance.id, submission)) - set_submission_state(config, submission.id, 'sent_to_training') - # Score tested submissions - submissions = get_submissions(config, event_name, 'tested') - for submission_id, _ in submissions: - label = _get_submission_label_by_id(config, submission_id) - logger.info('Scoring submission : {}'.format(label)) - score_submission(config, submission_id) - _run_hook(config, HOOK_SUCCESSFUL_TRAINING, submission_id) - # Get running instances and process events - instance_ids = list_ec2_instance_ids(conf_aws) - for instance_id in instance_ids: - if not _is_ready(conf_aws, instance_id): - continue - tags = _get_tags(conf_aws, instance_id) - # Filter instances that were not launched - # by the training loop API - # if 'submission_id' not in tags: # no longer added to tags - # continue - if tags.get('event_name') != event_name: - continue - if 'train_loop' not in tags: - continue - # Process each instance - submission_name = tags['Name'] - assert submission_name.startswith('submission_') - submission_id = int(submission_name[11:]) - submission = get_submission_by_id(config, submission_id) - label = '{}_{}'.format(submission_id, submission.name) - state = get_submission_state(config, submission_id) - submissions_dir = os.path.split(submission.path)[0] - if state == 'sent_to_training': - exit_status = upload_submission( - conf_aws, instance_id, submission_name, submissions_dir) - if exit_status != 0: - logger.error( - 'Cannot upload submission "{}"' - ', an error occured'.format(label)) - continue - # start training HERE - exit_status = launch_train( - conf_aws, instance_id, submission_name) - if exit_status != 0: - logger.error( - 'Cannot start training of submission "{}"' - ', an error occured.'.format(label)) - continue - set_submission_state(config, submission_id, 'training') - _run_hook(config, HOOK_START_TRAINING, submission_id) - - elif state == 'training': - # in any case (successful training or not) - # download the log - download_log(conf_aws, instance_id, submission_name) - if _training_finished(conf_aws, instance_id, submission_name): - logger.info( - 'Training of "{}" finished, checking ' - 'if successful or not...'.format(label)) - submission = get_submission_by_id(config, submission_id) - actual_nb_folds = get_event_nb_folds( - config, submission.event.name - ) - all_fine = True - 
is_training_successful = _training_successful( - conf_aws, - instance_id, - submission_name, - actual_nb_folds) - if is_training_successful: - logger.info('Training of "{}" was successful' - .format(label)) - if conf_aws.get(MEMORY_PROFILING_FIELD): - logger.info('Download max ram usage info of "{}"' - .format(label)) - download_mprof_data( - conf_aws, instance_id, submission_name - ) - max_ram = _get_submission_max_ram( - conf_aws, submission_name - ) - logger.info('Max ram usage of "{}": {}MB' - .format(label, max_ram)) - set_submission_max_ram( - config, submission_id, max_ram - ) - - logger.info('Downloading the predictions of "{}"' - .format(label)) - try: - path = download_predictions( - conf_aws, instance_id, submission_name) - except Exception as e: - label = str(e) - all_fine = False - set_predictions(config, submission_id, path) - set_time(config, submission_id, path) - set_scores(config, submission_id, path) - set_submission_state(config, submission_id, 'tested') - if not is_training_successful or not all_fine: - logger.info('Training of "{}" failed'.format(label)) - set_submission_state( - config, submission_id, 'training_error') - error_msg = _get_traceback( - _get_log_content(conf_aws, submission_name) - ) - set_submission_error_msg( - config, submission_id, error_msg) - _run_hook(config, HOOK_FAILED_TRAINING, submission_id) - # training finished, so terminate the instance - terminate_ec2_instance(conf_aws, instance_id) - time.sleep(secs) - - -def launch_ec2_instance_and_train(config, submission_id): - """ - This function does the following steps: - - 1) launch a new ec2 instance - 2) upload the submission into the ec2 the instance - 3) train the submission - 4) get back the predictions and the log - 5) terminate the ec2 instance. - - Parameters - ---------- - - config : dict - configuration - - submission_id : int - submission id - - """ - conf_aws = config[AWS_CONFIG_SECTION] - instance, = launch_ec2_instances(conf_aws, nb=1) - set_submission_state(config, submission_id, 'sent_to_training') - _wait_until_ready(config, instance.id) - train_on_existing_ec2_instance(config, instance.id, submission_id) - terminate_ec2_instance(conf_aws, instance.id) - - -def _wait_until_ready(config, instance_id): - """ - Wait until an ec2 instance is ready. 
- - Parameters - ---------- - - config : dict - configuration - - instance_id : str - - """ - logger.info('Waiting until instance "{}" is ready...'.format(instance_id)) - conf_aws = config[AWS_CONFIG_SECTION] - secs = int(conf_aws[CHECK_STATUS_INTERVAL_SECS_FIELD]) - while not _is_ready(conf_aws, instance_id): - time.sleep(secs) - - -def train_on_existing_ec2_instance(config, instance_id, submission_id): - """ - Train a submission on a ready ec2 instance - the steps followed by this function are the following: - 1) upload the submission code to the instance - 2) launch training in a screen - 3) wait until training is finished - 4) download the predictions - 5) download th log - 6) set the predictions in the database - 7) score the submission - """ - conf_aws = config[AWS_CONFIG_SECTION] - upload_submission(conf_aws, instance_id, submission_id) - launch_train(conf_aws, instance_id, submission_id) - set_submission_state(config, submission_id, 'training') - _run_hook(config, HOOK_START_TRAINING, submission_id) - _wait_until_train_finished(conf_aws, instance_id, submission_id) - download_log(conf_aws, instance_id, submission_id) - - label = _get_submission_label_by_id(config, submission_id) - submission = get_submission_by_id(config, submission_id) - actual_nb_folds = get_event_nb_folds(config, submission.event.name) - if _training_successful(conf_aws, instance_id, submission_id, - actual_nb_folds): - logger.info('Training of "{}" on instance: {} was successful'.format( - label, instance_id)) - if conf_aws[MEMORY_PROFILING_FIELD]: - logger.info('Download max ram usage info of "{}"'.format(label)) - download_mprof_data(conf_aws, instance_id, submission_id) - max_ram = _get_submission_max_ram(conf_aws, submission_id) - logger.info('Max ram usage of "{}": {}MB'.format(label, max_ram)) - set_submission_max_ram(config, submission_id, max_ram) - - logger.info('Downloading predictions of : "{}"'.format(label)) - predictions_folder_path = download_predictions( - conf_aws, instance_id, submission_id) - set_predictions(config, submission_id, predictions_folder_path) - set_time(config, submission_id, predictions_folder_path) - set_scores(config, submission_id, predictions_folder_path) - set_submission_state(config, submission_id, 'tested') - logger.info('Scoring "{}"'.format(label)) - score_submission(config, submission_id) - _run_hook(config, HOOK_SUCCESSFUL_TRAINING, submission_id) - else: - logger.info('Training of "{}" in "{}" failed'.format( - label, instance_id)) - set_submission_state(config, submission_id, 'training_error') - error_msg = _get_traceback( - _get_log_content(conf_aws, submission_id)) - set_submission_error_msg(config, submission_id, error_msg) - _run_hook(config, HOOK_FAILED_TRAINING, submission_id) - - -def _run_hook(config, hook_name, submission_id): - """ - run hooks corresponding to hook_name - """ - conf_aws = config[AWS_CONFIG_SECTION] - hooks = conf_aws.get(HOOKS_SECTION) - if not hooks: - return - if hook_name in hooks: - submission = get_submission_by_id(config, submission_id) - submission_folder_name = _get_submission_folder_name(submission_id) - submission_folder = os.path.join( - conf_aws[LOCAL_LOG_FOLDER_FIELD], - submission_folder_name) - env = { - 'RAMP_AWS_SUBMISSION_ID': str(submission_id), - 'RAMP_AWS_SUBMISSION_NAME': submission.name, - 'RAMP_AWS_EVENT': submission.event.name, - 'RAMP_AWS_TEAM': submission.team.name, - 'RAMP_AWS_HOOK': hook_name, - 'RAMP_AWS_SUBMISSION_FOLDER': submission_folder - } - env.update(os.environ) - cmd = hooks[hook_name] - if type(cmd) 
== list: - cmd = ';'.join(cmd) - logger.info('Running "{}" for hook {}'.format(cmd, hook_name)) - return call(cmd, shell=True, env=env) - - -def _get_submission_folder_name(submission_id): - return 'submission_{:09d}'.format(submission_id) - - -def _get_submission_label_by_id(config, submission_id): - submission = get_submission_by_id(config, submission_id) - return _get_submission_label(submission) - - -def _get_submission_label(submission): - # Submissions in AWS are tagged by the label - label = '{}_{}'.format(submission.id, submission.name) - return label diff --git a/ramp-engine/ramp_engine/aws/ramp_aws_train.py b/ramp-engine/ramp_engine/aws/ramp_aws_train.py deleted file mode 100644 index d6132117d..000000000 --- a/ramp-engine/ramp_engine/aws/ramp_aws_train.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import print_function, absolute_import, unicode_literals - -import sys -import logging -import argparse -from argparse import RawTextHelpFormatter - -from ramp_engine.aws.api import validate_config -from ramp_engine.config import read_backend_config -from ramp_database.tools import get_submission_by_name - -from .aws_train import ( - launch_ec2_instance_and_train, - train_on_existing_ec2_instance) - - -desc = """ -Train a submission on AWS. -Two ways of specifying the submission are available. -Either we give the submission id or name. - -Use ramp_aws_train config.yml --id= if you want to -specify submission by id. - -Use ramp_aws_train config.yml --event= --team= ---name= -if you want to specify submission by name. - -By default a new ec2 instance will be created then training will be done there, -then the instance will be killed after training. - -If you want to train on an existing instance just add the option ---instance-id. Example: - -ramp_aws_train config.yml --event= --team= ---name= --instance-id= - -To find the instance id, you have to check the AWS EC2 console -or use the cli `aws` provided by amazon. - -""" - - -def init_parser(): - """Defines command-line interface""" - parser = argparse.ArgumentParser( - prog=__file__, - description=desc, - formatter_class=RawTextHelpFormatter) - parser.add_argument('config', type=str, - help='Backend configuration file with database') - parser.add_argument('--id', type=int, - help='Submission ID') - parser.add_argument('--event', type=str, - help='Event name') - parser.add_argument('--team', type=str, - help='Team name') - parser.add_argument('--name', type=str, - help='Submission name') - parser.add_argument('--instance-id', type=str, - help='Instance id') - parser.add_argument('--log-level', type=str, default='INFO', - help='Log level : DEBUG/INFO/WARNING/ERROR/CRITICAL') - return parser - - -def main(): - parser = init_parser() - args = parser.parse_args() - logger = logging.getLogger('ramp_aws') - logger.setLevel(args.log_level) - config = read_backend_config(args.config) - validate_config(config) - if args.id: - submission_id = args.id - elif args.name and args.event and args.team: - try: - submission = get_submission_by_name( - config, - args.event, - args.team, - args.name - ) - except Exception as ex: - print('Submission not found. Reasons:') - print(ex) - sys.exit(1) - submission_id = submission.id - else: - print('Please specify either submission id, or alternatively ' - 'submission event/team/name. 
Use ramp_aws_train --help for '
-              'help.')
-        sys.exit(1)
-    if args.instance_id:
-        train_on_existing_ec2_instance(config, args.instance_id, submission_id)
-    else:
-        launch_ec2_instance_and_train(config, submission_id)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ramp-engine/ramp_engine/aws/ramp_aws_train_loop.py b/ramp-engine/ramp_engine/aws/ramp_aws_train_loop.py
deleted file mode 100644
index 239b56613..000000000
--- a/ramp-engine/ramp_engine/aws/ramp_aws_train_loop.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import print_function, absolute_import, unicode_literals
-
-import sys
-import logging
-import argparse
-
-from .aws_train import train_loop
-
-from ramp_engine.aws.api import validate_config
-from ramp_engine.config import read_backend_config
-
-
-def init_parser():
-    """Defines command-line interface"""
-    parser = argparse.ArgumentParser(
-        prog=__file__,
-        description='Train loop using AWS EC2 as a backend')
-    parser.add_argument('config', type=str,
-                        help='Backend configuration file with database '
-                             'connection and RAMP event details.')
-    parser.add_argument('--log-level', type=str, default='INFO',
-                        help='Log level : DEBUG/INFO/WARNING/ERROR/CRITICAL')
-    return parser
-
-
-def main():
-    parser = init_parser()
-    args = parser.parse_args()
-    logger = logging.getLogger('ramp_aws')
-    logger.setLevel(args.log_level)
-    config = read_backend_config(args.config)
-    validate_config(config)
-    try:
-        event_name = config['ramp']['event_name']
-    except KeyError:
-        print('Cannot find event_name in the ramp section of {}'
-              .format(args.config))
-        sys.exit(1)
-    train_loop(config, event_name)
-
-
-if __name__ == '__main__':
-    main()