
Commit f3a04b0

Merge Release 2.2.1
2 parents: 3d4ee70 + efee318

13 files changed: 649 additions, 257 deletions


CHANGELOG.md
Lines changed: 24 additions & 2 deletions

@@ -3,6 +3,29 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+2.2.1
+-----
+
+**CHANGES**
+- `nodewatcher`: sge - improve logic to detect if a compute node has running jobs
+- `sqswatcher`: remove invalid messages from SQS queue in order to process remaining messages
+- `sqswatcher`: add number of slots to the log of torque scheduler
+- `sqswatcher`: add retries in case aws request limits are reached
+
+**BUG FIXES**
+- `sqswatcher`: keep processing compute node termination until all scheduled jobs are terminated/cancelled.
+  This allows dead nodes to be automatically removed from the scheduler once all jobs are terminated.
+- `jobwatcher`: better handling of error conditions and usage of fallback values
+- `nodewatcher`: enable daemon when cluster status is `UPDATE_ROLLBACK_COMPLETE`
+
+**TOOLING**
+- Add a script to simplify node package upload when using the `custom_node_package` option
+
+2.1.1
+-----
+
+- Support for China Regions, cn-north-1 and cn-northwest-1
+
 2.1.0
 -----
 

@@ -12,8 +35,7 @@ This file is used to list changes made in each version of the aws-parallelcluster-node package.
 -----
 
 Bug Fixes:
-
-- Don't schedule jobs on compute nodes that are terminating
+- Don't schedule jobs on compute nodes that are terminating
 
 2.0.2
 -----
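The `sqswatcher` retry entry above leans on botocore's client-side retry handling. A minimal sketch of the idea, assuming the standard `botocore.config.Config` retry options (the actual sqswatcher change is not among the hunks shown on this page):

    import boto3
    from botocore.config import Config

    # Retry throttled AWS calls (e.g. "Rate exceeded") up to 5 times,
    # using botocore's built-in exponential backoff.
    retry_config = Config(retries={'max_attempts': 5})
    sqs = boto3.client('sqs', region_name='us-east-1', config=retry_config)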

jobwatcher/jobwatcher.py
Lines changed: 136 additions & 67 deletions

@@ -28,7 +28,13 @@
 cfnconfig_file = '/opt/parallelcluster/cfnconfig'
 
 
-def load_scheduler_module(scheduler):
+def _load_scheduler_module(scheduler):
+    """
+    Load scheduler module, containing scheduler specific functions.
+
+    :param scheduler: scheduler name, it must correspond to the <scheduler>.py file in the current folder.
+    :return: the scheduler module
+    """
     scheduler = 'jobwatcher.plugins.' + scheduler
     _scheduler = __import__(scheduler)
     _scheduler = sys.modules[scheduler]

@@ -38,7 +44,15 @@ def load_scheduler_module(scheduler):
     return _scheduler
 
 
-def get_asg_name(stack_name, region, proxy_config):
+def _get_asg_name(stack_name, region, proxy_config):
+    """
+    Get autoscaling group name.
+
+    :param stack_name: stack name to search for
+    :param region: AWS region
+    :param proxy_config: proxy configuration
+    :return: the ASG name
+    """
     asg_conn = boto3.client('autoscaling', region_name=region, config=proxy_config)
     asg_name = ""
     no_asg = True

@@ -48,14 +62,19 @@ def get_asg_name(stack_name, region, proxy_config):
             r = asg_conn.describe_tags(Filters=[{'Name': 'value', 'Values': [stack_name]}])
             asg_name = r.get('Tags')[0].get('ResourceId')
             no_asg = False
-        except IndexError as e:
+        except IndexError:
             log.error("No asg found for cluster %s" % stack_name)
             time.sleep(30)
 
     return asg_name
 
 
-def read_cfnconfig():
+def _read_cfnconfig():
+    """
+    Read configuration file.
+
+    :return: a dictionary containing the configuration parameters
+    """
     cfnconfig_params = {}
     with open(cfnconfig_file) as f:
         for kvp in f:

@@ -64,61 +83,94 @@ def read_cfnconfig():
     return cfnconfig_params
 
 
-def get_vcpus_from_pricing_file(instance_type):
+def _get_vcpus_from_pricing_file(instance_type):
+    """
+    Read pricing file and get number of vcpus for the given instance type.
+
+    :param instance_type: the instance type to search for
+    :return: the number of vcpus, or -1 if the instance type cannot be found
+    """
     with open(pricing_file) as f:
         instances = json.load(f)
         try:
             vcpus = int(instances[instance_type]["vcpus"])
             log.info("Instance %s has %s vcpus." % (instance_type, vcpus))
-            return vcpus
-        except KeyError as e:
-            log.error("Instance %s not found in file %s." % (instance_type, pricing_file))
-            exit(1)
+        except KeyError:
+            log.error("Unable to get vcpus from file %s. Instance type %s not found." % (pricing_file, instance_type))
+            vcpus = -1
 
+        return vcpus
 
-def get_instance_properties(instance_type):
-    cfnconfig_params = read_cfnconfig()
+
+def _get_instance_properties(instance_type):
+    """
+    Get instance properties for the given instance type, according to the cfn_scheduler_slots configuration parameter.
+
+    :param instance_type: instance type to search for
+    :return: a dictionary containing the instance properties, e.g. {'slots': <slots>}
+    """
     try:
+        cfnconfig_params = _read_cfnconfig()
         cfn_scheduler_slots = cfnconfig_params["cfn_scheduler_slots"]
-        slots = 0
-        vcpus = get_vcpus_from_pricing_file(instance_type)
-
-        if cfn_scheduler_slots == "cores":
-            log.info("Instance %s will use number of cores as slots based on configuration." % instance_type)
-            slots = -(-vcpus//2)
-        elif cfn_scheduler_slots == "vcpus":
-            log.info("Instance %s will use number of vcpus as slots based on configuration." % instance_type)
+    except KeyError:
+        log.error(
+            "Required config parameter 'cfn_scheduler_slots' not found in file %s. Assuming 'vcpus'" % cfnconfig_file
+        )
+        cfn_scheduler_slots = "vcpus"
+
+    vcpus = _get_vcpus_from_pricing_file(instance_type)
+
+    if cfn_scheduler_slots == "cores":
+        log.info("Instance %s will use number of cores as slots based on configuration." % instance_type)
+        slots = -(-vcpus//2)
+
+    elif cfn_scheduler_slots == "vcpus":
+        log.info("Instance %s will use number of vcpus as slots based on configuration." % instance_type)
+        slots = vcpus
+
+    elif cfn_scheduler_slots.isdigit():
+        slots = int(cfn_scheduler_slots)
+        log.info("Instance %s will use %s slots based on configuration." % (instance_type, slots))
+
+        if slots <= 0:
+            log.error(
+                "cfn_scheduler_slots config parameter '%s' must be greater than 0. "
+                "Assuming 'vcpus'" % cfn_scheduler_slots
+            )
             slots = vcpus
-        elif cfn_scheduler_slots.isdigit():
-            slots = int(cfn_scheduler_slots)
-            log.info("Instance %s will use %s slots based on configuration." % (instance_type, slots))
+    else:
+        log.error("cfn_scheduler_slots config parameter '%s' is invalid. Assuming 'vcpus'" % cfn_scheduler_slots)
+        slots = vcpus
 
-        if not slots > 0:
-            log.critical("cfn_scheduler_slots config parameter '%s' was invalid" % cfn_scheduler_slots)
-            exit(1)
+    if slots <= 0:
+        log.critical("slots value is invalid. Setting it to 0.")
+        slots = 0
 
-        return {'slots': slots}
+    return {'slots': slots}
 
-    except KeyError:
-        log.error("Required config parameter 'cfn_scheduler_slots' not found in file %s." % cfnconfig_file)
-        exit(1)
 
+def _fetch_pricing_file(pcluster_dir, region, proxy_config):
+    """
+    Download pricing file.
 
-def fetch_pricing_file(proxy_config, cfncluster_dir, region):
+    :param pcluster_dir: ParallelCluster configuration folder
+    :param region: AWS region
+    :param proxy_config: proxy configuration
+    """
     s3 = boto3.resource('s3', region_name=region, config=proxy_config)
     try:
-        if not os.path.exists(cfncluster_dir):
-            os.makedirs(cfncluster_dir)
+        if not os.path.exists(pcluster_dir):
+            os.makedirs(pcluster_dir)
     except OSError as ex:
-        log.critical('Could not create directory %s. Failed with exception: %s' % (cfncluster_dir, ex))
+        log.critical('Could not create directory %s. Failed with exception: %s' % (pcluster_dir, ex))
         raise
     bucket_name = '%s-aws-parallelcluster' % region
     try:
         bucket = s3.Bucket(bucket_name)
-        bucket.download_file('instances/instances.json', '%s/instances.json' % cfncluster_dir)
+        bucket.download_file('instances/instances.json', '%s/instances.json' % pcluster_dir)
     except ClientError as e:
         log.critical("Could not save instance mapping file %s/instances.json from S3 bucket %s. "
-                     "Failed with exception: %s" % (cfncluster_dir, bucket_name, e))
+                     "Failed with exception: %s" % (pcluster_dir, bucket_name, e))
         raise

@@ -139,7 +191,7 @@ def main():
     scheduler = config.get('jobwatcher', 'scheduler')
     stack_name = config.get('jobwatcher', 'stack_name')
     instance_type = config.get('jobwatcher', 'compute_instance_type')
-    cfncluster_dir = config.get('jobwatcher', 'cfncluster_dir')
+    pcluster_dir = config.get('jobwatcher', 'cfncluster_dir')
     _proxy = config.get('jobwatcher', 'proxy')
     proxy_config = Config()

@@ -150,53 +202,70 @@ def main():
     try:
         asg_name = config.get('jobwatcher', 'asg_name')
     except ConfigParser.NoOptionError:
-        asg_name = get_asg_name(stack_name, region, proxy_config)
+        asg_name = _get_asg_name(stack_name, region, proxy_config)
         config.set('jobwatcher', 'asg_name', asg_name)
         log.info("Saving asg_name %s in the config file %s" % (asg_name, _configfilename))
         with open(_configfilename, 'w') as configfile:
             config.write(configfile)
 
     # fetch the pricing file on startup
-    fetch_pricing_file(proxy_config, cfncluster_dir, region)
+    _fetch_pricing_file(pcluster_dir, region, proxy_config)
 
     # load scheduler
-    s = load_scheduler_module(scheduler)
+    s = _load_scheduler_module(scheduler)
 
     while True:
         # get the number of vcpu's per compute instance
-        instance_properties = get_instance_properties(instance_type)
-
-        # Get number of nodes requested
-        pending = s.get_required_nodes(instance_properties)
-
-        # Get number of nodes currently
-        running = s.get_busy_nodes(instance_properties)
-
-        log.info("%s jobs pending; %s jobs running" % (pending, running))
+        instance_properties = _get_instance_properties(instance_type)
+        if instance_properties.get('slots') <= 0:
+            log.critical("Error detecting number of slots per instance. The cluster will not scale up.")
 
-        if pending > 0:
-            # connect to asg
-            asg_conn = boto3.client('autoscaling', region_name=region, config=proxy_config)
+        else:
+            # Get number of nodes requested
+            pending = s.get_required_nodes(instance_properties)
 
-            # get current limits
-            asg = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get('AutoScalingGroups')[0]
+            if pending < 0:
+                log.critical("Error detecting number of required nodes. The cluster will not scale up.")
 
-            min = asg.get('MinSize')
-            current_desired = asg.get('DesiredCapacity')
-            max = asg.get('MaxSize')
-            log.info("min/desired/max %d/%d/%d" % (min, current_desired, max))
-            log.info("Nodes requested %d, Nodes running %d" % (pending, running))
+            elif pending == 0:
+                log.debug("There are no pending jobs. Noop.")
 
-            # check to make sure it's in limits
-            desired = running + pending
-            if desired > max:
-                log.info("%d requested nodes is greater than max %d. Requesting max %d." % (desired, max, max))
-                asg_conn.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=max)
-            elif desired <= current_desired:
-                log.info("%d nodes desired %d nodes in asg. Noop" % (desired, current_desired))
             else:
-                log.info("Setting desired to %d nodes, requesting %d more nodes from asg." % (desired, desired - current_desired))
-                asg_conn.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=desired)
+                # Get current number of nodes
+                running = s.get_busy_nodes(instance_properties)
+                log.info("%s jobs pending; %s jobs running" % (pending, running))
+
+                # connect to asg
+                asg_client = boto3.client('autoscaling', region_name=region, config=proxy_config)
+
+                # get current limits
+                asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get('AutoScalingGroups')[0]
+
+                min_size = asg.get('MinSize')
+                current_desired = asg.get('DesiredCapacity')
+                max_size = asg.get('MaxSize')
+                log.info("min/desired/max %d/%d/%d" % (min_size, current_desired, max_size))
+                log.info("%d nodes requested, %d nodes running" % (pending, running))
+
+                # Check to make sure requested number of instances is within ASG limits
+                required = running + pending
+                if required <= current_desired:
+                    log.info("%d nodes required, %d nodes in asg. Noop" % (required, current_desired))
+                else:
+                    if required > max_size:
+                        log.info(
+                            "The number of required nodes %d is greater than max %d. Requesting max %d."
+                            % (required, max_size, max_size)
+                        )
+                    else:
+                        log.info(
+                            "Setting desired to %d nodes, requesting %d more nodes from asg."
+                            % (required, required - current_desired)
+                        )
+                    requested = min(required, max_size)
+
+                    # update ASG
+                    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=requested)
 
         time.sleep(60)
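In `_get_instance_properties` above, the 'cores' branch computes slots as `-(-vcpus//2)`, the standard integer idiom for ceiling division: EC2 instance types with hyperthreading typically expose two vcpus per physical core, so the core count is the vcpu count halved and rounded up. A small self-contained illustration:

    def ceil_div(numerator, denominator):
        # Same as math.ceil(numerator / denominator) for positive integers,
        # but stays entirely in integer arithmetic.
        return -(-numerator // denominator)

    assert ceil_div(8, 2) == 4  # 8 vcpus -> 4 cores
    assert ceil_div(1, 2) == 1  # a 1-vcpu instance still yields 1 slot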

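The reworked main loop reduces to a single clamping rule: the target capacity is `running + pending`, capped at the ASG maximum, and an update is issued only when that target exceeds the current desired capacity. A standalone sketch of the decision, with hypothetical sample values standing in for the scheduler and ASG lookups:

    def compute_target_capacity(running, pending, current_desired, max_size):
        # Nodes needed overall: the busy ones plus enough for the pending jobs.
        required = running + pending
        if required <= current_desired:
            return None  # already at or above the target: no-op
        return min(required, max_size)  # never exceed the ASG maximum

    # 3 busy nodes and 4 pending jobs against a max of 5: scale to 5, not 7.
    assert compute_target_capacity(3, 4, current_desired=3, max_size=5) == 5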
jobwatcher/plugins/utils.py
Lines changed: 21 additions & 12 deletions

@@ -22,33 +22,42 @@ def run_command(command, env):
 
 
 def get_optimal_nodes(nodes_requested, slots_requested, instance_properties):
+    """
+    Get the optimal number of nodes required to satisfy the number of nodes and slots requested.
+
+    :param nodes_requested: Array containing the number of nodes requested by the ith job
+    :param slots_requested: Array containing the number of slots requested by the ith job
+    :param instance_properties: instance properties, i.e. number of slots available per node
+    :return: The optimal number of nodes required to satisfy the input queue.
+    """
     vcpus = instance_properties.get('slots')
     slots_remaining_per_node = []
 
-    for node_idx, node in enumerate(nodes_requested):
-        log.info("Requested node %s with slots %s" % (node, slots_requested[node_idx]))
-        # for simplicity, uniformly distribute the numbers of cpus requested across all the requested nodes
-        slots_required_per_node = -(-slots_requested[node_idx] // node)
+    for node_idx, num_of_nodes in enumerate(nodes_requested):
+        log.info("Requested %s nodes and %s slots" % (num_of_nodes, slots_requested[node_idx]))
+        # For simplicity, uniformly distribute the numbers of cpus requested across all the requested nodes
+        slots_required_per_node = -(-slots_requested[node_idx] // num_of_nodes)
 
         if slots_required_per_node > vcpus:
-            # if slots required per node is greater than vcpus, add additional nodes
+            # If slots required per node is greater than vcpus, add additional nodes
             # and recalculate slots_required_per_node
             log.info("Slots required per node is greater than vcpus, recalculating")
-            node = -(-slots_requested[node_idx] // vcpus)
-            slots_required_per_node = -(-slots_requested[node_idx] // node)
-            log.info("Recalculated: node %s and slots_required_per_node %s" % (node, slots_required_per_node))
+            num_of_nodes = -(-slots_requested[node_idx] // vcpus)
+            slots_required_per_node = -(-slots_requested[node_idx] // num_of_nodes)
+            log.info("Recalculated: %s nodes and %s slots required per node" % (num_of_nodes, slots_required_per_node))
 
+        # Verify if there are enough available slots in the nodes allocated in the previous rounds
        for slot_idx, slots_available in enumerate(slots_remaining_per_node):
-            if node > 0 and slots_available >= slots_required_per_node:
+            if num_of_nodes > 0 and slots_available >= slots_required_per_node:
                 log.info("Slot available in existing node")
                 # The node represented by slot_idx can be used to run this job
                 slots_remaining_per_node[slot_idx] -= slots_required_per_node
-                node -= 1
+                num_of_nodes -= 1
 
-        log.info("After looking at already allocated nodes, %s more nodes are needed" % node)
+        log.info("After looking at already allocated nodes, %s more nodes are needed" % num_of_nodes)
 
         # Since the number of available slots were unable to run this job entirely, only add the necessary nodes.
-        for i in range(node):
+        for i in range(num_of_nodes):
             log.info("Adding node. Using %s slots" % slots_required_per_node)
             slots_remaining_per_node.append(vcpus - slots_required_per_node)
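As a worked example of the packing logic above: on 4-slot instances, a job asking for 2 nodes and 6 slots is spread as 3 slots per node across 2 fresh nodes, leaving 1 free slot on each; a second job asking for 1 node and 2 slots fits on neither leftover, so a third node is added. A hedged call sketch (logging omitted; it assumes the function ends by returning the length of slots_remaining_per_node, a line outside the hunk shown above):

    from jobwatcher.plugins.utils import get_optimal_nodes

    optimal = get_optimal_nodes(
        nodes_requested=[2, 1],            # job 0 wants 2 nodes, job 1 wants 1 node
        slots_requested=[6, 2],            # 6 and 2 total slots, respectively
        instance_properties={'slots': 4},  # 4 slots available per node
    )
    print(optimal)  # expected: 3 nodes in total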
