Skip to content

Commit 7c4cee5

Browse files
authored
Merge pull request #70 from awslabs/develop
Release 1.6.0
2 parents 9ff369a + fea0c19 commit 7c4cee5

23 files changed

+720
-51
lines changed

.travis.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@ python:
88
- "3.6"
99

1010
install:
11-
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements26.txt; fi
11+
- if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then xargs -L 1 pip install < requirements26.txt; fi
1212
- if [[ $TRAVIS_PYTHON_VERSION != '2.6' ]]; then pip install -r requirements.txt; fi
1313
- pip install -e .
1414

1515
sudo: false
1616

17-
script: sh tests/test.sh
18-
17+
script:
18+
- sh tests/test.sh
19+
- python jobwatcher/plugins/unittests.py

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@ cfncluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the cfncluster-node package.
55

6+
1.6.0
7+
-----
8+
9+
Bug fixes/minor improvements:
10+
11+
- Changed scaling functionality to scale up and scale down faster.
12+
13+
614
1.5.4
715
-----
816

jobwatcher/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright 2013-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
4+
# License. A copy of the License is located at
5+
#
6+
# http://aws.amazon.com/apache2.0/
7+
#
8+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
9+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
10+
# limitations under the License.

jobwatcher/jobwatcher.cfg

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Testing config file
2+
# Create an ASG with name Test
3+
[jobwatcher]
4+
region = us-east-1
5+
asg_name = Test
6+
stack_name = test
7+
scheduler = test
8+
proxy = NONE
9+
cfncluster_dir = ./
10+
compute_instance_type = c4.xlarge

jobwatcher/jobwatcher.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
#!/usr/bin/env python2.6
2+
3+
# Copyright 2013-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
6+
# License. A copy of the License is located at
7+
#
8+
# http://aws.amazon.com/apache2.0/
9+
#
10+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
11+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
__author__ = 'seaam'
15+
16+
import ConfigParser
17+
import boto3
18+
import os
19+
import sys
20+
import time
21+
import logging
22+
import json
23+
from botocore.exceptions import ClientError
24+
from botocore.config import Config
25+
26+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
27+
# JSON file of per-instance-type data; read by get_vcpus_from_pricing_file().
# NOTE(review): fetch_pricing_file() saves to '<cfncluster_dir>/instances.json',
# so this path presumably assumes cfncluster_dir is /opt/cfncluster -- confirm.
pricing_file = '/opt/cfncluster/instances.json'
28+
# key=value settings file parsed by read_cfnconfig().
cfnconfig_file = '/opt/cfncluster/cfnconfig'
29+
30+
31+
def load_scheduler_module(scheduler):
    """Import and return the scheduler plugin module named *scheduler*.

    The plugin is looked up inside the ``jobwatcher.plugins`` package, so
    ``scheduler='sge'`` loads ``jobwatcher.plugins.sge``.
    """
    module_name = 'jobwatcher.plugins.' + scheduler

    # __import__ on a dotted name returns the top-level package, not the leaf
    # module, so the leaf module is fetched from sys.modules afterwards.
    __import__(module_name)
    plugin = sys.modules[module_name]

    log.debug("scheduler=%s" % repr(plugin))

    return plugin
39+
40+
41+
def get_asg_name(stack_name, region, proxy_config):
    """Return the ResourceId of the first autoscaling tag matching *stack_name*.

    ``describe_tags`` is queried for tags whose value equals *stack_name*.
    While the response contains no tags, an error is logged and the lookup is
    retried after 30 seconds, so this call blocks until a match exists.

    :param stack_name: tag value to search for
    :param region: AWS region used for the autoscaling client
    :param proxy_config: botocore ``Config`` passed to the client
    :return: the ``ResourceId`` of the first returned tag
    """
    client = boto3.client('autoscaling', region_name=region, config=proxy_config)

    while True:
        try:
            response = client.describe_tags(Filters=[{'Name': 'value', 'Values': [stack_name]}])
            # An empty 'Tags' list raises IndexError here, which triggers a retry.
            return response.get('Tags')[0].get('ResourceId')
        except IndexError:
            log.error("No asg found for cluster %s" % stack_name)
            time.sleep(30)
56+
57+
58+
def read_cfnconfig():
    """Parse the cfnconfig file into a dict of settings.

    Every line is split at its first ``=``; the text before it becomes the key
    and the text after it the value, both stripped of surrounding whitespace.
    A line without ``=`` maps its stripped text to an empty string.

    :return: dict mapping setting names to string values
    """
    settings = {}
    with open(cfnconfig_file) as config:
        for line in config:
            name, _separator, setting = line.partition('=')
            settings[name.strip()] = setting.strip()
    return settings
65+
66+
67+
def get_vcpus_from_pricing_file(instance_type):
    """Return the number of vcpus listed for *instance_type* in the pricing file.

    The pricing file is a JSON object keyed by instance type; each entry is
    expected to carry a ``"vcpus"`` field.

    :param instance_type: EC2 instance type name, e.g. ``c4.xlarge``
    :return: vcpu count as an int
    :raises SystemExit: with status 1 when the instance type (or its
        ``"vcpus"`` field) is missing from the pricing file
    """
    with open(pricing_file) as f:
        instances = json.load(f)

    # Only the lookup can raise KeyError, so keep the try body minimal.
    try:
        vcpus = int(instances[instance_type]["vcpus"])
    except KeyError:
        log.error("Instance %s not found in file %s." % (instance_type, pricing_file))
        # sys.exit() instead of the builtin exit(): the latter is injected by
        # the site module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    log.info("Instance %s has %s vcpus." % (instance_type, vcpus))
    return vcpus
77+
78+
79+
def get_instance_properties(instance_type):
    """Return the scheduler slot count for one compute instance.

    The ``cfn_scheduler_slots`` setting from the cfnconfig file selects how
    slots are derived:

    * ``"cores"``  -- half the vcpu count, rounded up
    * ``"vcpus"``  -- the vcpu count
    * a digit string -- that number of slots

    :param instance_type: EC2 instance type name used to look up vcpus
    :return: dict of the form ``{'slots': <int>}``
    :raises SystemExit: with status 1 when ``cfn_scheduler_slots`` is missing
        or does not yield a positive slot count
    """
    cfnconfig_params = read_cfnconfig()

    # Guard only the lookup, so that a KeyError raised anywhere else is not
    # misreported as a missing 'cfn_scheduler_slots' parameter.
    try:
        cfn_scheduler_slots = cfnconfig_params["cfn_scheduler_slots"]
    except KeyError:
        log.error("Required config parameter 'cfn_scheduler_slots' not found in file %s." % cfnconfig_file)
        sys.exit(1)

    slots = 0
    vcpus = get_vcpus_from_pricing_file(instance_type)

    if cfn_scheduler_slots == "cores":
        log.info("Instance %s will use number of cores as slots based on configuration." % instance_type)
        # Ceiling division: -(-a // b) rounds a / b up for integers.
        slots = -(-vcpus // 2)
    elif cfn_scheduler_slots == "vcpus":
        log.info("Instance %s will use number of vcpus as slots based on configuration." % instance_type)
        slots = vcpus
    elif cfn_scheduler_slots.isdigit():
        slots = int(cfn_scheduler_slots)
        log.info("Instance %s will use %s slots based on configuration." % (instance_type, slots))

    # Any unrecognised value (or an explicit 0) leaves slots at 0.
    if not slots > 0:
        log.critical("cfn_scheduler_slots config parameter '%s' was invalid" % cfn_scheduler_slots)
        sys.exit(1)

    return {'slots': slots}
105+
106+
107+
def fetch_pricing_file(proxy_config, cfncluster_dir, region):
    """Download the instance mapping file from S3 into *cfncluster_dir*.

    The object ``instances/instances.json`` is fetched from the bucket named
    ``<region>-cfncluster`` and stored as ``<cfncluster_dir>/instances.json``.
    The directory is created first when it does not exist.

    :param proxy_config: botocore ``Config`` passed to the S3 resource
    :param cfncluster_dir: local directory that receives the file
    :param region: AWS region; also determines the bucket name
    :raises OSError: re-raised after logging when the directory cannot be made
    :raises ClientError: re-raised after logging when the download fails
    """
    s3_resource = boto3.resource('s3', region_name=region, config=proxy_config)

    try:
        if not os.path.exists(cfncluster_dir):
            os.makedirs(cfncluster_dir)
    except OSError as err:
        log.critical('Could not create directory %s. Failed with exception: %s' % (cfncluster_dir, err))
        raise

    bucket_name = '%s-cfncluster' % region
    destination = '%s/instances.json' % cfncluster_dir
    try:
        s3_resource.Bucket(bucket_name).download_file('instances/instances.json', destination)
    except ClientError as err:
        log.critical("Could not save instance mapping file %s/instances.json from S3 bucket %s. Failed with exception: %s" % (cfncluster_dir, bucket_name, err))
        raise
122+
123+
124+
def main():
    """Run the jobwatcher polling loop.

    Reads ``/etc/jobwatcher.cfg``, resolves the Auto Scaling group name
    (looking it up from the stack name and saving it back to the config file
    when the ``asg_name`` option is absent), downloads the instance mapping
    file and loads the scheduler plugin.  It then polls the scheduler every 60
    seconds; whenever the plugin reports pending nodes, the group's desired
    capacity is raised to ``busy + pending`` nodes, capped at the group's
    ``MaxSize``.  Desired capacity is never lowered here.

    The loop has no exit condition; it ends only through an exception or
    process termination.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s [%(module)s:%(funcName)s] %(message)s'
    )

    _configfilename = "/etc/jobwatcher.cfg"
    log.info("Reading configuration file %s" % _configfilename)
    config = ConfigParser.RawConfigParser()
    config.read(_configfilename)
    if config.has_option('jobwatcher', 'loglevel'):
        # NOTE(review): _levelNames is a private attribute of the logging
        # module; an unknown level name raises KeyError here.
        lvl = logging._levelNames[config.get('jobwatcher', 'loglevel')]
        logging.getLogger().setLevel(lvl)
    region = config.get('jobwatcher', 'region')
    scheduler = config.get('jobwatcher', 'scheduler')
    stack_name = config.get('jobwatcher', 'stack_name')
    instance_type = config.get('jobwatcher', 'compute_instance_type')
    cfncluster_dir = config.get('jobwatcher', 'cfncluster_dir')
    _proxy = config.get('jobwatcher', 'proxy')
    proxy_config = Config()

    # The literal string "NONE" in the config file means "no proxy".
    if not _proxy == "NONE":
        proxy_config = Config(proxies={'https': _proxy})
        log.info("Configured proxy is: %s" % _proxy)

    try:
        asg_name = config.get('jobwatcher', 'asg_name')
    except ConfigParser.NoOptionError:
        # Not configured yet: discover it once and persist it in the config.
        asg_name = get_asg_name(stack_name, region, proxy_config)
        config.set('jobwatcher', 'asg_name', asg_name)
        log.info("Saving asg_name %s in the config file %s" % (asg_name, _configfilename))
        with open(_configfilename, 'w') as configfile:
            config.write(configfile)

    # fetch the pricing file on startup
    fetch_pricing_file(proxy_config, cfncluster_dir, region)

    # load scheduler
    s = load_scheduler_module(scheduler)

    while True:
        # get the number of slots per compute instance
        instance_properties = get_instance_properties(instance_type)

        # Get number of nodes requested
        pending = s.get_required_nodes(instance_properties)

        # Get number of nodes currently busy
        running = s.get_busy_nodes(instance_properties)

        log.info("%s jobs pending; %s jobs running" % (pending, running))

        if pending > 0:
            # connect to asg
            asg_conn = boto3.client('autoscaling', region_name=region, config=proxy_config)

            # get current limits
            asg = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get('AutoScalingGroups')[0]

            # Named *_size so the min()/max() builtins are not shadowed.
            min_size = asg.get('MinSize')
            current_desired = asg.get('DesiredCapacity')
            max_size = asg.get('MaxSize')
            log.info("min/desired/max %d/%d/%d" % (min_size, current_desired, max_size))
            log.info("Nodes requested %d, Nodes running %d" % (pending, running))

            # check to make sure it's in limits
            desired = running + pending
            if desired > max_size:
                log.info("%d requested nodes is greater than max %d. Requesting max %d." % (desired, max_size, max_size))
                asg_conn.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=max_size)
            elif desired <= current_desired:
                log.info("%d nodes desired %d nodes in asg. Noop" % (desired, current_desired))
            else:
                log.info("Setting desired to %d nodes, requesting %d more nodes from asg." % (desired, desired - current_desired))
                asg_conn.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=desired)

        time.sleep(60)
201+
202+
203+
if __name__ == '__main__':
204+
main()

jobwatcher/plugins/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright 2013-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
4+
# License. A copy of the License is located at
5+
#
6+
# http://aws.amazon.com/apache2.0/
7+
#
8+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
9+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
10+
# limitations under the License.

jobwatcher/plugins/sge.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import logging
2+
import json
3+
from utils import run_command
4+
5+
log = logging.getLogger(__name__)
6+
7+
# get nodes requested from pending jobs
8+
def get_required_nodes(instance_properties):
    """Return how many nodes the pending SGE jobs require.

    Runs ``qstat -g d -s p -u '*'``, skips the first two output lines and sums
    the eighth whitespace-separated field of every line that has at least
    eight fields.  The total is divided by ``instance_properties['slots']``,
    rounding up.

    :param instance_properties: dict carrying the per-node ``'slots'`` count
    :return: number of nodes needed for the summed slots
    """
    sge_env = {'SGE_ROOT': '/opt/sge',
               'PATH': '/opt/sge/bin:/opt/sge/bin/lx-amd64:/bin:/usr/bin'}
    qstat_output = run_command("/opt/sge/bin/lx-amd64/qstat -g d -s p -u '*'", sge_env)

    rows = (row.split() for row in qstat_output.split("\n")[2:])
    pending_slots = sum(int(fields[7]) for fields in rows if len(fields) >= 8)

    slots_per_node = instance_properties.get('slots')
    # Ceiling division: -(-a // b) rounds a / b up for integers.
    return -(-pending_slots // slots_per_node)
20+
21+
# get nodes reserved by running jobs
22+
# if a host has 1 or more job running on it, it'll be marked busy
23+
def get_busy_nodes(instance_properties):
    """Return the number of SGE hosts that have used or reserved slots.

    Runs ``qstat -f``, skips the first two output lines and inspects every
    line that splits into exactly five fields.  The third field is read as
    ``resv/used/tot.``; a line counts as one busy node when either the used
    or the reserved number is greater than zero.

    :param instance_properties: unused here; kept for the plugin interface
    :return: count of busy nodes
    """
    sge_env = {'SGE_ROOT': '/opt/sge',
               'PATH': '/opt/sge/bin:/opt/sge/bin/lx-amd64:/bin:/usr/bin'}
    qstat_output = run_command("/opt/sge/bin/lx-amd64/qstat -f", sge_env)

    busy = 0
    for row in qstat_output.split("\n")[2:]:
        fields = row.split()
        if len(fields) != 5:
            continue
        # resv/used/tot.
        reserved, in_use, _total = fields[2].split('/')
        if int(in_use) > 0 or int(reserved) > 0:
            busy += 1
    return busy
37+

jobwatcher/plugins/slurm.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import logging
2+
from utils import run_command, get_optimal_nodes
3+
4+
5+
log = logging.getLogger(__name__)
6+
7+
8+
# get nodes requested from pending jobs
9+
def get_required_nodes(instance_properties):
    """Return how many nodes the pending Slurm jobs require.

    Runs ``squeue -r -h -o '%i %t %D %C'`` and keeps the lines that have four
    fields and whose second field is ``PD``.  Example output::

        25 PD 1 24
        26 R 1 24

    The third field of each kept line is collected as a node request and the
    fourth as a slot request; both lists are handed to ``get_optimal_nodes``.

    :param instance_properties: passed through to ``get_optimal_nodes``
    :return: whatever ``get_optimal_nodes`` returns for the pending requests
    """
    squeue_output = run_command("/opt/slurm/bin/squeue -r -h -o '%i %t %D %C'", {})

    pending_jobs = []
    for row in squeue_output.split("\n"):
        fields = row.split()
        if len(fields) == 4 and fields[1] == 'PD':
            pending_jobs.append(fields)

    slots_requested = [int(fields[3]) for fields in pending_jobs]
    nodes_requested = [int(fields[2]) for fields in pending_jobs]

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
25+
26+
27+
# get nodes reserved by running jobs
28+
def get_busy_nodes(instance_properties):
    """Return the number of Slurm nodes in the ``mix`` or ``alloc`` state.

    Runs ``sinfo -r -h -o '%D %t'`` and adds up the first field of every
    two-field line whose second field is ``mix`` or ``alloc``.  Sample
    output::

        2 mix
        4 alloc
        10 idle

    :param instance_properties: unused here; kept for the plugin interface
    :return: total count of busy nodes
    """
    sinfo_output = run_command("/opt/slurm/bin/sinfo -r -h -o '%D %t'", {})

    rows = (row.split() for row in sinfo_output.split("\n"))
    return sum(int(fields[0]) for fields in rows
               if len(fields) == 2 and fields[1] in ('mix', 'alloc'))

jobwatcher/plugins/test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2013-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
4+
# License. A copy of the License is located at
5+
#
6+
# http://aws.amazon.com/apache2.0/
7+
#
8+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
9+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import json
13+
import logging
14+
15+
log = logging.getLogger(__name__)
16+
17+
# get nodes requested from pending jobs
18+
def get_required_nodes(instance_properties):
    """Test stub: return the nodes needed for a fixed demand of 4 slots.

    Change the hard-coded slot count as needed when testing.

    :param instance_properties: dict carrying the per-node ``'slots'`` count
    :return: 4 divided by the per-node slot count, rounded up
    """
    pending_slots = 4
    slots_per_node = instance_properties.get('slots')
    # Round up: one extra node whenever the division leaves a remainder.
    full_nodes, leftover = divmod(pending_slots, slots_per_node)
    if leftover:
        return full_nodes + 1
    return full_nodes
23+
24+
# get nodes reserved by running jobs
25+
def get_busy_nodes(instance_properties):
    """Test stub: return the nodes occupied by a fixed load of 13 slots.

    Change the hard-coded slot count as needed when testing.

    :param instance_properties: dict carrying the per-node ``'slots'`` count
    :return: 13 divided by the per-node slot count, rounded up
    """
    busy_slots = 13
    slots_per_node = instance_properties.get('slots')
    # Round up: one extra node whenever the division leaves a remainder.
    full_nodes, leftover = divmod(busy_slots, slots_per_node)
    if leftover:
        return full_nodes + 1
    return full_nodes
30+
31+
def nodes(slots, instance_properties):
    """Return the number of nodes needed to provide *slots* slots.

    The previous body referenced an undefined ``instance_type`` name, so it
    raised NameError for any positive *slots*, and it ended in a bare
    ``return``.  It now follows the same convention as ``get_required_nodes``
    and ``get_busy_nodes`` in this module: the per-node capacity comes from
    ``instance_properties['slots']`` and the result is rounded up.

    NOTE(review): the intended behaviour is inferred from the sibling stubs --
    confirm before relying on this helper.

    :param slots: number of slots requested
    :param instance_properties: dict carrying the per-node ``'slots'`` count
    :return: 0 when *slots* is not positive, otherwise *slots* divided by the
        per-node slot count, rounded up
    """
    if slots <= 0:
        return 0
    slots_per_node = instance_properties.get('slots')
    # Ceiling division: -(-a // b) rounds a / b up for integers.
    return -(-slots // slots_per_node)

0 commit comments

Comments
 (0)