Skip to content

Commit

Permalink
Remove GPUs detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Blanca Fuentes Monjas committed Nov 18, 2024
1 parent 6abd333 commit c6fe75d
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 270 deletions.
12 changes: 5 additions & 7 deletions autodetect_config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import reframe.core.config as config

site_configuration = config.detect_config(
exclude_feats=['row*', 'c*-*', 'group*',
'contbuild', 'startx', 'perf', 'cvmfs', 'gpumodedefault',
'gpu'],
detect_containers=True,
detect_devices=False,
sched_options=['-A csstaff'],
filename='daint_config'
exclude_feats=['colum*'],
detect_containers=False,
sched_options=[],
time_limit=200,
filename='system_config'
)
27 changes: 11 additions & 16 deletions reframe/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def sources(self):
def subconfig_system(self):
return self._local_system

def load_config_python(self, filename):
def load_config_python(self, filename, validate=True):
try:
mod = util.import_module_from_file(filename)
except ImportError as e:
Expand All @@ -346,13 +346,12 @@ def load_config_python(self, filename):
f'could not load Python configuration file: {filename!r}'
) from e

print(dir(mod))
if not hasattr(mod, 'site_configuration'):
raise ConfigError(
f"not a valid Python configuration file: '{filename}'"
)

if mod.site_configuration:
if validate:
self._config_modules.append(mod)
self.update_config(mod.site_configuration, filename)

Expand Down Expand Up @@ -669,7 +668,7 @@ def find_config_files(config_path=None, config_file=None):
return res


def load_config(*filenames):
def load_config(*filenames, validate=True):
ret = _SiteConfig()
getlogger().debug('Loading the builtin configuration')
ret.update_config(settings.site_configuration, '<builtin>')
Expand All @@ -681,7 +680,7 @@ def load_config(*filenames):
getlogger().debug(f'Loading configuration file: {f!r}')
_, ext = os.path.splitext(f)
if ext == '.py':
ret.load_config_python(f)
ret.load_config_python(f, validate)
elif ext == '.json':
ret.load_config_json(f)
else:
Expand All @@ -691,12 +690,11 @@ def load_config(*filenames):


def detect_config(detect_containers: bool = False,
detect_devices: bool = False,
exclude_feats: list = [],
sched_options: list = [],
filename: str = 'system_config'
):
# job_cli_options, exclude_feats
time_limit: int = 200,
filename: str = 'system_config'):

import reframe.core.runtime as rt

# Initialize the Site Configuration object
Expand All @@ -710,7 +708,8 @@ def detect_config(detect_containers: bool = False,
site_config.setdefault('name', '')
site_config.setdefault('hostnames', [])
hostname = ret._detect_system(detect_only=True)
site_config['hostnames'] += hostname
site_config['hostnames'] += [hostname]
print(site_config['hostnames'])
site_config['name'] += hostname
msg = color.colorize(
f'Detected hostname: {hostname}', color.GREEN
Expand All @@ -737,15 +736,11 @@ def detect_config(detect_containers: bool = False,
msg = color.colorize(f'Launcher set to {launcher_name}', color.GREEN)
getlogger().info(msg)

# Initialize the RuntimeContext with builtin settings
rt.init_runtime(ret)

site_config.setdefault('partitions', [])
# Detect the context with the corresponding scheduler
site_config['partitions'] = scheduler().build_context(
modules_system, launcher(), sched_options,
exclude_feats, rt.runtime().prefix,
detect_containers, detect_devices
modules_system, launcher(), sched_options, time_limit,
exclude_feats, rt.runtime().prefix, detect_containers
)

template_loader = FileSystemLoader(searchpath=os.path.join(
Expand Down
56 changes: 13 additions & 43 deletions reframe/core/schedulers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,13 +757,13 @@ class ReframeContext(abc.ABC):
'''

def __init__(self, modules_system, launcher, scheduler, prefix,
detect_containers, detect_devices):
time_limit, detect_containers):
self.partitions = []
self._modules_system = modules_system
self._scheduler = scheduler
self._launcher = launcher
self._time_limit = time_limit
self._detect_containers = detect_containers
self._detect_devices = detect_devices
self._p_n = 0 # System partitions counter
self._keep_tmp_dir = False
if prefix == '.':
Expand All @@ -775,7 +775,7 @@ def __init__(self, modules_system, launcher, scheduler, prefix,
prefix='reframe_config_detection_',
dir=prefix
)
if detect_containers or detect_devices:
if detect_containers:
getlogger().info(f'Stage directory: {self.TMP_DIR}')

@abc.abstractmethod
Expand Down Expand Up @@ -812,9 +812,6 @@ def _check_gpus_count(self, node_devices_slurm: dict,
for gpu_job in node_devices_job:
devices.append({'type': 'gpu',
'model': gpu_job,
# TODO
'arch': c_d.nvidia_gpu_architecture.get(gpu_job) or
c_d.amd_gpu_architecture.get(gpu_job),
'num_devices': node_devices_job[gpu_job]})
gpus_job_count += node_devices_job[gpu_job]

Expand Down Expand Up @@ -928,9 +925,6 @@ def _extract_info(self, job: Job):
if job.detect_containers:
job.container_platforms = self._parse_containers(file_path)

if job.detect_devices:
job.devices = self._parse_devices(file_path)

def _create_detection_job(self, name: str, access_node: list,
access_options: list):
'''Create the instance of the job for remote autodetection'''
Expand All @@ -942,8 +936,7 @@ def _create_detection_job(self, name: str, access_node: list,
sched_access=access_node,
sched_options=access_options
)
# TODO: move this somewhere to be user defined, not here (maybe yaml)
remote_job.max_pending_time = 20
remote_job.max_pending_time = self._time_limit
remote_job.time_limit = '2m'
remote_job.container_platforms = []
remote_job.devices = {}
Expand All @@ -954,14 +947,10 @@ def _generate_job_content(self, job):
if job.detect_containers:
job.content += [c_d.containers_detect_bash]
job.content += ['\n\n\n']
if job.detect_devices:
job.content += [c_d.devices_detect_bash]

def create_login_partition(self):
# TODO: move this somewhere to be user defined, not here (maybe yaml)
max_jobs = 4
time_limit = '2m'
# TODO: improve this (?)
self.partitions.append(
{'name': 'login',
'scheduler': 'local',
Expand All @@ -971,34 +960,30 @@ def create_login_partition(self):
'launcher': 'local'})

def create_remote_partition(self, node_feats: tuple,
job_cli_options):
sched_options):

node_features = list(node_feats)
_detect_devices = copy.deepcopy(self._detect_devices)
_detect_containers = copy.deepcopy(self._detect_containers)
self._p_n += 1 # Count the partition that is being created
access_options = copy.deepcopy(job_cli_options)
access_options = copy.deepcopy(sched_options)
access_node = self._scheduler.feats_access_option(node_features)
name = f'partition_{self._p_n}'
getlogger().info(f'{name} : {node_feats}')
# TODO: move this somewhere to be user defined, not here (maybe yaml)
max_jobs = 100
time_limit = '10m'
container_platforms = []
devices = []

# Try to get the devices from the scheduler config
_detect_devices = self._find_devices(node_features)
if _detect_devices:
# If detection of remote devices is requested,
# try to get the devices from the scheduler config
_detect_devices = self._find_devices(node_features)
getlogger().info('GPUs were detected in this node type.')

remote_job = None
if _detect_devices or _detect_containers:
if _detect_containers:
self._keep_tmp_dir = True
remote_job = self._create_detection_job(
name, access_node, access_options
)
remote_job.detect_devices = _detect_devices
remote_job.detect_containers = _detect_containers
self._generate_job_content(remote_job)
submission_error, access_node = self.submit_detect_job(
Expand Down Expand Up @@ -1041,22 +1026,8 @@ def create_remote_partition(self, node_feats: tuple,
f'partition "{name}"'
)

if remote_job.devices:
# Issue any warning regarding misconfigurations
# between Gres and the detected devices
getlogger().info(f"\nGPUs found in partition {name}")
devices = self._check_gpus_count(
_detect_devices, remote_job.devices
)

elif not remote_job:
# No jobs were launched so we cannot check the access options
access_options += access_node
else:
# TODO check this
access_options = access_node
access_options += access_node

# TODO: improve this (?)
# Create the partition
self.partitions.append(
{'name': name,
Expand All @@ -1069,14 +1040,13 @@ def create_remote_partition(self, node_feats: tuple,
'launcher': self._launcher.name,
'access': access_options,
'features': node_features+['remote'],
'devices': devices,
'container_platforms': container_platforms}
)

def create_partitions(self, job_cli_options):
def create_partitions(self, sched_options):
# TODO: asynchronous
for node in self.node_types:
self.create_remote_partition(node, job_cli_options)
self.create_remote_partition(node, sched_options)
if not self._keep_tmp_dir:
shutil.rmtree(self.TMP_DIR)
else:
Expand Down
32 changes: 12 additions & 20 deletions reframe/core/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,17 +586,15 @@ def validate(cls) -> bool:
return False

def build_context(self, modules_system, launcher,
job_cli_options, exclude_feats,
prefix, detect_containers,
detect_devices):
sched_options, time_limit,
exclude_feats, prefix, detect_containers):
self._context = _SlurmContext(
modules_system, launcher, self, prefix,
detect_containers=detect_containers,
detect_devices=detect_devices
modules_system, launcher, self, prefix, time_limit,
detect_containers=detect_containers
)
self._context.search_node_types(exclude_feats)
self._context.create_login_partition()
self._context.create_partitions(job_cli_options)
self._context.create_partitions(sched_options)
return self._context.partitions


Expand Down Expand Up @@ -728,8 +726,7 @@ def satisfies(self, slurm_constraint: str):
# Convert the Slurm constraint to a Python expression and evaluate it,
# but restrict our syntax to accept only AND or OR constraints and
# their combinations
# print(re.match(r'^[\w\d\(\)\|\&]*$', slurm_constraint))
if not re.match(r'^[\w\d\(\)\|\&]*$', slurm_constraint):
if not re.match(r'^[\w\d\(\)\|\&\-]*$', slurm_constraint):
return False

names = {grp[0]
Expand All @@ -741,16 +738,11 @@ def satisfies(self, slurm_constraint: str):
] else key
for key in slurm_constraint.split()])

# print(expr)
# print('mc' in self.active_features and '128gb' in self.active_features)
vars = {n: True for n in self.active_features}
# print(vars)
vars.update({n: False for n in names - self.active_features})
# print(eval(expr, {}, vars))
try:
# print("returning", eval(expr, {}, vars))
return eval(expr, {}, {'vars': vars})
except BaseException as e:
except BaseException:
return False

@property
Expand Down Expand Up @@ -792,12 +784,12 @@ def __str__(self):
class _SlurmContext(sched.ReframeContext):

def __init__(self, modules_system: str, launcher: str, scheduler: str,
prefix: str, detect_containers: bool = True,
detect_devices: bool = True, access_opt: list = []):
prefix: str, time_limit: int, detect_containers: bool = True,
access_opt: list = []):

super().__init__(modules_system, launcher,
scheduler, prefix,
detect_containers, detect_devices)
scheduler, prefix, time_limit,
detect_containers)
self.node_types = []
self.default_nodes = []
self.reservations = []
Expand Down Expand Up @@ -950,11 +942,11 @@ def _find_devices(self, node_feats: list) -> Union[dict, None]:
def _get_access_partition(self, node_feats: list) -> Union[str, None]:

nodes_info = self._scheduler.allnodes()
node_feats = "&".join(node_feats)
nd_partitions = {tuple(n.partitions)
for n in nodes_info
if n.satisfies(node_feats)}
nd_partitions = set(nd_partitions)
print(nd_partitions)
if len(nd_partitions) > 1:
return None
else:
Expand Down
5 changes: 4 additions & 1 deletion reframe/frontend/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,10 @@ def restrict_logging():
printer.adjust_verbosity(calc_verbosity(site_config, options.quiet))

if options.detect_configuration:
site_config = config.load_config(options.detect_configuration)
runtime.init_runtime(site_config)
site_config = config.load_config(
options.detect_configuration, validate=False
)
sys.exit(0)

# Now configure ReFrame according to the user configuration file
Expand Down
15 changes: 0 additions & 15 deletions reframe/schemas/reframe_config_template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,6 @@ site_configuration = {
'extras': {{partition.extras}},
# https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.env_vars
'env_vars': {{partition.env_vars}},
{% if partition.devices %}
# Check if any specific devices were found in this node type
# The gpus found in slurm GRes will be specified here
'devices': [
{% for dev in partition.devices %}
{ 'type': '{{dev.type}}',
'model': '{{dev.model}}',
{% if dev.arch %}
'arch': '{{dev.arch}}',
{% endif %}
'num_devices': {{dev.num_devices}}
},
{% endfor %}
],
{% endif %}
{% if partition.container_platforms %}
# Check if any container platforms are available in these nodes and add them
# https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#container-platform-configuration
Expand Down
Loading

0 comments on commit c6fe75d

Please sign in to comment.