Skip to content

Commit c6fe75d

Browse files
Author: Blanca Fuentes Monjas
Commit message: Remove GPUs detection
1 parent: 6abd333; commit: c6fe75d

File tree

7 files changed: 45 additions and 270 deletions.

autodetect_config.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
import reframe.core.config as config
22

33
site_configuration = config.detect_config(
4-
exclude_feats=['row*', 'c*-*', 'group*',
5-
'contbuild', 'startx', 'perf', 'cvmfs', 'gpumodedefault',
6-
'gpu'],
7-
detect_containers=True,
8-
detect_devices=False,
9-
sched_options=['-A csstaff'],
10-
filename='daint_config'
4+
exclude_feats=['colum*'],
5+
detect_containers=False,
6+
sched_options=[],
7+
time_limit=200,
8+
filename='system_config'
119
)

reframe/core/config.py

Lines changed: 11 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,7 @@ def sources(self):
336336
def subconfig_system(self):
337337
return self._local_system
338338

339-
def load_config_python(self, filename):
339+
def load_config_python(self, filename, validate=True):
340340
try:
341341
mod = util.import_module_from_file(filename)
342342
except ImportError as e:
@@ -346,13 +346,12 @@ def load_config_python(self, filename):
346346
f'could not load Python configuration file: {filename!r}'
347347
) from e
348348

349-
print(dir(mod))
350349
if not hasattr(mod, 'site_configuration'):
351350
raise ConfigError(
352351
f"not a valid Python configuration file: '{filename}'"
353352
)
354353

355-
if mod.site_configuration:
354+
if validate:
356355
self._config_modules.append(mod)
357356
self.update_config(mod.site_configuration, filename)
358357

@@ -669,7 +668,7 @@ def find_config_files(config_path=None, config_file=None):
669668
return res
670669

671670

672-
def load_config(*filenames):
671+
def load_config(*filenames, validate=True):
673672
ret = _SiteConfig()
674673
getlogger().debug('Loading the builtin configuration')
675674
ret.update_config(settings.site_configuration, '<builtin>')
@@ -681,7 +680,7 @@ def load_config(*filenames):
681680
getlogger().debug(f'Loading configuration file: {f!r}')
682681
_, ext = os.path.splitext(f)
683682
if ext == '.py':
684-
ret.load_config_python(f)
683+
ret.load_config_python(f, validate)
685684
elif ext == '.json':
686685
ret.load_config_json(f)
687686
else:
@@ -691,12 +690,11 @@ def load_config(*filenames):
691690

692691

693692
def detect_config(detect_containers: bool = False,
694-
detect_devices: bool = False,
695693
exclude_feats: list = [],
696694
sched_options: list = [],
697-
filename: str = 'system_config'
698-
):
699-
# job_cli_options, exclude_feats
695+
time_limit: int = 200,
696+
filename: str = 'system_config'):
697+
700698
import reframe.core.runtime as rt
701699

702700
# Initialize the Site Configuration object
@@ -710,7 +708,8 @@ def detect_config(detect_containers: bool = False,
710708
site_config.setdefault('name', '')
711709
site_config.setdefault('hostnames', [])
712710
hostname = ret._detect_system(detect_only=True)
713-
site_config['hostnames'] += hostname
711+
site_config['hostnames'] += [hostname]
712+
print(site_config['hostnames'])
714713
site_config['name'] += hostname
715714
msg = color.colorize(
716715
f'Detected hostname: {hostname}', color.GREEN
@@ -737,15 +736,11 @@ def detect_config(detect_containers: bool = False,
737736
msg = color.colorize(f'Launcher set to {launcher_name}', color.GREEN)
738737
getlogger().info(msg)
739738

740-
# Initialize the RuntimeContext with builtin settings
741-
rt.init_runtime(ret)
742-
743739
site_config.setdefault('partitions', [])
744740
# Detect the context with the corresponding scheduler
745741
site_config['partitions'] = scheduler().build_context(
746-
modules_system, launcher(), sched_options,
747-
exclude_feats, rt.runtime().prefix,
748-
detect_containers, detect_devices
742+
modules_system, launcher(), sched_options, time_limit,
743+
exclude_feats, rt.runtime().prefix, detect_containers
749744
)
750745

751746
template_loader = FileSystemLoader(searchpath=os.path.join(

reframe/core/schedulers/__init__.py

Lines changed: 13 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -757,13 +757,13 @@ class ReframeContext(abc.ABC):
757757
'''
758758

759759
def __init__(self, modules_system, launcher, scheduler, prefix,
760-
detect_containers, detect_devices):
760+
time_limit, detect_containers):
761761
self.partitions = []
762762
self._modules_system = modules_system
763763
self._scheduler = scheduler
764764
self._launcher = launcher
765+
self._time_limit = time_limit
765766
self._detect_containers = detect_containers
766-
self._detect_devices = detect_devices
767767
self._p_n = 0 # System partitions counter
768768
self._keep_tmp_dir = False
769769
if prefix == '.':
@@ -775,7 +775,7 @@ def __init__(self, modules_system, launcher, scheduler, prefix,
775775
prefix='reframe_config_detection_',
776776
dir=prefix
777777
)
778-
if detect_containers or detect_devices:
778+
if detect_containers:
779779
getlogger().info(f'Stage directory: {self.TMP_DIR}')
780780

781781
@abc.abstractmethod
@@ -812,9 +812,6 @@ def _check_gpus_count(self, node_devices_slurm: dict,
812812
for gpu_job in node_devices_job:
813813
devices.append({'type': 'gpu',
814814
'model': gpu_job,
815-
# TODO
816-
'arch': c_d.nvidia_gpu_architecture.get(gpu_job) or
817-
c_d.amd_gpu_architecture.get(gpu_job),
818815
'num_devices': node_devices_job[gpu_job]})
819816
gpus_job_count += node_devices_job[gpu_job]
820817

@@ -928,9 +925,6 @@ def _extract_info(self, job: Job):
928925
if job.detect_containers:
929926
job.container_platforms = self._parse_containers(file_path)
930927

931-
if job.detect_devices:
932-
job.devices = self._parse_devices(file_path)
933-
934928
def _create_detection_job(self, name: str, access_node: list,
935929
access_options: list):
936930
'''Create the instance of the job for remote autodetection'''
@@ -942,8 +936,7 @@ def _create_detection_job(self, name: str, access_node: list,
942936
sched_access=access_node,
943937
sched_options=access_options
944938
)
945-
# TODO: move this somewhere to be user defined, not here (maybe yaml)
946-
remote_job.max_pending_time = 20
939+
remote_job.max_pending_time = self._time_limit
947940
remote_job.time_limit = '2m'
948941
remote_job.container_platforms = []
949942
remote_job.devices = {}
@@ -954,14 +947,10 @@ def _generate_job_content(self, job):
954947
if job.detect_containers:
955948
job.content += [c_d.containers_detect_bash]
956949
job.content += ['\n\n\n']
957-
if job.detect_devices:
958-
job.content += [c_d.devices_detect_bash]
959950

960951
def create_login_partition(self):
961-
# TODO: move this somewhere to be user defined, not here (maybe yaml)
962952
max_jobs = 4
963953
time_limit = '2m'
964-
# TODO: improve this (?)
965954
self.partitions.append(
966955
{'name': 'login',
967956
'scheduler': 'local',
@@ -971,34 +960,30 @@ def create_login_partition(self):
971960
'launcher': 'local'})
972961

973962
def create_remote_partition(self, node_feats: tuple,
974-
job_cli_options):
963+
sched_options):
975964

976965
node_features = list(node_feats)
977-
_detect_devices = copy.deepcopy(self._detect_devices)
978966
_detect_containers = copy.deepcopy(self._detect_containers)
979967
self._p_n += 1 # Count the partition that is being created
980-
access_options = copy.deepcopy(job_cli_options)
968+
access_options = copy.deepcopy(sched_options)
981969
access_node = self._scheduler.feats_access_option(node_features)
982970
name = f'partition_{self._p_n}'
983971
getlogger().info(f'{name} : {node_feats}')
984-
# TODO: move this somewhere to be user defined, not here (maybe yaml)
985972
max_jobs = 100
986973
time_limit = '10m'
987974
container_platforms = []
988-
devices = []
989975

976+
# Try to get the devices from the scheduler config
977+
_detect_devices = self._find_devices(node_features)
990978
if _detect_devices:
991-
# If detection of remote devices is requested,
992-
# try to get the devices from the scheduler config
993-
_detect_devices = self._find_devices(node_features)
979+
getlogger().info('GPUs were detected in this node type.')
994980

995981
remote_job = None
996-
if _detect_devices or _detect_containers:
982+
if _detect_containers:
997983
self._keep_tmp_dir = True
998984
remote_job = self._create_detection_job(
999985
name, access_node, access_options
1000986
)
1001-
remote_job.detect_devices = _detect_devices
1002987
remote_job.detect_containers = _detect_containers
1003988
self._generate_job_content(remote_job)
1004989
submission_error, access_node = self.submit_detect_job(
@@ -1041,22 +1026,8 @@ def create_remote_partition(self, node_feats: tuple,
10411026
f'partition "{name}"'
10421027
)
10431028

1044-
if remote_job.devices:
1045-
# Issue any warning regarding missconfigurations
1046-
# between Gres and the detected devices
1047-
getlogger().info(f"\nGPUs found in partition {name}")
1048-
devices = self._check_gpus_count(
1049-
_detect_devices, remote_job.devices
1050-
)
1051-
1052-
elif not remote_job:
1053-
# No jobs were launched so we cannot check the access options
1054-
access_options += access_node
1055-
else:
1056-
# TODO check this
1057-
access_options = access_node
1029+
access_options += access_node
10581030

1059-
# TODO: improve this (?)
10601031
# Create the partition
10611032
self.partitions.append(
10621033
{'name': name,
@@ -1069,14 +1040,13 @@ def create_remote_partition(self, node_feats: tuple,
10691040
'launcher': self._launcher.name,
10701041
'access': access_options,
10711042
'features': node_features+['remote'],
1072-
'devices': devices,
10731043
'container_platforms': container_platforms}
10741044
)
10751045

1076-
def create_partitions(self, job_cli_options):
1046+
def create_partitions(self, sched_options):
10771047
# TODO: asynchronous
10781048
for node in self.node_types:
1079-
self.create_remote_partition(node, job_cli_options)
1049+
self.create_remote_partition(node, sched_options)
10801050
if not self._keep_tmp_dir:
10811051
shutil.rmtree(self.TMP_DIR)
10821052
else:

reframe/core/schedulers/slurm.py

Lines changed: 12 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -586,17 +586,15 @@ def validate(cls) -> bool:
586586
return False
587587

588588
def build_context(self, modules_system, launcher,
589-
job_cli_options, exclude_feats,
590-
prefix, detect_containers,
591-
detect_devices):
589+
sched_options, time_limit,
590+
exclude_feats, prefix, detect_containers):
592591
self._context = _SlurmContext(
593-
modules_system, launcher, self, prefix,
594-
detect_containers=detect_containers,
595-
detect_devices=detect_devices
592+
modules_system, launcher, self, prefix, time_limit,
593+
detect_containers=detect_containers
596594
)
597595
self._context.search_node_types(exclude_feats)
598596
self._context.create_login_partition()
599-
self._context.create_partitions(job_cli_options)
597+
self._context.create_partitions(sched_options)
600598
return self._context.partitions
601599

602600

@@ -728,8 +726,7 @@ def satisfies(self, slurm_constraint: str):
728726
# Convert the Slurm constraint to a Python expression and evaluate it,
729727
# but restrict our syntax to accept only AND or OR constraints and
730728
# their combinations
731-
# print(re.match(r'^[\w\d\(\)\|\&]*$', slurm_constraint))
732-
if not re.match(r'^[\w\d\(\)\|\&]*$', slurm_constraint):
729+
if not re.match(r'^[\w\d\(\)\|\&\-]*$', slurm_constraint):
733730
return False
734731

735732
names = {grp[0]
@@ -741,16 +738,11 @@ def satisfies(self, slurm_constraint: str):
741738
] else key
742739
for key in slurm_constraint.split()])
743740

744-
# print(expr)
745-
# print('mc' in self.active_features and '128gb' in self.active_features)
746741
vars = {n: True for n in self.active_features}
747-
# print(vars)
748742
vars.update({n: False for n in names - self.active_features})
749-
# print(eval(expr, {}, vars))
750743
try:
751-
# print("returning", eval(expr, {}, vars))
752744
return eval(expr, {}, {'vars': vars})
753-
except BaseException as e:
745+
except BaseException:
754746
return False
755747

756748
@property
@@ -792,12 +784,12 @@ def __str__(self):
792784
class _SlurmContext(sched.ReframeContext):
793785

794786
def __init__(self, modules_system: str, launcher: str, scheduler: str,
795-
prefix: str, detect_containers: bool = True,
796-
detect_devices: bool = True, access_opt: list = []):
787+
prefix: str, time_limit: int, detect_containers: bool = True,
788+
access_opt: list = []):
797789

798790
super().__init__(modules_system, launcher,
799-
scheduler, prefix,
800-
detect_containers, detect_devices)
791+
scheduler, prefix, time_limit,
792+
detect_containers)
801793
self.node_types = []
802794
self.default_nodes = []
803795
self.reservations = []
@@ -950,11 +942,11 @@ def _find_devices(self, node_feats: list) -> Union[dict, None]:
950942
def _get_access_partition(self, node_feats: list) -> Union[str, None]:
951943

952944
nodes_info = self._scheduler.allnodes()
945+
node_feats = "&".join(node_feats)
953946
nd_partitions = {tuple(n.partitions)
954947
for n in nodes_info
955948
if n.satisfies(node_feats)}
956949
nd_partitions = set(nd_partitions)
957-
print(nd_partitions)
958950
if len(nd_partitions) > 1:
959951
return None
960952
else:

reframe/frontend/cli.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -879,7 +879,10 @@ def restrict_logging():
879879
printer.adjust_verbosity(calc_verbosity(site_config, options.quiet))
880880

881881
if options.detect_configuration:
882-
site_config = config.load_config(options.detect_configuration)
882+
runtime.init_runtime(site_config)
883+
site_config = config.load_config(
884+
options.detect_configuration, validate=False
885+
)
883886
sys.exit(0)
884887

885888
# Now configure ReFrame according to the user configuration file

reframe/schemas/reframe_config_template.j2

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -39,21 +39,6 @@ site_configuration = {
3939
'extras': {{partition.extras}},
4040
# https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.env_vars
4141
'env_vars': {{partition.env_vars}},
42-
{% if partition.devices %}
43-
# Check if any specific devices were found in this node type
44-
# The gpus found in slurm GRes will be specified here
45-
'devices': [
46-
{% for dev in partition.devices %}
47-
{ 'type': '{{dev.type}}',
48-
'model': '{{dev.model}}',
49-
{% if dev.arch %}
50-
'arch': '{{dev.arch}}',
51-
{% endif %}
52-
'num_devices': {{dev.num_devices}}
53-
},
54-
{% endfor %}
55-
],
56-
{% endif %}
5742
{% if partition.container_platforms %}
5843
# Check if any container platforms are available in these nodes and add them
5944
# https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#container-platform-configuration

0 commit comments

Comments
 (0)