Merge pull request #24 from oracle-quickstart/2.10.2
2.10.2
arnaudfroidmont authored May 24, 2023
2 parents 6640d3e + 0cfde2a commit 3595386
Showing 60 changed files with 839 additions and 162 deletions.
2 changes: 1 addition & 1 deletion autoscaling/crontab/autoscale_slurm.sh
@@ -329,7 +329,7 @@ def getstatus_slurm():
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes

if os.path.isfile(lockfile):
print("Lockfile "+lockfile + " is present, exiting")
print( "Lockfile "+lockfile + " is present, exiting" )
exit()
open(lockfile,'w').close()
try:
2 changes: 2 additions & 0 deletions autoscaling/tf_init/bastion_update.tf
@@ -43,6 +43,8 @@ resource "local_file" "inventory" {
nfs_source_path = var.nfs_source_path,
nfs_options = var.nfs_options,
localdisk = var.localdisk,
log_vol = var.log_vol,
redundancy = var.redundancy,
cluster_nfs_path = var.cluster_nfs_path,
scratch_nfs_path = var.scratch_nfs_path,
cluster_network = var.cluster_network,
2 changes: 2 additions & 0 deletions autoscaling/tf_init/inventory.tpl
@@ -55,6 +55,8 @@ nfs_source_IP=${nfs_source_IP}
nfs_source_path=${nfs_source_path}
nfs_options=${nfs_options}
localdisk=${localdisk}
redundancy=${redundancy}
log_vol=${log_vol}
ldap=${ldap}
queue=${queue}
instance_type=${instance_type}
2 changes: 1 addition & 1 deletion autoscaling/tf_init/versions.tf
@@ -3,7 +3,7 @@ terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "4.112.0"
version = "4.115.0"
}
}
}
25 changes: 8 additions & 17 deletions bastion.tf
@@ -173,23 +173,6 @@ resource "null_resource" "bastion" {
private_key = tls_private_key.ssh.private_key_pem
}
}


provisioner "remote-exec" {
inline = [
"#!/bin/bash",
"chmod 600 /home/${var.bastion_username}/.ssh/cluster.key",
"cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa",
"chmod a+x /opt/oci-hpc/bin/*.sh",
"timeout --foreground 60m /opt/oci-hpc/bin/bastion.sh"
]
connection {
host = local.host
type = "ssh"
user = var.bastion_username
private_key = tls_private_key.ssh.private_key_pem
}
}
}
resource "null_resource" "cluster" {
depends_on = [null_resource.bastion, null_resource.backup, oci_core_cluster_network.cluster_network, oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ]
@@ -224,6 +207,8 @@ resource "null_resource" "cluster" {
nfs_source_path = var.nfs_source_path,
nfs_options = var.nfs_options,
localdisk = var.localdisk,
log_vol = var.log_vol,
redundancy = var.redundancy,
cluster_network = var.cluster_network,
slurm = var.slurm,
rack_aware = var.rack_aware,
@@ -379,6 +364,8 @@ resource "null_resource" "cluster" {
nfs_source_path = var.nfs_source_path,
nfs_options = var.nfs_options,
localdisk = var.localdisk,
log_vol = var.log_vol,
redundancy = var.redundancy,
monitoring = var.monitoring,
hyperthreading = var.hyperthreading,
unsupported = var.unsupported,
@@ -438,6 +425,10 @@ provisioner "file" {
provisioner "remote-exec" {
inline = [
"#!/bin/bash",
"chmod 600 /home/${var.bastion_username}/.ssh/cluster.key",
"cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa",
"chmod a+x /opt/oci-hpc/bin/*.sh",
"timeout --foreground 60m /opt/oci-hpc/bin/bastion.sh",
"chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh",
"chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem",
"echo ${var.configure} > /tmp/configure.conf",
2 changes: 1 addition & 1 deletion bin/bastion.sh
@@ -93,7 +93,7 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
pip install pip --upgrade
pip install pyopenssl --upgrade

# install oci-cli
# install oci-cli (add --oci-cli-version 3.23.3 or version that you know works if the latest does not work )
bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/scripts/install/install.sh)" -s --accept-all-defaults

# install oci module
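Note on the updated comment: the pin is applied by passing the extra flag through to the same installer invocation. A minimal sketch, not part of this commit, using the 3.23.3 release the comment names as an example:

# pinned install; substitute any oci-cli release known to work
bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/scripts/install/install.sh)" -s --accept-all-defaults --oci-cli-version 3.23.3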
111 changes: 59 additions & 52 deletions bin/create_cluster.sh
@@ -96,63 +96,70 @@ do
end_timestamp=`date -u +'%F %T'`
runtime=$((end-start))
if [ $status -eq 0 ]
then
echo "Successfully created $2 in $runtime seconds"
rm currently_building
if [ -f $monitoring_folder/activated ]
then
echo "Successfully created $2 in $runtime seconds"
rm currently_building
if [ -f $monitoring_folder/activated ]
then
ocid=`tail $logs_folder/create_$2_${date}.log | grep "cluster_ocid =" | awk '{print $3}'`
ips=`tail $logs_folder/create_$2_${date}.log | grep "private_ips =" | awk '{print $3}'`
hostnames=`tail $logs_folder/create_$2_${date}.log | grep "hostnames =" | awk '{print $3}'`
ocids=`tail $logs_folder/create_$2_${date}.log | grep "ocids =" | awk '{print $3}'`
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET cluster_OCID='${ocid:1:-1}',created='$end_timestamp',state='running',creation_time=SEC_TO_TIME($runtime) WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1
export IFS=","
for ip in ${ips:1:-5}; do
ip_array+=( $ip )
done
for ocid in ${ocids:1:-5}; do
ocid_array+=( $ocid )
done
for hostname in ${hostnames:1:-1}; do
hostname_array+=( $hostname )
done
for index in "${!ip_array[@]}"; do
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE nodes SET created='$end_timestamp',state='running',hostname='${hostname_array[$index]}',ip='${ip_array[$index]}',node_OCID='${ocid_array[$index]}' WHERE cluster_id='$2_${date}' AND cluster_index=$(($index+1));" >> $logs_folder/create_$2_${date}.log 2>&1
done
fi
break
else
ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error: | grep -o 'Output.*'`
if [ "$ERROR_MSG" == "" ]
then
ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error:`
fi
comp_tmp=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .compartmentId`
compartment_ocid=${comp_tmp:1:-1}
ocid=`tail $logs_folder/create_$2_${date}.log | grep "cluster_ocid =" | awk '{print $3}'`
ips=`tail $logs_folder/create_$2_${date}.log | grep "private_ips =" | awk '{print $3}'`
hostnames=`tail $logs_folder/create_$2_${date}.log | grep "hostnames =" | awk '{print $3}'`
ocids=`tail $logs_folder/create_$2_${date}.log | grep "ocids =" | awk '{print $3}'`
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET cluster_OCID='${ocid:1:-1}',created='$end_timestamp',state='running',creation_time=SEC_TO_TIME($runtime) WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1
export IFS=","
for ip in ${ips:1:-5}; do
ip_array+=( $ip )
done
for ocid in ${ocids:1:-5}; do
ocid_array+=( $ocid )
done
for hostname in ${hostnames:1:-1}; do
hostname_array+=( $hostname )
done
for index in "${!ip_array[@]}"; do
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE nodes SET created='$end_timestamp',state='running',hostname='${hostname_array[$index]}',ip='${ip_array[$index]}',node_OCID='${ocid_array[$index]}' WHERE cluster_id='$2_${date}' AND cluster_index=$(($index+1));" >> $logs_folder/create_$2_${date}.log 2>&1
done
fi
break
else
ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error: | grep -o 'Output.*'`
if [ "$ERROR_MSG" == "" ]
then
ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error:`
fi
comp_tmp=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .compartmentId`
compartment_ocid=${comp_tmp:1:-1}

inst_pool_ocid=`oci compute-management instance-pool list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1
if [ "$inst_pool_ocid" == "" ]
then
inst_pool_work_request_error_messages=""
else
requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${inst_pool_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="LaunchInstancesInPool") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1
inst_pool_work_request_error_messages=`oci work-requests work-request-error list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1
fi
if [ "$inst_pool_work_request_error_messages" == "" ]
then
cn_ocid=`oci compute-management cluster-network list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1
inst_pool_ocid=`oci compute-management instance-pool list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1
if [ "$inst_pool_ocid" == "" ]
then
inst_pool_work_request_error_messages=""
else
requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${inst_pool_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="LaunchInstancesInPool") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1
inst_pool_work_request_error_messages=`oci work-requests work-request-error list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1
fi
if [ "$inst_pool_work_request_error_messages" == "" ] && [ "$cluster_network" == "true" ]
then
cn_ocid=`oci compute-management cluster-network list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1
if [ "$cn_ocid" == "" ]
then
cn_work_request_error_messages=""
else
requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${cn_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="CreateClusterNetworkReservation") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1
cn_work_request_error_messages=`oci work-requests work-request-log-entry list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1
fi
echo "Could not create $2 with $1 nodes in $runtime seconds"
echo "$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages" | tee -a $logs_folder/create_$2_${date}.log 2>&1
fi
else
cn_work_request_error_messages=""
fi
echo "Could not create $2 with $1 nodes in $runtime seconds"
echo "$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages" | tee -a $logs_folder/create_$2_${date}.log 2>&1

if [ -f $monitoring_folder/activated ]
then
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; INSERT INTO cluster_log.errors_timeserie (cluster_id,state,error_log,error_type,nodes,created_on_m,class_name) VALUES ('$2_${date}','creation','$logs_folder/create_$2_${date}.log','$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages','$1','$end_timestamp','$4');" >> $logs_folder/create_$2_${date}.log 2>&1
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET state='deleting',creation_error='`tail $logs_folder/create_$2_${date}.log | grep Error`' WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1
fi
rm currently_building
if [ -f $monitoring_folder/activated ]
then
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; INSERT INTO cluster_log.errors_timeserie (cluster_id,state,error_log,error_type,nodes,created_on_m,class_name) VALUES ('$2_${date}','creation','$logs_folder/create_$2_${date}.log','$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages','$1','$end_timestamp','$4');" >> $logs_folder/create_$2_${date}.log 2>&1
mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET state='deleting',creation_error='`tail $logs_folder/create_$2_${date}.log | grep Error`' WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1
fi
rm currently_building
fi
done

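Aside on the ${var:1:-1} expansions used throughout this script: the OCI CLI and jq print JSON string values wrapped in double quotes, and the substring expansion drops the first and last character to strip them. A minimal sketch (the OCID shown is illustrative; the jq -r form is an equivalent not used in this commit):

comp_tmp=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .compartmentId`
# comp_tmp now holds e.g. "ocid1.compartment.oc1..example" including the quotes
compartment_ocid=${comp_tmp:1:-1}   # first and last character (the quotes) stripped
compartment_ocid=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r .compartmentId)   # jq raw output, no quotes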
8 changes: 5 additions & 3 deletions conf/variables.tpl
@@ -49,12 +49,12 @@ variable "marketplace_version_id" {
"2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826"
"3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229"
"4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709"
"HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0"
"HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0"
"HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0"
"HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0"
"HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1"
"HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0"
"GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1"
"GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0"
"GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0"
}
}

@@ -126,5 +126,7 @@ variable "bastion_username" { default = "${bastion_username}" }
variable "compute_username" { default = "${compute_username}" }

variable "localdisk" { default = "${localdisk}" }
variable "log_vol" { default = "${log_vol}" }
variable "redundancy" { default = "${redundancy}" }

variable "instance_pool_ocpus_denseIO_flex" { default = "##OCPU##"}
2 changes: 2 additions & 0 deletions inventory.tpl
@@ -47,6 +47,8 @@ nfs_source_IP=${nfs_source_IP}
nfs_source_path=${nfs_source_path}
nfs_options=${nfs_options}
localdisk=${localdisk}
redundancy=${redundancy}
log_vol=${log_vol}
instance_pool_ocpus=${instance_pool_ocpus}
queue=${queue}
monitoring=${monitoring}
13 changes: 11 additions & 2 deletions playbooks/new_nodes.yml
@@ -33,6 +33,7 @@
- limits
- mpi-hostfiles
- boot-volume
- mpivars

- hosts: compute
become: true
@@ -48,6 +49,8 @@
- include_role:
name: rdma-interface
when: cluster_network|bool
- include_role:
name: nvidia_peermem

- hosts: bastion,slurm_backup,login,compute
become: true
@@ -165,13 +168,19 @@
when: enroot|default(true)|bool
- include_role:
name: tuned


- hosts: compute
tasks:
- include_role:
name: latency_check
when: cluster_network|bool and not 'GPU' in shape
when: cluster_network|bool and not 'GPU' in shape

- hosts: all
become: true
tasks:
- include_role:
name: fix_ldap
when: ldap|default(true)|bool

- hosts: compute, slurm_backup
vars:
13 changes: 11 additions & 2 deletions playbooks/resize_add.yml
@@ -31,6 +31,7 @@
- ssh
- limits
- boot-volume
- mpivars

- hosts: compute_to_add
become: true
@@ -46,6 +47,8 @@
- include_role:
name: rdma-interface
when: cluster_network|bool
- include_role:
name: nvidia_peermem

- hosts: bastion,slurm_backup,login,compute
become: true
@@ -168,12 +171,18 @@
- include_role:
name: tuned


- hosts: compute_to_add
tasks:
- include_role:
name: latency_check
when: cluster_network|bool and not 'GPU' in shape
when: cluster_network|bool and not 'GPU' in shape

- hosts: all
become: true
tasks:
- include_role:
name: fix_ldap
when: ldap|default(true)|bool

- hosts: compute_to_add
vars:
2 changes: 1 addition & 1 deletion playbooks/roles/cluster-cli/files/cluster
@@ -161,7 +161,7 @@ def list():
confirmation_prompt=True)
@click.option('-n', '--name', prompt='Full Name', required=True)
@click.option('-i', '--uid', default=None, help='Select the userID')
@click.option('-g', '--gid', default=None, help='Add to this groupID')
@click.option('-g', '--gid', default="9876", help='Add to this groupID')
@click.option('-nossh', '--nossh', is_flag=True, default=False, help='Flag to not generate a user-specific ssh-key pair for passwordless ssh.')
def add(user, password, uid, gid, name, nossh):
""" add user """
8 changes: 8 additions & 0 deletions playbooks/roles/etc-hosts/tasks/common.yml
@@ -60,6 +60,14 @@
force: yes
when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names))

- name: Make sure the IP for each node was not left over in another cluster
become: true
lineinfile:
dest: /etc/hosts
regexp: "^127.0.1.1\\s{{hostvars[groups['bastion'][0]]['inventory_hostname']}}.*"
state: absent
when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names))

- name: move /etc/hosts on all compute nodes
become: true
copy:
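The new task deletes any stale "127.0.1.1 <bastion hostname>" line left in /etc/hosts on slurm_backup and login nodes. A rough manual equivalent, assuming a hypothetical hostname bastion-example:

sudo sed -i '/^127\.0\.1\.1[[:space:]]\+bastion-example/d' /etc/hosts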
2 changes: 2 additions & 0 deletions playbooks/roles/fix_ldap/tasks/main.yml
@@ -0,0 +1,2 @@
- include: ubuntu.yml
when: ansible_distribution == 'Ubuntu'
16 changes: 16 additions & 0 deletions playbooks/roles/fix_ldap/tasks/ubuntu.yml
@@ -0,0 +1,16 @@
---
- name: restart nscd
become: true
systemd:
name: nscd
state: restarted
daemon_reload: true
enabled: true

- name: restart sssd
become: true
service:
name: sssd
state: restarted
daemon_reload: true
enabled: true
[Diffs for the remaining changed files are not shown here.]