diff --git a/config.tpl.yml b/config.tpl.yml index c69aceacf..166aa3e9e 100644 --- a/config.tpl.yml +++ b/config.tpl.yml @@ -23,10 +23,10 @@ log_analytics: #name: #subscription_id: # Optional, if not specified the current subscription will be used -# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy. -monitoring: - install_agent: true - +# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy. +monitoring: + install_agent: true + #If set to true, it will create alert rules associated with az-hop. Enablement of alerting will require the specification of an admin email to send alerts to. alerting: enabled: true @@ -46,7 +46,7 @@ anf: homefs_service_level: Standard # dual protocol dual_protocol: false # true to enable SMB support. false by default - # If alerting is enabled, this value will be used to determine when to trigger alerts + # If alerting is enabled, this value will be used to determine when to trigger alerts alert_threshold: 80 # alert when ANF volume reaches this threshold # For small deployments you can use Azure Files instead of ANF for the home directory @@ -64,7 +64,7 @@ mounts: export: '{{anf_home_path}}' # Specify an existing NFS export directory, when using the ANF built in use '{{anf_home_path}}' options: '{{anf_home_opts}}' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev # mount1: -# mountpoint: /mount1 +# mountpoint: /mount1 # server: a.b.c.d # Specify an existing NFS server name or IP # export: myexport1 # Specify an existing NFS export name # options: my_options # Specify the mount options. @@ -80,7 +80,7 @@ network: vnet: name: hpcvnet # Optional - default to hpcvnet #id: # If a vnet id is set then no network will be created and the provided vnet will be used - address_space: "10.0.0.0/23" + address_space: "10.0.0.0/23" # Special VNET Tags # tags: # key1: value1 @@ -143,7 +143,7 @@ network: # asg-deployer: asg-deployer # asg-guacamole: asg-guacamole # asg-mariadb-client: asg-mariadb-client - + # peering: # This list is optional, and can be used to create VNet Peerings in the same subscription. 
# - vnet_name: #"VNET Name to Peer to" # vnet_resource_group: #"Resource Group of the VNET to peer to" @@ -413,7 +413,7 @@ queues: ColocateNodes: false # Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout idle_timeout: 300 - # Set the max number of vm's in a VMSS; requires additional limit raise through support ticket for >100; + # Set the max number of vm's in a VMSS; requires additional limit raise through support ticket for >100; # 100 is default value; lower numbers will improve scaling for single node jobs or jobs with small number of nodes MaxScaleSetSize: 100 - name: hc44rs @@ -498,3 +498,17 @@ applications: enabled: false bc_vizer: enabled: false + cryosparc: + enabled: false + license_id: + admin_user: adminuser + master_vm_size: Standard_D8s_v5 + master_vm_image: azhpc:azhop-compute:centos-7_9:latest + master_hostname: cryosparc-master + master_data_disk_size: 256 + master_data_disk_type: Premium_LRS + target_queues: + - nc24v3 + - hb120v3 + - hc44rs + diff --git a/deploy/purebicep/azhop.bicep b/deploy/purebicep/azhop.bicep index ea1cd0051..1532fe0c1 100644 --- a/deploy/purebicep/azhop.bicep +++ b/deploy/purebicep/azhop.bicep @@ -56,7 +56,7 @@ var createDatabase = (config.queue_manager == 'slurm' && config.slurm.accounting var lustreOssCount = deployLustre ? azhopConfig.lustre.oss_count : 0 -var ossVmConfig = [for oss in range(0, lustreOssCount) : { +var ossVmConfig = [for oss in range(0, lustreOssCount) : { key: 'lustre-oss-${oss}' value: { identity: { @@ -393,6 +393,8 @@ var config = { MariaDB: ['3306', '33060'] Guacamole: ['8080'] WinRM: ['5985', '5986'] + // Applications: CryoSPARC + Applications: ['39000'] } nsg_rules: { @@ -400,7 +402,7 @@ var config = { // // INBOUND RULES // - + // AD communication AllowAdServerTcpIn : ['220', 'Inbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad', 'asg', 'asg-ad-client'] AllowAdServerUdpIn : ['230', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'asg', 'asg-ad-client'] @@ -412,17 +414,17 @@ var config = { AllowAdClientComputeUdpIn : ['290', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'subnet', 'compute', 'asg', 'asg-ad'] AllowAdServerNetappTcpIn : ['300', 'Inbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'subnet', 'netapp', 'asg', 'asg-ad'] AllowAdServerNetappUdpIn : ['310', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'subnet', 'netapp', 'asg', 'asg-ad'] - + // SSH internal rules AllowSshFromJumpboxIn : ['320', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-jumpbox', 'asg', 'asg-ssh'] AllowSshFromComputeIn : ['330', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'asg', 'asg-ssh'] // Only in a deployer VM scenario - AllowSshFromDeployerIn : ['340', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'asg', 'asg-ssh'] + AllowSshFromDeployerIn : ['340', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'asg', 'asg-ssh'] // Only in a deployer VM scenario AllowDeployerToPackerSshIn : ['350', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'subnet', 'admin'] AllowSshToComputeIn : ['360', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-ssh', 'subnet', 'compute'] AllowSshComputeComputeIn : ['365', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'subnet', 'compute'] - + // PBS AllowPbsIn : ['369', 'Inbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'asg', 'asg-pbs-client'] AllowPbsClientIn : ['370', 'Inbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs-client', 'asg', 'asg-pbs'] @@ -430,44 +432,47 @@ var config = { 
AllowComputePbsClientIn : ['390', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs-client'] AllowComputePbsIn : ['400', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs'] AllowComputeComputePbsIn : ['401', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'subnet', 'compute'] - + // SLURM AllowComputeSlurmIn : ['405', 'Inbound', 'Allow', '*', 'Slurmd', 'asg', 'asg-ondemand', 'subnet', 'compute'] - + // CycleCloud AllowCycleWebIn : ['440', 'Inbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-ondemand', 'asg', 'asg-cyclecloud'] AllowCycleClientIn : ['450', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud-client', 'asg', 'asg-cyclecloud'] AllowCycleClientComputeIn : ['460', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'subnet', 'compute', 'asg', 'asg-cyclecloud'] AllowCycleServerIn : ['465', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud', 'asg', 'asg-cyclecloud-client'] - + // OnDemand NoVNC AllowComputeNoVncIn : ['470', 'Inbound', 'Allow', 'Tcp', 'NoVnc', 'subnet', 'compute', 'asg', 'asg-ondemand'] AllowNoVncComputeIn : ['480', 'Inbound', 'Allow', 'Tcp', 'NoVnc', 'asg', 'asg-ondemand', 'subnet', 'compute'] - + // Telegraf / Grafana AllowTelegrafIn : ['490', 'Inbound', 'Allow', 'Tcp', 'Telegraf', 'asg', 'asg-telegraf', 'asg', 'asg-grafana'] AllowComputeTelegrafIn : ['500', 'Inbound', 'Allow', 'Tcp', 'Telegraf', 'subnet', 'compute', 'asg', 'asg-grafana'] AllowGrafanaIn : ['510', 'Inbound', 'Allow', 'Tcp', 'Grafana', 'asg', 'asg-ondemand', 'asg', 'asg-grafana'] - + // Admin and Deployment AllowWinRMIn : ['520', 'Inbound', 'Allow', 'Tcp', 'WinRM', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp'] AllowRdpIn : ['550', 'Inbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp'] AllowWebDeployerIn : ['595', 'Inbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-deployer', 'asg', 'asg-ondemand'] - + // Guacamole AllowGuacamoleRdpIn : ['610', 'Inbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-guacamole', 'subnet', 'compute'] - + // MariaDB AllowMariaDBIn : ['700', 'Inbound', 'Allow', 'Tcp', 'MariaDB', 'asg', 'asg-mariadb-client', 'subnet', 'admin'] + // Cluster applications + AllowApplicationsIn : ['710', 'Inbound', 'Allow', 'All', 'Applications', 'asg', 'asg-ondemand', 'subnet', 'compute'] + // Deny all remaining traffic DenyVnetInbound : ['3100', 'Inbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] - - + + // // Outbound // - + // AD communication AllowAdClientTcpOut : ['200', 'Outbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad-client', 'asg', 'asg-ad'] AllowAdClientUdpOut : ['210', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad-client', 'asg', 'asg-ad'] @@ -479,13 +484,13 @@ var config = { AllowAdServerComputeUdpOut : ['270', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'subnet', 'compute'] AllowAdServerNetappTcpOut : ['280', 'Outbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad', 'subnet', 'netapp'] AllowAdServerNetappUdpOut : ['290', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'subnet', 'netapp'] - + // CycleCloud AllowCycleServerOut : ['300', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud', 'asg', 'asg-cyclecloud-client'] AllowCycleClientOut : ['310', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud-client', 'asg', 'asg-cyclecloud'] AllowComputeCycleClientIn : ['320', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'subnet', 'compute', 'asg', 'asg-cyclecloud'] AllowCycleWebOut : ['330', 'Outbound', 
'Allow', 'Tcp', 'Web', 'asg', 'asg-ondemand', 'asg', 'asg-cyclecloud'] - + // PBS AllowPbsOut : ['340', 'Outbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'asg', 'asg-pbs-client'] AllowPbsClientOut : ['350', 'Outbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs-client', 'asg', 'asg-pbs'] @@ -493,19 +498,19 @@ var config = { AllowPbsClientComputeOut : ['370', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs'] AllowComputePbsClientOut : ['380', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs-client'] AllowComputeComputePbsOut : ['381', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'subnet', 'compute'] - + // SLURM AllowSlurmComputeOut : ['385', 'Outbound', 'Allow', '*', 'Slurmd', 'asg', 'asg-ondemand', 'subnet', 'compute'] - + // NFS AllowNfsOut : ['440', 'Outbound', 'Allow', '*', 'Nfs', 'asg', 'asg-nfs-client', 'subnet', 'netapp'] AllowNfsComputeOut : ['450', 'Outbound', 'Allow', '*', 'Nfs', 'subnet', 'compute', 'subnet', 'netapp'] - + // Telegraf / Grafana AllowTelegrafOut : ['460', 'Outbound', 'Allow', 'Tcp', 'Telegraf', 'asg', 'asg-telegraf', 'asg', 'asg-grafana'] AllowComputeTelegrafOut : ['470', 'Outbound', 'Allow', 'Tcp', 'Telegraf', 'subnet', 'compute', 'asg', 'asg-grafana'] AllowGrafanaOut : ['480', 'Outbound', 'Allow', 'Tcp', 'Grafana', 'asg', 'asg-ondemand', 'asg', 'asg-grafana'] - + // SSH internal rules AllowSshFromJumpboxOut : ['490', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-jumpbox', 'asg', 'asg-ssh'] AllowSshComputeOut : ['500', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-ssh', 'subnet', 'compute'] @@ -513,23 +518,26 @@ var config = { AllowSshDeployerPackerOut : ['520', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'subnet', 'admin'] AllowSshFromComputeOut : ['530', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'asg', 'asg-ssh'] AllowSshComputeComputeOut : ['540', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'subnet', 'compute'] - + // OnDemand NoVNC AllowComputeNoVncOut : ['550', 'Outbound', 'Allow', 'Tcp', 'NoVnc', 'subnet', 'compute', 'asg', 'asg-ondemand'] AllowNoVncComputeOut : ['560', 'Outbound', 'Allow', 'Tcp', 'NoVnc', 'asg', 'asg-ondemand', 'subnet', 'compute'] - + // Admin and Deployment AllowRdpOut : ['570', 'Outbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp'] AllowWinRMOut : ['580', 'Outbound', 'Allow', 'Tcp', 'WinRM', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp'] AllowDnsOut : ['590', 'Outbound', 'Allow', '*', 'Dns', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] AllowWebDeployerOut : ['595', 'Outbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-deployer', 'asg', 'asg-ondemand'] - + // Guacamole AllowGuacamoleRdpOut : ['610', 'Outbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-guacamole', 'subnet', 'compute'] - + // MariaDB AllowMariaDBOut : ['700', 'Outbound', 'Allow', 'Tcp', 'MariaDB', 'asg', 'asg-mariadb-client', 'subnet', 'admin'] - + + // Cluster applications + AllowApplicationsOut : ['710', 'Outbound', 'Allow', 'All', 'Applications', 'asg', 'asg-ondemand', 'subnet', 'compute'] + // Deny all remaining traffic and allow Internet access AllowInternetOutBound : ['3000', 'Outbound', 'Allow', 'Tcp', 'All', 'tag', 'VirtualNetwork', 'tag', 'Internet'] DenyVnetOutbound : ['3100', 'Outbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] diff --git a/playbooks/cccluster.yml b/playbooks/cccluster.yml index a11c150af..f4e519a24 100644 --- a/playbooks/cccluster.yml +++ b/playbooks/cccluster.yml @@ -93,9 +93,9 @@ include_vars: file: '{{lookup_img_file}}' 
- - include_role: + - include_role: name: cyclecloud_cluster - apply: + apply: become: true vars: cc_region: '{{location}}' @@ -108,7 +108,7 @@ cc_domain: '{{domain_name}}' cc_queue_manager: '{{ queue_manager | default("openpbs") }}' influxdb_database_name: "telegraf" - telegraf_influxdb_urls: + telegraf_influxdb_urls: - "http://grafana:8086" cc_slurm_version: '{{slurm.slurm_version | default("20.11.9")}}-1' slurm_uid: 11100 @@ -119,6 +119,7 @@ enroot_scratch_dir: '/mnt/resource' cvmfs_eessi_enabled: '{{cvmfs_eessi.enabled | default(false)}}' cc_enable_remote_winviz: '{{enable_remote_winviz | default(false)}}' + cryosparc_enabled: '{{applications.cryosparc.enabled | default(false)}}' # Generate the node array core lookup file for ondemand - will be only run if the marker file for ondemand exists - import_tasks: nodearray_lookup.yml diff --git a/playbooks/linux.yml b/playbooks/linux.yml index 39bbcb79d..58285af57 100644 --- a/playbooks/linux.yml +++ b/playbooks/linux.yml @@ -19,7 +19,7 @@ line: 'AllowTcpForwarding yes' - name: restart sshd service: - name: sshd + name: sshd state: restarted - name: update packages for security become: true @@ -71,3 +71,9 @@ mode: '0755' run_once : true + - name: Create {{homedir_mountpoint}}/apps directory + file: + path: '{{homedir_mountpoint}}/apps' + state: directory + mode: '0755' + run_once : true diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc.desktop b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc.desktop new file mode 100644 index 000000000..c95721942 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Type=Link +Version=1.0 +Name=CryoSPARC +Icon=/usr/share/icons/hicolor/16x16/apps/cryosparc.png +URL=http://cryosparc-master:39000/ +Name[en_US.UTF-8]=CryoSPARC +Categories=Education diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc_16.png b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc_16.png new file mode 100644 index 000000000..5f4ed1066 Binary files /dev/null and b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/cryosparc_16.png differ diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_info.json b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_info.json new file mode 100644 index 000000000..212147251 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_info.json @@ -0,0 +1,11 @@ +{ + "name": "PARTITION", + "worker_bin_path": "/anfhome/apps/cryosparc/cryosparc_worker/bin/cryosparcw", + "cache_path": "/mnt/resource", + "send_cmd_tpl": "{{ command }}", + "qsub_cmd_tpl": "sbatch {{ script_path_abs }}", + "qstat_cmd_tpl": "squeue -j {{ cluster_job_id }}", + "qdel_cmd_tpl": "scancel {{ cluster_job_id }}", + "qinfo_cmd_tpl": "sinfo", + "transfer_cmd_tpl": "scp {{ src_path }} loginnode:{{ dest_path }}" +} diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_script.sh b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_script.sh new file mode 100644 index 000000000..af537ceab --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/files/slurm_cluster_script.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --partition=PARTITION 
+#SBATCH --nodes=1 +#SBATCH --ntasks-per-node={{ num_cpu }} +#SBATCH --cpus-per-task=1 +#SBATCH --threads-per-core=1 +#SBATCH --gres=gpu:{{ num_gpu }} +#SBATCH --mem={{ (ram_gb*1000)|int }}MB +#SBATCH --job-name cryosparc_{{ project_uid }}_{{ job_uid }} +#SBATCH --output={{ job_log_path_abs }} +#SBATCH --error={{ job_log_path_abs }} + +{{ run_cmd }} diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/01-setup_data_disk.sh.j2 b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/01-setup_data_disk.sh.j2 new file mode 100644 index 000000000..a1c672c67 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/01-setup_data_disk.sh.j2 @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +# Run only on CryoSPARC master node +[ $(hostname) != {{ applications.cryosparc.master_hostname }} ] && exit 0 + +parted /dev/sdb mktable gpt +parted /dev/sdb mkpart primary ext4 0% 100% +mkfs.ext4 /dev/sdb1 +DEV_UUID=$(blkid -s UUID -o value /dev/sdb1) +printf 'UUID=%s /cryosparc_data ext4 defaults 0 0\n' $DEV_UUID >> /etc/fstab +mkdir /cryosparc_data +mount -a +chown {{ applications.cryosparc.admin_user }}: /cryosparc_data diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/02-download_cryosparc.sh.j2 b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/02-download_cryosparc.sh.j2 new file mode 100644 index 000000000..f172cb527 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/02-download_cryosparc.sh.j2 @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +# Run only on CryoSPARC master node +[ $(hostname) != {{ applications.cryosparc.master_hostname }} ] && exit 0 + +INSTALL_DIR=/anfhome/apps/cryosparc +SOURCES_DIR=${INSTALL_DIR}/sources + +mkdir -p ${SOURCES_DIR} +cd ${SOURCES_DIR} + +for COMPONENT in master worker; do + if [ -s ${SOURCES_DIR}/cryosparc_${COMPONENT}.tar.gz ]; then + echo "cryosparc_${COMPONENT}.tar.gz already downloaded" + else + echo "Downloading cryosparc_${COMPONENT}.tar.gz" + curl -L https://get.cryosparc.com/download/${COMPONENT}-latest/{{ applications.cryosparc.license_id }} -o cryosparc_${COMPONENT}.tar.gz + fi +done + +chown -R {{ applications.cryosparc.admin_user }}: ${INSTALL_DIR} diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/03-install_cryosparc.sh.j2 b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/03-install_cryosparc.sh.j2 new file mode 100644 index 000000000..f49947846 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/03-install_cryosparc.sh.j2 @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +export INSTALL_DIR=/anfhome/apps/cryosparc +SOURCES_DIR=${INSTALL_DIR}/sources + +# connect worker nodes to the master +if [[ $(hostname) != {{ applications.cryosparc.master_hostname }} ]]; then + sudo chmod 777 /mnt + sudo su -c "$INSTALL_DIR/cryosparc_worker/bin/cryosparcw connect --worker $(hostname) \ + --master {{ applications.cryosparc.master_hostname }} \ + --port 39000 --ssdpath /mnt/" {{ applications.cryosparc.admin_user }} +fi + +# Run only on CryoSPARC master node +[ $(hostname) != {{ applications.cryosparc.master_hostname }} ] && exit 0 + +################################################### +# Install CryoSPARC master locally on master node # +################################################### + +# CryoSPARC master must be installed as admin user +sudo chown {{ applications.cryosparc.admin_user 
}} /cryosparc_data +sudo -i -u {{ applications.cryosparc.admin_user }} bash << EOF +cd /cryosparc_data +tar xzf ${SOURCES_DIR}/cryosparc_master.tar.gz +cd cryosparc_master +./install.sh --license {{ applications.cryosparc.license_id }} \ + --hostname $(hostname -f) \ + --dbpath /cryosparc_data/cryosparc_database \ + --port 39000 \ + --yes + +# The service log target directory must be created manually +# otherwise the systemd service will fail to start (likely a bug) +mkdir -p /cryosparc_data/cryosparc_master/run +EOF + +# Install CryoSPARC systemd service +eval $(/cryosparc_data/cryosparc_master/bin/cryosparcm env) +cd /cryosparc_data/cryosparc_master/systemd +env "CRYOSPARC_ROOT_DIR=$CRYOSPARC_ROOT_DIR" ./install_services.sh + +systemctl enable cryosparc-supervisor.service +systemctl start cryosparc-supervisor.service + +# Create admin user in CryoSPARC +sudo -i -u {{ applications.cryosparc.admin_user }} bash << EOF +eval $(/cryosparc_data/cryosparc_master/bin/cryosparcm env) +cryosparcm createuser --email {{ applications.cryosparc.admin_user }}@azhop.com \ + --username {{ applications.cryosparc.admin_user }} \ + --firstname Admin \ + --lastname User \ + --password {{ cryosparc_admin_pwd.stdout }} +EOF + +############################################################# +# Install CryoSPARC worker on shared applications directory # +############################################################# + +cd ${INSTALL_DIR} +tar xzf ${SOURCES_DIR}/cryosparc_worker.tar.gz +chown -R {{ applications.cryosparc.admin_user }}: ./cryosparc*_worker + +# Get CUDA library path +LIBCUDART_PATH=$(sudo find /usr/local -name libcudart.so) +CUDA_PATH=$(echo $LIBCUDART_PATH | cut -d'/' -f-4) +export CUDA_PATH + +# CryoSPARC must be installed as admin user +sudo -i -u {{ applications.cryosparc.admin_user }} bash << EOF +cd ${INSTALL_DIR}/cryosparc_worker +./install.sh --license {{ applications.cryosparc.license_id }} \ + --yes +EOF diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/04-import_slurm_cluster.sh.j2 b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/04-import_slurm_cluster.sh.j2 new file mode 100644 index 000000000..d08ab4075 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/04-import_slurm_cluster.sh.j2 @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +# Run only on CryoSPARC master node +[ $(hostname) != {{ applications.cryosparc.master_hostname }} ] && exit 0 + +{% for partition in applications.cryosparc.target_queues %} + export TARGET_DIR=/cryosparc_data/cryosparc_cluster_{{ partition }} + mkdir -p ${TARGET_DIR} + cp $CYCLECLOUD_SPEC_PATH/files/* ${TARGET_DIR} + sed -i 's/PARTITION/{{ partition }}/g' ${TARGET_DIR}/* + chown -R {{ applications.cryosparc.admin_user }}: ${TARGET_DIR} + + # Import Slurm cluster + sudo -i -u {{ applications.cryosparc.admin_user }} bash << EOF + cd ${TARGET_DIR} + /cryosparc_data/cryosparc_master/bin/cryosparcm cluster connect +EOF +{% endfor %} diff --git a/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/05-install_app_menu_shortcut.sh.j2 b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/05-install_app_menu_shortcut.sh.j2 new file mode 100644 index 000000000..d84684a53 --- /dev/null +++ b/playbooks/roles/cyclecloud_cluster/projects/cryosparc/cluster-init/scripts/05-install_app_menu_shortcut.sh.j2 @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +APP_SHORTCUT_DIR=/usr/share/applications +ICON_DIR=/usr/share/icons/hicolor/16x16/apps + +cp
$CYCLECLOUD_SPEC_PATH/files/cryosparc.desktop ${APP_SHORTCUT_DIR} +chmod 644 ${APP_SHORTCUT_DIR}/cryosparc.desktop +cp $CYCLECLOUD_SPEC_PATH/files/cryosparc_16.png ${ICON_DIR}/cryosparc.png +chmod 644 ${ICON_DIR}/cryosparc.png diff --git a/playbooks/roles/cyclecloud_cluster/tasks/main.yml b/playbooks/roles/cyclecloud_cluster/tasks/main.yml index 88df70c6b..6767031ed 100644 --- a/playbooks/roles/cyclecloud_cluster/tasks/main.yml +++ b/playbooks/roles/cyclecloud_cluster/tasks/main.yml @@ -21,44 +21,44 @@ src: '{{role_path}}/projects/common/cluster-init' dest: '{{common_project_root}}/specs/default/' -- name: Copy mountnfs file. +- name: Copy mountnfs file template: src: '{{role_path}}/projects/common/cluster-init/scripts/1-mountnfs.sh.j2' dest: '{{common_project_root}}/specs/default/cluster-init/scripts/1-mountnfs.sh' mode: 0777 -- name: Add lustre script +- name: Add lustre script template: src: '{{role_path}}/projects/common/cluster-init/scripts/2-mountlustre.sh.j2' dest: '{{common_project_root}}/specs/default/cluster-init/scripts/2-mountlustre.sh' mode: 0777 when: ( lustre.create | default(false) ) -- name: Add Linux joindomain script +- name: Add Linux joindomain script template: src: '{{role_path}}/projects/common/cluster-init/scripts/3-joindomain.sh.j2' dest: '{{common_project_root}}/specs/default/cluster-init/scripts/3-joindomain.sh' mode: 0777 -- name: Add Windows joindomain script +- name: Add Windows joindomain script template: src: '{{role_path}}/projects/common/cluster-init/scripts/3-joindomain.bat.j2' dest: '{{common_project_root}}/specs/default/cluster-init/scripts/3-joindomain.bat' mode: 0777 -- name: Add default script +- name: Add default script template: src: '{{role_path}}/projects/common/cluster-init/scripts/5-default.sh.j2' dest: '{{common_project_root}}/specs/default/cluster-init/scripts/5-default.sh' mode: 0777 -- name: Add telegraf configuration file +- name: Add telegraf configuration file template: src: '{{role_path}}/projects/common/cluster-init/files/telegraf.conf.j2' dest: '{{common_project_root}}/specs/default/cluster-init/files/telegraf.conf' mode: 0600 -- name: Add nhc configuration file +- name: Add nhc configuration file template: src: '{{role_path}}/projects/common/cluster-init/files/nhc/nhc_common.conf.j2' dest: '{{common_project_root}}/specs/default/cluster-init/files/nhc/nhc_common.conf' @@ -117,6 +117,62 @@ command: '/usr/local/bin/cyclecloud start_cluster pbs1' when: cc_queue_manager == "openpbs" +- name: CryoSPARC CycleCloud project + block: + - name: Read CryoSPARC admin password from KV + command: az keyvault secret show --vault-name {{key_vault}} -n {{database_user}}-password --query "value" -o tsv + delegate_to: localhost + connection: local + register: cryosparc_admin_pwd + become: false + + - name: Create cryosparc project + command: '/usr/local/bin/cyclecloud project init cryosparc' + args: + chdir: '{{project_root}}' + creates: '{{cryosparc_project_root}}/project.ini' + + - name: Create common cryosparc project scripts + template: + src: '{{role_path}}/projects/cryosparc/cluster-init/scripts/{{ item }}.j2' + dest: '{{cryosparc_project_root}}/specs/default/cluster-init/scripts/{{ item }}' + mode: 0777 + with_items: + - 01-setup_data_disk.sh + - 02-download_cryosparc.sh + - 03-install_cryosparc.sh + - 05-install_app_menu_shortcut.sh + + - name: Create import Slurm cluster project script + template: + src: '{{role_path}}/projects/cryosparc/cluster-init/scripts/04-import_slurm_cluster.sh.j2' + dest: 
'{{cryosparc_project_root}}/specs/default/cluster-init/scripts/04-import_slurm_cluster.sh' + mode: 0777 + when: cc_queue_manager == "slurm" + + - name: Copy CryoSPARC Slurm cluster definition files + copy: + src: '{{role_path}}/projects/cryosparc/cluster-init/files/slurm_{{ item }}' + dest: '{{cryosparc_project_root}}/specs/default/cluster-init/files/{{ item }}' + with_items: + - cluster_info.json + - cluster_script.sh + when: cc_queue_manager == "slurm" + + - name: Copy CryoSPARC app shortcut files + copy: + src: '{{role_path}}/projects/cryosparc/cluster-init/files/{{ item }}' + dest: '{{cryosparc_project_root}}/specs/default/cluster-init/files/{{ item }}' + with_items: + - cryosparc.desktop + - cryosparc_16.png + + - name: Upload cryosparc CycleCloud project + command: '/usr/local/bin/cyclecloud project upload' + args: + chdir: '{{cryosparc_project_root}}' + when: cryosparc_enabled | bool + - name: SLURM template and optional Enroot project block: - name: Add azhop-Slurm template @@ -138,6 +194,7 @@ args: chdir: '{{project_root}}' creates: '{{enroot_project_root}}/project.ini' + - name: Create install_pyxis.sh template: src: '{{role_path}}/projects/enroot/cluster-init/scripts/1-install_pyxis.sh.j2' diff --git a/playbooks/roles/cyclecloud_cluster/templates/azhop-slurm.txt.j2 b/playbooks/roles/cyclecloud_cluster/templates/azhop-slurm.txt.j2 index 485565498..a61794ce7 100644 --- a/playbooks/roles/cyclecloud_cluster/templates/azhop-slurm.txt.j2 +++ b/playbooks/roles/cyclecloud_cluster/templates/azhop-slurm.txt.j2 @@ -5,7 +5,7 @@ [cluster azhop-slurm] FormLayout = selectionpanel -Category = Azure HPC OnDemand Platform +Category = Azure HPC OnDemand Platform Autoscale = true @@ -74,7 +74,7 @@ echo "cloud-init done" >> /tmp/cloud-init.txt cyclecloud.cluster.autoscale.idle_time_before_jobs = {{autoscale.idle_timeout}} {% endif %} - [[[volume boot]]] + [[[volume boot]]] StorageAccountType = StandardSSD_LRS [[[cluster-init cyclecloud/slurm:default:{{cyclecloud_slurm_release}}]]] @@ -99,7 +99,7 @@ echo "cloud-init done" >> /tmp/cloud-init.txt {% for queue in cc_queues %} [[nodearray {{ queue.name }}]] Extends = nodearraybase - MachineType = {{ queue.vm_size }} + MachineType = {{ queue.vm_size }} MaxCoreCount = {{ queue.max_core_count }} {% if queue.EnableAcceleratedNetworking is defined %} EnableAcceleratedNetworking = {{ queue.EnableAcceleratedNetworking }} @@ -135,4 +135,26 @@ echo "cloud-init done" >> /tmp/cloud-init.txt cyclecloud.cluster.autoscale.idle_time_before_jobs = {{queue.idle_timeout}} {% endif %} [[[cluster-init enroot:default:1.0.0]]] + {% if queue.name.startswith('viz') %} + [[[cluster-init cryosparc:default:1.0.0]]] + {% endif %} {% endfor %} + +{% if cryosparc_enabled %} + [[node {{ applications.cryosparc.master_hostname }}]] + MachineType = {{applications.cryosparc.master_vm_size}} + EnableAcceleratedNetworking = true + ImageName = {{applications.cryosparc.master_vm_image}} + + [[[volume boot]]] + size = 64 + StorageAccountType = StandardSSD_LRS + + [[[volume cryosparc]]] + size = {{applications.cryosparc.master_data_disk_size}} + StorageAccountType = {{applications.cryosparc.master_data_disk_type}} + Persistent = true + + [[[cluster-init cyclecloud/slurm:login:{{cyclecloud_slurm_release}}]]] + [[[cluster-init cryosparc:default:1.0.0]]] +{% endif %} diff --git a/playbooks/roles/cyclecloud_cluster/vars/main.yml b/playbooks/roles/cyclecloud_cluster/vars/main.yml index cf6b8fd7d..d34e887e3 100644 --- a/playbooks/roles/cyclecloud_cluster/vars/main.yml +++
b/playbooks/roles/cyclecloud_cluster/vars/main.yml @@ -2,6 +2,7 @@ project_root: /root/projects common_project_root: '{{project_root}}/common' openpbs_project_root: '{{project_root}}/openpbs' enroot_project_root: '{{project_root}}/enroot' +cryosparc_project_root: '{{project_root}}/cryosparc' cc_queue_manager: cyclecloud_slurm_release: 2.7.0 slurm_version: '{{cc_slurm_version}}' @@ -12,4 +13,4 @@ slurm_gid: 11100 munge_uid: 11101 munge_gid: 11101 cvmfs_eessi_enabled: false -cc_enable_remote_winviz: false \ No newline at end of file +cc_enable_remote_winviz: false
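Taken together, the changes above wire CryoSPARC into az-hop: the config block enables the application, the new Applications NSG rules open port 39000 from the OnDemand host to the compute subnet, and the cluster-init scripts install the master on its dedicated data disk, register each entry of applications.cryosparc.target_queues as a Slurm lane, and add a desktop shortcut for the web UI. The following is a minimal post-deployment check to run on the master node, a sketch only: it assumes the defaults shown above (admin_user adminuser, master_hostname cryosparc-master, port 39000) and uses hb120v3 as an example partition.

#!/bin/bash
# Sketch: verify the CryoSPARC master set up by the cluster-init scripts above.
# Assumptions: admin_user=adminuser, master_hostname=cryosparc-master, port 39000,
# and hb120v3 present in applications.cryosparc.target_queues.
set -euo pipefail

# 03-install_cryosparc.sh.j2 runs the master under a systemd unit.
systemctl is-active cryosparc-supervisor.service

# cryosparcm is provided by the master install on the data disk.
sudo -i -u adminuser /cryosparc_data/cryosparc_master/bin/cryosparcm status

# 04-import_slurm_cluster.sh.j2 creates one directory per target queue, substitutes
# PARTITION in cluster_info.json / cluster_script.sh, and runs "cryosparcm cluster connect"
# from that directory. Re-running a single lane by hand looks like this:
sudo -i -u adminuser bash <<'EOF'
cd /cryosparc_data/cryosparc_cluster_hb120v3
/cryosparc_data/cryosparc_master/bin/cryosparcm cluster connect
EOF

# The desktop shortcut points at the master web UI; it should answer over HTTP.
curl -s -o /dev/null -w '%{http_code}\n' http://cryosparc-master:39000/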