diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb
index 3902a4022..9607007b0 100644
--- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb
+++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb
@@ -44,7 +44,12 @@
   EOH
 end
 
-# move the cluster user's default home directory
+# Move the cluster user's default home directory
+# This script performs the following actions:
+# 1. Creates the new local home directory for the cluster user if it doesn't already exist.
+# 2. Copies the data from the temporary backup directory (/tmp/cluster_user_home) to the new local home directory.
+# 3. Updates the cluster user's home directory path to the new local home directory.
+# 4. Changes the ownership of the new local home directory to the cluster user.
 bash "Move #{node['cluster']['cluster_user_home']}" do
   user 'root'
   group 'root'
@@ -54,8 +59,26 @@
     rsync -a /tmp#{node['cluster']['cluster_user_home']}/ #{node['cluster']['cluster_user_local_home']}
     usermod -d #{node['cluster']['cluster_user_local_home']} #{node['cluster']['cluster_user']}
     chown -R #{node['cluster']['cluster_user']}: #{node['cluster']['cluster_user_local_home']}
-    rm -rf /tmp#{node['cluster']['cluster_user_home']}
-    rm -rf #{node['cluster']['cluster_user_home']}
+  EOH
+end
+
+# Data integrity check and cleanup for temporary backup and original home directory
+# 1. Verifies data integrity by comparing the original home directory and the new local home directory.
+# 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory.
+# 3. If the data integrity check fails, it outputs an error message to stderr, starts sshd and exits with error code 1.
+bash "Verify data integrity for #{node['cluster']['cluster_user_home']}" do
+  user 'root'
+  group 'root'
+  code <<-EOH
+    diff_output=$(diff -r #{node['cluster']['cluster_user_home']} #{node['cluster']['cluster_user_local_home']})
+    if [ $? -eq 0 ]; then
+      rm -rf /tmp#{node['cluster']['cluster_user_home']}
+      rm -rf #{node['cluster']['cluster_user_home']}
+    else
+      echo "Data integrity check failed comparing #{node['cluster']['cluster_user_local_home']} and #{node['cluster']['cluster_user_home']}: $diff_output" >&2
+      systemctl start sshd
+      exit 1
+    fi
   EOH
 end
 
diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb
index b75c04cb8..df13ca3c3 100644
--- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb
+++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb
@@ -19,13 +19,22 @@
   # This is necessary to preserve any data in these directories that was
   # generated during the build of ParallelCluster AMIs after converting to
   # shared storage and backed up to a temporary location previously
-  # Remove the backup after the copy is done
+  # Before removing the backup, verify that nothing in it is missing from /home.
+  # rsync --ignore-existing keeps files already present in /home, so extra or differing
+  # files there are expected; only a diff error or entries found solely in the backup fail.
+  # Remove the backup after the copy is done and the data integrity is verified.
   bash "Restore /home" do
     user 'root'
     group 'root'
     code <<-EOH
       rsync -a --ignore-existing /tmp/home/ /home
-      rm -rf /tmp/home/
+      diff_output=$(diff -rq /tmp/home/ /home)
+      if [ $? -le 1 ] && [[ $diff_output != *"Only in /tmp/home"* ]]; then
+        rm -rf /tmp/home/
+      else
+        echo "Data integrity check failed comparing /home and /tmp/home: $diff_output"
+        exit 1
+      fi
     EOH
   end
 end
diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb
index 83b0d71ec..39dc93d9c 100644
--- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb
+++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb
@@ -19,15 +19,24 @@
   # This is necessary to preserve any data in these directories that was
   # generated during the build of ParallelCluster AMIs after converting to
   # shared storage and backed up to a temporary location previously
-  # Remove the backup after the copy is done
+  # Before removing the backup, verify that nothing in it is missing from the directory.
+  # rsync --ignore-existing keeps files already present in the destination, so extra or differing
+  # files there are expected; only a diff error or entries found solely in the backup fail.
+  # Remove the backup after the copy is done and the data integrity is verified.
   node['cluster']['internal_shared_dirs'].each do |dir|
     bash "Restore #{dir}" do
       user 'root'
       group 'root'
       code <<-EOH
         rsync -a --ignore-existing /tmp#{dir}/ #{dir}
-        rm -rf /tmp#{dir}/
+        diff_output=$(diff -rq /tmp#{dir}/ #{dir})
+        if [ $? -le 1 ] && [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then
+          rm -rf /tmp#{dir}/
+        else
+          echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output"
+          exit 1
+        fi
       EOH
     end
   end
 end
diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb
index 34da35d70..20ee258d2 100644
--- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb
+++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb
@@ -22,7 +22,26 @@
           expect(chef_run.node['cluster']['cluster_user_home']).to eq('/local/home/user')
           is_expected.to start_service("sshd")
         end
+
+        it 'moves the cluster user home directory with data integrity check' do
+          user_home = "/home/user"
+          user_local_home = "/local/home/user"
+          expect(chef_run).to run_bash("Verify data integrity for #{user_home}").with(
+            code: <<-CODE
+    diff_output=$(diff -r #{user_home} #{user_local_home})
+    if [ $? -eq 0 ]; then
+      rm -rf /tmp#{user_home}
+      rm -rf #{user_home}
+    else
+      echo "Data integrity check failed comparing #{user_local_home} and #{user_home}: $diff_output" >&2
+      systemctl start sshd
+      exit 1
+    fi
+            CODE
+          )
+        end
       end
+
       context 'when shared' do
         cached(:chef_run) do
           runner = runner(platform: platform, version: version) do |node|
diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb
index 0198e3261..34123f56b 100644
--- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb
+++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb
@@ -17,6 +17,38 @@
       describe 'call efs for mounting' do
         it { is_expected.to mount_efs('mount internal shared efs') }
       end
+
+      context "when node type is HeadNode" do
+        cached(:chef_run) do
+          runner = runner(platform: platform, version: version) do |node|
+            node.override['cluster']['head_node_private_ip'] = '0.0.0.0'
+            node.override['cluster']['node_type'] = 'HeadNode'
+            node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel)
+            node.override['cluster']['efs_shared_dirs'] = "/opt/parallelcluster/init_shared"
+          end
+          runner.converge(described_recipe)
+        end
+        cached(:node) { chef_run.node }
+
+        describe 'restore internal use shared data with integrity check' do
+          it 'restores internal shared dirs with data integrity check' do
+            chef_run.node['cluster']['internal_shared_dirs'].each do |dir|
+              expect(chef_run).to run_bash("Restore #{dir}").with(
+                code: <<-CODE
+        rsync -a --ignore-existing /tmp#{dir}/ #{dir}
+        diff_output=$(diff -rq /tmp#{dir}/ #{dir})
+        if [ $? -le 1 ] && [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then
+          rm -rf /tmp#{dir}/
+        else
+          echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output"
+          exit 1
+        fi
+                CODE
+              )
+            end
+          end
+        end
+      end
     end
   end
 end