[Develop] Add data integrity checks before removing backup directories (aws#2727)

* feat: Add data integrity checks before removing backup directories
  * Updated `restore_home_shared_data.rb` to include a diff check before removing the /tmp/home/ backup directory.
  * Updated `restore_internal_use_shared_data.rb` to include a diff check before removing the /tmp backup directory for each internal shared directory.
  * Updated `config_default_user_home.rb` to include data integrity checks during the process of moving the cluster user's default home directory.
  * Added new Spec tests.

* This update prevents potential data loss or inconsistency during the restoration process; a minimal sketch of the shared verify-then-remove pattern follows below.
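Each recipe applies the same pattern: copy the backup into place with rsync, verify the copy with a recursive diff, and only then remove the backup. A minimal standalone sketch of that pattern, using placeholder paths rather than the actual cluster paths:

    backup_dir=/tmp/example_backup               # placeholder path, not a real cluster path
    target_dir=/example/target                   # placeholder path

    rsync -a --ignore-existing "${backup_dir}/" "${target_dir}"

    diff_output=$(diff -r "${backup_dir}" "${target_dir}")
    if [ $? -eq 0 ]; then
      rm -rf "${backup_dir}"                     # trees match, safe to drop the backup
    else
      echo "Data integrity check failed: ${diff_output}" >&2
      exit 1
    fi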
hehe7318 authored May 28, 2024
1 parent 15869b1 commit a252e53
Showing 5 changed files with 99 additions and 7 deletions.
config_default_user_home.rb
@@ -44,7 +44,12 @@
EOH
end

# move the cluster user's default home directory
# Move the cluster user's default home directory
# This script performs the following actions:
# 1. Creates the new local home directory for the cluster user if it doesn't already exist.
# 2. Copies the data from the temporary backup directory (/tmp/cluster_user_home) to the new local home directory.
# 3. Updates the cluster user's home directory path to the new local home directory.
# 4. Changes the ownership of the new local home directory to the cluster user.
bash "Move #{node['cluster']['cluster_user_home']}" do
user 'root'
group 'root'
@@ -54,8 +59,26 @@
rsync -a /tmp#{node['cluster']['cluster_user_home']}/ #{node['cluster']['cluster_user_local_home']}
usermod -d #{node['cluster']['cluster_user_local_home']} #{node['cluster']['cluster_user']}
chown -R #{node['cluster']['cluster_user']}: #{node['cluster']['cluster_user_local_home']}
rm -rf /tmp#{node['cluster']['cluster_user_home']}
rm -rf #{node['cluster']['cluster_user_home']}
EOH
end

# Data integrity check and cleanup for temporary backup and original home directory
# 1. Verifies data integrity by comparing the temporary backup directory and the new local home directory.
# 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory.
# 3. If the data integrity check fails, it outputs an error message and exits with error code 1.
bash "Verify data integrity for #{node['cluster']['cluster_user_home']}" do
user 'root'
group 'root'
code <<-EOH
diff_output=$(diff -r #{node['cluster']['cluster_user_home']} #{node['cluster']['cluster_user_local_home']})
if [ $? -eq 0 ]; then
rm -rf /tmp#{node['cluster']['cluster_user_home']}
rm -rf #{node['cluster']['cluster_user_home']}
else
echo "Data integrity check failed comparing #{node['cluster']['cluster_user_local_home']} and #{node['cluster']['cluster_user_home']}: $diff_output" >&2
systemctl start sshd
exit 1
fi
EOH
end
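
A note on the idiom above: for a plain variable assignment, the exit status of the `diff` inside the command substitution becomes the exit status of the assignment itself, so the `$?` test on the next line really does reflect whether the directories matched. A small illustration with throwaway paths that are not part of the recipes:

    mkdir -p /tmp/demo_a /tmp/demo_b             # throwaway directories for the demo
    echo same > /tmp/demo_a/file
    echo same > /tmp/demo_b/file

    out=$(diff -r /tmp/demo_a /tmp/demo_b)       # identical trees: diff exits 0
    echo "exit status after assignment: $?"      # prints 0

    echo different > /tmp/demo_b/file
    out=$(diff -r /tmp/demo_a /tmp/demo_b)       # trees differ: diff exits 1
    echo "exit status after assignment: $?"      # prints 1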

restore_home_shared_data.rb
@@ -19,13 +19,22 @@
# This is necessary to preserve any data in these directories that was
# generated during the build of ParallelCluster AMIs after converting to
# shared storage and backed up to a temporary location previously
# Remove the backup after the copy is done
# Before removing the backup, ensure the data in the new home is the same
# as the original to avoid any data loss or inconsistency. This is done
# by using rsync to copy the data and diff to check for differences.
# Remove the backup after the copy is done and the data integrity is verified.
bash "Restore /home" do
user 'root'
group 'root'
code <<-EOH
rsync -a --ignore-existing /tmp/home/ /home
rm -rf /tmp/home/
diff_output=$(diff -r /tmp/home/ /home)
if [ $? -eq 0 ]; then
rm -rf /tmp/home/
else
echo "Data integrity check failed comparing /home and /tmp/home: $diff_output"
exit 1
fi
EOH
end
end
restore_internal_use_shared_data.rb
@@ -19,14 +19,23 @@
# This is necessary to preserve any data in these directories that was
# generated during the build of ParallelCluster AMIs after converting to
# shared storage and backed up to a temporary location previously
# Remove the backup after the copy is done
# Before removing the backup, ensure the data in the new directory is the same
# as the original to avoid any data loss or inconsistency. This is done
# by using rsync to copy the data and diff to check for differences.
# Remove the backup after the copy is done and the data integrity is verified.
node['cluster']['internal_shared_dirs'].each do |dir|
bash "Restore #{dir}" do
user 'root'
group 'root'
code <<-EOH
rsync -a --ignore-existing /tmp#{dir}/ #{dir}
rm -rf /tmp#{dir}/
diff_output=$(diff -r /tmp#{dir}/ #{dir})
if [ $? -eq 0 ]; then
rm -rf /tmp#{dir}/
else
echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output"
exit 1
fi
EOH
end
end
Spec tests for config_default_user_home
@@ -22,7 +22,26 @@
expect(chef_run.node['cluster']['cluster_user_home']).to eq('/local/home/user')
is_expected.to start_service("sshd")
end

it 'moves the cluster user home directory with data integrity check' do
user_home = "/home/user"
user_local_home = "/local/home/user"
expect(chef_run).to run_bash("Verify data integrity for #{user_home}").with(
code: <<-CODE
diff_output=$(diff -r #{user_home} #{user_local_home})
if [ $? -eq 0 ]; then
rm -rf /tmp#{user_home}
rm -rf #{user_home}
else
echo "Data integrity check failed comparing #{user_local_home} and #{user_home}: $diff_output" >&2
systemctl start sshd
exit 1
fi
CODE
)
end
end

context 'when shared' do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
Spec tests covering restore of internal use shared data
@@ -17,6 +17,38 @@
describe 'call efs for mounting' do
it { is_expected.to mount_efs('mount internal shared efs') }
end

context "when node type is HeadNode" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['head_node_private_ip'] = '0.0.0.0'
node.override['cluster']['node_type'] = 'HeadNode'
node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel)
node.override['cluster']['efs_shared_dirs'] = "/opt/parallelcluster/init_shared"
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

describe 'restore internal use shared data with integrity check' do
it 'restores internal shared dirs with data integrity check' do
chef_run.node['cluster']['internal_shared_dirs'].each do |dir|
expect(chef_run).to run_bash("Restore #{dir}").with(
code: <<-CODE
rsync -a --ignore-existing /tmp#{dir}/ #{dir}
diff_output=$(diff -r /tmp#{dir}/ #{dir})
if [ $? -eq 0 ]; then
rm -rf /tmp#{dir}/
else
echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output"
exit 1
fi
CODE
)
end
end
end
end
end
end
end
