Uses volume UUIDs for disk identification
 - Traditionally the vSphere CPI used host-relative SCSI IDs for its disk identification to the BOSH agent.
   This does not work in a multi-SCSI-controller environment such as Kubernetes, where disk devices may
   be reordered arbitrarily at reboot (e.g. /dev/sda can't be assumed to be the root disk).
 - The existing behavior can leave VMs locked up and/or cause Kubernetes PV data loss after a VMware HA event, power outage/reboot, etc.
 - This fix detects whether `disk.enableUUID` is enabled on the BOSH VM, which ensures
   stable volume UUIDs between vSphere and the guest. It then uses that UUID in its disk
   identification to the BOSH agent, which already supports UUID matching (see the sketch after these notes).
 - Add integration test
 - Powered-off VMs don't report their VMX options correctly, which is why
   we use vm_config in vm_creator.
 - Fix unit test
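
For illustration only (not part of the commit), a minimal sketch of the two disk-env shapes generate_disk_env produces for the BOSH agent after this change; the unit numbers and the UUID are made-up placeholders:

    # With disk.enableUUID set, the ephemeral disk is identified by its vSphere
    # volume UUID, downcased as the CPI does with the value VIM returns:
    disk_env_with_uuid = {
      'system'     => '0',
      'ephemeral'  => { 'id' => '6000c298-aaaa-bbbb-cccc-ddddeeeeffff' }, # placeholder UUID
      'persistent' => {}
    }

    # Without disk.enableUUID, the CPI falls back to the traditional
    # host-relative SCSI unit number, which presumes a single controller:
    disk_env_without_uuid = {
      'system'     => '0',
      'ephemeral'  => '1',
      'persistent' => {}
    }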
svrc authored and EleanorRigby committed Nov 21, 2019
1 parent 5583a83 commit 5626965
Showing 6 changed files with 440 additions and 96 deletions.
50 changes: 44 additions & 6 deletions src/vsphere_cpi/lib/cloud/vsphere/cloud.rb
@@ -670,12 +670,26 @@ def generate_network_env(devices, networks, dvs_index)
network_env
end

def generate_disk_env(system_disk, ephemeral_disk)
{
'system' => system_disk.unit_number.to_s,
'ephemeral' => ephemeral_disk.unit_number.to_s,
'persistent' => {}
}
def generate_disk_env(system_disk, ephemeral_disk, vm_config)

# When disk.enableUUID is set in the VMX options, consistent volume UUIDs are exposed to the guest, and we can
# use them to ensure the precise ephemeral volume is mounted. This is mandatory when
# multiple SCSI controllers are present on the VM, as is common with Kubernetes VMs.
if vm_config.vmx_options['disk.enableUUID'] == "1"
logger.info("Using ephemeral disk UUID #{ephemeral_disk.backing.uuid.downcase}")
{
'system' => system_disk.unit_number.to_s,
'ephemeral' => { 'id' => ephemeral_disk.backing.uuid.downcase },
'persistent' => {}
}
else
logger.info("Using ephemeral disk unit number #{ephemeral_disk.unit_number.to_s}")
{
'system' => system_disk.unit_number.to_s,
'ephemeral' => ephemeral_disk.unit_number.to_s,
'persistent' => {}
}
end
end

def generate_agent_env(name, vm, agent_id, networking_env, disk_env)
@@ -829,7 +843,31 @@ def get_vm_env_datastore(vm)

def add_disk_to_agent_env(vm, director_disk_cid, device_unit_number)
env = @agent_env.get_current_env(vm.mob, @datacenter.name)

env['disks']['persistent'][director_disk_cid.raw] = device_unit_number.to_s

# For VMs with multiple SCSI controllers, as is common on Kubernetes workers,
# it is mandatory that disk.enableUUID is set in the VMX options / extra config of the VM to ensure
# that disk mounting can be performed unambiguously. This is typically set as part of a VM extension
# (an illustrative sketch follows this file's diff).
# Using the relative device unit number (see above), the traditional vSphere CPI disk identifier,
# presumes a single SCSI controller. The BOSH agent does not distinguish SCSI controllers with
# this method of volume identification, which can lead to ambiguous mounts and thus failed agent
# bootstraps or data loss. The BOSH agent already supports volume UUID identification, which is
# used below for unambiguous BOSH disk association.

if vm.disk_uuid_is_enabled?
# We have to query the VIM API to learn the freshly attached disk's UUID. We must also pick the specific
# BOSH-managed independent persistent disk and avoid any non-BOSH persistent disks. Also, because Linux
# filesystems are case-sensitive, downcase the UUID, as VIM returns it in upper case.
disk = vm.disk_by_cid(director_disk_cid.value)
uuid = disk.backing.uuid.downcase
logger.info("adding disk to env, disk.enableUUID is TRUE, using volume uuid #{uuid} for mounting")
env['disks']['persistent'][director_disk_cid.raw] = {"id" => uuid}
else
logger.info("adding disk to env, disk.enableUUID is FALSE, using relative device number #{device_unit_number.to_s} for mounting")
end

location = { datacenter: @datacenter.name, datastore: get_vm_env_datastore(vm), vm: vm.cid }
@agent_env.set_env(vm.mob, location, env)
end
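
For reference (not part of this diff), a hedged sketch of the cloud properties an operator-supplied vm_extension or vm_type would carry to enable this path; the hash mirrors the vm_type used in the integration spec below, and the CPI applies vmx_options to the VM's extra config, which the vm.disk_uuid_is_enabled? call above relies on:

    # Hypothetical vm_extension / vm_type cloud properties:
    cloud_properties = {
      'vmx_options' => {
        'disk.enableUUID' => 'TRUE'
      }
    }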
14 changes: 12 additions & 2 deletions src/vsphere_cpi/lib/cloud/vsphere/resources/vm.rb
@@ -211,6 +211,10 @@ def power_state
properties['runtime.powerState']
end

def extra_config
properties['config.extraConfig']
end

def has_persistent_disk_property_mismatch?(disk)
found_property = get_vapp_property_by_key(disk.key)
return false if found_property.nil? || !verify_persistent_disk_property?(found_property)
@@ -260,6 +264,12 @@ def attach_disk(disk_resource_object)
return disk_config_spec
end

def disk_uuid_is_enabled?
extra_config.any? do |option|
option.key == 'disk.enableUUID' && option.value == "TRUE"
end
end

def detach_disks(virtual_disks)
reload
check_for_nonpersistent_disk_modes
@@ -345,8 +355,8 @@ def properties
@properties ||= cloud_searcher.get_properties(
@mob,
Vim::VirtualMachine,
['runtime.powerState', 'runtime.question', 'config.hardware.device', 'name', 'runtime', 'resourcePool'],
ensure: ['config.hardware.device', 'runtime']
['runtime.powerState', 'runtime.question', 'config.hardware.device', 'name', 'runtime', 'resourcePool', 'config.extraConfig'],
ensure: ['config.hardware.device', 'runtime', 'config.extraConfig']
)
end

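
A hedged illustration (not part of the diff) of the shape disk_uuid_is_enabled? expects config.extraConfig to take: an array of key/value option objects returned by the VIM API. OpenStruct stands in for the VIM option type here:

    require 'ostruct'

    # Two made-up extraConfig entries as the VIM API might return them:
    extra_config = [
      OpenStruct.new(key: 'disk.enableUUID', value: 'TRUE'),
      OpenStruct.new(key: 'some.other.option', value: '1')
    ]

    # The same check disk_uuid_is_enabled? performs:
    extra_config.any? { |option| option.key == 'disk.enableUUID' && option.value == 'TRUE' }
    # => true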
2 changes: 1 addition & 1 deletion src/vsphere_cpi/lib/cloud/vsphere/vm_creator.rb
@@ -167,7 +167,7 @@ def create(vm_config)
# Set agent env settings
begin
network_env = @cpi.generate_network_env(created_vm.devices, vm_config.networks_spec, dvs_index)
disk_env = @cpi.generate_disk_env(created_vm.system_disk, ephemeral_disk_config.device)
disk_env = @cpi.generate_disk_env(created_vm.system_disk, created_vm.ephemeral_disk, vm_config)
env = @cpi.generate_agent_env(vm_config.name, created_vm.mob, vm_config.agent_id, network_env, disk_env)
env['env'] = vm_config.agent_env

262 changes: 188 additions & 74 deletions src/vsphere_cpi/spec/integration/disk_reattachment_spec.rb
@@ -31,101 +31,215 @@
}
end

it 're-attaches the disk without locking the cd-rom' do
begin
vm_id = @cpi.create_vm(
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
)

expect(vm_id).to_not be_nil
expect(@cpi.has_vm?(vm_id)).to be(true)

disk_id = @cpi.create_disk(2048, {}, vm_id)
expect(disk_id).to_not be_nil

@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
ensure
delete_vm(@cpi, vm_id)
delete_disk(@cpi, disk_id)
context 'when vm_type has disk uuid enabled in vmx options' do
let(:vm_type) do
{
'ram' => 512,
'disk' => 2048,
'cpu' => 1,
'vmx_options' => {
'disk.enableUUID' => 'TRUE'
}
}
end
end

context 'when there is no datastore matching pattern to move the disk to' do
let(:both_cluster_cpi_options) do
cpi_options(
datacenters: [{
persistent_datastore_pattern: @second_datastore_pattern,
clusters: [@cluster_name, @second_cluster_name],
}],
)
it 're-attaches the disk without locking the cd-rom' do
begin
vm_id = @cpi.create_vm(
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
)

expect(vm_id).to_not be_nil
expect(@cpi.has_vm?(vm_id)).to be(true)

disk_id = @cpi.create_disk(2048, {}, vm_id)
expect(disk_id).to_not be_nil

@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
ensure
delete_vm(@cpi, vm_id)
delete_disk(@cpi, disk_id)
end
end
let(:both_cluster_cpi) { VSphereCloud::Cloud.new(both_cluster_cpi_options) }

let(:network_spec) do
{
'static' => {
'ip' => "169.254.#{rand(1..254)}.#{rand(4..254)}",
'netmask' => '255.255.254.0',
'cloud_properties' => {'name' => vlan},
'default' => ['dns', 'gateway'],
'dns' => ['169.254.1.2'],
'gateway' => '169.254.1.3'
context 'when there is no datastore matching pattern to move the disk to' do
let(:both_cluster_cpi_options) do
cpi_options(
datacenters: [{
persistent_datastore_pattern: @second_datastore_pattern,
clusters: [@cluster_name, @second_cluster_name],
}],
)
end
let(:both_cluster_cpi) { VSphereCloud::Cloud.new(both_cluster_cpi_options) }

let(:network_spec) do
{
'static' => {
'ip' => "169.254.#{rand(1..254)}.#{rand(4..254)}",
'netmask' => '255.255.254.0',
'cloud_properties' => {'name' => vlan},
'default' => ['dns', 'gateway'],
'dns' => ['169.254.1.2'],
'gateway' => '169.254.1.3'
}
}
}
end
end

let(:vlan) { @vlan }
let(:vlan) { @vlan }

let(:vm_type) do
{
'ram' => 512,
'disk' => 2048,
'cpu' => 1,
'datastores' => [@datastore_pattern],
'datacenters' => [
{
'name' => @datacenter_name,
'clusters' => [
{
@cluster_name => {}
}
let(:vm_type) do
{
'ram' => 512,
'disk' => 2048,
'cpu' => 1,
'datastores' => [@datastore_pattern],
'datacenters' => [
{
'name' => @datacenter_name,
'clusters' => [
{
@cluster_name => {}
}
]
}
]
}
]
}
}
end
it 'raises error ' do
begin
vm_id = @cpi.create_vm(
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
)

expect(vm_id).to_not be_nil
expect(@cpi.has_vm?(vm_id)).to be(true)

disk_id = both_cluster_cpi.create_disk(2048, {datastores: @second_datastore_pattern})
expect(disk_id).to_not be_nil

expect do
both_cluster_cpi.attach_disk(vm_id, disk_id)
end.to raise_error(/Unable to attach disk to the VM/)
ensure
delete_vm(@cpi, vm_id)
delete_disk(both_cluster_cpi, disk_id)
end
end
end
it 'raises error ' do
end

context 'when vm_type has no disk uuid enabled' do
it 're-attaches the disk without locking the cd-rom' do
begin
vm_id = @cpi.create_vm(
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
)

expect(vm_id).to_not be_nil
expect(@cpi.has_vm?(vm_id)).to be(true)

disk_id = both_cluster_cpi.create_disk(2048, {datastores: @second_datastore_pattern})
disk_id = @cpi.create_disk(2048, {}, vm_id)
expect(disk_id).to_not be_nil

expect do
both_cluster_cpi.attach_disk(vm_id, disk_id)
end.to raise_error(/Unable to attach disk to the VM/)
@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
@cpi.attach_disk(vm_id, disk_id)
@cpi.detach_disk(vm_id, disk_id)
ensure
delete_vm(@cpi, vm_id)
delete_disk(both_cluster_cpi, disk_id)
delete_disk(@cpi, disk_id)
end
end

context 'when there is no datastore matching pattern to move the disk to' do
let(:both_cluster_cpi_options) do
cpi_options(
datacenters: [{
persistent_datastore_pattern: @second_datastore_pattern,
clusters: [@cluster_name, @second_cluster_name],
}],
)
end
let(:both_cluster_cpi) { VSphereCloud::Cloud.new(both_cluster_cpi_options) }

let(:network_spec) do
{
'static' => {
'ip' => "169.254.#{rand(1..254)}.#{rand(4..254)}",
'netmask' => '255.255.254.0',
'cloud_properties' => {'name' => vlan},
'default' => ['dns', 'gateway'],
'dns' => ['169.254.1.2'],
'gateway' => '169.254.1.3'
}
}
end

let(:vlan) { @vlan }

let(:vm_type) do
{
'ram' => 512,
'disk' => 2048,
'cpu' => 1,
'datastores' => [@datastore_pattern],
'datacenters' => [
{
'name' => @datacenter_name,
'clusters' => [
{
@cluster_name => {}
}
]
}
]
}
end
it 'raises error ' do
begin
vm_id = @cpi.create_vm(
'agent-007',
@stemcell_id,
vm_type,
network_spec,
[],
{'key' => 'value'}
)

expect(vm_id).to_not be_nil
expect(@cpi.has_vm?(vm_id)).to be(true)

disk_id = both_cluster_cpi.create_disk(2048, {datastores: @second_datastore_pattern})
expect(disk_id).to_not be_nil

expect do
both_cluster_cpi.attach_disk(vm_id, disk_id)
end.to raise_error(/Unable to attach disk to the VM/)
ensure
delete_vm(@cpi, vm_id)
delete_disk(both_cluster_cpi, disk_id)
end
end
end
end

end