Skip to content

Commit f12fd22

Browse files
author
Craig Hagan
committed
Validate md5sum of downloaded s3 objects and retry on mismatch
Transient network errors, timeouts, Ruby Net::HTTP bugs, and other things can cause s3 downloads to be incomplete or outright incorrect. Only succeed a download if the md5 of the downloaded object matches the md5 known to s3.
1 parent 86d15e5 commit f12fd22

File tree

3 files changed

+94
-8
lines changed

3 files changed

+94
-8
lines changed

libraries/s3_file.rb

+92-7
Original file line numberDiff line numberDiff line change
@@ -128,22 +128,88 @@ def self.get_digests_from_headers(headers)
128128
return {"md5" => etag}.merge(digests)
129129
end
130130

131-
def self.get_digests_from_s3(bucket,url,path,aws_access_key_id,aws_secret_access_key,token, region)
132-
response = do_request("HEAD", url, bucket, path, aws_access_key_id, aws_secret_access_key, token, region)
133-
return self.get_digests_from_headers(response.headers)
131+
def self.get_digests_from_s3(bucket,path,aws_access_key_id,aws_secret_access_key,token,timeout=300,open_timeout=10,retries=5)
132+
now, auth_string = get_s3_auth("HEAD", bucket,path,aws_access_key_id,aws_secret_access_key, token)
133+
max_tries = retries + 1
134+
headers = build_headers(now, auth_string, token)
135+
saved_exception = nil
136+
137+
while (max_tries > 0)
138+
begin
139+
140+
response = RestClient.head('https://%s.s3.amazonaws.com%s' % [bucket,path], headers)
141+
142+
etag = response.headers[:etag].gsub('"','')
143+
digest = response.headers[:x_amz_meta_digest]
144+
digests = digest.nil? ? {} : Hash[digest.split(",").map {|a| a.split("=")}]
145+
146+
return {"md5" => etag}.merge(digests)
147+
148+
rescue => e
149+
max_tries = max_tries - 1
150+
saved_exception = e
151+
end
152+
end
153+
raise saved_exception
154+
end
155+
156+
def self.validate_download_checksum(response)
157+
# Default to not checking md5 sum of downloaded objects
158+
# per http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html
159+
# If an object is created by either the Multipart Upload or Part Copy operation,
160+
# the ETag is not an MD5 digest, regardless of the method of encryption
161+
# however, if present, x-amz-meta-digest will contain the digest, so
162+
# try if we see enough information and verify_md5 is set.
163+
if response.headers[:x_amz_meta_digest]
164+
return self.verify_md5_checksum(response.headers[:x_amz_meta_digest_md5].gsub('"',''), response.file.path)
165+
else
166+
server_side_encryption_customer_algorithm = response.headers[:x_amz_server_side_encryption_customer_algorithm]
167+
server_side_encryption = response.headers[:x_amz_server_side_encryption]
168+
if server_side_encryption_customer_algorithm.nil? and server_side_encryption != "aws:kms"
169+
return self.verify_md5_checksum(response.headers[:etag].gsub('"',''), response.file.path)
170+
else
171+
# If we do not have the x-amz-meta-digest-md5 header, we
172+
# cannot validate objects encrypted with SSE-C or SSE-KMS,
173+
# because the ETag will not be the MD5 digest. Assume it is
174+
# valid in those cases.
175+
return true
176+
end
177+
end
134178
end
135179

136-
def self.get_from_s3(bucket, url, path, aws_access_key_id, aws_secret_access_key, token, region = nil, verify_md5=true)
180+
181+
def self.get_from_s3(bucket, url, path, aws_access_key_id, aws_secret_access_key, token, verify_md5=false, region = nil)
137182
response = nil
138183
retries = 5
139184
for attempts in 0..retries
140185
begin
141186
response = do_request("GET", url, bucket, path, aws_access_key_id, aws_secret_access_key, token, region)
142187

188+
# check the length of the downloaded object,
189+
# make sure we didn't get nailed by
190+
# a quirk in Net::HTTP class from the Ruby standard library.
191+
# Net::HTTP has the behavior (and I would call this a bug) that if the
192+
# connection gets reset in the middle of transferring the response,
193+
# it silently truncates the response back to the caller without throwing an exception.
194+
# ** See https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb#L291
195+
# and https://github.com/ruby/ruby/blob/trunk/lib/net/protocol.rb#L99 .
196+
# It attempts to read up to Content-Length worth of bytes, but if hits an early EOF,
197+
# it just returns without throwing an exception (the ignore_eof flag).
198+
199+
length = response.headers[:content_length].to_i()
200+
if not length.nil? and response.file.size() != length
201+
raise "Downloaded object size (#{response.file.size()}) does not match expected content_length (#{length})"
202+
end
203+
204+
# default to not checking md5 sum of downloaded objects
205+
# per http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html
206+
# If an object is created by either the Multipart Upload or Part Copy operation,
207+
# the ETag is not an MD5 digest, regardless of the method of encryption
208+
# however, if present, x-amz-meta-digest will contain the digest, so
209+
# try if we see enough information and verify_md5 is set.
143210
if verify_md5
144-
md5 = self.get_digests_from_headers(response.headers)["md5"]
145-
if not self.verify_md5_checksum(md5,response.file.path)
146-
raise "unable to validate md5 checksum of downloaded object"
211+
if not self.validate_download_checksum(response)
212+
raise "Downloaded object has an md5sum which differs from the expected value provided by S3"
147213
end
148214
end
149215

@@ -170,6 +236,25 @@ def self.get_from_s3(bucket, url, path, aws_access_key_id, aws_secret_access_key
170236
end
171237
end
172238

239+
def self.get_s3_auth(method, bucket,path,aws_access_key_id,aws_secret_access_key, token)
240+
now = Time.now().utc.strftime('%a, %d %b %Y %H:%M:%S GMT')
241+
string_to_sign = "#{method}\n\n\n%s\n" % [now]
242+
243+
if token
244+
string_to_sign += "x-amz-security-token:#{token}\n"
245+
end
246+
247+
string_to_sign += "/%s%s" % [bucket,path]
248+
249+
digest = digest = OpenSSL::Digest::Digest.new('sha1')
250+
signed = OpenSSL::HMAC.digest(digest, aws_secret_access_key, string_to_sign)
251+
signed_base64 = Base64.encode64(signed)
252+
253+
auth_string = 'AWS %s:%s' % [aws_access_key_id,signed_base64]
254+
255+
[now,auth_string]
256+
end
257+
173258
def self.aes256_decrypt(key, file)
174259
Chef::Log.debug("Decrypting S3 file.")
175260
key = key.strip

providers/default.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
end
6363

6464
if download
65-
response = S3FileLib::get_from_s3(new_resource.bucket, new_resource.s3_url, remote_path, aws_access_key_id, aws_secret_access_key, token)
65+
response = S3FileLib::get_from_s3(new_resource.bucket, new_resource.s3_url, remote_path, aws_access_key_id, aws_secret_access_key, token, new_resource.verify_md5)
6666

6767
# not simply using the file resource here because we would have to buffer
6868
# whole file into memory in order to set content this solves

resources/default.rb

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
attribute :mode, :kind_of => [String, Integer, NilClass], :default => nil
1212
attribute :decryption_key, :kind_of => String, :default => nil
1313
attribute :decrypted_file_checksum, :kind_of => String, :default => nil
14+
attribute :verify_md5, :kind_of => [TrueClass, FalseClass], :default => false
1415

1516
def initialize(*args)
1617
super

0 commit comments

Comments
 (0)