diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index df4f8df78b12..53e88de89492 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -197,6 +197,16 @@ class Instruction : public User,
   void setMetadata(unsigned KindID, MDNode *Node);
   void setMetadata(StringRef Kind, MDNode *Node);
 
+  /// Copy metadata from \p SrcInst to this instruction. \p WL, if not empty,
+  /// specifies the list of metadata kinds that should be copied. If \p WL is
+  /// empty, all metadata is copied.
+  void copyMetadata(const Instruction &SrcInst, ArrayRef<unsigned> WL = {});
+
+  /// If the instruction has "branch_weights" MD_prof metadata and the MDNode
+  /// has three operands (including the name string), swap the order of the
+  /// two weight operands.
+  void swapProfMetadata();
+
   /// Drop all unknown metadata except for debug locations.
   /// @{
   /// Passes are required to drop metadata they don't understand. This is a
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index ed08f85c60b6..d65eb40ed1a5 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -627,6 +628,46 @@ Instruction *Instruction::cloneImpl() const {
   llvm_unreachable("Subclass of Instruction failed to implement cloneImpl");
 }
 
+void Instruction::swapProfMetadata() {
+  MDNode *ProfileData = getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3 ||
+      !isa<MDString>(ProfileData->getOperand(0)))
+    return;
+
+  MDString *MDName = cast<MDString>(ProfileData->getOperand(0));
+  if (MDName->getString() != "branch_weights")
+    return;
+
+  // The first operand is the name; rebuild the node with the weights swapped.
+  Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2),
+                     ProfileData->getOperand(1)};
+  setMetadata(LLVMContext::MD_prof,
+              MDNode::get(ProfileData->getContext(), Ops));
+}
+
+/// Copy metadata from \p SrcInst to this instruction. If \p WL is empty, all
+/// metadata is copied; otherwise only the kinds listed in \p WL are copied.
+void Instruction::copyMetadata(const Instruction &SrcInst,
+                               ArrayRef<unsigned> WL) {
+  if (!SrcInst.hasMetadata())
+    return;
+
+  DenseSet<unsigned> WLS;
+  for (unsigned M : WL)
+    WLS.insert(M);
+
+  // Enumerate and copy over metadata from the old instruction to the new
+  // one, restricted to the whitelist if one was given.
+  SmallVector<std::pair<unsigned, MDNode *>, 4> TheMDs;
+  SrcInst.getAllMetadataOtherThanDebugLoc(TheMDs);
+  for (const auto &MD : TheMDs) {
+    if (WL.empty() || WLS.count(MD.first))
+      setMetadata(MD.first, MD.second);
+  }
+  if (WL.empty() || WLS.count(LLVMContext::MD_dbg))
+    setDebugLoc(SrcInst.getDebugLoc());
+}
+
 Instruction *Instruction::clone() const {
   Instruction *New = nullptr;
   switch (getOpcode()) {
@@ -641,16 +682,6 @@ Instruction *Instruction::clone() const {
   }
 
   New->SubclassOptionalData = SubclassOptionalData;
-  if (!hasMetadata())
-    return New;
-
-  // Otherwise, enumerate and copy over metadata from the old instruction to the
-  // new one.
-  SmallVector<std::pair<unsigned, MDNode *>, 4> TheMDs;
-  getAllMetadataOtherThanDebugLoc(TheMDs);
-  for (const auto &MD : TheMDs)
-    New->setMetadata(MD.first, MD.second);
-
-  New->setDebugLoc(getDebugLoc());
+  New->copyMetadata(*this);
  return New;
 }
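
The new copyMetadata/swapProfMetadata pair gives passes a single place to clone
metadata, with an optional whitelist of kinds. A minimal sketch of intended
usage (the helper name and the chosen kinds below are illustrative assumptions,
not code from this patch):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    // Hypothetical helper: re-emit a load at the builder's insert point,
    // keeping only the metadata kinds still known to hold at the new location.
    static LoadInst *reemitLoad(LoadInst &Old, IRBuilder<> &B) {
      LoadInst *New = B.CreateLoad(Old.getPointerOperand(), Old.getName());
      New->copyMetadata(Old, {LLVMContext::MD_dbg, LLVMContext::MD_nonnull});
      return New;
    }

An empty whitelist (the default) copies every kind, including the debug
location, which is what clone() below now relies on.
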
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index b9c693ff19ad..b3f3d2bbb538 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -1209,15 +1209,7 @@ void BranchInst::swapSuccessors() {
 
   // Update profile metadata if present and it matches our structural
   // expectations.
-  MDNode *ProfileData = getMetadata(LLVMContext::MD_prof);
-  if (!ProfileData || ProfileData->getNumOperands() != 3)
-    return;
-
-  // The first operand is the name. Fetch them backwards and build a new one.
-  Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2),
-                     ProfileData->getOperand(1)};
-  setMetadata(LLVMContext::MD_prof,
-              MDNode::get(ProfileData->getContext(), Ops));
+  swapProfMetadata();
 }
 
 BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 71980e85e8ca..c954341438b0 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -742,42 +742,6 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
   return &New;
 }
 
-static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst,
-                         bool Swapped) {
-  if (!SrcInst || !SrcInst->hasMetadata())
-    return;
-
-  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
-  SrcInst->getAllMetadata(MDs);
-  for (auto &MD : MDs) {
-    switch (MD.first) {
-    default:
-      break;
-    case LLVMContext::MD_prof:
-      if (Swapped && MD.second->getNumOperands() == 3 &&
-          isa<MDString>(MD.second->getOperand(0))) {
-        MDString *MDName = cast<MDString>(MD.second->getOperand(0));
-        if (MDName->getString() == "branch_weights") {
-          auto *ValT = cast_or_null<ConstantAsMetadata>(
-                           MD.second->getOperand(1))->getValue();
-          auto *ValF = cast_or_null<ConstantAsMetadata>(
-                           MD.second->getOperand(2))->getValue();
-          assert(ValT && ValF && "Invalid Operands of branch_weights");
-          auto NewMD =
-              MDBuilder(DstInst->getParent()->getContext())
-                  .createBranchWeights(cast<ConstantInt>(ValF)->getZExtValue(),
-                                       cast<ConstantInt>(ValT)->getZExtValue());
-          MD.second = NewMD;
-        }
-      }
-      // fallthrough.
-    case LLVMContext::MD_make_implicit:
-    case LLVMContext::MD_dbg:
-      DstInst->setMetadata(MD.first, MD.second);
-    }
-  }
-}
-
 /// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
 /// otherwise branch to FalseDest. Insert the code immediately before InsertPt.
 void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
@@ -800,7 +764,14 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
 
   // Insert the new branch.
   BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
-  copyMetadata(BI, TI, Swapped);
+  if (TI) {
+    // FIXME: Check why this whitelist is needed here.
+    ArrayRef<unsigned> WL = {LLVMContext::MD_dbg, LLVMContext::MD_prof,
+                             LLVMContext::MD_make_implicit};
+    BI->copyMetadata(*TI, WL);
+    if (Swapped)
+      BI->swapProfMetadata();
+  }
 
   // If either edge is critical, split it. This helps preserve LoopSimplify
   // form for enclosing loops.
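
In the LoopUnswitch change above, the new preheader branch may be created with
its successors swapped relative to the terminator the metadata came from, so
the copied "branch_weights" must be swapped as well to keep each weight on its
original edge. The same pattern in isolation (a sketch with assumed names, not
code from this patch):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    // Hypothetical helper: clone a conditional branch with the successors
    // reversed. After copyMetadata the weights still reflect the old successor
    // order, so swapProfMetadata puts them back on the right edges.
    static BranchInst *cloneReversed(BranchInst &BI) {
      BranchInst *New = BranchInst::Create(BI.getSuccessor(1),
                                           BI.getSuccessor(0),
                                           BI.getCondition());
      New->copyMetadata(BI, {LLVMContext::MD_dbg, LLVMContext::MD_prof});
      New->swapProfMetadata();
      return New;
    }
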
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 7d33259c030b..6566cd0538ff 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -2390,6 +2390,10 @@ class llvm::sroa::AllocaSliceRewriter
                                LI.isVolatile(), LI.getName());
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+      // Try to preserve nonnull metadata.
+      if (TargetTy->isPointerTy())
+        NewLI->copyMetadata(LI, LLVMContext::MD_nonnull);
       V = NewLI;
 
       // If this is an integer load past the end of the slice (which means the
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index cbf385d56339..186ab34bca7a 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -23,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -38,6 +38,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -301,6 +302,18 @@ struct PromoteMem2Reg {
 
 } // end of anonymous namespace
 
+/// Given a load instruction \p LI, insert an assume(LI != null) right after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+  Function *AssumeIntrinsic =
+      Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+  ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+                                       Constant::getNullValue(LI->getType()));
+  LoadNotNull->insertAfter(LI);
+  CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+  CI->insertAfter(LoadNotNull);
+  AC->registerAssumption(CI);
+}
+
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
@@ -334,9 +347,9 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
 /// and thus must be phi-ed with undef. We fall back to the standard alloca
 /// promotion algorithm in that case.
 static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
-                                     LargeBlockInfo &LBI,
-                                     DominatorTree &DT,
-                                     AliasSetTracker *AST) {
+                                     LargeBlockInfo &LBI, DominatorTree &DT,
+                                     AliasSetTracker *AST,
+                                     AssumptionCache *AC) {
   StoreInst *OnlyStore = Info.OnlyStore;
   bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
   BasicBlock *StoreBB = OnlyStore->getParent();
@@ -387,6 +400,14 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
     // code.
     if (ReplVal == LI)
       ReplVal = UndefValue::get(LI->getType());
+
+    // If the load was marked as nonnull we don't want to lose that
+    // information when we erase this load, so we preserve it with
+    // an assume.
+    if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+        !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+      addAssumeNonNull(AC, LI);
+
     LI->replaceAllUsesWith(ReplVal);
     if (AST && LI->getType()->isPointerTy())
       AST->deleteValue(LI);
@@ -435,7 +456,9 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
 /// }
 static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
                                      LargeBlockInfo &LBI,
-                                     AliasSetTracker *AST) {
+                                     AliasSetTracker *AST,
+                                     DominatorTree &DT,
+                                     AssumptionCache *AC) {
   // The trickiest case to handle is when we have large blocks. Because of this,
   // this code is optimized assuming that large blocks happen. This does not
   // significantly pessimize the small block case. This uses LargeBlockInfo to
@@ -476,10 +499,17 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
         // There is no store before this load, bail out (load may be affected
         // by the following stores - see main comment).
         return false;
-    }
-    else
+    } else {
       // Otherwise, there was a store before this load, the load takes its value.
-      LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0));
+      // Note, if the load was marked as nonnull we don't want to lose that
+      // information when we erase it, so we preserve it with an assume.
+      Value *ReplVal = std::prev(I)->second->getOperand(0);
+      if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+          !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+        addAssumeNonNull(AC, LI);
+
+      LI->replaceAllUsesWith(ReplVal);
+    }
 
     if (AST && LI->getType()->isPointerTy())
       AST->deleteValue(LI);
@@ -553,7 +583,7 @@ void PromoteMem2Reg::run() {
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
-      if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) {
+      if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST, AC)) {
         // The alloca has been processed, move on.
         RemoveFromAllocasList(AllocaNum);
         ++NumSingleStore;
@@ -564,7 +594,7 @@ void PromoteMem2Reg::run() {
     // If the alloca is only read and written in one basic block, just perform a
     // linear sweep over the block to eliminate it.
     if (Info.OnlyUsedInOneBlock &&
-        promoteSingleBlockAlloca(AI, Info, LBI, AST)) {
+        promoteSingleBlockAlloca(AI, Info, LBI, AST, DT, AC)) {
       // The alloca has been processed, move on.
       RemoveFromAllocasList(AllocaNum);
       continue;
@@ -938,6 +968,13 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
 
       Value *V = IncomingVals[AI->second];
 
+      // If the load was marked as nonnull we don't want to lose that
+      // information when we erase this load, so we preserve it with
+      // an assume.
+      if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+          !llvm::isKnownNonNullAt(V, LI, &DT))
+        addAssumeNonNull(AC, LI);
+
       // Anything using the load now uses the current value.
       LI->replaceAllUsesWith(V);
       if (AST && LI->getType()->isPointerTy())
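
For the inserted assume to be useful, later passes have to find it, which is
why addAssumeNonNull registers each new call with the AssumptionCache. A
sketch of the caller side, assuming the public PromoteMemToReg entry point
threads an AssumptionCache through to these helpers (consistent with the
parameters added above, but the exact signature is an assumption here):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/PromoteMemToReg.h"

    using namespace llvm;

    // Hypothetical call site: with a non-null AC, mem2reg preserves !nonnull
    // loads via llvm.assume; with a null AC that metadata is simply dropped.
    static void promotePreservingNonNull(ArrayRef<AllocaInst *> Allocas,
                                         DominatorTree &DT,
                                         AssumptionCache &AC) {
      PromoteMemToReg(Allocas, DT, /*AST=*/nullptr, &AC);
    }
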
diff --git a/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll
new file mode 100644
index 000000000000..33a5b124c555
--- /dev/null
+++ b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -mem2reg -S | FileCheck %s
+
+; This tests that mem2reg preserves the !nonnull metadata on loads
+; from allocas that get optimized out.
+
+; Check the case where the alloca in question has a single store.
+define float* @single_store(float** %arg) {
+; CHECK-LABEL: define float* @single_store
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check the case where the alloca in question has more than one
+; store, but still within one basic block.
+define float* @single_block(float** %arg) {
+; CHECK-LABEL: define float* @single_block
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* null, float** %buf, align 8
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check the case where the alloca in question has more than one
+; store and also reads and writes in multiple blocks.
+define float* @multi_block(float** %arg) {
+; CHECK-LABEL: define float* @multi_block
+; CHECK-LABEL: entry:
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: br label %next
+; CHECK-LABEL: next:
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* null, float** %buf, align 8
+  br label %next
+next:
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check that we don't add an assume if it's not
+; necessary, i.e. the value is already implied to be nonnull.
+define float* @no_assume(float** %arg) {
+; CHECK-LABEL: define float* @no_assume
+; CHECK-LABEL: entry:
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: %cn = icmp ne float* %arg.load, null
+; CHECK: br i1 %cn, label %next, label %fin
+; CHECK-LABEL: next:
+; CHECK-NOT: call void @llvm.assume
+; CHECK: ret float* %arg.load
+; CHECK-LABEL: fin:
+; CHECK: ret float* null
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  %cn = icmp ne float* %arg.load, null
+  br i1 %cn, label %next, label %fin
+next:
+; At this point the above nonnull check ensures that
+; the value %arg.load is nonnull in this block and thus
+; we need not add the assume.
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+fin:
+  ret float* null
+}
+
+!0 = !{}
diff --git a/test/Transforms/SROA/preserve-nonnull.ll b/test/Transforms/SROA/preserve-nonnull.ll
new file mode 100644
index 000000000000..fc5ce6a445fa
--- /dev/null
+++ b/test/Transforms/SROA/preserve-nonnull.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+;
+; Make sure that SROA doesn't lose nonnull metadata
+; on loads from allocas that get optimized out.
+
+; CHECK-LABEL: define float* @yummy_nonnull
+; CHECK: [[RETURN:%(.*)]] = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* {{.*}}[[RETURN]], null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* {{.*}}[[RETURN]]
+
+define float* @yummy_nonnull(float** %arg) {
+entry-block:
+  %buf = alloca float*
+
+  %_arg_i8 = bitcast float** %arg to i8*
+  %_buf_i8 = bitcast float** %buf to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %_buf_i8, i8* %_arg_i8, i64 8, i32 8, i1 false)
+
+  %ret = load float*, float** %buf, align 8, !nonnull !0
+  ret float* %ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+!0 = !{}
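
One detail worth noting in the SROA change: copyMetadata is called with a
single kind rather than a list, which works because ArrayRef<unsigned>
converts implicitly from a single element. Both forms side by side (a sketch;
the helper names are made up):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    static void copyNonNullOnly(LoadInst &To, const LoadInst &From) {
      To.copyMetadata(From, LLVMContext::MD_nonnull); // whitelist of one kind
    }

    static void copyAllMetadata(LoadInst &To, const LoadInst &From) {
      To.copyMetadata(From); // empty whitelist: copy everything, incl. !dbg
    }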