diff --git a/CMakeLists.txt b/CMakeLists.txt
index 156fb24e6b..6b76f27eb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -721,10 +721,15 @@ if(EXECUTORCH_BUILD_PYBIND)
       -fPIC
       -frtti
       -fexceptions
-      # libtorch is built with the old ABI, so we need to do the same for any
-      # .cpp files that include torch, c10, or ATen targets.
-      -D_GLIBCXX_USE_CXX11_ABI=0
   )
+  if(EXECUTORCH_DO_NOT_USE_CXX11_ABI)
+    # libtorch is built with the old ABI, so we need to do the same for any
+    # .cpp files that include torch, c10, or ATen targets. Note that the
+    # PyTorch nightly binary is built with _GLIBCXX_USE_CXX11_ABI set to 0,
+    # while its CI build sets it to 1 (the default).
+    list(APPEND _pybind_compile_options -D_GLIBCXX_USE_CXX11_ABI=0)
+  endif()
+
   # util lib
   add_library(
     util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp
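For background on the flag this change makes conditional: libstdc++ exposes its string/list ABI choice through the `_GLIBCXX_USE_CXX11_ABI` macro, and objects built with mismatched values fail to link against each other's `std::string` symbols, which is why the pybind targets must match however libtorch was built. A minimal standalone probe (illustrative only, not part of this patch):

```cpp
// Illustrative ABI probe, not part of this patch. Compile with and without
// -D_GLIBCXX_USE_CXX11_ABI=0 to see which std::string ABI is selected.
#include <iostream>
#include <string> // any libstdc++ header defines _GLIBCXX_USE_CXX11_ABI

int main() {
#ifdef _GLIBCXX_USE_CXX11_ABI
  std::cout << "_GLIBCXX_USE_CXX11_ABI=" << _GLIBCXX_USE_CXX11_ABI << "\n";
#else
  std::cout << "not building against libstdc++\n"; // e.g. libc++
#endif
}
```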
diff --git a/exir/pass_base.py b/exir/pass_base.py
index db6bef8e3f..9c97921f51 100644
--- a/exir/pass_base.py
+++ b/exir/pass_base.py
@@ -318,7 +318,11 @@ def call_function(
         if target == operator.getitem:
             value, key = args
             return self.callback.call_getitem(value, key, meta)
-        elif getattr(target, "__module__", None) in {"_operator", "math"}:
+        elif getattr(target, "__module__", None) in {
+            "_operator",
+            "builtins",
+            "math",
+        }:
             assert callable(target)
             return self.callback.call_sym(target, args, meta)
         elif target in _TORCH_SYM_OPS:
diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py
index 7a0623040f..fdb954010c 100644
--- a/exir/passes/__init__.py
+++ b/exir/passes/__init__.py
@@ -339,7 +339,7 @@ def get_submodule(node: torch.fx.Node) -> torch.fx.GraphModule:
                 self.call(get_submodule(node.args[0]))
                 self.call(get_submodule(node.args[1]))
                 continue
-            elif getattr(target, "__module__", None) == "_operator":
+            elif getattr(target, "__module__", None) in ("builtins", "_operator"):
                 continue
             elif target in to_out_var_skiplist:
                 continue
diff --git a/exir/passes/executorch_prim_ops_registry.py b/exir/passes/executorch_prim_ops_registry.py
index 4af233aaa6..fa1c2e6913 100644
--- a/exir/passes/executorch_prim_ops_registry.py
+++ b/exir/passes/executorch_prim_ops_registry.py
@@ -4,9 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import builtins
 import math
 import operator
-from typing import Dict, Set, Union
+from typing import Any, Dict, Set, Union

 # necessary to ensure the ops are registered
 import torch
@@ -94,12 +95,24 @@ def neg(a: _SymScalar) -> _SymScalar:
     return -a  # pyre-ignore


+@bind_pattern_to_op(executorch_prims_lib, "ceil.Scalar(Scalar a) -> Scalar")
+def ceil(a: _SymScalar) -> _SymScalar:
+    return math.ceil(a)  # pyre-ignore
+
+
+@bind_pattern_to_op(executorch_prims_lib, "round.Scalar(Scalar a) -> Scalar")
+def builtin_round(a: _SymScalar) -> _SymScalar:
+    return round(a)  # pyre-ignore
+
+
 @bind_pattern_to_op(executorch_prims_lib, "trunc.Scalar(Scalar a) -> Scalar")
 def trunc(a: _SymScalar) -> _SymScalar:
     return math.trunc(a)  # pyre-ignore


-_PYTHON_SYM_OPS_TO_EXECUTORCH_SYM_OPS: Dict[OpOverload, OpOverload] = {
+_PYTHON_SYM_OPS_TO_EXECUTORCH_SYM_OPS: Dict[Any, OpOverload] = {
+    builtins.round: ops.backend.executorch_prim.round.Scalar,
+    math.ceil: ops.backend.executorch_prim.ceil.Scalar,
     math.trunc: ops.backend.executorch_prim.trunc.Scalar,
     operator.sub: ops.backend.executorch_prim.sub.Scalar,
     operator.mul: ops.backend.executorch_prim.mul.Scalar,
"//fbandroid/native/fb:fb", @@ -88,6 +91,7 @@ fb_android_cxx_library( soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], deps = [ + ":log_provider_static", "//fbandroid/libraries/fbjni:fbjni", "//fbandroid/native/fb:fb", "//third-party/glog:glog", @@ -101,3 +105,18 @@ fb_android_cxx_library( "//xplat/executorch/extension/threadpool:threadpool_static", ], ) + +runtime.cxx_library( + name = "log_provider", + srcs = ["log.cpp"], + exported_headers = ["log.h"], + compiler_flags = [ + "-frtti", + "-fexceptions", + "-Wno-unused-variable", + ], + deps = [ + "//executorch/runtime/core:core", + ], + visibility = ["@EXECUTORCH_CLIENTS"], +) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 479da28806..ddba8462b9 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -17,6 +17,7 @@ #include "jni_layer_constants.h" +#include #include #include #include @@ -36,76 +37,6 @@ using namespace executorch::extension; using namespace torch::executor; -#ifdef __ANDROID__ -#include -#include -#include - -// Number of entries to store in the in-memory log buffer. -const size_t log_buffer_length = 16; - -struct log_entry { - et_timestamp_t timestamp; - et_pal_log_level_t level; - std::string filename; - std::string function; - size_t line; - std::string message; - - log_entry( - et_timestamp_t timestamp, - et_pal_log_level_t level, - const char* filename, - const char* function, - size_t line, - const char* message, - size_t length) - : timestamp(timestamp), - level(level), - filename(filename), - function(function), - line(line), - message(message, length) {} -}; - -namespace { -std::vector log_buffer_; -std::mutex log_buffer_mutex_; -} // namespace - -// For Android, write to logcat -void et_pal_emit_log_message( - et_timestamp_t timestamp, - et_pal_log_level_t level, - const char* filename, - const char* function, - size_t line, - const char* message, - size_t length) { - std::lock_guard guard(log_buffer_mutex_); - - while (log_buffer_.size() >= log_buffer_length) { - log_buffer_.erase(log_buffer_.begin()); - } - - log_buffer_.emplace_back( - timestamp, level, filename, function, line, message, length); - - int android_log_level = ANDROID_LOG_UNKNOWN; - if (level == 'D') { - android_log_level = ANDROID_LOG_DEBUG; - } else if (level == 'I') { - android_log_level = ANDROID_LOG_INFO; - } else if (level == 'E') { - android_log_level = ANDROID_LOG_ERROR; - } else if (level == 'F') { - android_log_level = ANDROID_LOG_FATAL; - } - - __android_log_print(android_log_level, "ExecuTorch", "%s", message); -} -#endif - namespace executorch::extension { class TensorHybrid : public facebook::jni::HybridClass { public: @@ -437,24 +368,26 @@ class ExecuTorchJni : public facebook::jni::HybridClass { facebook::jni::local_ref> readLogBuffer() { #ifdef __ANDROID__ - std::lock_guard guard(log_buffer_mutex_); - - const auto size = log_buffer_.size(); - facebook::jni::local_ref> ret = - facebook::jni::JArrayClass::newArray(size); - - for (auto i = 0u; i < size; i++) { - const auto& entry = log_buffer_[i]; - // Format the log entry as "[TIMESTAMP FUNCTION FILE:LINE] LEVEL MESSAGE". 
diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp
index 479da28806..ddba8462b9 100644
--- a/extension/android/jni/jni_layer.cpp
+++ b/extension/android/jni/jni_layer.cpp
@@ -17,6 +17,7 @@
 #include "jni_layer_constants.h"
+#include <executorch/extension/android/jni/log.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/inputs.h>
 #include <executorch/runtime/platform/log.h>
@@ -36,76 +37,6 @@
 using namespace executorch::extension;
 using namespace torch::executor;

-#ifdef __ANDROID__
-#include <android/log.h>
-#include <mutex>
-#include <sstream>
-
-// Number of entries to store in the in-memory log buffer.
-const size_t log_buffer_length = 16;
-
-struct log_entry {
-  et_timestamp_t timestamp;
-  et_pal_log_level_t level;
-  std::string filename;
-  std::string function;
-  size_t line;
-  std::string message;
-
-  log_entry(
-      et_timestamp_t timestamp,
-      et_pal_log_level_t level,
-      const char* filename,
-      const char* function,
-      size_t line,
-      const char* message,
-      size_t length)
-      : timestamp(timestamp),
-        level(level),
-        filename(filename),
-        function(function),
-        line(line),
-        message(message, length) {}
-};
-
-namespace {
-std::vector<log_entry> log_buffer_;
-std::mutex log_buffer_mutex_;
-} // namespace
-
-// For Android, write to logcat
-void et_pal_emit_log_message(
-    et_timestamp_t timestamp,
-    et_pal_log_level_t level,
-    const char* filename,
-    const char* function,
-    size_t line,
-    const char* message,
-    size_t length) {
-  std::lock_guard<std::mutex> guard(log_buffer_mutex_);
-
-  while (log_buffer_.size() >= log_buffer_length) {
-    log_buffer_.erase(log_buffer_.begin());
-  }
-
-  log_buffer_.emplace_back(
-      timestamp, level, filename, function, line, message, length);
-
-  int android_log_level = ANDROID_LOG_UNKNOWN;
-  if (level == 'D') {
-    android_log_level = ANDROID_LOG_DEBUG;
-  } else if (level == 'I') {
-    android_log_level = ANDROID_LOG_INFO;
-  } else if (level == 'E') {
-    android_log_level = ANDROID_LOG_ERROR;
-  } else if (level == 'F') {
-    android_log_level = ANDROID_LOG_FATAL;
-  }
-
-  __android_log_print(android_log_level, "ExecuTorch", "%s", message);
-}
-#endif
-
 namespace executorch::extension {
 class TensorHybrid : public facebook::jni::HybridClass<TensorHybrid> {
  public:
@@ -437,24 +368,26 @@ class ExecuTorchJni : public facebook::jni::HybridClass<ExecuTorchJni> {

   facebook::jni::local_ref<facebook::jni::JArrayClass<jstring>> readLogBuffer() {
 #ifdef __ANDROID__
-    std::lock_guard<std::mutex> guard(log_buffer_mutex_);
-
-    const auto size = log_buffer_.size();
-    facebook::jni::local_ref<facebook::jni::JArrayClass<jstring>> ret =
-        facebook::jni::JArrayClass<jstring>::newArray(size);
-
-    for (auto i = 0u; i < size; i++) {
-      const auto& entry = log_buffer_[i];
-      // Format the log entry as "[TIMESTAMP FUNCTION FILE:LINE] LEVEL MESSAGE".
-      std::stringstream ss;
-      ss << "[" << entry.timestamp << " " << entry.function << " "
-         << entry.filename << ":" << entry.line << "] "
-         << static_cast<char>(entry.level) << " " << entry.message;
-
-      facebook::jni::local_ref<jstring> jstr_message =
-          facebook::jni::make_jstring(ss.str().c_str());
-      (*ret)[i] = jstr_message;
-    }
+
+    facebook::jni::local_ref<facebook::jni::JArrayClass<jstring>> ret;
+
+    access_log_buffer([&](std::vector<log_entry>& buffer) {
+      const auto size = buffer.size();
+      ret = facebook::jni::JArrayClass<jstring>::newArray(size);
+      for (auto i = 0u; i < size; i++) {
+        const auto& entry = buffer[i];
+        // Format the log entry as "[TIMESTAMP FUNCTION FILE:LINE] LEVEL
+        // MESSAGE".
+        std::stringstream ss;
+        ss << "[" << entry.timestamp << " " << entry.function << " "
+           << entry.filename << ":" << entry.line << "] "
+           << static_cast<char>(entry.level) << " " << entry.message;
+
+        facebook::jni::local_ref<jstring> jstr_message =
+            facebook::jni::make_jstring(ss.str().c_str());
+        (*ret)[i] = jstr_message;
+      }
+    });

     return ret;
 #else
@@ -468,10 +401,7 @@ class ExecuTorchJni : public facebook::jni::HybridClass<ExecuTorchJni> {
         makeNativeMethod("forward", ExecuTorchJni::forward),
         makeNativeMethod("execute", ExecuTorchJni::execute),
         makeNativeMethod("loadMethod", ExecuTorchJni::load_method),
-
-#ifdef __ANDROID__
         makeNativeMethod("readLogBuffer", ExecuTorchJni::readLogBuffer),
-#endif
     });
   }
 };
diff --git a/extension/android/jni/log.cpp b/extension/android/jni/log.cpp
new file mode 100644
index 0000000000..663198e127
--- /dev/null
+++ b/extension/android/jni/log.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "log.h"
+
+#ifdef __ANDROID__
+
+#include <android/log.h>
+#include <mutex>
+#include <sstream>
+#include <vector>
+
+using executorch::extension::log_entry;
+
+// Number of entries to store in the in-memory log buffer.
+const size_t log_buffer_length = 16;
+
+namespace {
+std::vector<log_entry> log_buffer_;
+std::mutex log_buffer_mutex_;
+} // namespace
+
+// For Android, write to logcat
+void et_pal_emit_log_message(
+    et_timestamp_t timestamp,
+    et_pal_log_level_t level,
+    const char* filename,
+    const char* function,
+    size_t line,
+    const char* message,
+    size_t length) {
+  std::lock_guard<std::mutex> guard(log_buffer_mutex_);
+
+  while (log_buffer_.size() >= log_buffer_length) {
+    log_buffer_.erase(log_buffer_.begin());
+  }
+
+  log_buffer_.emplace_back(
+      timestamp, level, filename, function, line, message, length);
+
+  int android_log_level = ANDROID_LOG_UNKNOWN;
+  if (level == 'D') {
+    android_log_level = ANDROID_LOG_DEBUG;
+  } else if (level == 'I') {
+    android_log_level = ANDROID_LOG_INFO;
+  } else if (level == 'E') {
+    android_log_level = ANDROID_LOG_ERROR;
+  } else if (level == 'F') {
+    android_log_level = ANDROID_LOG_FATAL;
+  }
+
+  __android_log_print(android_log_level, "ExecuTorch", "%s", message);
+}
+
+namespace executorch::extension {
+
+void access_log_buffer(std::function<void(std::vector<log_entry>&)> accessor) {
+  std::lock_guard<std::mutex> guard(log_buffer_mutex_);
+  accessor(log_buffer_);
+}
+
+} // namespace executorch::extension
+
+#endif
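`access_log_buffer` runs the caller's functor while holding the buffer mutex, so the `std::vector<log_entry>&` argument is only valid inside the callback; callers that need the entries afterwards must copy them out, as `readLogBuffer` above does into a Java string array. A hypothetical native caller (not part of the patch; the header path is assumed) would look like:

```cpp
// Hypothetical caller of the new API (not part of the patch): snapshot the
// log ring buffer into owned strings while the mutex is held.
#include <string>
#include <vector>

#include <executorch/extension/android/jni/log.h>

std::vector<std::string> snapshot_log_messages() {
  std::vector<std::string> messages;
  executorch::extension::access_log_buffer(
      [&](std::vector<executorch::extension::log_entry>& buffer) {
        messages.reserve(buffer.size());
        for (const auto& entry : buffer) {
          // Copy; `buffer` must not be retained past the callback.
          messages.push_back(entry.message);
        }
      });
  return messages;
}
```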
diff --git a/extension/android/jni/log.h b/extension/android/jni/log.h
new file mode 100644
index 0000000000..4389b1d61a
--- /dev/null
+++ b/extension/android/jni/log.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/runtime/platform/platform.h>
+
+namespace executorch::extension {
+struct log_entry {
+  et_timestamp_t timestamp;
+  et_pal_log_level_t level;
+  std::string filename;
+  std::string function;
+  size_t line;
+  std::string message;
+
+  log_entry(
+      et_timestamp_t timestamp,
+      et_pal_log_level_t level,
+      const char* filename,
+      const char* function,
+      size_t line,
+      const char* message,
+      size_t length)
+      : timestamp(timestamp),
+        level(level),
+        filename(filename),
+        function(function),
+        line(line),
+        message(message, length) {}
+};
+
+void access_log_buffer(std::function<void(std::vector<log_entry>&)> accessor);
+} // namespace executorch::extension
diff --git a/kernels/prim_ops/register_prim_ops.cpp b/kernels/prim_ops/register_prim_ops.cpp
index 5755ab8d66..38901bb840 100644
--- a/kernels/prim_ops/register_prim_ops.cpp
+++ b/kernels/prim_ops/register_prim_ops.cpp
@@ -303,6 +303,51 @@ static Kernel prim_ops[] = {
           }
         }),

+    // ceil.Scalar(Scalar a) -> Scalar
+    Kernel(
+        "executorch_prim::ceil.Scalar",
+        [](KernelRuntimeContext& context, EValue** stack) {
+          (void)context;
+          EValue& a = *stack[0];
+          EValue& out = *stack[1];
+          if (a.isDouble()) {
+            out = EValue(static_cast<int64_t>(ceil(a.toDouble())));
+          } else {
+            ET_CHECK_MSG(false, "Unsupported DType %zu", (size_t)a.tag);
+          }
+        }),
+
+    // round.Scalar(Scalar a) -> Scalar
+    Kernel(
+        "executorch_prim::round.Scalar",
+        [](KernelRuntimeContext& context, EValue** stack) {
+          (void)context;
+          EValue& a = *stack[0];
+          EValue& out = *stack[1];
+          if (a.isDouble()) {
+            // Round half to even to match Python round(). An explicit
+            // implementation is needed because not all platforms support
+            // fenv rounding modes. See
+            // https://codeyarns.com/tech/2018-08-17-how-to-round-half-to-even.html
+            const auto val = a.toDouble();
+            const auto r = round(val);
+            const auto d = r - val;
+            auto res = 0.0;
+
+            if (std::abs(d) != 0.5) {
+              res = r;
+            } else if (fmod(r, 2.0) == 0.0) {
+              res = r;
+            } else {
+              res = val - d;
+            }
+
+            out = EValue(static_cast<int64_t>(res));
+          } else {
+            ET_CHECK_MSG(false, "Unsupported DType %zu", (size_t)a.tag);
+          }
+        }),
+
     // trunc.Scalar(Scalar a) -> Scalar
     Kernel(
         "executorch_prim::trunc.Scalar",
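The halfway handling in `round.Scalar` is easiest to see with the algorithm lifted out of the kernel. The sketch below (not part of the patch) applies the same three cases: non-halfway values defer to `round()`, halfway values keep the away-from-zero result when it is even, and otherwise step back to the even neighbor:

```cpp
// Same round-half-to-even algorithm as the round.Scalar kernel above,
// extracted for illustration (not part of the patch).
#include <cassert>
#include <cmath>

double round_half_to_even(double val) {
  const double r = std::round(val); // halfway cases round away from zero
  const double d = r - val;
  if (std::abs(d) != 0.5) {
    return r; // not a .5 case: std::round already matches Python
  }
  if (std::fmod(r, 2.0) == 0.0) {
    return r; // away-from-zero landed on an even integer
  }
  return val - d; // odd: the even neighbor is one step back toward zero
}

int main() {
  assert(round_half_to_even(0.5) == 0.0); // Python round(0.5) == 0
  assert(round_half_to_even(1.5) == 2.0); // Python round(1.5) == 2
  assert(round_half_to_even(-1.5) == -2.0); // Python round(-1.5) == -2
  assert(round_half_to_even(9.999999) == 10.0);
}
```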
diff --git a/kernels/prim_ops/test/prim_ops_test.cpp b/kernels/prim_ops/test/prim_ops_test.cpp
index 3581a470da..ab6bd28e6c 100644
--- a/kernels/prim_ops/test/prim_ops_test.cpp
+++ b/kernels/prim_ops/test/prim_ops_test.cpp
@@ -503,6 +503,47 @@ TEST_F(RegisterPrimOpsTest, TestETViewEmpty) {
       getOpsFn("executorch_prim::et_view.default")(context, bad_stack), "");
 }

+TEST_F(RegisterPrimOpsTest, TestCeil) {
+  std::array<double, 10> inputs = {
+      0.0, 0.25, 0.5, 0.75, 1.0, 1.75, -0.5, -1.0, -1.5, 9.999999};
+  std::array<int64_t, 10> expected = {0, 1, 1, 1, 1, 2, 0, -1, -1, 10};
+
+  for (auto i = 0; i < inputs.size(); i++) {
+    EValue values[2];
+    values[0] = EValue(inputs[i]);
+    values[1] = EValue(0.0);
+
+    EValue* stack[2];
+    for (size_t j = 0; j < 2; j++) {
+      stack[j] = &values[j];
+    }
+
+    getOpsFn("executorch_prim::ceil.Scalar")(context, stack);
+    EXPECT_EQ(stack[1]->toInt(), expected[i]);
+  }
+}
+
+TEST_F(RegisterPrimOpsTest, TestRound) {
+  // Note that Python uses round-to-even for halfway values.
+  std::array<double, 10> inputs = {
+      0.0, 0.25, 0.5, 0.75, 1.0, 1.5, -0.5, -1.0, -1.5, 9.999999};
+  std::array<int64_t, 10> expected = {0, 0, 0, 1, 1, 2, 0, -1, -2, 10};
+
+  for (auto i = 0; i < inputs.size(); i++) {
+    EValue values[2];
+    values[0] = EValue(inputs[i]);
+    values[1] = EValue(0.0);
+
+    EValue* stack[2];
+    for (size_t j = 0; j < 2; j++) {
+      stack[j] = &values[j];
+    }
+
+    getOpsFn("executorch_prim::round.Scalar")(context, stack);
+    EXPECT_EQ(stack[1]->toInt(), expected[i]);
+  }
+}
+
 TEST_F(RegisterPrimOpsTest, TestTrunc) {
   std::array<double, 10> inputs = {
       0.0, 0.25, 0.5, 0.75, 1.0, 1.75, -0.5, -1.0, -1.5, 9.999999};