diff --git a/.gitignore b/.gitignore index 524a12d066c4d..80bdd67619454 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ .DS_Store .idea/* .vscode/* +.zed/* *.heapsnapshot .cache # Buildkite: Ignore the entire .buildkite directory diff --git a/Make.inc b/Make.inc index 0da638cfab52e..f078a0c84f806 100644 --- a/Make.inc +++ b/Make.inc @@ -1095,20 +1095,13 @@ LIBUNWIND:= else ifneq ($(DISABLE_LIBUNWIND), 0) LIBUNWIND:= else -ifeq ($(USE_SYSTEM_LIBUNWIND), 1) -ifneq ($(OS),Darwin) LIBUNWIND:=-lunwind -# Only for linux since we want to use not yet released libunwind features -JCFLAGS+=-DSYSTEM_LIBUNWIND -JCPPFLAGS+=-DSYSTEM_LIBUNWIND -endif -else ifneq ($(findstring $(OS),Darwin OpenBSD),) -LIBUNWIND:=-lunwind JCPPFLAGS+=-DLLVMLIBUNWIND -else -LIBUNWIND:=-lunwind -endif +else ifeq ($(USE_SYSTEM_LIBUNWIND), 1) +# Only for linux and freebsd since we want to use not yet released gnu libunwind features +JCFLAGS+=-DSYSTEM_LIBUNWIND +JCPPFLAGS+=-DSYSTEM_LIBUNWIND endif endif diff --git a/Makefile b/Makefile index 735d342a79eb5..4fd8b878c5d1f 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,7 @@ julia-deps: | $(DIRS) $(build_datarootdir)/julia/base $(build_datarootdir)/julia julia-stdlib: | $(DIRS) julia-deps @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/stdlib -julia-base: julia-deps $(build_sysconfdir)/julia/startup.jl $(build_man1dir)/julia.1 $(build_datarootdir)/julia/julia-config.jl +julia-base: julia-deps $(build_sysconfdir)/julia/startup.jl $(build_man1dir)/julia.1 $(build_datarootdir)/julia/julia-config.jl $(build_datarootdir)/julia/juliac.jl $(build_datarootdir)/julia/juliac-buildscript.jl @$(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/base julia-libccalltest: julia-deps @@ -181,7 +181,7 @@ $(build_sysconfdir)/julia/startup.jl: $(JULIAHOME)/etc/startup.jl | $(build_sysc @echo Creating usr/etc/julia/startup.jl @cp $< $@ -$(build_datarootdir)/julia/julia-config.jl: $(JULIAHOME)/contrib/julia-config.jl | $(build_datarootdir)/julia +$(build_datarootdir)/julia/%: $(JULIAHOME)/contrib/% | $(build_datarootdir)/julia $(INSTALL_M) $< $(dir $@) $(build_depsbindir)/stringreplace: $(JULIAHOME)/contrib/stringreplace.c | $(build_depsbindir) @@ -410,7 +410,7 @@ endif $(INSTALL_F) $(JULIAHOME)/contrib/julia.appdata.xml $(DESTDIR)$(datarootdir)/metainfo/ # Install terminal info database ifneq ($(WITH_TERMINFO),0) - cp -R -L $(build_datarootdir)/terminfo $(DESTDIR)$(datarootdir) + cp -R -L $(build_datarootdir)/julia/terminfo $(DESTDIR)$(datarootdir)/julia/ endif # Update RPATH entries and JL_SYSTEM_IMAGE_PATH if $(private_libdir_rel) != $(build_private_libdir_rel) diff --git a/NEWS.md b/NEWS.md index b5caaf5376fb5..bb22c9f940a78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ Julia v1.12 Release Notes New language features --------------------- +- New option `--trim` for building "trimmed" binaries, where code not provably reachable from entry points + is removed. Entry points can be marked using `Base.Experimental.entrypoint` ([#55047]). - A new keyword argument `usings::Bool` has been added to `names`. By using this, we can now find all the names available in module `A` by `names(A; all=true, imported=true, usings=true)`. ([#54609]) - the `@atomic(...)` macro family supports now the reference assignment syntax, e.g. @@ -35,6 +37,10 @@ Language changes expression within a given `:toplevel` expression to make use of macros defined earlier in the same `:toplevel` expression. ([#53515]) + - Trivial infinite loops (like `while true; end`) are no longer undefined + behavior. 
Infinite loops that actually do things (e.g. have side effects + or sleep) were never and are still not undefined behavior. ([#52999]) + Compiler/Runtime improvements ----------------------------- @@ -56,6 +62,8 @@ variables. ([#53742]). * `--project=@temp` starts Julia with a temporary environment. * New `--trace-compile-timing` option to report how long each method reported by `--trace-compile` took to compile, in ms. ([#54662]) +* `--trace-compile` now prints recompiled methods in yellow or with a trailing comment if color is not supported ([#55763]) +* New `--trace-dispatch` option to report methods that are dynamically dispatched ([#55848]). Multi-threading changes ----------------------- @@ -128,6 +136,8 @@ Standard library changes between different eigendecomposition algorithms ([#49355]). * Added a generic version of the (unblocked) pivoted Cholesky decomposition (callable via `cholesky[!](A, RowMaximum())`) ([#54619]). +* The number of default BLAS threads now respects process affinity, instead of + using total number of logical threads available on the system ([#55574]). #### Logging @@ -138,6 +148,9 @@ Standard library changes * `Profile.take_heap_snapshot` takes a new keyword argument, `redact_data::Bool`, that is `true` by default. When set, the contents of Julia objects are not emitted in the heap snapshot. This currently only applies to strings. ([#55326]) +* `Profile.print()` now colors Base/Core/Package modules similarly to how they are in stacktraces. + Also paths, even if truncated, are now clickable in terminals that support URI links + to take you to the specified `JULIA_EDITOR` for the given file & line number. ([#55335]) #### Random @@ -169,6 +182,10 @@ Standard library changes #### InteractiveUtils +* New macros `@trace_compile` and `@trace_dispatch` for running an expression with + `--trace-compile=stderr --trace-compile-timing` and `--trace-dispatch=stderr` respectively enabled. + ([#55915]) + Deprecated or removed --------------------- diff --git a/README.md b/README.md index bd9e9b9c0bd02..465adcf049922 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ and then use the command prompt to change into the resulting julia directory. By Julia. However, most users should use the [most recent stable version](https://github.com/JuliaLang/julia/releases) of Julia. You can get this version by running: - git checkout v1.10.4 + git checkout v1.10.5 To build the `julia` executable, run `make` from within the julia directory. 
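A quick usage sketch may help connect the tracing additions described in the NEWS.md hunk above. The flag and macro names (`--trace-dispatch`, `--trace-compile-timing`, `@trace_compile`, `@trace_dispatch`) are taken directly from those entries; everything else below is illustrative and assumes a Julia build that already includes these changes.

```julia
using InteractiveUtils

# Per-expression equivalent of starting julia with
# `--trace-compile=stderr --trace-compile-timing` (per the NEWS entry above):
@trace_compile sum(rand(10))

# Per-expression equivalent of `--trace-dispatch=stderr`, reporting methods
# reached via dynamic dispatch while the expression runs; the Any-typed array
# is just a convenient way to force some dynamic dispatch:
@trace_dispatch map(sin, Any[1, 2.0, 3f0])
```

On the command line, the same information can be gathered process-wide, e.g. `julia --trace-dispatch=stderr script.jl` (assuming a hypothetical `script.jl`), mirroring how `--trace-compile` is already used.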
diff --git a/base/Base.jl b/base/Base.jl index 10a8dd1532f92..84e10ca788ba2 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -306,7 +306,6 @@ end include("hashing.jl") include("rounding.jl") include("div.jl") -include("rawbigints.jl") include("float.jl") include("twiceprecision.jl") include("complex.jl") @@ -649,7 +648,7 @@ function __init__() empty!(explicit_loaded_modules) empty!(loaded_precompiles) # If we load a packageimage when building the image this might not be empty for (mod, key) in module_keys - loaded_precompiles[key => module_build_id(mod)] = mod + push!(get!(Vector{Module}, loaded_precompiles, key), mod) end if haskey(ENV, "JULIA_MAX_NUM_PRECOMPILE_FILES") MAX_NUM_PRECOMPILE_FILES[] = parse(Int, ENV["JULIA_MAX_NUM_PRECOMPILE_FILES"]) diff --git a/base/abstractarray.jl b/base/abstractarray.jl index 754ab20660ab8..e877a87c2cdd1 100644 --- a/base/abstractarray.jl +++ b/base/abstractarray.jl @@ -1101,11 +1101,8 @@ function copyto_unaliased!(deststyle::IndexStyle, dest::AbstractArray, srcstyle: end else # Dual-iterator implementation - ret = iterate(iterdest) - @inbounds for a in src - idx, state = ret::NTuple{2,Any} - dest[idx] = a - ret = iterate(iterdest, state) + for (Idest, Isrc) in zip(iterdest, itersrc) + @inbounds dest[Idest] = src[Isrc] end end end diff --git a/base/abstractdict.jl b/base/abstractdict.jl index 62a5b3ee9e1b0..85a726b4cdbf4 100644 --- a/base/abstractdict.jl +++ b/base/abstractdict.jl @@ -392,6 +392,10 @@ Dict{String, Float64} with 3 entries: julia> ans == mergewith(+)(a, b) true + +julia> mergewith(-, Dict(), Dict(:a=>1)) # Combining function only used if key is present in both +Dict{Any, Any} with 1 entry: + :a => 1 ``` """ mergewith(combine, d::AbstractDict, others::AbstractDict...) = diff --git a/base/array.jl b/base/array.jl index 648fedd5036e1..5b3e6cc398479 100644 --- a/base/array.jl +++ b/base/array.jl @@ -415,7 +415,7 @@ function fill!(a::Union{Array{UInt8}, Array{Int8}}, x::Integer) ref = a.ref t = @_gc_preserve_begin ref p = unsafe_convert(Ptr{Cvoid}, ref) - memset(p, x isa eltype(a) ? x : convert(eltype(a), x), length(a)) + memset(p, x isa eltype(a) ? 
x : convert(eltype(a), x), length(a) % UInt) @_gc_preserve_end t return a end diff --git a/base/char.jl b/base/char.jl index bc68a672ce0ca..2e8410f6903e2 100644 --- a/base/char.jl +++ b/base/char.jl @@ -223,6 +223,7 @@ hash(x::Char, h::UInt) = hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8 +first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char) # fallbacks: isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y)) diff --git a/base/client.jl b/base/client.jl index 0290d27b09cf0..a04556507d5dc 100644 --- a/base/client.jl +++ b/base/client.jl @@ -41,7 +41,6 @@ function repl_cmd(cmd, out) if isempty(cmd.exec) throw(ArgumentError("no cmd to execute")) elseif cmd.exec[1] == "cd" - new_oldpwd = pwd() if length(cmd.exec) > 2 throw(ArgumentError("cd method only takes one argument")) elseif length(cmd.exec) == 2 @@ -52,11 +51,17 @@ function repl_cmd(cmd, out) end dir = ENV["OLDPWD"] end - cd(dir) else - cd() + dir = homedir() end - ENV["OLDPWD"] = new_oldpwd + try + ENV["OLDPWD"] = pwd() + catch ex + ex isa IOError || rethrow() + # if current dir has been deleted, then pwd() will throw an IOError: pwd(): no such file or directory (ENOENT) + delete!(ENV, "OLDPWD") + end + cd(dir) println(out, pwd()) else @static if !Sys.iswindows() @@ -292,12 +297,12 @@ function exec_options(opts) invokelatest(show, Core.eval(Main, parse_input_line(arg))) println() elseif cmd == 'm' - @eval Main import $(Symbol(arg)).main + entrypoint = push!(split(arg, "."), "main") + Base.eval(Main, Expr(:import, Expr(:., Symbol.(entrypoint)...))) if !should_use_main_entrypoint() error("`main` in `$arg` not declared as entry point (use `@main` to do so)") end return false - elseif cmd == 'L' # load file immediately on all processors if !distributed_mode diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl index f3fc4e0423173..c8a25be422637 100644 --- a/base/compiler/abstractinterpretation.jl +++ b/base/compiler/abstractinterpretation.jl @@ -42,230 +42,215 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f), arginfo::ArgInfo, si::StmtInfo, @nospecialize(atype), sv::AbsIntState, max_methods::Int) 𝕃ₚ, 𝕃ᵢ = ipo_lattice(interp), typeinf_lattice(interp) - ⊑ₚ, ⊔ₚ, ⊔ᵢ = partialorder(𝕃ₚ), join(𝕃ₚ), join(𝕃ᵢ) + ⊑ₚ, ⋤ₚ, ⊔ₚ, ⊔ᵢ = partialorder(𝕃ₚ), strictneqpartialorder(𝕃ₚ), join(𝕃ₚ), join(𝕃ᵢ) argtypes = arginfo.argtypes matches = find_method_matches(interp, argtypes, atype; max_methods) if isa(matches, FailedMethodMatch) add_remark!(interp, sv, matches.reason) - return CallMeta(Any, Any, Effects(), NoCallInfo()) + return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) end (; valid_worlds, applicable, info) = matches update_valid_age!(sv, valid_worlds) - napplicable = length(applicable) + + # final result + gfresult = Future{CallMeta}() + # intermediate work for computing gfresult rettype = exctype = Bottom edges = MethodInstance[] conditionals = nothing # keeps refinement information of call argument types when the return type is boolean - seen = 0 # number of signatures actually inferred + seenall = true const_results = nothing # or const_results::Vector{Union{Nothing,ConstResult}} if any const results are available - multiple_matches = napplicable > 1 fargs = arginfo.fargs all_effects = EFFECTS_TOTAL slotrefinements = nothing # keeps refinement information on slot types obtained from call signature - for i in 1:napplicable - match = 
applicable[i]::MethodMatch - method = match.method - sig = match.spec_types - if bail_out_toplevel_call(interp, InferenceLoopState(sig, rettype, all_effects), sv) - # only infer concrete call sites in top-level expressions - add_remark!(interp, sv, "Refusing to infer non-concrete call site in top-level expression") - break - end - this_rt = Bottom - this_exct = Bottom - splitunions = false - # TODO: this used to trigger a bug in inference recursion detection, and is unmaintained now - # sigtuple = unwrap_unionall(sig)::DataType - # splitunions = 1 < unionsplitcost(sigtuple.parameters) * napplicable <= InferenceParams(interp).max_union_splitting - if splitunions - splitsigs = switchtupleunion(sig) - for sig_n in splitsigs - result = abstract_call_method(interp, method, sig_n, svec(), multiple_matches, si, sv) - (; rt, exct, edge, effects, volatile_inf_result) = result + # split the for loop off into a function, so that we can pause and restart it at will + i::Int = 1 + f = Core.Box(f) + atype = Core.Box(atype) + function infercalls(interp, sv) + napplicable = length(applicable) + multiple_matches = napplicable > 1 + while i <= napplicable + match = applicable[i]::MethodMatch + method = match.method + sig = match.spec_types + if bail_out_toplevel_call(interp, InferenceLoopState(sig, rettype, all_effects), sv) + # only infer concrete call sites in top-level expressions + add_remark!(interp, sv, "Refusing to infer non-concrete call site in top-level expression") + seenall = false + break + end + # TODO: this is unmaintained now as it didn't seem to improve things, though it does avoid hard-coding the union split at the higher level, + # it also can hurt infer-ability of some constrained parameter types (e.g. quacks like a duck) + # sigtuple = unwrap_unionall(sig)::DataType + # splitunions = 1 < unionsplitcost(sigtuple.parameters) * napplicable <= InferenceParams(interp).max_union_splitting + #if splitunions + # splitsigs = switchtupleunion(sig) + # for sig_n in splitsigs + # result = abstract_call_method(interp, method, sig_n, svec(), multiple_matches, si, sv)::Future + # handle1(...) + # end + #end + mresult = abstract_call_method(interp, method, sig, match.sparams, multiple_matches, si, sv)::Future + function handle1(interp, sv) + local (; rt, exct, edge, effects, volatile_inf_result) = mresult[] + this_conditional = ignorelimited(rt) + this_rt = widenwrappedconditional(rt) + this_exct = exct + # try constant propagation with argtypes for this match + # this is in preparation for inlining, or improving the return result this_argtypes = isa(matches, MethodMatches) ? argtypes : matches.applicable_argtypes[i] this_arginfo = ArgInfo(fargs, this_argtypes) const_call_result = abstract_call_method_with_const_args(interp, - result, f, this_arginfo, si, match, sv) + mresult[], f.contents, this_arginfo, si, match, sv) const_result = volatile_inf_result if const_call_result !== nothing - if const_call_result.rt ⊑ₚ rt - rt = const_call_result.rt + this_const_conditional = ignorelimited(const_call_result.rt) + this_const_rt = widenwrappedconditional(const_call_result.rt) + if this_const_rt ⊑ₚ this_rt + # As long as the const-prop result we have is not *worse* than + # what we found out on types, we'd like to use it. Even if the + # end result is exactly equivalent, it is likely that the IR + # we produced while constproping is better than that with + # generic types. + # Return type of const-prop' inference can be wider than that of non const-prop' inference + # e.g. 
in cases when there are cycles but cached result is still accurate + this_conditional = this_const_conditional + this_rt = this_const_rt (; effects, const_result, edge) = const_call_result elseif is_better_effects(const_call_result.effects, effects) (; effects, const_result, edge) = const_call_result else add_remark!(interp, sv, "[constprop] Discarded because the result was wider than inference") end - if !(exct ⊑ₚ const_call_result.exct) - exct = const_call_result.exct + # Treat the exception type separately. Currently, constprop often cannot determine the exception type + # because consistent-cy does not apply to exceptions. + if const_call_result.exct ⋤ this_exct + this_exct = const_call_result.exct (; const_result, edge) = const_call_result else add_remark!(interp, sv, "[constprop] Discarded exception type because result was wider than inference") end end + all_effects = merge_effects(all_effects, effects) if const_result !== nothing if const_results === nothing - const_results = fill!(Vector{Union{Nothing,ConstResult}}(undef, #=TODO=#napplicable), nothing) + const_results = fill!(Vector{Union{Nothing,ConstResult}}(undef, napplicable), nothing) end const_results[i] = const_result end edge === nothing || push!(edges, edge) - this_rt = this_rt ⊔ₚ rt - this_exct = this_exct ⊔ₚ exct - if bail_out_call(interp, this_rt, sv) - break + @assert !(this_conditional isa Conditional || this_rt isa MustAlias) "invalid lattice element returned from inter-procedural context" + if can_propagate_conditional(this_conditional, argtypes) + # The only case where we need to keep this in rt is where + # we can directly propagate the conditional to a slot argument + # that is not one of our arguments, otherwise we keep all the + # relevant information in `conditionals` below. + this_rt = this_conditional end - end - this_conditional = ignorelimited(this_rt) - this_rt = widenwrappedconditional(this_rt) - else - result = abstract_call_method(interp, method, sig, match.sparams, multiple_matches, si, sv) - (; rt, exct, edge, effects, volatile_inf_result) = result - this_conditional = ignorelimited(rt) - this_rt = widenwrappedconditional(rt) - this_exct = exct - # try constant propagation with argtypes for this match - # this is in preparation for inlining, or improving the return result - this_argtypes = isa(matches, MethodMatches) ? argtypes : matches.applicable_argtypes[i] - this_arginfo = ArgInfo(fargs, this_argtypes) - const_call_result = abstract_call_method_with_const_args(interp, - result, f, this_arginfo, si, match, sv) - const_result = volatile_inf_result - if const_call_result !== nothing - this_const_conditional = ignorelimited(const_call_result.rt) - this_const_rt = widenwrappedconditional(const_call_result.rt) - if this_const_rt ⊑ₚ this_rt - # As long as the const-prop result we have is not *worse* than - # what we found out on types, we'd like to use it. Even if the - # end result is exactly equivalent, it is likely that the IR - # we produced while constproping is better than that with - # generic types. - # Return type of const-prop' inference can be wider than that of non const-prop' inference - # e.g. 
in cases when there are cycles but cached result is still accurate - this_conditional = this_const_conditional - this_rt = this_const_rt - (; effects, const_result, edge) = const_call_result - elseif is_better_effects(const_call_result.effects, effects) - (; effects, const_result, edge) = const_call_result - else - add_remark!(interp, sv, "[constprop] Discarded because the result was wider than inference") + + rettype = rettype ⊔ₚ this_rt + exctype = exctype ⊔ₚ this_exct + if has_conditional(𝕃ₚ, sv) && this_conditional !== Bottom && is_lattice_bool(𝕃ₚ, rettype) && fargs !== nothing + if conditionals === nothing + conditionals = Any[Bottom for _ in 1:length(argtypes)], + Any[Bottom for _ in 1:length(argtypes)] + end + for i = 1:length(argtypes) + cnd = conditional_argtype(𝕃ᵢ, this_conditional, sig, argtypes, i) + conditionals[1][i] = conditionals[1][i] ⊔ᵢ cnd.thentype + conditionals[2][i] = conditionals[2][i] ⊔ᵢ cnd.elsetype + end end - # Treat the exception type separately. Currently, constprop often cannot determine the exception type - # because consistent-cy does not apply to exceptions. - if !(this_exct ⊑ₚ const_call_result.exct) - this_exct = const_call_result.exct - (; const_result, edge) = const_call_result - else - add_remark!(interp, sv, "[constprop] Discarded exception type because result was wider than inference") + if i < napplicable && bail_out_call(interp, InferenceLoopState(sig, rettype, all_effects), sv) + add_remark!(interp, sv, "Call inference reached maximally imprecise information. Bailing on.") + seenall = false + i = napplicable # break in outer function end + i += 1 + return true end - all_effects = merge_effects(all_effects, effects) - if const_result !== nothing - if const_results === nothing - const_results = fill!(Vector{Union{Nothing,ConstResult}}(undef, napplicable), nothing) - end - const_results[i] = const_result + if isready(mresult) && handle1(interp, sv) + continue + else + push!(sv.tasks, handle1) + return false end - edge === nothing || push!(edges, edge) - end - @assert !(this_conditional isa Conditional || this_rt isa MustAlias) "invalid lattice element returned from inter-procedural context" - seen += 1 + end # while - if can_propagate_conditional(this_conditional, argtypes) - # The only case where we need to keep this in rt is where - # we can directly propagate the conditional to a slot argument - # that is not one of our arguments, otherwise we keep all the - # relevant information in `conditionals` below. - this_rt = this_conditional + if const_results !== nothing + @assert napplicable == nmatches(info) == length(const_results) + info = ConstCallInfo(info, const_results) end - rettype = rettype ⊔ₚ this_rt - exctype = exctype ⊔ₚ this_exct - if has_conditional(𝕃ₚ, sv) && this_conditional !== Bottom && is_lattice_bool(𝕃ₚ, rettype) && fargs !== nothing - if conditionals === nothing - conditionals = Any[Bottom for _ in 1:length(argtypes)], - Any[Bottom for _ in 1:length(argtypes)] + if seenall + if !fully_covering(matches) || any_ambig(matches) + # Account for the fact that we may encounter a MethodError with a non-covered or ambiguous signature. 
+ all_effects = Effects(all_effects; nothrow=false) + exctype = exctype ⊔ₚ MethodError end - for i = 1:length(argtypes) - cnd = conditional_argtype(𝕃ᵢ, this_conditional, sig, argtypes, i) - conditionals[1][i] = conditionals[1][i] ⊔ᵢ cnd.thentype - conditionals[2][i] = conditionals[2][i] ⊔ᵢ cnd.elsetype + if sv isa InferenceState && fargs !== nothing + slotrefinements = collect_slot_refinements(𝕃ᵢ, applicable, argtypes, fargs, sv) end - end - if bail_out_call(interp, InferenceLoopState(sig, rettype, all_effects), sv) - add_remark!(interp, sv, "Call inference reached maximally imprecise information. Bailing on.") - break - end - end - - if const_results !== nothing - @assert napplicable == nmatches(info) == length(const_results) - info = ConstCallInfo(info, const_results) - end - - if seen ≠ napplicable - # there is unanalyzed candidate, widen type and effects to the top - rettype = exctype = Any - all_effects = Effects() - else - if (matches isa MethodMatches ? (!matches.fullmatch || any_ambig(matches)) : - (!all(matches.fullmatches) || any_ambig(matches))) - # Account for the fact that we may encounter a MethodError with a non-covered or ambiguous signature. - all_effects = Effects(all_effects; nothrow=false) - exctype = exctype ⊔ₚ MethodError - end - if sv isa InferenceState && fargs !== nothing - slotrefinements = collect_slot_refinements(𝕃ᵢ, applicable, argtypes, fargs, sv) - end - end - - rettype = from_interprocedural!(interp, rettype, sv, arginfo, conditionals) - - # Also considering inferring the compilation signature for this method, so - # it is available to the compiler in case it ends up needing it. - if (isa(sv, InferenceState) && infer_compilation_signature(interp) && - (1 == seen == napplicable) && rettype !== Any && rettype !== Bottom && - !is_removable_if_unused(all_effects)) - match = applicable[1]::MethodMatch - method = match.method - sig = match.spec_types - mi = specialize_method(match; preexisting=true) - if mi !== nothing && !const_prop_methodinstance_heuristic(interp, mi, arginfo, sv) - csig = get_compileable_sig(method, sig, match.sparams) - if csig !== nothing && csig !== sig - abstract_call_method(interp, method, csig, match.sparams, multiple_matches, StmtInfo(false), sv) + else + # there is unanalyzed candidate, widen type and effects to the top + rettype = exctype = Any + all_effects = Effects() + end + + rettype = from_interprocedural!(interp, rettype, sv, arginfo, conditionals) + + # Also considering inferring the compilation signature for this method, so + # it is available to the compiler in case it ends up needing it. 
+ if (isa(sv, InferenceState) && infer_compilation_signature(interp) && + (seenall && 1 == napplicable) && rettype !== Any && rettype !== Bottom && + !is_removable_if_unused(all_effects)) + match = applicable[1]::MethodMatch + method = match.method + sig = match.spec_types + mi = specialize_method(match; preexisting=true) + if mi !== nothing && !const_prop_methodinstance_heuristic(interp, mi, arginfo, sv) + csig = get_compileable_sig(method, sig, match.sparams) + if csig !== nothing && csig !== sig + abstract_call_method(interp, method, csig, match.sparams, multiple_matches, StmtInfo(false), sv)::Future + end end end - end - if call_result_unused(si) && !(rettype === Bottom) - add_remark!(interp, sv, "Call result type was widened because the return value is unused") - # We're mainly only here because the optimizer might want this code, - # but we ourselves locally don't typically care about it locally - # (beyond checking if it always throws). - # So avoid adding an edge, since we don't want to bother attempting - # to improve our result even if it does change (to always throw), - # and avoid keeping track of a more complex result type. - rettype = Any - end - any_slot_refined = slotrefinements !== nothing - add_call_backedges!(interp, rettype, all_effects, any_slot_refined, edges, matches, atype, sv) - if isa(sv, InferenceState) - # TODO (#48913) implement a proper recursion handling for irinterp: - # This works just because currently the `:terminate` condition guarantees that - # irinterp doesn't fail into unresolved cycles, but it's not a good solution. - # We should revisit this once we have a better story for handling cycles in irinterp. - if !isempty(sv.pclimitations) # remove self, if present - delete!(sv.pclimitations, sv) - for caller in callers_in_cycle(sv) - delete!(sv.pclimitations, caller) + if call_result_unused(si) && !(rettype === Bottom) + add_remark!(interp, sv, "Call result type was widened because the return value is unused") + # We're mainly only here because the optimizer might want this code, + # but we ourselves locally don't typically care about it locally + # (beyond checking if it always throws). + # So avoid adding an edge, since we don't want to bother attempting + # to improve our result even if it does change (to always throw), + # and avoid keeping track of a more complex result type. + rettype = Any + end + any_slot_refined = slotrefinements !== nothing + add_call_backedges!(interp, rettype, all_effects, any_slot_refined, edges, matches, atype.contents, sv) + if isa(sv, InferenceState) + # TODO (#48913) implement a proper recursion handling for irinterp: + # This works just because currently the `:terminate` condition guarantees that + # irinterp doesn't fail into unresolved cycles, but it's not a good solution. + # We should revisit this once we have a better story for handling cycles in irinterp. 
+ if !isempty(sv.pclimitations) # remove self, if present + delete!(sv.pclimitations, sv) + for caller in callers_in_cycle(sv) + delete!(sv.pclimitations, caller) + end end end - end - return CallMeta(rettype, exctype, all_effects, info, slotrefinements) + gfresult[] = CallMeta(rettype, exctype, all_effects, info, slotrefinements) + return true + end # infercalls + # start making progress on the first call + infercalls(interp, sv) || push!(sv.tasks, infercalls) + return gfresult end struct FailedMethodMatch @@ -276,21 +261,47 @@ struct MethodMatches applicable::Vector{Any} info::MethodMatchInfo valid_worlds::WorldRange - mt::MethodTable - fullmatch::Bool end -any_ambig(info::MethodMatchInfo) = info.results.ambig +any_ambig(result::MethodLookupResult) = result.ambig +any_ambig(info::MethodMatchInfo) = any_ambig(info.results) any_ambig(m::MethodMatches) = any_ambig(m.info) +fully_covering(info::MethodMatchInfo) = info.fullmatch +fully_covering(m::MethodMatches) = fully_covering(m.info) +function add_uncovered_edges!(sv::AbsIntState, info::MethodMatchInfo, @nospecialize(atype)) + fully_covering(info) || add_mt_backedge!(sv, info.mt, atype) + nothing +end +add_uncovered_edges!(sv::AbsIntState, matches::MethodMatches, @nospecialize(atype)) = + add_uncovered_edges!(sv, matches.info, atype) struct UnionSplitMethodMatches applicable::Vector{Any} applicable_argtypes::Vector{Vector{Any}} info::UnionSplitInfo valid_worlds::WorldRange - mts::Vector{MethodTable} - fullmatches::Vector{Bool} end -any_ambig(m::UnionSplitMethodMatches) = any(any_ambig, m.info.matches) +any_ambig(info::UnionSplitInfo) = any(any_ambig, info.split) +any_ambig(m::UnionSplitMethodMatches) = any_ambig(m.info) +fully_covering(info::UnionSplitInfo) = all(fully_covering, info.split) +fully_covering(m::UnionSplitMethodMatches) = fully_covering(m.info) +function add_uncovered_edges!(sv::AbsIntState, info::UnionSplitInfo, @nospecialize(atype)) + all(fully_covering, info.split) && return nothing + # add mt backedges with removing duplications + for mt in uncovered_method_tables(info) + add_mt_backedge!(sv, mt, atype) + end +end +add_uncovered_edges!(sv::AbsIntState, matches::UnionSplitMethodMatches, @nospecialize(atype)) = + add_uncovered_edges!(sv, matches.info, atype) +function uncovered_method_tables(info::UnionSplitInfo) + mts = MethodTable[] + for mminfo in info.split + fully_covering(mminfo) && continue + any(mt′::MethodTable->mt′===mminfo.mt, mts) && continue + push!(mts, mminfo.mt) + end + return mts +end function find_method_matches(interp::AbstractInterpreter, argtypes::Vector{Any}, @nospecialize(atype); max_union_splitting::Int = InferenceParams(interp).max_union_splitting, @@ -312,41 +323,28 @@ function find_union_split_method_matches(interp::AbstractInterpreter, argtypes:: applicable = Any[] applicable_argtypes = Vector{Any}[] # arrays like `argtypes`, including constants, for each match valid_worlds = WorldRange() - mts = MethodTable[] - fullmatches = Bool[] for i in 1:length(split_argtypes) arg_n = split_argtypes[i]::Vector{Any} sig_n = argtypes_to_type(arg_n) mt = ccall(:jl_method_table_for, Any, (Any,), sig_n) mt === nothing && return FailedMethodMatch("Could not identify method table for call") mt = mt::MethodTable - matches = findall(sig_n, method_table(interp); limit = max_methods) - if matches === nothing + thismatches = findall(sig_n, method_table(interp); limit = max_methods) + if thismatches === nothing return FailedMethodMatch("For one of the union split cases, too many methods matched") end - push!(infos, 
MethodMatchInfo(matches)) - for m in matches + for m in thismatches push!(applicable, m) push!(applicable_argtypes, arg_n) end - valid_worlds = intersect(valid_worlds, matches.valid_worlds) - thisfullmatch = any(match::MethodMatch->match.fully_covers, matches) - found = false - for (i, mt′) in enumerate(mts) - if mt′ === mt - fullmatches[i] &= thisfullmatch - found = true - break - end - end - if !found - push!(mts, mt) - push!(fullmatches, thisfullmatch) - end + valid_worlds = intersect(valid_worlds, thismatches.valid_worlds) + thisfullmatch = any(match::MethodMatch->match.fully_covers, thismatches) + thisinfo = MethodMatchInfo(thismatches, mt, thisfullmatch) + push!(infos, thisinfo) end info = UnionSplitInfo(infos) return UnionSplitMethodMatches( - applicable, applicable_argtypes, info, valid_worlds, mts, fullmatches) + applicable, applicable_argtypes, info, valid_worlds) end function find_simple_method_matches(interp::AbstractInterpreter, @nospecialize(atype), max_methods::Int) @@ -361,10 +359,9 @@ function find_simple_method_matches(interp::AbstractInterpreter, @nospecialize(a # (assume this will always be true, so we don't compute / update valid age in this case) return FailedMethodMatch("Too many methods matched") end - info = MethodMatchInfo(matches) fullmatch = any(match::MethodMatch->match.fully_covers, matches) - return MethodMatches( - matches.matches, info, matches.valid_worlds, mt, fullmatch) + info = MethodMatchInfo(matches, mt, fullmatch) + return MethodMatches(matches.matches, info, matches.valid_worlds) end """ @@ -584,13 +581,7 @@ function add_call_backedges!(interp::AbstractInterpreter, @nospecialize(rettype) end # also need an edge to the method table in case something gets # added that did not intersect with any existing method - if isa(matches, MethodMatches) - matches.fullmatch || add_mt_backedge!(sv, matches.mt, atype) - else - for (thisfullmatch, mt) in zip(matches.fullmatches, matches.mts) - thisfullmatch || add_mt_backedge!(sv, mt, atype) - end - end + add_uncovered_edges!(sv, matches, atype) return nothing end @@ -603,9 +594,9 @@ function abstract_call_method(interp::AbstractInterpreter, hardlimit::Bool, si::StmtInfo, sv::AbsIntState) sigtuple = unwrap_unionall(sig) sigtuple isa DataType || - return MethodCallResult(Any, Any, false, false, nothing, Effects()) + return Future(MethodCallResult(Any, Any, false, false, nothing, Effects())) all(@nospecialize(x) -> valid_as_lattice(unwrapva(x), true), sigtuple.parameters) || - return MethodCallResult(Union{}, Any, false, false, nothing, EFFECTS_THROWS) # catch bad type intersections early + return Future(MethodCallResult(Union{}, Any, false, false, nothing, EFFECTS_THROWS)) # catch bad type intersections early if is_nospecializeinfer(method) sig = get_nospecializeinfer_sig(method, sig, sparams) @@ -630,7 +621,7 @@ function abstract_call_method(interp::AbstractInterpreter, # we have a self-cycle in the call-graph, but not in the inference graph (typically): # break this edge now (before we record it) by returning early # (non-typically, this means that we lose the ability to detect a guaranteed StackOverflow in some cases) - return MethodCallResult(Any, Any, true, true, nothing, Effects()) + return Future(MethodCallResult(Any, Any, true, true, nothing, Effects())) end topmost = nothing edgecycle = true @@ -685,7 +676,7 @@ function abstract_call_method(interp::AbstractInterpreter, # since it's very unlikely that we'll try to inline this, # or want make an invoke edge to its calling convention return type. 
# (non-typically, this means that we lose the ability to detect a guaranteed StackOverflow in some cases) - return MethodCallResult(Any, Any, true, true, nothing, Effects()) + return Future(MethodCallResult(Any, Any, true, true, nothing, Effects())) end add_remark!(interp, sv, washardlimit ? RECURSION_MSG_HARDLIMIT : RECURSION_MSG) # TODO (#48913) implement a proper recursion handling for irinterp: @@ -741,31 +732,7 @@ function abstract_call_method(interp::AbstractInterpreter, sparams = recomputed[2]::SimpleVector end - (; rt, exct, edge, effects, volatile_inf_result) = typeinf_edge(interp, method, sig, sparams, sv) - - if edge === nothing - edgecycle = edgelimited = true - end - - # we look for the termination effect override here as well, since the :terminates effect - # may have been tainted due to recursion at this point even if it's overridden - if is_effect_overridden(sv, :terminates_globally) - # this frame is known to terminate - effects = Effects(effects, terminates=true) - elseif is_effect_overridden(method, :terminates_globally) - # this edge is known to terminate - effects = Effects(effects; terminates=true) - elseif edgecycle - # Some sort of recursion was detected. - if edge !== nothing && !edgelimited && !is_edge_recursed(edge, sv) - # no `MethodInstance` cycles -- don't taint :terminate - else - # we cannot guarantee that the call will terminate - effects = Effects(effects; terminates=false) - end - end - - return MethodCallResult(rt, exct, edgecycle, edgelimited, edge, effects, volatile_inf_result) + return typeinf_edge(interp, method, sig, sparams, sv, edgecycle, edgelimited) end function edge_matches_sv(interp::AbstractInterpreter, frame::AbsIntState, @@ -1315,7 +1282,7 @@ function semi_concrete_eval_call(interp::AbstractInterpreter, effects = Effects(effects; noub=ALWAYS_TRUE) end exct = refine_exception_type(result.exct, effects) - return ConstCallResults(rt, exct, SemiConcreteResult(mi, ir, effects), effects, mi) + return ConstCallResults(rt, exct, SemiConcreteResult(mi, ir, effects, spec_info(irsv)), effects, mi) end end end @@ -1327,7 +1294,7 @@ const_prop_result(inf_result::InferenceResult) = inf_result.ipo_effects, inf_result.linfo) # return cached result of constant analysis -return_cached_result(::AbstractInterpreter, inf_result::InferenceResult, ::AbsIntState) = +return_localcache_result(::AbstractInterpreter, inf_result::InferenceResult, ::AbsIntState) = const_prop_result(inf_result) function compute_forwarded_argtypes(interp::AbstractInterpreter, arginfo::ArgInfo, sv::AbsIntState) @@ -1357,7 +1324,7 @@ function const_prop_call(interp::AbstractInterpreter, return nothing end @assert inf_result.linfo === mi "MethodInstance for cached inference result does not match" - return return_cached_result(interp, inf_result, sv) + return return_localcache_result(interp, inf_result, sv) end overridden_by_const = falses(length(argtypes)) for i = 1:length(argtypes) @@ -1371,7 +1338,7 @@ function const_prop_call(interp::AbstractInterpreter, end # perform fresh constant prop' inf_result = InferenceResult(mi, argtypes, overridden_by_const) - frame = InferenceState(inf_result, #=cache_mode=#:local, interp) + frame = InferenceState(inf_result, #=cache_mode=#:local, interp) # TODO: this should also be converted to a stackless Future if frame === nothing add_remark!(interp, sv, "[constprop] Could not retrieve the source") return nothing # this is probably a bad generated function (unsound), but just ignore it @@ -1513,9 +1480,9 @@ function 
precise_container_type(interp::AbstractInterpreter, @nospecialize(itft) widet = typ.typ if isa(widet, DataType) if widet.name === Tuple.name - return AbstractIterationResult(typ.fields, nothing) + return Future(AbstractIterationResult(typ.fields, nothing)) elseif widet.name === _NAMEDTUPLE_NAME - return AbstractIterationResult(typ.fields, nothing) + return Future(AbstractIterationResult(typ.fields, nothing)) end end end @@ -1523,7 +1490,7 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft) if isa(typ, Const) val = typ.val if isa(val, SimpleVector) || isa(val, Tuple) || isa(val, NamedTuple) - return AbstractIterationResult(Any[ Const(val[i]) for i in 1:length(val) ], nothing) # avoid making a tuple Generator here! + return Future(AbstractIterationResult(Any[ Const(val[i]) for i in 1:length(val) ], nothing)) # avoid making a tuple Generator here! end end @@ -1540,18 +1507,18 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft) # refine the Union to remove elements that are not valid tags for objects filter!(@nospecialize(x) -> valid_as_lattice(x, true), utis) if length(utis) == 0 - return AbstractIterationResult(Any[], nothing) # oops, this statement was actually unreachable + return Future(AbstractIterationResult(Any[], nothing)) # oops, this statement was actually unreachable elseif length(utis) == 1 tti = utis[1] tti0 = rewrap_unionall(tti, tti0) else if any(@nospecialize(t) -> !isa(t, DataType) || !(t <: Tuple) || !isknownlength(t), utis) - return AbstractIterationResult(Any[Vararg{Any}], nothing, Effects()) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing, Effects())) end ltp = length((utis[1]::DataType).parameters) for t in utis if length((t::DataType).parameters) != ltp - return AbstractIterationResult(Any[Vararg{Any}], nothing) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing)) end end result = Any[ Union{} for _ in 1:ltp ] @@ -1562,14 +1529,14 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft) result[j] = tmerge(result[j], rewrap_unionall(tps[j], tti0)) end end - return AbstractIterationResult(result, nothing) + return Future(AbstractIterationResult(result, nothing)) end end if tti0 <: Tuple if isa(tti0, DataType) - return AbstractIterationResult(Any[ p for p in tti0.parameters ], nothing) + return Future(AbstractIterationResult(Any[ p for p in tti0.parameters ], nothing)) elseif !isa(tti, DataType) - return AbstractIterationResult(Any[Vararg{Any}], nothing) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing)) else len = length(tti.parameters) last = tti.parameters[len] @@ -1582,17 +1549,17 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft) elts[len] = Vararg{elts[len]} end end - return AbstractIterationResult(elts, nothing) + return Future(AbstractIterationResult(elts, nothing)) end elseif tti0 === SimpleVector - return AbstractIterationResult(Any[Vararg{Any}], nothing) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing)) elseif tti0 === Any - return AbstractIterationResult(Any[Vararg{Any}], nothing, Effects()) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing, Effects())) elseif tti0 <: Array || tti0 <: GenericMemory if eltype(tti0) === Union{} - return AbstractIterationResult(Any[], nothing) + return Future(AbstractIterationResult(Any[], nothing)) end - return AbstractIterationResult(Any[Vararg{eltype(tti0)}], nothing) + return 
Future(AbstractIterationResult(Any[Vararg{eltype(tti0)}], nothing)) else return abstract_iteration(interp, itft, typ, sv) end @@ -1603,95 +1570,144 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n if isa(itft, Const) iteratef = itft.val else - return AbstractIterationResult(Any[Vararg{Any}], nothing, Effects()) + return Future(AbstractIterationResult(Any[Vararg{Any}], nothing, Effects())) end @assert !isvarargtype(itertype) - call = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[itft, itertype]), StmtInfo(true), sv) - stateordonet = call.rt - info = call.info - # Return Bottom if this is not an iterator. - # WARNING: Changes to the iteration protocol must be reflected here, - # this is not just an optimization. - # TODO: this doesn't realize that Array, GenericMemory, SimpleVector, Tuple, and NamedTuple do not use the iterate protocol - stateordonet === Bottom && return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(CallMeta[CallMeta(Bottom, Any, call.effects, info)], true)) - valtype = statetype = Bottom - ret = Any[] - calls = CallMeta[call] - stateordonet_widened = widenconst(stateordonet) - 𝕃ᵢ = typeinf_lattice(interp) - # Try to unroll the iteration up to max_tuple_splat, which covers any finite - # length iterators, or interesting prefix - while true - if stateordonet_widened === Nothing - return AbstractIterationResult(ret, AbstractIterationInfo(calls, true)) - end - if Nothing <: stateordonet_widened || length(ret) >= InferenceParams(interp).max_tuple_splat - break - end - if !isa(stateordonet_widened, DataType) || !(stateordonet_widened <: Tuple) || isvatuple(stateordonet_widened) || length(stateordonet_widened.parameters) != 2 - break - end - nstatetype = getfield_tfunc(𝕃ᵢ, stateordonet, Const(2)) - # If there's no new information in this statetype, don't bother continuing, - # the iterator won't be finite. - if ⊑(𝕃ᵢ, nstatetype, statetype) - return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), EFFECTS_THROWS) - end - valtype = getfield_tfunc(𝕃ᵢ, stateordonet, Const(1)) - push!(ret, valtype) - statetype = nstatetype - call = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv) - stateordonet = call.rt + iterateresult = Future{AbstractIterationResult}() + call1future = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[itft, itertype]), StmtInfo(true), sv)::Future + function inferiterate(interp, sv) + call1 = call1future[] + stateordonet = call1.rt + # Return Bottom if this is not an iterator. + # WARNING: Changes to the iteration protocol must be reflected here, + # this is not just an optimization. 
+ # TODO: this doesn't realize that Array, GenericMemory, SimpleVector, Tuple, and NamedTuple do not use the iterate protocol + if stateordonet === Bottom + iterateresult[] = AbstractIterationResult(Any[Bottom], AbstractIterationInfo(CallMeta[CallMeta(Bottom, Any, call1.effects, call1.info)], true)) + return true + end stateordonet_widened = widenconst(stateordonet) - push!(calls, call) - end - # From here on, we start asking for results on the widened types, rather than - # the precise (potentially const) state type - # statetype and valtype are reinitialized in the first iteration below from the - # (widened) stateordonet, which has not yet been fully analyzed in the loop above - valtype = statetype = Bottom - may_have_terminated = Nothing <: stateordonet_widened - while valtype !== Any - nounion = typeintersect(stateordonet_widened, Tuple{Any,Any}) - if nounion !== Union{} && !isa(nounion, DataType) - # nounion is of a type we cannot handle - valtype = Any - break - end - if nounion === Union{} || (nounion.parameters[1] <: valtype && nounion.parameters[2] <: statetype) - # reached a fixpoint or iterator failed/gave invalid answer - if !hasintersect(stateordonet_widened, Nothing) - # ... but cannot terminate - if !may_have_terminated - # ... and cannot have terminated prior to this loop - return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), Effects()) - else - # iterator may have terminated prior to this loop, but not during it - valtype = Bottom + calls = CallMeta[call1] + valtype = statetype = Bottom + ret = Any[] + 𝕃ᵢ = typeinf_lattice(interp) + may_have_terminated = false + local call2future::Future{CallMeta} + + nextstate::UInt8 = 0x0 + function inferiterate_2arg(interp, sv) + if nextstate === 0x1 + nextstate = 0xff + @goto state1 + elseif nextstate === 0x2 + nextstate = 0xff + @goto state2 + else + @assert nextstate === 0x0 + nextstate = 0xff + end + + # Try to unroll the iteration up to max_tuple_splat, which covers any finite + # length iterators, or interesting prefix + while true + if stateordonet_widened === Nothing + iterateresult[] = AbstractIterationResult(ret, AbstractIterationInfo(calls, true)) + return true + end + if Nothing <: stateordonet_widened || length(ret) >= InferenceParams(interp).max_tuple_splat + break + end + if !isa(stateordonet_widened, DataType) || !(stateordonet_widened <: Tuple) || isvatuple(stateordonet_widened) || length(stateordonet_widened.parameters) != 2 + break + end + nstatetype = getfield_tfunc(𝕃ᵢ, stateordonet, Const(2)) + # If there's no new information in this statetype, don't bother continuing, + # the iterator won't be finite. 
+ if ⊑(𝕃ᵢ, nstatetype, statetype) + iterateresult[] = AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), EFFECTS_THROWS) + return true + end + valtype = getfield_tfunc(𝕃ᵢ, stateordonet, Const(1)) + push!(ret, valtype) + statetype = nstatetype + call2future = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv)::Future + if !isready(call2future) + nextstate = 0x1 + return false + @label state1 + end + let call = call2future[] + push!(calls, call) + stateordonet = call.rt + stateordonet_widened = widenconst(stateordonet) end end - break - end - valtype = tmerge(valtype, nounion.parameters[1]) - statetype = tmerge(statetype, nounion.parameters[2]) - call = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv) - push!(calls, call) - stateordonet = call.rt - stateordonet_widened = widenconst(stateordonet) - end - if valtype !== Union{} - push!(ret, Vararg{valtype}) + # From here on, we start asking for results on the widened types, rather than + # the precise (potentially const) state type + # statetype and valtype are reinitialized in the first iteration below from the + # (widened) stateordonet, which has not yet been fully analyzed in the loop above + valtype = statetype = Bottom + may_have_terminated = Nothing <: stateordonet_widened + while valtype !== Any + nounion = typeintersect(stateordonet_widened, Tuple{Any,Any}) + if nounion !== Union{} && !isa(nounion, DataType) + # nounion is of a type we cannot handle + valtype = Any + break + end + if nounion === Union{} || (nounion.parameters[1] <: valtype && nounion.parameters[2] <: statetype) + # reached a fixpoint or iterator failed/gave invalid answer + if !hasintersect(stateordonet_widened, Nothing) + # ... but cannot terminate + if may_have_terminated + # ... and iterator may have terminated prior to this loop, but not during it + valtype = Bottom + else + # ... 
or cannot have terminated prior to this loop + iterateresult[] = AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), Effects()) + return true + end + end + break + end + valtype = tmerge(valtype, nounion.parameters[1]) + statetype = tmerge(statetype, nounion.parameters[2]) + call2future = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv)::Future + if !isready(call2future) + nextstate = 0x2 + return false + @label state2 + end + let call = call2future[] + push!(calls, call) + stateordonet = call.rt + stateordonet_widened = widenconst(stateordonet) + end + end + if valtype !== Union{} + push!(ret, Vararg{valtype}) + end + iterateresult[] = AbstractIterationResult(ret, AbstractIterationInfo(calls, false)) + return true + end # inferiterate_2arg + # continue making progress as much as possible, on iterate(arg, state) + inferiterate_2arg(interp, sv) || push!(sv.tasks, inferiterate_2arg) + return true + end # inferiterate + # continue making progress as soon as possible, on iterate(arg) + if !(isready(call1future) && inferiterate(interp, sv)) + push!(sv.tasks, inferiterate) end - return AbstractIterationResult(ret, AbstractIterationInfo(calls, false)) + return iterateresult end # do apply(af, fargs...), where af is a function value function abstract_apply(interp::AbstractInterpreter, argtypes::Vector{Any}, si::StmtInfo, sv::AbsIntState, max_methods::Int=get_max_methods(interp, sv)) - itft = argtype_by_index(argtypes, 2) + itft = Core.Box(argtype_by_index(argtypes, 2)) aft = argtype_by_index(argtypes, 3) - (itft === Bottom || aft === Bottom) && return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) + (itft.contents === Bottom || aft === Bottom) && return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) aargtypes = argtype_tail(argtypes, 4) aftw = widenconst(aft) if !isa(aft, Const) && !isa(aft, PartialOpaque) && (!isType(aftw) || has_free_typevars(aftw)) @@ -1699,100 +1715,155 @@ function abstract_apply(interp::AbstractInterpreter, argtypes::Vector{Any}, si:: add_remark!(interp, sv, "Core._apply_iterate called on a function of a non-concrete type") # bail now, since it seems unlikely that abstract_call will be able to do any better after splitting # this also ensures we don't call abstract_call_gf_by_type below on an IntrinsicFunction or Builtin - return CallMeta(Any, Any, Effects(), NoCallInfo()) + return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) end end res = Union{} - nargs = length(aargtypes) splitunions = 1 < unionsplitcost(typeinf_lattice(interp), aargtypes) <= InferenceParams(interp).max_apply_union_enum - ctypes = [Any[aft]] - infos = Vector{MaybeAbstractIterationInfo}[MaybeAbstractIterationInfo[]] - effects = EFFECTS_TOTAL - for i = 1:nargs - ctypes´ = Vector{Any}[] - infos′ = Vector{MaybeAbstractIterationInfo}[] - for ti in (splitunions ? uniontypes(aargtypes[i]) : Any[aargtypes[i]]) - if !isvarargtype(ti) - (;cti, info, ai_effects) = precise_container_type(interp, itft, ti, sv) - else - (;cti, info, ai_effects) = precise_container_type(interp, itft, unwrapva(ti), sv) - # We can't represent a repeating sequence of the same types, - # so tmerge everything together to get one type that represents - # everything. 
- argt = cti[end] - if isvarargtype(argt) - argt = unwrapva(argt) + ctypes::Vector{Vector{Any}} = [Any[aft]] + infos::Vector{Vector{MaybeAbstractIterationInfo}} = Vector{MaybeAbstractIterationInfo}[MaybeAbstractIterationInfo[]] + all_effects::Effects = EFFECTS_TOTAL + retinfos = ApplyCallInfo[] + retinfo = UnionSplitApplyCallInfo(retinfos) + exctype = Union{} + ctypes´ = Vector{Any}[] + infos´ = Vector{MaybeAbstractIterationInfo}[] + local ti, argtypesi + local ctfuture::Future{AbstractIterationResult} + local callfuture::Future{CallMeta} + + applyresult = Future{CallMeta}() + # split the rest into a resumable state machine + i::Int = 1 + j::Int = 1 + nextstate::UInt8 = 0x0 + function infercalls(interp, sv) + # n.b. Remember that variables will lose their values across restarts, + # so be sure to manually hoist any values that must be preserved and do + # not rely on program order. + # This is a little more complex than the closure continuations often used elsewhere, but avoids needing to manage all of that indentation + if nextstate === 0x1 + nextstate = 0xff + @goto state1 + elseif nextstate === 0x2 + nextstate = 0xff + @goto state2 + elseif nextstate === 0x3 + nextstate = 0xff + @goto state3 + else + @assert nextstate === 0x0 + nextstate = 0xff + end + while i <= length(aargtypes) + argtypesi = (splitunions ? uniontypes(aargtypes[i]) : Any[aargtypes[i]]) + i += 1 + j = 1 + while j <= length(argtypesi) + ti = argtypesi[j] + j += 1 + if !isvarargtype(ti) + ctfuture = precise_container_type(interp, itft.contents, ti, sv)::Future + if !isready(ctfuture) + nextstate = 0x1 + return false + @label state1 + end + (;cti, info, ai_effects) = ctfuture[] + else + ctfuture = precise_container_type(interp, itft.contents, unwrapva(ti), sv)::Future + if !isready(ctfuture) + nextstate = 0x2 + return false + @label state2 + end + (;cti, info, ai_effects) = ctfuture[] + # We can't represent a repeating sequence of the same types, + # so tmerge everything together to get one type that represents + # everything. 
+ argt = cti[end] + if isvarargtype(argt) + argt = unwrapva(argt) + end + for k in 1:(length(cti)-1) + argt = tmerge(argt, cti[k]) + end + cti = Any[Vararg{argt}] + end + all_effects = merge_effects(all_effects, ai_effects) + if info !== nothing + for call in info.each + all_effects = merge_effects(all_effects, call.effects) + end end - for i in 1:(length(cti)-1) - argt = tmerge(argt, cti[i]) + if any(@nospecialize(t) -> t === Bottom, cti) + continue + end + for k = 1:length(ctypes) + ct = ctypes[k] + if isvarargtype(ct[end]) + # This is vararg, we're not gonna be able to do any inlining, + # drop the info + info = nothing + tail = tuple_tail_elem(typeinf_lattice(interp), unwrapva(ct[end]), cti) + push!(ctypes´, push!(ct[1:(end - 1)], tail)) + else + push!(ctypes´, append!(ct[:], cti)) + end + push!(infos´, push!(copy(infos[k]), info)) end - cti = Any[Vararg{argt}] end - effects = merge_effects(effects, ai_effects) - if info !== nothing - for call in info.each - effects = merge_effects(effects, call.effects) + # swap for the new array and empty the temporary one + ctypes´, ctypes = ctypes, ctypes´ + infos´, infos = infos, infos´ + empty!(ctypes´) + empty!(infos´) + end + all_effects.nothrow || (exctype = Any) + + i = 1 + while i <= length(ctypes) + ct = ctypes[i] + lct = length(ct) + # truncate argument list at the first Vararg + for k = 1:lct-1 + cti = ct[k] + if isvarargtype(cti) + ct[k] = tuple_tail_elem(typeinf_lattice(interp), unwrapva(cti), ct[(k+1):lct]) + resize!(ct, k) + break end end - if any(@nospecialize(t) -> t === Bottom, cti) - continue + callfuture = abstract_call(interp, ArgInfo(nothing, ct), si, sv, max_methods)::Future + if !isready(callfuture) + nextstate = 0x3 + return false + @label state3 end - for j = 1:length(ctypes) - ct = ctypes[j]::Vector{Any} - if isvarargtype(ct[end]) - # This is vararg, we're not gonna be able to do any inlining, - # drop the info - info = nothing - tail = tuple_tail_elem(typeinf_lattice(interp), unwrapva(ct[end]), cti) - push!(ctypes´, push!(ct[1:(end - 1)], tail)) - else - push!(ctypes´, append!(ct[:], cti)) + let (; info, rt, exct, effects) = callfuture[] + push!(retinfos, ApplyCallInfo(info, infos[i])) + res = tmerge(typeinf_lattice(interp), res, rt) + exctype = tmerge(typeinf_lattice(interp), exctype, exct) + all_effects = merge_effects(all_effects, effects) + if i < length(ctypes) && bail_out_apply(interp, InferenceLoopState(ctypes[i], res, all_effects), sv) + add_remark!(interp, sv, "_apply_iterate inference reached maximally imprecise information. Bailing on.") + # there is unanalyzed candidate, widen type and effects to the top + let retinfo = NoCallInfo() # NOTE this is necessary to prevent the inlining processing + applyresult[] = CallMeta(Any, Any, Effects(), retinfo) + return true + end end - push!(infos′, push!(copy(infos[j]), info)) end + i += 1 end - ctypes = ctypes´ - infos = infos′ - end - retinfos = ApplyCallInfo[] - retinfo = UnionSplitApplyCallInfo(retinfos) - napplicable = length(ctypes) - seen = 0 - exct = effects.nothrow ? 
Union{} : Any - for i = 1:napplicable - ct = ctypes[i] - arginfo = infos[i] - lct = length(ct) - # truncate argument list at the first Vararg - for i = 1:lct-1 - cti = ct[i] - if isvarargtype(cti) - ct[i] = tuple_tail_elem(typeinf_lattice(interp), unwrapva(cti), ct[(i+1):lct]) - resize!(ct, i) - break - end - end - call = abstract_call(interp, ArgInfo(nothing, ct), si, sv, max_methods) - seen += 1 - push!(retinfos, ApplyCallInfo(call.info, arginfo)) - res = tmerge(typeinf_lattice(interp), res, call.rt) - exct = tmerge(typeinf_lattice(interp), exct, call.exct) - effects = merge_effects(effects, call.effects) - if bail_out_apply(interp, InferenceLoopState(ct, res, effects), sv) - add_remark!(interp, sv, "_apply_iterate inference reached maximally imprecise information. Bailing on.") - break - end - end - if seen ≠ napplicable - # there is unanalyzed candidate, widen type and effects to the top - res = Any - exct = Any - effects = Effects() - retinfo = NoCallInfo() # NOTE this is necessary to prevent the inlining processing + # TODO: Add a special info type to capture all the iteration info. + # For now, only propagate info if we don't also union-split the iteration + applyresult[] = CallMeta(res, exctype, all_effects, retinfo) + return true end - # TODO: Add a special info type to capture all the iteration info. - # For now, only propagate info if we don't also union-split the iteration - return CallMeta(res, exct, effects, retinfo) + # start making progress on the first call + infercalls(interp, sv) || push!(sv.tasks, infercalls) + return applyresult end function argtype_by_index(argtypes::Vector{Any}, i::Int) @@ -2131,57 +2202,69 @@ function abstract_invoke(interp::AbstractInterpreter, arginfo::ArgInfo, si::Stmt argtypes = arginfo.argtypes ft′ = argtype_by_index(argtypes, 2) ft = widenconst(ft′) - ft === Bottom && return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) + ft === Bottom && return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) (types, isexact, isconcrete, istype) = instanceof_tfunc(argtype_by_index(argtypes, 3), false) - isexact || return CallMeta(Any, Any, Effects(), NoCallInfo()) + isexact || return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) unwrapped = unwrap_unionall(types) - if types === Bottom || !(unwrapped isa DataType) || unwrapped.name !== Tuple.name - return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) + types === Bottom && return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) + if !(unwrapped isa DataType && unwrapped.name === Tuple.name) + return Future(CallMeta(Bottom, TypeError, EFFECTS_THROWS, NoCallInfo())) end argtype = argtypes_to_type(argtype_tail(argtypes, 4)) nargtype = typeintersect(types, argtype) - nargtype === Bottom && return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) - nargtype isa DataType || return CallMeta(Any, Any, Effects(), NoCallInfo()) # other cases are not implemented below - isdispatchelem(ft) || return CallMeta(Any, Any, Effects(), NoCallInfo()) # check that we might not have a subtype of `ft` at runtime, before doing supertype lookup below + nargtype === Bottom && return Future(CallMeta(Bottom, TypeError, EFFECTS_THROWS, NoCallInfo())) + nargtype isa DataType || return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) # other cases are not implemented below + isdispatchelem(ft) || return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) # check that we might not have a subtype of `ft` at runtime, before doing supertype lookup below ft = ft::DataType lookupsig = 
rewrap_unionall(Tuple{ft, unwrapped.parameters...}, types)::Type nargtype = Tuple{ft, nargtype.parameters...} argtype = Tuple{ft, argtype.parameters...} match, valid_worlds = findsup(lookupsig, method_table(interp)) - match === nothing && return CallMeta(Any, Any, Effects(), NoCallInfo()) + match === nothing && return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) update_valid_age!(sv, valid_worlds) method = match.method tienv = ccall(:jl_type_intersection_with_env, Any, (Any, Any), nargtype, method.sig)::SimpleVector - ti = tienv[1]; env = tienv[2]::SimpleVector - result = abstract_call_method(interp, method, ti, env, false, si, sv) - (; rt, edge, effects, volatile_inf_result) = result + ti = tienv[1] + env = tienv[2]::SimpleVector + mresult = abstract_call_method(interp, method, ti, env, false, si, sv)::Future match = MethodMatch(ti, env, method, argtype <: method.sig) - res = nothing - sig = match.spec_types - argtypes′ = invoke_rewrite(argtypes) - fargs = arginfo.fargs - fargs′ = fargs === nothing ? nothing : invoke_rewrite(fargs) - arginfo = ArgInfo(fargs′, argtypes′) - # # typeintersect might have narrowed signature, but the accuracy gain doesn't seem worth the cost involved with the lattice comparisons - # for i in 1:length(argtypes′) - # t, a = ti.parameters[i], argtypes′[i] - # argtypes′[i] = t ⊑ a ? t : a - # end - 𝕃ₚ = ipo_lattice(interp) - f = singleton_type(ft′) - invokecall = InvokeCall(types, lookupsig) - const_call_result = abstract_call_method_with_const_args(interp, - result, f, arginfo, si, match, sv, invokecall) - const_result = volatile_inf_result - if const_call_result !== nothing - if ⊑(𝕃ₚ, const_call_result.rt, rt) - (; rt, effects, const_result, edge) = const_call_result + return Future{CallMeta}(mresult, interp, sv) do result, interp, sv + (; rt, exct, edge, effects, volatile_inf_result) = result + res = nothing + sig = match.spec_types + argtypes′ = invoke_rewrite(argtypes) + fargs = arginfo.fargs + fargs′ = fargs === nothing ? nothing : invoke_rewrite(fargs) + arginfo = ArgInfo(fargs′, argtypes′) + # # typeintersect might have narrowed signature, but the accuracy gain doesn't seem worth the cost involved with the lattice comparisons + # for i in 1:length(argtypes′) + # t, a = ti.parameters[i], argtypes′[i] + # argtypes′[i] = t ⊑ a ? 
t : a + # end + 𝕃ₚ = ipo_lattice(interp) + ⊑, ⋤, ⊔ = partialorder(𝕃ₚ), strictneqpartialorder(𝕃ₚ), join(𝕃ₚ) + f = singleton_type(ft′) + invokecall = InvokeCall(types, lookupsig) + const_call_result = abstract_call_method_with_const_args(interp, + result, f, arginfo, si, match, sv, invokecall) + const_result = volatile_inf_result + if const_call_result !== nothing + if const_call_result.rt ⊑ rt + (; rt, effects, const_result, edge) = const_call_result + end + if const_call_result.exct ⋤ exct + (; exct, const_result, edge) = const_call_result + end + end + rt = from_interprocedural!(interp, rt, sv, arginfo, sig) + info = InvokeCallInfo(match, const_result) + edge !== nothing && add_invoke_backedge!(sv, lookupsig, edge) + if !match.fully_covers + effects = Effects(effects; nothrow=false) + exct = exct ⊔ TypeError end + return CallMeta(rt, exct, effects, info) end - rt = from_interprocedural!(interp, rt, sv, arginfo, sig) - info = InvokeCallInfo(match, const_result) - edge !== nothing && add_invoke_backedge!(sv, lookupsig, edge) - return CallMeta(rt, Any, effects, info) end function invoke_rewrite(xs::Vector{Any}) @@ -2194,28 +2277,42 @@ end function abstract_finalizer(interp::AbstractInterpreter, argtypes::Vector{Any}, sv::AbsIntState) if length(argtypes) == 3 finalizer_argvec = Any[argtypes[2], argtypes[3]] - call = abstract_call(interp, ArgInfo(nothing, finalizer_argvec), StmtInfo(false), sv, #=max_methods=#1) - return CallMeta(Nothing, Any, Effects(), FinalizerInfo(call.info, call.effects)) + call = abstract_call(interp, ArgInfo(nothing, finalizer_argvec), StmtInfo(false), sv, #=max_methods=#1)::Future + return Future{CallMeta}(call, interp, sv) do call, interp, sv + return CallMeta(Nothing, Any, Effects(), FinalizerInfo(call.info, call.effects)) + end end - return CallMeta(Nothing, Any, Effects(), NoCallInfo()) + return Future(CallMeta(Nothing, Any, Effects(), NoCallInfo())) end function abstract_throw(interp::AbstractInterpreter, argtypes::Vector{Any}, ::AbsIntState) na = length(argtypes) - 𝕃ᵢ = typeinf_lattice(interp) + ⊔ = join(typeinf_lattice(interp)) if na == 2 argtype2 = argtypes[2] if isvarargtype(argtype2) - exct = tmerge(𝕃ᵢ, unwrapva(argtype2), ArgumentError) + exct = unwrapva(argtype2) ⊔ ArgumentError else exct = argtype2 end elseif na == 3 && isvarargtype(argtypes[3]) - exct = tmerge(𝕃ᵢ, argtypes[2], ArgumentError) + exct = argtypes[2] ⊔ ArgumentError else exct = ArgumentError end - return CallMeta(Union{}, exct, EFFECTS_THROWS, NoCallInfo()) + return Future(CallMeta(Union{}, exct, EFFECTS_THROWS, NoCallInfo())) +end + +function abstract_throw_methoderror(interp::AbstractInterpreter, argtypes::Vector{Any}, ::AbsIntState) + exct = if length(argtypes) == 1 + ArgumentError + elseif !isvarargtype(argtypes[2]) + MethodError + else + ⊔ = join(typeinf_lattice(interp)) + MethodError ⊔ ArgumentError + end + return Future(CallMeta(Union{}, exct, EFFECTS_THROWS, NoCallInfo())) end # call where the function is known exactly @@ -2238,6 +2335,8 @@ function abstract_call_known(interp::AbstractInterpreter, @nospecialize(f), return abstract_applicable(interp, argtypes, sv, max_methods) elseif f === throw return abstract_throw(interp, argtypes, sv) + elseif f === Core.throw_methoderror + return abstract_throw_methoderror(interp, argtypes, sv) end rt = abstract_call_builtin(interp, f, arginfo, sv) ft = popfirst!(argtypes) @@ -2258,60 +2357,70 @@ function abstract_call_known(interp::AbstractInterpreter, @nospecialize(f), end end end - return CallMeta(rt, exct, effects, NoCallInfo(), refinements) + 
return Future(CallMeta(rt, exct, effects, NoCallInfo(), refinements)) elseif isa(f, Core.OpaqueClosure) # calling an OpaqueClosure about which we have no information returns no information - return CallMeta(typeof(f).parameters[2], Any, Effects(), NoCallInfo()) + return Future(CallMeta(typeof(f).parameters[2], Any, Effects(), NoCallInfo())) elseif f === TypeVar && !isvarargtype(argtypes[end]) # Manually look through the definition of TypeVar to # make sure to be able to get `PartialTypeVar`s out. - 2 ≤ la ≤ 4 || return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) - n = argtypes[2] - ub_var = Const(Any) - lb_var = Const(Union{}) - if la == 4 - ub_var = argtypes[4] - lb_var = argtypes[3] - elseif la == 3 - ub_var = argtypes[3] - end + 2 ≤ la ≤ 4 || return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) # make sure generic code is prepared for inlining if needed later - call = let T = Any[Type{TypeVar}, Any, Any, Any] + let T = Any[Type{TypeVar}, Any, Any, Any] resize!(T, la) atype = Tuple{T...} T[1] = Const(TypeVar) - abstract_call_gf_by_type(interp, f, ArgInfo(nothing, T), si, atype, sv, max_methods) - end - pT = typevar_tfunc(𝕃ᵢ, n, lb_var, ub_var) - typevar_argtypes = Any[n, lb_var, ub_var] - effects = builtin_effects(𝕃ᵢ, Core._typevar, typevar_argtypes, pT) - if effects.nothrow - exct = Union{} - else - exct = builtin_exct(𝕃ᵢ, Core._typevar, typevar_argtypes, pT) + let call = abstract_call_gf_by_type(interp, f, ArgInfo(nothing, T), si, atype, sv, max_methods)::Future + return Future{CallMeta}(call, interp, sv) do call, interp, sv + n = argtypes[2] + ub_var = Const(Any) + lb_var = Const(Union{}) + if la == 4 + ub_var = argtypes[4] + lb_var = argtypes[3] + elseif la == 3 + ub_var = argtypes[3] + end + pT = typevar_tfunc(𝕃ᵢ, n, lb_var, ub_var) + typevar_argtypes = Any[n, lb_var, ub_var] + effects = builtin_effects(𝕃ᵢ, Core._typevar, typevar_argtypes, pT) + if effects.nothrow + exct = Union{} + else + exct = builtin_exct(𝕃ᵢ, Core._typevar, typevar_argtypes, pT) + end + return CallMeta(pT, exct, effects, call.info) + end + end end - return CallMeta(pT, exct, effects, call.info) elseif f === UnionAll - call = abstract_call_gf_by_type(interp, f, ArgInfo(nothing, Any[Const(UnionAll), Any, Any]), si, Tuple{Type{UnionAll}, Any, Any}, sv, max_methods) - return abstract_call_unionall(interp, argtypes, call) + let call = abstract_call_gf_by_type(interp, f, ArgInfo(nothing, Any[Const(UnionAll), Any, Any]), si, Tuple{Type{UnionAll}, Any, Any}, sv, max_methods)::Future + return Future{CallMeta}(call, interp, sv) do call, interp, sv + return abstract_call_unionall(interp, argtypes, call) + end + end elseif f === Tuple && la == 2 aty = argtypes[2] ty = isvarargtype(aty) ? 
unwrapva(aty) : widenconst(aty) if !isconcretetype(ty) - return CallMeta(Tuple, Any, EFFECTS_UNKNOWN, NoCallInfo()) + return Future(CallMeta(Tuple, Any, EFFECTS_UNKNOWN, NoCallInfo())) end elseif is_return_type(f) return return_type_tfunc(interp, argtypes, si, sv) elseif la == 3 && f === Core.:(!==) # mark !== as exactly a negated call to === - call = abstract_call_gf_by_type(interp, f, ArgInfo(fargs, Any[Const(f), Any, Any]), si, Tuple{typeof(f), Any, Any}, sv, max_methods) - rty = abstract_call_known(interp, (===), arginfo, si, sv, max_methods).rt - if isa(rty, Conditional) - return CallMeta(Conditional(rty.slot, rty.elsetype, rty.thentype), Bottom, EFFECTS_TOTAL, NoCallInfo()) # swap if-else - elseif isa(rty, Const) - return CallMeta(Const(rty.val === false), Bottom, EFFECTS_TOTAL, MethodResultPure()) - end - return call + let callfuture = abstract_call_gf_by_type(interp, f, ArgInfo(fargs, Any[Const(f), Any, Any]), si, Tuple{typeof(f), Any, Any}, sv, max_methods)::Future, + rtfuture = abstract_call_known(interp, (===), arginfo, si, sv, max_methods)::Future + return Future{CallMeta}(isready(callfuture) && isready(rtfuture), interp, sv) do interp, sv + local rty = rtfuture[].rt + if isa(rty, Conditional) + return CallMeta(Conditional(rty.slot, rty.elsetype, rty.thentype), Bottom, EFFECTS_TOTAL, NoCallInfo()) # swap if-else + elseif isa(rty, Const) + return CallMeta(Const(rty.val === false), Bottom, EFFECTS_TOTAL, MethodResultPure()) + end + return callfuture[] + end + end elseif la == 3 && f === Core.:(>:) # mark issupertype as a exact alias for issubtype # swap T1 and T2 arguments and call <: @@ -2323,46 +2432,58 @@ function abstract_call_known(interp::AbstractInterpreter, @nospecialize(f), argtypes = Any[typeof(<:), argtypes[3], argtypes[2]] return abstract_call_known(interp, <:, ArgInfo(fargs, argtypes), si, sv, max_methods) elseif la == 2 && f === Core.typename - return CallMeta(typename_static(argtypes[2]), Bottom, EFFECTS_TOTAL, MethodResultPure()) + return Future(CallMeta(typename_static(argtypes[2]), Bottom, EFFECTS_TOTAL, MethodResultPure())) elseif f === Core._hasmethod - return _hasmethod_tfunc(interp, argtypes, sv) + return Future(_hasmethod_tfunc(interp, argtypes, sv)) end atype = argtypes_to_type(argtypes) - return abstract_call_gf_by_type(interp, f, arginfo, si, atype, sv, max_methods) + return abstract_call_gf_by_type(interp, f, arginfo, si, atype, sv, max_methods)::Future end function abstract_call_opaque_closure(interp::AbstractInterpreter, closure::PartialOpaque, arginfo::ArgInfo, si::StmtInfo, sv::AbsIntState, check::Bool=true) sig = argtypes_to_type(arginfo.argtypes) - result = abstract_call_method(interp, closure.source::Method, sig, Core.svec(), false, si, sv) - (; rt, edge, effects, volatile_inf_result) = result tt = closure.typ - sigT = (unwrap_unionall(tt)::DataType).parameters[1] - match = MethodMatch(sig, Core.svec(), closure.source, sig <: rewrap_unionall(sigT, tt)) - 𝕃ₚ = ipo_lattice(interp) - ⊑ₚ = ⊑(𝕃ₚ) - const_result = volatile_inf_result - if !result.edgecycle - const_call_result = abstract_call_method_with_const_args(interp, result, - nothing, arginfo, si, match, sv) - if const_call_result !== nothing - if const_call_result.rt ⊑ₚ rt - (; rt, effects, const_result, edge) = const_call_result + ocargsig = rewrap_unionall((unwrap_unionall(tt)::DataType).parameters[1], tt) + ocargsig′ = unwrap_unionall(ocargsig) + ocargsig′ isa DataType || return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) + ocsig = rewrap_unionall(Tuple{Tuple, 
ocargsig′.parameters...}, ocargsig) + hasintersect(sig, ocsig) || return Future(CallMeta(Union{}, Union{MethodError,TypeError}, EFFECTS_THROWS, NoCallInfo())) + ocmethod = closure.source::Method + match = MethodMatch(sig, Core.svec(), ocmethod, sig <: ocsig) + mresult = abstract_call_method(interp, ocmethod, sig, Core.svec(), false, si, sv) + ocsig_box = Core.Box(ocsig) + return Future{CallMeta}(mresult, interp, sv) do result, interp, sv + (; rt, exct, edge, effects, volatile_inf_result, edgecycle) = result + 𝕃ₚ = ipo_lattice(interp) + ⊑, ⋤, ⊔ = partialorder(𝕃ₚ), strictneqpartialorder(𝕃ₚ), join(𝕃ₚ) + const_result = volatile_inf_result + if !edgecycle + const_call_result = abstract_call_method_with_const_args(interp, result, + nothing, arginfo, si, match, sv) + if const_call_result !== nothing + if const_call_result.rt ⊑ rt + (; rt, effects, const_result, edge) = const_call_result + end + if const_call_result.exct ⋤ exct + (; exct, const_result, edge) = const_call_result + end end end - end - if check # analyze implicit type asserts on argument and return type - ftt = closure.typ - (aty, rty) = (unwrap_unionall(ftt)::DataType).parameters - rty = rewrap_unionall(rty isa TypeVar ? rty.lb : rty, ftt) - if !(rt ⊑ₚ rty && tuple_tfunc(𝕃ₚ, arginfo.argtypes[2:end]) ⊑ₚ rewrap_unionall(aty, ftt)) - effects = Effects(effects; nothrow=false) + if check # analyze implicit type asserts on argument and return type + ftt = closure.typ + rty = (unwrap_unionall(ftt)::DataType).parameters[2] + rty = rewrap_unionall(rty isa TypeVar ? rty.ub : rty, ftt) + if !(rt ⊑ rty && sig ⊑ ocsig_box.contents) + effects = Effects(effects; nothrow=false) + exct = exct ⊔ TypeError + end end + rt = from_interprocedural!(interp, rt, sv, arginfo, match.spec_types) + info = OpaqueClosureCallInfo(match, const_result) + edge !== nothing && add_backedge!(sv, edge) + return CallMeta(rt, exct, effects, info) end - rt = from_interprocedural!(interp, rt, sv, arginfo, match.spec_types) - info = OpaqueClosureCallInfo(match, const_result) - edge !== nothing && add_backedge!(sv, edge) - return CallMeta(rt, Any, effects, info) end function most_general_argtypes(closure::PartialOpaque) @@ -2387,17 +2508,17 @@ function abstract_call_unknown(interp::AbstractInterpreter, @nospecialize(ft), wft = widenconst(ft) if hasintersect(wft, Builtin) add_remark!(interp, sv, "Could not identify method table for call") - return CallMeta(Any, Any, Effects(), NoCallInfo()) + return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) elseif hasintersect(wft, Core.OpaqueClosure) uft = unwrap_unionall(wft) if isa(uft, DataType) - return CallMeta(rewrap_unionall(uft.parameters[2], wft), Any, Effects(), NoCallInfo()) + return Future(CallMeta(rewrap_unionall(uft.parameters[2], wft), Any, Effects(), NoCallInfo())) end - return CallMeta(Any, Any, Effects(), NoCallInfo()) + return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) end # non-constant function, but the number of arguments is known and the `f` is not a builtin or intrinsic atype = argtypes_to_type(arginfo.argtypes) - return abstract_call_gf_by_type(interp, nothing, arginfo, si, atype, sv, max_methods) + return abstract_call_gf_by_type(interp, nothing, arginfo, si, atype, sv, max_methods)::Future end # call where the function is any lattice element @@ -2468,7 +2589,7 @@ function abstract_eval_cfunction(interp::AbstractInterpreter, e::Expr, vtypes::U # this may be the wrong world for the call, # but some of the result is likely to be valid anyways # and that may help generate better codegen - 
abstract_call(interp, ArgInfo(nothing, at), StmtInfo(false), sv) + abstract_call(interp, ArgInfo(nothing, at), StmtInfo(false), sv)::Future rt = e.args[1] isa(rt, Type) || (rt = Any) return RTEffects(rt, Any, EFFECTS_UNKNOWN) @@ -2509,6 +2630,7 @@ function abstract_eval_value_expr(interp::AbstractInterpreter, e::Expr, sv::AbsI # TODO: We still have non-linearized cglobal @assert e.args[1] === Core.tuple || e.args[1] === GlobalRef(Core, :tuple) else + @assert e.head !== :(=) # Some of our tests expect us to handle invalid IR here and error later # - permit that for now. # @assert false "Unexpected EXPR head in value position" @@ -2557,8 +2679,13 @@ function abstract_call(interp::AbstractInterpreter, arginfo::ArgInfo, sv::Infere add_curr_ssaflag!(sv, IR_FLAG_UNUSED) end si = StmtInfo(!unused) - call = abstract_call(interp, arginfo, si, sv) - sv.stmt_info[sv.currpc] = call.info + call = abstract_call(interp, arginfo, si, sv)::Future + Future{Nothing}(call, interp, sv) do call, interp, sv + # this only is needed for the side-effect, sequenced before any task tries to consume the return value, + # which this will do even without returning this Future + sv.stmt_info[sv.currpc] = call.info + nothing + end return call end @@ -2567,11 +2694,14 @@ function abstract_eval_call(interp::AbstractInterpreter, e::Expr, vtypes::Union{ ea = e.args argtypes = collect_argtypes(interp, ea, vtypes, sv) if argtypes === nothing - return RTEffects(Bottom, Any, Effects()) + return Future(RTEffects(Bottom, Any, Effects())) end arginfo = ArgInfo(ea, argtypes) - (; rt, exct, effects, refinements) = abstract_call(interp, arginfo, sv) - return RTEffects(rt, exct, effects, refinements) + call = abstract_call(interp, arginfo, sv)::Future + return Future{RTEffects}(call, interp, sv) do call, interp, sv + (; rt, exct, effects, refinements) = call + return RTEffects(rt, exct, effects, refinements) + end end function abstract_eval_new(interp::AbstractInterpreter, e::Expr, vtypes::Union{VarTable,Nothing}, @@ -2701,12 +2831,15 @@ function abstract_eval_new_opaque_closure(interp::AbstractInterpreter, e::Expr, argtypes = most_general_argtypes(rt) pushfirst!(argtypes, rt.env) callinfo = abstract_call_opaque_closure(interp, rt, - ArgInfo(nothing, argtypes), StmtInfo(true), sv, #=check=#false) - sv.stmt_info[sv.currpc] = OpaqueClosureCreateInfo(callinfo) + ArgInfo(nothing, argtypes), StmtInfo(true), sv, #=check=#false)::Future + Future{Nothing}(callinfo, interp, sv) do callinfo, interp, sv + sv.stmt_info[sv.currpc] = OpaqueClosureCreateInfo(callinfo) + nothing + end end end end - return RTEffects(rt, Any, effects) + return Future(RTEffects(rt, Any, effects)) end function abstract_eval_copyast(interp::AbstractInterpreter, e::Expr, vtypes::Union{VarTable,Nothing}, @@ -2802,7 +2935,7 @@ function abstract_eval_static_parameter(::AbstractInterpreter, e::Expr, sv::AbsI end function abstract_eval_statement_expr(interp::AbstractInterpreter, e::Expr, vtypes::Union{VarTable,Nothing}, - sv::AbsIntState) + sv::AbsIntState)::Future{RTEffects} ehead = e.head if ehead === :call return abstract_eval_call(interp, e, vtypes, sv) @@ -2900,43 +3033,7 @@ function stmt_taints_inbounds_consistency(sv::AbsIntState) return has_curr_ssaflag(sv, IR_FLAG_INBOUNDS) end -function abstract_eval_statement(interp::AbstractInterpreter, @nospecialize(e), vtypes::VarTable, sv::InferenceState) - if !isa(e, Expr) - if isa(e, PhiNode) - add_curr_ssaflag!(sv, IR_FLAGS_REMOVABLE) - # Implement convergence for PhiNodes. 
In particular, PhiNodes need to tmerge over - # the incoming values from all iterations, but `abstract_eval_phi` will only tmerge - # over the first and last iterations. By tmerging in the current old_rt, we ensure that - # we will not lose an intermediate value. - rt = abstract_eval_phi(interp, e, vtypes, sv) - old_rt = sv.ssavaluetypes[sv.currpc] - rt = old_rt === NOT_FOUND ? rt : tmerge(typeinf_lattice(interp), old_rt, rt) - return RTEffects(rt, Union{}, EFFECTS_TOTAL) - end - (; rt, exct, effects, refinements) = abstract_eval_special_value(interp, e, vtypes, sv) - else - (; rt, exct, effects, refinements) = abstract_eval_statement_expr(interp, e, vtypes, sv) - if effects.noub === NOUB_IF_NOINBOUNDS - if has_curr_ssaflag(sv, IR_FLAG_INBOUNDS) - effects = Effects(effects; noub=ALWAYS_FALSE) - elseif !propagate_inbounds(sv) - # The callee read our inbounds flag, but unless we propagate inbounds, - # we ourselves don't read our parent's inbounds. - effects = Effects(effects; noub=ALWAYS_TRUE) - end - end - e = e::Expr - @assert !isa(rt, TypeVar) "unhandled TypeVar" - rt = maybe_singleton_const(rt) - if !isempty(sv.pclimitations) - if rt isa Const || rt === Union{} - empty!(sv.pclimitations) - else - rt = LimitedAccuracy(rt, sv.pclimitations) - sv.pclimitations = IdSet{InferenceState}() - end - end - end +function merge_override_effects!(interp::AbstractInterpreter, effects::Effects, sv::InferenceState) # N.B.: This only applies to the effects of the statement itself. # It is possible for arguments (GlobalRef/:static_parameter) to throw, # but these will be recomputed during SSA construction later. @@ -2944,8 +3041,11 @@ function abstract_eval_statement(interp::AbstractInterpreter, @nospecialize(e), effects = override_effects(effects, override) set_curr_ssaflag!(sv, flags_for_effects(effects), IR_FLAGS_EFFECTS) merge_effects!(interp, sv, effects) + return effects +end - return RTEffects(rt, exct, effects, refinements) +function abstract_eval_statement(interp::AbstractInterpreter, @nospecialize(e), vtypes::VarTable, sv::InferenceState) + @assert !isa(e, Union{Expr, PhiNode, NewvarNode}) end function override_effects(effects::Effects, override::EffectsOverride) @@ -3191,60 +3291,6 @@ function handle_control_backedge!(interp::AbstractInterpreter, frame::InferenceS return nothing end -struct BasicStmtChange - changes::Union{Nothing,StateUpdate} - rt::Any # extended lattice element or `nothing` - `nothing` if this statement may not be used as an SSA Value - exct::Any - # TODO effects::Effects - refinements # ::Union{Nothing,SlotRefinement,Vector{Any}} - function BasicStmtChange(changes::Union{Nothing,StateUpdate}, rt::Any, exct::Any, - refinements=nothing) - @nospecialize rt exct refinements - return new(changes, rt, exct, refinements) - end -end - -@inline function abstract_eval_basic_statement(interp::AbstractInterpreter, - @nospecialize(stmt), pc_vartable::VarTable, frame::InferenceState) - if isa(stmt, NewvarNode) - changes = StateUpdate(stmt.slot, VarState(Bottom, true)) - return BasicStmtChange(changes, nothing, Union{}) - elseif !isa(stmt, Expr) - (; rt, exct) = abstract_eval_statement(interp, stmt, pc_vartable, frame) - return BasicStmtChange(nothing, rt, exct) - end - changes = nothing - hd = stmt.head - if hd === :(=) - (; rt, exct, refinements) = abstract_eval_statement(interp, stmt.args[2], pc_vartable, frame) - if rt === Bottom - return BasicStmtChange(nothing, Bottom, exct, refinements) - end - lhs = stmt.args[1] - if isa(lhs, SlotNumber) - changes = StateUpdate(lhs, VarState(rt, 
false)) - elseif isa(lhs, GlobalRef) - handle_global_assignment!(interp, frame, lhs, rt) - elseif !isa(lhs, SSAValue) - merge_effects!(interp, frame, EFFECTS_UNKNOWN) - end - return BasicStmtChange(changes, rt, exct, refinements) - elseif hd === :method - fname = stmt.args[1] - if isa(fname, SlotNumber) - changes = StateUpdate(fname, VarState(Any, false)) - end - return BasicStmtChange(changes, nothing, Union{}) - elseif (hd === :code_coverage_effect || ( - hd !== :boundscheck && # :boundscheck can be narrowed to Bool - is_meta_expr(stmt))) - return BasicStmtChange(nothing, Nothing, Bottom) - else - (; rt, exct, refinements) = abstract_eval_statement(interp, stmt, pc_vartable, frame) - return BasicStmtChange(nothing, rt, exct, refinements) - end -end - function update_bbstate!(𝕃ᵢ::AbstractLattice, frame::InferenceState, bb::Int, vartable::VarTable) bbtable = frame.bb_vartables[bb] if bbtable === nothing @@ -3344,27 +3390,45 @@ function update_cycle_worklists!(callback, frame::InferenceState) end # make as much progress on `frame` as possible (without handling cycles) -function typeinf_local(interp::AbstractInterpreter, frame::InferenceState) +struct CurrentState + result::Future + currstate::VarTable + bbstart::Int + bbend::Int + CurrentState(result::Future, currstate::VarTable, bbstart::Int, bbend::Int) = new(result, currstate, bbstart, bbend) + CurrentState() = new() +end +function typeinf_local(interp::AbstractInterpreter, frame::InferenceState, nextresult::CurrentState) @assert !is_inferred(frame) W = frame.ip ssavaluetypes = frame.ssavaluetypes bbs = frame.cfg.blocks nbbs = length(bbs) 𝕃ᵢ = typeinf_lattice(interp) - + states = frame.bb_vartables currbb = frame.currbb + currpc = frame.currpc + + if isdefined(nextresult, :result) + # for reasons that are fairly unclear, some state is arbitrarily on the stack instead in the InferenceState as normal + bbstart = nextresult.bbstart + bbend = nextresult.bbend + currstate = nextresult.currstate + @goto injectresult + end + if currbb != 1 currbb = frame.currbb = _bits_findnext(W.bits, 1)::Int # next basic block end - - states = frame.bb_vartables currstate = copy(states[currbb]::VarTable) while currbb <= nbbs delete!(W, currbb) bbstart = first(bbs[currbb].stmts) bbend = last(bbs[currbb].stmts) - for currpc in bbstart:bbend + currpc = bbstart - 1 + while currpc < bbend + currpc += 1 frame.currpc = currpc empty_backedges!(frame, currpc) stmt = frame.src.code[currpc] @@ -3476,14 +3540,14 @@ function typeinf_local(interp::AbstractInterpreter, frame::InferenceState) return caller.ssavaluetypes[caller_pc] !== Any end end - ssavaluetypes[frame.currpc] = Any + ssavaluetypes[currpc] = Any @goto find_next_bb elseif isa(stmt, EnterNode) ssavaluetypes[currpc] = Any add_curr_ssaflag!(frame, IR_FLAG_NOTHROW) if isdefined(stmt, :scope) scopet = abstract_eval_value(interp, stmt.scope, currstate, frame) - handler = gethandler(frame, frame.currpc+1)::TryCatchFrame + handler = gethandler(frame, currpc + 1)::TryCatchFrame @assert handler.scopet !== nothing if !⊑(𝕃ᵢ, scopet, handler.scopet) handler.scopet = tmerge(𝕃ᵢ, scopet, handler.scopet) @@ -3502,8 +3566,91 @@ function typeinf_local(interp::AbstractInterpreter, frame::InferenceState) # Fall through terminator - treat as regular stmt end # Process non control-flow statements - (; changes, rt, exct, refinements) = abstract_eval_basic_statement(interp, - stmt, currstate, frame) + @assert isempty(frame.tasks) + rt = nothing + exct = Bottom + changes = nothing + refinements = nothing + effects = nothing + if 
isa(stmt, NewvarNode) + changes = StateUpdate(stmt.slot, VarState(Bottom, true)) + elseif isa(stmt, PhiNode) + add_curr_ssaflag!(frame, IR_FLAGS_REMOVABLE) + # Implement convergence for PhiNodes. In particular, PhiNodes need to tmerge over + # the incoming values from all iterations, but `abstract_eval_phi` will only tmerge + # over the first and last iterations. By tmerging in the current old_rt, we ensure that + # we will not lose an intermediate value. + rt = abstract_eval_phi(interp, stmt, currstate, frame) + old_rt = frame.ssavaluetypes[currpc] + rt = old_rt === NOT_FOUND ? rt : tmerge(typeinf_lattice(interp), old_rt, rt) + else + lhs = nothing + if isexpr(stmt, :(=)) + lhs = stmt.args[1] + stmt = stmt.args[2] + end + if !isa(stmt, Expr) + (; rt, exct, effects, refinements) = abstract_eval_special_value(interp, stmt, currstate, frame) + else + hd = stmt.head + if hd === :method + fname = stmt.args[1] + if isa(fname, SlotNumber) + changes = StateUpdate(fname, VarState(Any, false)) + end + elseif (hd === :code_coverage_effect || ( + hd !== :boundscheck && # :boundscheck can be narrowed to Bool + is_meta_expr(stmt))) + rt = Nothing + else + result = abstract_eval_statement_expr(interp, stmt, currstate, frame)::Future + if !isready(result) || !isempty(frame.tasks) + return CurrentState(result, currstate, bbstart, bbend) + @label injectresult + # reload local variables + stmt = frame.src.code[currpc] + changes = nothing + lhs = nothing + if isexpr(stmt, :(=)) + lhs = stmt.args[1] + stmt = stmt.args[2] + end + result = nextresult.result::Future{RTEffects} + end + result = result[] + (; rt, exct, effects, refinements) = result + if effects.noub === NOUB_IF_NOINBOUNDS + if has_curr_ssaflag(frame, IR_FLAG_INBOUNDS) + effects = Effects(effects; noub=ALWAYS_FALSE) + elseif !propagate_inbounds(frame) + # The callee read our inbounds flag, but unless we propagate inbounds, + # we ourselves don't read our parent's inbounds. 
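+                            # (noub here was only conditional on not being inside an @inbounds
+                            # region, so for this frame it can be promoted to an unconditional guarantee:)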
+ effects = Effects(effects; noub=ALWAYS_TRUE) + end + end + @assert !isa(rt, TypeVar) "unhandled TypeVar" + rt = maybe_singleton_const(rt) + if !isempty(frame.pclimitations) + if rt isa Const || rt === Union{} + empty!(frame.pclimitations) + else + rt = LimitedAccuracy(rt, frame.pclimitations) + frame.pclimitations = IdSet{InferenceState}() + end + end + end + end + effects === nothing || merge_override_effects!(interp, effects, frame) + if lhs !== nothing && rt !== Bottom + if isa(lhs, SlotNumber) + changes = StateUpdate(lhs, VarState(rt, false)) + elseif isa(lhs, GlobalRef) + handle_global_assignment!(interp, frame, lhs, rt) + elseif !isa(lhs, SSAValue) + merge_effects!(interp, frame, EFFECTS_UNKNOWN) + end + end + end if !has_curr_ssaflag(frame, IR_FLAG_NOTHROW) if exct !== Union{} update_exc_bestguess!(interp, exct, frame) @@ -3566,7 +3713,7 @@ function typeinf_local(interp::AbstractInterpreter, frame::InferenceState) end end # while currbb <= nbbs - nothing + return CurrentState() end function apply_refinement!(𝕃ᵢ::AbstractLattice, slot::SlotNumber, @nospecialize(newtyp), @@ -3617,31 +3764,81 @@ function condition_object_change(currstate::VarTable, condt::Conditional, end # make as much progress on `frame` as possible (by handling cycles) -function typeinf_nocycle(interp::AbstractInterpreter, frame::InferenceState) - typeinf_local(interp, frame) - @assert isempty(frame.ip) +warnlength::Int = 2500 +function typeinf(interp::AbstractInterpreter, frame::InferenceState) callstack = frame.callstack::Vector{AbsIntState} - frame.cycleid == length(callstack) && return true - - no_active_ips_in_callers = false - while true - # If the current frame is not the top part of a cycle, continue to the top of the cycle before resuming work - frame.cycleid == frame.frameid || return false - # If done, return and finalize this cycle - no_active_ips_in_callers && return true - # Otherwise, do at least one iteration over the entire current cycle - no_active_ips_in_callers = true - for i = reverse(frame.cycleid:length(callstack)) - caller = callstack[i]::InferenceState - if !isempty(caller.ip) - # Note that `typeinf_local(interp, caller)` can potentially modify the other frames - # `frame.cycleid`, which is why making incremental progress requires the - # outer while loop. - typeinf_local(interp, caller) - no_active_ips_in_callers = false - end - update_valid_age!(caller, frame.valid_worlds) + nextstates = CurrentState[] + takenext = frame.frameid + minwarn = warnlength + takeprev = 0 + while takenext >= frame.frameid + callee = takenext == 0 ? frame : callstack[takenext]::InferenceState + if !isempty(callstack) + if length(callstack) - frame.frameid >= minwarn + topmethod = callstack[1].linfo + topmethod.def isa Method || (topmethod = callstack[2].linfo) + print(Core.stderr, "info: inference of ", topmethod, " exceeding ", length(callstack), " frames (may be slow).\n") + minwarn *= 2 + end + topcallee = (callstack[end]::InferenceState) + if topcallee.cycleid != callee.cycleid + callee = topcallee + takenext = length(callstack) + end + end + nextstateid = takenext + 1 - frame.frameid + while length(nextstates) < nextstateid + push!(nextstates, CurrentState()) + end + if doworkloop(interp, callee) + # First drain the workloop. Note that since some scheduled work doesn't + # affect the result (e.g. 
cfunction or abstract_call_method on + # get_compileable_sig), but still must be finished up since it may see and + # change the local variables of the InferenceState at currpc, we do this + # even if the nextresult status is already completed. + continue + elseif isdefined(nextstates[nextstateid], :result) || !isempty(callee.ip) + # Next make progress on this frame + prev = length(callee.tasks) + 1 + nextstates[nextstateid] = typeinf_local(interp, callee, nextstates[nextstateid]) + reverse!(callee.tasks, prev) + elseif callee.cycleid == length(callstack) + # With no active ip's and no cycles, frame is done + finish_nocycle(interp, callee) + callee.frameid == 0 && break + takenext = length(callstack) + nextstateid = takenext + 1 - frame.frameid + #@assert length(nextstates) == nextstateid + 1 + #@assert all(i -> !isdefined(nextstates[i], :result), nextstateid+1:length(nextstates)) + resize!(nextstates, nextstateid) + elseif callee.cycleid == callee.frameid + # If the current frame is the top part of a cycle, check if the whole cycle + # is done, and if not, pick the next item to work on. + no_active_ips_in_cycle = true + for i = callee.cycleid:length(callstack) + caller = callstack[i]::InferenceState + @assert caller.cycleid == callee.cycleid + if !isempty(caller.tasks) || isdefined(nextstates[i+1-frame.frameid], :result) || !isempty(caller.ip) + no_active_ips_in_cycle = false + break + end + end + if no_active_ips_in_cycle + finish_cycle(interp, callstack, callee.cycleid) + end + takenext = length(callstack) + nextstateid = takenext + 1 - frame.frameid + if no_active_ips_in_cycle + #@assert all(i -> !isdefined(nextstates[i], :result), nextstateid+1:length(nextstates)) + resize!(nextstates, nextstateid) + else + #@assert length(nextstates) == nextstateid + end + else + # Continue to the next frame in this cycle + takenext = takenext - 1 end end - return true + #@assert all(nextresult -> !isdefined(nextresult, :result), nextstates) + return is_inferred(frame) end diff --git a/base/compiler/inferencestate.jl b/base/compiler/inferencestate.jl index 6953dea5b9bd7..5f8fb82caaa34 100644 --- a/base/compiler/inferencestate.jl +++ b/base/compiler/inferencestate.jl @@ -236,7 +236,7 @@ mutable struct InferenceState slottypes::Vector{Any} src::CodeInfo cfg::CFG - method_info::MethodInfo + spec_info::SpecInfo #= intermediate states for local abstract interpretation =# currbb::Int @@ -251,6 +251,7 @@ mutable struct InferenceState stmt_info::Vector{CallInfo} #= intermediate states for interprocedural abstract interpretation =# + tasks::Vector{WorkThunk} pclimitations::IdSet{InferenceState} # causes of precision restrictions (LimitedAccuracy) on currpc ssavalue limitations::IdSet{InferenceState} # causes of precision restrictions (LimitedAccuracy) on return cycle_backedges::Vector{Tuple{InferenceState, Int}} # call-graph backedges connecting from callee to caller @@ -293,7 +294,7 @@ mutable struct InferenceState sptypes = sptypes_from_meth_instance(mi) code = src.code::Vector{Any} cfg = compute_basic_blocks(code) - method_info = MethodInfo(src) + spec_info = SpecInfo(src) currbb = currpc = 1 ip = BitSet(1) # TODO BitSetBoundedMinPrioritySet(1) @@ -328,6 +329,7 @@ mutable struct InferenceState limitations = IdSet{InferenceState}() cycle_backedges = Vector{Tuple{InferenceState,Int}}() callstack = AbsIntState[] + tasks = WorkThunk[] valid_worlds = WorldRange(1, get_world_counter()) bestguess = Bottom @@ -349,9 +351,9 @@ mutable struct InferenceState restrict_abstract_call_sites = isa(def, Module) this = new( 
- mi, world, mod, sptypes, slottypes, src, cfg, method_info, + mi, world, mod, sptypes, slottypes, src, cfg, spec_info, currbb, currpc, ip, handler_info, ssavalue_uses, bb_vartables, ssavaluetypes, stmt_edges, stmt_info, - pclimitations, limitations, cycle_backedges, callstack, 0, 0, 0, + tasks, pclimitations, limitations, cycle_backedges, callstack, 0, 0, 0, result, unreachable, valid_worlds, bestguess, exc_bestguess, ipo_effects, restrict_abstract_call_sites, cache_mode, insert_coverage, interp) @@ -789,7 +791,7 @@ end # TODO add `result::InferenceResult` and put the irinterp result into the inference cache? mutable struct IRInterpretationState - const method_info::MethodInfo + const spec_info::SpecInfo const ir::IRCode const mi::MethodInstance const world::UInt @@ -800,13 +802,14 @@ mutable struct IRInterpretationState const ssa_refined::BitSet const lazyreachability::LazyCFGReachability valid_worlds::WorldRange + const tasks::Vector{WorkThunk} const edges::Vector{Any} callstack #::Vector{AbsIntState} frameid::Int parentid::Int function IRInterpretationState(interp::AbstractInterpreter, - method_info::MethodInfo, ir::IRCode, mi::MethodInstance, argtypes::Vector{Any}, + spec_info::SpecInfo, ir::IRCode, mi::MethodInstance, argtypes::Vector{Any}, world::UInt, min_world::UInt, max_world::UInt) curridx = 1 given_argtypes = Vector{Any}(undef, length(argtypes)) @@ -825,10 +828,11 @@ mutable struct IRInterpretationState ssa_refined = BitSet() lazyreachability = LazyCFGReachability(ir) valid_worlds = WorldRange(min_world, max_world == typemax(UInt) ? get_world_counter() : max_world) + tasks = WorkThunk[] edges = Any[] callstack = AbsIntState[] - return new(method_info, ir, mi, world, curridx, argtypes_refined, ir.sptypes, tpdum, - ssa_refined, lazyreachability, valid_worlds, edges, callstack, 0, 0) + return new(spec_info, ir, mi, world, curridx, argtypes_refined, ir.sptypes, tpdum, + ssa_refined, lazyreachability, valid_worlds, tasks, edges, callstack, 0, 0) end end @@ -841,14 +845,13 @@ function IRInterpretationState(interp::AbstractInterpreter, else isa(src, CodeInfo) || return nothing end - method_info = MethodInfo(src) + spec_info = SpecInfo(src) ir = inflate_ir(src, mi) argtypes = va_process_argtypes(optimizer_lattice(interp), argtypes, src.nargs, src.isva) - return IRInterpretationState(interp, method_info, ir, mi, argtypes, world, + return IRInterpretationState(interp, spec_info, ir, mi, argtypes, world, codeinst.min_world, codeinst.max_world) end - # AbsIntState # =========== @@ -870,6 +873,7 @@ function print_callstack(frame::AbsIntState) print(frame_instance(sv)) is_cached(sv) || print(" [uncached]") sv.parentid == idx - 1 || print(" [parent=", sv.parentid, "]") + isempty(callers_in_cycle(sv)) || print(" [cycle=", sv.cycleid, "]") println() @assert sv.frameid == idx end @@ -922,11 +926,11 @@ is_constproped(::IRInterpretationState) = true is_cached(sv::InferenceState) = !iszero(sv.cache_mode & CACHE_MODE_GLOBAL) is_cached(::IRInterpretationState) = false -method_info(sv::InferenceState) = sv.method_info -method_info(sv::IRInterpretationState) = sv.method_info +spec_info(sv::InferenceState) = sv.spec_info +spec_info(sv::IRInterpretationState) = sv.spec_info -propagate_inbounds(sv::AbsIntState) = method_info(sv).propagate_inbounds -method_for_inference_limit_heuristics(sv::AbsIntState) = method_info(sv).method_for_inference_limit_heuristics +propagate_inbounds(sv::AbsIntState) = spec_info(sv).propagate_inbounds +method_for_inference_limit_heuristics(sv::AbsIntState) = 
spec_info(sv).method_for_inference_limit_heuristics frame_world(sv::InferenceState) = sv.world frame_world(sv::IRInterpretationState) = sv.world @@ -994,7 +998,10 @@ of the same cycle, only if it is part of a cycle with multiple frames. function callers_in_cycle(sv::InferenceState) callstack = sv.callstack::Vector{AbsIntState} cycletop = cycleid = sv.cycleid - while cycletop < length(callstack) && (callstack[cycletop + 1]::InferenceState).cycleid == cycleid + while cycletop < length(callstack) + frame = callstack[cycletop + 1] + frame isa InferenceState || break + frame.cycleid == cycleid || break cycletop += 1 end return AbsIntCycle(callstack, cycletop == cycleid ? 0 : cycleid, cycletop) @@ -1054,6 +1061,7 @@ function merge_effects!(::AbstractInterpreter, caller::InferenceState, effects:: effects = Effects(effects; effect_free=ALWAYS_TRUE) end caller.ipo_effects = merge_effects(caller.ipo_effects, effects) + nothing end merge_effects!(::AbstractInterpreter, ::IRInterpretationState, ::Effects) = return @@ -1116,3 +1124,90 @@ function get_max_methods_for_module(mod::Module) max_methods < 0 && return nothing return max_methods end + +""" + Future{T} + +Delayed return value for a value of type `T`, similar to RefValue{T}, but +explicitly represents completed as a `Bool` rather than as `isdefined`. +Set once with `f[] = v` and accessed with `f[]` afterwards. + +Can also be constructed with the `completed` flag value and a closure to +produce `x`, as well as the additional arguments to avoid always capturing the +same couple of values. +""" +struct Future{T} + later::Union{Nothing,RefValue{T}} + now::Union{Nothing,T} + Future{T}() where {T} = new{T}(RefValue{T}(), nothing) + Future{T}(x) where {T} = new{T}(nothing, x) + Future(x::T) where {T} = new{T}(nothing, x) +end +isready(f::Future) = f.later === nothing +getindex(f::Future{T}) where {T} = (later = f.later; later === nothing ? f.now::T : later[]) +setindex!(f::Future, v) = something(f.later)[] = v +convert(::Type{Future{T}}, x) where {T} = Future{T}(x) # support return type conversion +convert(::Type{Future{T}}, x::Future) where {T} = x::Future{T} +function Future{T}(f, immediate::Bool, interp::AbstractInterpreter, sv::AbsIntState) where {T} + if immediate + return Future{T}(f(interp, sv)) + else + @assert applicable(f, interp, sv) + result = Future{T}() + push!(sv.tasks, function (interp, sv) + result[] = f(interp, sv) + return true + end) + return result + end +end +function Future{T}(f, prev::Future{S}, interp::AbstractInterpreter, sv::AbsIntState) where {T, S} + later = prev.later + if later === nothing + return Future{T}(f(prev[], interp, sv)) + else + @assert Core._hasmethod(Tuple{Core.Typeof(f), S, typeof(interp), typeof(sv)}) + result = Future{T}() + push!(sv.tasks, function (interp, sv) + result[] = f(later[], interp, sv) # capture just later, instead of all of prev + return true + end) + return result + end +end + + +""" + doworkloop(args...) + +Run a tasks inside the abstract interpreter, returning false if there are none. +Tasks will be run in DFS post-order tree order, such that all child tasks will +be run in the order scheduled, prior to running any subsequent tasks. This +allows tasks to generate more child tasks, which will be run before anything else. +Each task will be run repeatedly when returning `false`, until it returns `true`. 
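# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical names ToyFuture/then/drain!/WORKLIST, not
# from this patch): a stripped-down model of the Future / work-loop pattern the
# diff introduces. It mirrors how `Future{T}(f, prev, interp, sv)` either runs a
# continuation immediately or pushes it onto `sv.tasks` for `doworkloop` to
# retry until it reports completion, which is also how `typeinf_local` can be
# suspended and resumed.
mutable struct ToyFuture{T}
    done::Bool
    value::T
    ToyFuture{T}() where {T} = new{T}(false)      # pending, value not yet set
    ToyFuture(x::T) where {T} = new{T}(true, x)   # already completed
end

const WORKLIST = Function[]   # plays the role of sv.tasks

# Chain a continuation onto `prev`: run it now if `prev` is complete, otherwise
# schedule a task that keeps returning `false` until the value is available.
function then(f, prev::ToyFuture, ::Type{T}) where {T}
    prev.done && return ToyFuture(f(prev.value)::T)
    result = ToyFuture{T}()
    push!(WORKLIST, function ()
        prev.done || return false          # not ready; the loop will retry it
        result.value = f(prev.value)::T
        result.done = true
        return true
    end)
    return result
end

# Drain tasks roughly the way `doworkloop` does (simplified: the real loop also
# uses `reverse!` so child tasks run before the work that scheduled them, and it
# schedules work such that some task can always make progress).
function drain!()
    while !isempty(WORKLIST)
        task = pop!(WORKLIST)
        task() || push!(WORKLIST, task)
    end
end

# Usage: `b` depends on the pending `a`; a later task produces `a`, after which
# the loop completes the chained continuation.
a = ToyFuture{Int}()
b = then(x -> x + 1, a, Int)
push!(WORKLIST, () -> (a.value = 41; a.done = true; true))
drain!()
@assert b.done && b.value == 42
# --------------------------------------------------------------------------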
+""" +function doworkloop(interp::AbstractInterpreter, sv::AbsIntState) + tasks = sv.tasks + prev = length(tasks) + prev == 0 && return false + task = pop!(tasks) + completed = task(interp, sv) + tasks = sv.tasks # allow dropping gc root over the previous call + completed isa Bool || throw(TypeError(:return, "", Bool, task)) # print the task on failure as part of the error message, instead of just "@ workloop:line" + completed || push!(tasks, task) + # efficient post-order visitor: items pushed are executed in reverse post order such + # that later items are executed before earlier ones, but are fully executed + # (including any dependencies scheduled by them) before going on to the next item + reverse!(tasks, #=start=#prev) + return true +end + + +#macro workthunk(name::Symbol, body) +# name = esc(name) +# body = esc(body) +# return replace_linenums!( +# :(function $name($(esc(interp)), $(esc(sv))) +# $body +# end), __source__) +#end diff --git a/base/compiler/optimize.jl b/base/compiler/optimize.jl index fb712b1c71b12..02f6b46e2e73f 100644 --- a/base/compiler/optimize.jl +++ b/base/compiler/optimize.jl @@ -644,10 +644,10 @@ function ((; code_cache)::GetNativeEscapeCache)(mi::MethodInstance) return false end -function refine_effects!(interp::AbstractInterpreter, sv::PostOptAnalysisState) +function refine_effects!(interp::AbstractInterpreter, opt::OptimizationState, sv::PostOptAnalysisState) if !is_effect_free(sv.result.ipo_effects) && sv.all_effect_free && !isempty(sv.ea_analysis_pending) ir = sv.ir - nargs = length(ir.argtypes) + nargs = Int(opt.src.nargs) estate = EscapeAnalysis.analyze_escapes(ir, nargs, optimizer_lattice(interp), GetNativeEscapeCache(interp)) argescapes = EscapeAnalysis.ArgEscapeCache(estate) stack_analysis_result!(sv.result, argescapes) @@ -702,6 +702,8 @@ function check_all_args_noescape!(sv::PostOptAnalysisState, ir::IRCode, @nospeci else return false end + has_no_escape(x::EscapeAnalysis.EscapeInfo) = + EscapeAnalysis.has_no_escape(EscapeAnalysis.ignore_argescape(x)) for i = startidx:length(stmt.args) arg = stmt.args[i] argt = argextype(arg, ir) @@ -710,7 +712,7 @@ function check_all_args_noescape!(sv::PostOptAnalysisState, ir::IRCode, @nospeci end # See if we can find the allocation if isa(arg, Argument) - if EscapeAnalysis.has_no_escape(EscapeAnalysis.ignore_argescape(estate[arg])) + if has_no_escape(estate[arg]) # Even if we prove everything else effect_free, the best we can # say is :effect_free_if_argmem_only if sv.effect_free_if_argmem_only === nothing @@ -721,7 +723,7 @@ function check_all_args_noescape!(sv::PostOptAnalysisState, ir::IRCode, @nospeci end return false elseif isa(arg, SSAValue) - EscapeAnalysis.has_no_escape(estate[arg]) || return false + has_no_escape(estate[arg]) || return false check_all_args_noescape!(sv, ir, ir[arg][:stmt], estate) || return false else return false @@ -937,7 +939,8 @@ function check_inconsistentcy!(sv::PostOptAnalysisState, scanner::BBScanner) end end -function ipo_dataflow_analysis!(interp::AbstractInterpreter, ir::IRCode, result::InferenceResult) +function ipo_dataflow_analysis!(interp::AbstractInterpreter, opt::OptimizationState, + ir::IRCode, result::InferenceResult) if !is_ipo_dataflow_analysis_profitable(result.ipo_effects) return false end @@ -965,13 +968,13 @@ function ipo_dataflow_analysis!(interp::AbstractInterpreter, ir::IRCode, result: end end - return refine_effects!(interp, sv) + return refine_effects!(interp, opt, sv) end # run the optimization work function optimize(interp::AbstractInterpreter, 
opt::OptimizationState, caller::InferenceResult) - @timeit "optimizer" ir = run_passes_ipo_safe(opt.src, opt, caller) - ipo_dataflow_analysis!(interp, ir, caller) + @timeit "optimizer" ir = run_passes_ipo_safe(opt.src, opt) + ipo_dataflow_analysis!(interp, opt, ir, caller) return finish(interp, opt, ir, caller) end @@ -993,7 +996,6 @@ matchpass(::Nothing, _, _) = false function run_passes_ipo_safe( ci::CodeInfo, sv::OptimizationState, - caller::InferenceResult, optimize_until = nothing, # run all passes by default ) __stage__ = 0 # used by @pass diff --git a/base/compiler/ssair/EscapeAnalysis/EscapeAnalysis.jl b/base/compiler/ssair/EscapeAnalysis/EscapeAnalysis.jl index f74cb90e6ab51..0ad55d6fbcd9e 100644 --- a/base/compiler/ssair/EscapeAnalysis/EscapeAnalysis.jl +++ b/base/compiler/ssair/EscapeAnalysis/EscapeAnalysis.jl @@ -732,11 +732,13 @@ function compute_frameinfo(ir::IRCode) inst = ir[SSAValue(idx)] stmt = inst[:stmt] if isa(stmt, EnterNode) - @assert idx ≤ nstmts "try/catch inside new_nodes unsupported" - tryregions === nothing && (tryregions = UnitRange{Int}[]) leave_block = stmt.catch_dest - leave_pc = first(ir.cfg.blocks[leave_block].stmts) - push!(tryregions, idx:leave_pc) + if leave_block ≠ 0 + @assert idx ≤ nstmts "try/catch inside new_nodes unsupported" + tryregions === nothing && (tryregions = UnitRange{Int}[]) + leave_pc = first(ir.cfg.blocks[leave_block].stmts) + push!(tryregions, idx:leave_pc) + end elseif arrayinfo !== nothing # TODO this super limited alias analysis is able to handle only very simple cases # this should be replaced with a proper forward dimension analysis @@ -1213,6 +1215,7 @@ escape_builtin!(::typeof(Core.donotdelete), _...) = false # not really safe, but `ThrownEscape` will be imposed later escape_builtin!(::typeof(isdefined), _...) = false escape_builtin!(::typeof(throw), _...) = false +escape_builtin!(::typeof(Core.throw_methoderror), _...) 
= false function escape_builtin!(::typeof(ifelse), astate::AnalysisState, pc::Int, args::Vector{Any}) length(args) == 4 || return false diff --git a/base/compiler/ssair/inlining.jl b/base/compiler/ssair/inlining.jl index 70318b9e1a979..5017b619469ff 100644 --- a/base/compiler/ssair/inlining.jl +++ b/base/compiler/ssair/inlining.jl @@ -12,6 +12,8 @@ struct InliningTodo mi::MethodInstance # The IR of the inlinee ir::IRCode + # The SpecInfo for the inlinee + spec_info::SpecInfo # The DebugInfo table for the inlinee di::DebugInfo # If the function being inlined is a single basic block we can use a @@ -20,8 +22,8 @@ struct InliningTodo # Effects of the call statement effects::Effects end -function InliningTodo(mi::MethodInstance, (ir, di)::Tuple{IRCode, DebugInfo}, effects::Effects) - return InliningTodo(mi, ir, di, linear_inline_eligible(ir), effects) +function InliningTodo(mi::MethodInstance, ir::IRCode, spec_info::SpecInfo, di::DebugInfo, effects::Effects) + return InliningTodo(mi, ir, spec_info, di, linear_inline_eligible(ir), effects) end struct ConstantCase @@ -50,12 +52,13 @@ struct InliningCase end struct UnionSplit - fully_covered::Bool + handled_all_cases::Bool # All possible dispatches are included in the cases + fully_covered::Bool # All handled cases are fully covering atype::DataType cases::Vector{InliningCase} bbs::Vector{Int} - UnionSplit(fully_covered::Bool, atype::DataType, cases::Vector{InliningCase}) = - new(fully_covered, atype, cases, Int[]) + UnionSplit(handled_all_cases::Bool, fully_covered::Bool, atype::DataType, cases::Vector{InliningCase}) = + new(handled_all_cases, fully_covered, atype, cases, Int[]) end struct InliningEdgeTracker @@ -215,7 +218,7 @@ end function cfg_inline_unionsplit!(ir::IRCode, idx::Int, union_split::UnionSplit, state::CFGInliningState, params::OptimizationParams) - (; fully_covered, #=atype,=# cases, bbs) = union_split + (; handled_all_cases, fully_covered, #=atype,=# cases, bbs) = union_split inline_into_block!(state, block_for_inst(ir, idx)) from_bbs = Int[] delete!(state.split_targets, length(state.new_cfg_blocks)) @@ -235,7 +238,7 @@ function cfg_inline_unionsplit!(ir::IRCode, idx::Int, union_split::UnionSplit, end end push!(from_bbs, length(state.new_cfg_blocks)) - if !(i == length(cases) && fully_covered) + if !(i == length(cases) && (handled_all_cases && fully_covered)) # This block will have the next condition or the final else case push!(state.new_cfg_blocks, BasicBlock(StmtRange(idx, idx))) push!(state.new_cfg_blocks[cond_bb].succs, length(state.new_cfg_blocks)) @@ -244,7 +247,10 @@ function cfg_inline_unionsplit!(ir::IRCode, idx::Int, union_split::UnionSplit, end end # The edge from the fallback block. - fully_covered || push!(from_bbs, length(state.new_cfg_blocks)) + # NOTE This edge is only required for `!handled_all_cases` and not `!fully_covered`, + # since in the latter case we inline `Core.throw_methoderror` into the fallback + # block, which is must-throw, making the subsequent code path unreachable. 
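    # A user-level illustration of the distinction (hypothetical functions g/h,
    # not from this patch): with only `g(::Int)` defined, a call whose argument
    # is inferred as `Union{Int, String}` can be union-split into a single case
    # for the Int component. Every applicable *method* is then handled
    # (`handled_all_cases`), but the String component matches no method, so the
    # split is not `fully_covered`; its fallback can become an inlined
    # `Core.throw_methoderror`, which always throws:
    #
    #     g(x::Int) = x + 1
    #     h(flag::Bool) = g(flag ? 1 : "one")
    #     h(true)   # 2
    #     h(false)  # raises a MethodError for g("one"), the same error the
    #               # inlined fallback produces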
+ !handled_all_cases && push!(from_bbs, length(state.new_cfg_blocks)) # This block will be the block everyone returns to push!(state.new_cfg_blocks, BasicBlock(StmtRange(idx, idx), from_bbs, orig_succs)) join_bb = length(state.new_cfg_blocks) @@ -317,7 +323,8 @@ function ir_inline_linetable!(debuginfo::DebugInfoStream, inlinee_debuginfo::Deb end function ir_prepare_inlining!(insert_node!::Inserter, inline_target::Union{IRCode, IncrementalCompact}, - ir::IRCode, di::DebugInfo, mi::MethodInstance, inlined_at::NTuple{3,Int32}, argexprs::Vector{Any}) + ir::IRCode, spec_info::SpecInfo, di::DebugInfo, mi::MethodInstance, + inlined_at::NTuple{3,Int32}, argexprs::Vector{Any}) def = mi.def::Method debuginfo = inline_target isa IRCode ? inline_target.debuginfo : inline_target.ir.debuginfo topline = new_inlined_at = ir_inline_linetable!(debuginfo, di, inlined_at) @@ -330,8 +337,8 @@ function ir_prepare_inlining!(insert_node!::Inserter, inline_target::Union{IRCod spvals_ssa = insert_node!( removable_if_unused(NewInstruction(Expr(:call, Core._compute_sparams, def, argexprs...), SimpleVector, topline))) end - if def.isva - nargs_def = Int(def.nargs::Int32) + if spec_info.isva + nargs_def = spec_info.nargs if nargs_def > 0 argexprs = fix_va_argexprs!(insert_node!, inline_target, argexprs, nargs_def, topline) end @@ -358,7 +365,7 @@ function ir_inline_item!(compact::IncrementalCompact, idx::Int, argexprs::Vector item::InliningTodo, boundscheck::Symbol, todo_bbs::Vector{Tuple{Int, Int}}) # Ok, do the inlining here inlined_at = compact.result[idx][:line] - ssa_substitute = ir_prepare_inlining!(InsertHere(compact), compact, item.ir, item.di, item.mi, inlined_at, argexprs) + ssa_substitute = ir_prepare_inlining!(InsertHere(compact), compact, item.ir, item.spec_info, item.di, item.mi, inlined_at, argexprs) boundscheck = has_flag(compact.result[idx], IR_FLAG_INBOUNDS) ? :off : boundscheck # If the iterator already moved on to the next basic block, @@ -523,7 +530,7 @@ assuming their order stays the same post-discovery in `ml_matches`. 
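# Illustrative sketch (hypothetical `repack_varargs`, not from this patch): a
# simplified model of the vararg fix-up that ir_prepare_inlining! now drives
# from SpecInfo.nargs/SpecInfo.isva instead of the Method's fields. When the
# inlinee is a vararg method, the trailing call arguments are collapsed into a
# single tuple for the vararg slot (nargs counts the function argument itself).
function repack_varargs(argexprs::Vector{Any}, nargs_def::Int)
    newargs = argexprs[1:nargs_def-1]   # keep the fixed argument slots
    push!(newargs, Expr(:call, GlobalRef(Core, :tuple), argexprs[nargs_def:end]...))
    return newargs
end
# e.g. inlining f(x, ys...) (nargs = 3 counting #self#, isva = true) at f(a, b, c):
# repack_varargs(Any[:f, :a, :b, :c], 3) yields Any[:f, :a, <call to Core.tuple(b, c)>]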
function ir_inline_unionsplit!(compact::IncrementalCompact, idx::Int, argexprs::Vector{Any}, union_split::UnionSplit, boundscheck::Symbol, todo_bbs::Vector{Tuple{Int,Int}}, interp::AbstractInterpreter) - (; fully_covered, atype, cases, bbs) = union_split + (; handled_all_cases, fully_covered, atype, cases, bbs) = union_split stmt, typ, line = compact.result[idx][:stmt], compact.result[idx][:type], compact.result[idx][:line] join_bb = bbs[end] pn = PhiNode() @@ -538,7 +545,7 @@ function ir_inline_unionsplit!(compact::IncrementalCompact, idx::Int, argexprs:: cond = true nparams = fieldcount(atype) @assert nparams == fieldcount(mtype) - if !(i == ncases && fully_covered) + if !(i == ncases && fully_covered && handled_all_cases) for i = 1:nparams aft, mft = fieldtype(atype, i), fieldtype(mtype, i) # If this is always true, we don't need to check for it @@ -597,14 +604,18 @@ function ir_inline_unionsplit!(compact::IncrementalCompact, idx::Int, argexprs:: end bb += 1 # We're now in the fall through block, decide what to do - if !fully_covered + if !handled_all_cases ssa = insert_node_here!(compact, NewInstruction(stmt, typ, line)) push!(pn.edges, bb) push!(pn.values, ssa) insert_node_here!(compact, NewInstruction(GotoNode(join_bb), Any, line)) finish_current_bb!(compact, 0) + elseif !fully_covered + insert_node_here!(compact, NewInstruction(Expr(:call, GlobalRef(Core, :throw_methoderror), argexprs...), Union{}, line)) + insert_node_here!(compact, NewInstruction(ReturnNode(), Union{}, line)) + finish_current_bb!(compact, 0) + ncases == 0 && return insert_node_here!(compact, NewInstruction(nothing, Any, line)) end - # We're now in the join block. return insert_node_here!(compact, NewInstruction(pn, typ, line)) end @@ -852,15 +863,14 @@ function resolve_todo(mi::MethodInstance, result::Union{Nothing,InferenceResult, if inferred_result isa ConstantCase add_inlining_backedge!(et, mi) return inferred_result - end - if inferred_result isa InferredResult + elseif inferred_result isa InferredResult (; src, effects) = inferred_result elseif inferred_result isa CodeInstance src = @atomic :monotonic inferred_result.inferred effects = decode_effects(inferred_result.ipo_purity_bits) - else - src = nothing - effects = Effects() + else # there is no cached source available, bail out + return compileable_specialization(mi, Effects(), et, info; + compilesig_invokes=OptimizationParams(state.interp).compilesig_invokes) end # the duplicated check might have been done already within `analyze_method!`, but still @@ -875,9 +885,12 @@ function resolve_todo(mi::MethodInstance, result::Union{Nothing,InferenceResult, compilesig_invokes=OptimizationParams(state.interp).compilesig_invokes) add_inlining_backedge!(et, mi) - ir = inferred_result isa CodeInstance ? 
retrieve_ir_for_inlining(inferred_result, src) : - retrieve_ir_for_inlining(mi, src, preserve_local_sources) - return InliningTodo(mi, ir, effects) + if inferred_result isa CodeInstance + ir, spec_info, debuginfo = retrieve_ir_for_inlining(inferred_result, src) + else + ir, spec_info, debuginfo = retrieve_ir_for_inlining(mi, src, preserve_local_sources) + end + return InliningTodo(mi, ir, spec_info, debuginfo, effects) end # the special resolver for :invoke-d call @@ -893,23 +906,17 @@ function resolve_todo(mi::MethodInstance, @nospecialize(info::CallInfo), flag::U if cached_result isa ConstantCase add_inlining_backedge!(et, mi) return cached_result - end - if cached_result isa InferredResult - (; src, effects) = cached_result elseif cached_result isa CodeInstance src = @atomic :monotonic cached_result.inferred effects = decode_effects(cached_result.ipo_purity_bits) - else - src = nothing - effects = Effects() + else # there is no cached source available, bail out + return nothing end - preserve_local_sources = true src_inlining_policy(state.interp, src, info, flag) || return nothing - ir = cached_result isa CodeInstance ? retrieve_ir_for_inlining(cached_result, src) : - retrieve_ir_for_inlining(mi, src, preserve_local_sources) + ir, spec_info, debuginfo = retrieve_ir_for_inlining(cached_result, src) add_inlining_backedge!(et, mi) - return InliningTodo(mi, ir, effects) + return InliningTodo(mi, ir, spec_info, debuginfo, effects) end function validate_sparams(sparams::SimpleVector) @@ -963,22 +970,29 @@ function analyze_method!(match::MethodMatch, argtypes::Vector{Any}, return resolve_todo(mi, volatile_inf_result, info, flag, state; invokesig) end -function retrieve_ir_for_inlining(cached_result::CodeInstance, src::MaybeCompressed) - src = _uncompressed_ir(cached_result, src)::CodeInfo - return inflate_ir!(src, cached_result.def), src.debuginfo +function retrieve_ir_for_inlining(cached_result::CodeInstance, src::String) + src = _uncompressed_ir(cached_result, src) + return inflate_ir!(src, cached_result.def), SpecInfo(src), src.debuginfo +end +function retrieve_ir_for_inlining(cached_result::CodeInstance, src::CodeInfo) + return inflate_ir!(copy(src), cached_result.def), SpecInfo(src), src.debuginfo end function retrieve_ir_for_inlining(mi::MethodInstance, src::CodeInfo, preserve_local_sources::Bool) if preserve_local_sources src = copy(src) end - return inflate_ir!(src, mi), src.debuginfo + return inflate_ir!(src, mi), SpecInfo(src), src.debuginfo end function retrieve_ir_for_inlining(mi::MethodInstance, ir::IRCode, preserve_local_sources::Bool) if preserve_local_sources ir = copy(ir) end + # COMBAK this is not correct, we should make `InferenceResult` propagate `SpecInfo` + spec_info = let m = mi.def::Method + SpecInfo(Int(m.nargs), m.isva, false, nothing) + end ir.debuginfo.def = mi - return ir, DebugInfo(ir.debuginfo, length(ir.stmts)) + return ir, spec_info, DebugInfo(ir.debuginfo, length(ir.stmts)) end function handle_single_case!(todo::Vector{Pair{Int,Any}}, @@ -1348,10 +1362,6 @@ function compute_inlining_cases(@nospecialize(info::CallInfo), flag::UInt32, sig # Too many applicable methods # Or there is a (partial?) 
ambiguity return nothing - elseif length(meth) == 0 - # No applicable methods; try next union split - handled_all_cases = false - continue end local split_fully_covered = false for (j, match) in enumerate(meth) @@ -1392,12 +1402,16 @@ function compute_inlining_cases(@nospecialize(info::CallInfo), flag::UInt32, sig handled_all_cases &= handle_any_const_result!(cases, result, match, argtypes, info, flag, state; allow_typevars=true) end + if !fully_covered + atype = argtypes_to_type(sig.argtypes) + # We will emit an inline MethodError so we need a backedge to the MethodTable + add_uncovered_edges!(state.edges, info, atype) + end elseif !isempty(cases) # if we've not seen all candidates, union split is valid only for dispatch tuples filter!(case::InliningCase->isdispatchtuple(case.sig), cases) end - - return cases, (handled_all_cases & fully_covered), joint_effects + return cases, handled_all_cases, fully_covered, joint_effects end function handle_call!(todo::Vector{Pair{Int,Any}}, @@ -1405,9 +1419,9 @@ function handle_call!(todo::Vector{Pair{Int,Any}}, state::InliningState) cases = compute_inlining_cases(info, flag, sig, state) cases === nothing && return nothing - cases, all_covered, joint_effects = cases + cases, handled_all_cases, fully_covered, joint_effects = cases atype = argtypes_to_type(sig.argtypes) - handle_cases!(todo, ir, idx, stmt, atype, cases, all_covered, joint_effects) + handle_cases!(todo, ir, idx, stmt, atype, cases, handled_all_cases, fully_covered, joint_effects) end function handle_match!(cases::Vector{InliningCase}, @@ -1458,8 +1472,8 @@ function semiconcrete_result_item(result::SemiConcreteResult, add_inlining_backedge!(et, mi) preserve_local_sources = OptimizationParams(state.interp).preserve_local_sources - ir = retrieve_ir_for_inlining(mi, result.ir, preserve_local_sources) - return InliningTodo(mi, ir, result.effects) + ir, _, debuginfo = retrieve_ir_for_inlining(mi, result.ir, preserve_local_sources) + return InliningTodo(mi, ir, result.spec_info, debuginfo, result.effects) end function handle_semi_concrete_result!(cases::Vector{InliningCase}, result::SemiConcreteResult, @@ -1496,19 +1510,19 @@ function concrete_result_item(result::ConcreteResult, @nospecialize(info::CallIn end function handle_cases!(todo::Vector{Pair{Int,Any}}, ir::IRCode, idx::Int, stmt::Expr, - @nospecialize(atype), cases::Vector{InliningCase}, all_covered::Bool, + @nospecialize(atype), cases::Vector{InliningCase}, handled_all_cases::Bool, fully_covered::Bool, joint_effects::Effects) # If we only have one case and that case is fully covered, we may either # be able to do the inlining now (for constant cases), or push it directly # onto the todo list - if all_covered && length(cases) == 1 + if fully_covered && handled_all_cases && length(cases) == 1 handle_single_case!(todo, ir, idx, stmt, cases[1].item) - elseif length(cases) > 0 + elseif length(cases) > 0 || handled_all_cases isa(atype, DataType) || return nothing for case in cases isa(case.sig, DataType) || return nothing end - push!(todo, idx=>UnionSplit(all_covered, atype, cases)) + push!(todo, idx=>UnionSplit(handled_all_cases, fully_covered, atype, cases)) else add_flag!(ir[SSAValue(idx)], flags_for_effects(joint_effects)) end @@ -1589,7 +1603,6 @@ function handle_finalizer_call!(ir::IRCode, idx::Int, stmt::Expr, info::Finalize push!(stmt.args, item1.invoke) elseif isa(item1, ConstantCase) push!(stmt.args, nothing) - push!(stmt.args, item1.val) end end return nothing diff --git a/base/compiler/ssair/ir.jl b/base/compiler/ssair/ir.jl index 
960da88ddffc8..fdcb4621c5c0f 100644 --- a/base/compiler/ssair/ir.jl +++ b/base/compiler/ssair/ir.jl @@ -1432,6 +1432,7 @@ function process_node!(compact::IncrementalCompact, result_idx::Int, inst::Instr elseif isa(stmt, OldSSAValue) ssa_rename[idx] = ssa_rename[stmt.id] elseif isa(stmt, GotoNode) && cfg_transforms_enabled + stmt.label < 0 && (println(stmt); println(compact)) label = bb_rename_succ[stmt.label] @assert label > 0 ssa_rename[idx] = SSAValue(result_idx) diff --git a/base/compiler/ssair/irinterp.jl b/base/compiler/ssair/irinterp.jl index 1aeb87accbcd7..ca8ca770df413 100644 --- a/base/compiler/ssair/irinterp.jl +++ b/base/compiler/ssair/irinterp.jl @@ -51,8 +51,11 @@ end function abstract_call(interp::AbstractInterpreter, arginfo::ArgInfo, irsv::IRInterpretationState) si = StmtInfo(true) # TODO better job here? - call = abstract_call(interp, arginfo, si, irsv) - irsv.ir.stmts[irsv.curridx][:info] = call.info + call = abstract_call(interp, arginfo, si, irsv)::Future + Future{Nothing}(call, interp, irsv) do call, interp, irsv + irsv.ir.stmts[irsv.curridx][:info] = call.info + nothing + end return call end @@ -143,7 +146,19 @@ function reprocess_instruction!(interp::AbstractInterpreter, inst::Instruction, head = stmt.head if (head === :call || head === :foreigncall || head === :new || head === :splatnew || head === :static_parameter || head === :isdefined || head === :boundscheck) - (; rt, effects) = abstract_eval_statement_expr(interp, stmt, nothing, irsv) + @assert isempty(irsv.tasks) # TODO: this whole function needs to be converted to a stackless design to be a valid AbsIntState, but this should work here for now + result = abstract_eval_statement_expr(interp, stmt, nothing, irsv) + reverse!(irsv.tasks) + while true + if length(irsv.callstack) > irsv.frameid + typeinf(interp, irsv.callstack[irsv.frameid + 1]) + elseif !doworkloop(interp, irsv) + break + end + end + @assert length(irsv.callstack) == irsv.frameid && isempty(irsv.tasks) + result isa Future && (result = result[]) + (; rt, effects) = result add_flag!(inst, flags_for_effects(effects)) elseif head === :invoke rt, (nothrow, noub) = abstract_eval_invoke_inst(interp, inst, irsv) @@ -293,7 +308,7 @@ function is_all_const_call(@nospecialize(stmt), interp::AbstractInterpreter, irs return true end -function _ir_abstract_constant_propagation(interp::AbstractInterpreter, irsv::IRInterpretationState; +function ir_abstract_constant_propagation(interp::AbstractInterpreter, irsv::IRInterpretationState; externally_refined::Union{Nothing,BitSet} = nothing) (; ir, tpdum, ssa_refined) = irsv @@ -449,18 +464,3 @@ function _ir_abstract_constant_propagation(interp::AbstractInterpreter, irsv::IR return Pair{Any,Tuple{Bool,Bool}}(maybe_singleton_const(ultimate_rt), (nothrow, noub)) end - -function ir_abstract_constant_propagation(interp::NativeInterpreter, irsv::IRInterpretationState) - if __measure_typeinf__[] - inf_frame = Timings.InferenceFrameInfo(irsv.mi, irsv.world, VarState[], Any[], length(irsv.ir.argtypes)) - Timings.enter_new_timer(inf_frame) - ret = _ir_abstract_constant_propagation(interp, irsv) - append!(inf_frame.slottypes, irsv.ir.argtypes) - Timings.exit_current_timer(inf_frame) - return ret - else - return _ir_abstract_constant_propagation(interp, irsv) - end -end -ir_abstract_constant_propagation(interp::AbstractInterpreter, irsv::IRInterpretationState) = - _ir_abstract_constant_propagation(interp, irsv) diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl index 37d79e2bd7b0c..0e2272524a0ed 100644 --- 
a/base/compiler/ssair/passes.jl +++ b/base/compiler/ssair/passes.jl @@ -1532,7 +1532,7 @@ function try_inline_finalizer!(ir::IRCode, argexprs::Vector{Any}, idx::Int, end src_inlining_policy(inlining.interp, src, info, IR_FLAG_NULL) || return false - src, di = retrieve_ir_for_inlining(code, src) + src, spec_info, di = retrieve_ir_for_inlining(code, src) # For now: Require finalizer to only have one basic block length(src.cfg.blocks) == 1 || return false @@ -1542,7 +1542,7 @@ function try_inline_finalizer!(ir::IRCode, argexprs::Vector{Any}, idx::Int, # TODO: Should there be a special line number node for inlined finalizers? inline_at = ir[SSAValue(idx)][:line] - ssa_substitute = ir_prepare_inlining!(InsertBefore(ir, SSAValue(idx)), ir, src, di, mi, inline_at, argexprs) + ssa_substitute = ir_prepare_inlining!(InsertBefore(ir, SSAValue(idx)), ir, src, spec_info, di, mi, inline_at, argexprs) # TODO: Use the actual inliner here rather than open coding this special purpose inliner. ssa_rename = Vector{Any}(undef, length(src.stmts)) @@ -1564,10 +1564,12 @@ end is_nothrow(ir::IRCode, ssa::SSAValue) = has_flag(ir[ssa], IR_FLAG_NOTHROW) -function reachable_blocks(cfg::CFG, from_bb::Int, to_bb::Union{Nothing,Int} = nothing) +function reachable_blocks(cfg::CFG, from_bb::Int, to_bb::Int) worklist = Int[from_bb] visited = BitSet(from_bb) - if to_bb !== nothing + if to_bb == from_bb + return visited + else push!(visited, to_bb) end function visit!(bb::Int) @@ -1582,100 +1584,78 @@ function reachable_blocks(cfg::CFG, from_bb::Int, to_bb::Union{Nothing,Int} = no return visited end -function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse::SSADefUse, +function try_resolve_finalizer!(ir::IRCode, alloc_idx::Int, finalizer_idx::Int, defuse::SSADefUse, inlining::InliningState, lazydomtree::LazyDomtree, lazypostdomtree::LazyPostDomtree, @nospecialize(info::CallInfo)) # For now, require that: # 1. The allocation dominates the finalizer registration - # 2. The finalizer registration dominates all uses reachable from the - # finalizer registration. - # 3. The insertion block for the finalizer is the post-dominator of all - # uses and the finalizer registration block. The insertion block must - # be dominated by the finalizer registration block. - # 4. The path from the finalizer registration to the finalizer inlining + # 2. The insertion block for the finalizer is the post-dominator of all + # uses (including the finalizer registration). + # 3. The path from the finalizer registration to the finalizer inlining # location is nothrow # - # TODO: We could relax item 3, by inlining the finalizer multiple times. + # TODO: We could relax the check 2, by inlining the finalizer multiple times. 
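To make the legality checks above concrete, here is a purely illustrative sketch of the kind of user code this pass targets; the `Handle` type and `use` function are hypothetical and not taken from this patch. The allocation dominates the `finalizer` registration, every use of the object is post-dominated by a single insertion point, and the statements in between are nothrow:

```julia
mutable struct Handle          # hypothetical mutable type owning an external resource
    ptr::Ptr{Cvoid}
end

function use()
    h = Handle(Libc.malloc(16))            # allocation dominates the registration below
    finalizer(x -> Libc.free(x.ptr), h)    # finalizer registered right after allocation
    unsafe_store!(Ptr{UInt8}(h.ptr), 0x01) # nothrow use, post-dominated by the return
    return nothing                         # candidate point to inline the finalizer body
end
```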
# Check #1: The allocation dominates the finalizer registration domtree = get!(lazydomtree) finalizer_bb = block_for_inst(ir, finalizer_idx) - alloc_bb = block_for_inst(ir, idx) + alloc_bb = block_for_inst(ir, alloc_idx) dominates(domtree, alloc_bb, finalizer_bb) || return nothing - bb_insert_block::Int = finalizer_bb - bb_insert_idx::Union{Int,Nothing} = finalizer_idx - function note_block_use!(usebb::Int, useidx::Int) - new_bb_insert_block = nearest_common_dominator(get!(lazypostdomtree), - bb_insert_block, usebb) - if new_bb_insert_block == bb_insert_block && bb_insert_idx !== nothing - bb_insert_idx = max(bb_insert_idx::Int, useidx) - elseif new_bb_insert_block == usebb - bb_insert_idx = useidx + # Check #2: The insertion block for the finalizer is the post-dominator of all uses + insert_bb::Int = finalizer_bb + insert_idx::Union{Int,Nothing} = finalizer_idx + function note_defuse!(x::Union{Int,SSAUse}) + defuse_idx = x isa SSAUse ? x.idx : x + defuse_idx == finalizer_idx && return nothing + defuse_bb = block_for_inst(ir, defuse_idx) + new_insert_bb = nearest_common_dominator(get!(lazypostdomtree), + insert_bb, defuse_bb) + if new_insert_bb == insert_bb && insert_idx !== nothing + insert_idx = max(insert_idx::Int, defuse_idx) + elseif new_insert_bb == defuse_bb + insert_idx = defuse_idx else - bb_insert_idx = nothing + insert_idx = nothing end - bb_insert_block = new_bb_insert_block + insert_bb = new_insert_bb nothing end - - # Collect all reachable blocks between the finalizer registration and the - # insertion point - blocks = reachable_blocks(ir.cfg, finalizer_bb, alloc_bb) - - # Check #2 - function check_defuse(x::Union{Int,SSAUse}) - duidx = x isa SSAUse ? x.idx : x - duidx == finalizer_idx && return true - bb = block_for_inst(ir, duidx) - # Not reachable from finalizer registration - we're ok - bb ∉ blocks && return true - note_block_use!(bb, duidx) - if dominates(domtree, finalizer_bb, bb) - return true - else - return false - end - end - all(check_defuse, defuse.uses) || return nothing - all(check_defuse, defuse.defs) || return nothing - bb_insert_block != 0 || return nothing # verify post-dominator of all uses exists - - # Check #3 - dominates(domtree, finalizer_bb, bb_insert_block) || return nothing + foreach(note_defuse!, defuse.uses) + foreach(note_defuse!, defuse.defs) + insert_bb != 0 || return nothing # verify post-dominator of all uses exists if !OptimizationParams(inlining.interp).assume_fatal_throw # Collect all reachable blocks between the finalizer registration and the # insertion point - blocks = finalizer_bb == bb_insert_block ? Int[finalizer_bb] : - reachable_blocks(ir.cfg, finalizer_bb, bb_insert_block) + blocks = reachable_blocks(ir.cfg, finalizer_bb, insert_bb) - # Check #4 - function check_range_nothrow(ir::IRCode, s::Int, e::Int) + # Check #3 + function check_range_nothrow(s::Int, e::Int) return all(s:e) do sidx::Int sidx == finalizer_idx && return true - sidx == idx && return true + sidx == alloc_idx && return true return is_nothrow(ir, SSAValue(sidx)) end end for bb in blocks range = ir.cfg.blocks[bb].stmts s, e = first(range), last(range) - if bb == bb_insert_block - bb_insert_idx === nothing && continue - e = bb_insert_idx + if bb == insert_bb + insert_idx === nothing && continue + e = insert_idx end if bb == finalizer_bb s = finalizer_idx end - check_range_nothrow(ir, s, e) || return nothing + check_range_nothrow(s, e) || return nothing end end # Ok, legality check complete. Figure out the exact statement where we're # going to inline the finalizer. 
- loc = bb_insert_idx === nothing ? first(ir.cfg.blocks[bb_insert_block].stmts) : bb_insert_idx::Int - attach_after = bb_insert_idx !== nothing + loc = insert_idx === nothing ? first(ir.cfg.blocks[insert_bb].stmts) : insert_idx::Int + attach_after = insert_idx !== nothing finalizer_stmt = ir[SSAValue(finalizer_idx)][:stmt] argexprs = Any[finalizer_stmt.args[2], finalizer_stmt.args[3]] @@ -1702,11 +1682,10 @@ function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse return nothing end -function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}, lazydomtree::LazyDomtree, inlining::Union{Nothing, InliningState}) +function sroa_mutables!(ir::IRCode, defuses::IdDict{Int,Tuple{SPCSet,SSADefUse}}, used_ssas::Vector{Int}, lazydomtree::LazyDomtree, inlining::Union{Nothing,InliningState}) 𝕃ₒ = inlining === nothing ? SimpleInferenceLattice.instance : optimizer_lattice(inlining.interp) lazypostdomtree = LazyPostDomtree(ir) for (defidx, (intermediaries, defuse)) in defuses - intermediaries = collect(intermediaries) # Check if there are any uses we did not account for. If so, the variable # escapes and we cannot eliminate the allocation. This works, because we're guaranteed # not to include any intermediaries that have dead uses. As a result, missing uses will only ever @@ -1728,22 +1707,24 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse ismutabletype(typ) || continue typ = typ::DataType # First check for any finalizer calls - finalizer_idx = nothing - for use in defuse.uses + finalizer_useidx = nothing + for (useidx, use) in enumerate(defuse.uses) if use.kind === :finalizer # For now: Only allow one finalizer per allocation - finalizer_idx !== nothing && @goto skip - finalizer_idx = use.idx + finalizer_useidx !== nothing && @goto skip + finalizer_useidx = useidx end end - if finalizer_idx !== nothing && inlining !== nothing + all_eliminated = all_forwarded = true + if finalizer_useidx !== nothing && inlining !== nothing + finalizer_idx = defuse.uses[finalizer_useidx].idx try_resolve_finalizer!(ir, defidx, finalizer_idx, defuse, inlining, lazydomtree, lazypostdomtree, ir[SSAValue(finalizer_idx)][:info]) - continue + deleteat!(defuse.uses, finalizer_useidx) + all_eliminated = all_forwarded = false # can't eliminate `setfield!` calls safely end # Partition defuses by field fielddefuse = SSADefUse[SSADefUse() for _ = 1:fieldcount(typ)] - all_eliminated = all_forwarded = true for use in defuse.uses if use.kind === :preserve for du in fielddefuse @@ -1906,7 +1887,7 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse end end -function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preserves::Vector{Any}) +function form_new_preserves(origex::Expr, intermediaries::Union{Vector{Int},SPCSet}, new_preserves::Vector{Any}) newex = Expr(:foreigncall) nccallargs = length(origex.args[3]::SimpleVector) for i in 1:(6+nccallargs-1) @@ -1915,7 +1896,7 @@ function form_new_preserves(origex::Expr, intermediates::Vector{Int}, new_preser for i in (6+nccallargs):length(origex.args) x = origex.args[i] # don't need to preserve intermediaries - if isa(x, SSAValue) && x.id in intermediates + if isa(x, SSAValue) && x.id in intermediaries continue end push!(newex.args, x) diff --git a/base/compiler/ssair/verify.jl b/base/compiler/ssair/verify.jl index a4286177e93a4..268991282c483 100644 --- a/base/compiler/ssair/verify.jl +++ b/base/compiler/ssair/verify.jl @@ -1,9 +1,11 @@ # 
This file is a part of Julia. License is MIT: https://julialang.org/license function maybe_show_ir(ir::IRCode) - if isdefined(Core, :Main) + if isdefined(Core, :Main) && isdefined(Core.Main, :Base) # ensure we use I/O that does not yield, as this gets called during compilation invokelatest(Core.Main.Base.show, Core.stdout, "text/plain", ir) + else + Core.show(ir) end end @@ -25,6 +27,7 @@ is_toplevel_expr_head(head::Symbol) = head === :global || head === :method || he is_value_pos_expr_head(head::Symbol) = head === :static_parameter function check_op(ir::IRCode, domtree::DomTree, @nospecialize(op), use_bb::Int, use_idx::Int, printed_use_idx::Int, print::Bool, isforeigncall::Bool, arg_idx::Int, allow_frontend_forms::Bool) if isa(op, SSAValue) + op.id > 0 || @verify_error "Def ($(op.id)) is invalid in final IR" if op.id > length(ir.stmts) def_bb = block_for_inst(ir.cfg, ir.new_nodes.info[op.id - length(ir.stmts)].pos) else diff --git a/base/compiler/stmtinfo.jl b/base/compiler/stmtinfo.jl index 69d2ac7ae45a0..9dba7a4459f9e 100644 --- a/base/compiler/stmtinfo.jl +++ b/base/compiler/stmtinfo.jl @@ -33,10 +33,16 @@ not a call to a generic function. """ struct MethodMatchInfo <: CallInfo results::MethodLookupResult + mt::MethodTable + fullmatch::Bool end nsplit_impl(info::MethodMatchInfo) = 1 getsplit_impl(info::MethodMatchInfo, idx::Int) = (@assert idx == 1; info.results) getresult_impl(::MethodMatchInfo, ::Int) = nothing +function add_uncovered_edges_impl(edges::Vector{Any}, info::MethodMatchInfo, @nospecialize(atype)) + fully_covering(info) || push!(edges, info.mt, atype) + nothing +end """ info::UnionSplitInfo <: CallInfo @@ -48,20 +54,27 @@ each partition (`info.matches::Vector{MethodMatchInfo}`). This info is illegal on any statement that is not a call to a generic function. 
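As a small, hypothetical illustration of a partially covered union split: the call `f(x)` below is split over `Int`, `String`, and `Nothing`, but only the first two have matching methods, so the split is not fully covering and a potential `MethodError` has to be accounted for.

```julia
f(x::Int)    = 1
f(x::String) = 2

# Inference union-splits the call `f(x)` over the three element types.
# `Int` and `String` are covered by methods; `Nothing` is not, so the
# fallthrough must either re-dispatch or raise a MethodError.
g(x::Union{Int,String,Nothing}) = f(x)
```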
""" struct UnionSplitInfo <: CallInfo - matches::Vector{MethodMatchInfo} + split::Vector{MethodMatchInfo} end nmatches(info::MethodMatchInfo) = length(info.results) function nmatches(info::UnionSplitInfo) n = 0 - for mminfo in info.matches + for mminfo in info.split n += nmatches(mminfo) end return n end -nsplit_impl(info::UnionSplitInfo) = length(info.matches) -getsplit_impl(info::UnionSplitInfo, idx::Int) = getsplit_impl(info.matches[idx], 1) +nsplit_impl(info::UnionSplitInfo) = length(info.split) +getsplit_impl(info::UnionSplitInfo, idx::Int) = getsplit(info.split[idx], 1) getresult_impl(::UnionSplitInfo, ::Int) = nothing +function add_uncovered_edges_impl(edges::Vector{Any}, info::UnionSplitInfo, @nospecialize(atype)) + all(fully_covering, info.split) && return nothing + # add mt backedges with removing duplications + for mt in uncovered_method_tables(info) + push!(edges, mt, atype) + end +end abstract type ConstResult end @@ -81,6 +94,7 @@ struct SemiConcreteResult <: ConstResult mi::MethodInstance ir::IRCode effects::Effects + spec_info::SpecInfo end # XXX Technically this does not represent a result of constant inference, but rather that of @@ -105,6 +119,7 @@ end nsplit_impl(info::ConstCallInfo) = nsplit(info.call) getsplit_impl(info::ConstCallInfo, idx::Int) = getsplit(info.call, idx) getresult_impl(info::ConstCallInfo, idx::Int) = info.results[idx] +add_uncovered_edges_impl(edges::Vector{Any}, info::ConstCallInfo, @nospecialize(atype)) = add_uncovered_edges!(edges, info.call, atype) """ info::MethodResultPure <: CallInfo diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index 0c57c04a6ddea..a6b7e53c6f320 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -89,6 +89,7 @@ function add_tfunc(@nospecialize(f::Builtin), minarg::Int, maxarg::Int, @nospeci end add_tfunc(throw, 1, 1, @nospecs((𝕃::AbstractLattice, x)->Bottom), 0) +add_tfunc(Core.throw_methoderror, 1, INT_INF, @nospecs((𝕃::AbstractLattice, x)->Bottom), 0) # the inverse of typeof_tfunc # returns (type, isexact, isconcrete, istype) @@ -134,8 +135,8 @@ function instanceof_tfunc(@nospecialize(t), astag::Bool=false, @nospecialize(tro end return tr, isexact, isconcrete, istype elseif isa(t, Union) - ta, isexact_a, isconcrete_a, istype_a = instanceof_tfunc(t.a, astag, troot) - tb, isexact_b, isconcrete_b, istype_b = instanceof_tfunc(t.b, astag, troot) + ta, isexact_a, isconcrete_a, istype_a = instanceof_tfunc(unwraptv(t.a), astag, troot) + tb, isexact_b, isconcrete_b, istype_b = instanceof_tfunc(unwraptv(t.b), astag, troot) isconcrete = isconcrete_a && isconcrete_b istype = istype_a && istype_b # most users already handle the Union case, so here we assume that @@ -562,9 +563,9 @@ add_tfunc(Core.sizeof, 1, 1, sizeof_tfunc, 1) end end if isa(x, Union) - na = nfields_tfunc(𝕃, x.a) + na = nfields_tfunc(𝕃, unwraptv(x.a)) na === Int && return Int - return tmerge(na, nfields_tfunc(𝕃, x.b)) + return tmerge(𝕃, na, nfields_tfunc(𝕃, unwraptv(x.b))) end return Int end @@ -1382,10 +1383,10 @@ end nargs = length(argtypes) if !isempty(argtypes) && isvarargtype(argtypes[nargs]) - nargs - 1 <= maxargs || return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) - nargs + 1 >= op_argi || return CallMeta(Any, Any, Effects(), NoCallInfo()) + nargs - 1 <= maxargs || return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) + nargs + 1 >= op_argi || return Future(CallMeta(Any, Any, Effects(), NoCallInfo())) else - minargs <= nargs <= maxargs || return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) + 
minargs <= nargs <= maxargs || return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) end 𝕃ᵢ = typeinf_lattice(interp) if ff === modifyfield! @@ -1416,15 +1417,22 @@ end op = unwrapva(argtypes[op_argi]) v = unwrapva(argtypes[v_argi]) callinfo = abstract_call(interp, ArgInfo(nothing, Any[op, TF, v]), StmtInfo(true), sv, #=max_methods=#1) - TF2 = tmeet(callinfo.rt, widenconst(TF)) - if TF2 === Bottom - RT = Bottom - elseif isconcretetype(RT) && has_nontrivial_extended_info(𝕃ᵢ, TF2) # isconcrete condition required to form a PartialStruct - RT = PartialStruct(RT, Any[TF, TF2]) + TF = Core.Box(TF) + RT = Core.Box(RT) + return Future{CallMeta}(callinfo, interp, sv) do callinfo, interp, sv + TF = TF.contents + RT = RT.contents + TF2 = tmeet(callinfo.rt, widenconst(TF)) + if TF2 === Bottom + RT = Bottom + elseif isconcretetype(RT) && has_nontrivial_extended_info(𝕃ᵢ, TF2) # isconcrete condition required to form a PartialStruct + RT = PartialStruct(RT, Any[TF, TF2]) + end + info = ModifyOpInfo(callinfo.info) + return CallMeta(RT, Any, Effects(), info) end - info = ModifyOpInfo(callinfo.info) end - return CallMeta(RT, Any, Effects(), info) + return Future(CallMeta(RT, Any, Effects(), info)) end # we could use tuple_tfunc instead of widenconst, but `o` is mutable, so that is unlikely to be beneficial @@ -2313,6 +2321,7 @@ const _CONSISTENT_BUILTINS = Any[ (<:), typeassert, throw, + Core.throw_methoderror, setfield!, donotdelete ] @@ -2335,6 +2344,7 @@ const _EFFECT_FREE_BUILTINS = [ (<:), typeassert, throw, + Core.throw_methoderror, getglobal, compilerbarrier, ] @@ -2350,6 +2360,7 @@ const _INACCESSIBLEMEM_BUILTINS = Any[ isa, nfields, throw, + Core.throw_methoderror, tuple, typeassert, typeof, @@ -2891,17 +2902,17 @@ end function return_type_tfunc(interp::AbstractInterpreter, argtypes::Vector{Any}, si::StmtInfo, sv::AbsIntState) UNKNOWN = CallMeta(Type, Any, Effects(EFFECTS_THROWS; nortcall=false), NoCallInfo()) if !(2 <= length(argtypes) <= 3) - return UNKNOWN + return Future(UNKNOWN) end tt = widenslotwrapper(argtypes[end]) if !isa(tt, Const) && !(isType(tt) && !has_free_typevars(tt)) - return UNKNOWN + return Future(UNKNOWN) end af_argtype = isa(tt, Const) ? 
tt.val : (tt::DataType).parameters[1] if !isa(af_argtype, DataType) || !(af_argtype <: Tuple) - return UNKNOWN + return Future(UNKNOWN) end if length(argtypes) == 3 @@ -2914,7 +2925,7 @@ function return_type_tfunc(interp::AbstractInterpreter, argtypes::Vector{Any}, s end if !(isa(aft, Const) || (isType(aft) && !has_free_typevars(aft)) || (isconcretetype(aft) && !(aft <: Builtin) && !iskindtype(aft))) - return UNKNOWN + return Future(UNKNOWN) end # effects are not an issue if we know this statement will get removed, but if it does not get removed, @@ -2922,7 +2933,7 @@ function return_type_tfunc(interp::AbstractInterpreter, argtypes::Vector{Any}, s RT_CALL_EFFECTS = Effects(EFFECTS_TOTAL; nortcall=false) if contains_is(argtypes_vec, Union{}) - return CallMeta(Const(Union{}), Union{}, RT_CALL_EFFECTS, NoCallInfo()) + return Future(CallMeta(Const(Union{}), Union{}, RT_CALL_EFFECTS, NoCallInfo())) end # Run the abstract_call without restricting abstract call @@ -2931,42 +2942,45 @@ function return_type_tfunc(interp::AbstractInterpreter, argtypes::Vector{Any}, s if isa(sv, InferenceState) old_restrict = sv.restrict_abstract_call_sites sv.restrict_abstract_call_sites = false - call = abstract_call(interp, ArgInfo(nothing, argtypes_vec), si, sv, #=max_methods=#-1) - sv.restrict_abstract_call_sites = old_restrict - else - call = abstract_call(interp, ArgInfo(nothing, argtypes_vec), si, sv, #=max_methods=#-1) - end - info = verbose_stmt_info(interp) ? MethodResultPure(ReturnTypeCallInfo(call.info)) : MethodResultPure() - rt = widenslotwrapper(call.rt) - if isa(rt, Const) - # output was computed to be constant - return CallMeta(Const(typeof(rt.val)), Union{}, RT_CALL_EFFECTS, info) - end - rt = widenconst(rt) - if rt === Bottom || (isconcretetype(rt) && !iskindtype(rt)) - # output cannot be improved so it is known for certain - return CallMeta(Const(rt), Union{}, RT_CALL_EFFECTS, info) - elseif isa(sv, InferenceState) && !isempty(sv.pclimitations) - # conservatively express uncertainty of this result - # in two ways: both as being a subtype of this, and - # because of LimitedAccuracy causes - return CallMeta(Type{<:rt}, Union{}, RT_CALL_EFFECTS, info) - elseif isa(tt, Const) || isconstType(tt) - # input arguments were known for certain - # XXX: this doesn't imply we know anything about rt - return CallMeta(Const(rt), Union{}, RT_CALL_EFFECTS, info) - elseif isType(rt) - return CallMeta(Type{rt}, Union{}, RT_CALL_EFFECTS, info) - else - return CallMeta(Type{<:rt}, Union{}, RT_CALL_EFFECTS, info) + end + call = abstract_call(interp, ArgInfo(nothing, argtypes_vec), si, sv, #=max_methods=#-1) + tt = Core.Box(tt) + return Future{CallMeta}(call, interp, sv) do call, interp, sv + if isa(sv, InferenceState) + sv.restrict_abstract_call_sites = old_restrict + end + info = verbose_stmt_info(interp) ? 
MethodResultPure(ReturnTypeCallInfo(call.info)) : MethodResultPure() + rt = widenslotwrapper(call.rt) + if isa(rt, Const) + # output was computed to be constant + return CallMeta(Const(typeof(rt.val)), Union{}, RT_CALL_EFFECTS, info) + end + rt = widenconst(rt) + if rt === Bottom || (isconcretetype(rt) && !iskindtype(rt)) + # output cannot be improved so it is known for certain + return CallMeta(Const(rt), Union{}, RT_CALL_EFFECTS, info) + elseif isa(sv, InferenceState) && !isempty(sv.pclimitations) + # conservatively express uncertainty of this result + # in two ways: both as being a subtype of this, and + # because of LimitedAccuracy causes + return CallMeta(Type{<:rt}, Union{}, RT_CALL_EFFECTS, info) + elseif isa(tt.contents, Const) || isconstType(tt.contents) + # input arguments were known for certain + # XXX: this doesn't imply we know anything about rt + return CallMeta(Const(rt), Union{}, RT_CALL_EFFECTS, info) + elseif isType(rt) + return CallMeta(Type{rt}, Union{}, RT_CALL_EFFECTS, info) + else + return CallMeta(Type{<:rt}, Union{}, RT_CALL_EFFECTS, info) + end end end # a simplified model of abstract_call_gf_by_type for applicable function abstract_applicable(interp::AbstractInterpreter, argtypes::Vector{Any}, sv::AbsIntState, max_methods::Int) - length(argtypes) < 2 && return CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo()) - isvarargtype(argtypes[2]) && return CallMeta(Bool, Any, EFFECTS_THROWS, NoCallInfo()) + length(argtypes) < 2 && return Future(CallMeta(Bottom, Any, EFFECTS_THROWS, NoCallInfo())) + isvarargtype(argtypes[2]) && return Future(CallMeta(Bool, Any, EFFECTS_THROWS, NoCallInfo())) argtypes = argtypes[2:end] atype = argtypes_to_type(argtypes) matches = find_method_matches(interp, argtypes, atype; max_methods) @@ -2975,36 +2989,25 @@ function abstract_applicable(interp::AbstractInterpreter, argtypes::Vector{Any}, else (; valid_worlds, applicable) = matches update_valid_age!(sv, valid_worlds) - - # also need an edge to the method table in case something gets - # added that did not intersect with any existing method - if isa(matches, MethodMatches) - matches.fullmatch || add_mt_backedge!(sv, matches.mt, atype) - else - for (thisfullmatch, mt) in zip(matches.fullmatches, matches.mts) - thisfullmatch || add_mt_backedge!(sv, mt, atype) - end - end - napplicable = length(applicable) if napplicable == 0 rt = Const(false) # never any matches + elseif !fully_covering(matches) || any_ambig(matches) + # Account for the fact that we may encounter a MethodError with a non-covered or ambiguous signature. + rt = Bool else rt = Const(true) # has applicable matches - for i in 1:napplicable - match = applicable[i]::MethodMatch - edge = specialize_method(match)::MethodInstance - add_backedge!(sv, edge) - end - - if isa(matches, MethodMatches) ? (!matches.fullmatch || any_ambig(matches)) : - (!all(matches.fullmatches) || any_ambig(matches)) - # Account for the fact that we may encounter a MethodError with a non-covered or ambiguous signature. 
- rt = Bool - end end + for i in 1:napplicable + match = applicable[i]::MethodMatch + edge = specialize_method(match)::MethodInstance + add_backedge!(sv, edge) + end + # also need an edge to the method table in case something gets + # added that did not intersect with any existing method + add_uncovered_edges!(sv, matches, atype) end - return CallMeta(rt, Union{}, EFFECTS_TOTAL, NoCallInfo()) + return Future(CallMeta(rt, Union{}, EFFECTS_TOTAL, NoCallInfo())) end add_tfunc(applicable, 1, INT_INF, @nospecs((𝕃::AbstractLattice, f, args...)->Bool), 40) diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl index e2f2a1f2cc975..8b85f7c6f35f1 100644 --- a/base/compiler/typeinfer.jl +++ b/base/compiler/typeinfer.jl @@ -56,7 +56,7 @@ end Timing(mi_info, start_time, cur_start_time, time, children) = Timing(mi_info, start_time, cur_start_time, time, children, nothing) Timing(mi_info, start_time) = Timing(mi_info, start_time, start_time, UInt64(0), Timing[]) -_time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base not yet available. +_time_ns() = ccall(:jl_hrtime, UInt64, ()) # We keep a stack of the Timings for each of the MethodInstances currently being timed. # Since type inference currently operates via a depth-first search (during abstract @@ -77,114 +77,14 @@ const ROOTmi = Core.Compiler.specialize_method( Empty out the previously recorded type inference timings (`Core.Compiler._timings`), and start the ROOT() timer again. `ROOT()` measures all time spent _outside_ inference. """ -function reset_timings() - empty!(_timings) - push!(_timings, Timing( - # The MethodInstance for ROOT(), and default empty values for other fields. - InferenceFrameInfo(ROOTmi, 0x0, Core.Compiler.VarState[], Any[Core.Const(ROOT)], 1), - _time_ns())) - return nothing -end -reset_timings() - -# (This is split into a function so that it can be called both in this module, at the top -# of `enter_new_timer()`, and once at the Very End of the operation, by whoever started -# the operation and called `reset_timings()`.) -# NOTE: the @inline annotations here are not to make it faster, but to reduce the gap between -# timer manipulations and the tasks we're timing. -@inline function close_current_timer() - stop_time = _time_ns() - parent_timer = _timings[end] - accum_time = stop_time - parent_timer.cur_start_time - - # Add in accum_time ("modify" the immutable struct) - @inbounds begin - _timings[end] = Timing( - parent_timer.mi_info, - parent_timer.start_time, - parent_timer.cur_start_time, - parent_timer.time + accum_time, - parent_timer.children, - parent_timer.bt, - ) - end - return nothing -end - -@inline function enter_new_timer(frame) - # Very first thing, stop the active timer: get the current time and add in the - # time since it was last started to its aggregate exclusive time. - close_current_timer() - - mi_info = _typeinf_identifier(frame) - - # Start the new timer right before returning - push!(_timings, Timing(mi_info, UInt64(0))) - len = length(_timings) - new_timer = @inbounds _timings[len] - # Set the current time _after_ appending the node, to try to exclude the - # overhead from measurement. 
- start = _time_ns() - - @inbounds begin - _timings[len] = Timing( - new_timer.mi_info, - start, - start, - new_timer.time, - new_timer.children, - ) - end - - return nothing -end - -# _expected_frame_ is not needed within this function; it is used in the `@assert`, to -# assert that indeed we are always returning to a parent after finishing all of its -# children (that is, asserting that inference proceeds via depth-first-search). -@inline function exit_current_timer(_expected_frame_) - # Finish the new timer - stop_time = _time_ns() - - expected_mi_info = _typeinf_identifier(_expected_frame_) - - # Grab the new timer again because it might have been modified in _timings - # (since it's an immutable struct) - # And remove it from the current timings stack - new_timer = pop!(_timings) - Core.Compiler.@assert new_timer.mi_info.mi === expected_mi_info.mi - - # Prepare to unwind one level of the stack and record in the parent - parent_timer = _timings[end] - - accum_time = stop_time - new_timer.cur_start_time - # Add in accum_time ("modify" the immutable struct) - new_timer = Timing( - new_timer.mi_info, - new_timer.start_time, - new_timer.cur_start_time, - new_timer.time + accum_time, - new_timer.children, - parent_timer.mi_info.mi === ROOTmi ? backtrace() : nothing, - ) - # Record the final timing with the original parent timer - push!(parent_timer.children, new_timer) - - # And finally restart the parent timer: - len = length(_timings) - @inbounds begin - _timings[len] = Timing( - parent_timer.mi_info, - parent_timer.start_time, - _time_ns(), - parent_timer.time, - parent_timer.children, - parent_timer.bt, - ) - end - - return nothing -end +function reset_timings() end +push!(_timings, Timing( + # The MethodInstance for ROOT(), and default empty values for other fields. + InferenceFrameInfo(ROOTmi, 0x0, Core.Compiler.VarState[], Any[Core.Const(ROOT)], 1), + _time_ns())) +function close_current_timer() end +function enter_new_timer(frame) end +function exit_current_timer(_expected_frame_) end end # module Timings @@ -194,19 +94,7 @@ end # module Timings If set to `true`, record per-method-instance timings within type inference in the Compiler. """ __set_measure_typeinf(onoff::Bool) = __measure_typeinf__[] = onoff -const __measure_typeinf__ = fill(false) - -# Wrapper around `_typeinf` that optionally records the exclusive time for each invocation. 
-function typeinf(interp::AbstractInterpreter, frame::InferenceState) - if __measure_typeinf__[] - Timings.enter_new_timer(frame) - v = _typeinf(interp, frame) - Timings.exit_current_timer(frame) - return v - else - return _typeinf(interp, frame) - end -end +const __measure_typeinf__ = RefValue{Bool}(false) function finish!(interp::AbstractInterpreter, caller::InferenceState; can_discard_trees::Bool=may_discard_trees(interp)) @@ -258,19 +146,6 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState; return nothing end -function _typeinf(interp::AbstractInterpreter, frame::InferenceState) - typeinf_nocycle(interp, frame) || return false # frame is now part of a higher cycle - # with no active ip's, frame is done - frames = frame.callstack::Vector{AbsIntState} - if length(frames) == frame.cycleid - finish_nocycle(interp, frame) - else - @assert frame.cycleid != 0 - finish_cycle(interp, frames, frame.cycleid) - end - return true -end - function finish_nocycle(::AbstractInterpreter, frame::InferenceState) finishinfer!(frame, frame.interp) opt = frame.result.src @@ -762,16 +637,11 @@ function merge_call_chain!(interp::AbstractInterpreter, parent::InferenceState, add_cycle_backedge!(parent, child) parent.cycleid === ancestorid && break child = parent - parent = frame_parent(child) - while !isa(parent, InferenceState) - # XXX we may miss some edges here? - parent = frame_parent(parent::IRInterpretationState) - end + parent = frame_parent(child)::InferenceState end # ensure that walking the callstack has the same cycleid (DAG) for frame = reverse(ancestorid:length(frames)) - frame = frames[frame] - frame isa InferenceState || continue + frame = frames[frame]::InferenceState frame.cycleid == ancestorid && break @assert frame.cycleid > ancestorid frame.cycleid = ancestorid @@ -796,9 +666,9 @@ end # returned instead. function resolve_call_cycle!(interp::AbstractInterpreter, mi::MethodInstance, parent::AbsIntState) # TODO (#48913) implement a proper recursion handling for irinterp: - # This works just because currently the `:terminate` condition guarantees that - # irinterp doesn't fail into unresolved cycles, but it's not a good solution. - # We should revisit this once we have a better story for handling cycles in irinterp. + # This works currently just because the irinterp code doesn't get used much with + # `@assume_effects`, so it never sees a cycle normally, but that may not be a sustainable solution. 
+ parent isa InferenceState || return false frames = parent.callstack::Vector{AbsIntState} uncached = false for frame = reverse(1:length(frames)) @@ -837,15 +707,43 @@ struct EdgeCallResult end # return cached result of regular inference -function return_cached_result(::AbstractInterpreter, codeinst::CodeInstance, caller::AbsIntState) +function return_cached_result(interp::AbstractInterpreter, method::Method, codeinst::CodeInstance, caller::AbsIntState, edgecycle::Bool, edgelimited::Bool) rt = cached_return_type(codeinst) effects = ipo_effects(codeinst) update_valid_age!(caller, WorldRange(min_world(codeinst), max_world(codeinst))) - return EdgeCallResult(rt, codeinst.exctype, codeinst.def, effects) + return Future(EdgeCall_to_MethodCall_Result(interp, caller, method, EdgeCallResult(rt, codeinst.exctype, codeinst.def, effects), edgecycle, edgelimited)) +end + +function EdgeCall_to_MethodCall_Result(interp::AbstractInterpreter, sv::AbsIntState, method::Method, result::EdgeCallResult, edgecycle::Bool, edgelimited::Bool) + (; rt, exct, edge, effects, volatile_inf_result) = result + + if edge === nothing + edgecycle = edgelimited = true + end + + # we look for the termination effect override here as well, since the :terminates effect + # may have been tainted due to recursion at this point even if it's overridden + if is_effect_overridden(sv, :terminates_globally) + # this frame is known to terminate + effects = Effects(effects, terminates=true) + elseif is_effect_overridden(method, :terminates_globally) + # this edge is known to terminate + effects = Effects(effects; terminates=true) + elseif edgecycle + # Some sort of recursion was detected. + if edge !== nothing && !edgelimited && !is_edge_recursed(edge, sv) + # no `MethodInstance` cycles -- don't taint :terminate + else + # we cannot guarantee that the call will terminate + effects = Effects(effects; terminates=false) + end + end + + return MethodCallResult(rt, exct, edgecycle, edgelimited, edge, effects, volatile_inf_result) end # compute (and cache) an inferred AST and return the current best estimate of the result type -function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize(atype), sparams::SimpleVector, caller::AbsIntState) +function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize(atype), sparams::SimpleVector, caller::AbsIntState, edgecycle::Bool, edgelimited::Bool) mi = specialize_method(method, atype, sparams)::MethodInstance cache_mode = CACHE_MODE_GLOBAL # cache edge targets globally by default force_inline = is_stmt_inline(get_curr_ssaflag(caller)) @@ -859,13 +757,13 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize cache_mode = CACHE_MODE_VOLATILE else @assert codeinst.def === mi "MethodInstance for cached edge does not match" - return return_cached_result(interp, codeinst, caller) + return return_cached_result(interp, method, codeinst, caller, edgecycle, edgelimited) end end end if ccall(:jl_get_module_infer, Cint, (Any,), method.module) == 0 && !generating_output(#=incremental=#false) - add_remark!(interp, caller, "Inference is disabled for the target module") - return EdgeCallResult(Any, Any, nothing, Effects()) + add_remark!(interp, caller, "[typeinf_edge] Inference is disabled for the target module") + return Future(EdgeCall_to_MethodCall_Result(interp, caller, method, EdgeCallResult(Any, Any, nothing, Effects()), edgecycle, edgelimited)) end if !is_cached(caller) && frame_parent(caller) === nothing # this caller exists to return to the user 
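The `:terminates_globally` override consulted above is the one a caller opts into with `Base.@assume_effects`. A minimal sketch, with a hypothetical `mysum` function:

```julia
Base.@assume_effects :terminates_globally function mysum(n::Int)
    s = 0
    for i in 1:n   # the annotation asserts this loop always terminates
        s += i
    end
    return s
end
```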
@@ -886,7 +784,7 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize cache_mode = CACHE_MODE_VOLATILE else @assert codeinst.def === mi "MethodInstance for cached edge does not match" - return return_cached_result(interp, codeinst, caller) + return return_cached_result(interp, method, codeinst, caller, edgecycle, edgelimited) end end end @@ -897,35 +795,45 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize end frame = InferenceState(result, cache_mode, interp) # always use the cache for edge targets if frame === nothing - add_remark!(interp, caller, "Failed to retrieve source") + add_remark!(interp, caller, "[typeinf_edge] Failed to retrieve source") # can't get the source for this, so we know nothing if cache_mode == CACHE_MODE_GLOBAL engine_reject(interp, ci) end - return EdgeCallResult(Any, Any, nothing, Effects()) + return Future(EdgeCall_to_MethodCall_Result(interp, caller, method, EdgeCallResult(Any, Any, nothing, Effects()), edgecycle, edgelimited)) end assign_parentchild!(frame, caller) - typeinf(interp, frame) - update_valid_age!(caller, frame.valid_worlds) - isinferred = is_inferred(frame) - edge = isinferred ? mi : nothing - effects = isinferred ? frame.result.ipo_effects : # effects are adjusted already within `finish` for ipo_effects - adjust_effects(effects_for_cycle(frame.ipo_effects), method) - exc_bestguess = refine_exception_type(frame.exc_bestguess, effects) - # propagate newly inferred source to the inliner, allowing efficient inlining w/o deserialization: - # note that this result is cached globally exclusively, so we can use this local result destructively - volatile_inf_result = isinferred ? VolatileInferenceResult(result) : nothing - return EdgeCallResult(frame.bestguess, exc_bestguess, edge, effects, volatile_inf_result) + # the actual inference task for this edge is going to be scheduled within `typeinf_local` via the callstack queue + # while splitting off the rest of the work for this caller into a separate workq thunk + let mresult = Future{MethodCallResult}() + push!(caller.tasks, function get_infer_result(interp, caller) + update_valid_age!(caller, frame.valid_worlds) + local isinferred = is_inferred(frame) + local edge = isinferred ? mi : nothing + local effects = isinferred ? frame.result.ipo_effects : # effects are adjusted already within `finish` for ipo_effects + adjust_effects(effects_for_cycle(frame.ipo_effects), method) + local exc_bestguess = refine_exception_type(frame.exc_bestguess, effects) + # propagate newly inferred source to the inliner, allowing efficient inlining w/o deserialization: + # note that this result is cached globally exclusively, so we can use this local result destructively + local volatile_inf_result = isinferred ? 
VolatileInferenceResult(result) : nothing + local edgeresult = EdgeCallResult(frame.bestguess, exc_bestguess, edge, effects, volatile_inf_result) + mresult[] = EdgeCall_to_MethodCall_Result(interp, caller, method, edgeresult, edgecycle, edgelimited) + return true + end) + return mresult + end elseif frame === true # unresolvable cycle - return EdgeCallResult(Any, Any, nothing, Effects()) + add_remark!(interp, caller, "[typeinf_edge] Unresolvable cycle") + return Future(EdgeCall_to_MethodCall_Result(interp, caller, method, EdgeCallResult(Any, Any, nothing, Effects()), edgecycle, edgelimited)) end # return the current knowledge about this cycle frame = frame::InferenceState update_valid_age!(caller, frame.valid_worlds) effects = adjust_effects(effects_for_cycle(frame.ipo_effects), method) exc_bestguess = refine_exception_type(frame.exc_bestguess, effects) - return EdgeCallResult(frame.bestguess, exc_bestguess, nothing, effects) + edgeresult = EdgeCallResult(frame.bestguess, exc_bestguess, nothing, effects) + return Future(EdgeCall_to_MethodCall_Result(interp, caller, method, edgeresult, edgecycle, edgelimited)) end # The `:terminates` effect bit must be conservatively tainted unless recursion cycle has @@ -1033,7 +941,7 @@ function typeinf_ircode(interp::AbstractInterpreter, mi::MethodInstance, end (; result) = frame opt = OptimizationState(frame, interp) - ir = run_passes_ipo_safe(opt.src, opt, result, optimize_until) + ir = run_passes_ipo_safe(opt.src, opt, optimize_until) rt = widenconst(ignorelimited(result.result)) return ir, rt end diff --git a/base/compiler/typelimits.jl b/base/compiler/typelimits.jl index 91a44d3b117ab..3d0e5f3d0877d 100644 --- a/base/compiler/typelimits.jl +++ b/base/compiler/typelimits.jl @@ -831,6 +831,7 @@ end typenames[i] = Any.name simplify[i] = false types[j] = widen + typenames[j] = ijname break end end diff --git a/base/compiler/types.jl b/base/compiler/types.jl index f315b7968fd9b..ecf2417fd6199 100644 --- a/base/compiler/types.jl +++ b/base/compiler/types.jl @@ -1,4 +1,12 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license +# + +const WorkThunk = Any +# #@eval struct WorkThunk +# thunk::Core.OpaqueClosure{Tuple{Vector{Tasks}}, Bool} +# WorkThunk(work) = new($(Expr(:opaque_closure, :(Tuple{Vector{Tasks}}), :Bool, :Bool, :((tasks) -> work(tasks))))) # @opaque Vector{Tasks}->Bool (tasks)->work(tasks) +# end +# (p::WorkThunk)() = p.thunk() """ AbstractInterpreter @@ -33,11 +41,14 @@ struct StmtInfo used::Bool end -struct MethodInfo +struct SpecInfo + nargs::Int + isva::Bool propagate_inbounds::Bool method_for_inference_limit_heuristics::Union{Nothing,Method} end -MethodInfo(src::CodeInfo) = MethodInfo( +SpecInfo(src::CodeInfo) = SpecInfo( + Int(src.nargs), src.isva, src.propagate_inbounds, src.method_for_inference_limit_heuristics::Union{Nothing,Method}) @@ -450,10 +461,16 @@ abstract type CallInfo end nsplit(info::CallInfo) = nsplit_impl(info)::Union{Nothing,Int} getsplit(info::CallInfo, idx::Int) = getsplit_impl(info, idx)::MethodLookupResult +add_uncovered_edges!(edges::Vector{Any}, info::CallInfo, @nospecialize(atype)) = add_uncovered_edges_impl(edges, info, atype) + getresult(info::CallInfo, idx::Int) = getresult_impl(info, idx) +# must implement `nsplit`, `getsplit`, and `add_uncovered_edges!` to opt in to inlining nsplit_impl(::CallInfo) = nothing getsplit_impl(::CallInfo, ::Int) = error("unexpected call into `getsplit`") +add_uncovered_edges_impl(::Vector{Any}, ::CallInfo, _) = error("unexpected call into `add_uncovered_edges!`") + +# must implement `getresult` to opt in to extended lattice return information getresult_impl(::CallInfo, ::Int) = nothing @specialize diff --git a/base/condition.jl b/base/condition.jl index bc14b17b3ac6b..fd771c9be346a 100644 --- a/base/condition.jl +++ b/base/condition.jl @@ -69,6 +69,8 @@ struct GenericCondition{L<:AbstractLock} GenericCondition(l::AbstractLock) = new{typeof(l)}(IntrusiveLinkedList{Task}(), l) end +show(io::IO, c::GenericCondition) = print(io, GenericCondition, "(", c.lock, ")") + assert_havelock(c::GenericCondition) = assert_havelock(c.lock) lock(c::GenericCondition) = lock(c.lock) unlock(c::GenericCondition) = unlock(c.lock) @@ -194,6 +196,8 @@ This object is NOT thread-safe. See [`Threads.Condition`](@ref) for a thread-saf """ const Condition = GenericCondition{AlwaysLockedST} +show(io::IO, ::Condition) = print(io, Condition, "()") + lock(c::GenericCondition{AlwaysLockedST}) = throw(ArgumentError("`Condition` is not thread-safe. Please use `Threads.Condition` instead for multi-threaded code.")) unlock(c::GenericCondition{AlwaysLockedST}) = diff --git a/base/div.jl b/base/div.jl index 8988f2b70f27b..3fec8d2f5cdf3 100644 --- a/base/div.jl +++ b/base/div.jl @@ -43,6 +43,21 @@ julia> div(4, 3, RoundFromZero) julia> div(-4, 3, RoundFromZero) -2 ``` +Because `div(x, y)` implements strictly correct truncated rounding based on the true +value of floating-point numbers, unintuitive situations can arise. For example: +```jldoctest +julia> div(6.0, 0.1) +59.0 +julia> 6.0 / 0.1 +60.0 +julia> 6.0 / big(0.1) +59.99999999999999666933092612453056361837965690217069245739573412231113406246995 +``` +What is happening here is that the true value of the floating-point number written +as `0.1` is slightly larger than the numerical value 1/10 while `6.0` represents +the number 6 precisely. Therefore the true value of `6.0 / 0.1` is slightly less +than 60. When doing division, this is rounded to precisely `60.0`, but +`div(6.0, 0.1, RoundToZero)` always truncates the true value, so the result is `59.0`. 
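For illustration, the same effect shows up with `fld` (which is `div` with `RoundDown`), while rounding the already-rounded floating-point quotient hides it:

```julia
fld(6.0, 0.1)    # 59.0: floor of the true quotient 59.999...
round(6.0 / 0.1) # 60.0: the quotient 6.0 / 0.1 already rounds to exactly 60.0
```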
""" div(x, y, r::RoundingMode) diff --git a/base/docs/Docs.jl b/base/docs/Docs.jl index 1327a1f795d4f..1a2403bbb8644 100644 --- a/base/docs/Docs.jl +++ b/base/docs/Docs.jl @@ -577,6 +577,10 @@ function _doc(binding::Binding, sig::Type = Union{}) for msig in multidoc.order sig <: msig && return multidoc.docs[msig] end + # if no matching signatures, return first + if !isempty(multidoc.docs) + return first(values(multidoc.docs)) + end end end return nothing diff --git a/base/docs/basedocs.jl b/base/docs/basedocs.jl index e03d0db78f29f..a142ecffdb732 100644 --- a/base/docs/basedocs.jl +++ b/base/docs/basedocs.jl @@ -937,11 +937,14 @@ expression, rather than the side effects that evaluating `b` or `c` may have. See the manual section on [control flow](@ref man-conditional-evaluation) for more details. # Examples -``` +```jldoctest julia> x = 1; y = 2; -julia> x > y ? println("x is larger") : println("y is larger") -y is larger +julia> x > y ? println("x is larger") : println("x is not larger") +x is not larger + +julia> x > y ? "x is larger" : x == y ? "x and y are equal" : "y is larger" +"y is larger" ``` """ kw"?", kw"?:" @@ -1694,7 +1697,7 @@ julia> ab = AB(1, 3) AB(1.0f0, 3.0) julia> ab.c # field `c` doesn't exist -ERROR: FieldError: type AB has no field c +ERROR: FieldError: type AB has no field `c`, available fields: `a`, `b` Stacktrace: [...] ``` @@ -3712,6 +3715,9 @@ unused and delete the entire benchmark code). which the value of the arguments of this intrinsic were available (in a register, in memory, etc.). +!!! compat "Julia 1.8" + This method was added in Julia 1.8. + # Examples ```julia diff --git a/base/error.jl b/base/error.jl index d169cdc8085ac..c49ede624607d 100644 --- a/base/error.jl +++ b/base/error.jl @@ -232,12 +232,14 @@ macro assert(ex, msgs...) msg = msg # pass-through elseif !isempty(msgs) && (isa(msg, Expr) || isa(msg, Symbol)) # message is an expression needing evaluating - msg = :(Main.Base.string($(esc(msg)))) + # N.B. To reduce the risk of invalidation caused by the complex callstack involved + # with `string`, use `inferencebarrier` here to hide this `string` from the compiler. + msg = :(Main.Base.inferencebarrier(Main.Base.string)($(esc(msg)))) elseif isdefined(Main, :Base) && isdefined(Main.Base, :string) && applicable(Main.Base.string, msg) msg = Main.Base.string(msg) else # string() might not be defined during bootstrap - msg = :(_assert_tostring($(Expr(:quote,msg)))) + msg = :(Main.Base.inferencebarrier(_assert_tostring)($(Expr(:quote,msg)))) end return :($(esc(ex)) ? $(nothing) : throw(AssertionError($msg))) end diff --git a/base/errorshow.jl b/base/errorshow.jl index a3bf464439d44..20bdee1de6ec0 100644 --- a/base/errorshow.jl +++ b/base/errorshow.jl @@ -378,7 +378,7 @@ end function showerror(io::IO, exc::FieldError) @nospecialize - print(io, "FieldError: type $(exc.type |> nameof) has no field $(exc.field)") + print(io, "FieldError: type $(exc.type |> nameof) has no field `$(exc.field)`") Base.Experimental.show_error_hints(io, exc) end @@ -1052,7 +1052,7 @@ function nonsetable_type_hint_handler(io, ex, arg_types, kwargs) print(io, "\nAre you trying to index into an array? For multi-dimensional arrays, separate the indices with commas: ") printstyled(io, "a[1, 2]", color=:cyan) print(io, " rather than a[1][2]") - else isType(T) + elseif isType(T) Tx = T.parameters[1] print(io, "\nYou attempted to index the type $Tx, rather than an instance of the type. 
Make sure you create the type using its constructor: ") printstyled(io, "d = $Tx([...])", color=:cyan) @@ -1067,7 +1067,7 @@ Experimental.register_error_hint(nonsetable_type_hint_handler, MethodError) # (probably attempting concatenation) function string_concatenation_hint_handler(io, ex, arg_types, kwargs) @nospecialize - if (ex.f === +) && all(i -> i <: AbstractString, arg_types) + if (ex.f === +) && !isempty(arg_types) && all(i -> i <: AbstractString, arg_types) print(io, "\nString concatenation is performed with ") printstyled(io, "*", color=:cyan) print(io, " (See also: https://docs.julialang.org/en/v1/manual/strings/#man-concatenation).") @@ -1102,7 +1102,7 @@ end Experimental.register_error_hint(methods_on_iterable, MethodError) # Display a hint in case the user tries to access non-member fields of container type datastructures -function fielderror_hint_handler(io, exc) +function fielderror_dict_hint_handler(io, exc) @nospecialize field = exc.field type = exc.type @@ -1113,7 +1113,32 @@ function fielderror_hint_handler(io, exc) end end -Experimental.register_error_hint(fielderror_hint_handler, FieldError) +Experimental.register_error_hint(fielderror_dict_hint_handler, FieldError) + +function fielderror_listfields_hint_handler(io, exc) + fields = fieldnames(exc.type) + if isempty(fields) + print(io, "; $(nameof(exc.type)) has no fields at all.") + else + print(io, ", available fields: $(join(map(k -> "`$k`", fields), ", "))") + end + props = _propertynames_bytype(exc.type) + isnothing(props) && return + props = setdiff(props, fields) + isempty(props) && return + print(io, "\nAvailable properties: $(join(map(k -> "`$k`", props), ", "))") +end + +function _propertynames_bytype(T::Type) + which(propertynames, (T,)) === which(propertynames, (Any,)) && return nothing + inferred_names = promote_op(Val∘propertynames, T) + inferred_names isa DataType && inferred_names <: Val || return nothing + inferred_names = inferred_names.parameters[1] + inferred_names isa NTuple{<:Any, Symbol} || return nothing + return Symbol[inferred_names[i] for i in 1:length(inferred_names)] +end + +Experimental.register_error_hint(fielderror_listfields_hint_handler, FieldError) # ExceptionStack implementation size(s::ExceptionStack) = size(s.stack) diff --git a/base/experimental.jl b/base/experimental.jl index 58c7258120f3f..648b5da0ed9a1 100644 --- a/base/experimental.jl +++ b/base/experimental.jl @@ -319,9 +319,9 @@ function show_error_hints(io, ex, args...) for handler in hinters try @invokelatest handler(io, ex, args...) - catch err + catch tn = typeof(handler).name - @error "Hint-handler $handler for $(typeof(ex)) in $(tn.module) caused an error" + @error "Hint-handler $handler for $(typeof(ex)) in $(tn.module) caused an error" exception=current_exceptions() end end end @@ -457,4 +457,18 @@ without adding them to the global method table. """ :@MethodTable +""" + Base.Experimental.entrypoint(f, argtypes::Tuple) + +Mark a method for inclusion when the `--trim` option is specified. 
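A minimal usage sketch (the `handle_request` function is hypothetical):

```julia
handle_request(id::Int, path::String) = "ok: $path/$id"

# Keep `handle_request(::Int, ::String)` and everything reachable from it
# when building with `--trim`.
Base.Experimental.entrypoint(handle_request, (Int, String))
```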
+""" +function entrypoint(@nospecialize(f), @nospecialize(argtypes::Tuple)) + entrypoint(Tuple{Core.Typeof(f), argtypes...}) +end + +function entrypoint(@nospecialize(argt::Type)) + ccall(:jl_add_entrypoint, Int32, (Any,), argt) + nothing +end + end diff --git a/base/expr.jl b/base/expr.jl index c4f64b89de8b6..478ccd7d7cc20 100644 --- a/base/expr.jl +++ b/base/expr.jl @@ -902,8 +902,8 @@ end unwrap_macrocalls(@nospecialize(x)) = x function unwrap_macrocalls(ex::Expr) inner = ex - while inner.head === :macrocall - inner = inner.args[end]::Expr + while isexpr(inner, :macrocall) + inner = inner.args[end] end return inner end diff --git a/base/file.jl b/base/file.jl index 81bca9dd65577..567783c4b1e5b 100644 --- a/base/file.jl +++ b/base/file.jl @@ -385,7 +385,7 @@ of the file or directory `src` refers to. Return `dst`. !!! note - The `cp` function is different from the `cp` command. The `cp` function always operates on + The `cp` function is different from the `cp` Unix command. The `cp` function always operates on the assumption that `dst` is a file, while the command does different things depending on whether `dst` is a directory or a file. Using `force=true` when `dst` is a directory will result in loss of all the contents present @@ -438,6 +438,16 @@ julia> mv("hello.txt", "goodbye.txt", force=true) julia> rm("goodbye.txt"); ``` + +!!! note + The `mv` function is different from the `mv` Unix command. The `mv` function by + default will error if `dst` exists, while the command will delete + an existing `dst` file by default. + Also the `mv` function always operates on + the assumption that `dst` is a file, while the command does different things depending + on whether `dst` is a directory or a file. + Using `force=true` when `dst` is a directory will result in loss of all the contents present + in the `dst` directory, and `dst` will become a file that has the contents of `src` instead. """ function mv(src::AbstractString, dst::AbstractString; force::Bool=false) if force @@ -1183,16 +1193,30 @@ function unlink(p::AbstractString) end """ - rename(oldpath::AbstractString, newpath::AbstractString) + Base.rename(oldpath::AbstractString, newpath::AbstractString) -Change the name of a file from `oldpath` to `newpath`. If `newpath` is an existing file it may be replaced. -Equivalent to [rename(2)](https://man7.org/linux/man-pages/man2/rename.2.html). -Throws an `IOError` on failure. +Change the name of a file or directory from `oldpath` to `newpath`. +If `newpath` is an existing file or empty directory it may be replaced. +Equivalent to [rename(2)](https://man7.org/linux/man-pages/man2/rename.2.html) on Unix. +If a path contains a "\\0" throw an `ArgumentError`. +On other failures throw an `IOError`. Return `newpath`. +This is a lower level filesystem operation used to implement [`mv`](@ref). + OS-specific restrictions may apply when `oldpath` and `newpath` are in different directories. +Currently there are a few differences in behavior on Windows which may be resolved in a future release. +Specifically, currently on Windows: +1. `rename` will fail if `oldpath` or `newpath` are opened files. +2. `rename` will fail if `newpath` is an existing directory. +3. `rename` may work if `newpath` is a file and `oldpath` is a directory. +4. `rename` may remove `oldpath` if it is a hardlink to `newpath`. + See also: [`mv`](@ref). + +!!! compat "Julia 1.12" + This method was made public in Julia 1.12. 
""" function rename(oldpath::AbstractString, newpath::AbstractString) err = ccall(:jl_fs_rename, Int32, (Cstring, Cstring), oldpath, newpath) diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl index 67e7899b4107c..2c26f7cff1133 100644 --- a/base/floatfuncs.jl +++ b/base/floatfuncs.jl @@ -232,7 +232,9 @@ function isapprox(x::Integer, y::Integer; if norm === abs && atol < 1 && rtol == 0 return x == y else - return norm(x - y) <= max(atol, rtol*max(norm(x), norm(y))) + # We need to take the difference `max` - `min` when comparing unsigned integers. + _x, _y = x < y ? (x, y) : (y, x) + return norm(_y - _x) <= max(atol, rtol*max(norm(_x), norm(_y))) end end diff --git a/base/genericmemory.jl b/base/genericmemory.jl index 6537839320206..91b87ab14c6b1 100644 --- a/base/genericmemory.jl +++ b/base/genericmemory.jl @@ -190,7 +190,7 @@ function fill!(a::Union{Memory{UInt8}, Memory{Int8}}, x::Integer) t = @_gc_preserve_begin a p = unsafe_convert(Ptr{Cvoid}, a) T = eltype(a) - memset(p, x isa T ? x : convert(T, x), length(a)) + memset(p, x isa T ? x : convert(T, x), length(a) % UInt) @_gc_preserve_end t return a end @@ -320,11 +320,13 @@ end # get, set(once), modify, swap and replace at index, atomically function getindex_atomic(mem::GenericMemory, order::Symbol, i::Int) + @_propagate_inbounds_meta memref = memoryref(mem, i) return memoryrefget(memref, order, @_boundscheck) end function setindex_atomic!(mem::GenericMemory, order::Symbol, val, i::Int) + @_propagate_inbounds_meta T = eltype(mem) memref = memoryref(mem, i) return memoryrefset!( @@ -342,6 +344,7 @@ function setindexonce_atomic!( val, i::Int, ) + @_propagate_inbounds_meta T = eltype(mem) memref = memoryref(mem, i) return Core.memoryrefsetonce!( @@ -354,11 +357,13 @@ function setindexonce_atomic!( end function modifyindex_atomic!(mem::GenericMemory, order::Symbol, op, val, i::Int) + @_propagate_inbounds_meta memref = memoryref(mem, i) return Core.memoryrefmodify!(memref, op, val, order, @_boundscheck) end function swapindex_atomic!(mem::GenericMemory, order::Symbol, val, i::Int) + @_propagate_inbounds_meta T = eltype(mem) memref = memoryref(mem, i) return Core.memoryrefswap!( @@ -377,6 +382,7 @@ function replaceindex_atomic!( desired, i::Int, ) + @_propagate_inbounds_meta T = eltype(mem) memref = memoryref(mem, i) return Core.memoryrefreplace!( diff --git a/base/iddict.jl b/base/iddict.jl index 9c133d5ba23c6..f1632e93427a8 100644 --- a/base/iddict.jl +++ b/base/iddict.jl @@ -126,7 +126,7 @@ function empty!(d::IdDict) d.ht = Memory{Any}(undef, 32) ht = d.ht t = @_gc_preserve_begin ht - memset(unsafe_convert(Ptr{Cvoid}, ht), 0, sizeof(ht)) + memset(unsafe_convert(Ptr{Cvoid}, ht), 0, sizeof(ht) % UInt) @_gc_preserve_end t d.ndel = 0 d.count = 0 diff --git a/base/intfuncs.jl b/base/intfuncs.jl index 8d46fcffa3ad5..06a0213e7141c 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -362,7 +362,7 @@ end # Restrict inlining to hardware-supported arithmetic types, which # are fast enough to benefit from inlining. 
-const HWReal = Union{Int8,Int16,Int32,Int64,UInt8,UInt16,UInt32,UInt64,Float32,Float64} +const HWReal = Union{Int8,Int16,Int32,Int64,UInt8,UInt16,UInt32,UInt64,Float16,Float32,Float64} const HWNumber = Union{HWReal, Complex{<:HWReal}, Rational{<:HWReal}} # Inline x^2 and x^3 for Val diff --git a/base/iostream.jl b/base/iostream.jl index 762f881cfbecb..74908344e078e 100644 --- a/base/iostream.jl +++ b/base/iostream.jl @@ -230,8 +230,8 @@ end function filesize(s::IOStream) sz = @_lock_ios s ccall(:ios_filesize, Int64, (Ptr{Cvoid},), s.ios) if sz == -1 - err = Libc.errno() - throw(IOError(string("filesize: ", Libc.strerror(err), " for ", s.name), err)) + # if `s` is not seekable `ios_filesize` can fail, so fall back to slower stat method + sz = filesize(stat(s)) end return sz end diff --git a/base/irrationals.jl b/base/irrationals.jl index eafe388162353..b3073c503238a 100644 --- a/base/irrationals.jl +++ b/base/irrationals.jl @@ -216,7 +216,7 @@ function irrational(sym, val, def) esym = esc(sym) qsym = esc(Expr(:quote, sym)) bigconvert = isa(def,Symbol) ? quote - function Base.BigFloat(::Irrational{$qsym}, r::MPFR.MPFRRoundingMode=MPFR.ROUNDING_MODE[]; precision=precision(BigFloat)) + function Base.BigFloat(::Irrational{$qsym}, r::MPFR.MPFRRoundingMode=Rounding.rounding_raw(BigFloat); precision=precision(BigFloat)) c = BigFloat(;precision=precision) ccall(($(string("mpfr_const_", def)), :libmpfr), Cint, (Ref{BigFloat}, MPFR.MPFRRoundingMode), c, r) diff --git a/base/libuv.jl b/base/libuv.jl index 143201598fde0..306854e9f4436 100644 --- a/base/libuv.jl +++ b/base/libuv.jl @@ -26,10 +26,10 @@ for r in uv_req_types @eval const $(Symbol("_sizeof_", lowercase(string(r)))) = uv_sizeof_req($r) end -uv_handle_data(handle) = ccall(:jl_uv_handle_data, Ptr{Cvoid}, (Ptr{Cvoid},), handle) -uv_req_data(handle) = ccall(:jl_uv_req_data, Ptr{Cvoid}, (Ptr{Cvoid},), handle) -uv_req_set_data(req, data) = ccall(:jl_uv_req_set_data, Cvoid, (Ptr{Cvoid}, Any), req, data) -uv_req_set_data(req, data::Ptr{Cvoid}) = ccall(:jl_uv_req_set_data, Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}), req, data) +uv_handle_data(handle) = ccall(:uv_handle_get_data, Ptr{Cvoid}, (Ptr{Cvoid},), handle) +uv_req_data(handle) = ccall(:uv_req_get_data, Ptr{Cvoid}, (Ptr{Cvoid},), handle) +uv_req_set_data(req, data) = ccall(:uv_req_set_data, Cvoid, (Ptr{Cvoid}, Any), req, data) +uv_req_set_data(req, data::Ptr{Cvoid}) = ccall(:uv_handle_set_data, Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}), req, data) macro handle_as(hand, typ) return quote @@ -133,7 +133,10 @@ function uv_return_spawn end function uv_asynccb end function uv_timercb end -function reinit_stdio() +reinit_stdio() = _reinit_stdio() +# we need this so it can be called by codegen to print errors, even after +# reinit_stdio has been redefined by the juliac build script. 
+function _reinit_stdio() global stdin = init_stdio(ccall(:jl_stdin_stream, Ptr{Cvoid}, ()))::IO global stdout = init_stdio(ccall(:jl_stdout_stream, Ptr{Cvoid}, ()))::IO global stderr = init_stdio(ccall(:jl_stderr_stream, Ptr{Cvoid}, ()))::IO diff --git a/base/loading.jl b/base/loading.jl index 4dc735f0099d8..c69e37e4d56ea 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -269,7 +269,6 @@ struct TOMLCache{Dates} d::Dict{String, CachedTOMLDict} end TOMLCache(p::TOML.Parser) = TOMLCache(p, Dict{String, CachedTOMLDict}()) -# TODO: Delete this converting constructor once Pkg stops using it TOMLCache(p::TOML.Parser, d::Dict{String, Dict{String, Any}}) = TOMLCache(p, convert(Dict{String, CachedTOMLDict}, d)) const TOML_CACHE = TOMLCache(TOML.Parser{nothing}()) @@ -509,6 +508,8 @@ package root. To get the root directory of the package that implements the current module the form `pkgdir(@__MODULE__)` can be used. +If an extension module is given, the root of the parent package is returned. + ```julia-repl julia> pkgdir(Foo) "/path/to/Foo.jl" @@ -526,7 +527,19 @@ function pkgdir(m::Module, paths::String...) rootmodule = moduleroot(m) path = pathof(rootmodule) path === nothing && return nothing - return joinpath(dirname(dirname(path)), paths...) + original = path + path, base = splitdir(dirname(path)) + if base == "src" + # package source in `../src/Foo.jl` + elseif base == "ext" + # extension source in `../ext/FooExt.jl` + elseif basename(path) == "ext" + # extension source in `../ext/FooExt/FooExt.jl` + path = dirname(path) + else + error("Unexpected path structure for module source: $original") + end + return joinpath(path, paths...) end function get_pkgversion_from_path(path) @@ -1190,7 +1203,7 @@ const TIMING_IMPORTS = Threads.Atomic{Int}(0) # these return either the array of modules loaded from the path / content given # or an Exception that describes why it couldn't be loaded # and it reconnects the Base.Docs.META -function _include_from_serialized(pkg::PkgId, path::String, ocachepath::Union{Nothing, String}, depmods::Vector{Any}, ignore_native::Union{Nothing,Bool}=nothing) +function _include_from_serialized(pkg::PkgId, path::String, ocachepath::Union{Nothing, String}, depmods::Vector{Any}, ignore_native::Union{Nothing,Bool}=nothing; register::Bool=true) if isnothing(ignore_native) if JLOptions().code_coverage == 0 && JLOptions().malloc_log == 0 ignore_native = false @@ -1218,7 +1231,7 @@ function _include_from_serialized(pkg::PkgId, path::String, ocachepath::Union{No dep = depmods[i] dep isa Module && continue _, depkey, depbuild_id = dep::Tuple{String, PkgId, UInt128} - dep = loaded_precompiles[depkey => depbuild_id] + dep = something(maybe_loaded_precompile(depkey, depbuild_id)) @assert PkgId(dep) == depkey && module_build_id(dep) === depbuild_id depmods[i] = dep end @@ -1239,23 +1252,11 @@ function _include_from_serialized(pkg::PkgId, path::String, ocachepath::Union{No for M in restored M = M::Module if parentmodule(M) === M && PkgId(M) == pkg + register && register_root_module(M) if timing_imports - elapsed = round((time_ns() - t_before) / 1e6, digits = 1) + elapsed_time = time_ns() - t_before comp_time, recomp_time = cumulative_compile_time_ns() .- t_comp_before - print(lpad(elapsed, 9), " ms ") - parentid = get(EXT_PRIMED, pkg, nothing) - if parentid !== nothing - print(parentid.name, " → ") - end - print(pkg.name) - if comp_time > 0 - printstyled(" ", Ryu.writefixed(Float64(100 * comp_time / (elapsed * 1e6)), 2), "% compilation time", color = Base.info_color()) - end - if 
recomp_time > 0 - perc = Float64(100 * recomp_time / comp_time) - printstyled(" (", perc < 1 ? "<1" : Ryu.writefixed(perc, 0), "% recompilation)", color = Base.warn_color()) - end - println() + print_time_imports_report(M, elapsed_time, comp_time, recomp_time) end return M end @@ -1267,8 +1268,76 @@ function _include_from_serialized(pkg::PkgId, path::String, ocachepath::Union{No end end +# printing functions for @time_imports +# note that the time inputs are UInt64 on all platforms. Give default values here so that we don't have +# confusing UInt64 types in generate_precompile.jl +function print_time_imports_report( + mod::Module, + elapsed_time::UInt64=UInt64(1), + comp_time::UInt64=UInt64(1), + recomp_time::UInt64=UInt64(1) + ) + print(lpad(round(elapsed_time / 1e6, digits=1), 9), " ms ") + ext_parent = extension_parent_name(mod) + if ext_parent !== nothing + print(ext_parent::String, " → ") + end + print(string(mod)) + if comp_time > 0 + perc = Ryu.writefixed(Float64(100 * comp_time / (elapsed_time)), 2) + printstyled(" $perc% compilation time", color = Base.info_color()) + end + if recomp_time > 0 + perc = Float64(100 * recomp_time / comp_time) + perc_show = perc < 1 ? "<1" : Ryu.writefixed(perc, 0) + printstyled(" ($perc_show% recompilation)", color = Base.warn_color()) + end + println() +end +function print_time_imports_report_init( + mod::Module, i::Int=1, + elapsed_time::UInt64=UInt64(1), + comp_time::UInt64=UInt64(1), + recomp_time::UInt64=UInt64(1) + ) + connector = i > 1 ? "├" : "┌" + printstyled(" $connector ", color = :light_black) + print("$(round(elapsed_time / 1e6, digits=1)) ms $mod.__init__() ") + if comp_time > 0 + perc = Ryu.writefixed(Float64(100 * (comp_time) / elapsed_time), 2) + printstyled("$perc% compilation time", color = Base.info_color()) + end + if recomp_time > 0 + perc = Float64(100 * recomp_time / comp_time) + printstyled(" ($(perc < 1 ? "<1" : Ryu.writefixed(perc, 0))% recompilation)", color = Base.warn_color()) + end + println() +end + +# if M is an extension, return the string name of the parent. Otherwise return nothing +function extension_parent_name(M::Module) + rootmodule = moduleroot(M) + src_path = pathof(rootmodule) + src_path === nothing && return nothing + pkgdir_parts = splitpath(src_path) + ext_pos = findlast(==("ext"), pkgdir_parts) + if ext_pos !== nothing && ext_pos >= length(pkgdir_parts) - 2 + parent_package_root = joinpath(pkgdir_parts[1:ext_pos-1]...) + parent_package_project_file = locate_project_file(parent_package_root) + if parent_package_project_file isa String + d = parsed_toml(parent_package_project_file) + name = get(d, "name", nothing) + if name !== nothing + return name + end + end + end + return nothing +end + function register_restored_modules(sv::SimpleVector, pkg::PkgId, path::String) # This function is also used by PkgCacheInspector.jl + assert_havelock(require_lock) restored = sv[1]::Vector{Any} for M in restored M = M::Module @@ -1277,7 +1346,7 @@ function register_restored_modules(sv::SimpleVector, pkg::PkgId, path::String) end if parentmodule(M) === M push!(loaded_modules_order, M) - loaded_precompiles[pkg => module_build_id(M)] = M + push!(get!(Vector{Module}, loaded_precompiles, pkg), M) end end @@ -1303,36 +1372,25 @@ function run_module_init(mod::Module, i::Int=1) # `i` informs ordering for the `@time_imports` report formatting if TIMING_IMPORTS[] == 0 ccall(:jl_init_restored_module, Cvoid, (Any,), mod) - else - if isdefined(mod, :__init__) - connector = i > 1 ? 
"├" : "┌" - printstyled(" $connector ", color = :light_black) - - elapsedtime = time_ns() - cumulative_compile_timing(true) - compile_elapsedtimes = cumulative_compile_time_ns() + elseif isdefined(mod, :__init__) + elapsed_time = time_ns() + cumulative_compile_timing(true) + compile_elapsedtimes = cumulative_compile_time_ns() - ccall(:jl_init_restored_module, Cvoid, (Any,), mod) + ccall(:jl_init_restored_module, Cvoid, (Any,), mod) - elapsedtime = (time_ns() - elapsedtime) / 1e6 - cumulative_compile_timing(false); - comp_time, recomp_time = (cumulative_compile_time_ns() .- compile_elapsedtimes) ./ 1e6 + elapsed_time = time_ns() - elapsed_time + cumulative_compile_timing(false); + comp_time, recomp_time = cumulative_compile_time_ns() .- compile_elapsedtimes - print("$(round(elapsedtime, digits=1)) ms $mod.__init__() ") - if comp_time > 0 - printstyled(Ryu.writefixed(Float64(100 * comp_time / elapsedtime), 2), "% compilation time", color = Base.info_color()) - end - if recomp_time > 0 - perc = Float64(100 * recomp_time / comp_time) - printstyled(" ($(perc < 1 ? "<1" : Ryu.writefixed(perc, 0))% recompilation)", color = Base.warn_color()) - end - println() - end + print_time_imports_report_init(mod, i, elapsed_time, comp_time, recomp_time) end end function run_package_callbacks(modkey::PkgId) - run_extension_callbacks(modkey) + if !precompiling_extension + run_extension_callbacks(modkey) + end assert_havelock(require_lock) unlock(require_lock) try @@ -1448,7 +1506,7 @@ function _insert_extension_triggers(parent::PkgId, extensions::Dict{String, Any} triggers = triggers::Union{String, Vector{String}} triggers isa String && (triggers = [triggers]) id = PkgId(uuid5(parent.uuid::UUID, ext), ext) - if id in keys(EXT_PRIMED) || haskey(Base.loaded_modules, id) + if haskey(EXT_PRIMED, id) || haskey(Base.loaded_modules, id) continue # extension is already primed or loaded, don't add it again end EXT_PRIMED[id] = parent @@ -1469,6 +1527,7 @@ function _insert_extension_triggers(parent::PkgId, extensions::Dict{String, Any} end end +precompiling_package::Bool = false loading_extension::Bool = false precompiling_extension::Bool = false function run_extension_callbacks(extid::ExtensionId) @@ -1877,8 +1936,7 @@ function _tryrequire_from_serialized(pkg::PkgId, path::String, ocachepath::Union depmods[i] = dep end # then load the file - loaded = _include_from_serialized(pkg, path, ocachepath, depmods, ignore_native) - loaded isa Module && register_root_module(loaded) + loaded = _include_from_serialized(pkg, path, ocachepath, depmods, ignore_native; register = true) return loaded end @@ -1888,92 +1946,102 @@ end assert_havelock(require_lock) paths = find_all_in_cache_path(pkg, DEPOT_PATH) newdeps = PkgId[] - for path_to_try in paths::Vector{String} - staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try; reasons, stalecheck) - if staledeps === true - continue - end - try - staledeps, ocachefile, newbuild_id = staledeps::Tuple{Vector{Any}, Union{Nothing, String}, UInt128} - # finish checking staledeps module graph - for i in eachindex(staledeps) - dep = staledeps[i] - dep isa Module && continue - modpath, modkey, modbuild_id = dep::Tuple{String, PkgId, UInt128} - modpaths = find_all_in_cache_path(modkey, DEPOT_PATH) - for modpath_to_try in modpaths - modstaledeps = stale_cachefile(modkey, modbuild_id, modpath, modpath_to_try; stalecheck) - if modstaledeps === true - continue - end - modstaledeps, modocachepath, _ = modstaledeps::Tuple{Vector{Any}, Union{Nothing, String}, UInt128} - staledeps[i] = 
(modpath, modkey, modbuild_id, modpath_to_try, modstaledeps, modocachepath) - @goto check_next_dep + try_build_ids = UInt128[build_id] + if build_id == UInt128(0) + let loaded = get(loaded_precompiles, pkg, nothing) + if loaded !== nothing + for mod in loaded # try these in reverse original load order to see if one is already valid + pushfirst!(try_build_ids, module_build_id(mod)) end - @debug "Rejecting cache file $path_to_try because required dependency $modkey with build ID $(UUID(modbuild_id)) is missing from the cache." - @goto check_next_path - @label check_next_dep - end - M = get(loaded_precompiles, pkg => newbuild_id, nothing) - if isa(M, Module) - stalecheck && register_root_module(M) - return M end - if stalecheck - try - touch(path_to_try) # update timestamp of precompilation file - catch ex # file might be read-only and then we fail to update timestamp, which is fine - ex isa IOError || rethrow() - end + end + end + for build_id in try_build_ids + for path_to_try in paths::Vector{String} + staledeps = stale_cachefile(pkg, build_id, sourcepath, path_to_try; reasons, stalecheck) + if staledeps === true + continue end - # finish loading module graph into staledeps - # TODO: call all start_loading calls (in reverse order) before calling any _include_from_serialized, since start_loading will drop the loading lock - for i in eachindex(staledeps) - dep = staledeps[i] - dep isa Module && continue - modpath, modkey, modbuild_id, modcachepath, modstaledeps, modocachepath = dep::Tuple{String, PkgId, UInt128, String, Vector{Any}, Union{Nothing, String}} - dep = start_loading(modkey, modbuild_id, stalecheck) - while true - if dep isa Module - if PkgId(dep) == modkey && module_build_id(dep) === modbuild_id - break - else - @debug "Rejecting cache file $path_to_try because module $modkey got loaded at a different version than expected." - @goto check_next_path + try + staledeps, ocachefile, newbuild_id = staledeps::Tuple{Vector{Any}, Union{Nothing, String}, UInt128} + # finish checking staledeps module graph + for i in eachindex(staledeps) + dep = staledeps[i] + dep isa Module && continue + modpath, modkey, modbuild_id = dep::Tuple{String, PkgId, UInt128} + modpaths = find_all_in_cache_path(modkey, DEPOT_PATH) + for modpath_to_try in modpaths + modstaledeps = stale_cachefile(modkey, modbuild_id, modpath, modpath_to_try; stalecheck) + if modstaledeps === true + continue end + modstaledeps, modocachepath, _ = modstaledeps::Tuple{Vector{Any}, Union{Nothing, String}, UInt128} + staledeps[i] = (modpath, modkey, modbuild_id, modpath_to_try, modstaledeps, modocachepath) + @goto check_next_dep + end + @debug "Rejecting cache file $path_to_try because required dependency $modkey with build ID $(UUID(modbuild_id)) is missing from the cache." 
+ @goto check_next_path + @label check_next_dep + end + M = maybe_loaded_precompile(pkg, newbuild_id) + if isa(M, Module) + stalecheck && register_root_module(M) + return M + end + if stalecheck + try + touch(path_to_try) # update timestamp of precompilation file + catch ex # file might be read-only and then we fail to update timestamp, which is fine + ex isa IOError || rethrow() end - if dep === nothing - try - set_pkgorigin_version_path(modkey, modpath) - dep = _include_from_serialized(modkey, modcachepath, modocachepath, modstaledeps) - dep isa Module && stalecheck && register_root_module(dep) - finally - end_loading(modkey, dep) + end + # finish loading module graph into staledeps + # TODO: call all start_loading calls (in reverse order) before calling any _include_from_serialized, since start_loading will drop the loading lock + for i in eachindex(staledeps) + dep = staledeps[i] + dep isa Module && continue + modpath, modkey, modbuild_id, modcachepath, modstaledeps, modocachepath = dep::Tuple{String, PkgId, UInt128, String, Vector{Any}, Union{Nothing, String}} + dep = start_loading(modkey, modbuild_id, stalecheck) + while true + if dep isa Module + if PkgId(dep) == modkey && module_build_id(dep) === modbuild_id + break + else + @debug "Rejecting cache file $path_to_try because module $modkey got loaded at a different version than expected." + @goto check_next_path + end end - if !isa(dep, Module) - @debug "Rejecting cache file $path_to_try because required dependency $modkey failed to load from cache file for $modcachepath." exception=dep - @goto check_next_path - else - push!(newdeps, modkey) + if dep === nothing + try + set_pkgorigin_version_path(modkey, modpath) + dep = _include_from_serialized(modkey, modcachepath, modocachepath, modstaledeps; register = stalecheck) + finally + end_loading(modkey, dep) + end + if !isa(dep, Module) + @debug "Rejecting cache file $path_to_try because required dependency $modkey failed to load from cache file for $modcachepath." exception=dep + @goto check_next_path + else + push!(newdeps, modkey) + end end end + staledeps[i] = dep end - staledeps[i] = dep - end - restored = get(loaded_precompiles, pkg => newbuild_id, nothing) - if !isa(restored, Module) - restored = _include_from_serialized(pkg, path_to_try, ocachefile, staledeps) - end - isa(restored, Module) && stalecheck && register_root_module(restored) - isa(restored, Module) && return restored - @debug "Deserialization checks failed while attempting to load cache from $path_to_try" exception=restored - @label check_next_path - finally - for modkey in newdeps - insert_extension_triggers(modkey) - stalecheck && run_package_callbacks(modkey) + restored = maybe_loaded_precompile(pkg, newbuild_id) + if !isa(restored, Module) + restored = _include_from_serialized(pkg, path_to_try, ocachefile, staledeps; register = stalecheck) + end + isa(restored, Module) && return restored + @debug "Deserialization checks failed while attempting to load cache from $path_to_try" exception=restored + @label check_next_path + finally + for modkey in newdeps + insert_extension_triggers(modkey) + stalecheck && run_package_callbacks(modkey) + end + empty!(newdeps) end - empty!(newdeps) end end return nothing @@ -1992,7 +2060,7 @@ function start_loading(modkey::PkgId, build_id::UInt128, stalecheck::Bool) loaded = stalecheck ? 
maybe_root_module(modkey) : nothing loaded isa Module && return loaded if build_id != UInt128(0) - loaded = get(loaded_precompiles, modkey => build_id, nothing) + loaded = maybe_loaded_precompile(modkey, build_id) loaded isa Module && return loaded end loading = get(package_locks, modkey, nothing) @@ -2163,6 +2231,11 @@ For more details regarding code loading, see the manual sections on [modules](@r [parallel computing](@ref code-availability). """ function require(into::Module, mod::Symbol) + if into === Base.__toplevel__ && precompiling_package + # this error type needs to match the error type compilecache throws for non-125 errors. + error("`using/import $mod` outside of a Module detected. Importing a package outside of a module \ + is not allowed during package precompilation.") + end if _require_world_age[] != typemax(UInt) Base.invoke_in_world(_require_world_age[], __require, into, mod) else @@ -2317,12 +2390,21 @@ const pkgorigins = Dict{PkgId,PkgOrigin}() const explicit_loaded_modules = Dict{PkgId,Module}() # Emptied on Julia start const loaded_modules = Dict{PkgId,Module}() # available to be explicitly loaded -const loaded_precompiles = Dict{Pair{PkgId,UInt128},Module}() # extended (complete) list of modules, available to be loaded +const loaded_precompiles = Dict{PkgId,Vector{Module}}() # extended (complete) list of modules, available to be loaded const loaded_modules_order = Vector{Module}() const module_keys = IdDict{Module,PkgId}() # the reverse of loaded_modules root_module_key(m::Module) = @lock require_lock module_keys[m] +function maybe_loaded_precompile(key::PkgId, buildid::UInt128) + assert_havelock(require_lock) + mods = get(loaded_precompiles, key, nothing) + mods === nothing && return + for mod in mods + module_build_id(mod) == buildid && return mod + end +end + function module_build_id(m::Module) hi, lo = ccall(:jl_module_build_id, NTuple{2,UInt64}, (Any,), m) return (UInt128(hi) << 64) | lo @@ -2343,7 +2425,7 @@ end end end end - haskey(loaded_precompiles, key => module_build_id(m)) || push!(loaded_modules_order, m) + maybe_loaded_precompile(key, module_build_id(m)) === nothing && push!(loaded_modules_order, m) loaded_modules[key] = m explicit_loaded_modules[key] = m module_keys[m] = key @@ -2715,8 +2797,8 @@ function evalfile(path::AbstractString, args::Vector{String}=String[]) Expr(:toplevel, :(const ARGS = $args), :(eval(x) = $(Expr(:core, :eval))(__anon__, x)), - :(include(x) = $(Expr(:top, :include))(__anon__, x)), - :(include(mapexpr::Function, x) = $(Expr(:top, :include))(mapexpr, __anon__, x)), + :(include(x::AbstractString) = $(Expr(:top, :include))(__anon__, x)), + :(include(mapexpr::Function, x::AbstractString) = $(Expr(:top, :include))(mapexpr, __anon__, x)), :(include($path)))) end evalfile(path::AbstractString, args::Vector) = evalfile(path, String[args...]) @@ -2740,41 +2822,10 @@ function load_path_setup_code(load_path::Bool=true) return code end -""" - check_src_module_wrap(srcpath::String) - -Checks that a package entry file `srcpath` has a module declaration, and that it is before any using/import statements. 
-""" -function check_src_module_wrap(pkg::PkgId, srcpath::String) - module_rgx = r"^(|end |\"\"\" )\s*(?:@)*(?:bare)?module\s" - load_rgx = r"\b(?:using|import)\s" - load_seen = false - inside_string = false - for s in eachline(srcpath) - if count("\"\"\"", s) == 1 - # ignore module docstrings - inside_string = !inside_string - end - inside_string && continue - if contains(s, module_rgx) - if load_seen - throw(ErrorException("Package $(repr("text/plain", pkg)) source file $srcpath has a using/import before a module declaration.")) - end - return true - end - if startswith(s, load_rgx) - load_seen = true - end - end - throw(ErrorException("Package $(repr("text/plain", pkg)) source file $srcpath does not contain a module declaration.")) -end - # this is called in the external process that generates precompiled package files function include_package_for_output(pkg::PkgId, input::String, depot_path::Vector{String}, dl_load_path::Vector{String}, load_path::Vector{String}, concrete_deps::typeof(_concrete_dependencies), source::Union{Nothing,String}) - check_src_module_wrap(pkg, input) - append!(empty!(Base.DEPOT_PATH), depot_path) append!(empty!(Base.DL_LOAD_PATH), dl_load_path) append!(empty!(Base.LOAD_PATH), load_path) @@ -2801,11 +2852,22 @@ function include_package_for_output(pkg::PkgId, input::String, depot_path::Vecto finally Core.Compiler.track_newly_inferred.x = false end + # check that the package defined the expected module so we can give a nice error message if not + Base.check_package_module_loaded(pkg) +end + +function check_package_module_loaded(pkg::PkgId) + if !haskey(Base.loaded_modules, pkg) + # match compilecache error type for non-125 errors + error("$(repr("text/plain", pkg)) did not define the expected module `$(pkg.name)`, \ + check for typos in package module name") + end + return nothing end const PRECOMPILE_TRACE_COMPILE = Ref{String}() function create_expr_cache(pkg::PkgId, input::String, output::String, output_o::Union{Nothing, String}, - concrete_deps::typeof(_concrete_dependencies), flags::Cmd=``, internal_stderr::IO = stderr, internal_stdout::IO = stdout) + concrete_deps::typeof(_concrete_dependencies), flags::Cmd=``, internal_stderr::IO = stderr, internal_stdout::IO = stdout, isext::Bool=false) @nospecialize internal_stderr internal_stdout rm(output, force=true) # Remove file if it exists output_o === nothing || rm(output_o, force=true) @@ -2874,7 +2936,8 @@ function create_expr_cache(pkg::PkgId, input::String, output::String, output_o:: write(io.in, """ empty!(Base.EXT_DORMITORY) # If we have a custom sysimage with `EXT_DORMITORY` prepopulated Base.track_nested_precomp($precomp_stack) - Base.precompiling_extension = $(loading_extension) + Base.precompiling_extension = $(loading_extension | isext) + Base.precompiling_package = true Base.include_package_for_output($(pkg_str(pkg)), $(repr(abspath(input))), $(repr(depot_path)), $(repr(dl_load_path)), $(repr(load_path)), $deps, $(repr(source_path(nothing)))) """) @@ -2931,18 +2994,18 @@ This can be used to reduce package load times. Cache files are stored in `DEPOT_PATH[1]/compiled`. See [Module initialization and precompilation](@ref) for important notes. 
""" -function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout; flags::Cmd=``, reasons::Union{Dict{String,Int},Nothing}=Dict{String,Int}()) +function compilecache(pkg::PkgId, internal_stderr::IO = stderr, internal_stdout::IO = stdout; flags::Cmd=``, reasons::Union{Dict{String,Int},Nothing}=Dict{String,Int}(), isext::Bool=false) @nospecialize internal_stderr internal_stdout path = locate_package(pkg) path === nothing && throw(ArgumentError("$(repr("text/plain", pkg)) not found during precompilation")) - return compilecache(pkg, path, internal_stderr, internal_stdout; flags, reasons) + return compilecache(pkg, path, internal_stderr, internal_stdout; flags, reasons, isext) end const MAX_NUM_PRECOMPILE_FILES = Ref(10) function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, internal_stdout::IO = stdout, keep_loaded_modules::Bool = true; flags::Cmd=``, cacheflags::CacheFlags=CacheFlags(), - reasons::Union{Dict{String,Int},Nothing}=Dict{String,Int}()) + reasons::Union{Dict{String,Int},Nothing}=Dict{String,Int}(), isext::Bool=false) @nospecialize internal_stderr internal_stdout # decide where to put the resulting cache file @@ -2982,7 +3045,7 @@ function compilecache(pkg::PkgId, path::String, internal_stderr::IO = stderr, in close(tmpio_o) close(tmpio_so) end - p = create_expr_cache(pkg, path, tmppath, tmppath_o, concrete_deps, flags, internal_stderr, internal_stdout) + p = create_expr_cache(pkg, path, tmppath, tmppath_o, concrete_deps, flags, internal_stderr, internal_stdout, isext) if success(p) if cache_objects @@ -3124,16 +3187,9 @@ mutable struct CacheHeaderIncludes const modpath::Vector{String} # seemingly not needed in Base, but used by Revise end -function replace_depot_path(path::AbstractString) - for depot in DEPOT_PATH - !isdir(depot) && continue - - # Strip extraneous pathseps through normalization. - if isdirpath(depot) - depot = dirname(depot) - end - - if startswith(path, depot) +function replace_depot_path(path::AbstractString, depots::Vector{String}=normalize_depots_for_relocation()) + for depot in depots + if startswith(path, string(depot, Filesystem.pathsep())) || path == depot path = replace(path, depot => "@depot"; count=1) break end @@ -3141,6 +3197,19 @@ function replace_depot_path(path::AbstractString) return path end +function normalize_depots_for_relocation() + depots = String[] + sizehint!(depots, length(DEPOT_PATH)) + for d in DEPOT_PATH + isdir(d) || continue + if isdirpath(d) + d = dirname(d) + end + push!(depots, abspath(d)) + end + return depots +end + function restore_depot_path(path::AbstractString, depot::AbstractString) replace(path, r"^@depot" => depot; count=1) end @@ -3742,8 +3811,8 @@ end for i in 1:ndeps req_key, req_build_id = required_modules[i] # Check if module is already loaded - if !stalecheck && haskey(loaded_precompiles, req_key => req_build_id) - M = loaded_precompiles[req_key => req_build_id] + M = stalecheck ? 
nothing : maybe_loaded_precompile(req_key, req_build_id)
+        if M !== nothing
             @assert PkgId(M) == req_key && module_build_id(M) === req_build_id
             depmods[i] = M
         elseif root_module_exists(req_key)
@@ -3795,10 +3864,16 @@ end
     # now check if this file's content hash has changed relative to its source files
     if stalecheck
-        if !samefile(includes[1].filename, modpath) && !samefile(fixup_stdlib_path(includes[1].filename), modpath)
-            @debug "Rejecting cache file $cachefile because it is for file $(includes[1].filename) not file $modpath"
-            record_reason(reasons, "wrong source")
-            return true # cache file was compiled from a different path
+        if !samefile(includes[1].filename, modpath)
+            # In certain cases the path rewritten by `fixup_stdlib_path` may
+            # point to an unreadable directory; make sure we can `stat` the
+            # file before comparing it with `modpath`.
+            stdlib_path = fixup_stdlib_path(includes[1].filename)
+            if !(isreadable(stdlib_path) && samefile(stdlib_path, modpath))
+                @debug "Rejecting cache file $cachefile because it is for file $(includes[1].filename) not file $modpath"
+                record_reason(reasons, "wrong source")
+                return true # cache file was compiled from a different path
+            end
         end
         for (modkey, req_modkey) in requires
             # verify that `require(modkey, name(req_modkey))` ==> `req_modkey`
diff --git a/base/lock.jl b/base/lock.jl
index b69f3c5c03638..b473045e5809d 100644
--- a/base/lock.jl
+++ b/base/lock.jl
@@ -51,6 +51,20 @@ end
 assert_havelock(l::ReentrantLock) = assert_havelock(l, l.locked_by)
 
+show(io::IO, ::ReentrantLock) = print(io, ReentrantLock, "()")
+
+function show(io::IO, ::MIME"text/plain", l::ReentrantLock)
+    show(io, l)
+    if !(get(io, :compact, false)::Bool)
+        locked_by = l.locked_by
+        if locked_by isa Task
+            print(io, " (locked by ", locked_by === current_task() ? 
"current " : "", locked_by, ")") + else + print(io, " (unlocked)") + end + end +end + """ islocked(lock) -> Status (Boolean) diff --git a/base/math.jl b/base/math.jl index da51ab3a17bd0..16a8a547e8de1 100644 --- a/base/math.jl +++ b/base/math.jl @@ -1276,14 +1276,12 @@ end return ifelse(isfinite(x) & isfinite(err), muladd(x, y, err), x*y) end -function ^(x::Float32, n::Integer) +function ^(x::Union{Float16,Float32}, n::Integer) n == -2 && return (i=inv(x); i*i) n == 3 && return x*x*x #keep compatibility with literal_pow - n < 0 && return Float32(Base.power_by_squaring(inv(Float64(x)),-n)) - Float32(Base.power_by_squaring(Float64(x),n)) + n < 0 && return oftype(x, Base.power_by_squaring(inv(widen(x)),-n)) + oftype(x, Base.power_by_squaring(widen(x),n)) end -@inline ^(x::Float16, y::Integer) = Float16(Float32(x) ^ y) -@inline literal_pow(::typeof(^), x::Float16, ::Val{p}) where {p} = Float16(literal_pow(^,Float32(x),Val(p))) ## rem2pi-related calculations ## diff --git a/base/mpfr.jl b/base/mpfr.jl index d393469aa26a1..9d1a0843ebe06 100644 --- a/base/mpfr.jl +++ b/base/mpfr.jl @@ -18,12 +18,10 @@ import setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero, isone, big, _string_n, decompose, minmax, _precision_with_base_2, sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand, - uinttype, exponent_max, exponent_min, ieee754_representation, significand_mask, - RawBigIntRoundingIncrementHelper, truncated, RawBigInt - + uinttype, exponent_max, exponent_min, ieee754_representation, significand_mask using .Base.Libc -import ..Rounding: +import ..Rounding: Rounding, rounding_raw, setrounding_raw, rounds_to_nearest, rounds_away_from_zero, tie_breaker_is_to_even, correct_rounding_requires_increment @@ -39,7 +37,6 @@ else const libmpfr = "libmpfr.so.6" end - version() = VersionNumber(unsafe_string(ccall((:mpfr_get_version,libmpfr), Ptr{Cchar}, ()))) patches() = split(unsafe_string(ccall((:mpfr_get_patches,libmpfr), Ptr{Cchar}, ())),' ') @@ -120,44 +117,116 @@ const mpfr_special_exponent_zero = typemin(Clong) + true const mpfr_special_exponent_nan = mpfr_special_exponent_zero + true const mpfr_special_exponent_inf = mpfr_special_exponent_nan + true +struct BigFloatLayout + prec::Clong + sign::Cint + exp::Clong + d::Ptr{Limb} + # possible padding + p::Limb # Tuple{Vararg{Limb}} +end +const offset_prec = fieldoffset(BigFloatLayout, 1) % Int +const offset_sign = fieldoffset(BigFloatLayout, 2) % Int +const offset_exp = fieldoffset(BigFloatLayout, 3) % Int +const offset_d = fieldoffset(BigFloatLayout, 4) % Int +const offset_p_limbs = ((fieldoffset(BigFloatLayout, 5) % Int + sizeof(Limb) - 1) ÷ sizeof(Limb)) +const offset_p = offset_p_limbs * sizeof(Limb) + """ BigFloat <: AbstractFloat Arbitrary precision floating point number type. """ -mutable struct BigFloat <: AbstractFloat - prec::Clong - sign::Cint - exp::Clong - d::Ptr{Limb} - # _d::Buffer{Limb} # Julia gc handle for memory @ d - _d::String # Julia gc handle for memory @ d (optimized) +struct BigFloat <: AbstractFloat + d::Memory{Limb} # Not recommended for general use: # used internally by, e.g. 
deepcopy - global function _BigFloat(prec::Clong, sign::Cint, exp::Clong, d::String) - # ccall-based version, inlined below - #z = new(zero(Clong), zero(Cint), zero(Clong), C_NULL, d) - #ccall((:mpfr_custom_init,libmpfr), Cvoid, (Ptr{Limb}, Clong), d, prec) # currently seems to be a no-op in mpfr - #NAN_KIND = Cint(0) - #ccall((:mpfr_custom_init_set,libmpfr), Cvoid, (Ref{BigFloat}, Cint, Clong, Ptr{Limb}), z, NAN_KIND, prec, d) - #return z - return new(prec, sign, exp, pointer(d), d) - end + global _BigFloat(d::Memory{Limb}) = new(d) function BigFloat(; precision::Integer=_precision_with_base_2(BigFloat)) precision < 1 && throw(DomainError(precision, "`precision` cannot be less than 1.")) nb = ccall((:mpfr_custom_get_size,libmpfr), Csize_t, (Clong,), precision) - nb = (nb + Core.sizeof(Limb) - 1) ÷ Core.sizeof(Limb) # align to number of Limb allocations required for this - #d = Vector{Limb}(undef, nb) - d = _string_n(nb * Core.sizeof(Limb)) - EXP_NAN = mpfr_special_exponent_nan - return _BigFloat(Clong(precision), one(Cint), EXP_NAN, d) # +NAN + nl = (nb + offset_p + sizeof(Limb) - 1) ÷ Core.sizeof(Limb) # align to number of Limb allocations required for this + d = Memory{Limb}(undef, nl % Int) + # ccall-based version, inlined below + z = _BigFloat(d) # initialize to +NAN + #ccall((:mpfr_custom_init,libmpfr), Cvoid, (Ptr{Limb}, Clong), BigFloatData(d), prec) # currently seems to be a no-op in mpfr + #NAN_KIND = Cint(0) + #ccall((:mpfr_custom_init_set,libmpfr), Cvoid, (Ref{BigFloat}, Cint, Clong, Ptr{Limb}), z, NAN_KIND, prec, BigFloatData(d)) + z.prec = Clong(precision) + z.sign = one(Cint) + z.exp = mpfr_special_exponent_nan + return z end end -# The rounding mode here shouldn't matter. -significand_limb_count(x::BigFloat) = div(sizeof(x._d), sizeof(Limb), RoundToZero) +""" +Segment of raw words of bits interpreted as a big integer. Less +significant words come first. Each word is in machine-native bit-order. 
+""" +struct BigFloatData{Limb} + d::Memory{Limb} +end + +# BigFloat interface +@inline function Base.getproperty(x::BigFloat, s::Symbol) + d = getfield(x, :d) + p = Base.unsafe_convert(Ptr{Limb}, d) + if s === :prec + return GC.@preserve d unsafe_load(Ptr{Clong}(p) + offset_prec) + elseif s === :sign + return GC.@preserve d unsafe_load(Ptr{Cint}(p) + offset_sign) + elseif s === :exp + return GC.@preserve d unsafe_load(Ptr{Clong}(p) + offset_exp) + elseif s === :d + return BigFloatData(d) + else + return throw(FieldError(typeof(x), s)) + end +end + +@inline function Base.setproperty!(x::BigFloat, s::Symbol, v) + d = getfield(x, :d) + p = Base.unsafe_convert(Ptr{Limb}, d) + if s === :prec + return GC.@preserve d unsafe_store!(Ptr{Clong}(p) + offset_prec, v) + elseif s === :sign + return GC.@preserve d unsafe_store!(Ptr{Cint}(p) + offset_sign, v) + elseif s === :exp + return GC.@preserve d unsafe_store!(Ptr{Clong}(p) + offset_exp, v) + #elseif s === :d # not mutable + else + return throw(FieldError(x, s)) + end +end + +# Ref interface: make sure the conversion to C is done properly +Base.unsafe_convert(::Type{Ref{BigFloat}}, x::Ptr{BigFloat}) = error("not compatible with mpfr") +Base.unsafe_convert(::Type{Ref{BigFloat}}, x::Ref{BigFloat}) = error("not compatible with mpfr") +Base.cconvert(::Type{Ref{BigFloat}}, x::BigFloat) = x.d # BigFloatData is the Ref type for BigFloat +function Base.unsafe_convert(::Type{Ref{BigFloat}}, x::BigFloatData) + d = getfield(x, :d) + p = Base.unsafe_convert(Ptr{Limb}, d) + GC.@preserve d unsafe_store!(Ptr{Ptr{Limb}}(p) + offset_d, p + offset_p, :monotonic) # :monotonic ensure that TSAN knows that this isn't a data race + return Ptr{BigFloat}(p) +end +Base.unsafe_convert(::Type{Ptr{Limb}}, fd::BigFloatData) = Base.unsafe_convert(Ptr{Limb}, getfield(fd, :d)) + offset_p +function Base.setindex!(fd::BigFloatData, v, i) + d = getfield(fd, :d) + @boundscheck 1 <= i <= length(d) - offset_p_limbs || throw(BoundsError(fd, i)) + @inbounds d[i + offset_p_limbs] = v + return fd +end +function Base.getindex(fd::BigFloatData, i) + d = getfield(fd, :d) + @boundscheck 1 <= i <= length(d) - offset_p_limbs || throw(BoundsError(fd, i)) + @inbounds d[i + offset_p_limbs] +end +Base.length(fd::BigFloatData) = length(getfield(fd, :d)) - offset_p_limbs +Base.copyto!(fd::BigFloatData, limbs) = copyto!(getfield(fd, :d), offset_p_limbs + 1, limbs) # for Random + +include("rawbigfloats.jl") rounding_raw(::Type{BigFloat}) = something(Base.ScopedValues.get(CURRENT_ROUNDING_MODE), ROUNDING_MODE[]) setrounding_raw(::Type{BigFloat}, r::MPFRRoundingMode) = ROUNDING_MODE[]=r @@ -165,24 +234,12 @@ function setrounding_raw(f::Function, ::Type{BigFloat}, r::MPFRRoundingMode) Base.ScopedValues.@with(CURRENT_ROUNDING_MODE => r, f()) end - rounding(::Type{BigFloat}) = convert(RoundingMode, rounding_raw(BigFloat)) setrounding(::Type{BigFloat}, r::RoundingMode) = setrounding_raw(BigFloat, convert(MPFRRoundingMode, r)) setrounding(f::Function, ::Type{BigFloat}, r::RoundingMode) = setrounding_raw(f, BigFloat, convert(MPFRRoundingMode, r)) -# overload the definition of unsafe_convert to ensure that `x.d` is assigned -# it may have been dropped in the event that the BigFloat was serialized -Base.unsafe_convert(::Type{Ref{BigFloat}}, x::Ptr{BigFloat}) = x -@inline function Base.unsafe_convert(::Type{Ref{BigFloat}}, x::Ref{BigFloat}) - x = x[] - if x.d == C_NULL - x.d = pointer(x._d) - end - return convert(Ptr{BigFloat}, Base.pointer_from_objref(x)) -end - """ BigFloat(x::Union{Real, AbstractString} [, 
rounding::RoundingMode=rounding(BigFloat)]; [precision::Integer=precision(BigFloat)]) @@ -283,17 +340,18 @@ function BigFloat(x::Float64, r::MPFRRoundingMode=rounding_raw(BigFloat); precis nlimbs = (precision + 8*Core.sizeof(Limb) - 1) ÷ (8*Core.sizeof(Limb)) # Limb is a CLong which is a UInt32 on windows (thank M$) which makes this more complicated and slower. + zd = z.d if Limb === UInt64 for i in 1:nlimbs-1 - unsafe_store!(z.d, 0x0, i) + @inbounds setindex!(zd, 0x0, i) end - unsafe_store!(z.d, val, nlimbs) + @inbounds setindex!(zd, val, nlimbs) else for i in 1:nlimbs-2 - unsafe_store!(z.d, 0x0, i) + @inbounds setindex!(zd, 0x0, i) end - unsafe_store!(z.d, val % UInt32, nlimbs-1) - unsafe_store!(z.d, (val >> 32) % UInt32, nlimbs) + @inbounds setindex!(zd, val % UInt32, nlimbs-1) + @inbounds setindex!(zd, (val >> 32) % UInt32, nlimbs) end z end @@ -440,12 +498,12 @@ function to_ieee754(::Type{T}, x::BigFloat, rm) where {T<:AbstractFloat} ret_u = if is_regular & !rounds_to_inf & !rounds_to_zero if !exp_is_huge_p # significand - v = RawBigInt{Limb}(x._d, significand_limb_count(x)) + v = x.d::BigFloatData len = max(ieee_precision + min(exp_diff, 0), 0)::Int signif = truncated(U, v, len) & significand_mask(T) # round up if necessary - rh = RawBigIntRoundingIncrementHelper(v, len) + rh = BigFloatDataRoundingIncrementHelper(v, len) incr = correct_rounding_requires_increment(rh, rm, sb) # exponent @@ -1193,10 +1251,8 @@ set_emin!(x) = check_exponent_err(ccall((:mpfr_set_emin, libmpfr), Cint, (Clong, function Base.deepcopy_internal(x::BigFloat, stackdict::IdDict) get!(stackdict, x) do - # d = copy(x._d) - d = x._d - d′ = GC.@preserve d unsafe_string(pointer(d), sizeof(d)) # creates a definitely-new String - y = _BigFloat(x.prec, x.sign, x.exp, d′) + d′ = copy(getfield(x, :d)) + y = _BigFloat(d′) #ccall((:mpfr_custom_move,libmpfr), Cvoid, (Ref{BigFloat}, Ptr{Limb}), y, d) # unnecessary return y end::BigFloat @@ -1210,7 +1266,8 @@ function decompose(x::BigFloat)::Tuple{BigInt, Int, Int} s.size = cld(x.prec, 8*sizeof(Limb)) # limbs b = s.size * sizeof(Limb) # bytes ccall((:__gmpz_realloc2, libgmp), Cvoid, (Ref{BigInt}, Culong), s, 8b) # bits - memcpy(s.d, x.d, b) + xd = x.d + GC.@preserve xd memcpy(s.d, Base.unsafe_convert(Ptr{Limb}, xd), b) s, x.exp - 8b, x.sign end diff --git a/base/options.jl b/base/options.jl index 41ce3c9e20909..f535c27d99122 100644 --- a/base/options.jl +++ b/base/options.jl @@ -34,6 +34,7 @@ struct JLOptions can_inline::Int8 polly::Int8 trace_compile::Ptr{UInt8} + trace_dispatch::Ptr{UInt8} fast_math::Int8 worker::Int8 cookie::Ptr{UInt8} @@ -58,6 +59,7 @@ struct JLOptions permalloc_pkgimg::Int8 heap_size_hint::UInt64 trace_compile_timing::Int8 + trim::Int8 end # This runs early in the sysimage != is not defined yet diff --git a/base/partr.jl b/base/partr.jl index 8c95e3668ee74..6053a584af5ba 100644 --- a/base/partr.jl +++ b/base/partr.jl @@ -20,7 +20,60 @@ const heaps = [Vector{taskheap}(undef, 0), Vector{taskheap}(undef, 0)] const heaps_lock = [SpinLock(), SpinLock()] -cong(max::UInt32) = iszero(max) ? UInt32(0) : ccall(:jl_rand_ptls, UInt32, (UInt32,), max) + UInt32(1) +""" + cong(max::UInt32) + +Return a random UInt32 in the range `1:max` except if max is 0, in that case return 0. +""" +cong(max::UInt32) = iszero(max) ? 
UInt32(0) : rand_ptls(max) + UInt32(1) #TODO: make sure users don't use 0 and remove this check + +get_ptls_rng() = ccall(:jl_get_ptls_rng, UInt64, ()) + +set_ptls_rng(seed::UInt64) = ccall(:jl_set_ptls_rng, Cvoid, (UInt64,), seed) + +""" + rand_ptls(max::UInt32) + +Return a random UInt32 in the range `0:max-1` using the thread-local RNG +state. Max must be greater than 0. +""" +Base.@assume_effects :removable :inaccessiblememonly :notaskstate function rand_ptls(max::UInt32) + rngseed = get_ptls_rng() + val, seed = rand_uniform_max_int32(max, rngseed) + set_ptls_rng(seed) + return val % UInt32 +end + +# This implementation is based on OpenSSLs implementation of rand_uniform +# https://github.com/openssl/openssl/blob/1d2cbd9b5a126189d5e9bc78a3bdb9709427d02b/crypto/rand/rand_uniform.c#L13-L99 +# Comments are vendored from their implementation as well. +# For the original developer check the PR to swift https://github.com/apple/swift/pull/39143. + +# Essentially it boils down to incrementally generating a fixed point +# number on the interval [0, 1) and multiplying this number by the upper +# range limit. Once it is certain what the fractional part contributes to +# the integral part of the product, the algorithm has produced a definitive +# result. +""" + rand_uniform_max_int32(max::UInt32, seed::UInt64) + +Return a random UInt32 in the range `0:max-1` using the given seed. +Max must be greater than 0. +""" +Base.@assume_effects :total function rand_uniform_max_int32(max::UInt32, seed::UInt64) + if max == UInt32(1) + return UInt32(0), seed + end + # We are generating a fixed point number on the interval [0, 1). + # Multiplying this by the range gives us a number on [0, upper). + # The high word of the multiplication result represents the integral part + # This is not completely unbiased as it's missing the fractional part of the original implementation but it's good enough for our purposes + seed = UInt64(69069) * seed + UInt64(362437) + prod = (UInt64(max)) * (seed % UInt32) # 64 bit product + i = prod >> 32 % UInt32 # integral part + return i % UInt32, seed +end + function multiq_sift_up(heap::taskheap, idx::Int32) diff --git a/base/path.jl b/base/path.jl index 3b8124f34f174..69c8d22c63c54 100644 --- a/base/path.jl +++ b/base/path.jl @@ -613,3 +613,56 @@ relpath(path::AbstractString, startpath::AbstractString) = for f in (:isdirpath, :splitdir, :splitdrive, :splitext, :normpath, :abspath) @eval $f(path::AbstractString) = $f(String(path)) end + +# RFC3986 Section 2.1 +percent_escape(s) = '%' * join(map(b -> uppercase(string(b, base=16)), codeunits(s)), '%') +# RFC3986 Section 2.3 +encode_uri_component(s) = replace(s, r"[^A-Za-z0-9\-_.~/]+" => percent_escape) + +""" + uripath(path::AbstractString) + +Encode `path` as a URI as per [RFC8089: The "file" URI +Scheme](https://www.rfc-editor.org/rfc/rfc8089), [RFC3986: Uniform Resource +Identifier (URI): Generic Syntax](https://www.rfc-editor.org/rfc/rfc3986), and +the [Freedesktop File URI spec](https://www.freedesktop.org/wiki/Specifications/file-uri-spec/). 
+
+## Examples
+
+```julia-repl
+julia> uripath("/home/user/example file.jl") # On a unix machine
+"file:///home/user/example%20file.jl"
+
+julia> uripath("C:\\Users\\user\\example file.jl") # On a windows machine
+"file:///C:/Users/user/example%20file.jl"
+```
+"""
+function uripath end
+
+@static if Sys.iswindows()
+    function uripath(path::String)
+        path = abspath(path)
+        if startswith(path, "\\\\") # UNC path, RFC8089 Appendix E.3
+            unixpath = join(eachsplit(path, path_separator_re, keepempty=false), '/')
+            string("file://", encode_uri_component(unixpath)) # RFC8089 Section 2
+        else
+            drive, localpath = splitdrive(path) # Assuming that non-UNC absolute paths on Windows always have a drive component
+            unixpath = join(eachsplit(localpath, path_separator_re, keepempty=false), '/')
+            encdrive = replace(encode_uri_component(drive), "%3A" => ':', "%7C" => '|') # RFC8089 Appendices D.2, E.2.1, and E.2.2
+            string("file:///", encdrive, '/', encode_uri_component(unixpath)) # RFC8089 Section 2
+        end
+    end
+else
+    function uripath(path::String)
+        localpath = join(eachsplit(abspath(path), path_separator_re, keepempty=false), '/')
+        host = if ispath("/proc/sys/fs/binfmt_misc/WSLInterop") # WSL sigil
+            distro = get(ENV, "WSL_DISTRO_NAME", "") # See
+            "wsl\$/$distro" # See and
+        else
+            gethostname() # Freedesktop File URI Spec, Hostnames section
+        end
+        string("file://", encode_uri_component(host), '/', encode_uri_component(localpath)) # RFC8089 Section 2
+    end
+end
+
+uripath(path::AbstractString) = uripath(String(path))
diff --git a/base/precompilation.jl b/base/precompilation.jl
index 6997ce12c8d01..b351ce67cfbad 100644
--- a/base/precompilation.jl
+++ b/base/precompilation.jl
@@ -285,7 +285,7 @@ function show_progress(io::IO, p::MiniProgressBar; termwidth=nothing, carriagere
         return
     end
     t = time()
-    if p.has_shown && (t - p.time_shown) < PROGRESS_BAR_TIME_GRANULARITY[]
+    if !p.always_reprint && p.has_shown && (t - p.time_shown) < PROGRESS_BAR_TIME_GRANULARITY[]
         return
     end
     p.time_shown = t
@@ -301,9 +301,11 @@ function show_progress(io::IO, p::MiniProgressBar; termwidth=nothing, carriagere
     max_progress_width = max(0, min(termwidth - textwidth(p.header) - textwidth(progress_text) - 10 , p.width))
     n_filled = ceil(Int, max_progress_width * perc / 100)
     n_left = max_progress_width - n_filled
+    headers = split(p.header, ' ')
     to_print = sprint(; context=io) do io
         print(io, " "^p.indent)
-        printstyled(io, p.header, color=p.color, bold=true)
+        printstyled(io, headers[1], " "; color=:green, bold=true)
+        printstyled(io, join(headers[2:end], ' '))
         print(io, " ")
         printstyled(io, "━"^n_filled; color=p.color)
         printstyled(io, perc >= 95 ? 
"━" : "╸"; color=p.color) @@ -343,7 +345,7 @@ import Base: StaleCacheKey can_fancyprint(io::IO) = io isa Base.TTY && (get(ENV, "CI", nothing) != "true") -function printpkgstyle(io, header, msg; color=:light_green) +function printpkgstyle(io, header, msg; color=:green) printstyled(io, header; color, bold=true) println(io, " ", msg) end @@ -433,51 +435,6 @@ function precompilepkgs(pkgs::Vector{String}=String[]; # consider exts of direct deps to be direct deps so that errors are reported append!(direct_deps, keys(filter(d->last(d) in keys(env.project_deps), exts))) - # An extension effectively depends on another extension if it has all the the - # dependencies of that other extension - function expand_dependencies(depsmap) - function visit!(visited, node, all_deps) - if node in visited - return - end - push!(visited, node) - for dep in get(Set{Base.PkgId}, depsmap, node) - if !(dep in all_deps) - push!(all_deps, dep) - visit!(visited, dep, all_deps) - end - end - end - - depsmap_transitive = Dict{Base.PkgId, Set{Base.PkgId}}() - for package in keys(depsmap) - # Initialize a set to keep track of all dependencies for 'package' - all_deps = Set{Base.PkgId}() - visited = Set{Base.PkgId}() - visit!(visited, package, all_deps) - # Update depsmap with the complete set of dependencies for 'package' - depsmap_transitive[package] = all_deps - end - return depsmap_transitive - end - - depsmap_transitive = expand_dependencies(depsmap) - - for (_, extensions_1) in pkg_exts_map - for extension_1 in extensions_1 - deps_ext_1 = depsmap_transitive[extension_1] - for (_, extensions_2) in pkg_exts_map - for extension_2 in extensions_2 - extension_1 == extension_2 && continue - deps_ext_2 = depsmap_transitive[extension_2] - if issubset(deps_ext_2, deps_ext_1) - push!(depsmap[extension_1], extension_2) - end - end - end - end - end - @debug "precompile: deps collected" # this loop must be run after the full depsmap has been populated for (pkg, pkg_exts) in pkg_exts_map @@ -564,9 +521,6 @@ function precompilepkgs(pkgs::Vector{String}=String[]; if !manifest if isempty(pkgs) pkgs = [pkg.name for pkg in direct_deps] - target = "all packages" - else - target = join(pkgs, ", ") end # restrict to dependencies of given packages function collect_all_deps(depsmap, dep, alldeps=Set{Base.PkgId}()) @@ -602,18 +556,16 @@ function precompilepkgs(pkgs::Vector{String}=String[]; return end end - else - target = "manifest" end nconfigs = length(configs) + target = nothing if nconfigs == 1 if !isempty(only(configs)[1]) - target *= " for configuration $(join(only(configs)[1], " "))" + target = "for configuration $(join(only(configs)[1], " "))" end - target *= "..." else - target *= " for $nconfigs compilation configurations..." + target = "for $nconfigs compilation configurations..." 
end @debug "precompile: packages filtered" @@ -695,15 +647,19 @@ function precompilepkgs(pkgs::Vector{String}=String[]; try wait(first_started) (isempty(pkg_queue) || interrupted_or_done.set) && return - fancyprint && lock(print_lock) do - printpkgstyle(io, :Precompiling, target) - print(io, ansi_disablecursor) + lock(print_lock) do + if target !== nothing + printpkgstyle(io, :Precompiling, target) + end + if fancyprint + print(io, ansi_disablecursor) + end end t = Timer(0; interval=1/10) anim_chars = ["◐","◓","◑","◒"] i = 1 last_length = 0 - bar = MiniProgressBar(; indent=2, header = "Progress", color = Base.info_color(), percentage=false, always_reprint=true) + bar = MiniProgressBar(; indent=0, header = "Precompiling packages ", color = :green, percentage=false, always_reprint=true) n_total = length(depsmap) * length(configs) bar.max = n_total - n_already_precomp final_loop = false @@ -832,8 +788,10 @@ function precompilepkgs(pkgs::Vector{String}=String[]; config_str = "$(join(flags, " "))" name *= color_string(" $(config_str)", :light_black) end - !fancyprint && lock(print_lock) do - isempty(pkg_queue) && printpkgstyle(io, :Precompiling, target) + lock(print_lock) do + if !fancyprint && target === nothing && isempty(pkg_queue) + printpkgstyle(io, :Precompiling, "packages...") + end end push!(pkg_queue, pkg_config) started[pkg_config] = true @@ -849,7 +807,7 @@ function precompilepkgs(pkgs::Vector{String}=String[]; t = @elapsed ret = precompile_pkgs_maybe_cachefile_lock(io, print_lock, fancyprint, pkg_config, pkgspidlocked, hascolor) do Base.with_logger(Base.NullLogger()) do # The false here means we ignore loaded modules, so precompile for a fresh session - Base.compilecache(pkg, sourcepath, std_pipe, std_pipe, false; flags, cacheflags) + Base.compilecache(pkg, sourcepath, std_pipe, std_pipe, false; flags, cacheflags, isext = haskey(exts, pkg)) end end if ret isa Base.PrecompilableError @@ -916,8 +874,12 @@ function precompilepkgs(pkgs::Vector{String}=String[]; seconds_elapsed = round(Int, (time_ns() - time_start) / 1e9) ndeps = count(values(was_recompiled)) if ndeps > 0 || !isempty(failed_deps) || (quick_exit && !isempty(std_outputs)) - str = sprint() do iostr + str = sprint(context=io) do iostr if !quick_exit + if fancyprint # replace the progress bar + what = isempty(requested_pkgs) ? "packages finished." : "$(join(requested_pkgs, ", ", " and ")) finished." + printpkgstyle(iostr, :Precompiling, what) + end plural = length(configs) > 1 ? "dependency configurations" : ndeps == 1 ? "dependency" : "dependencies" print(iostr, " $(ndeps) $(plural) successfully precompiled in $(seconds_elapsed) seconds") if n_already_precomp > 0 || !isempty(circular_deps) diff --git a/base/public.jl b/base/public.jl index 862aff48da63e..2e8e777d2f91d 100644 --- a/base/public.jl +++ b/base/public.jl @@ -21,15 +21,8 @@ public ImmutableDict, OneTo, LogRange, - AnnotatedString, - AnnotatedChar, UUID, -# Annotated strings - annotatedstring, - annotate!, - annotations, - # Semaphores Semaphore, acquire, @@ -110,6 +103,9 @@ public reseteof, link_pipe!, +# filesystem operations + rename, + # misc notnothing, runtests, diff --git a/base/range.jl b/base/range.jl index 8b30222382c9a..4b5d076dcf436 100644 --- a/base/range.jl +++ b/base/range.jl @@ -1485,7 +1485,7 @@ end """ mod(x::Integer, r::AbstractUnitRange) -Find `y` in the range `r` such that ``x ≡ y (mod n)``, where `n = length(r)`, +Find `y` in the range `r` such that `x` ≡ `y` (mod `n`), where `n = length(r)`, i.e. `y = mod(x - first(r), n) + first(r)`. 
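+
+For example, using the formula above:
+
+```julia-repl
+julia> mod(10, 1:5)  # 10 ≡ 5 (mod 5), and 5 lies in 1:5
+5
+```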
See also [`mod1`](@ref). diff --git a/base/rational.jl b/base/rational.jl index fb1824acb6b31..b4e450fd73abc 100644 --- a/base/rational.jl +++ b/base/rational.jl @@ -293,8 +293,14 @@ julia> numerator(4) 4 ``` """ -numerator(x::Integer) = x +numerator(x::Union{Integer,Complex{<:Integer}}) = x numerator(x::Rational) = x.num +function numerator(z::Complex{<:Rational}) + den = denominator(z) + reim = (real(z), imag(z)) + result = checked_mul.(numerator.(reim), div.(den, denominator.(reim))) + complex(result...) +end """ denominator(x) @@ -310,8 +316,9 @@ julia> denominator(4) 1 ``` """ -denominator(x::Integer) = one(x) +denominator(x::Union{Integer,Complex{<:Integer}}) = one(x) denominator(x::Rational) = x.den +denominator(z::Complex{<:Rational}) = lcm(denominator(real(z)), denominator(imag(z))) sign(x::Rational) = oftype(x, sign(x.num)) signbit(x::Rational) = signbit(x.num) diff --git a/base/rawbigints.jl b/base/rawbigfloats.jl similarity index 54% rename from base/rawbigints.jl rename to base/rawbigfloats.jl index 6508bea05be0f..4377edfc463d8 100644 --- a/base/rawbigints.jl +++ b/base/rawbigfloats.jl @@ -1,34 +1,21 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -""" -Segment of raw words of bits interpreted as a big integer. Less -significant words come first. Each word is in machine-native bit-order. -""" -struct RawBigInt{T<:Unsigned} - d::String - word_count::Int - - function RawBigInt{T}(d::String, word_count::Int) where {T<:Unsigned} - new{T}(d, word_count) - end -end +# Some operations on BigFloat can be done more directly by treating the data portion ("BigFloatData") as a BigInt -elem_count(x::RawBigInt, ::Val{:words}) = x.word_count +elem_count(x::BigFloatData, ::Val{:words}) = length(x) elem_count(x::Unsigned, ::Val{:bits}) = sizeof(x) * 8 -word_length(::RawBigInt{T}) where {T} = elem_count(zero(T), Val(:bits)) -elem_count(x::RawBigInt{T}, ::Val{:bits}) where {T} = word_length(x) * elem_count(x, Val(:words)) +word_length(::BigFloatData{T}) where {T} = elem_count(zero(T), Val(:bits)) +elem_count(x::BigFloatData{T}, ::Val{:bits}) where {T} = word_length(x) * elem_count(x, Val(:words)) reversed_index(n::Int, i::Int) = n - i - 1 reversed_index(x, i::Int, v::Val) = reversed_index(elem_count(x, v), i)::Int -split_bit_index(x::RawBigInt, i::Int) = divrem(i, word_length(x), RoundToZero) +split_bit_index(x::BigFloatData, i::Int) = divrem(i, word_length(x), RoundToZero) """ `i` is the zero-based index of the wanted word in `x`, starting from the less significant words. """ -function get_elem(x::RawBigInt{T}, i::Int, ::Val{:words}, ::Val{:ascending}) where {T} - # `i` must be non-negative and less than `x.word_count` - d = x.d - (GC.@preserve d unsafe_load(Ptr{T}(pointer(d)), i + 1))::T +function get_elem(x::BigFloatData{T}, i::Int, ::Val{:words}, ::Val{:ascending}) where {T} + @inbounds return x[i + 1]::T end function get_elem(x, i::Int, v::Val, ::Val{:descending}) @@ -36,9 +23,9 @@ function get_elem(x, i::Int, v::Val, ::Val{:descending}) get_elem(x, j, v, Val(:ascending)) end -word_is_nonzero(x::RawBigInt, i::Int, v::Val) = !iszero(get_elem(x, i, Val(:words), v)) +word_is_nonzero(x::BigFloatData, i::Int, v::Val) = !iszero(get_elem(x, i, Val(:words), v)) -word_is_nonzero(x::RawBigInt, v::Val) = let x = x +word_is_nonzero(x::BigFloatData, v::Val) = let x = x i -> word_is_nonzero(x, i, v) end @@ -46,7 +33,7 @@ end Returns a `Bool` indicating whether the `len` least significant words of `x` are nonzero. 
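To make the new `Complex` methods for `numerator` and `denominator` added in `base/rational.jl` above concrete, here is what they compute for a Gaussian rational (values worked out by hand from the definitions in that hunk; illustrative only):

```julia
z = 1//2 + 3//4*im
denominator(z)   # lcm(2, 4) == 4
numerator(z)     # 2 + 3im, since (2 + 3im)//4 == 1//2 + 3//4*im
numerator(4im)   # 4im             (Complex{<:Integer} is returned unchanged)
denominator(4im) # one(4im) == 1 + 0im
```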
""" -function tail_is_nonzero(x::RawBigInt, len::Int, ::Val{:words}) +function tail_is_nonzero(x::BigFloatData, len::Int, ::Val{:words}) any(word_is_nonzero(x, Val(:ascending)), 0:(len - 1)) end @@ -54,7 +41,7 @@ end Returns a `Bool` indicating whether the `len` least significant bits of the `i`-th (zero-based index) word of `x` are nonzero. """ -function tail_is_nonzero(x::RawBigInt, len::Int, i::Int, ::Val{:word}) +function tail_is_nonzero(x::BigFloatData, len::Int, i::Int, ::Val{:word}) !iszero(len) && !iszero(get_elem(x, i, Val(:words), Val(:ascending)) << (word_length(x) - len)) end @@ -63,7 +50,7 @@ end Returns a `Bool` indicating whether the `len` least significant bits of `x` are nonzero. """ -function tail_is_nonzero(x::RawBigInt, len::Int, ::Val{:bits}) +function tail_is_nonzero(x::BigFloatData, len::Int, ::Val{:bits}) if 0 < len word_count, bit_count_in_word = split_bit_index(x, len) tail_is_nonzero(x, bit_count_in_word, word_count, Val(:word)) || @@ -83,7 +70,7 @@ end """ Returns a `Bool` that is the `i`-th (zero-based index) bit of `x`. """ -function get_elem(x::RawBigInt, i::Int, ::Val{:bits}, v::Val{:ascending}) +function get_elem(x::BigFloatData, i::Int, ::Val{:bits}, v::Val{:ascending}) vb = Val(:bits) if 0 ≤ i < elem_count(x, vb) word_index, bit_index_in_word = split_bit_index(x, i) @@ -96,38 +83,44 @@ end """ Returns an integer of type `R`, consisting of the `len` most -significant bits of `x`. +significant bits of `x`. If there are less than `len` bits in `x`, +the least significant bits are zeroed. """ -function truncated(::Type{R}, x::RawBigInt, len::Int) where {R<:Integer} +function truncated(::Type{R}, x::BigFloatData, len::Int) where {R<:Integer} ret = zero(R) if 0 < len word_count, bit_count_in_word = split_bit_index(x, len) k = word_length(x) vals = (Val(:words), Val(:descending)) + lenx = elem_count(x, first(vals)) for w ∈ 0:(word_count - 1) ret <<= k - word = get_elem(x, w, vals...) - ret |= R(word) + if w < lenx # if the output type is larger, truncate turns into zero-extend + word = get_elem(x, w, vals...) + ret |= R(word) + end end if !iszero(bit_count_in_word) ret <<= bit_count_in_word - wrd = get_elem(x, word_count, vals...) - ret |= R(wrd >>> (k - bit_count_in_word)) + if word_count < lenx # if the output type is larger, truncate turns into zero-extend + wrd = get_elem(x, word_count, vals...) + ret |= R(wrd >>> (k - bit_count_in_word)) + end end end ret::R end -struct RawBigIntRoundingIncrementHelper{T<:Unsigned} - n::RawBigInt{T} +struct BigFloatDataRoundingIncrementHelper{T<:Unsigned} + n::BigFloatData{T} trunc_len::Int final_bit::Bool round_bit::Bool - function RawBigIntRoundingIncrementHelper{T}(n::RawBigInt{T}, len::Int) where {T<:Unsigned} + function BigFloatDataRoundingIncrementHelper{T}(n::BigFloatData{T}, len::Int) where {T<:Unsigned} vals = (Val(:bits), Val(:descending)) f = get_elem(n, len - 1, vals...) r = get_elem(n, len , vals...) 
@@ -135,15 +128,15 @@ struct RawBigIntRoundingIncrementHelper{T<:Unsigned} end end -function RawBigIntRoundingIncrementHelper(n::RawBigInt{T}, len::Int) where {T<:Unsigned} - RawBigIntRoundingIncrementHelper{T}(n, len) +function BigFloatDataRoundingIncrementHelper(n::BigFloatData{T}, len::Int) where {T<:Unsigned} + BigFloatDataRoundingIncrementHelper{T}(n, len) end -(h::RawBigIntRoundingIncrementHelper)(::Rounding.FinalBit) = h.final_bit +(h::BigFloatDataRoundingIncrementHelper)(::Rounding.FinalBit) = h.final_bit -(h::RawBigIntRoundingIncrementHelper)(::Rounding.RoundBit) = h.round_bit +(h::BigFloatDataRoundingIncrementHelper)(::Rounding.RoundBit) = h.round_bit -function (h::RawBigIntRoundingIncrementHelper)(::Rounding.StickyBit) +function (h::BigFloatDataRoundingIncrementHelper)(::Rounding.StickyBit) v = Val(:bits) n = h.n tail_is_nonzero(n, elem_count(n, v) - h.trunc_len - 1, v) diff --git a/base/reflection.jl b/base/reflection.jl index 2ddd34b0f73c1..80eeb4c4efb12 100644 --- a/base/reflection.jl +++ b/base/reflection.jl @@ -76,6 +76,17 @@ function fullname(m::Module) return (fullname(mp)..., mn) end +""" + moduleloc(m::Module) -> LineNumberNode + +Get the location of the `module` definition. +""" +function moduleloc(m::Module) + line = Ref{Int32}(0) + file = ccall(:jl_module_getloc, Ref{Symbol}, (Any, Ref{Int32}), m, line) + return LineNumberNode(Int(line[]), file) +end + """ names(x::Module; all::Bool=false, imported::Bool=false, usings::Bool=false) -> Vector{Symbol} @@ -964,7 +975,7 @@ use it in the following manner to summarize information about a struct: julia> structinfo(T) = [(fieldoffset(T,i), fieldname(T,i), fieldtype(T,i)) for i = 1:fieldcount(T)]; julia> structinfo(Base.Filesystem.StatStruct) -13-element Vector{Tuple{UInt64, Symbol, Type}}: +14-element Vector{Tuple{UInt64, Symbol, Type}}: (0x0000000000000000, :desc, Union{RawFD, String}) (0x0000000000000008, :device, UInt64) (0x0000000000000010, :inode, UInt64) @@ -978,6 +989,7 @@ julia> structinfo(Base.Filesystem.StatStruct) (0x0000000000000050, :blocks, Int64) (0x0000000000000058, :mtime, Float64) (0x0000000000000060, :ctime, Float64) + (0x0000000000000068, :ioerrno, Int32) ``` """ fieldoffset(x::DataType, idx::Integer) = (@_foldable_meta; ccall(:jl_get_field_offset, Csize_t, (Any, Cint), x, idx)) @@ -1017,7 +1029,7 @@ julia> struct Foo end julia> Base.fieldindex(Foo, :z) -ERROR: FieldError: type Foo has no field z +ERROR: FieldError: type Foo has no field `z`, available fields: `x`, `y` Stacktrace: [...] @@ -1504,6 +1516,13 @@ struct CodegenParams """ use_jlplt::Cint + """ + If enabled, only provably reachable code (from functions marked with `entrypoint`) is included + in the output system image. Errors or warnings can be given for call sites too dynamic to handle. + The option is disabled by default. 
(0=>disabled, 1=>safe (static errors), 2=>unsafe, 3=>unsafe plus warnings) + """ + trim::Cint + """ A pointer of type @@ -1519,14 +1538,14 @@ struct CodegenParams prefer_specsig::Bool=false, gnu_pubnames::Bool=true, debug_info_kind::Cint = default_debug_info_kind(), debug_info_level::Cint = Cint(JLOptions().debug_level), safepoint_on_entry::Bool=true, - gcstack_arg::Bool=true, use_jlplt::Bool=true, + gcstack_arg::Bool=true, use_jlplt::Bool=true, trim::Cint=Cint(0), lookup::Ptr{Cvoid}=unsafe_load(cglobal(:jl_rettype_inferred_addr, Ptr{Cvoid}))) return new( Cint(track_allocations), Cint(code_coverage), Cint(prefer_specsig), Cint(gnu_pubnames), debug_info_kind, debug_info_level, Cint(safepoint_on_entry), - Cint(gcstack_arg), Cint(use_jlplt), + Cint(gcstack_arg), Cint(use_jlplt), Cint(trim), lookup) end end @@ -2440,7 +2459,7 @@ true ``` """ function hasmethod(@nospecialize(f), @nospecialize(t)) - return Core._hasmethod(f, t isa Type ? t : to_tuple_type(t)) + return Core._hasmethod(signature_type(f, t)) end function Core.kwcall(kwargs::NamedTuple, ::typeof(hasmethod), @nospecialize(f), @nospecialize(t)) diff --git a/base/refvalue.jl b/base/refvalue.jl index 000088ff0ce76..7a0f2f84e2206 100644 --- a/base/refvalue.jl +++ b/base/refvalue.jl @@ -46,9 +46,9 @@ function unsafe_convert(P::Union{Type{Ptr{T}},Type{Ptr{Cvoid}}}, b::RefValue{T}) # Instead, explicitly load the pointer from the `RefValue`, # which also ensures this returns same pointer as the one rooted in the `RefValue` object. p = atomic_pointerref(Ptr{Ptr{Cvoid}}(pointer_from_objref(b)), :monotonic) - end - if p == C_NULL - throw(UndefRefError()) + if p == C_NULL + throw(UndefRefError()) + end end return p end diff --git a/base/show.jl b/base/show.jl index 0a2976e7ebe42..ec6776d81f2d5 100644 --- a/base/show.jl +++ b/base/show.jl @@ -2196,8 +2196,12 @@ function show_unquoted(io::IO, ex::Expr, indent::Int, prec::Int, quote_level::In elseif head === :do && nargs == 2 iob = IOContext(io, beginsym=>false) show_unquoted(iob, args[1], indent, -1, quote_level) - print(io, " do ") - show_list(iob, (((args[2]::Expr).args[1])::Expr).args, ", ", 0, 0, quote_level) + print(io, " do") + do_args = (((args[2]::Expr).args[1])::Expr).args + if !isempty(do_args) + print(io, ' ') + show_list(iob, do_args, ", ", 0, 0, quote_level) + end for stmt in (((args[2]::Expr).args[2])::Expr).args print(io, '\n', " "^(indent + indent_width)) show_unquoted(iob, stmt, indent + indent_width, -1, quote_level) diff --git a/base/stat.jl b/base/stat.jl index 506b5644dccbc..c6fb239a96404 100644 --- a/base/stat.jl +++ b/base/stat.jl @@ -63,6 +63,7 @@ struct StatStruct blocks :: Int64 mtime :: Float64 ctime :: Float64 + ioerrno :: Int32 end @eval function Base.:(==)(x::StatStruct, y::StatStruct) # do not include `desc` in equality or hash @@ -80,22 +81,23 @@ end end) end -StatStruct() = StatStruct("", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) -StatStruct(buf::Union{Vector{UInt8},Ptr{UInt8}}) = StatStruct("", buf) -StatStruct(desc::Union{AbstractString, OS_HANDLE}, buf::Union{Vector{UInt8},Ptr{UInt8}}) = StatStruct( +StatStruct() = StatStruct("", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Base.UV_ENOENT) +StatStruct(buf::Union{Memory{UInt8},Vector{UInt8},Ptr{UInt8}}, ioerrno::Int32) = StatStruct("", buf, ioerrno) +StatStruct(desc::Union{AbstractString, OS_HANDLE}, buf::Union{Memory{UInt8},Vector{UInt8},Ptr{UInt8}}, ioerrno::Int32) = StatStruct( desc isa OS_HANDLE ? 
desc : String(desc), - ccall(:jl_stat_dev, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_ino, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_mode, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_nlink, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_uid, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_gid, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_rdev, UInt32, (Ptr{UInt8},), buf), - ccall(:jl_stat_size, UInt64, (Ptr{UInt8},), buf), - ccall(:jl_stat_blksize, UInt64, (Ptr{UInt8},), buf), - ccall(:jl_stat_blocks, UInt64, (Ptr{UInt8},), buf), - ccall(:jl_stat_mtime, Float64, (Ptr{UInt8},), buf), - ccall(:jl_stat_ctime, Float64, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_dev, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_ino, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_mode, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_nlink, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_uid, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_gid, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt32) : ccall(:jl_stat_rdev, UInt32, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt64) : ccall(:jl_stat_size, UInt64, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt64) : ccall(:jl_stat_blksize, UInt64, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(UInt64) : ccall(:jl_stat_blocks, UInt64, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(Float64) : ccall(:jl_stat_mtime, Float64, (Ptr{UInt8},), buf), + ioerrno != 0 ? zero(Float64) : ccall(:jl_stat_ctime, Float64, (Ptr{UInt8},), buf), + ioerrno ) function iso_datetime_with_relative(t, tnow) @@ -130,35 +132,41 @@ end function show_statstruct(io::IO, st::StatStruct, oneline::Bool) print(io, oneline ? "StatStruct(" : "StatStruct for ") show(io, st.desc) - oneline || print(io, "\n ") - print(io, " size: ", st.size, " bytes") - oneline || print(io, "\n") - print(io, " device: ", st.device) - oneline || print(io, "\n ") - print(io, " inode: ", st.inode) - oneline || print(io, "\n ") - print(io, " mode: 0o", string(filemode(st), base = 8, pad = 6), " (", filemode_string(st), ")") - oneline || print(io, "\n ") - print(io, " nlink: ", st.nlink) - oneline || print(io, "\n ") - print(io, " uid: $(st.uid)") - username = getusername(st.uid) - username === nothing || print(io, " (", username, ")") - oneline || print(io, "\n ") - print(io, " gid: ", st.gid) - groupname = getgroupname(st.gid) - groupname === nothing || print(io, " (", groupname, ")") - oneline || print(io, "\n ") - print(io, " rdev: ", st.rdev) - oneline || print(io, "\n ") - print(io, " blksz: ", st.blksize) - oneline || print(io, "\n") - print(io, " blocks: ", st.blocks) - tnow = round(UInt, time()) - oneline || print(io, "\n ") - print(io, " mtime: ", iso_datetime_with_relative(st.mtime, tnow)) - oneline || print(io, "\n ") - print(io, " ctime: ", iso_datetime_with_relative(st.ctime, tnow)) + code = st.ioerrno + if code != 0 + print(io, oneline ? 
" " : "\n ") + print(io, Base.uverrorname(code), ": ", Base.struverror(code)) + else + oneline || print(io, "\n ") + print(io, " size: ", st.size, " bytes") + oneline || print(io, "\n") + print(io, " device: ", st.device) + oneline || print(io, "\n ") + print(io, " inode: ", st.inode) + oneline || print(io, "\n ") + print(io, " mode: 0o", string(filemode(st), base = 8, pad = 6), " (", filemode_string(st), ")") + oneline || print(io, "\n ") + print(io, " nlink: ", st.nlink) + oneline || print(io, "\n ") + print(io, " uid: $(st.uid)") + username = getusername(st.uid) + username === nothing || print(io, " (", username, ")") + oneline || print(io, "\n ") + print(io, " gid: ", st.gid) + groupname = getgroupname(st.gid) + groupname === nothing || print(io, " (", groupname, ")") + oneline || print(io, "\n ") + print(io, " rdev: ", st.rdev) + oneline || print(io, "\n ") + print(io, " blksz: ", st.blksize) + oneline || print(io, "\n") + print(io, " blocks: ", st.blocks) + tnow = round(UInt, time()) + oneline || print(io, "\n ") + print(io, " mtime: ", iso_datetime_with_relative(st.mtime, tnow)) + oneline || print(io, "\n ") + print(io, " ctime: ", iso_datetime_with_relative(st.ctime, tnow)) + end oneline && print(io, ")") return nothing end @@ -168,18 +176,13 @@ show(io::IO, ::MIME"text/plain", st::StatStruct) = show_statstruct(io, st, false # stat & lstat functions +checkstat(s::StatStruct) = Int(s.ioerrno) in (0, Base.UV_ENOENT, Base.UV_ENOTDIR, Base.UV_EINVAL) ? s : uv_error(string("stat(", repr(s.desc), ")"), s.ioerrno) + macro stat_call(sym, arg1type, arg) return quote - stat_buf = zeros(UInt8, Int(ccall(:jl_sizeof_stat, Int32, ()))) + stat_buf = fill!(Memory{UInt8}(undef, Int(ccall(:jl_sizeof_stat, Int32, ()))), 0x00) r = ccall($(Expr(:quote, sym)), Int32, ($(esc(arg1type)), Ptr{UInt8}), $(esc(arg)), stat_buf) - if !(r in (0, Base.UV_ENOENT, Base.UV_ENOTDIR, Base.UV_EINVAL)) - uv_error(string("stat(", repr($(esc(arg))), ")"), r) - end - st = StatStruct($(esc(arg)), stat_buf) - if ispath(st) != (r == 0) - error("stat returned zero type for a valid path") - end - return st + return checkstat(StatStruct($(esc(arg)), stat_buf, r)) end end @@ -334,7 +337,7 @@ Return `true` if a valid filesystem entity exists at `path`, otherwise returns `false`. This is the generalization of [`isfile`](@ref), [`isdir`](@ref) etc. """ -ispath(st::StatStruct) = filemode(st) & 0xf000 != 0x0000 +ispath(st::StatStruct) = st.ioerrno == 0 function ispath(path::String) # We use `access()` and `F_OK` to determine if a given path exists. `F_OK` comes from `unistd.h`. F_OK = 0x00 diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl index be4c6887d4a6d..9a0b4b2825436 100644 --- a/base/strings/annotated.jl +++ b/base/strings/annotated.jl @@ -39,13 +39,6 @@ the combined range. See also [`AnnotatedChar`](@ref), [`annotatedstring`](@ref), [`annotations`](@ref), and [`annotate!`](@ref). -!!! warning - While the constructors are part of the Base public API, the fields - of `AnnotatedString` are not. This is to allow for potential future - changes in the implementation of this type. Instead use the - [`annotations`](@ref), and [`annotate!`](@ref) getter/setter - functions. - # Constructors ```julia @@ -81,13 +74,6 @@ More specifically, this is a simple wrapper around any other See also: [`AnnotatedString`](@ref), [`annotatedstring`](@ref), `annotations`, and `annotate!`. -!!! warning - While the constructors are part of the Base public API, the fields - of `AnnotatedChar` are not. 
This it to allow for potential future - changes in the implementation of this type. Instead use the - [`annotations`](@ref), and [`annotate!`](@ref) getter/setter - functions. - # Constructors ```julia diff --git a/base/strings/io.jl b/base/strings/io.jl index acbd945c8e137..df34712b519d5 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -51,6 +51,8 @@ function print(io::IO, xs...) return nothing end +setfield!(typeof(print).name.mt, :max_args, 10, :monotonic) + """ println([io::IO], xs...) @@ -74,6 +76,7 @@ julia> String(take!(io)) """ println(io::IO, xs...) = print(io, xs..., "\n") +setfield!(typeof(println).name.mt, :max_args, 10, :monotonic) ## conversion of general objects to strings ## """ @@ -149,6 +152,7 @@ function print_to_string(xs...) end String(_unsafe_take!(s)) end +setfield!(typeof(print_to_string).name.mt, :max_args, 10, :monotonic) function string_with_env(env, xs...) if isempty(xs) @@ -365,7 +369,8 @@ function _join_preserve_annotations(iterator, args...) # in nature, we extract an `AnnotatedString`, otherwise we just extract # a plain `String` from `io`. if isconcretetype(et) || !isempty(io.annotations) - read(seekstart(io), AnnotatedString{String}) + seekstart(io) + read(io, AnnotatedString{String}) else String(take!(io.io)) end diff --git a/base/strings/search.jl b/base/strings/search.jl index b9c14f06e0898..a481b3af775e0 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -10,7 +10,29 @@ match strings with [`match`](@ref). """ abstract type AbstractPattern end -nothing_sentinel(i) = i == 0 ? nothing : i +# TODO: These unions represent bytes in memory that can be accessed via a pointer. +# this property is used throughout Julia, e.g. also in IO code. +# This deserves a better solution - see #53178. +# If such a better solution comes in place, these unions should be replaced. +const DenseInt8 = Union{ + DenseArray{Int8}, + FastContiguousSubArray{Int8,N,<:DenseArray} where N +} + +# Note: This union is different from that above in that it includes CodeUnits. 
+# Currently, this is redundant as CodeUnits <: DenseVector, but this subtyping +# is buggy and may be removed in the future, see #54002 +const DenseUInt8 = Union{ + DenseArray{UInt8}, + FastContiguousSubArray{UInt8,N,<:DenseArray} where N, + CodeUnits{UInt8, <:Union{String, SubString{String}}}, + FastContiguousSubArray{UInt8,N,<:CodeUnits{UInt8, <:Union{String, SubString{String}}}} where N, +} + +const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8} + +last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x) +last_byteindex(x::DenseUInt8OrInt8) = lastindex(x) function last_utf8_byte(c::Char) u = reinterpret(UInt32, c) @@ -30,11 +52,11 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} end @inbounds isvalid(s, i) || string_index_err(s, i) c = pred.x - c ≤ '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i)) + c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i) while true i = _search(s, first_utf8_byte(c), i) - i == 0 && return nothing - pred(s[i]) && return i + i === nothing && return nothing + isvalid(s, i) && pred(s[i]) && return i i = nextind(s, i) end end @@ -47,31 +69,41 @@ const DenseBytes = Union{ CodeUnits{UInt8, <:Union{String, SubString{String}}}, } -const ByteArray = Union{DenseBytes, DenseArrayType{Int8}} +function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8}) + findnext(pred, a, firstindex(a)) +end -findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = - nothing_sentinel(_search(a, pred.x)) +function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer) + _search(a, pred.x, i) +end -findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = - nothing_sentinel(_search(a, pred.x, i)) +function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer) + _search(a, pred.x, i) +end -findfirst(::typeof(iszero), a::ByteArray) = nothing_sentinel(_search(a, zero(UInt8))) -findnext(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i)) +# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same, +# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa +findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8)) +findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i) -function _search(a::Union{String,SubString{String},<:ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) - if i < 1 +function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a)) + fst = firstindex(a) + lst = last_byteindex(a) + if i < fst throw(BoundsError(a, i)) end - n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) + n_bytes = lst - i + 1 + if i > lst + return i == lst+1 ? nothing : throw(BoundsError(a, i)) end - p = pointer(a) - q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - return q == C_NULL ? 0 : Int(q-p+1) + GC.@preserve a begin + p = pointer(a) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes) + end + return q == C_NULL ? 
nothing : (q-p+fst) % Int end -function _search(a::ByteArray, b::AbstractChar, i::Integer = 1) +function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a)) if isascii(b) _search(a,UInt8(b),i) else @@ -80,41 +112,51 @@ function _search(a::ByteArray, b::AbstractChar, i::Integer = 1) end function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::String, i::Integer) + s::Union{String, SubString{String}}, i::Integer) c = pred.x - c ≤ '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i)) + c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i) b = first_utf8_byte(c) while true i = _rsearch(s, b, i) - i == 0 && return nothing - pred(s[i]) && return i + i == nothing && return nothing + isvalid(s, i) && pred(s[i]) && return i i = prevind(s, i) end end -findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = - nothing_sentinel(_rsearch(a, pred.x)) +function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8) + findprev(pred, a, lastindex(a)) +end -findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = - nothing_sentinel(_rsearch(a, pred.x, i)) +function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer) + _rsearch(a, pred.x, i) +end -findlast(::typeof(iszero), a::ByteArray) = nothing_sentinel(_rsearch(a, zero(UInt8))) -findprev(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i)) +function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer) + _rsearch(a, pred.x, i) +end -function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a)) - if i < 1 - return i == 0 ? 0 : throw(BoundsError(a, i)) +# See comments above for findfirst(::typeof(iszero)) methods +findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8)) +findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i) + +function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a)) + fst = firstindex(a) + lst = last_byteindex(a) + if i < fst + return i == fst - 1 ? nothing : throw(BoundsError(a, i)) + end + if i > lst + return i == lst+1 ? nothing : throw(BoundsError(a, i)) end - n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) + GC.@preserve a begin + p = pointer(a) + q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1) end - p = pointer(a) - q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) - return q == C_NULL ? 0 : Int(q-p+1) + return q == C_NULL ? 
nothing : (q-p+fst) % Int end -function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a)) +function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a)) if isascii(b) _rsearch(a,UInt8(b),i) else @@ -136,7 +178,7 @@ function findall( i = firstindex(s) while true i = _search(s, byte, i) - iszero(i) && return result + isnothing(i) && return result i += 1 index = i - ncu # If the char is invalid, it's possible that its first byte is @@ -224,18 +266,19 @@ end in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing) -function _searchindex(s::Union{AbstractString,ByteArray}, +function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8}, t::Union{AbstractString,AbstractChar,Int8,UInt8}, i::Integer) + sentinel = firstindex(s) - 1 x = Iterators.peel(t) if isnothing(x) - return 1 <= i <= nextind(s,lastindex(s))::Int ? i : + return firstindex(s) <= i <= nextind(s,lastindex(s))::Int ? i : throw(BoundsError(s, i)) end t1, trest = x while true i = findnext(isequal(t1),s,i) - if i === nothing return 0 end + if i === nothing return sentinel end ii = nextind(s, i)::Int a = Iterators.Stateful(trest) matched = all(splat(==), zip(SubString(s, ii), a)) @@ -509,9 +552,8 @@ julia> findall(UInt8[1,2], UInt8[1,2,3,1,2]) !!! compat "Julia 1.3" This method requires at least Julia 1.3. """ - -function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}}, - s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}}, +function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}}, + s::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}}, ; overlap::Bool=false) found = UnitRange{Int}[] i, e = firstindex(s), lastindex(s) @@ -564,7 +606,7 @@ function _rsearchindex(s::AbstractString, end end -function _rsearchindex(s::String, t::String, i::Integer) +function _rsearchindex(s::Union{String, SubString{String}}, t::Union{String, SubString{String}}, i::Integer) # Check for fast case of a single byte if lastindex(t) == 1 return something(findprev(isequal(t[1]), s, i), 0) diff --git a/base/strings/string.jl b/base/strings/string.jl index f5abbead34bd1..a46ee60e4f023 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -208,7 +208,7 @@ end i = i′ @inbounds l = codeunit(s, i) (l < 0x80) | (0xf8 ≤ l) && return i+1 - @assert l >= 0xc0 + @assert l >= 0xc0 "invalid codeunit" end # first continuation byte (i += 1) > n && return i @@ -570,9 +570,10 @@ julia> repeat('A', 3) ``` """ function repeat(c::AbstractChar, r::Integer) + r < 0 && throw(ArgumentError("can't repeat a character $r times")) + r = UInt(r)::UInt c = Char(c)::Char r == 0 && return "" - r < 0 && throw(ArgumentError("can't repeat a character $r times")) u = bswap(reinterpret(UInt32, c)) n = 4 - (leading_zeros(u | 0xff) >> 3) s = _string_n(n*r) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 2a6b4ae7b9a22..50717d3c27e23 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -272,6 +272,7 @@ end function repeat(s::Union{String, SubString{String}}, r::Integer) r < 0 && throw(ArgumentError("can't repeat a string $r times")) + r = UInt(r)::UInt r == 0 && return "" r == 1 && return String(s) n = sizeof(s) diff --git a/base/sysinfo.jl b/base/sysinfo.jl index d0dcac8c6d416..7dab313cf4f57 100644 --- a/base/sysinfo.jl +++ b/base/sysinfo.jl @@ -56,6 +56,8 @@ global STDLIB::String = "$BINDIR/../share/julia/stdlib/v$(VERSION.major).$(VERSI # In case STDLIB change after julia is built, the variable 
below can be used # to update cached method locations to updated ones. const BUILD_STDLIB_PATH = STDLIB +# Similarly, this is the root of the julia repo directory that julia was built from +const BUILD_ROOT_PATH = "$BINDIR/../.." # helper to avoid triggering precompile warnings diff --git a/base/terminfo.jl b/base/terminfo.jl index 79713f4a86aa3..8ea8387077d36 100644 --- a/base/terminfo.jl +++ b/base/terminfo.jl @@ -262,7 +262,7 @@ function find_terminfo_file(term::String) append!(terminfo_dirs, replace(split(ENV["TERMINFO_DIRS"], ':'), "" => "/usr/share/terminfo")) - push!(terminfo_dirs, normpath(Sys.BINDIR, DATAROOTDIR, "terminfo")) + push!(terminfo_dirs, normpath(Sys.BINDIR, DATAROOTDIR, "julia", "terminfo")) Sys.isunix() && push!(terminfo_dirs, "/etc/terminfo", "/lib/terminfo", "/usr/share/terminfo") for dir in terminfo_dirs diff --git a/base/timing.jl b/base/timing.jl index 80ebb74abee26..b094aa230e1c2 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -104,6 +104,33 @@ function gc_page_utilization_data() return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) end +# must be kept in sync with `src/gc-stock.h`` +const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP, + :FULL_SWEEP_REASON_USER_MAX_EXCEEDED, :FULL_SWEEP_REASON_LARGE_PROMOTION_RATE] + +""" + Base.full_sweep_reasons() + +Return a dictionary of the number of times each full sweep reason has occurred. + +The reasons are: +- `:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL`: Full sweep was caused due to `always_full` being set in the GC debug environment +- `:FULL_SWEEP_REASON_FORCED_FULL_SWEEP`: Full sweep was forced by `GC.gc(true)` +- `:FULL_SWEEP_REASON_USER_MAX_EXCEEDED`: Full sweep was forced due to the system reaching the heap soft size limit +- `:FULL_SWEEP_REASON_LARGE_PROMOTION_RATE`: Full sweep was forced by a large promotion rate across GC generations + +Note that the set of reasons is not guaranteed to be stable across minor versions of Julia. +""" +function full_sweep_reasons() + reason = cglobal(:jl_full_sweep_reasons, UInt64) + reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) + d = Dict{Symbol, Int64}() + for (i, r) in enumerate(FULL_SWEEP_REASONS) + d[r] = reasons_as_array[i] + end + return d +end + """ Base.jit_total_bytes() @@ -179,7 +206,7 @@ function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_confl print(io, length(timestr) < 10 ? (" "^(10 - length(timestr))) : "") end print(io, timestr, " seconds") - parens = bytes != 0 || allocs != 0 || gctime > 0 || compile_time > 0 + parens = bytes != 0 || allocs != 0 || gctime > 0 || lock_conflicts > 0 || compile_time > 0 parens && print(io, " (") if bytes != 0 || allocs != 0 allocs, ma = prettyprint_getunits(allocs, length(_cnt_units), Int64(1000)) @@ -197,11 +224,14 @@ function time_print(io::IO, elapsedtime, bytes=0, gctime=0, allocs=0, lock_confl print(io, Ryu.writefixed(Float64(100*gctime/elapsedtime), 2), "% gc time") end if lock_conflicts > 0 + if bytes != 0 || allocs != 0 || gctime > 0 + print(io, ", ") + end plural = lock_conflicts == 1 ? 
"" : "s" - print(io, ", ", lock_conflicts, " lock conflict$plural") + print(io, lock_conflicts, " lock conflict$plural") end if compile_time > 0 - if bytes != 0 || allocs != 0 || gctime > 0 + if bytes != 0 || allocs != 0 || gctime > 0 || lock_conflicts > 0 print(io, ", ") end print(io, Ryu.writefixed(Float64(100*compile_time/elapsedtime), 2), "% compilation time") diff --git a/base/toml_parser.jl b/base/toml_parser.jl index cc1455f61928b..4d07cfed05d8a 100644 --- a/base/toml_parser.jl +++ b/base/toml_parser.jl @@ -84,9 +84,6 @@ mutable struct Parser{Dates} # Filled in in case we are parsing a file to improve error messages filepath::Union{String, Nothing} - - # Optionally populate with the Dates stdlib to change the type of Date types returned - Dates::Union{Module, Nothing} # TODO: remove once Pkg is updated end function Parser{Dates}(str::String; filepath=nothing) where {Dates} @@ -106,8 +103,7 @@ function Parser{Dates}(str::String; filepath=nothing) where {Dates} IdSet{Any}(), # static_arrays IdSet{TOMLDict}(), # defined_tables root, - filepath, - nothing + filepath ) startup(l) return l @@ -495,8 +491,10 @@ function recurse_dict!(l::Parser, d::Dict, dotted_keys::AbstractVector{String}, d = d::TOMLDict key = dotted_keys[i] d = get!(TOMLDict, d, key) - if d isa Vector + if d isa Vector{Any} d = d[end] + elseif d isa Vector + return ParserError(ErrKeyAlreadyHasValue) end check && @try check_allowed_add_key(l, d, i == length(dotted_keys)) end @@ -537,7 +535,7 @@ function parse_array_table(l)::Union{Nothing, ParserError} end d = @try recurse_dict!(l, l.root, @view(table_key[1:end-1]), false) k = table_key[end] - old = get!(() -> [], d, k) + old = get!(() -> Any[], d, k) if old isa Vector if old in l.static_arrays return ParserError(ErrAddArrayToStaticArray) @@ -546,7 +544,7 @@ function parse_array_table(l)::Union{Nothing, ParserError} return ParserError(ErrArrayTreatedAsDictionary) end d_new = TOMLDict() - push!(old, d_new) + push!(old::Vector{Any}, d_new) push!(l.defined_tables, d_new) l.active_table = d_new @@ -668,41 +666,20 @@ end # Array # ######### -function push!!(v::Vector, el) - # Since these types are typically non-inferable, they are a big invalidation risk, - # and since it's used by the package-loading infrastructure the cost of invalidation - # is high. Therefore, this is written to reduce the "exposed surface area": e.g., rather - # than writing `T[el]` we write it as `push!(Vector{T}(undef, 1), el)` so that there - # is no ambiguity about what types of objects will be created. - T = eltype(v) - t = typeof(el) - if el isa T || t === T - push!(v, el::T) - return v - elseif T === Union{} - out = Vector{t}(undef, 1) - out[1] = el - return out - else - if T isa Union - newT = Any - else - newT = Union{T, typeof(el)} - end - new = Array{newT}(undef, length(v)) - copy!(new, v) - return push!(new, el) +function copyto_typed!(a::Vector{T}, b::Vector) where T + for i in 1:length(b) + a[i] = b[i]::T end + return nothing end -function parse_array(l::Parser)::Err{Vector} +function parse_array(l::Parser{Dates})::Err{Vector} where Dates skip_ws_nl(l) - array = Vector{Union{}}() + array = Vector{Any}() empty_array = accept(l, ']') while !empty_array v = @try parse_value(l) - # TODO: Worth to function barrier this? - array = push!!(array, v) + array = push!(array, v) # There can be an arbitrary number of newlines and comments before a value and before the closing bracket. 
skip_ws_nl(l) comma = accept(l, ',') @@ -712,8 +689,40 @@ function parse_array(l::Parser)::Err{Vector} return ParserError(ErrExpectedCommaBetweenItemsArray) end end - push!(l.static_arrays, array) - return array + # check for static type throughout array + T = !isempty(array) ? typeof(array[1]) : Union{} + for el in array + if typeof(el) != T + T = Any + break + end + end + if T === Any + new = array + elseif T === String + new = Array{T}(undef, length(array)) + copyto_typed!(new, array) + elseif T === Bool + new = Array{T}(undef, length(array)) + copyto_typed!(new, array) + elseif T === Int64 + new = Array{T}(undef, length(array)) + copyto_typed!(new, array) + elseif T === UInt64 + new = Array{T}(undef, length(array)) + copyto_typed!(new, array) + elseif T === Float64 + new = Array{T}(undef, length(array)) + copyto_typed!(new, array) + elseif T === Union{} + new = Any[] + elseif (T === TOMLDict) || (T == BigInt) || (T === UInt128) || (T === Int128) || (T <: Vector) || + (T === Dates.Date) || (T === Dates.Time) || (T === Dates.DateTime) + # do nothing, leave as Vector{Any} + new = array + else @assert false end + push!(l.static_arrays, new) + return new end @@ -1025,10 +1034,9 @@ function parse_datetime(l) end function try_return_datetime(p::Parser{Dates}, year, month, day, h, m, s, ms) where Dates - if Dates !== nothing || p.Dates !== nothing - mod = Dates !== nothing ? Dates : p.Dates + if Dates !== nothing try - return mod.DateTime(year, month, day, h, m, s, ms) + return Dates.DateTime(year, month, day, h, m, s, ms) catch ex ex isa ArgumentError && return ParserError(ErrParsingDateTime) rethrow() @@ -1039,10 +1047,9 @@ function try_return_datetime(p::Parser{Dates}, year, month, day, h, m, s, ms) wh end function try_return_date(p::Parser{Dates}, year, month, day) where Dates - if Dates !== nothing || p.Dates !== nothing - mod = Dates !== nothing ? Dates : p.Dates + if Dates !== nothing try - return mod.Date(year, month, day) + return Dates.Date(year, month, day) catch ex ex isa ArgumentError && return ParserError(ErrParsingDateTime) rethrow() @@ -1062,10 +1069,9 @@ function parse_local_time(l::Parser) end function try_return_time(p::Parser{Dates}, h, m, s, ms) where Dates - if Dates !== nothing || p.Dates !== nothing - mod = Dates !== nothing ? 
Dates : p.Dates + if Dates !== nothing try - return mod.Time(h, m, s, ms) + return Dates.Time(h, m, s, ms) catch ex ex isa ArgumentError && return ParserError(ErrParsingDateTime) rethrow() diff --git a/cli/Makefile b/cli/Makefile index 7b8d3587f5386..3cc0af1a76afd 100644 --- a/cli/Makefile +++ b/cli/Makefile @@ -25,8 +25,6 @@ else ifeq ($(OS),FreeBSD) LOADER_LDFLAGS += -Wl,--no-as-needed -ldl -lpthread -rdynamic -lc -Wl,--as-needed else ifeq ($(OS),OpenBSD) LOADER_LDFLAGS += -Wl,--no-as-needed -lpthread -rdynamic -lc -Wl,--as-needed -else ifeq ($(OS),Darwin) -LOADER_LDFLAGS += -lSystem endif # Build list of dependent libraries that must be opened diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index 2a0e4faff7f1c..60f7290c7a0ac 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -39,6 +39,15 @@ precompile(Base.__require_prelocked, (Base.PkgId, Nothing)) precompile(Base._require, (Base.PkgId, Nothing)) precompile(Base.indexed_iterate, (Pair{Symbol, Union{Nothing, String}}, Int)) precompile(Base.indexed_iterate, (Pair{Symbol, Union{Nothing, String}}, Int, Int)) +precompile(Tuple{typeof(Base.Threads.atomic_add!), Base.Threads.Atomic{Int}, Int}) +precompile(Tuple{typeof(Base.Threads.atomic_sub!), Base.Threads.Atomic{Int}, Int}) + +# LazyArtifacts (but more generally helpful) +precompile(Tuple{Type{Base.Val{x} where x}, Module}) +precompile(Tuple{Type{NamedTuple{(:honor_overrides,), T} where T<:Tuple}, Tuple{Bool}}) +precompile(Tuple{typeof(Base.unique!), Array{String, 1}}) +precompile(Tuple{typeof(Base.invokelatest), Any}) +precompile(Tuple{typeof(Base.vcat), Array{String, 1}, Array{String, 1}}) # Pkg loading precompile(Tuple{typeof(Base.Filesystem.normpath), String, String, Vararg{String}}) @@ -161,6 +170,8 @@ for match = Base._methods(+, (Int, Int), -1, Base.get_world_counter()) push!(Expr[], Expr(:return, false)) vcat(String[], String[]) k, v = (:hello => nothing) + Base.print_time_imports_report(Base) + Base.print_time_imports_report_init(Base) # Preferences uses these get(Dict{String,Any}(), "missing", nothing) @@ -172,6 +183,11 @@ for match = Base._methods(+, (Int, Int), -1, Base.get_world_counter()) # interactive startup uses this write(IOBuffer(), "") + # not critical, but helps hide unrelated compilation from @time when using --trace-compile + foo() = rand(2,2) * rand(2,2) + @time foo() + @time foo() + break # only actually need to do this once end """ @@ -331,8 +347,7 @@ generate_precompile_statements() = try # Make sure `ansi_enablecursor` is printe print_state("step1" => "F$n_step1") return :ok end - Base.errormonitor(step1) - !PARALLEL_PRECOMPILATION && wait(step1) + PARALLEL_PRECOMPILATION ? bind(statements_step1, step1) : wait(step1) # Create a staging area where all the loaded packages are available PrecompileStagingArea = Module() @@ -346,7 +361,7 @@ generate_precompile_statements() = try # Make sure `ansi_enablecursor` is printe # Make statements unique statements = Set{String}() # Execute the precompile statements - for sts in [statements_step1,], statement in sts + for statement in statements_step1 # Main should be completely clean occursin("Main.", statement) && continue Base.in!(statement, statements) && continue @@ -382,6 +397,7 @@ generate_precompile_statements() = try # Make sure `ansi_enablecursor` is printe println() # Seems like a reasonable number right now, adjust as needed # comment out if debugging script + have_repl = false n_succeeded > (have_repl ? 
650 : 90) || @warn "Only $n_succeeded precompile statements" fetch(step1) == :ok || throw("Step 1 of collecting precompiles failed.") @@ -392,7 +408,6 @@ generate_precompile_statements() = try # Make sure `ansi_enablecursor` is printe finally fancyprint && print(ansi_enablecursor) GC.gc(true); GC.gc(false); # reduce memory footprint - return end generate_precompile_statements() diff --git a/contrib/julia-config.jl b/contrib/julia-config.jl index df17b967c1ed7..8b1eb55cbe4f4 100755 --- a/contrib/julia-config.jl +++ b/contrib/julia-config.jl @@ -67,9 +67,7 @@ function ldlibs(doframework) "julia" end if Sys.isunix() - return "-Wl,-rpath,$(shell_escape(libDir())) " * - (Sys.isapple() ? string() : "-Wl,-rpath,$(shell_escape(private_libDir())) ") * - "-l$libname" + return "-L$(shell_escape(private_libDir())) -Wl,-rpath,$(shell_escape(libDir())) -Wl,-rpath,$(shell_escape(private_libDir())) -l$libname" else return "-l$libname -lopenlibm" end diff --git a/contrib/juliac-buildscript.jl b/contrib/juliac-buildscript.jl new file mode 100644 index 0000000000000..50f96198c416b --- /dev/null +++ b/contrib/juliac-buildscript.jl @@ -0,0 +1,277 @@ +# Script to run in the process that generates juliac's object file output + +inputfile = ARGS[1] +output_type = ARGS[2] +add_ccallables = ARGS[3] == "true" + +# Initialize some things not usually initialized when output is requested +Sys.__init__() +Base.init_depot_path() +Base.init_load_path() +Base.init_active_project() +task = current_task() +task.rngState0 = 0x5156087469e170ab +task.rngState1 = 0x7431eaead385992c +task.rngState2 = 0x503e1d32781c2608 +task.rngState3 = 0x3a77f7189200c20b +task.rngState4 = 0x5502376d099035ae +uuid_tuple = (UInt64(0), UInt64(0)) +ccall(:jl_set_module_uuid, Cvoid, (Any, NTuple{2, UInt64}), Base.__toplevel__, uuid_tuple) +ccall(:jl_set_newly_inferred, Cvoid, (Any,), Core.Compiler.newly_inferred) + +# Patch methods in Core and Base + +@eval Core begin + DomainError(@nospecialize(val), @nospecialize(msg::AbstractString)) = (@noinline; $(Expr(:new, :DomainError, :val, :msg))) +end + +(f::Base.RedirectStdStream)(io::Core.CoreSTDOUT) = Base._redirect_io_global(io, f.unix_fd) + +@eval Base begin + _assert_tostring(msg) = "" + reinit_stdio() = nothing + JuliaSyntax.enable_in_core!() = nothing + init_active_project() = ACTIVE_PROJECT[] = nothing + set_active_project(projfile::Union{AbstractString,Nothing}) = ACTIVE_PROJECT[] = projfile + disable_library_threading() = nothing + start_profile_listener() = nothing + @inline function invokelatest(f::F, args...; kwargs...) where F + return f(args...; kwargs...) + end + function sprint(f::F, args::Vararg{Any,N}; context=nothing, sizehint::Integer=0) where {F<:Function,N} + s = IOBuffer(sizehint=sizehint) + if context isa Tuple + f(IOContext(s, context...), args...) + elseif context !== nothing + f(IOContext(s, context), args...) + else + f(s, args...) 
+ end + String(_unsafe_take!(s)) + end + function show_typeish(io::IO, @nospecialize(T)) + if T isa Type + show(io, T) + elseif T isa TypeVar + print(io, (T::TypeVar).name) + else + print(io, "?") + end + end + function show(io::IO, T::Type) + if T isa DataType + print(io, T.name.name) + if T !== T.name.wrapper && length(T.parameters) > 0 + print(io, "{") + first = true + for p in T.parameters + if !first + print(io, ", ") + end + first = false + if p isa Int + show(io, p) + elseif p isa Type + show(io, p) + elseif p isa Symbol + print(io, ":") + print(io, p) + elseif p isa TypeVar + print(io, p.name) + else + print(io, "?") + end + end + print(io, "}") + end + elseif T isa Union + print(io, "Union{") + show_typeish(io, T.a) + print(io, ", ") + show_typeish(io, T.b) + print(io, "}") + elseif T isa UnionAll + print(io, T.body::Type) + print(io, " where ") + print(io, T.var.name) + end + end + show_type_name(io::IO, tn::Core.TypeName) = print(io, tn.name) + + mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted; dims=:, init=_InitialValue()) where {F, F2} = + _mapreduce_dim(f, op, init, A, dims) + mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted...; kw...) where {F, F2} = + reduce(op, map(f, A...); kw...) + + _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} = + mapfoldl_impl(f, op, nt, A) + + _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} = + _mapreduce(f, op, IndexStyle(A), A) + + _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, dims) where {F, F2} = + mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A) + + _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims) where {F,F2} = + mapreducedim!(f, op, reducedim_init(f, op, A, dims), A) + + mapreduce_empty_iter(f::F, op::F2, itr, ItrEltype) where {F, F2} = + reduce_empty_iter(MappingRF(f, op), itr, ItrEltype) + mapreduce_first(f::F, op::F2, x) where {F,F2} = reduce_first(op, f(x)) + + _mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted) where {F,F2} = _mapreduce(f, op, IndexStyle(A), A) + mapreduce_empty(::typeof(identity), op::F, T) where {F} = reduce_empty(op, T) + mapreduce_empty(::typeof(abs), op::F, T) where {F} = abs(reduce_empty(op, T)) + mapreduce_empty(::typeof(abs2), op::F, T) where {F} = abs2(reduce_empty(op, T)) +end +@eval Base.Unicode begin + function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform::F = identity) where F + nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform) + buffer = Base.StringVector(nwords*4) + nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform) + nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) + nbytes < 0 && utf8proc_error(nbytes) + return String(resize!(buffer, nbytes)) + end +end +@eval Base.GMP begin + function __init__() + try + ccall((:__gmp_set_memory_functions, libgmp), Cvoid, + (Ptr{Cvoid},Ptr{Cvoid},Ptr{Cvoid}), + cglobal(:jl_gc_counted_malloc), + cglobal(:jl_gc_counted_realloc_with_old_size), + cglobal(:jl_gc_counted_free_with_size)) + ZERO.alloc, ZERO.size, ZERO.d = 0, 0, C_NULL + ONE.alloc, ONE.size, ONE.d = 1, 1, pointer(_ONE) + catch ex + Base.showerror_nostdio(ex, "WARNING: Error during initialization of module GMP") + end + # This only works with a patched version of GMP, ignore otherwise + try + ccall((:__gmp_set_alloc_overflow_function, libgmp), Cvoid, + (Ptr{Cvoid},), + cglobal(:jl_throw_out_of_memory_error)) + 
ALLOC_OVERFLOW_FUNCTION[] = true + catch ex + # ErrorException("ccall: could not find function...") + if typeof(ex) != ErrorException + rethrow() + end + end + end +end +@eval Base.Sort begin + issorted(itr; + lt::T=isless, by::F=identity, rev::Union{Bool,Nothing}=nothing, order::Ordering=Forward) where {T,F} = + issorted(itr, ord(lt,by,rev,order)) +end +@eval Base.TOML begin + function try_return_datetime(p, year, month, day, h, m, s, ms) + return DateTime(year, month, day, h, m, s, ms) + end + function try_return_date(p, year, month, day) + return Date(year, month, day) + end + function parse_local_time(l::Parser) + h = @try parse_int(l, false) + h in 0:23 || return ParserError(ErrParsingDateTime) + _, m, s, ms = @try _parse_local_time(l, true) + # TODO: Could potentially parse greater accuracy for the + # fractional seconds here. + return try_return_time(l, h, m, s, ms) + end + function try_return_time(p, h, m, s, ms) + return Time(h, m, s, ms) + end +end + +# Load user code + +import Base.Experimental.entrypoint + +let mod = Base.include(Base.__toplevel__, inputfile) + if !isa(mod, Module) + mod = Main + end + if output_type == "--output-exe" && isdefined(mod, :main) && !add_ccallables + entrypoint(mod.main, ()) + end + #entrypoint(join, (Base.GenericIOBuffer{Memory{UInt8}}, Array{Base.SubString{String}, 1}, String)) + #entrypoint(join, (Base.GenericIOBuffer{Memory{UInt8}}, Array{String, 1}, Char)) + entrypoint(Base.task_done_hook, (Task,)) + entrypoint(Base.wait, ()) + entrypoint(Base.trypoptask, (Base.StickyWorkqueue,)) + entrypoint(Base.checktaskempty, ()) + if add_ccallables + ccall(:jl_add_ccallable_entrypoints, Cvoid, ()) + end +end + +# Additional method patches depending on whether user code loads certain stdlibs + +let loaded = Symbol.(Base.loaded_modules_array()) # TODO better way to do this + if :SparseArrays in loaded + using SparseArrays + @eval SparseArrays.CHOLMOD begin + function __init__() + ccall((:SuiteSparse_config_malloc_func_set, :libsuitesparseconfig), + Cvoid, (Ptr{Cvoid},), cglobal(:jl_malloc, Ptr{Cvoid})) + ccall((:SuiteSparse_config_calloc_func_set, :libsuitesparseconfig), + Cvoid, (Ptr{Cvoid},), cglobal(:jl_calloc, Ptr{Cvoid})) + ccall((:SuiteSparse_config_realloc_func_set, :libsuitesparseconfig), + Cvoid, (Ptr{Cvoid},), cglobal(:jl_realloc, Ptr{Cvoid})) + ccall((:SuiteSparse_config_free_func_set, :libsuitesparseconfig), + Cvoid, (Ptr{Cvoid},), cglobal(:jl_free, Ptr{Cvoid})) + end + end + end + if :Artifacts in loaded + using Artifacts + @eval Artifacts begin + function _artifact_str(__module__, artifacts_toml, name, path_tail, artifact_dict, hash, platform, _::Val{lazyartifacts}) where lazyartifacts + moduleroot = Base.moduleroot(__module__) + if haskey(Base.module_keys, moduleroot) + # Process overrides for this UUID, if we know what it is + process_overrides(artifact_dict, Base.module_keys[moduleroot].uuid) + end + + # If the artifact exists, we're in the happy path and we can immediately + # return the path to the artifact: + dirs = artifact_paths(hash; honor_overrides=true) + for dir in dirs + if isdir(dir) + return jointail(dir, path_tail) + end + end + end + end + end + if :Pkg in loaded + using Pkg + @eval Pkg begin + __init__() = rand() #TODO, methods that do nothing don't get codegened + end + end + if :StyledStrings in loaded + using StyledStrings + @eval StyledStrings begin + __init__() = rand() + end + end +end + +empty!(Core.ARGS) +empty!(Base.ARGS) +empty!(LOAD_PATH) +empty!(DEPOT_PATH) +empty!(Base.TOML_CACHE.d) 
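For context on the "Load user code" step above: the build script includes an input file and, for `--output-exe`, registers its `main` as an entrypoint. A minimal hypothetical input file (call it `hello.jl`; the module and function names are illustrative) could look like this:

```julia
# hello.jl -- hypothetical input for juliac-buildscript.jl.
# With --output-exe and without --compile-ccallable, `main` is picked up and
# registered via Base.Experimental.entrypoint, as done in the script above.
module HelloWorld

Base.@ccallable function main()::Cint
    println(Core.stdout, "Hello, world!")
    return 0
end

end # module
```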
+Base.TOML.reinit!(Base.TOML_CACHE.p, "") +Base.ACTIVE_PROJECT[] = nothing +@eval Base begin + PROGRAM_FILE = "" +end +@eval Sys begin + BINDIR = "" + STDLIB = "" +end diff --git a/contrib/juliac.jl b/contrib/juliac.jl new file mode 100644 index 0000000000000..61e0e91958667 --- /dev/null +++ b/contrib/juliac.jl @@ -0,0 +1,110 @@ +# Julia compiler wrapper script +# NOTE: The interface and location of this script are considered unstable/experimental + +cmd = Base.julia_cmd() +cmd = `$cmd --startup-file=no --history-file=no` +output_type = nothing # exe, sharedlib, sysimage +trim = nothing +outname = nothing +file = nothing +add_ccallables = false + +help = findfirst(x->x == "--help", ARGS) +if help !== nothing + println( + """ + Usage: julia juliac.jl [--output-exe | --output-lib | --output-sysimage] [options] + --trim= Only output code statically determined to be reachable + --compile-ccallable Include all methods marked `@ccallable` in output + --verbose Request verbose output + """) + exit(0) +end + +let i = 1 + while i <= length(ARGS) + arg = ARGS[i] + if arg == "--output-exe" || arg == "--output-lib" || arg == "--output-sysimage" + isnothing(output_type) || error("Multiple output types specified") + global output_type = arg + i == length(ARGS) && error("Output specifier requires an argument") + global outname = ARGS[i+1] + i += 1 + elseif startswith(arg, "--trim") + arg = split(arg, '=') + if length(arg) == 1 + global trim = "safe" + else + global trim = arg[2] + end + elseif arg == "--compile-ccallable" + global add_ccallables = true + else + if arg[1] == '-' || !isnothing(file) + println("Unexpected argument `$arg`") + exit(1) + end + global file = arg + end + i += 1 + end +end + +isnothing(outname) && error("No output file specified") +isnothing(file) && error("No input file specified") + +absfile = abspath(file) +cflags = readchomp(`$(cmd) $(joinpath(Sys.BINDIR, Base.DATAROOTDIR,"julia", "julia-config.jl")) --cflags `) +cflags = Base.shell_split(cflags) +allflags = readchomp(`$(cmd) $(joinpath(Sys.BINDIR, Base.DATAROOTDIR,"julia", "julia-config.jl")) --allflags`) +allflags = Base.shell_split(allflags) +tmpdir = mktempdir(cleanup=false) +initsrc_path = joinpath(tmpdir, "init.c") +init_path = joinpath(tmpdir, "init.a") +img_path = joinpath(tmpdir, "img.a") +bc_path = joinpath(tmpdir, "img-bc.a") + +open(initsrc_path, "w") do io + print(io, """ + #include + __attribute__((constructor)) void static_init(void) { + if (jl_is_initialized()) + return; + julia_init(JL_IMAGE_IN_MEMORY); + jl_exception_clear(); + } + """) +end + +static_call_graph_arg() = isnothing(trim) ? `` : `--trim=$(trim)` +is_verbose() = verbose ? `--verbose-compilation=yes` : `` +cmd = addenv(`$cmd --project=$(Base.active_project()) --output-o $img_path --output-incremental=no --strip-ir --strip-metadata $(static_call_graph_arg()) $(joinpath(@__DIR__,"juliac-buildscript.jl")) $absfile $output_type $add_ccallables`, "OPENBLAS_NUM_THREADS" => 1, "JULIA_NUM_THREADS" => 1) + +if !success(pipeline(cmd; stdout, stderr)) + println(stderr, "\nFailed to compile $file") + exit(1) +end + +run(`cc $(cflags) -g -c -o $init_path $initsrc_path`) + +if output_type == "--output-lib" || output_type == "--output-sysimage" + of, ext = splitext(outname) + soext = "." * Base.BinaryPlatforms.platform_dlext() + if ext == "" + outname = of * soext + end +end + +julia_libs = Base.shell_split(Base.isdebugbuild() ? 
"-ljulia-debug -ljulia-internal-debug" : "-ljulia -ljulia-internal") +try + if output_type == "--output-lib" + run(`cc $(allflags) -o $outname -shared -Wl,$(Base.Linking.WHOLE_ARCHIVE) $img_path -Wl,$(Base.Linking.NO_WHOLE_ARCHIVE) $init_path $(julia_libs)`) + elseif output_type == "--output-sysimage" + run(`cc $(allflags) -o $outname -shared -Wl,$(Base.Linking.WHOLE_ARCHIVE) $img_path -Wl,$(Base.Linking.NO_WHOLE_ARCHIVE) $(julia_libs)`) + else + run(`cc $(allflags) -o $outname -Wl,$(Base.Linking.WHOLE_ARCHIVE) $img_path -Wl,$(Base.Linking.NO_WHOLE_ARCHIVE) $init_path $(julia_libs)`) + end +catch + println("\nCompilation failed.") + exit(1) +end diff --git a/contrib/pgo-lto/Makefile b/contrib/pgo-lto/Makefile index 896d41ac2e286..ddd86f5d5b39a 100644 --- a/contrib/pgo-lto/Makefile +++ b/contrib/pgo-lto/Makefile @@ -8,7 +8,6 @@ STAGE0_TOOLS:=$(STAGE0_BUILD)/usr/tools/ PROFILE_DIR:=$(CURDIR)/profiles PROFILE_FILE:=$(PROFILE_DIR)/merged.prof -PROFRAW_FILES:=$(wildcard $(PROFILE_DIR)/*.profraw) JULIA_ROOT:=$(CURDIR)/../.. LLVM_CXXFILT:=$(STAGE0_TOOLS)llvm-cxxfilt @@ -26,15 +25,16 @@ AFTER_STAGE1_MESSAGE:='You can now optionally collect more profiling data for us Note that running extensive scripts may result in counter overflows, which can be detected by running $\ `make top`. Afterwards run `make stage2`.' -TOOLCHAIN_FLAGS = $\ - "CC=$(STAGE0_TOOLS)clang" $\ - "CXX=$(STAGE0_TOOLS)clang++" $\ - "LD=$(STAGE0_TOOLS)ld.lld" $\ - "AR=$(STAGE0_TOOLS)llvm-ar" $\ - "RANLIB=$(STAGE0_TOOLS)llvm-ranlib" $\ - "CFLAGS+=$(PGO_CFLAGS)" $\ - "CXXFLAGS+=$(PGO_CXXFLAGS)" $\ - "LDFLAGS+=$(PGO_LDFLAGS)" +STAGE1_FLAGS:=LDFLAGS="-fuse-ld=lld -flto=thin -Wl,--undefined-version -fprofile-generate=$(PROFILE_DIR)" $\ + CFLAGS="-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)" $\ + CXXFLAGS="-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)" +STAGE2_FLAGS:=LDFLAGS="-fuse-ld=lld -flto=thin -Wl,--undefined-version -fprofile-use=$(PROFILE_FILE) -Wl,--icf=safe" $\ + CFLAGS="-fprofile-use=$(PROFILE_FILE)" $\ + CXXFLAGS="-fprofile-use=$(PROFILE_FILE)" + +COMMON_FLAGS:=USECLANG=1 USE_BINARYBUILDER_LLVM=0 + +all: stage2 # Default target as first in file $(STAGE0_BUILD) $(STAGE1_BUILD) $(STAGE2_BUILD): $(MAKE) -C $(JULIA_ROOT) O=$@ configure @@ -48,26 +48,20 @@ stage0: | $(STAGE0_BUILD) touch $@ $(STAGE1_BUILD): stage0 -stage1: PGO_CFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE) -stage1: PGO_CXXFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE) -stage1: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-generate=$(PROFILE_DIR) -stage1: export USE_BINARYBUILDER_LLVM=0 stage1: | $(STAGE1_BUILD) - $(MAKE) -C $(STAGE1_BUILD) $(TOOLCHAIN_FLAGS) && touch $@ + @echo "--- Build Julia Stage 1 - with instrumentation" + PATH=$(STAGE0_TOOLS):$$PATH $(STAGE1_FLAGS) $(MAKE) -C $(STAGE1_BUILD) $(COMMON_FLAGS) && touch $@ @echo $(AFTER_STAGE1_MESSAGE) -stage2: PGO_CFLAGS:=-fprofile-use=$(PROFILE_FILE) -stage2: PGO_CXXFLAGS:=-fprofile-use=$(PROFILE_FILE) -stage2: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-use=$(PROFILE_FILE) -Wl,--icf=safe -stage2: export USE_BINARYBUILDER_LLVM=0 stage2: $(PROFILE_FILE) | $(STAGE2_BUILD) - $(MAKE) -C $(STAGE2_BUILD) $(TOOLCHAIN_FLAGS) && touch $@ + @echo "--- Build Julia Stage 2 - PGO + LTO optimised" + PATH=$(STAGE0_TOOLS):$$PATH $(STAGE2_FLAGS) $(MAKE) -C $(STAGE2_BUILD) $(COMMON_FLAGS) && touch 
$@ -install: stage2 - $(MAKE) -C $(STAGE2_BUILD) USE_BINARYBUILDER_LLVM=0 install +.DEFAULT: stage2 + PATH=$(STAGE0_TOOLS):$$PATH $(STAGE2_FLAGS) $(MAKE) -C $(STAGE2_BUILD) $(COMMON_FLAGS) $@ -$(PROFILE_FILE): stage1 $(PROFRAW_FILES) - $(LLVM_PROFDATA) merge -output=$@ $(PROFRAW_FILES) +$(PROFILE_FILE): stage1 $(wildcard $(PROFILE_DIR)/*.profraw) + $(LLVM_PROFDATA) merge -output=$@ $(PROFILE_DIR)/*.profraw # show top 50 functions top: $(PROFILE_FILE) diff --git a/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/md5 b/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/md5 deleted file mode 100644 index f42bbedb6d415..0000000000000 --- a/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/md5 +++ /dev/null @@ -1 +0,0 @@ -70878dd96911d6960537dfee2a820d98 diff --git a/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/sha512 b/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/sha512 deleted file mode 100644 index 83164cad9a89d..0000000000000 --- a/deps/checksums/Downloads-1061ecc377a053fce0df94e1a19e5260f7c030f5.tar.gz/sha512 +++ /dev/null @@ -1 +0,0 @@ -87d2bdc6c85cbbce5302aab8ffe92fc542c9c71a396844fcc04c0416be059b00298b4816ab5e5491dbf865660a3a6152f1c245875a1ec75fb49b4c7ba0d303d8 diff --git a/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/md5 b/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/md5 new file mode 100644 index 0000000000000..611f3dd448d98 --- /dev/null +++ b/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/md5 @@ -0,0 +1 @@ +2472bd6434d21c4b3e3199437e6fdcf7 diff --git a/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/sha512 b/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/sha512 new file mode 100644 index 0000000000000..6937982e838f3 --- /dev/null +++ b/deps/checksums/Downloads-89d3c7dded535a77551e763a437a6d31e4d9bf84.tar.gz/sha512 @@ -0,0 +1 @@ +0a3fa9a09de81aa9676dbc7448408c7503f45e42519a2667540ad890316c7da089c95de5464a2032171f963c6f3cba73d6b3c246f1c7ac6ede283fc8132d5209 diff --git a/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/md5 b/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/md5 deleted file mode 100644 index 2d5f5888e777f..0000000000000 --- a/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/md5 +++ /dev/null @@ -1 +0,0 @@ -d992a5c629199747d68baa1593a7c37d diff --git a/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/sha512 b/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/sha512 deleted file mode 100644 index 4201ee05347a7..0000000000000 --- a/deps/checksums/Pkg-43e7849ce37545493d0da3226cd7449f5f88563e.tar.gz/sha512 +++ /dev/null @@ -1 +0,0 @@ -27ea738dbc4db8e4a02b00fbbdc4e2047906fe0561dd4c7f9e5ce5ea9b0b7b8ef9e29234f8e435deaa6cb3e29861130b06cb0da11118c40d78f4c475ac48db3f diff --git a/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/md5 b/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/md5 new file mode 100644 index 0000000000000..b5b82565470c0 --- /dev/null +++ b/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/md5 @@ -0,0 +1 @@ +88b8a25a8d465ac8cc94d13bc5f51707 diff --git a/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/sha512 b/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/sha512 new file mode 100644 index 0000000000000..a746b269d91f0 --- /dev/null 
+++ b/deps/checksums/Pkg-51d4910c114a863d888659cb8962c1e161b2a421.tar.gz/sha512 @@ -0,0 +1 @@ +22262687f3bf75292ab0170e19a9c4a494022a653b2811443b8c52bc099dee0fddd09f6632ae42b3193adf3b0693ddcb6679b5d91e50a500f65261df5b7ced7d diff --git a/deps/checksums/libuv b/deps/checksums/libuv index 41a9a5bdf9722..6887c3fe62f41 100644 --- a/deps/checksums/libuv +++ b/deps/checksums/libuv @@ -1,34 +1,34 @@ -LibUV.v2.0.1+16.aarch64-apple-darwin.tar.gz/md5/132266a501144f34eb9b8d5199db43c0 -LibUV.v2.0.1+16.aarch64-apple-darwin.tar.gz/sha512/e466ba8a2fe916f0e2dccb1d1075a6a20fcc5d5068d2375c940353a63522332fa8f665461adbb47ad4d30dabaea011b8e72a603601da29a071d98c7d7d130f46 -LibUV.v2.0.1+16.aarch64-linux-gnu.tar.gz/md5/1ae3018d9ab8bb293dbf6277c2c209cc -LibUV.v2.0.1+16.aarch64-linux-gnu.tar.gz/sha512/6e56876cdf0fdad1aade6435edf980b286438ee9fa695fa4e262b47f7ada6ff69535c59d216daee3eb1d061a90c2c16fd70d21438776c54addda93cf275ef1be -LibUV.v2.0.1+16.aarch64-linux-musl.tar.gz/md5/08243e727c7e957f5972a200b5d89113 -LibUV.v2.0.1+16.aarch64-linux-musl.tar.gz/sha512/4a684f248704b16b882d66ed7af60e2217a0b98f476bfdd1cb545d3e2adb17f6a410bf09e270c1e2623e550b36639c9282a562ab415850dfea98736ec03fd000 -LibUV.v2.0.1+16.armv6l-linux-gnueabihf.tar.gz/md5/c4dfccf5a899782715cbb0ca0197938c -LibUV.v2.0.1+16.armv6l-linux-gnueabihf.tar.gz/sha512/ecdcd655865a532187e4e98cb21ca68e62303813cad585de83382aa226d965213f24fe7a684e1189fad11b0e5f2f4b318c122f557a6117f61bb2948b51e16a76 -LibUV.v2.0.1+16.armv6l-linux-musleabihf.tar.gz/md5/5382dae963f3003aefdb119377a45e82 -LibUV.v2.0.1+16.armv6l-linux-musleabihf.tar.gz/sha512/f901c2965e8f9ca52900180c32cdb70d8adc13f12f076c1b109d57b749cac1ecaac3c72e22531e6fcb79c8f2c7cf952ff563779d3764b015b73db079f2b171cb -LibUV.v2.0.1+16.armv7l-linux-gnueabihf.tar.gz/md5/9c4cd82249c03ebeac670e2c7c8c1078 -LibUV.v2.0.1+16.armv7l-linux-gnueabihf.tar.gz/sha512/ee4b7f866e3f63df303d00d48d36680c490570979bb7174c12cfcf9efaf48ea7ae90aa05b41da8ab686de93c910c5a761f31da22845ad48fd980e9c16437cbfb -LibUV.v2.0.1+16.armv7l-linux-musleabihf.tar.gz/md5/5255d7e320ef37eb63d0e85c4b86d20d -LibUV.v2.0.1+16.armv7l-linux-musleabihf.tar.gz/sha512/5bcd3d22b1e2398879e654bb550fd093891775c64cb48bd179c4f9ff8dcbff23eda91a66ea14852ef5945d5c114732957075e3b3fded4cbd3cca559fead842db -LibUV.v2.0.1+16.i686-linux-gnu.tar.gz/md5/7f0fc52beb13dad773c6ab54deee7a62 -LibUV.v2.0.1+16.i686-linux-gnu.tar.gz/sha512/cb1736eab4fa1be89018b3c77c3551a99d0fa761ad2f1947587c215d87d963d43198ce87574b6eb9d1fb8a93abf1ae89e74fb8a3f3fb9c4fd08a49e04b4335f4 -LibUV.v2.0.1+16.i686-linux-musl.tar.gz/md5/ed22ccd7eaa09ed9c71afc0c6affa423 -LibUV.v2.0.1+16.i686-linux-musl.tar.gz/sha512/7f3ff061c3d7d0c3c0c0be3e4052aeed39f35e1ba0b92f3ee3d9f266f26d064acc153c08054a22d090167f00fef3c27ec54e836de35f348e4849baab301f7fa4 -LibUV.v2.0.1+16.i686-w64-mingw32.tar.gz/md5/7f1fe93df0b741ca30c4fb64ff9ac9bd -LibUV.v2.0.1+16.i686-w64-mingw32.tar.gz/sha512/9d71722c538d8232d8510fa2a43e7a52271b078401dfa838de9eedcfc34a2483aa3b1c221b17c41353b54554fe76d86b4973c5261b288228a91f0cc92820ad93 -LibUV.v2.0.1+16.powerpc64le-linux-gnu.tar.gz/md5/b796de6c75f18f318823e3e1cdd316c8 -LibUV.v2.0.1+16.powerpc64le-linux-gnu.tar.gz/sha512/f8dbb98cb49edfa06a0b48fbe1e658ca5a9bca13fe33d21872a012deaa1052a495faf74f90c0dfa48378b9f4f51f1045e01e563aec427d8c89d50e4eef0e4938 -LibUV.v2.0.1+16.x86_64-apple-darwin.tar.gz/md5/f2d55b315fa1f77b632a461530bb6b3b -LibUV.v2.0.1+16.x86_64-apple-darwin.tar.gz/sha512/eb40a193c3bca5e822a417879e854877b353a2a04b03a721ef4125360f1189a3685d2751e2f975360a2ad4c37e6043485a54b5349b3da423b8aae73d4a095d04 
-LibUV.v2.0.1+16.x86_64-linux-gnu.tar.gz/md5/a573ded4f78f8677ef73594be9629638 -LibUV.v2.0.1+16.x86_64-linux-gnu.tar.gz/sha512/c5809635be3ab5dc53c37a028e58695d89ea91eee850af22a0e8db10ea021640f1e618a553848332ee6df66eecd08d34605e335aad46ece82365a3525b69c42f -LibUV.v2.0.1+16.x86_64-linux-musl.tar.gz/md5/5bdad561b5db7d19f198ef090ae3ec84 -LibUV.v2.0.1+16.x86_64-linux-musl.tar.gz/sha512/6662c8226f22f79f8c40857a5a531841f013031dd2e9536568498bfd536f133976ff71d0cc5f56f1e0c0b7f2403a35c2ccef9117d9e0d7819771bd492194f20d -LibUV.v2.0.1+16.x86_64-unknown-freebsd.tar.gz/md5/f4ad9e445e4b14e2b59b2b77c9ed72ad -LibUV.v2.0.1+16.x86_64-unknown-freebsd.tar.gz/sha512/a78deac6d8321f274a229961620da4d069ff2accf7d1ed9edfb01c21ad47eb33d364ba2f310ff4a93b2732dcd16f6d481843dbcb273770d731fd528f9c7a9ddc -LibUV.v2.0.1+16.x86_64-w64-mingw32.tar.gz/md5/72caa067cf24e304955405dcb4de195a -LibUV.v2.0.1+16.x86_64-w64-mingw32.tar.gz/sha512/de80ca98d199d3c5626ebc771325806ce3aae5927220201c2351207c10ff67791d2865f76e41519df88f0be3da534342965e7ba0d055d807c4b2b6c78bd2427d -libuv-ca3a5a431a1c37859b6508e6b2a288092337029a.tar.gz/md5/d1fbca8bcc5819037b8b81ae4f61c357 -libuv-ca3a5a431a1c37859b6508e6b2a288092337029a.tar.gz/sha512/e735861923c0fc597b53eb2efb56b26acec29e3fcae7e76d349fc08f8b9d340df9ac60a1cd245e46a434aa357ed8e377734c1c97bf08bd044c9ba0c02b082a6a +LibUV.v2.0.1+18.aarch64-apple-darwin.tar.gz/md5/f176c76e5e2096dea8443302cf9550b8 +LibUV.v2.0.1+18.aarch64-apple-darwin.tar.gz/sha512/4301b13953a08a758b86e30af3261fd9a291ce3829b4d98e71e2a2c040e322e284c5a6eb4bc7189cc352f4b1cf7041e2cfd3380d511d88c151df3101ad74594e +LibUV.v2.0.1+18.aarch64-linux-gnu.tar.gz/md5/c81515783363702a1bd4b65fd6d7f36b +LibUV.v2.0.1+18.aarch64-linux-gnu.tar.gz/sha512/011429365337f5a45e56ca7a42709866bb994c747a1170d870f5f3ddfff2d36138866ee9278ac01172bc71bde8dee404bcb9cae9c7b44222bf1cc883659df269 +LibUV.v2.0.1+18.aarch64-linux-musl.tar.gz/md5/e74d5ea4912dd326b2705638faa7b805 +LibUV.v2.0.1+18.aarch64-linux-musl.tar.gz/sha512/a26a9f2c9051816230324071c502321f7af3885d581a400615858a93a4cd457226048d15b0e7f6a73d12659763c705b5ab519e9f5b35c6d886b9fd5babbfe352 +LibUV.v2.0.1+18.armv6l-linux-gnueabihf.tar.gz/md5/6df38bcf5d0a61dee63d16b73d0c9a24 +LibUV.v2.0.1+18.armv6l-linux-gnueabihf.tar.gz/sha512/d5354a6532061de0a58965ce0e427bde52f9ae0ee39a98e1a33de4c414fddcba9ba139ddf91be7321a4ccc97bbf7a8a8357ff10cf60f83c0a6bff7d839d6d7a8 +LibUV.v2.0.1+18.armv6l-linux-musleabihf.tar.gz/md5/6f02a24cfbfae3032fadceaea1faed39 +LibUV.v2.0.1+18.armv6l-linux-musleabihf.tar.gz/sha512/7fd107eb9a5ea84b488ea02e4fbedc9fe13bb11be859986a47af38f40ad775dd9f738c790878a3503437bcac1eb26ad9fe26f4aa0d3cb45c980b4c5abc9aec99 +LibUV.v2.0.1+18.armv7l-linux-gnueabihf.tar.gz/md5/96b09dec72f7e9b7409fa2920e67c866 +LibUV.v2.0.1+18.armv7l-linux-gnueabihf.tar.gz/sha512/6a0f79fc15c944fabba5c65180b665bc9769c6ff25863e330049f48b3a4394b448492f5a9a76bb7f8dbd3ce44dfc6f9ccdc2c71c42e1c749e88070fe99b1db69 +LibUV.v2.0.1+18.armv7l-linux-musleabihf.tar.gz/md5/f44e4b2521a813181f943895bdb0dd3c +LibUV.v2.0.1+18.armv7l-linux-musleabihf.tar.gz/sha512/cda1413dca817f772e8b343db0c6de0ef6b8f269e9a6a2ef3403c2582aeab554f46281bbb1eb4659c259198ef47fe26aab648a281e66f80aaf2f2cda0a23ac05 +LibUV.v2.0.1+18.i686-linux-gnu.tar.gz/md5/1f231d89cf9c04515d2d107a5d786cc8 +LibUV.v2.0.1+18.i686-linux-gnu.tar.gz/sha512/089cb8a372cdee0cbc0e78fc201611bb9bafd99af9a78e09d6097a6b70e7c4aa001ebd86f944b0a885c072093c529bf86bcaa32bde4fc1934407a858c1a5a764 +LibUV.v2.0.1+18.i686-linux-musl.tar.gz/md5/01cfc2a9e2536dbd330267917abb19ce 
+LibUV.v2.0.1+18.i686-linux-musl.tar.gz/sha512/72f3588cb464a60e61f8998242aaa2abdf93df920a2feba5e1d66ef0f2498488df0ec415cbb499d7f75c47bdfc7e3a2fdda6a94383492e0ad13e464eb1314ff8 +LibUV.v2.0.1+18.i686-w64-mingw32.tar.gz/md5/8c6599aab9ed4c46e52f03683aac664e +LibUV.v2.0.1+18.i686-w64-mingw32.tar.gz/sha512/13f0565f7244a8bcf1ab43fac91a856dc86d214877033a3cefee8c2179c1a275dfd7dda32e9017763acac2ba42ab6799934a58f5feaa38fb6cf2253dd713f57a +LibUV.v2.0.1+18.powerpc64le-linux-gnu.tar.gz/md5/af0e43d9d0aa91dd82b63220d96991ef +LibUV.v2.0.1+18.powerpc64le-linux-gnu.tar.gz/sha512/9fabe3089e4fc60e910770c32d36300ce8ef36c28e8cc9c72fbecba6eb80285ee8174e84e4452fb4ce90ee7c7f94e99b03fce47d8c579bd614bfffd553f93666 +LibUV.v2.0.1+18.x86_64-apple-darwin.tar.gz/md5/871040e874eedae54553d8f1c91b9133 +LibUV.v2.0.1+18.x86_64-apple-darwin.tar.gz/sha512/d5eee08b65e4bb8b444c61ac277bec9ef944b9279dd7f0732b3cd91d47788c77938e5db71e019e01bbe7785a75df95faf14368764f700c6b7a6b9e4d96d6b4c2 +LibUV.v2.0.1+18.x86_64-linux-gnu.tar.gz/md5/d2d186952c6d017fe33f6a6bea63a3ea +LibUV.v2.0.1+18.x86_64-linux-gnu.tar.gz/sha512/15501534bf5721e6bb668aabe6dc6375349f7a284e28df0609d00982e7e456908bd6868722391afa7f44a5c82faedc8cf544f69a0e4fb9fb0d529b3ae3d44d78 +LibUV.v2.0.1+18.x86_64-linux-musl.tar.gz/md5/271d4d40a1ae53ed5b2376e5936cfcf9 +LibUV.v2.0.1+18.x86_64-linux-musl.tar.gz/sha512/1956f059ed01f66b72349d6561b04e6a89b7257c0f838d7fbdd2cee79bd126bb46b93bf944a042b5a6a235762a7a0cdd117207342dd55a0c58653a70b4a38d48 +LibUV.v2.0.1+18.x86_64-unknown-freebsd.tar.gz/md5/62fe8523948914fbe7e28bf0b8d73594 +LibUV.v2.0.1+18.x86_64-unknown-freebsd.tar.gz/sha512/e6486888028c96975f74bc9313cba9706f6bf2be085aa776c44cbb2886753b2eee62469a0be92eb0542df1d0f51db3b34c7ba5e46842e16c6ff1d20e11b75322 +LibUV.v2.0.1+18.x86_64-w64-mingw32.tar.gz/md5/ae103f24b6e1830cdbe02143826fe551 +LibUV.v2.0.1+18.x86_64-w64-mingw32.tar.gz/sha512/f814085c135815947f342ff24fa0e1015e283ccece84a5b8dd5ccec0f5928a129e5fd79100a33b131376ad696f70b5acadcc5a02a7e6544635ecf7e18003ba1c +libuv-af4172ec713ee986ba1a989b9e33993a07c60c9e.tar.gz/md5/c1a7d3c74ef3999052f3bfe426264353 +libuv-af4172ec713ee986ba1a989b9e33993a07c60c9e.tar.gz/sha512/a3f16863b711ddeeb5ab8d135d7df7a4be19cc2b9821fc78c8cd3ba421231d39b7d8bd9965321455094fda01584842a58f60612d93082b4fe32210b8aa44d999 diff --git a/deps/checksums/llvmunwind b/deps/checksums/llvmunwind index e69de29bb2d1d..a90d28717dd85 100644 --- a/deps/checksums/llvmunwind +++ b/deps/checksums/llvmunwind @@ -0,0 +1,32 @@ +LLVMLibUnwind.v14.0.6+0.aarch64-apple-darwin.tar.gz/md5/d8584e0e3dc26ea7404d3719cea9e233 +LLVMLibUnwind.v14.0.6+0.aarch64-apple-darwin.tar.gz/sha512/7a0396eaace91b9b4d013c209605d468a7ff9b99ede9fdd57602539a6fa6f3ea84a440f32840056a1234df3ef1896739ea0820fee72b4f208096c553fc54adb9 +LLVMLibUnwind.v14.0.6+0.aarch64-linux-gnu.tar.gz/md5/d6edea561b61173d05aa79936e49f6b7 +LLVMLibUnwind.v14.0.6+0.aarch64-linux-gnu.tar.gz/sha512/9fbe29ec6a33c719bc9a4dd19911ceded9622269c042192d339a6cf45aa8209ad64c424167c094ca01293438af5930f091acba0538b3fe640a746297f5cc8cb3 +LLVMLibUnwind.v14.0.6+0.aarch64-linux-musl.tar.gz/md5/3ec68c87e4bddd024ee0ca6adc2b3b96 +LLVMLibUnwind.v14.0.6+0.aarch64-linux-musl.tar.gz/sha512/be3cd9d5510c2693dee1494c36c479d32311ff83f5b2d31c08508a3dd370788961ce46e9025afe148a0febd05942fd294370a357dd717bee353d8a108617f6de +LLVMLibUnwind.v14.0.6+0.armv6l-linux-gnueabihf.tar.gz/md5/8ca5a926d69124225d485d679232a54f 
+LLVMLibUnwind.v14.0.6+0.armv6l-linux-gnueabihf.tar.gz/sha512/353f540b342bc54877e7a41fe65c9eeac525fd91bf4cddbe1b3ec2ed93c3751beaf8316a4d31530502b067100b160301262e10cbe4407db3abf1ceb5d9a74eb2 +LLVMLibUnwind.v14.0.6+0.armv6l-linux-musleabihf.tar.gz/md5/4e5b576958f2a2e708eb5918ceef0de0 +LLVMLibUnwind.v14.0.6+0.armv6l-linux-musleabihf.tar.gz/sha512/2e98c472d3ee25c2e062efa4eb21ac9cfc49b26ea9d99ad4a8e7660c4c09f121d31193bd161f54ea332ce94785d601897311e9e6668adb1e25e2b666e0d5bb3f +LLVMLibUnwind.v14.0.6+0.armv7l-linux-gnueabihf.tar.gz/md5/1c81a886e799663ce8d04400c5b516a9 +LLVMLibUnwind.v14.0.6+0.armv7l-linux-gnueabihf.tar.gz/sha512/236b78b9a17eaae74ab07349ac8dde16c3abbd48e0d075abd1c195d60efff48e2fbf799554df114ea3d3dba937e0369430a2788bde2a1201126e026ef6cdac42 +LLVMLibUnwind.v14.0.6+0.armv7l-linux-musleabihf.tar.gz/md5/0371f43ebcb571d0a635739252b88986 +LLVMLibUnwind.v14.0.6+0.armv7l-linux-musleabihf.tar.gz/sha512/605318ae3737e26ff89d6291311a7db3bc3ec7c8d1f2e72ae40fd3d9df0754ee2ebfb77687122605f26d76d62effb85157bc39982814920d5af46c124e71a5ff +LLVMLibUnwind.v14.0.6+0.i686-linux-gnu.tar.gz/md5/cd3f1cdf404b6102754ced4bd3a890f6 +LLVMLibUnwind.v14.0.6+0.i686-linux-gnu.tar.gz/sha512/65fe2c5b1e04da1e1d8111a0b0083fa0fa9447eaea7af7a018c09fe6d5506566c491bbad296a7be8c488ca3495016ae16a6879d69f057f8866d94910147dee03 +LLVMLibUnwind.v14.0.6+0.i686-linux-musl.tar.gz/md5/abac9b416d2ba5abcf5ce849f43ffa96 +LLVMLibUnwind.v14.0.6+0.i686-linux-musl.tar.gz/sha512/fed677ed6f103c56eb9dd4578fa37a56ed2a4bc803aa1997c5af19762a623d2f82db1f72f429448d66fcef3b37af2104e6cb782f023aaabef086a921a862b042 +LLVMLibUnwind.v14.0.6+0.i686-w64-mingw32.tar.gz/md5/4c71ffd7c8cabb1c0ed6290b193883c5 +LLVMLibUnwind.v14.0.6+0.i686-w64-mingw32.tar.gz/sha512/6b1421a3268170467225112167cdb33fec962181993a2dad5594d4ee0623ac88ee0588cdc7d0656dc1cb9129ef96f621a97a224731cd161134d7d63c8fd32c16 +LLVMLibUnwind.v14.0.6+0.powerpc64le-linux-gnu.tar.gz/md5/06faf505f0dc354afcd01113cfc57af2 +LLVMLibUnwind.v14.0.6+0.powerpc64le-linux-gnu.tar.gz/sha512/1f9dfbd403e2ce121e126c217baede178cb1323012bb5e3cd1f778ff51e4216aed9dd69036e2baffbd60a6f5ae438ddaba6c13809459e94bb00be3f7bfc8c30e +LLVMLibUnwind.v14.0.6+0.x86_64-apple-darwin.tar.gz/md5/516a11d99306e3f214968a7951b07a06 +LLVMLibUnwind.v14.0.6+0.x86_64-apple-darwin.tar.gz/sha512/885738599bbd96f20083f9b9368ce3f243bd5868d3ac9a45189de6cb40b6664a6dcdaece159989e504670231db8c2addfa8d544003eb0cdabba960e4ab6a4470 +LLVMLibUnwind.v14.0.6+0.x86_64-linux-gnu.tar.gz/md5/d851b90ea3f9664774316169fc494e21 +LLVMLibUnwind.v14.0.6+0.x86_64-linux-gnu.tar.gz/sha512/a1f529454f0881baaa508481ba97ecffb040fa92141b4cbc72278adcf8b84f0766fa918aea7fb99ce690c4fd80c36fec365987625db42f4e7bb36ad24ce177d0 +LLVMLibUnwind.v14.0.6+0.x86_64-linux-musl.tar.gz/md5/dc4e86eb2effe1f6cb0d0ceda635f226 +LLVMLibUnwind.v14.0.6+0.x86_64-linux-musl.tar.gz/sha512/c52de384853890f9df81aa9e422c1ba3fde12b2ae9c7b60b9ecdc6d0c88eab495dd336af2b6cd2c31d6eddcd0a213954eadbc7884bc39ce2039cec672eac32fe +LLVMLibUnwind.v14.0.6+0.x86_64-unknown-freebsd.tar.gz/md5/8477e3624c73a820d8ab82a53e1e10fa +LLVMLibUnwind.v14.0.6+0.x86_64-unknown-freebsd.tar.gz/sha512/32ce031245a5b59a779cd77fa3c9bf05ee59e48c913b75d4964bea49f37da232c59a42ad993f7b5edc88322148c1d7394984349682bfce3b69d33a51756ac8e3 +LLVMLibUnwind.v14.0.6+0.x86_64-w64-mingw32.tar.gz/md5/7be93eccbdb0aff427c43af651073d66 +LLVMLibUnwind.v14.0.6+0.x86_64-w64-mingw32.tar.gz/sha512/89a61a81ec664c72107ac09e717200b00434350bf77064267180bc0c101a59e0ee8c8af4dd6fe75eacdeb14e82743c138b2fc558ca08550d8796b8db93f89da4 diff --git a/deps/libuv.version 
b/deps/libuv.version index bc8e2e57c9517..ebfc63927d9db 100644 --- a/deps/libuv.version +++ b/deps/libuv.version @@ -1,7 +1,9 @@ +# -*- makefile -*- + ## jll artifact LIBUV_JLL_NAME := LibUV ## source build LIBUV_VER := 2 LIBUV_BRANCH=julia-uv2-1.48.0 -LIBUV_SHA1=ca3a5a431a1c37859b6508e6b2a288092337029a +LIBUV_SHA1=af4172ec713ee986ba1a989b9e33993a07c60c9e diff --git a/deps/llvm.mk b/deps/llvm.mk index 08aff443dcff8..3f4bc3e6746f0 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -102,7 +102,7 @@ endif LLVM_CMAKE += -DLLVM_TOOLS_INSTALL_DIR=$(call rel_path,$(build_prefix),$(build_depsbindir)) LLVM_CMAKE += -DLLVM_UTILS_INSTALL_DIR=$(call rel_path,$(build_prefix),$(build_depsbindir)) LLVM_CMAKE += -DLLVM_INCLUDE_UTILS=ON -DLLVM_INSTALL_UTILS=ON -LLVM_CMAKE += -DLLVM_BINDINGS_LIST="" -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INCLUDE_DOCS=Off -DLLVM_ENABLE_TERMINFO=Off -DHAVE_LIBEDIT=Off +LLVM_CMAKE += -DLLVM_BINDINGS_LIST="" -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INCLUDE_DOCS=Off -DLLVM_ENABLE_TERMINFO=Off -DHAVE_LIBEDIT=Off -DLLVM_ENABLE_LIBEDIT=OFF ifeq ($(LLVM_ASSERTIONS), 1) LLVM_CMAKE += -DLLVM_ENABLE_ASSERTIONS:BOOL=ON endif # LLVM_ASSERTIONS @@ -292,6 +292,9 @@ ifeq ($(OS),Darwin) # https://github.com/JuliaLang/julia/issues/29981 LLVM_INSTALL += && ln -s libLLVM.dylib $2$$(build_shlibdir)/libLLVM-$$(LLVM_VER_SHORT).dylib endif +ifeq ($(BUILD_LLD), 1) +LLVM_INSTALL += && cp $2$$(build_bindir)/lld$$(EXE) $2$$(build_depsbindir) +endif $(eval $(call staged-install, \ llvm,$$(LLVM_SRC_DIR)/build_$$(LLVM_BUILDTYPE), \ diff --git a/deps/llvmunwind.version b/deps/llvmunwind.version index 7d13af9a158f7..9c2a91c566ba2 100644 --- a/deps/llvmunwind.version +++ b/deps/llvmunwind.version @@ -2,4 +2,4 @@ LLVMUNWIND_JLL_NAME := LLVMLibUnwind ## source build -LLVMUNWIND_VER := 12.0.1 +LLVMUNWIND_VER := 14.0.6 diff --git a/deps/patches/llvm-libunwind-revert-monorepo-requirement.patch b/deps/patches/llvm-libunwind-revert-monorepo-requirement.patch deleted file mode 100644 index 4e3897dfb9801..0000000000000 --- a/deps/patches/llvm-libunwind-revert-monorepo-requirement.patch +++ /dev/null @@ -1,156 +0,0 @@ -Upstream commit 8c03fdf34a659925a3f09c8f54016e47ea1c7519 changed the build such -that it requires living inside the monorepo with libcxx available, only so that -it can reuse a CMake file to simplify some build steps. This patch is a revert -of that commit applied only to libunwind. - ---- -diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt -index 570b8db90653..a383d7d77d6f 100644 ---- a/libunwind/CMakeLists.txt -+++ b/libunwind/CMakeLists.txt -@@ -1,7 +1,3 @@ --if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/../libcxx") -- message(FATAL_ERROR "libunwind requires being built in a monorepo layout with libcxx available") --endif() -- - #=============================================================================== - # Setup Project - #=============================================================================== -@@ -15,31 +11,103 @@ set(CMAKE_MODULE_PATH - ${CMAKE_MODULE_PATH} - ) - --set(LIBUNWIND_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) --set(LIBUNWIND_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) --set(LIBUNWIND_LIBCXX_PATH "${CMAKE_CURRENT_LIST_DIR}/../libcxx" CACHE PATH -- "Specify path to libc++ source.") -- - if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_BUILD) - project(libunwind LANGUAGES C CXX ASM) - -+ # Rely on llvm-config. 
-+ set(CONFIG_OUTPUT) -+ if(NOT LLVM_CONFIG_PATH) -+ find_program(LLVM_CONFIG_PATH "llvm-config") -+ endif() -+ if (DEFINED LLVM_PATH) -+ set(LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIR} CACHE PATH "Path to llvm/include") -+ set(LLVM_PATH ${LLVM_PATH} CACHE PATH "Path to LLVM source tree") -+ set(LLVM_MAIN_SRC_DIR ${LLVM_PATH}) -+ set(LLVM_CMAKE_PATH "${LLVM_PATH}/cmake/modules") -+ elseif(LLVM_CONFIG_PATH) -+ message(STATUS "Found LLVM_CONFIG_PATH as ${LLVM_CONFIG_PATH}") -+ set(CONFIG_COMMAND ${LLVM_CONFIG_PATH} "--includedir" "--prefix" "--src-root") -+ execute_process(COMMAND ${CONFIG_COMMAND} -+ RESULT_VARIABLE HAD_ERROR -+ OUTPUT_VARIABLE CONFIG_OUTPUT) -+ if (NOT HAD_ERROR) -+ string(REGEX REPLACE "[ \t]*[\r\n]+[ \t]*" ";" -+ CONFIG_OUTPUT ${CONFIG_OUTPUT}) -+ else() -+ string(REPLACE ";" " " CONFIG_COMMAND_STR "${CONFIG_COMMAND}") -+ message(STATUS "${CONFIG_COMMAND_STR}") -+ message(FATAL_ERROR "llvm-config failed with status ${HAD_ERROR}") -+ endif() -+ -+ list(GET CONFIG_OUTPUT 0 INCLUDE_DIR) -+ list(GET CONFIG_OUTPUT 1 LLVM_OBJ_ROOT) -+ list(GET CONFIG_OUTPUT 2 MAIN_SRC_DIR) -+ -+ set(LLVM_INCLUDE_DIR ${INCLUDE_DIR} CACHE PATH "Path to llvm/include") -+ set(LLVM_BINARY_DIR ${LLVM_OBJ_ROOT} CACHE PATH "Path to LLVM build tree") -+ set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree") -+ set(LLVM_LIT_PATH "${LLVM_PATH}/utils/lit/lit.py") -+ -+ # --cmakedir is supported since llvm r291218 (4.0 release) -+ execute_process( -+ COMMAND ${LLVM_CONFIG_PATH} --cmakedir -+ RESULT_VARIABLE HAD_ERROR -+ OUTPUT_VARIABLE CONFIG_OUTPUT -+ ERROR_QUIET) -+ if(NOT HAD_ERROR) -+ string(STRIP "${CONFIG_OUTPUT}" LLVM_CMAKE_PATH_FROM_LLVM_CONFIG) -+ file(TO_CMAKE_PATH "${LLVM_CMAKE_PATH_FROM_LLVM_CONFIG}" LLVM_CMAKE_PATH) -+ else() -+ file(TO_CMAKE_PATH "${LLVM_BINARY_DIR}" LLVM_BINARY_DIR_CMAKE_STYLE) -+ set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR_CMAKE_STYLE}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") -+ endif() -+ else() -+ message(WARNING "UNSUPPORTED LIBUNWIND CONFIGURATION DETECTED: " -+ "llvm-config not found and LLVM_MAIN_SRC_DIR not defined. " -+ "Reconfigure with -DLLVM_CONFIG=path/to/llvm-config " -+ "or -DLLVM_PATH=path/to/llvm-source-root.") -+ endif() -+ -+ if (EXISTS ${LLVM_CMAKE_PATH}) -+ list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_PATH}") -+ include("${LLVM_CMAKE_PATH}/AddLLVM.cmake") -+ include("${LLVM_CMAKE_PATH}/HandleLLVMOptions.cmake") -+ else() -+ message(WARNING "Not found: ${LLVM_CMAKE_PATH}") -+ endif() -+ - set(PACKAGE_NAME libunwind) - set(PACKAGE_VERSION 12.0.1) - set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") - set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") - -- # Add the CMake module path of libcxx so we can reuse HandleOutOfTreeLLVM.cmake -- set(LIBUNWIND_LIBCXX_CMAKE_PATH "${LIBUNWIND_LIBCXX_PATH}/cmake/Modules") -- list(APPEND CMAKE_MODULE_PATH "${LIBUNWIND_LIBCXX_CMAKE_PATH}") -+ if (EXISTS ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py) -+ set(LLVM_LIT ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py) -+ else() -+ # Seek installed Lit. -+ find_program(LLVM_LIT "lit.py" ${LLVM_MAIN_SRC_DIR}/utils/lit -+ DOC "Path to lit.py") -+ endif() - -- # In a standalone build, we don't have llvm to automatically generate the -- # llvm-lit script for us. So we need to provide an explicit directory that -- # the configurator should write the script into. -- set(LIBUNWIND_STANDALONE_BUILD 1) -- set(LLVM_LIT_OUTPUT_DIR "${LIBUNWIND_BINARY_DIR}/bin") -+ if (LLVM_LIT) -+ # Define the default arguments to use with 'lit', and an option for the user -+ # to override. 
-+ set(LIT_ARGS_DEFAULT "-sv") -+ if (MSVC OR XCODE) -+ set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar") -+ endif() -+ set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit") -+ -+ # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools. -+ if (WIN32 AND NOT CYGWIN) -+ set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools") -+ endif() -+ else() -+ set(LLVM_INCLUDE_TESTS OFF) -+ endif() - -- # Find the LLVM sources and simulate LLVM CMake options. -- include(HandleOutOfTreeLLVM) -+ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) -+ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) - else() - set(LLVM_LIT "${CMAKE_SOURCE_DIR}/utils/lit/lit.py") - endif() -@@ -85,8 +153,6 @@ set(LIBUNWIND_TEST_COMPILER_FLAGS "" CACHE STRING - "Additional compiler flags for test programs.") - set(LIBUNWIND_TEST_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/test/lit.site.cfg.in" CACHE STRING - "The Lit testing configuration to use when running the tests.") --set(LIBUNWIND_TEST_PARAMS "" CACHE STRING -- "A list of parameters to run the Lit test suite with.") - - if (NOT LIBUNWIND_ENABLE_SHARED AND NOT LIBUNWIND_ENABLE_STATIC) - message(FATAL_ERROR "libunwind must be built as either a shared or static library.") -@@ -113,6 +179,9 @@ set(CMAKE_MODULE_PATH - "${CMAKE_CURRENT_SOURCE_DIR}/cmake" - ${CMAKE_MODULE_PATH}) - -+set(LIBUNWIND_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -+set(LIBUNWIND_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -+ - if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE) - set(LIBUNWIND_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/c++) - set(LIBUNWIND_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/c++) diff --git a/deps/terminfo.mk b/deps/terminfo.mk index 63194f786f566..60865838a813e 100644 --- a/deps/terminfo.mk +++ b/deps/terminfo.mk @@ -22,8 +22,8 @@ $(BUILDDIR)/TermInfoDB-v$(TERMINFO_VER)/build-checked: $(BUILDDIR)/TermInfoDB-v$ echo 1 > $@ define TERMINFO_INSTALL - mkdir -p $2/$$(build_datarootdir) - cp -R $1/terminfo $2/$$(build_datarootdir) + mkdir -p $2/$$(build_datarootdir)/julia + cp -R $1/terminfo $2/$$(build_datarootdir)/julia/ endef $(eval $(call staged-install, \ terminfo,TermInfoDB-v$(TERMINFO_VER), \ diff --git a/deps/unwind.mk b/deps/unwind.mk index 079e4d69b04a3..3951bbf36e22f 100644 --- a/deps/unwind.mk +++ b/deps/unwind.mk @@ -88,40 +88,41 @@ check-unwind: $(BUILDDIR)/libunwind-$(UNWIND_VER)/build-checked LLVMUNWIND_OPTS := $(CMAKE_COMMON) \ -DCMAKE_BUILD_TYPE=MinSizeRel \ -DLIBUNWIND_ENABLE_PEDANTIC=OFF \ - -DLLVM_PATH=$(SRCCACHE)/$(LLVM_SRC_DIR)/llvm + -DLIBUNWIND_INCLUDE_DOCS=OFF \ + -DLIBUNWIND_INCLUDE_TESTS=OFF \ + -DLIBUNWIND_INSTALL_HEADERS=ON \ + -DLIBUNWIND_ENABLE_ASSERTIONS=OFF \ + -DLLVM_CONFIG_PATH=$(build_depsbindir)/llvm-config \ + -DLLVM_PATH=$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/llvm -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER).tar.xz: | $(SRCCACHE) - $(JLDOWNLOAD) $@ https://github.com/llvm/llvm-project/releases/download/llvmorg-$(LLVMUNWIND_VER)/libunwind-$(LLVMUNWIND_VER).src.tar.xz +$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).tar.xz: | $(SRCCACHE) + $(JLDOWNLOAD) $@ https://github.com/llvm/llvm-project/releases/download/llvmorg-$(LLVMUNWIND_VER)/llvm-project-$(LLVMUNWIND_VER).src.tar.xz -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/source-extracted: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER).tar.xz +$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/source-extracted: 
$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).tar.xz $(JLCHECKSUM) $< cd $(dir $<) && $(TAR) xf $< - mv $(SRCCACHE)/libunwind-$(LLVMUNWIND_VER).src $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) + mv $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).src $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER) echo 1 > $@ -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-prologue-epilogue.patch-applied: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/source-extracted - cd $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-prologue-epilogue.patch +$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-prologue-epilogue.patch-applied: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/source-extracted + cd $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-prologue-epilogue.patch echo 1 > $@ -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-force-dwarf.patch-applied: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-prologue-epilogue.patch-applied - cd $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-force-dwarf.patch +$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-force-dwarf.patch-applied: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-prologue-epilogue.patch-applied + cd $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-force-dwarf.patch echo 1 > $@ -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-revert-monorepo-requirement.patch-applied: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-force-dwarf.patch-applied - cd $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-revert-monorepo-requirement.patch +$(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-freebsd-libgcc-api-compat.patch-applied: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-force-dwarf.patch-applied + cd $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-freebsd-libgcc-api-compat.patch echo 1 > $@ -$(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-freebsd-libgcc-api-compat.patch-applied: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-revert-monorepo-requirement.patch-applied - cd $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) && patch -p2 -f < $(SRCDIR)/patches/llvm-libunwind-freebsd-libgcc-api-compat.patch - echo 1 > $@ - -checksum-llvmunwind: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER).tar.xz +checksum-llvmunwind: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).tar.xz $(JLCHECKSUM) $< -$(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-configured: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/source-extracted $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/llvm-libunwind-freebsd-libgcc-api-compat.patch-applied +$(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-configured: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/source-extracted $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/llvm-libunwind-freebsd-libgcc-api-compat.patch-applied mkdir -p $(dir $@) cd $(dir $@) && \ - $(CMAKE) $(dir $<) $(LLVMUNWIND_OPTS) + $(CMAKE) $(dir $<)/libunwind $(LLVMUNWIND_OPTS) echo 1 > $@ $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-compiled: $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-configured @@ -131,7 +132,7 @@ $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-compiled: $(BUILDDIR)/llvmunwind- $(eval $(call staged-install, \ llvmunwind,llvmunwind-$(LLVMUNWIND_VER), \ MAKE_INSTALL,,, \ - 
cp -fR $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/include/* $(build_includedir))) + cp -fR $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/libunwind/* $(build_includedir))) clean-llvmunwind: -rm -f $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-configured $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-compiled @@ -139,14 +140,14 @@ clean-llvmunwind: -$(MAKE) -C $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER) clean distclean-llvmunwind: - rm -rf $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER).tar.xz \ + rm -rf $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).tar.xz \ $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER) \ $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER) -get-llvmunwind: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER).tar.xz -extract-llvmunwind: $(SRCCACHE)/llvmunwind-$(LLVMUNWIND_VER)/source-extracted -configure-llvmunwind: $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-configured -compile-llvmunwind: $(BUILDDIR)/llvmunwind-$(LLVMUNWIND_VER)/build-compiled +get-llvmunwind: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER).tar.xz +extract-llvmunwind: $(SRCCACHE)/llvm-project-$(LLVMUNWIND_VER)/source-extracted +configure-llvmunwind: $(BUILDDIR)/llvm-project-$(LLVMUNWIND_VER)/build-configured +compile-llvmunwind: $(BUILDDIR)/llvm-project-$(LLVMUNWIND_VER)/build-compiled fastcheck-llvmunwind: check-llvmunwind check-llvmunwind: # no test/check provided by Makefile diff --git a/doc/man/julia.1 b/doc/man/julia.1 index ebac4362b39a6..56cb690d66eeb 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -283,12 +283,17 @@ Generate an incremental output file (rather than complete) .TP --trace-compile={stderr|name} -Print precompile statements for methods compiled during execution or save to a path +Print precompile statements for methods compiled during execution or save to stderr or a path. +Methods that were recompiled are printed in yellow or with a trailing comment if color is not supported .TP --trace-compile-timing= If --trace-compile is enabled show how long each took to compile in ms +.TP +--trace-dispatch={stderr|name} +Print precompile statements for methods dispatched during execution or save to stderr or a path. + .TP -image-codegen Force generate code in imaging mode diff --git a/doc/src/base/file.md b/doc/src/base/file.md index 22799f882bb26..300738a39322d 100644 --- a/doc/src/base/file.md +++ b/doc/src/base/file.md @@ -29,6 +29,7 @@ Base.Filesystem.operm Base.Filesystem.cp Base.download Base.Filesystem.mv +Base.Filesystem.rename Base.Filesystem.rm Base.Filesystem.touch Base.Filesystem.tempname diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index b7d16ffc7d487..a9637a1a7be3a 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -17,11 +17,6 @@ Core.String(::AbstractString) Base.SubString Base.LazyString Base.@lazy_str -Base.AnnotatedString -Base.AnnotatedChar -Base.annotatedstring -Base.annotations -Base.annotate! Base.transcode Base.unsafe_string Base.ncodeunits(::AbstractString) @@ -101,3 +96,17 @@ Base.escape_string Base.escape_raw_string Base.unescape_string ``` + +## `AnnotatedString`s + +!!! note + The API for AnnotatedStrings is considered experimental and is subject to change between + Julia versions. + +```@docs +Base.AnnotatedString +Base.AnnotatedChar +Base.annotatedstring +Base.annotations +Base.annotate! 
+``` diff --git a/doc/src/devdocs/build/distributing.md b/doc/src/devdocs/build/distributing.md index 99c08923b415b..ed06c20fa0df3 100644 --- a/doc/src/devdocs/build/distributing.md +++ b/doc/src/devdocs/build/distributing.md @@ -108,7 +108,7 @@ Alternatively, Julia may be built as a framework by invoking `make` with the Windows ------- -Instructions for reating a Julia distribution on Windows are described in the +Instructions for creating a Julia distribution on Windows are described in the [build devdocs for Windows](https://github.com/JuliaLang/julia/blob/master/doc/src/devdocs/build/windows.md). Notes on BLAS and LAPACK diff --git a/doc/src/devdocs/gc.md b/doc/src/devdocs/gc.md index 9b9038c9445f3..a45e8afb271ce 100644 --- a/doc/src/devdocs/gc.md +++ b/doc/src/devdocs/gc.md @@ -21,7 +21,7 @@ lists. Metadata for free pages, however, may be stored into three separate lock- Julia's pool allocator follows a "tiered" allocation discipline. When requesting a memory page for the pool allocator, Julia will: -- Try to claim a page from `page_pool_lazily_freed`, which contains pages which were empty on the last stop-the-world phase, but not yet madivsed by a concurrent sweeper GC thread. +- Try to claim a page from `page_pool_lazily_freed`, which contains pages which were empty on the last stop-the-world phase, but not yet madvised by a concurrent sweeper GC thread. - If it failed claiming a page from `page_pool_lazily_freed`, it will try to claim a page from `the page_pool_clean`, which contains pages which were mmaped on a previous page allocation request but never accessed. diff --git a/doc/src/devdocs/llvm.md b/doc/src/devdocs/llvm.md index ab8f7dde50022..c4b80f632cd4e 100644 --- a/doc/src/devdocs/llvm.md +++ b/doc/src/devdocs/llvm.md @@ -17,7 +17,7 @@ The code for lowering Julia AST to LLVM IR or interpreting it directly is in dir | `cgutils.cpp` | Lowering utilities, notably for array and tuple accesses | | `codegen.cpp` | Top-level of code generation, pass list, lowering builtins | | `debuginfo.cpp` | Tracks debug information for JIT code | -| `disasm.cpp` | Handles native object file and JIT code diassembly | +| `disasm.cpp` | Handles native object file and JIT code disassembly | | `gf.c` | Generic functions | | `intrinsics.cpp` | Lowering intrinsics | | `jitlayers.cpp` | JIT-specific code, ORC compilation layers/utilities | diff --git a/doc/src/devdocs/sysimg.md b/doc/src/devdocs/sysimg.md index 7d4f7afdbb86a..64c309e1fb02a 100644 --- a/doc/src/devdocs/sysimg.md +++ b/doc/src/devdocs/sysimg.md @@ -117,3 +117,80 @@ See code comments for each components for more implementation details. depending on the ISA. The target selection will prefer exact CPU name match, larger vector register size, and larger number of features. An overview of this process is in `src/processor.cpp`. + +## Trimming + +System images are typically quite large, since Base includes a lot of functionality, and by +default system images also include several packages such as LinearAlgebra for convenience +and backwards compatibility. Most programs will use only a fraction of the functions in +these packages. Therefore it makes sense to build binaries that exclude unused functions +to save space, referred to as "trimming". + +While the basic idea of trimming is sound, Julia has dynamic and reflective features that make it +difficult (or impossible) to know in general which functions are unused. 
As an extreme example, +consider code like + +``` +getglobal(Base, Symbol(readchomp(stdin)))(1) +``` + +This code reads a function name from `stdin` and calls the named function from Base on the value +`1`. In this case it is impossible to predict which function will be called, so no functions +can reliably be considered "unused". With some noteworthy exceptions (Julia's own REPL being +one of them), most real-world programs do not do things like this. + +Less extreme cases occur, for example, when there are type instabilities that make it impossible +for the compiler to predict which method will be called. However, if code is well-typed and does +not use reflection, a complete and (hopefully) relatively small set of needed methods can be +determined, and the rest can be removed. The `--trim` command-line option requests this kind of +compilation. + +When `--trim` is specified in a command used to build a system image, the compiler begins +tracing calls starting at methods marked using `Base.Experimental.entrypoint`. If a call is too +dynamic to reasonably narrow down the possible call targets, an error is given at compile +time showing the location of the call. For testing purposes, it is possible to skip these +errors by specifying `--trim=unsafe` or `--trim=unsafe-warn`. Then you will get a system +image built, but it may crash at run time if needed code is not present. + +It typically makes sense to specify `--strip-ir` along with `--trim`, since trimmed binaries +are fully compiled and therefore don't need Julia IR. At some point we may make `--trim` imply +`--strip-ir`, but for now we have kept them orthogonal. + +To get the smallest possible binary, it will also help to specify `--strip-metadata` and +run the Unix `strip` utility. However, those steps remove Julia-specific and native (DWARF format) +debug info, respectively, and so will make debugging more difficult. + +### Common problems + +- The Base global variables `stdin`, `stdout`, and `stderr` are non-constant and so their + types are not known. All printing should use a specific IO object with a known type. + The easiest substitution is to use `print(Core.stdout, x)` instead of `print(x)` or + `print(stdout, x)`. +- Use tools like `JET`, `Cthulhu`, and/or `SnoopCompile` to identify failures of type-inference, and + follow our [Performance Tips](@ref) to fix them. + +### Compatibility concerns + +We have identified many small changes to Base that significantly increase the set of programs +that can be reliably trimmed. Unfortunately some of those changes would be considered breaking, +and so are only applied when trimming is requested (this is done by an external build script, +currently maintained inside the test suite as `test/trimming/buildscript.jl`). +Therefore in many cases trimming will require you to opt in to new variants of Base and some +standard libraries. + +If you want to use trimming, it is important to set up continuous integration testing that +performs a trimmed build and fully tests the resulting program. +Fortunately, if your program successfully compiles with `--trim` then it is very likely to work +the same as it did before. However, CI is needed to ensure that your program continues to build +with trimming as you develop it. + +Package authors may wish to test that their package is "trimming safe", however this is impossible +in general. Trimming is only expected to work given concrete entry points such as `main()` and +library entry points meant to be called from outside Julia. 
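+As a very rough illustration, a library meant to be trimmed might mark its externally callable surface
+explicitly so that `--trim` keeps it and everything it provably calls. This is only a minimal sketch:
+the `MyLib` and `compute` names are invented for the example, the two-argument
+`Base.Experimental.entrypoint(f, argtypes)` form is assumed, and the authoritative build steps are the
+ones in the external build script mentioned above.
+
+```julia
+module MyLib
+
+# A concrete, well-typed function intended to be called from outside Julia.
+compute(x::Int) = 2x + 1
+
+# Mark it as an entry point so that `--trim` traces reachable code starting from it.
+Base.Experimental.entrypoint(compute, (Int,))
+
+end # module
+```
+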
For generic packages, existing tests +for type stability like `@inferred` and `JET` are about as close as you can get to checking +trim compatibility. + +Trimming also introduces new compatibility issues between minor versions of Julia. At this time, +we are not able to guarantee that a program that can be trimmed in one version of Julia +can also be trimmed in all future versions of Julia. However, breakage of that kind is expected +to be rare. We also plan to try to *increase* the set of programs that can be trimmed over time. diff --git a/doc/src/manual/calling-c-and-fortran-code.md b/doc/src/manual/calling-c-and-fortran-code.md index 6f4d69b16bc81..b8d064f698208 100644 --- a/doc/src/manual/calling-c-and-fortran-code.md +++ b/doc/src/manual/calling-c-and-fortran-code.md @@ -996,7 +996,7 @@ A table of translations between the macro and function interfaces is given below |------------------------------------------------------------------------------|-----------------------------------------------------------------------------| | `@ccall clock()::Int32` | `ccall(:clock, Int32, ())` | | `@ccall f(a::Cint)::Cint` | `ccall(:a, Cint, (Cint,), a)` | -| `@ccall "mylib".f(a::Cint, b::Cdouble)::Cvoid` | `ccall((:f, "mylib"), Cvoid, (Cint, Cdouble), (a, b))` | +| `@ccall "mylib".f(a::Cint, b::Cdouble)::Cvoid` | `ccall((:f, "mylib"), Cvoid, (Cint, Cdouble), a, b)` | | `@ccall $fptr.f()::Cvoid` | `ccall(fptr, f, Cvoid, ())` | | `@ccall printf("%s = %d\n"::Cstring ; "foo"::Cstring, foo::Cint)::Cint` | `` | | `@ccall printf("%s = %s\n"::Cstring ; "2 + 2"::Cstring, "5"::Cstring)::Cint` | `ccall(:printf, Cint, (Cstring, Cstring...), "%s = %s\n", "2 + 2", "5")` | diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index d1ed576c42a4f..734d7031db5e8 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -214,11 +214,12 @@ The following is a complete list of command-line switches available when launchi |`--output-bc ` |Generate LLVM bitcode (.bc)| |`--output-asm ` |Generate an assembly file (.s)| |`--output-incremental={yes\|no*}` |Generate an incremental output file (rather than complete)| -|`--trace-compile={stderr\|name}` |Print precompile statements for methods compiled during execution or save to a path| +|`--trace-compile={stderr\|name}` |Print precompile statements for methods compiled during execution or save to stderr or a path. Methods that were recompiled are printed in yellow or with a trailing comment if color is not supported| |`--trace-compile-timing` |If --trace-compile is enabled show how long each took to compile in ms| +|`--trace-dispatch={stderr\|name}` |Print precompile statements for methods dispatched during execution or save to stderr or a path.| |`--image-codegen` |Force generate code in imaging mode| |`--permalloc-pkgimg={yes\|no*}` |Copy the data section of package images into memory| - +|`--trim={no*\|safe\|unsafe\|unsafe-warn}` |Build a sysimage including only code provably reachable from methods marked by calling `entrypoint`. The three non-default options differ in how they handle dynamic call sites. In safe mode, such sites result in compile-time errors. In unsafe mode, such sites are allowed but the resulting binary might be missing needed code and can throw runtime errors. With unsafe-warn, such sites will trigger warnings at compile-time and might error at runtime.| !!! 
compat "Julia 1.1" In Julia 1.0, the default `--project=@.` option did not search up from the root diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index 30f2263904f40..abbf9a842f288 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -144,7 +144,7 @@ files, artifacts, etc. For example, to switch the user depot to `/foo/bar` just ```sh export JULIA_DEPOT_PATH="/foo/bar:" ``` -All package operations, like cloning registrise or installing packages, will now write to +All package operations, like cloning registries or installing packages, will now write to `/foo/bar`, but since the empty entry is expanded to the default system depot, any bundled resources will still be available. If you really only want to use the depot at `/foo/bar`, and not load any bundled resources, simply set the environment variable to `/foo/bar` @@ -328,16 +328,25 @@ a master process to establish a connection before dying. ### [`JULIA_NUM_THREADS`](@id JULIA_NUM_THREADS) -An unsigned 64-bit integer (`uint64_t`) that sets the maximum number of threads -available to Julia. If `$JULIA_NUM_THREADS` is not positive or is not set, or -if the number of CPU threads cannot be determined through system calls, then the -number of threads is set to `1`. +An unsigned 64-bit integer (`uint64_t`) or string that sets the maximum number +of threads available to Julia. If `$JULIA_NUM_THREADS` is not set or is a +non-positive integer, or if the number of CPU threads cannot be determined +through system calls, then the number of threads is set to `1`. If `$JULIA_NUM_THREADS` is set to `auto`, then the number of threads will be set -to the number of CPU threads. +to the number of CPU threads. It can also be set to a comma-separated string to +specify the size of the `:default` and `:interactive` [threadpools](@ref +man-threadpools), respectively: +```bash +# 5 threads in the :default pool and 2 in the :interactive pool +export JULIA_NUM_THREADS=5,2 + +# `auto` threads in the :default pool and 1 in the :interactive pool +export JULIA_NUM_THREADS=auto,1 +``` !!! note - `JULIA_NUM_THREADS` must be defined before starting julia; defining it in + `JULIA_NUM_THREADS` must be defined before starting Julia; defining it in `startup.jl` is too late in the startup process. !!! compat "Julia 1.5" @@ -347,6 +356,9 @@ to the number of CPU threads. !!! compat "Julia 1.7" The `auto` value for `$JULIA_NUM_THREADS` requires Julia 1.7 or above. +!!! compat "Julia 1.9" + The `x,y` format for threadpools requires Julia 1.9 or above. + ### [`JULIA_THREAD_SLEEP_THRESHOLD`](@id JULIA_THREAD_SLEEP_THRESHOLD) If set to a string that starts with the case-insensitive substring `"infinite"`, @@ -380,6 +392,23 @@ If set to anything besides `0`, then Julia's thread policy is consistent with running on a dedicated machine: the master thread is on proc 0, and threads are affinitized. Otherwise, Julia lets the operating system handle thread policy. +## Garbage Collection + +### [`JULIA_HEAP_SIZE_HINT`](@id JULIA_HEAP_SIZE_HINT) + +Environment variable equivalent to the `--heap-size-hint` command line option. + +Forces garbage collection if memory usage is higher than the given value. The value may be specified as a number of bytes, optionally in units of: + + - b (bytes) + - k (kibibytes) + - m (mebibytes) + - g (gibibytes) + - t (tebibytes) + - % (percentage of physical memory) + +which are not case sensitive. 
For example, `JULIA_HEAP_SIZE_HINT=1G` would provide a 1 GB heap size hint to the garbage collector. + ## REPL formatting Environment variables that determine how REPL output should be formatted at the diff --git a/doc/src/manual/functions.md b/doc/src/manual/functions.md index 9a91ea7467750..be81fe529ef7d 100644 --- a/doc/src/manual/functions.md +++ b/doc/src/manual/functions.md @@ -292,12 +292,12 @@ syntaxes: ```jldoctest julia> x -> x^2 + 2x - 1 -#1 (generic function with 1 method) +#2 (generic function with 1 method) julia> function (x) x^2 + 2x - 1 end -#3 (generic function with 1 method) +#5 (generic function with 1 method) ``` Each statement creates a function taking one argument `x` and returning the value of the polynomial `x^2 + diff --git a/doc/src/manual/noteworthy-differences.md b/doc/src/manual/noteworthy-differences.md index 181fe0a30eb38..33285bde8a066 100644 --- a/doc/src/manual/noteworthy-differences.md +++ b/doc/src/manual/noteworthy-differences.md @@ -220,8 +220,8 @@ For users coming to Julia from R, these are some noteworthy differences: * Unlike Python, Julia allows [AbstractArrays with arbitrary indexes](https://julialang.org/blog/2017/04/offset-arrays/). Python's special interpretation of negative indexing, `a[-1]` and `a[-2]`, should be written `a[end]` and `a[end-1]` in Julia. - * Julia requires `end` for indexing until the last element. `x[1:]` in Python is equivalent to `x[2:end]` in Julia. - * In Julia, `:` before any object creates a [`Symbol`](@ref) or *quotes* an expression; so, `x[:5]` is same as `x[5]`. If you want to get the first `n` elements of an array, then use range indexing. + * Julia requires `end` for indexing until the last element. `x[2:end]` in Julia is equivalent to `x[1:]` in Python. + * In Julia, `:` before any object creates a [`Symbol`](@ref) or *quotes* an expression; so, `x[:5]` is the same as `x[5]`. If you want to get the first `n` elements of an array, then use range indexing. * Julia's range indexing has the format of `x[start:step:stop]`, whereas Python's format is `x[start:(stop+1):step]`. Hence, `x[0:10:2]` in Python is equivalent to `x[1:2:10]` in Julia. Similarly, `x[::-1]` in Python, which refers to the reversed array, is equivalent to `x[end:-1:1]` in Julia. * In Julia, ranges can be constructed independently as `start:step:stop`, the same syntax it uses in array-indexing. The `range` function is also supported. diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md index 38e27476f0af8..3033720b5df8c 100644 --- a/doc/src/manual/performance-tips.md +++ b/doc/src/manual/performance-tips.md @@ -1394,6 +1394,125 @@ Prominent examples include [MKL.jl](https://github.com/JuliaLinearAlgebra/MKL.jl These are external packages, so we will not discuss them in detail here. Please refer to their respective documentations (especially because they have different behaviors than OpenBLAS with respect to multithreading). +## Execution latency, package loading and package precompiling time + +### Reducing time to first plot etc. + +The first time a julia method is called it (and any methods it calls, or ones that can be statically determined) will be +compiled. The [`@time`](@ref) macro family illustrates this. 
+ +``` +julia> foo() = rand(2,2) * rand(2,2) +foo (generic function with 1 method) + +julia> @time @eval foo(); + 0.252395 seconds (1.12 M allocations: 56.178 MiB, 2.93% gc time, 98.12% compilation time) + +julia> @time @eval foo(); + 0.000156 seconds (63 allocations: 2.453 KiB) +``` + +Note that `@time @eval` is better for measuring compilation time because without [`@eval`](@ref), some compilation may +already be done before timing starts. + +When developing a package, you may be able to improve the experience of your users with *precompilation* +so that when they use the package, the code they use is already compiled. To precompile package code effectively, it's +recommended to use [`PrecompileTools.jl`](https://julialang.github.io/PrecompileTools.jl/stable/) to run a +"precompile workload" during precompilation time that is representative of typical package usage, which will cache the +native compiled code into the package `pkgimage` cache, greatly reducing "time to first execution" (often referred to as +TTFX) for such usage. + +Note that [`PrecompileTools.jl`](https://julialang.github.io/PrecompileTools.jl/stable/) workloads can be +disabled and sometimes configured via Preferences if you do not want to spend the extra time precompiling, which +may be the case during development of a package. + +### Reducing package loading time + +Keeping the time taken to load the package down is usually helpful. +General good practice for package developers includes: + +1. Reduce your dependencies to those you really need. Consider using [package extensions](@ref) to support interoperability with other packages without bloating your essential dependencies. +3. Avoid use of [`__init__()`](@ref) functions unless there is no alternative, especially those which might trigger a lot + of compilation, or just take a long time to execute. +4. Where possible, fix [invalidations](https://julialang.org/blog/2020/08/invalidations/) among your dependencies and from your package code. + +The tool [`@time_imports`](@ref) can be useful in the REPL to review the above factors. + +```julia-repl +julia> @time @time_imports using Plots + 0.5 ms Printf + 16.4 ms Dates + 0.7 ms Statistics + ┌ 23.8 ms SuiteSparse_jll.__init__() 86.11% compilation time (100% recompilation) + 90.1 ms SuiteSparse_jll 91.57% compilation time (82% recompilation) + 0.9 ms Serialization + ┌ 39.8 ms SparseArrays.CHOLMOD.__init__() 99.47% compilation time (100% recompilation) + 166.9 ms SparseArrays 23.74% compilation time (100% recompilation) + 0.4 ms Statistics → SparseArraysExt + 0.5 ms TOML + 8.0 ms Preferences + 0.3 ms PrecompileTools + 0.2 ms Reexport +... many deps omitted for example ... + 1.4 ms Tar + ┌ 73.8 ms p7zip_jll.__init__() 99.93% compilation time (100% recompilation) + 79.4 ms p7zip_jll 92.91% compilation time (100% recompilation) + ┌ 27.7 ms GR.GRPreferences.__init__() 99.77% compilation time (100% recompilation) + 43.0 ms GR 64.26% compilation time (100% recompilation) + ┌ 2.1 ms Plots.__init__() 91.80% compilation time (100% recompilation) + 300.9 ms Plots 0.65% compilation time (100% recompilation) + 1.795602 seconds (3.33 M allocations: 190.153 MiB, 7.91% gc time, 39.45% compilation time: 97% of which was recompilation) + +``` + +Notice that in this example there are multiple packages loaded, some with `__init__()` functions, some of which cause +compilation of which some is recompilation. 
Recompilation is caused by earlier packages invalidating methods; when the later packages then run their +`__init__()` functions, some of those invalidated methods must be recompiled before the code can run. + +Further, note that the `Statistics` extension `SparseArraysExt` has been activated because `SparseArrays` is in the dependency +tree, i.e. see the `0.4 ms Statistics → SparseArraysExt` line above. + +This report gives a good opportunity to review whether the cost of dependency load time is worth the functionality it brings. +Also the `Pkg` utility `why` can be used to report why an indirect dependency exists. + +``` +(CustomPackage) pkg> why FFMPEG_jll + Plots → FFMPEG → FFMPEG_jll + Plots → GR → GR_jll → FFMPEG_jll +``` + +Alternatively, to see the indirect dependencies that a package brings in, you can `pkg> rm` the package, see the dependencies that are removed +from the manifest, then revert the change with `pkg> undo`. + +If loading time is dominated by slow `__init__()` methods that trigger compilation, one verbose way to identify what is being +compiled is to use the Julia arguments `--trace-compile=stderr --trace-compile-timing`, which will report a [`precompile`](@ref) +statement each time a method is compiled, along with how long compilation took. The InteractiveUtils macro +[`@trace_compile`](@ref) provides a way to enable those arguments for a specific call. So a call producing a complete report would look like: + +``` +julia> @time @time_imports @trace_compile using CustomPackage +... +``` + +Starting Julia with `--startup-file=no` also helps isolate the test from packages you may have in your `startup.jl`. + +More analysis of the reasons for recompilation can be achieved with the +[`SnoopCompile`](https://github.com/timholy/SnoopCompile.jl) package. + +### Reducing precompilation time + +If package precompilation is taking a long time, one option is to set the following internal option and then precompile. +``` +julia> Base.PRECOMPILE_TRACE_COMPILE[] = "stderr" + +pkg> precompile +``` + +This has the effect of setting `--trace-compile=stderr --trace-compile-timing` in the precompilation processes themselves, +so it will show which methods are precompiled and how long they took to precompile. + +There are also profiling options such as [using the external profiler Tracy to profile the precompilation process](@ref Profiling-package-precompilation-with-Tracy). + ## Miscellaneous @@ -1418,7 +1537,7 @@ be modified as suggested by the warnings. Sometimes you can enable better optimization by promising certain program properties. * Use [`@inbounds`](@ref) to eliminate array bounds checking within expressions. Be certain before doing - this. If the subscripts are ever out of bounds, you may suffer crashes or silent corruption. + this. If the indices are ever out of bounds, you may suffer crashes or silent corruption. * Use [`@fastmath`](@ref) to allow floating point optimizations that are correct for real numbers, but lead to differences for IEEE numbers. Be careful when doing this, as this may change numerical results. This corresponds to the `-ffast-math` option of clang. diff --git a/doc/src/manual/strings.md b/doc/src/manual/strings.md index 5ba27b3921cec..c04e5e6d6760e 100644 --- a/doc/src/manual/strings.md +++ b/doc/src/manual/strings.md @@ -1207,6 +1207,10 @@ last backslash escapes a quote, since these backslashes appear before a quote. ## [Annotated Strings](@id man-annotated-strings) +!!! note + The API for AnnotatedStrings is considered experimental and is subject to change between + Julia versions. 
+ It is sometimes useful to be able to hold metadata relating to regions of a string. A [`AnnotatedString`](@ref Base.AnnotatedString) wraps another string and allows for regions of it to be annotated with labelled values (`:label => value`). diff --git a/src/Makefile b/src/Makefile index 52e673aa6cc1a..a6b1f433b73ce 100644 --- a/src/Makefile +++ b/src/Makefile @@ -287,10 +287,10 @@ endif $(INSTALL_NAME_CMD)libccalltest.$(SHLIB_EXT) $@ $(build_shlibdir)/libccalllazyfoo.$(SHLIB_EXT): $(SRCDIR)/ccalllazyfoo.c - @$(call PRINT_CC, $(CC) $(JCFLAGS) $(JL_CFLAGS) $(JCPPFLAGS) $(FLAGS) -O3 $< $(fPIC) -shared -o $@ $(LDFLAGS) $(COMMON_LIBPATHS) $(call SONAME_FLAGS,ccalllazyfoo.$(SHLIB_EXT))) + @$(call PRINT_CC, $(CC) $(JCFLAGS) $(JL_CFLAGS) $(JCPPFLAGS) $(FLAGS) -O3 $< $(fPIC) -shared -o $@ $(LDFLAGS) $(COMMON_LIBPATHS) $(call SONAME_FLAGS,libccalllazyfoo.$(SHLIB_EXT))) $(build_shlibdir)/libccalllazybar.$(SHLIB_EXT): $(SRCDIR)/ccalllazybar.c $(build_shlibdir)/libccalllazyfoo.$(SHLIB_EXT) - @$(call PRINT_CC, $(CC) $(JCFLAGS) $(JL_CFLAGS) $(JCPPFLAGS) $(FLAGS) -O3 $< $(fPIC) -shared -o $@ $(LDFLAGS) $(COMMON_LIBPATHS) $(call SONAME_FLAGS,ccalllazybar.$(SHLIB_EXT)) -lccalllazyfoo) + @$(call PRINT_CC, $(CC) $(JCFLAGS) $(JL_CFLAGS) $(JCPPFLAGS) $(FLAGS) -O3 $< $(fPIC) -shared -o $@ $(LDFLAGS) $(COMMON_LIBPATHS) $(call SONAME_FLAGS,libccalllazybar.$(SHLIB_EXT)) -lccalllazyfoo) $(build_shlibdir)/libllvmcalltest.$(SHLIB_EXT): $(SRCDIR)/llvmcalltest.cpp $(LLVM_CONFIG_ABSOLUTE) @$(call PRINT_CC, $(CXX) $(LLVM_CXXFLAGS) $(FLAGS) $(CPPFLAGS) $(CXXFLAGS) -O3 $< $(fPIC) -shared -o $@ $(LDFLAGS) $(COMMON_LIBPATHS) $(NO_WHOLE_ARCHIVE) $(CG_LLVMLINK)) -lpthread diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index b4c8ef6095a55..c2f112f9c9d5c 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -95,6 +95,17 @@ void jl_get_function_id_impl(void *native_code, jl_code_instance_t *codeinst, } } +extern "C" JL_DLLEXPORT_CODEGEN +void jl_get_llvm_mis_impl(void *native_code, arraylist_t* MIs) +{ + jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; + auto map = data->jl_fvar_map; + for (auto &ci : map) { + jl_method_instance_t *mi = ci.first->def; + arraylist_push(MIs, mi); + } +} + extern "C" JL_DLLEXPORT_CODEGEN void jl_get_llvm_gvs_impl(void *native_code, arraylist_t *gvs) { @@ -284,6 +295,7 @@ jl_code_instance_t *jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_ jl_value_t *ci = cgparams.lookup(mi, world, world); JL_GC_PROMISE_ROOTED(ci); jl_code_instance_t *codeinst = NULL; + JL_GC_PUSH1(&codeinst); if (ci != jl_nothing && jl_atomic_load_relaxed(&((jl_code_instance_t *)ci)->inferred) != jl_nothing) { codeinst = (jl_code_instance_t*)ci; } @@ -301,9 +313,11 @@ jl_code_instance_t *jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_ jl_mi_cache_insert(mi, codeinst); } } + JL_GC_POP(); return codeinst; } +arraylist_t new_invokes; // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup, and can // also be used be extern consumers like GPUCompiler.jl to obtain a module containing @@ -353,8 +367,12 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm params.imaging_mode = imaging; params.debug_level = cgparams->debug_info_level; params.external_linkage = _external_linkage; + arraylist_new(&new_invokes, 0); size_t compile_for[] = { jl_typeinf_world, _world }; - for (int worlds = 0; worlds < 2; worlds++) { + int worlds = 0; + if 
(jl_options.trim != JL_TRIM_NO) + worlds = 1; + for (; worlds < 2; worlds++) { JL_TIMING(NATIVE_AOT, NATIVE_Codegen); size_t this_world = compile_for[worlds]; if (!this_world) @@ -373,6 +391,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm continue; } mi = (jl_method_instance_t*)item; +compile_mi: src = NULL; // if this method is generally visible to the current compilation world, // and this is either the primary world, or not applicable in the primary world @@ -380,16 +399,47 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm if (jl_atomic_load_relaxed(&mi->def.method->primary_world) <= this_world && this_world <= jl_atomic_load_relaxed(&mi->def.method->deleted_world)) { // find and prepare the source code to compile jl_code_instance_t *codeinst = jl_ci_cache_lookup(*cgparams, mi, this_world); - if (codeinst && !params.compiled_functions.count(codeinst)) { + if (jl_options.trim != JL_TRIM_NO && !codeinst) { + // If we're building a small image, we need to compile everything + // to ensure that we have all the information we need. + jl_safe_printf("Codegen decided not to compile code root"); + jl_(mi); + abort(); + } + if (codeinst && !params.compiled_functions.count(codeinst) && !data->jl_fvar_map.count(codeinst)) { // now add it to our compilation results - JL_GC_PROMISE_ROOTED(codeinst->rettype); - orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(codeinst->def), - params.tsctx, clone.getModuleUnlocked()->getDataLayout(), - Triple(clone.getModuleUnlocked()->getTargetTriple())); - jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, NULL, params); - if (result_m) - params.compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; + // Const returns do not do codegen, but juliac inspects codegen results so make a dummy fvar entry to represent it + if (jl_options.trim != JL_TRIM_NO && jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) { + data->jl_fvar_map[codeinst] = std::make_tuple((uint32_t)-3, (uint32_t)-3); + } else { + JL_GC_PROMISE_ROOTED(codeinst->rettype); + orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(codeinst->def), + params.tsctx, clone.getModuleUnlocked()->getDataLayout(), + Triple(clone.getModuleUnlocked()->getTargetTriple())); + jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, NULL, params); + if (result_m) + params.compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; + else if (jl_options.trim != JL_TRIM_NO) { + // if we're building a small image, we need to compile everything + // to ensure that we have all the information we need. + jl_safe_printf("codegen failed to compile code root"); + jl_(mi); + abort(); + } + } } + } else if (this_world != jl_typeinf_world) { + /* + jl_safe_printf("Codegen could not find requested codeinstance to be compiled\n"); + jl_(mi); + abort(); + */ + } + // TODO: is goto the best way to do this? 
+ jl_compile_workqueue(params, policy); + mi = (jl_method_instance_t*)arraylist_pop(&new_invokes); + if (mi != NULL) { + goto compile_mi; } } @@ -397,6 +447,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm jl_compile_workqueue(params, policy); } JL_GC_POP(); + arraylist_free(&new_invokes); // process the globals array, before jl_merge_module destroys them SmallVector gvars(params.global_targets.size()); diff --git a/src/ast.c b/src/ast.c index 26b95225fbf1c..ea1de429a946c 100644 --- a/src/ast.c +++ b/src/ast.c @@ -7,6 +7,7 @@ #include #include #include + #ifdef _OS_WINDOWS_ #include #endif @@ -215,11 +216,46 @@ static value_t fl_nothrow_julia_global(fl_context_t *fl_ctx, value_t *args, uint decode_restriction_value(pku) : jl_atomic_load_relaxed(&b->value)) != NULL ? fl_ctx->T : fl_ctx->F; } -static value_t fl_current_module_counter(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) JL_NOTSAFEPOINT +// Used to generate a unique suffix for a given symbol (e.g. variable or type name) +// first argument contains a stack of method definitions seen so far by `closure-convert` in flisp. +// if the top of the stack is non-NIL, we use it to augment the suffix so that it becomes +// of the form $top_level_method_name##$counter, where `counter` is the smallest integer +// such that the resulting name is not already defined in the current module's bindings. +// If the top of the stack is NIL, we simply return the current module's counter. +// This ensures that precompile statements are a bit more stable across different versions +// of a codebase. see #53719 +static value_t fl_module_unique_name(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) { + argcount(fl_ctx, "julia-module-unique-name", nargs, 1); jl_ast_context_t *ctx = jl_ast_ctx(fl_ctx); - assert(ctx->module); - return fixnum(jl_module_next_counter(ctx->module)); + jl_module_t *m = ctx->module; + assert(m != NULL); + // Get the outermost function name from the `parsed_method_stack` top + char *funcname = NULL; + value_t parsed_method_stack = args[0]; + if (parsed_method_stack != fl_ctx->NIL) { + value_t bottom_stack_symbol = fl_applyn(fl_ctx, 1, symbol_value(symbol(fl_ctx, "last")), parsed_method_stack); + funcname = tosymbol(fl_ctx, bottom_stack_symbol, "julia-module-unique-name")->name; + } + size_t sz = funcname != NULL ? 
strlen(funcname) + 32 : 32; // 32 is enough for the suffix + char *buf = (char*)alloca(sz); + if (funcname != NULL && strchr(funcname, '#') == NULL) { + for (int i = 0; ; i++) { + snprintf(buf, sz, "%s##%d", funcname, i); + jl_sym_t *sym = jl_symbol(buf); + JL_LOCK(&m->lock); + if (jl_get_module_binding(m, sym, 0) == NULL) { // make sure this name is not already taken + jl_get_module_binding(m, sym, 1); // create the binding + JL_UNLOCK(&m->lock); + return symbol(fl_ctx, buf); + } + JL_UNLOCK(&m->lock); + } + } + else { + snprintf(buf, sz, "%d", jl_module_next_counter(m)); + } + return symbol(fl_ctx, buf); } static int jl_is_number(jl_value_t *v) @@ -252,7 +288,7 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m static const builtinspec_t julia_flisp_ast_ext[] = { { "defined-julia-global", fl_defined_julia_global }, // TODO: can we kill this safepoint { "nothrow-julia-global", fl_nothrow_julia_global }, - { "current-julia-module-counter", fl_current_module_counter }, + { "current-julia-module-counter", fl_module_unique_name }, { "julia-scalar?", fl_julia_scalar }, { NULL, NULL } }; diff --git a/src/builtin_proto.h b/src/builtin_proto.h index 8b97c46df72da..7fbd555758675 100644 --- a/src/builtin_proto.h +++ b/src/builtin_proto.h @@ -69,6 +69,7 @@ DECLARE_BUILTIN(svec); DECLARE_BUILTIN(swapfield); DECLARE_BUILTIN(swapglobal); DECLARE_BUILTIN(throw); +DECLARE_BUILTIN(throw_methoderror); DECLARE_BUILTIN(tuple); DECLARE_BUILTIN(typeassert); DECLARE_BUILTIN(typeof); diff --git a/src/builtins.c b/src/builtins.c index 8019ee3c0e2c6..96c4cec0f5087 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -580,6 +580,14 @@ JL_CALLABLE(jl_f_throw) return jl_nothing; } +JL_CALLABLE(jl_f_throw_methoderror) +{ + JL_NARGSV(throw_methoderror, 1); + size_t world = jl_get_tls_world_age(); + jl_method_error(args[0], &args[1], nargs, world); + return jl_nothing; +} + JL_CALLABLE(jl_f_ifelse) { JL_NARGS(ifelse, 3, 3); @@ -2197,6 +2205,9 @@ static int equiv_type(jl_value_t *ta, jl_value_t *tb) JL_GC_PUSH2(&a, &b); a = jl_rewrap_unionall((jl_value_t*)dta->super, dta->name->wrapper); b = jl_rewrap_unionall((jl_value_t*)dtb->super, dtb->name->wrapper); + // if tb recursively refers to itself in its supertype, assume that it refers to ta + // before checking whether the supertypes are equal + b = jl_substitute_datatype(b, dtb, dta); if (!jl_types_equal(a, b)) goto no; JL_TRY { @@ -2433,7 +2444,8 @@ void jl_init_primitives(void) JL_GC_DISABLED add_builtin_func("finalizer", jl_f_finalizer); add_builtin_func("_compute_sparams", jl_f__compute_sparams); add_builtin_func("_svec_ref", jl_f__svec_ref); - add_builtin_func("current_scope", jl_f_current_scope); + jl_builtin_current_scope = add_builtin_func("current_scope", jl_f_current_scope); + add_builtin_func("throw_methoderror", jl_f_throw_methoderror); // builtin types add_builtin("Any", (jl_value_t*)jl_any_type); diff --git a/src/ccall.cpp b/src/ccall.cpp index 36808e13fdbf9..2de5be6906e7c 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -22,6 +22,8 @@ TRANSFORMED_CCALL_STAT(jl_cpu_wake); TRANSFORMED_CCALL_STAT(jl_gc_safepoint); TRANSFORMED_CCALL_STAT(jl_get_ptls_states); TRANSFORMED_CCALL_STAT(jl_threadid); +TRANSFORMED_CCALL_STAT(jl_get_ptls_rng); +TRANSFORMED_CCALL_STAT(jl_set_ptls_rng); TRANSFORMED_CCALL_STAT(jl_get_tls_world_age); TRANSFORMED_CCALL_STAT(jl_get_world_counter); TRANSFORMED_CCALL_STAT(jl_gc_enable_disable_finalizers_internal); @@ -439,24 +441,13 @@ static Value *llvm_type_rewrite( // we need to use this alloca copy trick 
instead // On ARM and AArch64, the ABI requires casting through memory to different // sizes. - Value *from; - Value *to; const DataLayout &DL = ctx.builder.GetInsertBlock()->getModule()->getDataLayout(); Align align = std::max(DL.getPrefTypeAlign(target_type), DL.getPrefTypeAlign(from_type)); - if (DL.getTypeAllocSize(target_type) >= DL.getTypeAllocSize(from_type)) { - to = emit_static_alloca(ctx, target_type); - setName(ctx.emission_context, to, "type_rewrite_buffer"); - cast(to)->setAlignment(align); - from = to; - } - else { - from = emit_static_alloca(ctx, from_type); - setName(ctx.emission_context, from, "type_rewrite_buffer"); - cast(from)->setAlignment(align); - to = from; - } - ctx.builder.CreateAlignedStore(v, from, align); - auto pun = ctx.builder.CreateAlignedLoad(target_type, to, align); + size_t nb = std::max(DL.getTypeAllocSize(target_type), DL.getTypeAllocSize(from_type)); + AllocaInst *cast = emit_static_alloca(ctx, nb, align); + setName(ctx.emission_context, cast, "type_rewrite_buffer"); + ctx.builder.CreateAlignedStore(v, cast, align); + auto pun = ctx.builder.CreateAlignedLoad(target_type, cast, align); setName(ctx.emission_context, pun, "type_rewrite"); return pun; } @@ -494,7 +485,7 @@ static const std::string make_errmsg(const char *fname, int n, const char *err) return msg.str(); } -static void typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_value_t *jlto, jl_unionall_t *jlto_env, int argn) +static jl_cgval_t typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_value_t *jlto, jl_unionall_t *jlto_env, int argn) { if (jlto != (jl_value_t*)jl_any_type && !jl_subtype(jvinfo.typ, jlto)) { if (jlto == (jl_value_t*)jl_voidpointer_type) { @@ -502,6 +493,7 @@ static void typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_val if (!jl_is_cpointer_type(jvinfo.typ)) { // emit a typecheck, if not statically known to be correct emit_cpointercheck(ctx, jvinfo, make_errmsg("ccall", argn + 1, "")); + return update_julia_type(ctx, jvinfo, (jl_value_t*)jl_pointer_type); } } else { @@ -526,8 +518,10 @@ static void typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_val ctx.builder.CreateUnreachable(); ctx.builder.SetInsertPoint(passBB); } + return update_julia_type(ctx, jvinfo, jlto); } } + return jvinfo; } // Emit code to convert argument to form expected by C ABI @@ -537,7 +531,7 @@ static void typeassert_input(jl_codectx_t &ctx, const jl_cgval_t &jvinfo, jl_val static Value *julia_to_native( jl_codectx_t &ctx, Type *to, bool toboxed, jl_value_t *jlto, jl_unionall_t *jlto_env, - const jl_cgval_t &jvinfo, + jl_cgval_t jvinfo, bool byRef, int argn) { // We're passing Any @@ -547,24 +541,16 @@ static Value *julia_to_native( } assert(jl_is_datatype(jlto) && jl_struct_try_layout((jl_datatype_t*)jlto)); - typeassert_input(ctx, jvinfo, jlto, jlto_env, argn); + jvinfo = typeassert_input(ctx, jvinfo, jlto, jlto_env, argn); if (!byRef) return emit_unbox(ctx, to, jvinfo, jlto); // pass the address of an alloca'd thing, not a box // since those are immutable. 
- Value *slot = emit_static_alloca(ctx, to); Align align(julia_alignment(jlto)); - cast(slot)->setAlignment(align); + Value *slot = emit_static_alloca(ctx, to, align); setName(ctx.emission_context, slot, "native_convert_buffer"); - if (!jvinfo.ispointer()) { - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, jvinfo.tbaa); - ai.decorateInst(ctx.builder.CreateStore(emit_unbox(ctx, to, jvinfo, jlto), slot)); - } - else { - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, jvinfo.tbaa); - emit_memcpy(ctx, slot, ai, jvinfo, jl_datatype_size(jlto), align, align); - } + emit_unbox_store(ctx, jvinfo, slot, ctx.tbaa().tbaa_stack, align); return slot; } @@ -1692,6 +1678,36 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) ai.decorateInst(tid); return mark_or_box_ccall_result(ctx, tid, retboxed, rt, unionall, static_rt); } + else if (is_libjulia_func(jl_get_ptls_rng)) { + ++CCALL_STAT(jl_get_ptls_rng); + assert(lrt == getInt64Ty(ctx.builder.getContext())); + assert(!isVa && !llvmcall && nccallargs == 0); + JL_GC_POP(); + Value *ptls_p = get_current_ptls(ctx); + const int rng_offset = offsetof(jl_tls_states_t, rngseed); + Value *rng_ptr = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), ptls_p, ConstantInt::get(ctx.types().T_size, rng_offset / sizeof(int8_t))); + setName(ctx.emission_context, rng_ptr, "rngseed_ptr"); + LoadInst *rng_value = ctx.builder.CreateAlignedLoad(getInt64Ty(ctx.builder.getContext()), rng_ptr, Align(sizeof(void*))); + setName(ctx.emission_context, rng_value, "rngseed"); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + ai.decorateInst(rng_value); + return mark_or_box_ccall_result(ctx, rng_value, retboxed, rt, unionall, static_rt); + } + else if (is_libjulia_func(jl_set_ptls_rng)) { + ++CCALL_STAT(jl_set_ptls_rng); + assert(lrt == getVoidTy(ctx.builder.getContext())); + assert(!isVa && !llvmcall && nccallargs == 1); + JL_GC_POP(); + Value *ptls_p = get_current_ptls(ctx); + const int rng_offset = offsetof(jl_tls_states_t, rngseed); + Value *rng_ptr = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), ptls_p, ConstantInt::get(ctx.types().T_size, rng_offset / sizeof(int8_t))); + setName(ctx.emission_context, rng_ptr, "rngseed_ptr"); + assert(argv[0].V->getType() == getInt64Ty(ctx.builder.getContext())); + auto store = ctx.builder.CreateAlignedStore(argv[0].V, rng_ptr, Align(sizeof(void*))); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + ai.decorateInst(store); + return ghostValue(ctx, jl_nothing_type); + } else if (is_libjulia_func(jl_get_tls_world_age)) { bool toplevel = !(ctx.linfo && jl_is_method(ctx.linfo->def.method)); if (!toplevel) { // top level code does not see a stable world age during execution @@ -1822,8 +1838,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) ctx.builder.SetInsertPoint(checkBB); auto signal_page_load = ctx.builder.CreateLoad( ctx.types().T_size, - ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_size, - get_current_signal_page_from_ptls(ctx.builder, ctx.types().T_size, get_current_ptls(ctx), ctx.tbaa().tbaa_const), -1), + emit_ptrgep(ctx, get_current_signal_page_from_ptls(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const), + -sizeof(size_t)), true); setName(ctx.emission_context, signal_page_load, "signal_page_load"); ctx.builder.CreateBr(contBB); @@ -1838,8 +1854,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) auto obj = 
emit_pointer_from_objref(ctx, boxed(ctx, argv[0])); // T_pprjlvalue // The inbounds gep makes it more clear to LLVM that the resulting value is not // a null pointer. - auto strp = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, obj, 1); - setName(ctx.emission_context, strp, "string_ptr"); + auto strp = emit_ptrgep(ctx, obj, ctx.types().sizeof_ptr, "string_ptr"); JL_GC_POP(); return mark_or_box_ccall_result(ctx, strp, retboxed, rt, unionall, static_rt); } @@ -1850,9 +1865,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) auto obj = emit_pointer_from_objref(ctx, boxed(ctx, argv[0])); // T_pprjlvalue // The inbounds gep makes it more clear to LLVM that the resulting value is not // a null pointer. - auto strp = ctx.builder.CreateConstInBoundsGEP1_32( - ctx.types().T_prjlvalue, obj, (sizeof(jl_sym_t) + sizeof(void*) - 1) / sizeof(void*)); - setName(ctx.emission_context, strp, "symbol_name"); + auto strp = emit_ptrgep(ctx, obj, sizeof(jl_sym_t), "symbol_name"); JL_GC_POP(); return mark_or_box_ccall_result(ctx, strp, retboxed, rt, unionall, static_rt); } @@ -1965,7 +1978,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) // If the value is not boxed, try to compute the object id without // reboxing it. auto T_p_derived = PointerType::get(ctx.builder.getContext(), AddressSpace::Derived); - if (!val.isghost && !val.ispointer()) + if (!val.isghost) val = value_to_pointer(ctx, val); Value *args[] = { emit_typeof(ctx, val, false, true), @@ -2061,7 +2074,7 @@ jl_cgval_t function_sig_t::emit_a_ccall( if (sret) { assert(!retboxed && jl_is_datatype(rt) && "sret return type invalid"); if (jl_is_pointerfree(rt)) { - result = emit_static_alloca(ctx, lrt); + result = emit_static_alloca(ctx, lrt, Align(julia_alignment(rt))); setName(ctx.emission_context, result, "ccall_sret"); sretty = lrt; argvals[0] = result; @@ -2237,9 +2250,8 @@ jl_cgval_t function_sig_t::emit_a_ccall( if (DL.getTypeStoreSize(resultTy) > rtsz) { // ARM and AArch64 can use a LLVM type larger than the julia type. // When this happens, cast through memory. 
- auto slot = emit_static_alloca(ctx, resultTy); + auto slot = emit_static_alloca(ctx, resultTy, boxalign); setName(ctx.emission_context, slot, "type_pun_slot"); - slot->setAlignment(boxalign); ctx.builder.CreateAlignedStore(result, slot, boxalign); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); emit_memcpy(ctx, strct, ai, slot, ai, rtsz, boxalign, boxalign); diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index c78e6092ca5db..8557698a4e513 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -833,28 +833,6 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { mapAddresses(Dyld, ro_alloc); mapAddresses(Dyld, exe_alloc); } -#ifdef _OS_WINDOWS_ - template - void *lookupWriteAddressFor(void *rt_addr, Alloc &&allocator) - { - for (auto &alloc: allocator->allocations) { - if (alloc.rt_addr == rt_addr) { - return alloc.wr_addr; - } - } - return nullptr; - } - void *lookupWriteAddressFor(void *rt_addr) - { - if (!ro_alloc) - return rt_addr; - if (void *ptr = lookupWriteAddressFor(rt_addr, ro_alloc)) - return ptr; - if (void *ptr = lookupWriteAddressFor(rt_addr, exe_alloc)) - return ptr; - return rt_addr; - } -#endif // _OS_WINDOWS_ }; uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, @@ -947,13 +925,6 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } -#ifdef _OS_WINDOWS_ -void *lookupWriteAddressFor(RTDyldMemoryManager *memmgr, void *rt_addr) -{ - return ((RTDyldMemoryManagerJL*)memmgr)->lookupWriteAddressFor(rt_addr); -} -#endif - RTDyldMemoryManager* createRTDyldMemoryManager() { return new RTDyldMemoryManagerJL(); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 2d2d2aed22069..4547e693755cd 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -130,14 +130,8 @@ static Value *stringConstPtr( } // Doesn't need to be aligned, we shouldn't operate on these like julia objects GlobalVariable *gv = get_pointer_to_constant(emission_context, Data, Align(1), "_j_str_" + StringRef(ctxt.data(), ctxt.size()), *M); - Value *zero = ConstantInt::get(Type::getInt32Ty(irbuilder.getContext()), 0); - Value *Args[] = { zero, zero }; - auto gep = irbuilder.CreateInBoundsGEP(gv->getValueType(), - // AddrSpaceCast in case globals are in non-0 AS - irbuilder.CreateAddrSpaceCast(gv, gv->getValueType()->getPointerTo(0)), - Args); - setName(emission_context, gep, "string_const_ptr"); - return gep; + // AddrSpaceCast in case globals are in non-0 AS + return irbuilder.CreateAddrSpaceCast(gv, gv->getValueType()->getPointerTo(0)); } @@ -329,6 +323,8 @@ static bool type_is_permalloc(jl_value_t *typ) } +// find the offset of pointer fields which never need a write barrier since their type-analysis +// shows they are permanently rooted static void find_perm_offsets(jl_datatype_t *typ, SmallVectorImpl &res, unsigned offset) { // This is a inlined field at `offset`. 
@@ -352,14 +348,37 @@ static void find_perm_offsets(jl_datatype_t *typ, SmallVectorImpl &res } } -static llvm::SmallVector get_gc_roots_for(jl_codectx_t &ctx, const jl_cgval_t &x) +// load a pointer to N inlined_roots into registers (as a SmallVector) +static llvm::SmallVector load_gc_roots(jl_codectx_t &ctx, Value *inline_roots_ptr, size_t npointers, bool isVolatile=false) +{ + SmallVector gcroots(npointers); + Type *T_prjlvalue = ctx.types().T_prjlvalue; + auto roots_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + for (size_t i = 0; i < npointers; i++) { + auto *ptr = ctx.builder.CreateAlignedLoad(T_prjlvalue, emit_ptrgep(ctx, inline_roots_ptr, i * sizeof(jl_value_t*)), Align(sizeof(void*)), isVolatile); + roots_ai.decorateInst(ptr); + gcroots[i] = ptr; + } + return gcroots; +} + +// inlined bool indicates whether this must return the inlined roots inside x separately, or whether x itself may be used as the root (if x is already isboxed) +static llvm::SmallVector get_gc_roots_for(jl_codectx_t &ctx, const jl_cgval_t &x, bool inlined=false) { if (x.constant || x.typ == jl_bottom_type) return {}; - if (x.Vboxed) // superset of x.isboxed + if (!inlined && x.Vboxed) // superset of x.isboxed return {x.Vboxed}; - assert(!x.isboxed); - if (x.ispointer()) { + assert(!x.isboxed || !inlined); + if (!x.inline_roots.empty()) { + // if (!inlined) { // TODO: implement this filter operation + // SmallVector perm_offsets; + // find_perm_offsets(typ, perm_offsets, 0); + // return filter(!in(perm_offsets), x.inline_roots) + // } + return x.inline_roots; + } + if (!inlined && x.ispointer()) { assert(x.V); assert(x.V->getType()->getPointerAddressSpace() != AddressSpace::Tracked); return {x.V}; @@ -369,8 +388,7 @@ static llvm::SmallVector get_gc_roots_for(jl_codectx_t &ctx, co Type *T = julia_type_to_llvm(ctx, jltype); Value *agg = emit_unbox(ctx, T, x, jltype); SmallVector perm_offsets; - if (jltype && jl_is_datatype(jltype) && ((jl_datatype_t*)jltype)->layout) - find_perm_offsets((jl_datatype_t*)jltype, perm_offsets, 0); + find_perm_offsets((jl_datatype_t*)jltype, perm_offsets, 0); return ExtractTrackedValues(agg, agg->getType(), false, ctx.builder, perm_offsets); } // nothing here to root, move along @@ -621,12 +639,6 @@ static unsigned convert_struct_offset(jl_codectx_t &ctx, Type *lty, unsigned byt return convert_struct_offset(ctx.builder.GetInsertBlock()->getModule()->getDataLayout(), lty, byte_offset); } -static Value *emit_struct_gep(jl_codectx_t &ctx, Type *lty, Value *base, unsigned byte_offset) -{ - unsigned idx = convert_struct_offset(ctx, lty, byte_offset); - return ctx.builder.CreateConstInBoundsGEP2_32(lty, base, 0, idx); -} - static Type *_julia_struct_to_llvm(jl_codegen_params_t *ctx, LLVMContext &ctxt, jl_value_t *jt, bool *isboxed, bool llvmcall=false); static Type *_julia_type_to_llvm(jl_codegen_params_t *ctx, LLVMContext &ctxt, jl_value_t *jt, bool *isboxed) @@ -1090,6 +1102,247 @@ static void emit_memcpy(jl_codectx_t &ctx, Value *dst, jl_aliasinfo_t const &dst emit_memcpy_llvm(ctx, dst, dst_ai, data_pointer(ctx, src), src_ai, sz, align_dst, align_src, is_volatile); } +static bool allpointers(jl_datatype_t *typ) +{ + return jl_datatype_size(typ) == typ->layout->npointers * sizeof(void*); +} + +// compute the space required by split_value_into, by simulating it +// returns (sizeof(split_value), n_pointers) +static std::pair split_value_size(jl_datatype_t *typ) +{ + assert(jl_is_datatype(typ)); + size_t dst_off = 0; + bool hasptr = typ->layout->first_ptr >= 0; + size_t 
npointers = hasptr ? typ->layout->npointers : 0; + // drop the data pointer if the entire structure is just pointers + // TODO: eventually we could drop the slots for the pointers from inside the + // types to pack it together, but this can change the alignment of the bits + // in the fields inside, even if those bits have no pointers themselves. So + // we would actually need to compute, for each pointer, whether any + // subsequent field needed the extra alignment (for example, we can + // drop space for any runs of two/four pointer). Some of these + // functions are already written in a way to support that, but not + // fully implemented yet. + bool nodata = allpointers(typ); + if (nodata) + dst_off = 0; + else + dst_off = jl_datatype_size(typ); + return std::make_pair(dst_off, npointers); +} + +// take a value `x` and split its bits into dst and the roots into inline_roots +static void split_value_into(jl_codectx_t &ctx, const jl_cgval_t &x, Align align_src, Value *dst, Align align_dst, jl_aliasinfo_t const &dst_ai, Value *inline_roots_ptr, jl_aliasinfo_t const &roots_ai, bool isVolatileStore=false) +{ + jl_datatype_t *typ = (jl_datatype_t*)x.typ; + assert(jl_is_concrete_type(x.typ)); + auto src_ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); + Type *T_prjlvalue = ctx.types().T_prjlvalue; + if (!x.inline_roots.empty()) { + auto sizes = split_value_size(typ); + if (sizes.first > 0) + emit_memcpy(ctx, dst, dst_ai, x.V, src_ai, sizes.first, align_dst, align_src, isVolatileStore); + for (size_t i = 0; i < sizes.second; i++) { + Value *unbox = x.inline_roots[i]; + roots_ai.decorateInst(ctx.builder.CreateAlignedStore(unbox, emit_ptrgep(ctx, inline_roots_ptr, i * sizeof(void*)), Align(sizeof(void*)), isVolatileStore)); + } + return; + } + if (inline_roots_ptr == nullptr) { + emit_unbox_store(ctx, x, dst, ctx.tbaa().tbaa_stack, align_dst, isVolatileStore); + return; + } + Value *src = data_pointer(ctx, value_to_pointer(ctx, x)); + bool isstack = isa(src->stripInBoundsOffsets()) || src_ai.tbaa == ctx.tbaa().tbaa_stack; + size_t dst_off = 0; + size_t src_off = 0; + bool hasptr = typ->layout->first_ptr >= 0; + size_t npointers = hasptr ? typ->layout->npointers : 0; + bool nodata = allpointers(typ); + for (size_t i = 0; true; i++) { + bool last = i == npointers; + size_t ptr = last ? 
jl_datatype_size(typ) : (jl_ptr_offset(typ, i) * sizeof(void*)); + if (ptr > src_off) { + emit_memcpy(ctx, + emit_ptrgep(ctx, dst, dst_off), + dst_ai, + emit_ptrgep(ctx, src, src_off), + src_ai, + ptr - src_off, + align_dst, + align_src, + isVolatileStore); + dst_off += ptr - src_off; + } + if (last) + break; + auto *load = ctx.builder.CreateAlignedLoad(T_prjlvalue, emit_ptrgep(ctx, src, ptr), Align(sizeof(void*))); + if (!isstack) + load->setOrdering(AtomicOrdering::Unordered); + src_ai.decorateInst(load); + roots_ai.decorateInst(ctx.builder.CreateAlignedStore(load, emit_ptrgep(ctx, inline_roots_ptr, i * sizeof(void*)), Align(sizeof(void*)), isVolatileStore)); + align_src = align_dst = Align(sizeof(void*)); + src_off = ptr + sizeof(void*); + if (!nodata) { + // store an undef pointer here, to make sure nobody looks at this + dst_ai.decorateInst(ctx.builder.CreateAlignedStore( + ctx.builder.getIntN(sizeof(void*) * 8, (uint64_t)-1), + emit_ptrgep(ctx, dst, dst_off), + align_src, + isVolatileStore)); + dst_off += sizeof(void*); + assert(dst_off == src_off); + } + } +} + +static void split_value_into(jl_codectx_t &ctx, const jl_cgval_t &x, Align align_src, Value *dst, Align align_dst, jl_aliasinfo_t const &dst_ai, MutableArrayRef inline_roots) +{ + jl_datatype_t *typ = (jl_datatype_t*)x.typ; + assert(jl_is_concrete_type(x.typ)); + auto src_ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); + Type *T_prjlvalue = ctx.types().T_prjlvalue; + if (!x.inline_roots.empty()) { + auto sizes = split_value_size(typ); + if (sizes.first > 0) + emit_memcpy(ctx, dst, dst_ai, x.V, src_ai, sizes.first, align_dst, align_src); + for (size_t i = 0; i < sizes.second; i++) + inline_roots[i] = x.inline_roots[i]; + return; + } + if (inline_roots.empty()) { + emit_unbox_store(ctx, x, dst, ctx.tbaa().tbaa_stack, align_dst); + return; + } + Value *src = data_pointer(ctx, value_to_pointer(ctx, x)); + bool isstack = isa(src->stripInBoundsOffsets()) || src_ai.tbaa == ctx.tbaa().tbaa_stack; + size_t dst_off = 0; + size_t src_off = 0; + bool hasptr = typ->layout->first_ptr >= 0; + size_t npointers = hasptr ? typ->layout->npointers : 0; + bool nodata = allpointers(typ); + for (size_t i = 0; true; i++) { + bool last = i == npointers; + size_t ptr = last ? jl_datatype_size(typ) : (jl_ptr_offset(typ, i) * sizeof(void*)); + if (ptr > src_off) { + emit_memcpy(ctx, + emit_ptrgep(ctx, dst, dst_off), + dst_ai, + emit_ptrgep(ctx, src, src_off), + src_ai, + ptr - src_off, + align_dst, + align_src); + dst_off += ptr - src_off; + } + if (last) + break; + auto *load = ctx.builder.CreateAlignedLoad(T_prjlvalue, emit_ptrgep(ctx, src, ptr), Align(sizeof(void*))); + if (!isstack) + load->setOrdering(AtomicOrdering::Unordered); + src_ai.decorateInst(load); + inline_roots[i] = load; + align_src = align_dst = Align(sizeof(void*)); + src_off = ptr + sizeof(void*); + if (!nodata) { + // store an undef pointer here, to make sure nobody looks at this + dst_ai.decorateInst(ctx.builder.CreateAlignedStore( + ctx.builder.getIntN(sizeof(void*) * 8, (uint64_t)-1), + emit_ptrgep(ctx, dst, dst_off), + align_src)); + dst_off += sizeof(void*); + assert(dst_off == src_off); + } + } +} + +static std::pair> split_value(jl_codectx_t &ctx, const jl_cgval_t &x, Align x_alignment) +{ + jl_datatype_t *typ = (jl_datatype_t*)x.typ; + auto sizes = split_value_size(typ); + Align align_dst(julia_alignment((jl_value_t*)typ)); + AllocaInst *bits = sizes.first > 0 ? 
emit_static_alloca(ctx, sizes.first, align_dst) : nullptr; + SmallVector roots(sizes.second); + auto stack_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); + split_value_into(ctx, x, x_alignment, bits, align_dst, stack_ai, MutableArrayRef(roots)); + return std::make_pair(bits, roots); +} + +// Return the offset values corresponding to jl_field_offset, but into the two buffers for a split value (or -1) +static std::pair split_value_field(jl_datatype_t *typ, unsigned idx) +{ + size_t fldoff = jl_field_offset(typ, idx); + size_t src_off = 0; + size_t dst_off = 0; + assert(typ->layout->first_ptr >= 0); + size_t npointers = typ->layout->npointers; + bool nodata = allpointers(typ); + for (size_t i = 0; i < npointers; i++) { + size_t ptr = jl_ptr_offset(typ, i) * sizeof(void*); + if (ptr >= fldoff) { + if (ptr >= fldoff + jl_field_size(typ, idx)) + break; + bool onlyptr = jl_field_isptr(typ, idx) || allpointers((jl_datatype_t*)jl_field_type(typ, idx)); + return std::make_pair(onlyptr ? -1 : dst_off + fldoff - src_off, i); + } + dst_off += ptr - src_off; + src_off = ptr + sizeof(void*); + if (!nodata) { + assert(dst_off + sizeof(void*) == src_off); + dst_off = src_off; + } + } + return std::make_pair(dst_off + fldoff - src_off, -1); +} + +// Copy `x` to `dst`, where `x` was a split value and dst needs to have a native layout, copying any inlined roots back into their native location. +// This does not respect roots, so you must call emit_write_multibarrier afterwards. +static void recombine_value(jl_codectx_t &ctx, const jl_cgval_t &x, Value *dst, jl_aliasinfo_t const &dst_ai, Align alignment, bool isVolatileStore) +{ + jl_datatype_t *typ = (jl_datatype_t*)x.typ; + assert(jl_is_concrete_type(x.typ)); + assert(typ->layout->first_ptr >= 0 && !x.inline_roots.empty()); + Align align_dst = alignment; + Align align_src(julia_alignment(x.typ)); + Value *src = x.V; + auto src_ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); + size_t dst_off = 0; + size_t src_off = 0; + size_t npointers = typ->layout->npointers; + bool nodata = allpointers(typ); + bool isstack = isa(dst->stripInBoundsOffsets()) || dst_ai.tbaa == ctx.tbaa().tbaa_stack; + for (size_t i = 0; true; i++) { + bool last = i == npointers; + size_t ptr = last ? 
jl_datatype_size(typ) : (jl_ptr_offset(typ, i) * sizeof(void*)); + if (ptr > dst_off) { + emit_memcpy(ctx, + emit_ptrgep(ctx, dst, dst_off), + dst_ai, + emit_ptrgep(ctx, src, src_off), + src_ai, + ptr - dst_off, + align_dst, + align_src, + isVolatileStore); + src_off += ptr - dst_off; + } + if (last) + break; + auto *root = x.inline_roots[i]; + auto *store = ctx.builder.CreateAlignedStore(root, emit_ptrgep(ctx, dst, ptr), Align(sizeof(void*)), isVolatileStore); + if (!isstack) + store->setOrdering(AtomicOrdering::Unordered); + dst_ai.decorateInst(store); + align_dst = align_src = Align(sizeof(void*)); + dst_off = ptr + sizeof(void*); + if (!nodata) { + assert(src_off + sizeof(void*) == dst_off); + src_off = dst_off; + } + } +} + static Value *emit_tagfrom(jl_codectx_t &ctx, jl_datatype_t *dt) { if (dt->smalltag) @@ -1200,10 +1453,10 @@ static Value *emit_typeof(jl_codectx_t &ctx, const jl_cgval_t &p, bool maybenull static Value *emit_datatype_types(jl_codectx_t &ctx, Value *dt) { Value *Ptr = decay_derived(ctx, dt); - Value *Idx = ConstantInt::get(ctx.types().T_size, offsetof(jl_datatype_t, types) / sizeof(void*)); + unsigned Idx = offsetof(jl_datatype_t, types); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); auto types = ai.decorateInst(ctx.builder.CreateAlignedLoad( - ctx.types().T_pjlvalue, ctx.builder.CreateInBoundsGEP(ctx.types().T_pjlvalue, Ptr, Idx), Align(sizeof(void*)))); + ctx.types().T_pjlvalue, emit_ptrgep(ctx, Ptr, Idx), Align(sizeof(void*)))); setName(ctx.emission_context, types, "datatype_types"); return types; } @@ -1222,16 +1475,13 @@ static Value *emit_datatype_size(jl_codectx_t &ctx, Value *dt, bool add_isunion= { jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); Value *Ptr = decay_derived(ctx, dt); - Value *Idx = ConstantInt::get(ctx.types().T_size, offsetof(jl_datatype_t, layout) / sizeof(int32_t*)); - Ptr = ctx.builder.CreateInBoundsGEP(getPointerTy(ctx.builder.getContext()), Ptr, Idx); + Ptr = emit_ptrgep(ctx, Ptr, offsetof(jl_datatype_t, layout)); Ptr = ai.decorateInst(ctx.builder.CreateAlignedLoad(getPointerTy(ctx.builder.getContext()), Ptr, Align(sizeof(int32_t*)))); - Idx = ConstantInt::get(ctx.types().T_size, offsetof(jl_datatype_layout_t, size) / sizeof(int32_t)); - Value *SizePtr = ctx.builder.CreateInBoundsGEP(getInt32Ty(ctx.builder.getContext()), Ptr, Idx); + Value *SizePtr = emit_ptrgep(ctx, Ptr, offsetof(jl_datatype_layout_t, size)); Value *Size = ai.decorateInst(ctx.builder.CreateAlignedLoad(getInt32Ty(ctx.builder.getContext()), SizePtr, Align(sizeof(int32_t)))); setName(ctx.emission_context, Size, "datatype_size"); if (add_isunion) { - Idx = ConstantInt::get(ctx.types().T_size, offsetof(jl_datatype_layout_t, flags) / sizeof(int8_t)); - Value *FlagPtr = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), Ptr, Idx); + Value *FlagPtr = emit_ptrgep(ctx, Ptr, offsetof(jl_datatype_layout_t, flags)); Value *Flag = ai.decorateInst(ctx.builder.CreateAlignedLoad(getInt16Ty(ctx.builder.getContext()), FlagPtr, Align(sizeof(int16_t)))); Flag = ctx.builder.CreateLShr(Flag, 4); Flag = ctx.builder.CreateAnd(Flag, ConstantInt::get(Flag->getType(), 1)); @@ -1308,7 +1558,7 @@ static Value *emit_datatype_mutabl(jl_codectx_t &ctx, Value *dt) static Value *emit_datatype_isprimitivetype(jl_codectx_t &ctx, Value *typ) { Value *isprimitive; - isprimitive = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), decay_derived(ctx, typ), offsetof(jl_datatype_t, hash) + 
sizeof(((jl_datatype_t*)nullptr)->hash)); + isprimitive = emit_ptrgep(ctx, decay_derived(ctx, typ), offsetof(jl_datatype_t, hash) + sizeof(((jl_datatype_t*)nullptr)->hash)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); isprimitive = ai.decorateInst(ctx.builder.CreateAlignedLoad(getInt8Ty(ctx.builder.getContext()), isprimitive, Align(1))); isprimitive = ctx.builder.CreateLShr(isprimitive, 7); @@ -1320,10 +1570,7 @@ static Value *emit_datatype_isprimitivetype(jl_codectx_t &ctx, Value *typ) static Value *emit_datatype_name(jl_codectx_t &ctx, Value *dt) { unsigned n = offsetof(jl_datatype_t, name) / sizeof(char*); - Value *vptr = ctx.builder.CreateInBoundsGEP( - ctx.types().T_pjlvalue, - maybe_decay_tracked(ctx, dt), - ConstantInt::get(ctx.types().T_size, n)); + Value *vptr = emit_ptrgep(ctx, maybe_decay_tracked(ctx, dt), n * sizeof(jl_value_t*)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); auto name = ai.decorateInst(ctx.builder.CreateAlignedLoad(ctx.types().T_pjlvalue, vptr, Align(sizeof(void*)))); setName(ctx.emission_context, name, "datatype_name"); @@ -1439,15 +1686,23 @@ static void null_load_check(jl_codectx_t &ctx, Value *v, jl_module_t *scope, jl_ } template -static Value *emit_guarded_test(jl_codectx_t &ctx, Value *ifnot, Value *defval, Func &&func) +static void emit_guarded_test(jl_codectx_t &ctx, Value *ifnot, MutableArrayRef defval, Func &&func) { - if (!ifnot) { - return func(); + if (ifnot == nullptr) { + auto res = func(); + assert(res.size() == defval.size()); + for (size_t i = 0; i < defval.size(); i++) + defval[i] = res[i]; + return; } if (auto Cond = dyn_cast(ifnot)) { if (Cond->isZero()) - return defval; - return func(); + return; + auto res = func(); + assert(res.size() == defval.size()); + for (size_t i = 0; i < defval.size(); i++) + defval[i] = res[i]; + return; } ++EmittedGuards; BasicBlock *currBB = ctx.builder.GetInsertBlock(); @@ -1456,16 +1711,33 @@ static Value *emit_guarded_test(jl_codectx_t &ctx, Value *ifnot, Value *defval, ctx.builder.CreateCondBr(ifnot, passBB, exitBB); ctx.builder.SetInsertPoint(passBB); auto res = func(); + assert(res.size() == defval.size()); passBB = ctx.builder.GetInsertBlock(); ctx.builder.CreateBr(exitBB); ctx.builder.SetInsertPoint(exitBB); - if (defval == nullptr) + for (size_t i = 0; i < defval.size(); i++) { + PHINode *phi = ctx.builder.CreatePHI(defval[i]->getType(), 2); + phi->addIncoming(defval[i], currBB); + phi->addIncoming(res[i], passBB); + setName(ctx.emission_context, phi, "guard_res"); + defval[i] = phi; + } +} + +template +static Value *emit_guarded_test(jl_codectx_t &ctx, Value *ifnot, Value *defval, Func &&func) +{ + MutableArrayRef res(&defval, defval == nullptr ? 
0 : 1); + auto funcwrap = [&func] () -> SmallVector { + auto res = func(); + if (res == nullptr) + return {}; + return {res}; + }; + emit_guarded_test(ctx, ifnot, res, funcwrap); + if (res.empty()) return nullptr; - PHINode *phi = ctx.builder.CreatePHI(defval->getType(), 2); - phi->addIncoming(defval, currBB); - phi->addIncoming(res, passBB); - setName(ctx.emission_context, phi, "guard_res"); - return phi; + return res[0]; } template @@ -1522,7 +1794,7 @@ static Value *emit_typeof(jl_codectx_t &ctx, Value *v, bool maybenull, bool just // we lied a bit: this wasn't really an object (though it was valid for GC rooting) // and we need to use it as an index to get the real object now Module *M = jl_Module; - Value *smallp = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), prepare_global_in(M, jl_small_typeof_var), tag); + Value *smallp = emit_ptrgep(ctx, prepare_global_in(M, jl_small_typeof_var), tag); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); auto small = ctx.builder.CreateAlignedLoad(typetag->getType(), smallp, M->getDataLayout().getPointerABIAlignment(0)); small->setMetadata(LLVMContext::MD_nonnull, MDNode::get(M->getContext(), None)); @@ -1773,7 +2045,7 @@ static std::pair emit_isa(jl_codectx_t &ctx, const jl_cgval_t &x, // declare that the pointer is legal (for zero bytes) even though it might be undef. static Value *emit_isa_and_defined(jl_codectx_t &ctx, const jl_cgval_t &val, jl_value_t *typ) { - return emit_nullcheck_guard(ctx, val.ispointer() ? val.V : nullptr, [&] { + return emit_nullcheck_guard(ctx, val.inline_roots.empty() && val.ispointer() ? val.V : nullptr, [&] { return emit_isa(ctx, val, typ, Twine()).first; }); } @@ -1802,7 +2074,7 @@ static void emit_typecheck(jl_codectx_t &ctx, const jl_cgval_t &x, jl_value_t *t static Value *emit_isconcrete(jl_codectx_t &ctx, Value *typ) { Value *isconcrete; - isconcrete = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), decay_derived(ctx, typ), offsetof(jl_datatype_t, hash) + sizeof(((jl_datatype_t*)nullptr)->hash)); + isconcrete = emit_ptrgep(ctx, decay_derived(ctx, typ), offsetof(jl_datatype_t, hash) + sizeof(((jl_datatype_t*)nullptr)->hash)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); isconcrete = ai.decorateInst(ctx.builder.CreateAlignedLoad(getInt8Ty(ctx.builder.getContext()), isconcrete, Align(1))); isconcrete = ctx.builder.CreateLShr(isconcrete, 1); @@ -1856,6 +2128,9 @@ static Value *emit_bounds_check(jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_v if (ainfo.isghost) { a = Constant::getNullValue(getPointerTy(ctx.builder.getContext())); } + else if (!ainfo.inline_roots.empty()) { + a = value_to_pointer(ctx, ainfo).V; + } else if (!ainfo.ispointer()) { // CreateAlloca is OK here since we are on an error branch Value *tempSpace = ctx.builder.CreateAlloca(a->getType()); @@ -1887,6 +2162,7 @@ static Value *CreateSimplifiedExtractValue(jl_codectx_t &ctx, Value *Agg, ArrayR static void emit_write_barrier(jl_codectx_t&, Value*, ArrayRef); static void emit_write_barrier(jl_codectx_t&, Value*, Value*); static void emit_write_multibarrier(jl_codectx_t&, Value*, Value*, jl_value_t*); +static void emit_write_multibarrier(jl_codectx_t &ctx, Value *parent, const jl_cgval_t &x); SmallVector first_ptr(Type *T) { @@ -1948,85 +2224,81 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j bool maybe_null_if_boxed = true, unsigned alignment = 0, Value **nullcheck = nullptr) { - // TODO: we should use unordered 
loads for anything with CountTrackedPointers(elty).count > 0 (if not otherwise locked) Type *elty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jltype); if (type_is_ghost(elty)) { if (isStrongerThanMonotonic(Order)) ctx.builder.CreateFence(Order); return ghostValue(ctx, jltype); } + if (isboxed) + alignment = sizeof(void*); + else if (!alignment) + alignment = julia_alignment(jltype); + if (idx_0based) + ptr = ctx.builder.CreateInBoundsGEP(elty, ptr, idx_0based); unsigned nb = isboxed ? sizeof(void*) : jl_datatype_size(jltype); // note that nb == jl_Module->getDataLayout().getTypeAllocSize(elty) or getTypeStoreSize, depending on whether it is a struct or primitive type AllocaInst *intcast = NULL; - if (Order == AtomicOrdering::NotAtomic) { - if (!isboxed && !aliasscope && elty->isAggregateType() && !CountTrackedPointers(elty).count) { - intcast = emit_static_alloca(ctx, elty); - setName(ctx.emission_context, intcast, "aggregate_load_box"); + if (Order == AtomicOrdering::NotAtomic && !isboxed && !aliasscope && elty->isAggregateType() && !jl_is_genericmemoryref_type(jltype)) { + // use split_value to do this load + auto src = mark_julia_slot(ptr, jltype, NULL, tbaa); + auto copy = split_value(ctx, src, Align(alignment)); + if (maybe_null_if_boxed && !copy.second.empty()) { + null_pointer_check(ctx, copy.second[0], nullcheck); } + return mark_julia_slot(copy.first, jltype, NULL, ctx.tbaa().tbaa_stack, copy.second); } - else { + Type *realelty = elty; + if (Order != AtomicOrdering::NotAtomic) { if (!isboxed && !elty->isIntOrPtrTy()) { - intcast = emit_static_alloca(ctx, elty); + intcast = emit_static_alloca(ctx, elty, Align(alignment)); setName(ctx.emission_context, intcast, "atomic_load_box"); - elty = Type::getIntNTy(ctx.builder.getContext(), 8 * nb); + realelty = elty = Type::getIntNTy(ctx.builder.getContext(), 8 * nb); + } + if (isa(elty)) { + unsigned nb2 = PowerOf2Ceil(nb); + if (nb != nb2) + elty = Type::getIntNTy(ctx.builder.getContext(), 8 * nb2); } } - Type *realelty = elty; - if (Order != AtomicOrdering::NotAtomic && isa(elty)) { - unsigned nb2 = PowerOf2Ceil(nb); - if (nb != nb2) - elty = Type::getIntNTy(ctx.builder.getContext(), 8 * nb2); - } - Value *data = ptr; - if (idx_0based) - data = ctx.builder.CreateInBoundsGEP(elty, data, idx_0based); Value *instr = nullptr; - if (isboxed) - alignment = sizeof(void*); - else if (!alignment) - alignment = julia_alignment(jltype); - if (intcast && Order == AtomicOrdering::NotAtomic) { - emit_memcpy(ctx, intcast, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), data, jl_aliasinfo_t::fromTBAA(ctx, tbaa), nb, Align(alignment), intcast->getAlign()); + if (!isboxed && jl_is_genericmemoryref_type(jltype)) { + // load these FCA as individual fields, so LLVM does not need to split them later + Value *fld0 = ctx.builder.CreateStructGEP(elty, ptr, 0); + LoadInst *load0 = ctx.builder.CreateAlignedLoad(elty->getStructElementType(0), fld0, Align(alignment), false); + load0->setOrdering(Order); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); + ai.scope = MDNode::concatenate(aliasscope, ai.scope); + ai.decorateInst(load0); + Value *fld1 = ctx.builder.CreateStructGEP(elty, ptr, 1); + LoadInst *load1 = ctx.builder.CreateAlignedLoad(elty->getStructElementType(1), fld1, Align(alignment), false); + static_assert(offsetof(jl_genericmemoryref_t, ptr_or_offset) == 0, "wrong field order"); + maybe_mark_load_dereferenceable(load1, true, sizeof(void*)*2, alignof(void*)); + load1->setOrdering(Order); + ai.decorateInst(load1); + instr = 
Constant::getNullValue(elty); + instr = ctx.builder.CreateInsertValue(instr, load0, 0); + instr = ctx.builder.CreateInsertValue(instr, load1, 1); } else { - if (!isboxed && jl_is_genericmemoryref_type(jltype)) { - // load these FCA as individual fields, so LLVM does not need to split them later - Value *fld0 = ctx.builder.CreateStructGEP(elty, data, 0); - LoadInst *load0 = ctx.builder.CreateAlignedLoad(elty->getStructElementType(0), fld0, Align(alignment), false); - load0->setOrdering(Order); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); - ai.scope = MDNode::concatenate(aliasscope, ai.scope); - ai.decorateInst(load0); - Value *fld1 = ctx.builder.CreateStructGEP(elty, data, 1); - LoadInst *load1 = ctx.builder.CreateAlignedLoad(elty->getStructElementType(1), fld1, Align(alignment), false); - static_assert(offsetof(jl_genericmemoryref_t, ptr_or_offset) == 0, "wrong field order"); - maybe_mark_load_dereferenceable(load1, true, sizeof(void*)*2, alignof(void*)); - load1->setOrdering(Order); - ai.decorateInst(load1); - instr = Constant::getNullValue(elty); - instr = ctx.builder.CreateInsertValue(instr, load0, 0); - instr = ctx.builder.CreateInsertValue(instr, load1, 1); - } - else { - LoadInst *load = ctx.builder.CreateAlignedLoad(elty, data, Align(alignment), false); - load->setOrdering(Order); - if (isboxed) - maybe_mark_load_dereferenceable(load, true, jltype); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); - ai.scope = MDNode::concatenate(aliasscope, ai.scope); - ai.decorateInst(load); - instr = load; - } - if (elty != realelty) - instr = ctx.builder.CreateTrunc(instr, realelty); - if (intcast) { - ctx.builder.CreateStore(instr, intcast); - instr = nullptr; - } + LoadInst *load = ctx.builder.CreateAlignedLoad(elty, ptr, Align(alignment), false); + load->setOrdering(Order); + if (isboxed) + maybe_mark_load_dereferenceable(load, true, jltype); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); + ai.scope = MDNode::concatenate(aliasscope, ai.scope); + ai.decorateInst(load); + instr = load; + } + if (elty != realelty) + instr = ctx.builder.CreateTrunc(instr, realelty); + if (intcast) { + ctx.builder.CreateAlignedStore(instr, intcast, Align(alignment)); + instr = nullptr; } if (maybe_null_if_boxed) { if (intcast) - instr = ctx.builder.CreateLoad(intcast->getAllocatedType(), intcast); + instr = ctx.builder.CreateAlignedLoad(intcast->getAllocatedType(), intcast, Align(alignment)); Value *first_ptr = isboxed ? 
instr : extract_first_ptr(ctx, instr); if (first_ptr) null_pointer_check(ctx, first_ptr, nullcheck); @@ -2039,7 +2311,7 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j // ConstantAsMetadata::get(ConstantInt::get(T_int8, 0)), // ConstantAsMetadata::get(ConstantInt::get(T_int8, 2)) })); if (intcast) - instr = ctx.builder.CreateLoad(intcast->getAllocatedType(), intcast); + instr = ctx.builder.CreateAlignedLoad(intcast->getAllocatedType(), intcast, Align(alignment)); instr = ctx.builder.CreateTrunc(instr, getInt1Ty(ctx.builder.getContext())); } if (instr) @@ -2064,6 +2336,12 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, ret = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); } else { + if (trim_may_error(ctx.params->trim)) { + // if we know the return type, we can assume the result is of that type + errs() << "ERROR: Dynamic call to setfield/modifyfield\n"; + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); ret = mark_julia_type(ctx, callval, true, jl_any_type); } @@ -2071,6 +2349,10 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, ret = update_julia_type(ctx, ret, jltype); return ret; }; + if (isboxed) + alignment = sizeof(void*); + else if (!alignment) + alignment = julia_alignment(jltype); Type *elty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jltype); if (type_is_ghost(elty) || (issetfieldonce && !maybe_null_if_boxed) || @@ -2113,7 +2395,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, intcast_eltyp = elty; elty = Type::getIntNTy(ctx.builder.getContext(), 8 * nb); if (!issetfield) { - intcast = emit_static_alloca(ctx, elty); + intcast = emit_static_alloca(ctx, elty, Align(alignment)); setName(ctx.emission_context, intcast, "atomic_store_box"); } } @@ -2133,16 +2415,12 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, emit_unbox_store(ctx, rhs, intcast, ctx.tbaa().tbaa_stack, intcast->getAlign()); r = ctx.builder.CreateLoad(realelty, intcast); } - else if (aliasscope || Order != AtomicOrdering::NotAtomic || tracked_pointers) { + else if (aliasscope || Order != AtomicOrdering::NotAtomic || (tracked_pointers && rhs.inline_roots.empty())) { r = emit_unbox(ctx, realelty, rhs, jltype); } if (realelty != elty) r = ctx.builder.CreateZExt(r, elty); } - if (isboxed) - alignment = sizeof(void*); - else if (!alignment) - alignment = julia_alignment(jltype); Value *instr = nullptr; Value *Compare = nullptr; Value *Success = nullptr; @@ -2297,8 +2575,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, if (!tracked_pointers) // oldval is a slot, so put the oldval back ctx.builder.CreateStore(realCompare, intcast); } - else if (Order != AtomicOrdering::NotAtomic) { - assert(!tracked_pointers); + else if (Order != AtomicOrdering::NotAtomic || (tracked_pointers && rhs.inline_roots.empty())) { r = emit_unbox(ctx, realelty, rhs, jltype); } if (realelty != elty) @@ -2411,23 +2688,30 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, ctx.builder.SetInsertPoint(DoneBB); if (needlock) emit_lockstate_value(ctx, needlock, false); - if (parent != NULL && r && tracked_pointers && (!isboxed || !type_is_permalloc(rhs.typ))) { + if (parent != NULL && tracked_pointers && (!isboxed || !type_is_permalloc(rhs.typ))) { if (isreplacefield || issetfieldonce) { BasicBlock *BB = BasicBlock::Create(ctx.builder.getContext(), 
"xchg_wb", ctx.f); DoneBB = BasicBlock::Create(ctx.builder.getContext(), "done_xchg_wb", ctx.f); ctx.builder.CreateCondBr(Success, BB, DoneBB); ctx.builder.SetInsertPoint(BB); } - if (realelty != elty) - r = ctx.builder.Insert(CastInst::Create(Instruction::Trunc, r, realelty)); - if (intcast) { - ctx.builder.CreateStore(r, intcast); - r = ctx.builder.CreateLoad(intcast_eltyp, intcast); + if (r) { + if (realelty != elty) + r = ctx.builder.Insert(CastInst::Create(Instruction::Trunc, r, realelty)); + if (intcast) { + ctx.builder.CreateStore(r, intcast); + r = ctx.builder.CreateLoad(intcast_eltyp, intcast); + } + if (!isboxed) + emit_write_multibarrier(ctx, parent, r, rhs.typ); + else + emit_write_barrier(ctx, parent, r); + } + else { + assert(!isboxed); + assert(!rhs.inline_roots.empty()); + emit_write_multibarrier(ctx, parent, rhs); } - if (!isboxed) - emit_write_multibarrier(ctx, parent, r, rhs.typ); - else if (!type_is_permalloc(rhs.typ)) - emit_write_barrier(ctx, parent, r); if (isreplacefield || issetfieldonce) { ctx.builder.CreateBr(DoneBB); ctx.builder.SetInsertPoint(DoneBB); @@ -2542,7 +2826,7 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx, } assert(!jl_is_vecelement_type((jl_value_t*)stt)); - if (!strct.ispointer()) { // unboxed + if (strct.inline_roots.empty() && !strct.ispointer()) { // unboxed assert(jl_is_concrete_immutable((jl_value_t*)stt)); bool isboxed = is_datatype_all_pointers(stt); jl_svec_t *types = stt->types; @@ -2598,7 +2882,8 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx, } bool maybeatomic = stt->name->atomicfields != NULL; - if (strct.ispointer() && !maybeatomic) { // boxed or stack + if ((strct.inline_roots.empty() && strct.ispointer()) && !maybeatomic) { // boxed or stack + // COMBAK: inline_roots support could be implemented for this if (order != jl_memory_order_notatomic && order != jl_memory_order_unspecified) { emit_atomic_error(ctx, "getfield: non-atomic field cannot be accessed atomically"); *ret = jl_cgval_t(); // unreachable @@ -2674,11 +2959,8 @@ static jl_cgval_t emit_unionload(jl_codectx_t &ctx, Value *addr, Value *ptindex, Value *tindex = ctx.builder.CreateNUWAdd(ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 1), tindex0); if (fsz > 0 && mutabl) { // move value to an immutable stack slot (excluding tindex) - Type *AT = ArrayType::get(IntegerType::get(ctx.builder.getContext(), 8 * al), (fsz + al - 1) / al); - AllocaInst *lv = emit_static_alloca(ctx, AT); + AllocaInst *lv = emit_static_alloca(ctx, fsz, Align(al)); setName(ctx.emission_context, lv, "immutable_union"); - if (al > 1) - lv->setAlignment(Align(al)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); emit_memcpy(ctx, lv, ai, addr, ai, fsz, Align(al), Align(al)); addr = lv; @@ -2845,39 +3127,51 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st } bool maybe_null = field_may_be_null(strct, jt, idx); size_t byte_offset = jl_field_offset(jt, idx); - if (strct.ispointer()) { + if (!strct.inline_roots.empty()) { + assert(!isatomic && !needlock); auto tbaa = best_field_tbaa(ctx, strct, jt, idx, byte_offset); - Value *staddr = data_pointer(ctx, strct); - bool isboxed; - Type *lt = julia_type_to_llvm(ctx, (jl_value_t*)jt, &isboxed); - Value *addr; - if (isboxed) { - // byte_offset == 0 is an important special case here, e.g. - // for single field wrapper types. 
Introducing the bitcast - // can pessimize mem2reg - if (byte_offset > 0) { - addr = ctx.builder.CreateInBoundsGEP( - getInt8Ty(ctx.builder.getContext()), - staddr, - ConstantInt::get(ctx.types().T_size, byte_offset)); - } - else { - addr = staddr; - } + auto offsets = split_value_field(jt, idx); + bool hasptr = offsets.second >= 0; + assert(hasptr == jl_field_isptr(jt, idx) || jl_type_hasptr(jfty)); + ArrayRef roots; + if (hasptr) { + roots = ArrayRef(strct.inline_roots).slice(offsets.second, jl_field_isptr(jt, idx) ? 1 : ((jl_datatype_t*)jfty)->layout->npointers); + if (maybe_null) + null_pointer_check(ctx, roots[0], nullcheck); + } + if (jl_field_isptr(jt, idx)) { + return mark_julia_type(ctx, roots[0], true, jfty); + } + Value *addr = offsets.first < 0 ? nullptr : offsets.first == 0 ? strct.V : emit_ptrgep(ctx, strct.V, offsets.first); + if (jl_is_uniontype(jfty)) { + size_t fsz = 0, al = 0; + int union_max = jl_islayout_inline(jfty, &fsz, &al); + size_t fsz1 = jl_field_size(jt, idx) - 1; + bool isptr = (union_max == 0); + assert(!isptr && fsz < jl_field_size(jt, idx)); (void)isptr; + Value *ptindex = emit_ptrgep(ctx, addr, fsz1); + return emit_unionload(ctx, addr, ptindex, jfty, fsz, al, tbaa, false, union_max, strct.tbaa); + } + else if (jfty == (jl_value_t*)jl_bool_type) { + unsigned align = jl_field_align(jt, idx); + return typed_load(ctx, addr, NULL, jfty, tbaa, nullptr, false, + AtomicOrdering::NotAtomic, maybe_null, align, nullcheck); } else { - if (jl_is_vecelement_type((jl_value_t*)jt)) - addr = staddr; // VecElement types are unwrapped in LLVM. - else if (isa(lt)) - addr = emit_struct_gep(ctx, lt, staddr, byte_offset); - else - addr = ctx.builder.CreateConstInBoundsGEP2_32(lt, staddr, 0, idx); - if (addr != staddr) { - setNameWithField(ctx.emission_context, addr, get_objname, jt, idx, Twine("_ptr")); - } + return mark_julia_slot(addr, jfty, nullptr, tbaa, roots); } - if (jl_field_isptr(jt, idx)) { + } + else if (strct.ispointer()) { + auto tbaa = best_field_tbaa(ctx, strct, jt, idx, byte_offset); + Value *staddr = data_pointer(ctx, strct); + Value *addr; + if (jl_is_vecelement_type((jl_value_t*)jt) || byte_offset == 0) + addr = staddr; // VecElement types are unwrapped in LLVM. + else + addr = emit_ptrgep(ctx, staddr, byte_offset); + if (addr != staddr) setNameWithField(ctx.emission_context, addr, get_objname, jt, idx, Twine("_ptr")); + if (jl_field_isptr(jt, idx)) { LoadInst *Load = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, addr, Align(sizeof(void*))); setNameWithField(ctx.emission_context, Load, get_objname, jt, idx, Twine()); Load->setOrdering(order <= jl_memory_order_notatomic ? 
AtomicOrdering::Unordered : get_llvm_atomic_order(order)); @@ -2894,14 +3188,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st bool isptr = (union_max == 0); assert(!isptr && fsz < jl_field_size(jt, idx)); (void)isptr; size_t fsz1 = jl_field_size(jt, idx) - 1; - Value *ptindex; - if (isboxed) { - ptindex = ctx.builder.CreateConstInBoundsGEP1_32( - getInt8Ty(ctx.builder.getContext()), staddr, byte_offset + fsz1); - } - else { - ptindex = emit_struct_gep(ctx, cast(lt), staddr, byte_offset + fsz1); - } + Value *ptindex = emit_ptrgep(ctx, staddr, byte_offset + fsz1); auto val = emit_unionload(ctx, addr, ptindex, jfty, fsz, al, tbaa, !jl_field_isconst(jt, idx), union_max, strct.tbaa); if (val.V && val.V != addr) { setNameWithField(ctx.emission_context, val.V, get_objname, jt, idx, Twine()); @@ -2950,22 +3237,21 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st unsigned st_idx = convert_struct_offset(ctx, T, byte_offset); IntegerType *ET = cast(T->getStructElementType(st_idx)); unsigned align = (ET->getBitWidth() + 7) / 8; - lv = emit_static_alloca(ctx, ET); - lv->setOperand(0, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), (fsz + align - 1) / align)); + lv = emit_static_alloca(ctx, fsz, Align(align)); // emit all of the align-sized words unsigned i = 0; for (; i < fsz / align; i++) { unsigned fld = st_idx + i; Value *fldv = ctx.builder.CreateExtractValue(obj, ArrayRef(fld)); - Value *fldp = ctx.builder.CreateConstInBoundsGEP1_32(ET, lv, i); + Value *fldp = emit_ptrgep(ctx, lv, i * align); ctx.builder.CreateAlignedStore(fldv, fldp, Align(align)); } // emit remaining bytes up to tindex if (i < ptindex - st_idx) { - Value *staddr = ctx.builder.CreateConstInBoundsGEP1_32(ET, lv, i); + Value *staddr = emit_ptrgep(ctx, lv, i * align); for (; i < ptindex - st_idx; i++) { Value *fldv = ctx.builder.CreateExtractValue(obj, ArrayRef(st_idx + i)); - Value *fldp = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), staddr, i); + Value *fldp = emit_ptrgep(ctx, staddr, i); ctx.builder.CreateAlignedStore(fldv, fldp, Align(1)); } } @@ -3026,12 +3312,15 @@ static Value *emit_genericmemoryelsize(jl_codectx_t &ctx, Value *v, jl_value_t * size_t sz = sty->layout->size; if (sty->layout->flags.arrayelem_isunion) sz++; - return ConstantInt::get(ctx.types().T_size, sz); + auto elsize = ConstantInt::get(ctx.types().T_size, sz); + return elsize; } else { Value *t = emit_typeof(ctx, v, false, false, true); Value *elsize = emit_datatype_size(ctx, t, add_isunion); - return ctx.builder.CreateZExt(elsize, ctx.types().T_size); + elsize = ctx.builder.CreateZExt(elsize, ctx.types().T_size); + setName(ctx.emission_context, elsize, "elsize"); + return elsize; } } @@ -3066,6 +3355,7 @@ static Value *emit_genericmemorylen(jl_codectx_t &ctx, Value *addr, jl_value_t * MDBuilder MDB(ctx.builder.getContext()); auto rng = MDB.createRange(Constant::getNullValue(ctx.types().T_size), ConstantInt::get(ctx.types().T_size, genericmemoryype_maxsize(typ))); LI->setMetadata(LLVMContext::MD_range, rng); + setName(ctx.emission_context, LI, "memory_len"); return LI; } @@ -3075,7 +3365,7 @@ static Value *emit_genericmemoryptr(jl_codectx_t &ctx, Value *mem, const jl_data Value *addr = mem; addr = decay_derived(ctx, addr); addr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, addr, 1); - setName(ctx.emission_context, addr, ".data_ptr"); + setName(ctx.emission_context, addr, "memory_data_ptr"); PointerType *PPT = 
cast(ctx.types().T_jlgenericmemory->getElementType(1)); LoadInst *LI = ctx.builder.CreateAlignedLoad(PPT, addr, Align(sizeof(char*))); LI->setOrdering(AtomicOrdering::NotAtomic); @@ -3087,6 +3377,7 @@ static Value *emit_genericmemoryptr(jl_codectx_t &ctx, Value *mem, const jl_data assert(AS == AddressSpace::Loaded); ptr = ctx.builder.CreateCall(prepare_call(gc_loaded_func), { mem, ptr }); } + setName(ctx.emission_context, ptr, "memory_data"); return ptr; } @@ -3100,7 +3391,7 @@ static Value *emit_genericmemoryowner(jl_codectx_t &ctx, Value *t) LI->setMetadata(LLVMContext::MD_nonnull, MDNode::get(ctx.builder.getContext(), None)); jl_aliasinfo_t aliasinfo_mem = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryown); aliasinfo_mem.decorateInst(LI); - addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, m, JL_SMALL_BYTE_ALIGNMENT / sizeof(void*)); + addr = emit_ptrgep(ctx, m, JL_SMALL_BYTE_ALIGNMENT); Value *foreign = ctx.builder.CreateICmpNE(addr, decay_derived(ctx, LI)); return emit_guarded_test(ctx, foreign, t, [&] { addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_jlgenericmemory, m, 1); @@ -3123,16 +3414,12 @@ static void init_bits_value(jl_codectx_t &ctx, Value *newv, Value *v, MDNode *tb ai.decorateInst(ctx.builder.CreateAlignedStore(v, newv, alignment)); } -static void init_bits_cgval(jl_codectx_t &ctx, Value *newv, const jl_cgval_t& v, MDNode *tbaa) +static void init_bits_cgval(jl_codectx_t &ctx, Value *newv, const jl_cgval_t &v) { - // newv should already be tagged - if (v.ispointer()) { - unsigned align = std::max(julia_alignment(v.typ), (unsigned)sizeof(void*)); - emit_memcpy(ctx, newv, jl_aliasinfo_t::fromTBAA(ctx, tbaa), v, jl_datatype_size(v.typ), Align(align), Align(julia_alignment(v.typ))); - } - else { - init_bits_value(ctx, newv, v.V, tbaa); - } + MDNode *tbaa = jl_is_mutable(v.typ) ? 
ctx.tbaa().tbaa_mutab : ctx.tbaa().tbaa_immut; + Align newv_align{std::max(julia_alignment(v.typ), (unsigned)sizeof(void*))}; + newv = maybe_decay_tracked(ctx, newv); + emit_unbox_store(ctx, v, newv, tbaa, newv_align); } static jl_value_t *static_constant_instance(const llvm::DataLayout &DL, Constant *constant, jl_value_t *jt) @@ -3249,7 +3536,7 @@ static Value *_boxed_special(jl_codectx_t &ctx, const jl_cgval_t &vinfo, Type *t if (t == getInt1Ty(ctx.builder.getContext())) return track_pjlvalue(ctx, julia_bool(ctx, as_value(ctx, t, vinfo))); - if (ctx.linfo && jl_is_method(ctx.linfo->def.method) && !vinfo.ispointer()) { // don't bother codegen pre-boxing for toplevel + if (ctx.linfo && jl_is_method(ctx.linfo->def.method) && vinfo.inline_roots.empty() && !vinfo.ispointer()) { // don't bother codegen pre-boxing for toplevel if (Constant *c = dyn_cast(vinfo.V)) { jl_value_t *s = static_constant_instance(jl_Module->getDataLayout(), c, jt); if (s) { @@ -3356,6 +3643,8 @@ static void union_alloca_type(jl_uniontype_t *ut, }, (jl_value_t*)ut, counter); + if (align > JL_HEAP_ALIGNMENT) + align = JL_HEAP_ALIGNMENT; } static AllocaInst *try_emit_union_alloca(jl_codectx_t &ctx, jl_uniontype_t *ut, bool &allunbox, size_t &min_align, size_t &nbytes) @@ -3364,12 +3653,9 @@ static AllocaInst *try_emit_union_alloca(jl_codectx_t &ctx, jl_uniontype_t *ut, union_alloca_type(ut, allunbox, nbytes, align, min_align); if (nbytes > 0) { // at least some of the values can live on the stack - // try to pick an Integer type size such that SROA will emit reasonable code - Type *AT = ArrayType::get(IntegerType::get(ctx.builder.getContext(), 8 * min_align), (nbytes + min_align - 1) / min_align); - AllocaInst *lv = emit_static_alloca(ctx, AT); + assert(align % min_align == 0); + AllocaInst *lv = emit_static_alloca(ctx, nbytes, Align(align)); setName(ctx.emission_context, lv, "unionalloca"); - if (align > 1) - lv->setAlignment(Align(align)); return lv; } return NULL; @@ -3425,7 +3711,7 @@ static Value *box_union(jl_codectx_t &ctx, const jl_cgval_t &vinfo, const SmallB if (!box) { box = emit_allocobj(ctx, jt, true); setName(ctx.emission_context, box, "unionbox"); - init_bits_cgval(ctx, box, vinfo_r, jl_is_mutable(jt) ? ctx.tbaa().tbaa_mutab : ctx.tbaa().tbaa_immut); + init_bits_cgval(ctx, box, vinfo_r); } } tempBB = ctx.builder.GetInsertBlock(); // could have changed @@ -3548,14 +3834,14 @@ static Value *boxed(jl_codectx_t &ctx, const jl_cgval_t &vinfo, bool is_promotab box = box_union(ctx, vinfo, skip_none); } else { - assert(vinfo.V && "Missing data for unboxed value."); + assert((vinfo.V || !vinfo.inline_roots.empty()) && "Missing data for unboxed value."); assert(jl_is_concrete_immutable(jt) && "This type shouldn't have been unboxed."); Type *t = julia_type_to_llvm(ctx, jt); assert(!type_is_ghost(t)); // ghost values should have been handled by vinfo.constant above! 
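// --- editor's note (annotation, not part of the patch) ----------------------
// This hunk is the generic path of boxed() for a concrete immutable held
// unboxed. With this patch the unboxed payload may live partly in
// `vinfo.inline_roots` (hence the relaxed assert above), and the promotion
// shortcut is only taken when inline_roots is empty. When no special-case
// boxing applies, the pattern is simply "allocate, then copy the bits";
// init_bits_cgval now chooses the TBAA tag from the type's mutability itself
// and routes the copy through emit_unbox_store, so its former tbaa argument
// is gone. A minimal sketch of the resulting call pattern, as used below:
//
//     box = emit_allocobj(ctx, (jl_datatype_t*)jt, true);
//     init_bits_cgval(ctx, box, vinfo);
//
// -----------------------------------------------------------------------------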
box = _boxed_special(ctx, vinfo, t); if (!box) { bool do_promote = vinfo.promotion_point; - if (do_promote && is_promotable) { + if (do_promote && is_promotable && vinfo.inline_roots.empty()) { auto IP = ctx.builder.saveIP(); ctx.builder.SetInsertPoint(vinfo.promotion_point); box = emit_allocobj(ctx, (jl_datatype_t*)jt, true); @@ -3569,13 +3855,14 @@ static Value *boxed(jl_codectx_t &ctx, const jl_cgval_t &vinfo, bool is_promotab // end illegal IR originalAlloca->eraseFromParent(); ctx.builder.restoreIP(IP); - } else { + } + else { auto arg_typename = [&] JL_NOTSAFEPOINT { return "box::" + std::string(jl_symbol_name(((jl_datatype_t*)(jt))->name->name)); }; box = emit_allocobj(ctx, (jl_datatype_t*)jt, true); setName(ctx.emission_context, box, arg_typename); - init_bits_cgval(ctx, box, vinfo, jl_is_mutable(jt) ? ctx.tbaa().tbaa_mutab : ctx.tbaa().tbaa_immut); + init_bits_cgval(ctx, box, vinfo); } } } @@ -3588,30 +3875,25 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con if (AllocaInst *ai = dyn_cast(dest)) // TODO: make this a lifetime_end & dereferenceable annotation? ctx.builder.CreateAlignedStore(UndefValue::get(ai->getAllocatedType()), ai, ai->getAlign()); - if (jl_is_concrete_type(src.typ) || src.constant) { - jl_value_t *typ = src.constant ? jl_typeof(src.constant) : src.typ; + if (src.constant) { + jl_value_t *typ = jl_typeof(src.constant); assert(skip || jl_is_pointerfree(typ)); if (jl_is_pointerfree(typ)) { - unsigned alignment = julia_alignment(typ); - if (!src.ispointer() || src.constant) { + emit_guarded_test(ctx, skip, nullptr, [&] { + unsigned alignment = julia_alignment(typ); + emit_unbox_store(ctx, mark_julia_const(ctx, src.constant), dest, tbaa_dst, Align(alignment), isVolatile); + return nullptr; + }); + } + } + else if (jl_is_concrete_type(src.typ)) { + assert(skip || jl_is_pointerfree(src.typ)); + if (jl_is_pointerfree(src.typ)) { + emit_guarded_test(ctx, skip, nullptr, [&] { + unsigned alignment = julia_alignment(src.typ); emit_unbox_store(ctx, src, dest, tbaa_dst, Align(alignment), isVolatile); - } - else { - Value *src_ptr = data_pointer(ctx, src); - unsigned nb = jl_datatype_size(typ); - // TODO: this branch may be bad for performance, but is necessary to work around LLVM bugs with the undef option that we want to use: - // select copy dest -> dest to simulate an undef value / conditional copy - // if (skip) src_ptr = ctx.builder.CreateSelect(skip, dest, src_ptr); - auto f = [&] { - (void)emit_memcpy(ctx, dest, jl_aliasinfo_t::fromTBAA(ctx, tbaa_dst), src_ptr, - jl_aliasinfo_t::fromTBAA(ctx, src.tbaa), nb, Align(alignment), Align(alignment), isVolatile); - return nullptr; - }; - if (skip) - emit_guarded_test(ctx, skip, nullptr, f); - else - f(); - } + return nullptr; + }); } } else if (src.TIndex) { @@ -3661,17 +3943,13 @@ static void emit_unionmove(jl_codectx_t &ctx, Value *dest, MDNode *tbaa_dst, con } else { assert(src.isboxed && "expected boxed value for sizeof/alignment computation"); - auto f = [&] { + emit_guarded_test(ctx, skip, nullptr, [&] { Value *datatype = emit_typeof(ctx, src, false, false); Value *copy_bytes = emit_datatype_size(ctx, datatype); - (void)emit_memcpy(ctx, dest, jl_aliasinfo_t::fromTBAA(ctx, tbaa_dst), data_pointer(ctx, src), - jl_aliasinfo_t::fromTBAA(ctx, src.tbaa), copy_bytes, Align(1), Align(1), isVolatile); + emit_memcpy(ctx, dest, jl_aliasinfo_t::fromTBAA(ctx, tbaa_dst), data_pointer(ctx, src), + jl_aliasinfo_t::fromTBAA(ctx, src.tbaa), copy_bytes, Align(1), Align(1), isVolatile); return nullptr; - }; 
- if (skip) - emit_guarded_test(ctx, skip, nullptr, f); - else - f(); + }); } } @@ -3760,6 +4038,12 @@ static void emit_write_multibarrier(jl_codectx_t &ctx, Value *parent, Value *agg emit_write_barrier(ctx, parent, ptrs); } +static void emit_write_multibarrier(jl_codectx_t &ctx, Value *parent, const jl_cgval_t &x) +{ + auto ptrs = get_gc_roots_for(ctx, x, true); + emit_write_barrier(ctx, parent, ptrs); +} + static jl_cgval_t union_store(jl_codectx_t &ctx, Value *ptr, Value *ptindex, jl_cgval_t rhs, jl_cgval_t cmp, jl_value_t *jltype, MDNode *tbaa, MDNode *tbaa_tindex, @@ -3799,6 +4083,12 @@ static jl_cgval_t union_store(jl_codectx_t &ctx, rhs = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); } else { + if (trim_may_error(ctx.params->trim)) { + // if we know the return type, we can assume the result is of that type + errs() << "ERROR: Dynamic call to setfield/modifyfield\n"; + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); rhs = mark_julia_type(ctx, callval, true, jl_any_type); } @@ -3862,19 +4152,14 @@ static jl_cgval_t emit_setfield(jl_codectx_t &ctx, auto tbaa = best_field_tbaa(ctx, strct, sty, idx0, byte_offset); Value *addr = data_pointer(ctx, strct); if (byte_offset > 0) { - addr = ctx.builder.CreateInBoundsGEP( - getInt8Ty(ctx.builder.getContext()), - addr, - ConstantInt::get(ctx.types().T_size, byte_offset)); + addr = emit_ptrgep(ctx, addr, byte_offset); setNameWithField(ctx.emission_context, addr, get_objname, sty, idx0, Twine("_ptr")); } jl_value_t *jfty = jl_field_type(sty, idx0); bool isboxed = jl_field_isptr(sty, idx0); if (!isboxed && jl_is_uniontype(jfty)) { size_t fsz1 = jl_field_size(sty, idx0) - 1; - Value *ptindex = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), - addr, - ConstantInt::get(ctx.types().T_size, fsz1)); + Value *ptindex = emit_ptrgep(ctx, addr, fsz1); setNameWithField(ctx.emission_context, ptindex, get_objname, sty, idx0, Twine(".tindex_ptr")); return union_store(ctx, addr, ptindex, rhs, cmp, jfty, tbaa, ctx.tbaa().tbaa_unionselbyte, Order, FailOrder, @@ -3905,25 +4190,24 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg Type *lt = julia_type_to_llvm(ctx, ty); unsigned na = nargs < nf ? nargs : nf; - // whether we should perform the initialization with the struct as a IR value - // or instead initialize the stack buffer with stores - auto tracked = CountTrackedPointers(lt); + // choose whether we should perform the initialization with the struct as a IR value + // or instead initialize the stack buffer with stores (the later is nearly always better) + auto tracked = split_value_size(sty); + assert(CountTrackedPointers(lt).count == tracked.second); bool init_as_value = false; if (lt->isVectorTy() || jl_is_vecelement_type(ty)) { // maybe also check the size ? 
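// --- editor's note (annotation, not part of the patch) ----------------------
// In the emit_new_struct changes below, a struct containing GC-tracked
// pointers no longer forces `init_as_value` (see the removed
// `else if (tracked.count)` branch). Instead, split_value_size /
// split_value_field (introduced elsewhere in this patch, not shown in this
// hunk) describe a "split" layout: pointer fields are kept in `inline_roots`
// (SSA values of T_prjlvalue, visible to GC root placement), while the plain
// bits live in a small stack alloca. Illustrative reading, assuming a
// hypothetical Julia struct
//
//     struct Foo; x::Any; y::Int64; end
//
//     split_value_field(Foo, 0)  ~>  (offset = -1, root index = 0)   // x -> inline_roots[0]
//     split_value_field(Foo, 1)  ~>  (offset =  0, root index = -1)  // y -> bits alloca, byte 0
//
// so the resulting jl_cgval_t carries both a (possibly null) bits pointer `V`
// and the `inline_roots` array, matching the extended mark_julia_slot below.
// -----------------------------------------------------------------------------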
init_as_value = true; } - else if (tracked.count) { - init_as_value = true; - } Instruction *promotion_point = nullptr; ssize_t promotion_ssa = -1; Value *strct; + SmallVector inline_roots; if (type_is_ghost(lt)) { - strct = NULL; + strct = nullptr; } else if (init_as_value) { - if (tracked.count) { + if (tracked.second) { strct = Constant::getNullValue(lt); } else { @@ -3932,11 +4216,19 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg strct = ctx.builder.CreateFreeze(strct); } } + else if (tracked.second) { + inline_roots.resize(tracked.second, Constant::getNullValue(ctx.types().T_prjlvalue)); + strct = nullptr; + if (tracked.first) { + AllocaInst *bits = emit_static_alloca(ctx, tracked.first, Align(julia_alignment(ty))); + strct = bits; + setName(ctx.emission_context, bits, arg_typename); + is_promotable = false; // wrong layout for promotion + } + } else { - strct = emit_static_alloca(ctx, lt); + strct = emit_static_alloca(ctx, lt, Align(julia_alignment(ty))); setName(ctx.emission_context, strct, arg_typename); - if (tracked.count) - undef_derived_strct(ctx, strct, sty, ctx.tbaa().tbaa_stack); } for (unsigned i = 0; i < na; i++) { @@ -3948,26 +4240,33 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg fval_info = update_julia_type(ctx, fval_info, jtype); if (fval_info.typ == jl_bottom_type) return jl_cgval_t(); + if (type_is_ghost(lt)) + continue; + Type *fty = julia_type_to_llvm(ctx, jtype); + if (type_is_ghost(fty)) + continue; + Instruction *dest = nullptr; + MutableArrayRef roots; + ssize_t offs = jl_field_offset(sty, i); + ssize_t ptrsoffs = -1; + if (!inline_roots.empty()) + std::tie(offs, ptrsoffs) = split_value_field(sty, i); + unsigned llvm_idx = init_as_value ? ((i > 0 && isa(lt)) ? convert_struct_offset(ctx, lt, offs) : i) : -1u; // TODO: Use (post-)domination instead. bool field_promotable = !jl_is_uniontype(jtype) && !init_as_value && fval_info.promotion_ssa != -1 && + fval_info.inline_roots.empty() && inline_roots.empty() && // these need to be compatible, if they were to be implemented fval_info.promotion_point && fval_info.promotion_point->getParent() == ctx.builder.GetInsertBlock(); if (field_promotable) { savedIP = ctx.builder.saveIP(); ctx.builder.SetInsertPoint(fval_info.promotion_point); } - if (type_is_ghost(lt)) - continue; - Type *fty = julia_type_to_llvm(ctx, jtype); - if (type_is_ghost(fty)) - continue; - Value *dest = NULL; - unsigned offs = jl_field_offset(sty, i); - unsigned llvm_idx = (i > 0 && isa(lt)) ? convert_struct_offset(ctx, lt, offs) : i; if (!init_as_value) { // avoid unboxing the argument explicitly // and use memcpy instead - Instruction *inst; - dest = inst = cast(ctx.builder.CreateConstInBoundsGEP2_32(lt, strct, 0, llvm_idx)); + Instruction *inst = strct && offs >= 0 ? cast(emit_ptrgep(ctx, strct, offs)) : nullptr; + if (!inline_roots.empty() && ptrsoffs >= 0) + roots = MutableArrayRef(inline_roots).slice(ptrsoffs, jl_field_isptr(sty, i) ? 
1 : ((jl_datatype_t*)jtype)->layout->npointers); + dest = inst; // Our promotion point needs to come before // A) All of our arguments' promotion points // B) Any instructions we insert at any of our arguments' promotion points @@ -3987,10 +4286,13 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg if (jl_field_isptr(sty, i)) { fval = boxed(ctx, fval_info, field_promotable); if (!init_as_value) { - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); - StoreInst *SI = cast(ai.decorateInst( - ctx.builder.CreateAlignedStore(fval, dest, Align(jl_field_align(sty, i))))); - SI->setOrdering(AtomicOrdering::Unordered); + if (dest) { + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); + ai.decorateInst(ctx.builder.CreateAlignedStore(fval, dest, Align(jl_field_align(sty, i)))); + } + else { + roots[0] = fval; + } } } else if (jl_is_uniontype(jtype)) { @@ -4013,23 +4315,22 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg if (fsz1 > 0 && !fval_info.isghost) { Type *ET = IntegerType::get(ctx.builder.getContext(), 8 * al); assert(lt->getStructElementType(llvm_idx) == ET); - AllocaInst *lv = emit_static_alloca(ctx, ET); + AllocaInst *lv = emit_static_alloca(ctx, fsz1, Align(al)); setName(ctx.emission_context, lv, "unioninit"); - lv->setOperand(0, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), (fsz1 + al - 1) / al)); emit_unionmove(ctx, lv, ctx.tbaa().tbaa_stack, fval_info, nullptr); // emit all of the align-sized words unsigned i = 0; for (; i < fsz1 / al; i++) { - Value *fldp = ctx.builder.CreateConstInBoundsGEP1_32(ET, lv, i); + Value *fldp = emit_ptrgep(ctx, lv, i * al); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); Value *fldv = ai.decorateInst(ctx.builder.CreateAlignedLoad(ET, fldp, Align(al))); strct = ctx.builder.CreateInsertValue(strct, fldv, ArrayRef(llvm_idx + i)); } // emit remaining bytes up to tindex if (i < ptindex - llvm_idx) { - Value *staddr = ctx.builder.CreateConstInBoundsGEP1_32(ET, lv, i); + Value *staddr = emit_ptrgep(ctx, lv, i * al); for (; i < ptindex - llvm_idx; i++) { - Value *fldp = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), staddr, i); + Value *fldp = emit_ptrgep(ctx, staddr, i); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); Value *fldv = ai.decorateInst(ctx.builder.CreateAlignedLoad(getInt8Ty(ctx.builder.getContext()), fldp, Align(1))); strct = ctx.builder.CreateInsertValue(strct, fldv, ArrayRef(llvm_idx + i)); @@ -4042,7 +4343,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg fval = ctx.builder.CreateInsertValue(strct, fval, ArrayRef(llvm_idx)); } else { - Value *ptindex = emit_struct_gep(ctx, lt, strct, offs + fsz1); + Value *ptindex = emit_ptrgep(ctx, strct, offs + fsz1); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_unionselbyte); ai.decorateInst(ctx.builder.CreateAlignedStore(tindex, ptindex, Align(1))); if (!rhs_union.isghost) @@ -4053,9 +4354,14 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg if (field_promotable) { fval_info.V->replaceAllUsesWith(dest); cast(fval_info.V)->eraseFromParent(); - } else if (init_as_value) { + } + else if (init_as_value) { fval = emit_unbox(ctx, fty, fval_info, jtype); - } else { + } + else if (!roots.empty()) { + split_value_into(ctx, fval_info, Align(julia_alignment(jtype)), dest, Align(jl_field_align(sty, i)), 
jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), roots); + } + else { emit_unbox_store(ctx, fval_info, dest, ctx.tbaa().tbaa_stack, Align(jl_field_align(sty, i))); } } @@ -4076,26 +4382,35 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg } for (size_t i = nargs; i < nf; i++) { if (!jl_field_isptr(sty, i) && jl_is_uniontype(jl_field_type(sty, i))) { - unsigned offs = jl_field_offset(sty, i); + ssize_t offs = jl_field_offset(sty, i); + ssize_t ptrsoffs = -1; + if (!inline_roots.empty()) + std::tie(offs, ptrsoffs) = split_value_field(sty, i); + assert(ptrsoffs < 0 && offs >= 0); int fsz = jl_field_size(sty, i) - 1; - unsigned llvm_idx = convert_struct_offset(ctx, cast(lt), offs + fsz); - if (init_as_value) + if (init_as_value) { + unsigned llvm_idx = convert_struct_offset(ctx, cast(lt), offs + fsz); strct = ctx.builder.CreateInsertValue(strct, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0), ArrayRef(llvm_idx)); + } else { jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_unionselbyte); - ai.decorateInst(ctx.builder.CreateAlignedStore( - ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0), - ctx.builder.CreateConstInBoundsGEP2_32(lt, strct, 0, llvm_idx), - Align(1))); + Instruction *dest = cast(emit_ptrgep(ctx, strct, offs + fsz)); + if (promotion_point == nullptr) + promotion_point = dest; + ai.decorateInst(ctx.builder.CreateAlignedStore(ctx.builder.getInt8(0), dest, Align(1))); } } } - if (promotion_point && nargs < nf) { + if (nargs < nf) { assert(!init_as_value); IRBuilderBase::InsertPoint savedIP = ctx.builder.saveIP(); - ctx.builder.SetInsertPoint(promotion_point); - promotion_point = cast(ctx.builder.CreateFreeze(UndefValue::get(lt))); - ctx.builder.CreateStore(promotion_point, strct); + if (promotion_point) + ctx.builder.SetInsertPoint(promotion_point); + if (strct) { + promotion_point = cast(ctx.builder.CreateFreeze(UndefValue::get(lt))); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); + ai.decorateInst(ctx.builder.CreateStore(promotion_point, strct)); + } ctx.builder.restoreIP(savedIP); } if (type_is_ghost(lt)) @@ -4103,7 +4418,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg else if (init_as_value) return mark_julia_type(ctx, strct, false, ty); else { - jl_cgval_t ret = mark_julia_slot(strct, ty, NULL, ctx.tbaa().tbaa_stack); + jl_cgval_t ret = mark_julia_slot(strct, ty, NULL, ctx.tbaa().tbaa_stack, inline_roots); if (is_promotable && promotion_point) { ret.promotion_point = promotion_point; ret.promotion_ssa = promotion_ssa; @@ -4121,8 +4436,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_unionselbyte); ai.decorateInst(ctx.builder.CreateAlignedStore( ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0), - ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), strct, - ConstantInt::get(ctx.types().T_size, jl_field_offset(sty, i) + jl_field_size(sty, i) - 1)), + emit_ptrgep(ctx, strct, jl_field_offset(sty, i) + jl_field_size(sty, i) - 1), Align(1))); } } @@ -4164,9 +4478,7 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) { ++EmittedDeferSignal; Value *ptls = get_current_ptls(ctx); - Constant *offset = ConstantInt::getSigned(getInt32Ty(ctx.builder.getContext()), - offsetof(jl_tls_states_t, defer_signal) / sizeof(sig_atomic_t)); - return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef(offset), 
"jl_defer_signal"); + return emit_ptrgep(ctx, ptls, offsetof(jl_tls_states_t, defer_signal)); } #ifndef JL_NDEBUG @@ -4195,6 +4507,7 @@ static jl_cgval_t _emit_memoryref(jl_codectx_t &ctx, Value *mem, Value *data, co Value *ref = Constant::getNullValue(get_memoryref_type(ctx.builder.getContext(), ctx.types().T_size, layout, 0)); ref = ctx.builder.CreateInsertValue(ref, data, 0); ref = ctx.builder.CreateInsertValue(ref, mem, 1); + setName(ctx.emission_context, ref, "memory_ref"); return mark_julia_type(ctx, ref, false, typ); } @@ -4209,12 +4522,26 @@ static jl_cgval_t _emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &mem, cons static Value *emit_memoryref_FCA(jl_codectx_t &ctx, const jl_cgval_t &ref, const jl_datatype_layout_t *layout) { - if (ref.ispointer()) { + if (!ref.inline_roots.empty()) { + LLVMContext &C = ctx.builder.getContext(); + StructType *type = get_memoryref_type(C, ctx.types().T_size, layout, 0); + LoadInst *load0 = ctx.builder.CreateLoad(type->getElementType(0), ref.V); + jl_aliasinfo_t ai0 = jl_aliasinfo_t::fromTBAA(ctx, ref.tbaa); + ai0.decorateInst(load0); + setName(ctx.emission_context, load0, "memory_ref_FCA0"); + Value *root = ctx.builder.CreateBitCast(ref.inline_roots[0], type->getElementType(1)); + Value *load = Constant::getNullValue(type); + load = ctx.builder.CreateInsertValue(load, load0, 0); + load = ctx.builder.CreateInsertValue(load, root, 1); + return load; + } + else if (ref.ispointer()) { LLVMContext &C = ctx.builder.getContext(); Type *type = get_memoryref_type(C, ctx.types().T_size, layout, 0); LoadInst *load = ctx.builder.CreateLoad(type, data_pointer(ctx, ref)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ref.tbaa); ai.decorateInst(load); + setName(ctx.emission_context, load, "memory_ref_FCA"); return load; } else { @@ -4231,9 +4558,12 @@ static jl_cgval_t emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &ref, jl_cg return jl_cgval_t(); Value *V = emit_memoryref_FCA(ctx, ref, layout); Value *data = CreateSimplifiedExtractValue(ctx, V, 0); + maybeSetName(ctx.emission_context, data, "memoryref_data"); Value *mem = CreateSimplifiedExtractValue(ctx, V, 1); + maybeSetName(ctx.emission_context, mem, "memoryref_mem"); Value *i = emit_unbox(ctx, ctx.types().T_size, idx, (jl_value_t*)jl_long_type); Value *offset = ctx.builder.CreateSub(i, ConstantInt::get(ctx.types().T_size, 1)); + setName(ctx.emission_context, offset, "memoryref_offset"); Value *elsz = emit_genericmemoryelsize(ctx, mem, ref.typ, false); bool bc = bounds_check_enabled(ctx, inbounds); #if 1 @@ -4245,12 +4575,14 @@ static jl_cgval_t emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &ref, jl_cg bool isghost = layout->size == 0; if ((!isboxed && isunion) || isghost) { newdata = ctx.builder.CreateAdd(data, offset); + setName(ctx.emission_context, newdata, "memoryref_data+offset"); if (bc) { BasicBlock *failBB, *endBB; failBB = BasicBlock::Create(ctx.builder.getContext(), "oob"); endBB = BasicBlock::Create(ctx.builder.getContext(), "idxend"); Value *mlen = emit_genericmemorylen(ctx, mem, ref.typ); Value *inbound = ctx.builder.CreateICmpULT(newdata, mlen); + setName(ctx.emission_context, offset, "memoryref_isinbounds"); ctx.builder.CreateCondBr(inbound, endBB, failBB); failBB->insertInto(ctx.f); ctx.builder.SetInsertPoint(failBB); @@ -4278,10 +4610,13 @@ static jl_cgval_t emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &ref, jl_cg // and we can further rearrange that as ovflw = !( offset+len < len+len ) as unsigned math Value *mlen = emit_genericmemorylen(ctx, mem, ref.typ); ovflw = 
ctx.builder.CreateICmpUGE(ctx.builder.CreateAdd(offset, mlen), ctx.builder.CreateNUWAdd(mlen, mlen)); + setName(ctx.emission_context, ovflw, "memoryref_ovflw"); } #endif boffset = ctx.builder.CreateMul(offset, elsz); - newdata = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), data, boffset); + setName(ctx.emission_context, boffset, "memoryref_byteoffset"); + newdata = ctx.builder.CreateGEP(getInt8Ty(ctx.builder.getContext()), data, boffset); + setName(ctx.emission_context, newdata, "memoryref_data_byteoffset"); (void)boffset; // LLVM is very bad at handling GEP with types different from the load if (bc) { BasicBlock *failBB, *endBB; @@ -4304,8 +4639,11 @@ static jl_cgval_t emit_memoryref(jl_codectx_t &ctx, const jl_cgval_t &ref, jl_cg ctx.builder.CreatePtrToInt(newdata, ctx.types().T_size), ctx.builder.CreatePtrToInt(mptr, ctx.types().T_size)); Value *blen = ctx.builder.CreateMul(mlen, elsz, "", true, true); + setName(ctx.emission_context, blen, "memoryref_bytelen"); Value *inbound = ctx.builder.CreateICmpULT(bidx0, blen); + setName(ctx.emission_context, inbound, "memoryref_isinbounds"); inbound = ctx.builder.CreateAnd(ctx.builder.CreateNot(ovflw), inbound); + setName(ctx.emission_context, inbound, "memoryref_isinbounds¬ovflw"); #else Value *idx0; // (newdata - mptr) / elsz idx0 = ctx.builder.CreateSub( @@ -4342,8 +4680,10 @@ static jl_cgval_t emit_memoryref_offset(jl_codectx_t &ctx, const jl_cgval_t &ref offset = ctx.builder.CreateSub( ctx.builder.CreatePtrToInt(data, ctx.types().T_size), ctx.builder.CreatePtrToInt(mptr, ctx.types().T_size)); + setName(ctx.emission_context, offset, "memoryref_offset"); Value *elsz = emit_genericmemoryelsize(ctx, mem, ref.typ, false); offset = ctx.builder.CreateExactUDiv(offset, elsz); + setName(ctx.emission_context, offset, "memoryref_offsetidx"); } offset = ctx.builder.CreateAdd(offset, ConstantInt::get(ctx.types().T_size, 1)); return mark_julia_type(ctx, offset, false, jl_long_type); @@ -4352,7 +4692,9 @@ static jl_cgval_t emit_memoryref_offset(jl_codectx_t &ctx, const jl_cgval_t &ref static Value *emit_memoryref_mem(jl_codectx_t &ctx, const jl_cgval_t &ref, const jl_datatype_layout_t *layout) { Value *V = emit_memoryref_FCA(ctx, ref, layout); - return CreateSimplifiedExtractValue(ctx, V, 1); + V = CreateSimplifiedExtractValue(ctx, V, 1); + maybeSetName(ctx.emission_context, V, "memoryref_mem"); + return V; } static Value *emit_memoryref_ptr(jl_codectx_t &ctx, const jl_cgval_t &ref, const jl_datatype_layout_t *layout) @@ -4374,13 +4716,15 @@ static Value *emit_memoryref_ptr(jl_codectx_t &ctx, const jl_cgval_t &ref, const data = ctx.builder.CreateCall(prepare_call(gc_loaded_func), { mem, data }); if (!GEPlist.empty()) { for (auto &GEP : make_range(GEPlist.rbegin(), GEPlist.rend())) { - Instruction *GEP2 = GEP->clone(); + GetElementPtrInst *GEP2 = cast(GEP->clone()); GEP2->mutateType(PointerType::get(GEP->getResultElementType(), AS)); GEP2->setOperand(GetElementPtrInst::getPointerOperandIndex(), data); + GEP2->setIsInBounds(true); ctx.builder.Insert(GEP2); data = GEP2; } } + setName(ctx.emission_context, data, "memoryref_data"); return data; } diff --git a/src/codegen-stubs.c b/src/codegen-stubs.c index 41812d903816c..7ddb68fd6b036 100644 --- a/src/codegen-stubs.c +++ b/src/codegen-stubs.c @@ -15,6 +15,7 @@ JL_DLLEXPORT void jl_dump_native_fallback(void *native_code, ios_t *z, ios_t *s) UNAVAILABLE JL_DLLEXPORT void jl_get_llvm_gvs_fallback(void *native_code, arraylist_t *gvs) UNAVAILABLE JL_DLLEXPORT void 
jl_get_llvm_external_fns_fallback(void *native_code, arraylist_t *gvs) UNAVAILABLE +JL_DLLEXPORT void jl_get_llvm_mis_fallback(void *native_code, arraylist_t* MIs) UNAVAILABLE JL_DLLEXPORT void jl_extern_c_fallback(jl_function_t *f, jl_value_t *rt, jl_value_t *argt, char *name) UNAVAILABLE JL_DLLEXPORT jl_value_t *jl_dump_method_asm_fallback(jl_method_instance_t *linfo, size_t world, diff --git a/src/codegen.cpp b/src/codegen.cpp index 4091ec6c03db0..bcda527416676 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include // target machine computation @@ -171,6 +172,14 @@ void setName(jl_codegen_params_t ¶ms, Value *V, const Twine &Name) } } +void maybeSetName(jl_codegen_params_t ¶ms, Value *V, const Twine &Name) +{ + // To be used when we may get an Instruction or something that is not an instruction i.e Constants/Arguments + if (params.debug_level >= 2 && isa(V)) { + V->setName(Name); + } +} + void setName(jl_codegen_params_t ¶ms, Value *V, std::function GetName) { assert((isa(V) || isa(V)) && "Should only set names on instructions!"); @@ -1611,6 +1620,7 @@ static const auto &builtin_func_map() { { jl_f__call_in_world_addr, new JuliaFunction<>{XSTR(jl_f__call_in_world), get_func_sig, get_func_attrs} }, { jl_f__call_in_world_total_addr, new JuliaFunction<>{XSTR(jl_f__call_in_world_total), get_func_sig, get_func_attrs} }, { jl_f_throw_addr, new JuliaFunction<>{XSTR(jl_f_throw), get_func_sig, get_func_attrs} }, + { jl_f_throw_methoderror_addr, new JuliaFunction<>{XSTR(jl_f_throw_methoderror), get_func_sig, get_func_attrs} }, { jl_f_tuple_addr, jltuple_func }, { jl_f_svec_addr, new JuliaFunction<>{XSTR(jl_f_svec), get_func_sig, get_func_attrs} }, { jl_f_applicable_addr, new JuliaFunction<>{XSTR(jl_f_applicable), get_func_sig, get_func_attrs} }, @@ -1636,36 +1646,29 @@ static const auto &builtin_func_map() { { jl_f_donotdelete_addr, new JuliaFunction<>{XSTR(jl_f_donotdelete), get_donotdelete_sig, get_donotdelete_func_attrs} }, { jl_f_compilerbarrier_addr, new JuliaFunction<>{XSTR(jl_f_compilerbarrier), get_func_sig, get_func_attrs} }, { jl_f_finalizer_addr, new JuliaFunction<>{XSTR(jl_f_finalizer), get_func_sig, get_func_attrs} }, - { jl_f__svec_ref_addr, new JuliaFunction<>{XSTR(jl_f__svec_ref), get_func_sig, get_func_attrs} } + { jl_f__svec_ref_addr, new JuliaFunction<>{XSTR(jl_f__svec_ref), get_func_sig, get_func_attrs} }, + { jl_f_current_scope_addr, new JuliaFunction<>{XSTR(jl_f_current_scope), get_func_sig, get_func_attrs} }, }; return builtins; } +static const auto &may_dispatch_builtins() { + static std::unordered_set builtins( + {jl_f__apply_iterate_addr, + jl_f__apply_pure_addr, + jl_f__call_in_world_addr, + jl_f__call_in_world_total_addr, + jl_f__call_latest_addr, + }); + return builtins; +} + static const auto jl_new_opaque_closure_jlcall_func = new JuliaFunction<>{XSTR(jl_new_opaque_closure_jlcall), get_func_sig, get_func_attrs}; static _Atomic(uint64_t) globalUniqueGeneratedNames{1}; // --- code generation --- -extern "C" { - jl_cgparams_t jl_default_cgparams = { - /* track_allocations */ 1, - /* code_coverage */ 1, - /* prefer_specsig */ 0, -#ifdef _OS_WINDOWS_ - /* gnu_pubnames */ 0, -#else - /* gnu_pubnames */ 1, -#endif - /* debug_info_kind */ (int) DICompileUnit::DebugEmissionKind::FullDebug, - /* debug_line_info */ 1, - /* safepoint_on_entry */ 1, - /* gcstack_arg */ 1, - /* use_jlplt*/ 1, - /* lookup */ jl_rettype_inferred_addr }; -} - - static MDNode *best_tbaa(jl_tbaacache_t &tbaa_cache, jl_value_t *jt) { jt = 
jl_unwrap_unionall(jt); if (jt == (jl_value_t*)jl_datatype_type || @@ -1806,11 +1809,12 @@ struct jl_cgval_t { Value *Vboxed; Value *TIndex; // if `V` is an unboxed (tagged) Union described by `typ`, this gives the DataType index (1-based, small int) as an i8 + SmallVector inline_roots; // if present, `V` is a pointer, but not in canonical layout jl_value_t *constant; // constant value (rooted in linfo.def.roots) - jl_value_t *typ; // the original type of V, never NULL + jl_value_t *typ; // the original type of V, never nullptr bool isboxed; // whether this value is a jl_value_t* allocated on the heap with the right type tag bool isghost; // whether this value is "ghost" - MDNode *tbaa; // The related tbaa node. Non-NULL iff this holds an address. + MDNode *tbaa; // The related tbaa node. Non-nullptr iff this holds an address. // If non-null, this memory location may be promoted on use, by hoisting the // destination memory above the promotion point. Instruction *promotion_point; @@ -1821,13 +1825,15 @@ struct jl_cgval_t { bool ispointer() const { // whether this value is compatible with `data_pointer` + assert(inline_roots.empty()); return tbaa != nullptr; } jl_cgval_t(Value *Vval, jl_value_t *typ, Value *tindex) : // general value constructor - V(Vval), // V is allowed to be NULL in a jl_varinfo_t context, but not during codegen contexts + V(Vval), // V is allowed to be nullptr in a jl_varinfo_t context, but not during codegen contexts Vboxed(nullptr), TIndex(tindex), - constant(NULL), + inline_roots(), + constant(nullptr), typ(typ), isboxed(false), isghost(false), @@ -1835,13 +1841,15 @@ struct jl_cgval_t { promotion_point(nullptr), promotion_ssa(-1) { - assert(TIndex == NULL || TIndex->getType() == getInt8Ty(TIndex->getContext())); + assert(TIndex == nullptr || TIndex->getType() == getInt8Ty(TIndex->getContext())); } - jl_cgval_t(Value *Vptr, bool isboxed, jl_value_t *typ, Value *tindex, MDNode *tbaa) : // general pointer constructor + jl_cgval_t(Value *Vptr, bool isboxed, jl_value_t *typ, Value *tindex, MDNode *tbaa, Value* inline_roots) = delete; + jl_cgval_t(Value *Vptr, bool isboxed, jl_value_t *typ, Value *tindex, MDNode *tbaa, ArrayRef inline_roots) : // general pointer constructor V(Vptr), Vboxed(isboxed ? 
Vptr : nullptr), TIndex(tindex), - constant(NULL), + inline_roots(inline_roots), + constant(nullptr), typ(typ), isboxed(isboxed), isghost(false), @@ -1851,15 +1859,16 @@ struct jl_cgval_t { { if (Vboxed) assert(Vboxed->getType() == JuliaType::get_prjlvalue_ty(Vboxed->getContext())); - assert(tbaa != NULL); - assert(!(isboxed && TIndex != NULL)); - assert(TIndex == NULL || TIndex->getType() == getInt8Ty(TIndex->getContext())); + assert(tbaa != nullptr); + assert(!(isboxed && TIndex != nullptr)); + assert(TIndex == nullptr || TIndex->getType() == getInt8Ty(TIndex->getContext())); } explicit jl_cgval_t(jl_value_t *typ) : // ghost value constructor - // mark explicit to avoid being used implicitly for conversion from NULL (use jl_cgval_t() instead) - V(NULL), - Vboxed(NULL), - TIndex(NULL), + // mark explicit to avoid being used implicitly for conversion from nullptr (use jl_cgval_t() instead) + V(nullptr), + Vboxed(nullptr), + TIndex(nullptr), + inline_roots(), constant(((jl_datatype_t*)typ)->instance), typ(typ), isboxed(false), @@ -1875,6 +1884,7 @@ struct jl_cgval_t { V(v.V), Vboxed(v.Vboxed), TIndex(tindex), + inline_roots(v.inline_roots), constant(v.constant), typ(typ), isboxed(v.isboxed), @@ -1888,17 +1898,18 @@ struct jl_cgval_t { // this constructor expects we had a badly or equivalently typed version // make sure we aren't discarding the actual type information if (v.TIndex) { - assert((TIndex == NULL) == jl_is_concrete_type(typ)); + assert((TIndex == nullptr) == jl_is_concrete_type(typ)); } else { assert(isboxed || v.typ == typ || tindex); } } explicit jl_cgval_t() : // undef / unreachable constructor - V(NULL), - Vboxed(NULL), - TIndex(NULL), - constant(NULL), + V(nullptr), + Vboxed(nullptr), + TIndex(nullptr), + inline_roots(), + constant(nullptr), typ(jl_bottom_type), isboxed(false), isghost(true), @@ -1914,6 +1925,7 @@ struct jl_varinfo_t { Instruction *boxroot; // an address, if the var might be in a jl_value_t** stack slot (marked ctx.tbaa().tbaa_const, if appropriate) jl_cgval_t value; // a stack slot or constant value Value *pTIndex; // i8* stack slot for the value.TIndex tag describing `value.V` + AllocaInst *inline_roots; // stack roots for the inline_roots array, if needed DILocalVariable *dinfo; // if the variable might be used undefined and is not boxed // this i1 flag is true when it is defined @@ -1924,11 +1936,12 @@ struct jl_varinfo_t { bool usedUndef; bool used; - jl_varinfo_t(LLVMContext &ctxt) : boxroot(NULL), + jl_varinfo_t(LLVMContext &ctxt) : boxroot(nullptr), value(jl_cgval_t()), - pTIndex(NULL), - dinfo(NULL), - defFlag(NULL), + pTIndex(nullptr), + inline_roots(nullptr), + dinfo(nullptr), + defFlag(nullptr), isSA(false), isVolatile(false), isArgument(false), @@ -1952,7 +1965,7 @@ class jl_codectx_t { std::map phic_slots; std::map > scope_restore; SmallVector SAvalues; - SmallVector, 0> PhiNodes; + SmallVector, jl_value_t *>, 0> PhiNodes; SmallVector ssavalue_assigned; SmallVector ssavalue_usecount; jl_module_t *module = NULL; @@ -1967,7 +1980,7 @@ class jl_codectx_t { size_t max_world = -1; const char *name = NULL; StringRef file{}; - ssize_t *line = NULL; + int32_t line = -1; Value *spvals_ptr = NULL; Value *argArray = NULL; Value *argCount = NULL; @@ -2100,7 +2113,8 @@ jl_aliasinfo_t jl_aliasinfo_t::fromTBAA(jl_codectx_t &ctx, MDNode *tbaa) { } static Type *julia_type_to_llvm(jl_codectx_t &ctx, jl_value_t *jt, bool *isboxed = NULL); -static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value *fval, StringRef name, jl_value_t *sig, 
jl_value_t *jlrettype, bool is_opaque_closure, bool gcstack_arg, BitVector *used_arguments=nullptr, size_t *args_begin=nullptr); +static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value *fval, StringRef name, jl_value_t *sig, jl_value_t *jlrettype, bool is_opaque_closure, bool gcstack_arg, + ArrayRef ArgNames=None, unsigned nreq=0); static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr, ssize_t ssaval = -1); static Value *global_binding_pointer(jl_codectx_t &ctx, jl_module_t *m, jl_sym_t *s, jl_binding_t **pbnd, bool assign, bool alloc); @@ -2109,6 +2123,7 @@ static jl_cgval_t emit_sparam(jl_codectx_t &ctx, size_t i); static Value *emit_condition(jl_codectx_t &ctx, const jl_cgval_t &condV, const Twine &msg); static Value *get_current_task(jl_codectx_t &ctx); static Value *get_current_ptls(jl_codectx_t &ctx); +static Value *get_scope_field(jl_codectx_t &ctx); static Value *get_tls_world_age_field(jl_codectx_t &ctx); static void CreateTrap(IRBuilder<> &irbuilder, bool create_new_block = true); static CallInst *emit_jlcall(jl_codectx_t &ctx, FunctionCallee theFptr, Value *theF, @@ -2122,6 +2137,180 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR static Value *literal_pointer_val(jl_codectx_t &ctx, jl_value_t *p); static unsigned julia_alignment(jl_value_t *jt); +static void recombine_value(jl_codectx_t &ctx, const jl_cgval_t &x, Value *dst, jl_aliasinfo_t const &dst_ai, Align alignment, bool isVolatile); + +static void print_stack_crumbs(jl_codectx_t &ctx) +{ + errs() << "\n"; + errs() << "Stacktrace:\n"; + jl_method_instance_t *caller = ctx.linfo; + jl_((jl_value_t*)caller); + errs() << "In " << ctx.file << ":" << ctx.line << "\n"; + while (true) { + auto it = ctx.emission_context.enqueuers.find(caller); + if (it != ctx.emission_context.enqueuers.end()) { + caller = std::get(it->second); + } else { + break; + } + if (caller) { + if (jl_is_method_instance(caller)) { + for (auto it2 = std::get(it->second).begin(); it2 != (std::prev(std::get(it->second).end())); ++it2) { + auto frame = *it2; + errs() << std::get<0>(frame) << " \n"; + errs() << "In " << std::get<1>(frame) << ":" << std::get(frame) << "\n"; + } + auto &frame = std::get(it->second).front(); + jl_((jl_value_t*)caller); + errs() << "In " << std::get<1>(frame) << ":" << std::get(frame) << "\n"; + } + } + else + break; + } + abort(); +} + +static jl_value_t *StackFrame( + jl_value_t *linfo, + std::string fn_name, + std::string filepath, + int32_t lineno, + jl_value_t *inlined) +{ + jl_value_t *StackFrame = jl_get_global(jl_base_module, jl_symbol("StackFrame")); + assert(StackFrame != nullptr); + + jl_value_t *args[7] = { + /* func */ (jl_value_t *)jl_symbol(fn_name.c_str()), + /* line */ (jl_value_t *)jl_symbol(filepath.c_str()), + /* line */ jl_box_int32(lineno), + /* linfo */ (jl_value_t *)linfo, + /* from_c */ jl_false, + /* inlined */ inlined, + /* pointer */ jl_box_uint64(0) + }; + + jl_value_t *frame = nullptr; + JL_TRY { + frame = jl_apply_generic(StackFrame, args, 7); + } JL_CATCH { + jl_safe_printf("Error creating stack frame\n"); + } + return frame; +} + +static void push_frames(jl_codectx_t &ctx, jl_method_instance_t *caller, jl_method_instance_t *callee, int no_debug=false) +{ + CallFrames frames; + auto it = ctx.emission_context.enqueuers.find(callee); + if (it != ctx.emission_context.enqueuers.end()) + return; + if (no_debug) { // Used in tojlinvoke + frames.push_back({ctx.funcName, "", 0}); + ctx.emission_context.enqueuers.insert({callee, {caller, 
std::move(frames)}}); + return; + } + auto DL = ctx.builder.getCurrentDebugLocation(); + auto filename = std::string(DL->getFilename()); + auto line = DL->getLine(); + auto fname = std::string(DL->getScope()->getSubprogram()->getName()); + frames.push_back({fname, filename, line}); + auto DI = DL.getInlinedAt(); + while (DI) { + auto filename = std::string(DI->getFilename()); + auto line = DI->getLine(); + auto fname = std::string(DI->getScope()->getSubprogram()->getName()); + frames.push_back({fname, filename, line}); + DI = DI->getInlinedAt(); + } + ctx.emission_context.enqueuers.insert({callee, {caller, std::move(frames)}}); +} + +static jl_array_t* build_stack_crumbs(jl_codectx_t &ctx) JL_NOTSAFEPOINT +{ + static intptr_t counter = 5; + jl_method_instance_t *caller = (jl_method_instance_t*)counter; //nothing serves as a sentinel for the bottom for the stack + push_frames(ctx, ctx.linfo, (jl_method_instance_t*)caller); + counter++; + jl_array_t *out = jl_alloc_array_1d(jl_array_any_type, 0); + JL_GC_PUSH1(&out); + while (true) { + auto it = ctx.emission_context.enqueuers.find(caller); + if (it != ctx.emission_context.enqueuers.end()) { + caller = std::get(it->second); + } else { + break; + } + if (caller) { + assert(ctx.emission_context.enqueuers.count(caller) == 1); + if (jl_is_method_instance(caller)) { + //TODO: Use a subrange when C++20 is a thing + for (auto it2 = std::get(it->second).begin(); it2 != (std::prev(std::get(it->second).end())); ++it2) { + auto frame = *it2; + jl_value_t *stackframe = StackFrame(jl_nothing, std::get<0>(frame), std::get<1>(frame), std::get(frame), jl_true); + if (stackframe == nullptr) + print_stack_crumbs(ctx); + jl_array_ptr_1d_push(out, stackframe); + } + auto &frame = std::get(it->second).back(); + jl_value_t *stackframe = StackFrame((jl_value_t *)caller, std::get<0>(frame), std::get<1>(frame), std::get(frame), jl_false); + if (stackframe == nullptr) + print_stack_crumbs(ctx); + jl_array_ptr_1d_push(out, stackframe); + } + } + else + break; + } + JL_GC_POP(); + return out; +} + +static void print_stacktrace(jl_codectx_t &ctx, int trim) +{ + jl_task_t *ct = jl_get_current_task(); + assert(ct); + + // Temporarily operate in the current age + size_t last_age = ct->world_age; + ct->world_age = jl_get_world_counter(); + jl_array_t* bt = build_stack_crumbs(ctx); + JL_GC_PUSH1(&bt); + + // Call `reinit_stdio` to get TTY IO objects (w/ color) + jl_value_t *reinit_stdio = jl_get_global(jl_base_module, jl_symbol("_reinit_stdio")); + assert(reinit_stdio); + jl_apply_generic(reinit_stdio, nullptr, 0); + + // Show the backtrace + jl_value_t *show_backtrace = jl_get_global(jl_base_module, jl_symbol("show_backtrace")); + jl_value_t *base_stderr = jl_get_global(jl_base_module, jl_symbol("stderr")); + assert(show_backtrace && base_stderr); + + JL_TRY { + jl_value_t *args[2] = { base_stderr, (jl_value_t *)bt }; + jl_apply_generic(show_backtrace, args, 2); + } JL_CATCH { + jl_printf(JL_STDERR,"Error showing backtrace\n"); + print_stack_crumbs(ctx); + } + + jl_printf(JL_STDERR, "\n\n"); + JL_GC_POP(); + ct->world_age = last_age; + + if (trim == JL_TRIM_SAFE) { + jl_printf(JL_STDERR,"Aborting compilation due to finding a dynamic dispatch"); + exit(1); + } + return; +} + +static int trim_may_error(int trim) +{ + return (trim == JL_TRIM_SAFE) || (trim == JL_TRIM_UNSAFE_WARN); +} static GlobalVariable *prepare_global_in(Module *M, JuliaVariable *G) { @@ -2153,6 +2342,20 @@ static inline GlobalVariable *prepare_global_in(Module *M, GlobalVariable *G) return cast(local); } 
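// --- editor's note (annotation, not part of the patch) ----------------------
// The helpers defined just below centralize two recurring patterns from this
// patch:
//   * emit_ptrgep: address struct/array fields as a plain byte offset over i8
//     instead of a typed CreateStructGEP / CreateConstInBoundsGEP2_32, which
//     is what the many call-site rewrites above switch to.
//   * emit_static_alloca(nb, align): size the alloca's element type by the
//     alignment rather than byte-by-byte, so SROA splits it into aligned
//     chunks. For example, a 12-byte value with 4-byte alignment becomes
//         %slot = alloca [3 x i32], align 4
//     rather than a [12 x i8] slot that SROA may shred into unaligned pieces.
// A rough standalone sketch of the byte-offset GEP, assuming an
// llvm::IRBuilder<> `B` is in scope (the helper name here is illustrative
// only):
//
//     llvm::Value *byte_gep(llvm::IRBuilder<> &B, llvm::Value *base, unsigned off) {
//         // equivalent address to a typed field GEP at byte offset `off`
//         return B.CreateConstInBoundsGEP1_32(B.getInt8Ty(), base, off);
//     }
//
// -----------------------------------------------------------------------------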
+static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, size_t byte_offset, const Twine &Name="") +{ + auto *gep = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), base, byte_offset); + setName(ctx.emission_context, gep, Name); + return gep; +} + +static Value *emit_ptrgep(jl_codectx_t &ctx, Value *base, Value *byte_offset, const Twine &Name="") +{ + auto *gep = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), base, byte_offset, Name); + setName(ctx.emission_context, gep, Name); + return gep; +} + // --- convenience functions for tagging llvm values with julia types --- @@ -2186,10 +2389,32 @@ static GlobalVariable *get_pointer_to_constant(jl_codegen_params_t &emission_con return gv; } -static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, Type *lty) +static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, Type *lty, Align align) { ++EmittedAllocas; - return new AllocaInst(lty, ctx.topalloca->getModule()->getDataLayout().getAllocaAddrSpace(), "", /*InsertBefore=*/ctx.topalloca); + return new AllocaInst(lty, ctx.topalloca->getModule()->getDataLayout().getAllocaAddrSpace(), nullptr, align, "", /*InsertBefore=*/ctx.topalloca); +} + +static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, unsigned nb, Align align) +{ + // Stupid hack: SROA takes hints from the element type, and will happily split this allocation into lots of unaligned bits + // if it cannot find something better to do, which is terrible for performance. + // However, if we emit this with an element size equal to the alignment, it will instead split it into aligned chunks + // which is great for performance and vectorization. + if (alignTo(nb, align) == align.value()) // don't bother with making an array of length 1 + return emit_static_alloca(ctx, ctx.builder.getIntNTy(align.value() * 8), align); + return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(align.value() * 8), alignTo(nb, align) / align.value()), align); +} + +static AllocaInst *emit_static_roots(jl_codectx_t &ctx, unsigned nroots) +{ + AllocaInst *staticroots = emit_static_alloca(ctx, ctx.types().T_prjlvalue, Align(sizeof(void*))); + staticroots->setOperand(0, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), nroots)); + IRBuilder<> builder(ctx.topalloca); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + // make sure these are nullptr early from LLVM's perspective, in case it decides to SROA it + ai.decorateInst(builder.CreateMemSet(staticroots, builder.getInt8(0), nroots * sizeof(void*), staticroots->getAlign()))->moveAfter(ctx.topalloca); + return staticroots; } static void undef_derived_strct(jl_codectx_t &ctx, Value *ptr, jl_datatype_t *sty, MDNode *tbaa) @@ -2203,7 +2428,7 @@ static void undef_derived_strct(jl_codectx_t &ctx, Value *ptr, jl_datatype_t *st size_t i, np = sty->layout->npointers; auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx.builder.getContext()); for (i = 0; i < np; i++) { - Value *fld = ctx.builder.CreateConstInBoundsGEP1_32(T_prjlvalue, ptr, jl_ptr_offset(sty, i)); + Value *fld = emit_ptrgep(ctx, ptr, jl_ptr_offset(sty, i) * sizeof(jl_value_t*)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); ai.decorateInst(ctx.builder.CreateStore(Constant::getNullValue(T_prjlvalue), fld)); } @@ -2239,7 +2464,7 @@ static inline jl_cgval_t ghostValue(jl_codectx_t &ctx, jl_value_t *typ) if (jl_is_type_type(typ)) { assert(is_uniquerep_Type(typ)); // replace T::Type{T} with T, by assuming that T must be a leaftype of some sort - jl_cgval_t 
constant(NULL, true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + jl_cgval_t constant(NULL, true, typ, NULL, best_tbaa(ctx.tbaa(), typ), None); constant.constant = jl_tparam0(typ); if (typ == (jl_value_t*)jl_typeofbottom_type->super) constant.isghost = true; @@ -2263,16 +2488,16 @@ static inline jl_cgval_t mark_julia_const(jl_codectx_t &ctx, jl_value_t *jv) if (jl_is_datatype_singleton((jl_datatype_t*)typ)) return ghostValue(ctx, typ); } - jl_cgval_t constant(NULL, true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + jl_cgval_t constant(NULL, true, typ, NULL, best_tbaa(ctx.tbaa(), typ), None); constant.constant = jv; return constant; } -static inline jl_cgval_t mark_julia_slot(Value *v, jl_value_t *typ, Value *tindex, MDNode *tbaa) +static inline jl_cgval_t mark_julia_slot(Value *v, jl_value_t *typ, Value *tindex, MDNode *tbaa, ArrayRef inline_roots=None) { // this enables lazy-copying of immutable values and stack or argument slots - jl_cgval_t tagval(v, false, typ, tindex, tbaa); + jl_cgval_t tagval(v, false, typ, tindex, tbaa, inline_roots); return tagval; } @@ -2292,22 +2517,41 @@ static bool valid_as_globalinit(const Value *v) { static Value *zext_struct(jl_codectx_t &ctx, Value *V); +// TODO: in the future, assume all callers will handle the interior pointers separately, and have +// have zext_struct strip them out, so we aren't saving those to the stack here causing shadow stores +// to be necessary too static inline jl_cgval_t value_to_pointer(jl_codectx_t &ctx, Value *v, jl_value_t *typ, Value *tindex) { Value *loc; v = zext_struct(ctx, v); + Align align(julia_alignment(typ)); if (valid_as_globalinit(v)) { // llvm can't handle all the things that could be inside a ConstantExpr assert(jl_is_concrete_type(typ)); // not legal to have an unboxed abstract type - loc = get_pointer_to_constant(ctx.emission_context, cast(v), Align(julia_alignment(typ)), "_j_const", *jl_Module); + loc = get_pointer_to_constant(ctx.emission_context, cast(v), align, "_j_const", *jl_Module); } else { - loc = emit_static_alloca(ctx, v->getType()); - ctx.builder.CreateStore(v, loc); + loc = emit_static_alloca(ctx, v->getType(), align); + ctx.builder.CreateAlignedStore(v, loc, align); } return mark_julia_slot(loc, typ, tindex, ctx.tbaa().tbaa_stack); } static inline jl_cgval_t value_to_pointer(jl_codectx_t &ctx, const jl_cgval_t &v) { + if (!v.inline_roots.empty()) { + //if (v.V == nullptr) { + // AllocaInst *loc = emit_static_roots(ctx, v.inline_roots.size()); + // for (size_t i = 0; i < v.inline_roots.counts(); i++) + // ctx.builder.CreateAlignedStore(v.inline_roots[i], emit_ptrgep(ctx, loc, i * sizeof(void*)), Align(sizeof(void*))); + // return mark_julia_slot(loc, v.typ, v.TIndex, ctx.tbaa().tbaa_gcframe); + //} + Align align(julia_alignment(v.typ)); + Type *ty = julia_type_to_llvm(ctx, v.typ); + AllocaInst *loc = emit_static_alloca(ctx, ty, align); + auto tbaa = v.V == nullptr ? 
ctx.tbaa().tbaa_gcframe : ctx.tbaa().tbaa_stack; + auto stack_ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); + recombine_value(ctx, v, loc, stack_ai, align, false); + return mark_julia_slot(loc, v.typ, v.TIndex, tbaa); + } if (v.ispointer()) return v; return value_to_pointer(ctx, v.V, v.typ, v.TIndex); @@ -2320,7 +2564,8 @@ static inline jl_cgval_t mark_julia_type(jl_codectx_t &ctx, Value *v, bool isbox // replace T::Type{T} with T return ghostValue(ctx, typ); } - } else if (jl_is_datatype(typ) && jl_is_datatype_singleton((jl_datatype_t*)typ)) { + } + else if (jl_is_datatype(typ) && jl_is_datatype_singleton((jl_datatype_t*)typ)) { // no need to explicitly load/store a constant/ghost value return ghostValue(ctx, typ); } @@ -2328,13 +2573,14 @@ static inline jl_cgval_t mark_julia_type(jl_codectx_t &ctx, Value *v, bool isbox if (type_is_ghost(T)) { return ghostValue(ctx, typ); } - if (v && !isboxed && v->getType()->isAggregateType() && CountTrackedPointers(v->getType()).count == 0) { + if (v && !isboxed && v->getType()->isAggregateType()) { // eagerly put this back onto the stack // llvm mem2reg pass will remove this if unneeded - return value_to_pointer(ctx, v, typ, NULL); + if (CountTrackedPointers(v->getType()).count == 0) + return value_to_pointer(ctx, v, typ, NULL); } if (isboxed) - return jl_cgval_t(v, isboxed, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + return jl_cgval_t(v, isboxed, typ, NULL, best_tbaa(ctx.tbaa(), typ), None); return jl_cgval_t(v, typ, NULL); } @@ -2369,7 +2615,7 @@ static inline jl_cgval_t update_julia_type(jl_codectx_t &ctx, const jl_cgval_t & if (alwaysboxed) { // discovered that this union-split type must actually be isboxed if (v.Vboxed) { - return jl_cgval_t(v.Vboxed, true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + return jl_cgval_t(v.Vboxed, true, typ, NULL, best_tbaa(ctx.tbaa(), typ), v.inline_roots); } else { // type mismatch (there weren't any boxed values in the union) @@ -2412,7 +2658,7 @@ static void alloc_def_flag(jl_codectx_t &ctx, jl_varinfo_t& vi) { assert((!vi.boxroot || vi.pTIndex) && "undef check is null pointer for boxed things"); if (vi.usedUndef) { - vi.defFlag = emit_static_alloca(ctx, getInt1Ty(ctx.builder.getContext())); + vi.defFlag = emit_static_alloca(ctx, getInt1Ty(ctx.builder.getContext()), Align(1)); setName(ctx.emission_context, vi.defFlag, "isdefined"); store_def_flag(ctx, vi, false); } @@ -2598,14 +2844,14 @@ static jl_cgval_t convert_julia_type_union(jl_codectx_t &ctx, const jl_cgval_t & decay_derived(ctx, boxv), decay_derived(ctx, slotv)); } - jl_cgval_t newv = jl_cgval_t(slotv, false, typ, new_tindex, tbaa); + jl_cgval_t newv = jl_cgval_t(slotv, false, typ, new_tindex, tbaa, v.inline_roots); assert(boxv->getType() == ctx.types().T_prjlvalue); newv.Vboxed = boxv; return newv; } } else { - return jl_cgval_t(boxed(ctx, v), true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + return jl_cgval_t(boxed(ctx, v), true, typ, NULL, best_tbaa(ctx.tbaa(), typ), None); } return jl_cgval_t(v, typ, new_tindex); } @@ -2636,7 +2882,7 @@ static jl_cgval_t convert_julia_type(jl_codectx_t &ctx, const jl_cgval_t &v, jl_ if (skip) { *skip = ctx.builder.CreateNot(emit_exactly_isa(ctx, v, (jl_datatype_t*)typ, true)); } - return jl_cgval_t(v.Vboxed, true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + return jl_cgval_t(v.Vboxed, true, typ, NULL, best_tbaa(ctx.tbaa(), typ), v.inline_roots); } if (mustbox_union) { // type mismatch: there weren't any boxed values in the union @@ -2658,7 +2904,7 @@ static jl_cgval_t convert_julia_type(jl_codectx_t &ctx, const jl_cgval_t &v, 
jl_ unsigned new_idx = get_box_tindex((jl_datatype_t*)v.typ, typ); if (new_idx) { new_tindex = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), new_idx); - if (v.V && !v.ispointer()) { + if (v.V && v.inline_roots.empty() && !v.ispointer()) { // TODO: remove this branch once all consumers of v.TIndex understand how to handle a non-ispointer value return jl_cgval_t(value_to_pointer(ctx, v), typ, new_tindex); } @@ -2682,7 +2928,7 @@ static jl_cgval_t convert_julia_type(jl_codectx_t &ctx, const jl_cgval_t &v, jl_ } if (makeboxed) { // convert to a simple isboxed value - return jl_cgval_t(boxed(ctx, v), true, typ, NULL, best_tbaa(ctx.tbaa(), typ)); + return mark_julia_type(ctx, boxed(ctx, v), true, typ); } } return jl_cgval_t(v, typ, new_tindex); @@ -3461,25 +3707,26 @@ static size_t emit_masked_bits_compare(callback &emit_desc, jl_datatype_t *aty, static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t arg2) { ++EmittedBitsCompares; + jl_value_t *argty = (arg1.constant ? jl_typeof(arg1.constant) : arg1.typ); bool isboxed; Type *at = julia_type_to_llvm(ctx, arg1.typ, &isboxed); - assert(jl_is_datatype(arg1.typ) && arg1.typ == arg2.typ && !isboxed); + assert(jl_is_datatype(arg1.typ) && arg1.typ == (arg2.constant ? jl_typeof(arg2.constant) : arg2.typ) && !isboxed); if (type_is_ghost(at)) return ConstantInt::get(getInt1Ty(ctx.builder.getContext()), 1); if (at->isIntegerTy() || at->isPointerTy() || at->isFloatingPointTy()) { Type *at_int = INTT(at, ctx.emission_context.DL); - Value *varg1 = emit_unbox(ctx, at_int, arg1, arg1.typ); - Value *varg2 = emit_unbox(ctx, at_int, arg2, arg2.typ); + Value *varg1 = emit_unbox(ctx, at_int, arg1, argty); + Value *varg2 = emit_unbox(ctx, at_int, arg2, argty); return ctx.builder.CreateICmpEQ(varg1, varg2); } if (at->isVectorTy()) { - jl_svec_t *types = ((jl_datatype_t*)arg1.typ)->types; + jl_svec_t *types = ((jl_datatype_t*)argty)->types; Value *answer = ConstantInt::get(getInt1Ty(ctx.builder.getContext()), 1); - Value *varg1 = emit_unbox(ctx, at, arg1, arg1.typ); - Value *varg2 = emit_unbox(ctx, at, arg2, arg2.typ); + Value *varg1 = emit_unbox(ctx, at, arg1, argty); + Value *varg2 = emit_unbox(ctx, at, arg2, argty); for (size_t i = 0, l = jl_svec_len(types); i < l; i++) { jl_value_t *fldty = jl_svecref(types, i); Value *subAns, *fld1, *fld2; @@ -3494,12 +3741,12 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a } if (at->isAggregateType()) { // Struct or Array - jl_datatype_t *sty = (jl_datatype_t*)arg1.typ; + jl_datatype_t *sty = (jl_datatype_t*)argty; size_t sz = jl_datatype_size(sty); if (sz > 512 && !sty->layout->flags.haspadding && sty->layout->flags.isbitsegal) { - Value *varg1 = arg1.ispointer() ? data_pointer(ctx, arg1) : + Value *varg1 = arg1.inline_roots.empty() && arg1.ispointer() ? data_pointer(ctx, arg1) : value_to_pointer(ctx, arg1).V; - Value *varg2 = arg2.ispointer() ? data_pointer(ctx, arg2) : + Value *varg2 = arg2.inline_roots.empty() && arg2.ispointer() ? 
data_pointer(ctx, arg2) : value_to_pointer(ctx, arg2).V; varg1 = emit_pointer_from_objref(ctx, varg1); varg2 = emit_pointer_from_objref(ctx, varg2); @@ -3534,11 +3781,9 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a return ctx.builder.CreateICmpEQ(answer, ConstantInt::get(getInt32Ty(ctx.builder.getContext()), 0)); } else if (sz > 512 && jl_struct_try_layout(sty) && sty->layout->flags.isbitsegal) { - Type *TInt8 = getInt8Ty(ctx.builder.getContext()); - Type *TInt1 = getInt1Ty(ctx.builder.getContext()); - Value *varg1 = arg1.ispointer() ? data_pointer(ctx, arg1) : + Value *varg1 = arg1.inline_roots.empty() && arg1.ispointer() ? data_pointer(ctx, arg1) : value_to_pointer(ctx, arg1).V; - Value *varg2 = arg2.ispointer() ? data_pointer(ctx, arg2) : + Value *varg2 = arg2.inline_roots.empty() && arg2.ispointer() ? data_pointer(ctx, arg2) : value_to_pointer(ctx, arg2).V; varg1 = emit_pointer_from_objref(ctx, varg1); varg2 = emit_pointer_from_objref(ctx, varg2); @@ -3554,8 +3799,8 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a Value *ptr1 = varg1; Value *ptr2 = varg2; if (desc.offset != 0) { - ptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr1, desc.offset); - ptr2 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr2, desc.offset); + ptr1 = emit_ptrgep(ctx, ptr1, desc.offset); + ptr2 = emit_ptrgep(ctx, ptr2, desc.offset); } Value *new_ptr1 = ptr1; @@ -3565,7 +3810,7 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a PHINode *answerphi = nullptr; if (desc.nrepeats != 1) { // Set up loop - endptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, ptr1, desc.nrepeats * (desc.data_bytes + desc.padding_bytes));; + endptr1 = emit_ptrgep(ctx, ptr1, desc.nrepeats * (desc.data_bytes + desc.padding_bytes));; BasicBlock *currBB = ctx.builder.GetInsertBlock(); loopBB = BasicBlock::Create(ctx.builder.getContext(), "egal_loop", ctx.f); @@ -3573,6 +3818,7 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a ctx.builder.CreateBr(loopBB); ctx.builder.SetInsertPoint(loopBB); + Type *TInt1 = getInt1Ty(ctx.builder.getContext()); answerphi = ctx.builder.CreatePHI(TInt1, 2); answerphi->addIncoming(answer ? 
answer : ConstantInt::get(TInt1, 1), currBB); answer = answerphi; @@ -3580,11 +3826,11 @@ static Value *emit_bits_compare(jl_codectx_t &ctx, jl_cgval_t arg1, jl_cgval_t a PHINode *itr1 = ctx.builder.CreatePHI(ptr1->getType(), 2); PHINode *itr2 = ctx.builder.CreatePHI(ptr2->getType(), 2); - new_ptr1 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, itr1, desc.data_bytes + desc.padding_bytes); + new_ptr1 = emit_ptrgep(ctx, itr1, desc.data_bytes + desc.padding_bytes); itr1->addIncoming(ptr1, currBB); itr1->addIncoming(new_ptr1, loopBB); - Value *new_ptr2 = ctx.builder.CreateConstInBoundsGEP1_32(TInt8, itr2, desc.data_bytes + desc.padding_bytes); + Value *new_ptr2 = emit_ptrgep(ctx, itr2, desc.data_bytes + desc.padding_bytes); itr2->addIncoming(ptr2, currBB); itr2->addIncoming(new_ptr2, loopBB); @@ -3699,8 +3945,10 @@ static Value *emit_f_is(jl_codectx_t &ctx, const jl_cgval_t &arg1, const jl_cgva if (jl_type_intersection(rt1, rt2) == (jl_value_t*)jl_bottom_type) // types are disjoint (exhaustive test) return ConstantInt::get(getInt1Ty(ctx.builder.getContext()), 0); - bool justbits1 = jl_is_concrete_immutable(rt1); - bool justbits2 = jl_is_concrete_immutable(rt2); + // can compare any concrete immutable by bits, except for UnionAll + // which has a special non-bits based egal + bool justbits1 = jl_is_concrete_immutable(rt1) && !jl_is_kind(rt1); + bool justbits2 = jl_is_concrete_immutable(rt2) && !jl_is_kind(rt2); if (justbits1 || justbits2) { // whether this type is unique'd by value return emit_nullcheck_guard2(ctx, nullcheck1, nullcheck2, [&] () -> Value* { jl_datatype_t *typ = (jl_datatype_t*)(justbits1 ? rt1 : rt2); @@ -4066,7 +4314,7 @@ static bool emit_f_opmemory(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, ptindex = ctx.builder.CreateInBoundsGEP(AT, data, mlen); data = ctx.builder.CreateInBoundsGEP(AT, data, idx0); } - ptindex = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), ptindex, idx0); + ptindex = emit_ptrgep(ctx, ptindex, idx0); *ret = union_store(ctx, data, ptindex, val, cmp, ety, ctx.tbaa().tbaa_arraybuf, ctx.tbaa().tbaa_arrayselbyte, Order, FailOrder, @@ -4081,7 +4329,7 @@ static bool emit_f_opmemory(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, assert(ptr); lock = ptr; // ptr += sizeof(lock); - ptr = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), ptr, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); + ptr = emit_ptrgep(ctx, ptr, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); } Value *data_owner = NULL; // owner object against which the write barrier must check if (isboxed || layout->first_ptr >= 0) { // if elements are just bits, don't need a write barrier @@ -4196,9 +4444,15 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, #ifdef _P64 nva = ctx.builder.CreateTrunc(nva, getInt32Ty(ctx.builder.getContext())); #endif - Value *theArgs = ctx.builder.CreateInBoundsGEP(ctx.types().T_prjlvalue, ctx.argArray, ConstantInt::get(ctx.types().T_size, ctx.nReqArgs)); + Value *theArgs = emit_ptrgep(ctx, ctx.argArray, ctx.nReqArgs * sizeof(jl_value_t*)); Value *r = ctx.builder.CreateCall(prepare_call(jlapplygeneric_func), { theF, theArgs, nva }); *ret = mark_julia_type(ctx, r, true, jl_any_type); + if (trim_may_error(ctx.params->trim)) { + // if we know the return type, we can assume the result is of that type + errs() << "ERROR: Dynamic call to Core._apply_iterate detected\n"; + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << 
ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } return true; } } @@ -4346,7 +4600,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, ptindex = ctx.builder.CreateInBoundsGEP(AT, data, mlen); data = ctx.builder.CreateInBoundsGEP(AT, data, idx0); } - ptindex = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), ptindex, idx0); + ptindex = emit_ptrgep(ctx, ptindex, idx0); size_t elsz_c = 0, al_c = 0; int union_max = jl_islayout_inline(ety, &elsz_c, &al_c); assert(union_max && LLT_ALIGN(elsz_c, al_c) == elsz && al_c == al); @@ -4359,7 +4613,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, assert(ptr); lock = ptr; // ptr += sizeof(lock); - ptr = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), ptr, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); + ptr = emit_ptrgep(ctx, ptr, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); emit_lockstate_value(ctx, lock, true); } *ret = typed_load(ctx, ptr, nullptr, ety, @@ -4450,10 +4704,10 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, if (needlock) { // n.b. no actual lock acquire needed, as the check itself only needs to load a single pointer and check for null // elem += sizeof(lock); - elem = ctx.builder.CreateConstInBoundsGEP1_32(getInt8Ty(ctx.builder.getContext()), elem, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); + elem = emit_ptrgep(ctx, elem, LLT_ALIGN(sizeof(jl_mutex_t), JL_SMALL_BYTE_ALIGNMENT)); } if (!isboxed) - elem = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, elem, layout->first_ptr); + elem = emit_ptrgep(ctx, elem, layout->first_ptr * sizeof(void*)); // emit this using the same type as jl_builtin_memoryrefget // so that LLVM may be able to load-load forward them and fold the result auto tbaa = isboxed ? ctx.tbaa().tbaa_ptrarraybuf : ctx.tbaa().tbaa_arraybuf; @@ -4541,7 +4795,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, if (load->getPointerOperand() == ctx.slots[ctx.vaSlot].boxroot && ctx.argArray) { Value *valen = emit_n_varargs(ctx); jl_cgval_t va_ary( // fake instantiation of a cgval, in order to call emit_bounds_check (it only checks the `.V` field) - ctx.builder.CreateInBoundsGEP(ctx.types().T_prjlvalue, ctx.argArray, ConstantInt::get(ctx.types().T_size, ctx.nReqArgs)), + emit_ptrgep(ctx, ctx.argArray, ctx.nReqArgs * sizeof(jl_value_t*)), NULL, NULL); Value *idx = emit_unbox(ctx, ctx.types().T_size, fld, (jl_value_t*)jl_long_type); idx = emit_bounds_check(ctx, va_ary, NULL, idx, valen, boundscheck); @@ -4582,34 +4836,33 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, // For tuples, we can emit code even if we don't know the exact // type (e.g. because we don't know the length). This is possible // as long as we know that all elements are of the same (leaf) type. 
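
Aside: the new emit_ptrgep helpers introduced above lower field and element access to an inbounds GEP over i8, i.e. plain byte addressing, in place of the element-typed CreateConstInBoundsGEP1_32 calls they replace; callers now pass byte offsets such as jl_ptr_offset(sty, i) * sizeof(jl_value_t*). A minimal host-side sketch of the equivalent arithmetic (byte_gep and fake_obj are illustrative stand-ins, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // byte_gep models what emit_ptrgep emits: base + byte_offset over i8,
    // independent of the pointee's element type.
    static void *byte_gep(void *base, std::size_t byte_offset) {
        return static_cast<std::uint8_t *>(base) + byte_offset;
    }

    struct fake_obj { void *roots[4]; };  // stand-in for a pointer-carrying struct

    int main() {
        fake_obj o{};
        // analogous to emit_ptrgep(ctx, ptr, jl_ptr_offset(sty, i) * sizeof(jl_value_t*))
        void **slot = static_cast<void **>(byte_gep(&o, 2 * sizeof(void *)));
        *slot = nullptr;
        return 0;
    }
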
- if (obj.ispointer()) { - if (order != jl_memory_order_notatomic && order != jl_memory_order_unspecified) { - emit_atomic_error(ctx, "getfield: non-atomic field cannot be accessed atomically"); - *ret = jl_cgval_t(); // unreachable - return true; - } - // Determine which was the type that was homogeneous - jl_value_t *jt = jl_tparam0(utt); - if (jl_is_vararg(jt)) - jt = jl_unwrap_vararg(jt); - assert(jl_is_datatype(jt)); - // This is not necessary for correctness, but allows to omit - // the extra code for getting the length of the tuple - if (!bounds_check_enabled(ctx, boundscheck)) { - vidx = ctx.builder.CreateSub(vidx, ConstantInt::get(ctx.types().T_size, 1)); - } - else { - vidx = emit_bounds_check(ctx, obj, (jl_value_t*)obj.typ, vidx, - emit_datatype_nfields(ctx, emit_typeof(ctx, obj, false, false)), - jl_true); - } - bool isboxed = !jl_datatype_isinlinealloc((jl_datatype_t*)jt, 0); - Value *ptr = data_pointer(ctx, obj); - *ret = typed_load(ctx, ptr, vidx, - isboxed ? (jl_value_t*)jl_any_type : jt, - obj.tbaa, nullptr, isboxed, AtomicOrdering::NotAtomic, false); + jl_cgval_t ptrobj = obj.isboxed ? obj : value_to_pointer(ctx, obj); + if (order != jl_memory_order_notatomic && order != jl_memory_order_unspecified) { + emit_atomic_error(ctx, "getfield: non-atomic field cannot be accessed atomically"); + *ret = jl_cgval_t(); // unreachable return true; } + // Determine which was the type that was homogeneous + jl_value_t *jt = jl_tparam0(utt); + if (jl_is_vararg(jt)) + jt = jl_unwrap_vararg(jt); + assert(jl_is_datatype(jt)); + // This is not necessary for correctness, but allows to omit + // the extra code for getting the length of the tuple + if (!bounds_check_enabled(ctx, boundscheck)) { + vidx = ctx.builder.CreateSub(vidx, ConstantInt::get(ctx.types().T_size, 1)); + } + else { + vidx = emit_bounds_check(ctx, ptrobj, (jl_value_t*)ptrobj.typ, vidx, + emit_datatype_nfields(ctx, emit_typeof(ctx, ptrobj, false, false)), + jl_true); + } + bool isboxed = !jl_datatype_isinlinealloc((jl_datatype_t*)jt, 0); + Value *ptr = data_pointer(ctx, ptrobj); + *ret = typed_load(ctx, ptr, vidx, + isboxed ? 
(jl_value_t*)jl_any_type : jt, + ptrobj.tbaa, nullptr, isboxed, AtomicOrdering::NotAtomic, false); + return true; } // Unknown object, but field known to be integer @@ -4886,12 +5139,17 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, else if (jl_field_isptr(stt, fieldidx) || jl_type_hasptr(jl_field_type(stt, fieldidx))) { Value *fldv; size_t offs = jl_field_offset(stt, fieldidx) / sizeof(jl_value_t*); - if (obj.ispointer()) { + if (!obj.inline_roots.empty()) { + auto offsets = split_value_field(stt, fieldidx); + assert(offsets.second >= 0); + fldv = obj.inline_roots[offsets.second]; + } + else if (obj.ispointer()) { auto tbaa = best_field_tbaa(ctx, obj, stt, fieldidx, offs); if (!jl_field_isptr(stt, fieldidx)) offs += ((jl_datatype_t*)jl_field_type(stt, fieldidx))->layout->first_ptr; Value *ptr = data_pointer(ctx, obj); - Value *addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, ptr, offs); + Value *addr = emit_ptrgep(ctx, ptr, offs * sizeof(jl_value_t*)); // emit this using the same type as emit_getfield_knownidx // so that LLVM may be able to load-load forward them and fold the result jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); @@ -4919,6 +5177,14 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, return true; } + else if (f == jl_builtin_current_scope && (nargs == 0)) { + jl_aliasinfo_t scope_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + Instruction *v = scope_ai.decorateInst( + ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, get_scope_field(ctx), ctx.types().alignof_ptr)); + *ret = mark_julia_type(ctx, v, /*boxed*/ true, rt); + return true; + } + else if (f == jl_builtin_donotdelete) { // For now we emit this as a vararg call to the builtin // (which doesn't look at the arguments). 
In the future, @@ -4986,46 +5252,29 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, JuliaFunction<> *theFptr, Value return emit_jlcall(ctx, prepare_call(theFptr), theF, argv, nargs, trampoline); } -static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_closure, jl_value_t *specTypes, jl_value_t *jlretty, llvm::Value *callee, StringRef specFunctionObject, jl_code_instance_t *fromexternal, - ArrayRef argv, size_t nargs, jl_returninfo_t::CallingConv *cc, unsigned *return_roots, jl_value_t *inferred_retty) +static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_closure, jl_value_t *specTypes, jl_value_t *jlretty, jl_returninfo_t &returninfo, jl_code_instance_t *fromexternal, + ArrayRef argv, size_t nargs) { ++EmittedSpecfunCalls; // emit specialized call site bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); - jl_returninfo_t returninfo = get_specsig_function(ctx, jl_Module, callee, specFunctionObject, specTypes, jlretty, is_opaque_closure, gcstack_arg); FunctionType *cft = returninfo.decl.getFunctionType(); - *cc = returninfo.cc; - *return_roots = returninfo.return_roots; - size_t nfargs = cft->getNumParams(); SmallVector argvals(nfargs); unsigned idx = 0; AllocaInst *result = nullptr; - switch (returninfo.cc) { - case jl_returninfo_t::Boxed: - case jl_returninfo_t::Register: - case jl_returninfo_t::Ghosts: - break; - case jl_returninfo_t::SRet: - result = emit_static_alloca(ctx, getAttributeAtIndex(returninfo.attrs, 1, Attribute::StructRet).getValueAsType()); - #if JL_LLVM_VERSION < 170000 - assert(cast(result->getType())->hasSameElementTypeAs(cast(cft->getParamType(0)))); - #endif - argvals[idx] = result; - idx++; - break; - case jl_returninfo_t::Union: - result = emit_static_alloca(ctx, ArrayType::get(getInt8Ty(ctx.builder.getContext()), returninfo.union_bytes)); + + if (returninfo.cc == jl_returninfo_t::SRet || returninfo.cc == jl_returninfo_t::Union) { + result = emit_static_alloca(ctx, returninfo.union_bytes, Align(returninfo.union_align)); setName(ctx.emission_context, result, "sret_box"); - if (returninfo.union_align > 1) - result->setAlignment(Align(returninfo.union_align)); argvals[idx] = result; idx++; - break; } + AllocaInst *return_roots = nullptr; if (returninfo.return_roots) { - AllocaInst *return_roots = emit_static_alloca(ctx, ArrayType::get(ctx.types().T_prjlvalue, returninfo.return_roots)); + assert(returninfo.cc == jl_returninfo_t::SRet); + return_roots = emit_static_roots(ctx, returninfo.return_roots); argvals[idx] = return_roots; idx++; } @@ -5036,18 +5285,30 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos for (size_t i = 0; i < nargs; i++) { jl_value_t *jt = jl_nth_slot_type(specTypes, i); // n.b.: specTypes is required to be a datatype by construction for specsig - jl_cgval_t arg = argv[i]; if (is_opaque_closure && i == 0) { - // Special optimization for opaque closures: We know that specsig opaque - // closures don't look at their type tag (they are fairly quickly discarded - // for their environments). Therefore, we can just pass these as a pointer, - // rather than a boxed value. 
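
Aside: the reworked specsig call path above passes aggregates that contain interior GC references in split form: the tracked pointers go into a dedicated roots alloca (emit_static_roots) while the remaining payload stays in a plain stack slot, and jl_cgval_t carries those pointers in its new inline_roots field. A rough host-side model of that representation, assuming a two-field struct for illustration (SplitPair and its helpers are made up here, not the patch's split_value/recombine_value API):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Boxed;                         // stand-in for a GC-managed jl_value_t
    struct Pair { Boxed *first; std::int64_t second; };

    struct SplitPair {
        std::vector<Boxed *> roots;       // tracked pointers, kept visible to the GC
        std::vector<std::uint8_t> data;   // untracked payload bytes
    };

    static SplitPair split(const Pair &p) {
        SplitPair s;
        s.roots.push_back(p.first);
        s.data.resize(sizeof p.second);
        std::memcpy(s.data.data(), &p.second, sizeof p.second);
        return s;
    }

    static Pair recombine(const SplitPair &s) {
        Pair p;
        p.first = s.roots.at(0);
        std::memcpy(&p.second, s.data.data(), sizeof p.second);
        return p;
    }

    int main() {
        Pair p{nullptr, 42};
        return recombine(split(p)).second == 42 ? 0 : 1;
    }
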
- arg = value_to_pointer(ctx, arg); - argvals[idx] = decay_derived(ctx, data_pointer(ctx, arg)); - } - else if (is_uniquerep_Type(jt)) { + // Special implementation for opaque closures: their jt and thus + // julia_type_to_llvm values are likely wrong, so override the + // behavior here to directly pass the expected pointer based instead + // just on passing arg as a pointer + jl_cgval_t arg = argv[i]; + if (arg.isghost) { + argvals[idx] = Constant::getNullValue(ctx.builder.getPtrTy(AddressSpace::Derived)); + } + else { + if (!arg.isboxed) + arg = value_to_pointer(ctx, arg); + argvals[idx] = decay_derived(ctx, data_pointer(ctx, arg)); + } + idx++; continue; - } else { + } + jl_cgval_t arg = update_julia_type(ctx, argv[i], jt); + if (arg.typ == jl_bottom_type) + return jl_cgval_t(); + if (is_uniquerep_Type(jt)) { + continue; + } + else { bool isboxed = deserves_argbox(jt); Type *et = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jt); if (type_is_ghost(et)) @@ -5057,9 +5318,24 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos argvals[idx] = boxed(ctx, arg); } else if (et->isAggregateType()) { - arg = value_to_pointer(ctx, arg); - // can lazy load on demand, no copy needed - argvals[idx] = decay_derived(ctx, data_pointer(ctx, arg)); + auto tracked = CountTrackedPointers(et); + if (tracked.count && !tracked.all) { + Value *val = arg.V; + SmallVector roots(arg.inline_roots); + if (roots.empty()) + std::tie(val, roots) = split_value(ctx, arg, Align(julia_alignment(jt))); + AllocaInst *proots = emit_static_roots(ctx, roots.size()); + for (size_t i = 0; i < roots.size(); i++) + ctx.builder.CreateAlignedStore(roots[i], emit_ptrgep(ctx, proots, i * sizeof(void*)), Align(sizeof(void*))); + assert(val); + argvals[idx++] = decay_derived(ctx, val); + argvals[idx] = proots; + } + else { + if (!arg.isboxed) + arg = value_to_pointer(ctx, arg); + argvals[idx] = decay_derived(ctx, data_pointer(ctx, arg)); + } } else { Value *val = emit_unbox(ctx, et, arg, jt); @@ -5105,7 +5381,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos break; case jl_returninfo_t::SRet: assert(result); - retval = mark_julia_slot(result, jlretty, NULL, ctx.tbaa().tbaa_stack); + retval = mark_julia_slot(result, jlretty, NULL, ctx.tbaa().tbaa_gcframe, load_gc_roots(ctx, return_roots, returninfo.return_roots)); break; case jl_returninfo_t::Union: { Value *box = ctx.builder.CreateExtractValue(call, 0); @@ -5114,7 +5390,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos ctx.builder.CreateICmpEQ( ctx.builder.CreateAnd(tindex, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), UNION_BOX_MARKER)), ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0)), - decay_derived(ctx, argvals[0]), + decay_derived(ctx, result), decay_derived(ctx, box) ); retval = mark_julia_slot(derived, @@ -5128,6 +5404,19 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos retval = mark_julia_slot(NULL, jlretty, call, ctx.tbaa().tbaa_stack); break; } + return retval; +} + +static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_closure, jl_value_t *specTypes, jl_value_t *jlretty, llvm::Value *callee, StringRef specFunctionObject, jl_code_instance_t *fromexternal, + ArrayRef argv, size_t nargs, jl_returninfo_t::CallingConv *cc, unsigned *nreturn_roots, jl_value_t *inferred_retty) +{ + ++EmittedSpecfunCalls; + // emit specialized call site + bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); + 
jl_returninfo_t returninfo = get_specsig_function(ctx, jl_Module, callee, specFunctionObject, specTypes, jlretty, is_opaque_closure, gcstack_arg); + *cc = returninfo.cc; + *nreturn_roots = returninfo.return_roots; + jl_cgval_t retval = emit_call_specfun_other(ctx, is_opaque_closure, specTypes, jlretty, returninfo, fromexternal, argv, nargs); // see if inference has a different / better type for the call than the lambda return update_julia_type(ctx, retval, inferred_retty); } @@ -5271,12 +5560,25 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR if (need_to_emit) { Function *trampoline_decl = cast(jl_Module->getNamedValue(protoname)); ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, specsig}; + if (trim_may_error(ctx.params->trim)) + push_frames(ctx, ctx.linfo, mi); } } } } } if (!handled) { + if (trim_may_error(ctx.params->trim)) { + if (lival.constant) { + arraylist_push(&new_invokes, lival.constant); + push_frames(ctx, ctx.linfo, (jl_method_instance_t*)lival.constant); + } else { + errs() << "Dynamic call to unknown function"; + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + + print_stacktrace(ctx, ctx.params->trim); + } + } Value *r = emit_jlcall(ctx, jlinvoke_func, boxed(ctx, lival), argv, nargs, julia_call2); result = mark_julia_type(ctx, r, true, rt); } @@ -5336,7 +5638,12 @@ static jl_cgval_t emit_invoke_modify(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_ return mark_julia_type(ctx, oldnew, true, rt); } } - + if (trim_may_error(ctx.params->trim)) { + errs() << "ERROR: dynamic invoke modify call to"; + jl_(args[0]); + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } // emit function and arguments Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, nargs, julia_call); return mark_julia_type(ctx, callval, true, rt); @@ -5406,10 +5713,15 @@ static jl_cgval_t emit_call(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt, bo bool handled = emit_builtin_call(ctx, &result, f.constant, argv, nargs - 1, rt, ex, is_promotable); if (handled) return result; - + jl_fptr_args_t builtin_fptr = jl_get_builtin_fptr((jl_datatype_t*)jl_typeof(f.constant)); // special case for some known builtin not handled by emit_builtin_call - auto it = builtin_func_map().find(jl_get_builtin_fptr((jl_datatype_t*)jl_typeof(f.constant))); + auto it = builtin_func_map().find(builtin_fptr); if (it != builtin_func_map().end()) { + if (trim_may_error(ctx.params->trim) && may_dispatch_builtins().count(builtin_fptr)) { + errs() << "ERROR: Dynamic call to builtin" << jl_symbol_name(((jl_datatype_t*)jl_typeof(f.constant))->name->name); + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } Value *ret = emit_jlcall(ctx, it->second, Constant::getNullValue(ctx.types().T_prjlvalue), ArrayRef(argv).drop_front(), nargs - 1, julia_call); setName(ctx.emission_context, ret, it->second->name + "_ret"); return mark_julia_type(ctx, ret, true, rt); @@ -5420,7 +5732,7 @@ static jl_cgval_t emit_call(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt, bo JuliaFunction<> *cc; if (f.typ == (jl_value_t*)jl_intrinsic_type) { fptr = prepare_call(jlintrinsic_func); - F = f.ispointer() ? 
data_pointer(ctx, f) : value_to_pointer(ctx, f).V; + F = f.inline_roots.empty() && f.ispointer() ? data_pointer(ctx, f) : value_to_pointer(ctx, f).V; F = decay_derived(ctx, F); cc = julia_call3; } @@ -5448,7 +5760,79 @@ static jl_cgval_t emit_call(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt, bo } } } + int failed_dispatch = !argv[0].constant; + if (ctx.params->trim != JL_TRIM_NO) { + size_t min_valid = 1; + size_t max_valid = ~(size_t)0; + size_t latest_world = jl_get_world_counter(); // TODO: marshal the world age of the compilation here. + + // Find all methods matching the call signature + jl_array_t *matches = NULL; + jl_value_t *tup = NULL; + JL_GC_PUSH2(&tup, &matches); + if (!failed_dispatch) { + SmallVector argtypes; + for (auto& arg: argv) + argtypes.push_back(arg.typ); + tup = jl_apply_tuple_type_v(argtypes.data(), argtypes.size()); + matches = (jl_array_t*)jl_matching_methods((jl_tupletype_t*)tup, jl_nothing, 10 /*TODO: make global*/, 1, + latest_world, &min_valid, &max_valid, NULL); + if ((jl_value_t*)matches == jl_nothing) + failed_dispatch = 1; + } + + // Expand each matching method to its unique specialization, if it has exactly one + if (!failed_dispatch) { + size_t k; + size_t len = new_invokes.len; + for (k = 0; k < jl_array_nrows(matches); k++) { + jl_method_match_t *match = (jl_method_match_t *)jl_array_ptr_ref(matches, k); + jl_method_instance_t *mi = jl_method_match_to_mi(match, latest_world, min_valid, max_valid, 0); + if (!mi) { + if (jl_array_nrows(matches) == 1) { + // if the method match is not compileable, but there is only one, fall back to + // unspecialized implementation + mi = jl_get_unspecialized(match->method); + } + else { + new_invokes.len = len; + failed_dispatch = 1; + break; + } + } + arraylist_push(&new_invokes, mi); + } + } + JL_GC_POP(); + } + if (failed_dispatch && trim_may_error(ctx.params->trim)) { + errs() << "Dynamic call to "; + jl_jmp_buf *old_buf = jl_get_safe_restore(); + jl_jmp_buf buf; + jl_set_safe_restore(&buf); + if (!jl_setjmp(buf, 0)) { + jl_static_show((JL_STREAM*)STDERR_FILENO, (jl_value_t*)args[0]); + jl_printf((JL_STREAM*)STDERR_FILENO,"("); + for (size_t i = 1; i < nargs; ++i) { + jl_value_t *typ = argv[i].typ; + if (!jl_is_concrete_type(typ)) // Print type in red + jl_printf((JL_STREAM*)STDERR_FILENO, "\x1b[31m"); + jl_static_show((JL_STREAM*)STDERR_FILENO, (jl_value_t*)argv[i].typ); + if (!jl_is_concrete_type(typ)) + jl_printf((JL_STREAM*)STDERR_FILENO, "\x1b[0m"); + if (i != nargs-1) + jl_printf((JL_STREAM*)STDERR_FILENO,", "); + } + jl_printf((JL_STREAM*)STDERR_FILENO,")\n"); + } + else { + jl_printf((JL_STREAM*)STDERR_FILENO, "\n!!! 
ERROR while printing error -- ABORTING !!!\n"); + } + jl_set_safe_restore(old_buf); + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } // emit function and arguments Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, n_generic_args, julia_call); return mark_julia_type(ctx, callval, true, rt); @@ -5575,10 +5959,7 @@ static jl_cgval_t emit_sparam(jl_codectx_t &ctx, size_t i) } } assert(ctx.spvals_ptr != NULL); - Value *bp = ctx.builder.CreateConstInBoundsGEP1_32( - ctx.types().T_prjlvalue, - ctx.spvals_ptr, - i + sizeof(jl_svec_t) / sizeof(jl_value_t*)); + Value *bp = emit_ptrgep(ctx, ctx.spvals_ptr, i * sizeof(jl_value_t*) + sizeof(jl_svec_t)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); Value *sp = ai.decorateInst(ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, bp, Align(sizeof(void*)))); setName(ctx.emission_context, sp, "sparam"); @@ -5631,10 +6012,7 @@ static jl_cgval_t emit_isdefined(jl_codectx_t &ctx, jl_value_t *sym, int allow_i } } assert(ctx.spvals_ptr != NULL); - Value *bp = ctx.builder.CreateConstInBoundsGEP1_32( - ctx.types().T_prjlvalue, - ctx.spvals_ptr, - i + sizeof(jl_svec_t) / sizeof(jl_value_t*)); + Value *bp = emit_ptrgep(ctx, ctx.spvals_ptr, i * sizeof(jl_value_t*) + sizeof(jl_svec_t)); jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); Value *sp = ai.decorateInst(ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, bp, Align(sizeof(void*)))); isnull = ctx.builder.CreateICmpNE(emit_typeof(ctx, sp, false, true), emit_tagfrom(ctx, jl_tvar_type)); @@ -5678,42 +6056,53 @@ static jl_cgval_t emit_isdefined(jl_codectx_t &ctx, jl_value_t *sym, int allow_i } static jl_cgval_t emit_varinfo(jl_codectx_t &ctx, jl_varinfo_t &vi, jl_sym_t *varname) { - jl_value_t *typ = vi.value.typ; jl_cgval_t v; Value *isnull = NULL; if (vi.boxroot == NULL || vi.pTIndex != NULL) { - if ((!vi.isVolatile && vi.isSA) || vi.isArgument || vi.value.constant || !vi.value.V) { + if ((!vi.isVolatile && vi.isSA) || vi.isArgument || vi.value.constant || !(vi.value.V || vi.inline_roots)) { v = vi.value; if (vi.pTIndex) v.TIndex = ctx.builder.CreateAlignedLoad(getInt8Ty(ctx.builder.getContext()), vi.pTIndex, Align(1)); } else { // copy value to a non-mutable (non-volatile SSA) location - AllocaInst *varslot = cast(vi.value.V); - setName(ctx.emission_context, varslot, jl_symbol_name(varname)); - Type *T = varslot->getAllocatedType(); - assert(!varslot->isArrayAllocation() && "variables not expected to be VLA"); - AllocaInst *ssaslot = cast(varslot->clone()); - setName(ctx.emission_context, ssaslot, jl_symbol_name(varname) + StringRef(".ssa")); - ssaslot->insertAfter(varslot); - if (vi.isVolatile) { - Value *unbox = ctx.builder.CreateAlignedLoad(ssaslot->getAllocatedType(), varslot, - varslot->getAlign(), - true); - ctx.builder.CreateAlignedStore(unbox, ssaslot, ssaslot->getAlign()); - } - else { - const DataLayout &DL = jl_Module->getDataLayout(); - uint64_t sz = DL.getTypeStoreSize(T); - emit_memcpy(ctx, ssaslot, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), vi.value, sz, ssaslot->getAlign(), varslot->getAlign()); + // since this might be a union slot, the most convenient approach to copying + // is to move the whole alloca chunk + AllocaInst *ssaslot = nullptr; + if (vi.value.V) { + auto stack_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); + AllocaInst *varslot = 
cast(vi.value.V); + Type *T = varslot->getAllocatedType(); + assert(!varslot->isArrayAllocation() && "variables not expected to be VLA"); + ssaslot = cast(varslot->clone()); + setName(ctx.emission_context, ssaslot, varslot->getName() + StringRef(".ssa")); + ssaslot->insertAfter(varslot); + if (vi.isVolatile) { + Value *unbox = ctx.builder.CreateAlignedLoad(ssaslot->getAllocatedType(), varslot, varslot->getAlign(), true); + stack_ai.decorateInst(ctx.builder.CreateAlignedStore(unbox, ssaslot, ssaslot->getAlign())); + } + else { + const DataLayout &DL = jl_Module->getDataLayout(); + uint64_t sz = DL.getTypeStoreSize(T); + emit_memcpy(ctx, ssaslot, stack_ai, vi.value, sz, ssaslot->getAlign(), varslot->getAlign()); + } } Value *tindex = NULL; if (vi.pTIndex) tindex = ctx.builder.CreateAlignedLoad(getInt8Ty(ctx.builder.getContext()), vi.pTIndex, Align(1), vi.isVolatile); - v = mark_julia_slot(ssaslot, vi.value.typ, tindex, ctx.tbaa().tbaa_stack); + v = mark_julia_slot(ssaslot, vi.value.typ, tindex, ctx.tbaa().tbaa_stack, None); + } + if (vi.inline_roots) { + AllocaInst *varslot = vi.inline_roots; + size_t nroots = cast(varslot->getArraySize())->getZExtValue(); + auto T_prjlvalue = varslot->getAllocatedType(); + if (auto AT = dyn_cast(T_prjlvalue)) { + nroots *= AT->getNumElements(); + T_prjlvalue = AT->getElementType(); + } + assert(T_prjlvalue == ctx.types().T_prjlvalue); + v.inline_roots = load_gc_roots(ctx, varslot, nroots, vi.isVolatile); } - if (vi.boxroot == NULL) - v = update_julia_type(ctx, v, typ); if (vi.usedUndef) { assert(vi.defFlag); isnull = ctx.builder.CreateAlignedLoad(getInt1Ty(ctx.builder.getContext()), vi.defFlag, Align(1), vi.isVolatile); @@ -5724,7 +6113,7 @@ static jl_cgval_t emit_varinfo(jl_codectx_t &ctx, jl_varinfo_t &vi, jl_sym_t *va Value *box_isnull = NULL; if (vi.usedUndef) box_isnull = ctx.builder.CreateICmpNE(boxed, Constant::getNullValue(ctx.types().T_prjlvalue)); - maybe_mark_load_dereferenceable(boxed, vi.usedUndef || vi.pTIndex, typ); + maybe_mark_load_dereferenceable(boxed, vi.usedUndef || vi.pTIndex, vi.value.typ); if (vi.pTIndex) { // value is either boxed in the stack slot, or unboxed in value // as indicated by testing (pTIndex & UNION_BOX_MARKER) @@ -5733,15 +6122,14 @@ static jl_cgval_t emit_varinfo(jl_codectx_t &ctx, jl_varinfo_t &vi, jl_sym_t *va ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0)); if (vi.usedUndef) isnull = ctx.builder.CreateSelect(load_unbox, isnull, box_isnull); - if (v.V) { // v.V will be null if it is a union of all ghost values + if (v.V) // v.V will be null if it is a union of all ghost values v.V = ctx.builder.CreateSelect(load_unbox, decay_derived(ctx, v.V), decay_derived(ctx, boxed)); - } else + else v.V = boxed; v.Vboxed = boxed; - v = update_julia_type(ctx, v, typ); } else { - v = mark_julia_type(ctx, boxed, true, typ); + v = mark_julia_type(ctx, boxed, true, vi.value.typ); if (vi.usedUndef) isnull = box_isnull; } @@ -5773,49 +6161,27 @@ static void emit_vi_assignment_unboxed(jl_codectx_t &ctx, jl_varinfo_t &vi, Valu store_def_flag(ctx, vi, true); if (!vi.value.constant) { // check that this is not a virtual store - assert(vi.value.ispointer() || (vi.pTIndex && vi.value.V == NULL)); + assert(vi.inline_roots || vi.value.ispointer() || (vi.pTIndex && vi.value.V == NULL)); // store value - if (vi.value.V == NULL) { - // all ghost values in destination - nothing to copy or store - } - else if (rval_info.constant || !rval_info.ispointer()) { - if (rval_info.isghost) { - // all ghost values in source - nothing to copy or 
store - } - else { - if (rval_info.typ != vi.value.typ && !vi.pTIndex && !rval_info.TIndex) { - // isbits cast-on-assignment is invalid. this branch should be dead-code. - CreateTrap(ctx.builder); - } - else { - Value *dest = vi.value.V; - if (vi.pTIndex) // TODO: use lifetime-end here instead - ctx.builder.CreateStore(UndefValue::get(cast(vi.value.V)->getAllocatedType()), vi.value.V); - Type *store_ty = julia_type_to_llvm(ctx, rval_info.constant ? jl_typeof(rval_info.constant) : rval_info.typ); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); - ai.decorateInst(ctx.builder.CreateStore( - emit_unbox(ctx, store_ty, rval_info, rval_info.typ), - dest, - vi.isVolatile)); - } - } - } - else { - if (vi.pTIndex == NULL) { - assert(jl_is_concrete_type(vi.value.typ)); - // Sometimes we can get into situations where the LHS and RHS - // are the same slot. We're not allowed to memcpy in that case - // due to LLVM bugs. - // This check should probably mostly catch the relevant situations. - if (vi.value.V != rval_info.V) { - Value *copy_bytes = ConstantInt::get(getInt32Ty(ctx.builder.getContext()), jl_datatype_size(vi.value.typ)); - Align alignment(julia_alignment(rval_info.typ)); - emit_memcpy(ctx, vi.value.V, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), rval_info, copy_bytes, - alignment, alignment, vi.isVolatile); - } - } + rval_info = update_julia_type(ctx, rval_info, vi.value.typ); + if (rval_info.typ == jl_bottom_type) + return; + if (vi.pTIndex && vi.value.V) // TODO: use lifetime-end here instead + ctx.builder.CreateStore(UndefValue::get(cast(vi.value.V)->getAllocatedType()), vi.value.V); + // Sometimes we can get into situations where the LHS and RHS + // are the same slot. We're not allowed to memcpy in that case + // due to LLVM bugs. + // This check should probably mostly catch the relevant situations. + if (vi.value.V != nullptr ? vi.value.V != rval_info.V : vi.inline_roots != nullptr) { + MDNode *tbaa = ctx.tbaa().tbaa_stack; // Use vi.value.tbaa ? + if (rval_info.TIndex) + emit_unionmove(ctx, vi.value.V, tbaa, rval_info, /*skip*/isboxed, vi.isVolatile); else { - emit_unionmove(ctx, vi.value.V, ctx.tbaa().tbaa_stack, rval_info, /*skip*/isboxed, vi.isVolatile); + Align align(julia_alignment(rval_info.typ)); + if (vi.inline_roots) + split_value_into(ctx, rval_info, align, vi.value.V, align, jl_aliasinfo_t::fromTBAA(ctx, tbaa), vi.inline_roots, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe), vi.isVolatile); + else + emit_unbox_store(ctx, rval_info, vi.value.V, tbaa, align, vi.isVolatile); } } } @@ -5830,7 +6196,8 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) jl_value_t *phiType = NULL; if (jl_is_array(ssavalue_types)) { phiType = jl_array_ptr_ref(ssavalue_types, idx); - } else { + } + else { phiType = (jl_value_t*)jl_any_type; } jl_array_t *edges = (jl_array_t*)jl_fieldref_noalloc(r, 0); @@ -5840,6 +6207,7 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) return; } AllocaInst *dest = nullptr; + SmallVector roots; // N.B.: For any memory space, used as a phi, // we need to emit space twice here. 
The reason for this is that // phi nodes may be arguments of other phi nodes, so if we don't @@ -5850,7 +6218,7 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) size_t min_align, nbytes; dest = try_emit_union_alloca(ctx, ((jl_uniontype_t*)phiType), allunbox, min_align, nbytes); if (dest) { - Instruction *phi = dest->clone(); + AllocaInst *phi = cast(dest->clone()); phi->insertAfter(dest); PHINode *Tindex_phi = PHINode::Create(getInt8Ty(ctx.builder.getContext()), jl_array_nrows(edges), "tindex_phi"); Tindex_phi->insertInto(BB, InsertPt); @@ -5859,14 +6227,14 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) Value *isboxed = ctx.builder.CreateICmpNE( ctx.builder.CreateAnd(Tindex_phi, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), UNION_BOX_MARKER)), ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0)); - ctx.builder.CreateMemCpy(phi, MaybeAlign(min_align), dest, dest->getAlign(), nbytes, false); + ctx.builder.CreateMemCpy(phi, Align(min_align), dest, dest->getAlign(), nbytes, false); ctx.builder.CreateLifetimeEnd(dest); Value *ptr = ctx.builder.CreateSelect(isboxed, decay_derived(ctx, ptr_phi), decay_derived(ctx, phi)); jl_cgval_t val = mark_julia_slot(ptr, phiType, Tindex_phi, best_tbaa(ctx.tbaa(), phiType)); val.Vboxed = ptr_phi; - ctx.PhiNodes.push_back(std::make_tuple(val, BB, dest, ptr_phi, r)); + ctx.PhiNodes.push_back(std::make_tuple(val, BB, dest, ptr_phi, roots, r)); ctx.SAvalues[idx] = val; ctx.ssavalue_assigned[idx] = true; return; @@ -5875,7 +6243,7 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) PHINode *Tindex_phi = PHINode::Create(getInt8Ty(ctx.builder.getContext()), jl_array_nrows(edges), "tindex_phi"); Tindex_phi->insertInto(BB, InsertPt); jl_cgval_t val = mark_julia_slot(NULL, phiType, Tindex_phi, ctx.tbaa().tbaa_stack); - ctx.PhiNodes.push_back(std::make_tuple(val, BB, dest, (PHINode*)NULL, r)); + ctx.PhiNodes.push_back(std::make_tuple(val, BB, dest, (PHINode*)nullptr, roots, r)); ctx.SAvalues[idx] = val; ctx.ssavalue_assigned[idx] = true; return; @@ -5894,23 +6262,38 @@ static void emit_phinode_assign(jl_codectx_t &ctx, ssize_t idx, jl_value_t *r) } jl_cgval_t slot; PHINode *value_phi = NULL; - if (vtype->isAggregateType() && CountTrackedPointers(vtype).count == 0) { + if (!isboxed && vtype->isAggregateType()) { // the value will be moved into dest in the predecessor critical block. 
// here it's moved into phi in the successor (from dest) - dest = emit_static_alloca(ctx, vtype); - Value *phi = emit_static_alloca(ctx, vtype); - ctx.builder.CreateMemCpy(phi, Align(julia_alignment(phiType)), - dest, dest->getAlign(), - jl_datatype_size(phiType), false); - ctx.builder.CreateLifetimeEnd(dest); - slot = mark_julia_slot(phi, phiType, NULL, ctx.tbaa().tbaa_stack); + auto tracked = CountTrackedPointers(vtype); + if (tracked.count) { + roots.resize(tracked.count); + assert(tracked.count == split_value_size((jl_datatype_t*)phiType).second); + for (size_t nr = 0; nr < tracked.count; nr++) { + auto root_phi = PHINode::Create(ctx.types().T_prjlvalue, jl_array_nrows(edges), "root_phi"); + root_phi->insertInto(BB, InsertPt); + roots[nr] = root_phi; + } + } + AllocaInst *phi = nullptr; + if (!tracked.all) { + Align align(julia_alignment(phiType)); + unsigned nb = jl_datatype_size(phiType); + dest = emit_static_alloca(ctx, nb, align); + phi = cast(dest->clone()); + phi->insertBefore(dest); + ctx.builder.CreateMemCpy(phi, align, dest, align, nb, false); + ctx.builder.CreateLifetimeEnd(dest); + } + slot = mark_julia_slot(phi, phiType, NULL, ctx.tbaa().tbaa_stack, + roots.empty() ? ArrayRef() : ArrayRef((Value *const *)&roots.front(), roots.size())); } else { value_phi = PHINode::Create(vtype, jl_array_nrows(edges), "value_phi"); value_phi->insertInto(BB, InsertPt); slot = mark_julia_type(ctx, value_phi, isboxed, phiType); } - ctx.PhiNodes.push_back(std::make_tuple(slot, BB, dest, value_phi, r)); + ctx.PhiNodes.push_back(std::make_tuple(slot, BB, dest, value_phi, roots, r)); ctx.SAvalues[idx] = slot; ctx.ssavalue_assigned[idx] = true; return; @@ -5930,8 +6313,9 @@ static void emit_ssaval_assign(jl_codectx_t &ctx, ssize_t ssaidx_0based, jl_valu it = ctx.phic_slots.emplace(ssaidx_0based, jl_varinfo_t(ctx.builder.getContext())).first; } slot = emit_varinfo(ctx, it->second, jl_symbol("phic")); - } else { - slot = emit_expr(ctx, r, ssaidx_0based); // slot could be a jl_value_t (unboxed) or jl_value_t* (ispointer) + } + else { + slot = emit_expr(ctx, r, ssaidx_0based); } if (slot.isboxed || slot.TIndex) { // see if inference suggested a different type for the ssavalue than the expression @@ -6090,11 +6474,22 @@ static void emit_upsilonnode(jl_codectx_t &ctx, ssize_t phic, jl_value_t *val) vi.pTIndex, Align(1), true); } else if (vi.value.V && !vi.value.constant && vi.value.typ != jl_bottom_type) { - assert(vi.value.ispointer()); - Type *T = cast(vi.value.V)->getAllocatedType(); - if (CountTrackedPointers(T).count) { - // make sure gc pointers (including ptr_phi of union-split) are initialized to NULL - ctx.builder.CreateStore(Constant::getNullValue(T), vi.value.V, true); + assert(vi.inline_roots || vi.value.ispointer()); + if (vi.inline_roots) { + // memory optimization: make gc pointers re-initialized to NULL + AllocaInst *ssaroots = vi.inline_roots; + size_t nroots = cast(ssaroots->getArraySize())->getZExtValue(); + auto T_prjlvalue = ssaroots->getAllocatedType(); + if (auto AT = dyn_cast(T_prjlvalue)) { + nroots *= AT->getNumElements(); + T_prjlvalue = AT->getElementType(); + } + assert(T_prjlvalue == ctx.types().T_prjlvalue); + Value *nullval = Constant::getNullValue(T_prjlvalue); + auto stack_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + for (size_t i = 0; i < nroots; i++) { + stack_ai.decorateInst(ctx.builder.CreateAlignedStore(nullval, emit_ptrgep(ctx, ssaroots, i * sizeof(void*)), ssaroots->getAlign(), true)); + } } } } @@ -6233,7 +6628,8 @@ static std::pair 
get_oc_function(jl_codectx_t &ctx, jl_met if (closure_method->source) { mi = jl_specializations_get_linfo(closure_method, sigtype, jl_emptysvec); ci = (jl_code_instance_t*)jl_rettype_inferred_addr(mi, ctx.min_world, ctx.max_world); - } else { + } + else { mi = (jl_method_instance_t*)jl_atomic_load_relaxed(&closure_method->specializations); assert(jl_is_method_instance(mi)); ci = jl_atomic_load_relaxed(&mi->cache); @@ -6276,7 +6672,8 @@ static std::pair get_oc_function(jl_codectx_t &ctx, jl_met closure_decls.specFunctionObject; if (GlobalValue *V = jl_Module->getNamedValue(fname)) { F = cast(V); - } else { + } + else { F = Function::Create(get_func_sig(ctx.builder.getContext()), Function::ExternalLinkage, fname, jl_Module); @@ -6287,7 +6684,8 @@ static std::pair get_oc_function(jl_codectx_t &ctx, jl_met Function *specF = NULL; if (!isspecsig) { specF = F; - } else { + } + else { //emission context holds context lock so can get module specF = closure_m.getModuleUnlocked()->getFunction(closure_decls.specFunctionObject); if (specF) { @@ -6342,8 +6740,9 @@ static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr, ssize_t ssaidx_ jl_value_t *val = expr; if (jl_is_quotenode(expr)) val = jl_fieldref_noalloc(expr, 0); - if (jl_is_method(ctx.linfo->def.method)) // toplevel exprs are already rooted - val = jl_ensure_rooted(ctx, val); + // Toplevel exprs are rooted but because codegen assumes this is constant, it removes the write barriers for this code. + // This means we have to globally root the value here. (The other option would be to change how we optimize toplevel code) + val = jl_ensure_rooted(ctx, val); return mark_julia_const(ctx, val); } @@ -6578,6 +6977,13 @@ static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr, ssize_t ssaidx_ ((jl_method_t*)source.constant)->nargs > 0 && jl_is_valid_oc_argtype((jl_tupletype_t*)argt.constant, (jl_method_t*)source.constant); + if (!can_optimize && trim_may_error(ctx.params->trim)) { + // if we know the return type, we can assume the result is of that type + errs() << "ERROR: Dynamic call to OpaqueClosure method\n"; + errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" << ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; + print_stacktrace(ctx, ctx.params->trim); + } + if (can_optimize) { jl_value_t *closure_t = NULL; jl_value_t *env_t = NULL; @@ -6745,34 +7151,26 @@ static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0, bool or_new=fal static Value *get_current_task(jl_codectx_t &ctx) { - return get_current_task_from_pgcstack(ctx.builder, ctx.types().T_size, ctx.pgcstack); + return get_current_task_from_pgcstack(ctx.builder, ctx.pgcstack); } // Get PTLS through current task. 
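
Aside: the failed-dispatch diagnostics that --trim emits in emit_call wrap the argument pretty-printer in a safe-restore setjmp, so a fault while printing the offending call site degrades to a fallback message rather than aborting codegen. The shape of that guard, sketched with plain setjmp/longjmp instead of the runtime's jl_setjmp and jl_set_safe_restore (risky_print is a made-up stand-in for jl_static_show):

    #include <csetjmp>
    #include <cstdio>

    static std::jmp_buf guard;

    // made-up stand-in for jl_static_show; longjmp models it faulting mid-print
    static void risky_print(bool fail) {
        if (fail)
            std::longjmp(guard, 1);
        std::puts("Dynamic call to f(::Int64, ::Any)");
    }

    int main() {
        if (setjmp(guard) == 0)
            risky_print(false);          // normal path: the call site gets printed
        else
            std::puts("!!! ERROR while printing error -- ABORTING !!!");
        return 0;
    }
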
static Value *get_current_ptls(jl_codectx_t &ctx) { - return get_current_ptls_from_task(ctx.builder, ctx.types().T_size, get_current_task(ctx), ctx.tbaa().tbaa_gcframe); + return get_current_ptls_from_task(ctx.builder, get_current_task(ctx), ctx.tbaa().tbaa_gcframe); } // Get the address of the world age of the current task static Value *get_tls_world_age_field(jl_codectx_t &ctx) { Value *ct = get_current_task(ctx); - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_size, - ct, - ConstantInt::get(ctx.types().T_size, offsetof(jl_task_t, world_age) / ctx.types().sizeof_ptr), - "world_age"); + return emit_ptrgep(ctx, ct, offsetof(jl_task_t, world_age), "world_age"); } static Value *get_scope_field(jl_codectx_t &ctx) { Value *ct = get_current_task(ctx); - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_prjlvalue, - ct, - ConstantInt::get(ctx.types().T_size, offsetof(jl_task_t, scope) / ctx.types().sizeof_ptr), - "current_scope"); + return emit_ptrgep(ctx, ct, offsetof(jl_task_t, scope), "current_scope"); } static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, Module *M, jl_codegen_params_t ¶ms) @@ -6785,6 +7183,11 @@ static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptr GlobalVariable::InternalLinkage, name, M); jl_init_function(f, params.TargetTriple); + if (trim_may_error(params.params->trim)) { + arraylist_push(&new_invokes, codeinst->def); // Try t compile this invoke + // TODO: Debuginfo! + push_frames(ctx, ctx.linfo, codeinst->def, 1); + } jl_name_jlfunc_args(params, f); //f->setAlwaysInline(); ctx.f = f; // for jl_Module @@ -6810,14 +7213,6 @@ static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptr return f; } -static Type *get_returnroots_type(jl_codectx_t &ctx, unsigned rootcount) { - return ArrayType::get(ctx.types().T_prjlvalue, rootcount); -} - -static Type *get_unionbytes_type(LLVMContext &C, unsigned unionbytes) { - return ArrayType::get(getInt8Ty(C), unionbytes); -} - static void emit_cfunc_invalidate( Function *gf_thunk, jl_returninfo_t::CallingConv cc, unsigned return_roots, jl_value_t *calltype, jl_value_t *rettype, bool is_for_opaque_closure, @@ -6845,14 +7240,17 @@ static void emit_cfunc_invalidate( ++AI; // gcstack_arg } for (size_t i = 0; i < nargs; i++) { + // n.b. calltype is required to be a datatype by construction for specsig jl_value_t *jt = jl_nth_slot_type(calltype, i); - // n.b. 
specTypes is required to be a datatype by construction for specsig - bool isboxed = false; - Type *et; if (i == 0 && is_for_opaque_closure) { - et = PointerType::get(ctx.types().T_jlvalue, AddressSpace::Derived); + Value *arg_v = &*AI; + ++AI; + myargs[i] = mark_julia_slot(arg_v, jt, NULL, ctx.tbaa().tbaa_const); + continue; } - else if (deserves_argbox(jt)) { + bool isboxed = false; + Type *et; + if (deserves_argbox(jt)) { et = ctx.types().T_prjlvalue; isboxed = true; } @@ -6869,15 +7267,19 @@ static void emit_cfunc_invalidate( else { Value *arg_v = &*AI; ++AI; - Type *at = arg_v->getType(); - if ((i == 0 && is_for_opaque_closure) || (!isboxed && et->isAggregateType())) { - myargs[i] = mark_julia_slot(arg_v, jt, NULL, ctx.tbaa().tbaa_const); + if (!isboxed && et->isAggregateType()) { + auto tracked = CountTrackedPointers(et); + SmallVector roots; + if (tracked.count && !tracked.all) { + roots = load_gc_roots(ctx, &*AI, tracked.count); + ++AI; + } + myargs[i] = mark_julia_slot(arg_v, jt, NULL, ctx.tbaa().tbaa_const, roots); } else { - assert(at == et); + assert(arg_v->getType() == et); myargs[i] = mark_julia_type(ctx, arg_v, isboxed, jt); } - (void)at; } } assert(AI == gf_thunk->arg_end()); @@ -6885,6 +7287,7 @@ static void emit_cfunc_invalidate( jl_cgval_t gf_retbox = mark_julia_type(ctx, gf_ret, true, jl_any_type); if (cc != jl_returninfo_t::Boxed) { emit_typecheck(ctx, gf_retbox, rettype, "cfunction"); + gf_retbox = update_julia_type(ctx, gf_retbox, rettype); } switch (cc) { @@ -6902,17 +7305,15 @@ static void emit_cfunc_invalidate( break; } case jl_returninfo_t::SRet: { + Value *sret = &*gf_thunk->arg_begin(); + Align align(julia_alignment(rettype)); if (return_roots) { - Value *root1 = gf_thunk->arg_begin() + 1; // root1 has type [n x {}*]* - #if JL_LLVM_VERSION < 170000 - assert(cast(root1->getType())->isOpaqueOrPointeeTypeMatches(get_returnroots_type(ctx, return_roots))); - #endif - root1 = ctx.builder.CreateConstInBoundsGEP2_32(get_returnroots_type(ctx, return_roots), root1, 0, 0); - ctx.builder.CreateStore(gf_ret, root1); - } - Align alignment(julia_alignment(rettype)); - emit_memcpy(ctx, &*gf_thunk->arg_begin(), jl_aliasinfo_t::fromTBAA(ctx, nullptr), gf_ret, - jl_aliasinfo_t::fromTBAA(ctx, nullptr), jl_datatype_size(rettype), Align(alignment), Align(alignment)); + Value *roots = gf_thunk->arg_begin() + 1; // root1 has type [n x {}*]* + split_value_into(ctx, gf_retbox, align, sret, align, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), roots, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe)); + } + else { + emit_unbox_store(ctx, gf_retbox, sret, ctx.tbaa().tbaa_stack, align); + } ctx.builder.CreateRetVoid(); break; } @@ -7086,10 +7487,7 @@ static Function* gen_cfun_wrapper( if (calltype) { LoadInst *lam_max = ctx.builder.CreateAlignedLoad( ctx.types().T_size, - ctx.builder.CreateConstInBoundsGEP1_32( - ctx.types().T_size, - literal_pointer_val(ctx, (jl_value_t*)codeinst), - offsetof(jl_code_instance_t, max_world) / ctx.types().sizeof_ptr), + emit_ptrgep(ctx, literal_pointer_val(ctx, (jl_value_t*)codeinst), offsetof(jl_code_instance_t, max_world)), ctx.types().alignof_ptr); age_ok = ctx.builder.CreateICmpUGE(lam_max, world_v); } @@ -7170,7 +7568,7 @@ static Function* gen_cfun_wrapper( *closure_types = jl_alloc_vec_any(0); jl_array_ptr_1d_push(*closure_types, jargty); Value *runtime_dt = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, - ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, nestPtr, jl_array_nrows(*closure_types)), + emit_ptrgep(ctx, 
nestPtr, jl_array_nrows(*closure_types) * ctx.types().sizeof_ptr), Align(sizeof(void*))); BasicBlock *boxedBB = BasicBlock::Create(ctx.builder.getContext(), "isboxed", cw); BasicBlock *loadBB = BasicBlock::Create(ctx.builder.getContext(), "need-load", cw); @@ -7236,7 +7634,7 @@ static Function* gen_cfun_wrapper( *closure_types = jl_alloc_vec_any(0); jl_array_ptr_1d_push(*closure_types, jargty); Value *runtime_dt = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, - ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, nestPtr, jl_array_nrows(*closure_types)), + emit_ptrgep(ctx, nestPtr, jl_array_nrows(*closure_types) * ctx.types().sizeof_ptr), Align(sizeof(void*))); Value *strct = box_ccall_result(ctx, val, runtime_dt, jargty); inputarg = mark_julia_type(ctx, strct, true, jargty_proper); @@ -7305,77 +7703,9 @@ static Function* gen_cfun_wrapper( bool is_opaque_closure = jl_is_method(lam->def.value) && lam->def.method->is_for_opaque_closure; assert(calltype == 3); // emit a specsig call - bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); StringRef protoname = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)callptr, invoke, codeinst); + bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); jl_returninfo_t returninfo = get_specsig_function(ctx, M, NULL, protoname, lam->specTypes, astrt, is_opaque_closure, gcstack_arg); - FunctionType *cft = returninfo.decl.getFunctionType(); - jlfunc_sret = (returninfo.cc == jl_returninfo_t::SRet); - - // TODO: Can use use emit_call_specfun_other here? - SmallVector args; - Value *result = nullptr; - if (jlfunc_sret || returninfo.cc == jl_returninfo_t::Union) { - // fuse the two sret together, or emit an alloca to hold it - if (sig.sret && jlfunc_sret) { - result = emit_bitcast(ctx, sretPtr, cft->getParamType(0)); - } - else { - if (jlfunc_sret) { - result = emit_static_alloca(ctx, getAttributeAtIndex(returninfo.attrs, 1, Attribute::StructRet).getValueAsType()); - setName(ctx.emission_context, result, "sret"); - #if JL_LLVM_VERSION < 170000 - assert(cast(result->getType())->hasSameElementTypeAs(cast(cft->getParamType(0)))); - #endif - } else { - result = emit_static_alloca(ctx, get_unionbytes_type(ctx.builder.getContext(), returninfo.union_bytes)); - setName(ctx.emission_context, result, "result_union"); - #if JL_LLVM_VERSION < 170000 - assert(cast(result->getType())->hasSameElementTypeAs(cast(cft->getParamType(0)))); - #endif - } - } - args.push_back(result); - } - if (returninfo.return_roots) { - AllocaInst *return_roots = emit_static_alloca(ctx, get_returnroots_type(ctx, returninfo.return_roots)); - setName(ctx.emission_context, return_roots, "return_roots"); - args.push_back(return_roots); - } - if (gcstack_arg) - args.push_back(ctx.pgcstack); - for (size_t i = 0; i < nargs + 1; i++) { - // figure out how to repack the arguments - jl_cgval_t &inputarg = inputargs[i]; - Value *arg; - jl_value_t *spect = (i == 0 && is_opaque_closure) ? (jl_value_t*)jl_any_type : - jl_nth_slot_type(lam->specTypes, i); - // n.b. specTypes is required to be a datatype by construction for specsig - bool isboxed = deserves_argbox(spect); - Type *T = isboxed ? 
ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, spect); - if (is_uniquerep_Type(spect)) { - continue; - } - else if (isboxed) { - arg = boxed(ctx, inputarg); - } - else if (type_is_ghost(T)) { - continue; // ghost types are skipped by the specsig method signature - } - else if (T->isAggregateType()) { - // aggregate types are passed by pointer - inputarg = value_to_pointer(ctx, inputarg); - arg = decay_derived(ctx, data_pointer(ctx, inputarg)); - } - else { - arg = emit_unbox(ctx, T, inputarg, spect); - assert(!isa(arg)); - } - - // add to argument list - args.push_back(arg); - } - Value *theFptr = returninfo.decl.getCallee(); - assert(theFptr); if (age_ok) { funcName += "_gfthunk"; Function *gf_thunk = Function::Create(returninfo.decl.getFunctionType(), @@ -7387,49 +7717,17 @@ static Function* gen_cfun_wrapper( // but which has the signature of a specsig emit_cfunc_invalidate(gf_thunk, returninfo.cc, returninfo.return_roots, lam->specTypes, codeinst->rettype, is_opaque_closure, nargs + 1, ctx.emission_context, min_world, max_world); - theFptr = ctx.builder.CreateSelect(age_ok, theFptr, gf_thunk); + returninfo.decl = FunctionCallee(returninfo.decl.getFunctionType(), ctx.builder.CreateSelect(age_ok, returninfo.decl.getCallee(), gf_thunk)); } - - #if JL_LLVM_VERSION < 170000 - assert(cast(theFptr->getType())->isOpaqueOrPointeeTypeMatches(returninfo.decl.getFunctionType())); - #endif - CallInst *call = ctx.builder.CreateCall( - returninfo.decl.getFunctionType(), - theFptr, ArrayRef(args)); - call->setAttributes(returninfo.attrs); - if (gcstack_arg) - call->setCallingConv(CallingConv::Swift); - - switch (returninfo.cc) { - case jl_returninfo_t::Boxed: - retval = mark_julia_type(ctx, call, true, astrt); - break; - case jl_returninfo_t::Register: - retval = mark_julia_type(ctx, call, false, astrt); - break; - case jl_returninfo_t::SRet: - retval = mark_julia_slot(result, astrt, NULL, ctx.tbaa().tbaa_stack); - break; - case jl_returninfo_t::Union: { - Value *box = ctx.builder.CreateExtractValue(call, 0); - Value *tindex = ctx.builder.CreateExtractValue(call, 1); - Value *derived = ctx.builder.CreateSelect( - ctx.builder.CreateICmpEQ( - ctx.builder.CreateAnd(tindex, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), UNION_BOX_MARKER)), - ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0)), - decay_derived(ctx, result), - decay_derived(ctx, box)); - retval = mark_julia_slot(derived, - astrt, - tindex, - ctx.tbaa().tbaa_stack); - assert(box->getType() == ctx.types().T_prjlvalue); - retval.Vboxed = box; - break; - } - case jl_returninfo_t::Ghosts: - retval = mark_julia_slot(NULL, astrt, call, ctx.tbaa().tbaa_stack); - break; + retval = emit_call_specfun_other(ctx, is_opaque_closure, lam->specTypes, codeinst->rettype, returninfo, nullptr, inputargs, nargs + 1); + jlfunc_sret = (returninfo.cc == jl_returninfo_t::SRet); + if (jlfunc_sret && sig.sret) { + // fuse the two sret together + assert(retval.ispointer()); + AllocaInst *result = cast(retval.V); + retval.V = sretPtr; + result->replaceAllUsesWith(sretPtr); + result->eraseFromParent(); } } @@ -7728,7 +8026,7 @@ const char *jl_generate_ccallable(LLVMOrcThreadSafeModuleRef llvmmod, void *sysi } // generate a julia-callable function that calls f (AKA lam) -static Function *gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlretty, const jl_returninfo_t &f, int retarg, StringRef funcName, +static void gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlretty, jl_returninfo_t &f, unsigned nargs, int retarg, StringRef funcName, 
Module *M, jl_codegen_params_t ¶ms) { ++GeneratedInvokeWrappers; @@ -7756,131 +8054,48 @@ static Function *gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlret ctx.builder.SetCurrentDebugLocation(noDbg); allocate_gc_frame(ctx, b0); - // TODO: replace this with emit_call_specfun_other? - FunctionType *ftype = const_cast(f.decl).getFunctionType(); - size_t nfargs = ftype->getNumParams(); - SmallVector args(nfargs); - unsigned idx = 0; - AllocaInst *result = NULL; - switch (f.cc) { - case jl_returninfo_t::Boxed: - case jl_returninfo_t::Register: - case jl_returninfo_t::Ghosts: - break; - case jl_returninfo_t::SRet: - #if JL_LLVM_VERSION < 170000 - assert(cast(ftype->getParamType(0))->isOpaqueOrPointeeTypeMatches(getAttributeAtIndex(f.attrs, 1, Attribute::StructRet).getValueAsType())); - #endif - result = ctx.builder.CreateAlloca(getAttributeAtIndex(f.attrs, 1, Attribute::StructRet).getValueAsType()); - setName(ctx.emission_context, result, "sret"); - args[idx] = result; - idx++; - break; - case jl_returninfo_t::Union: - result = ctx.builder.CreateAlloca(ArrayType::get(getInt8Ty(ctx.builder.getContext()), f.union_bytes)); - if (f.union_align > 1) - result->setAlignment(Align(f.union_align)); - args[idx] = result; - idx++; - setName(ctx.emission_context, result, "result_union"); - break; - } - if (f.return_roots) { - AllocaInst *return_roots = emit_static_alloca(ctx, ArrayType::get(ctx.types().T_prjlvalue, f.return_roots)); - setName(ctx.emission_context, return_roots, "return_roots"); - args[idx] = return_roots; - idx++; - } - bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); - if (gcstack_arg) { - args[idx] = ctx.pgcstack; - idx++; - } + SmallVector argv(nargs); bool is_opaque_closure = jl_is_method(lam->def.value) && lam->def.method->is_for_opaque_closure; - for (size_t i = 0; i < jl_nparams(lam->specTypes) && idx < nfargs; ++i) { + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); + for (size_t i = 0; i < nargs; ++i) { jl_value_t *ty = ((i == 0) && is_opaque_closure) ? (jl_value_t*)jl_any_type : jl_nth_slot_type(lam->specTypes, i); - // n.b. specTypes is required to be a datatype by construction for specsig - bool isboxed = deserves_argbox(ty); - Type *lty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, ty); - if (type_is_ghost(lty) || is_uniquerep_Type(ty)) - continue; Value *theArg; if (i == 0) { - // This function adapts from generic jlcall to OC specsig. Generic jlcall pointers - // come in as ::Tracked, but specsig expected ::Derived. 
- if (is_opaque_closure) - theArg = decay_derived(ctx, funcArg); - else - theArg = funcArg; + theArg = funcArg; } else { - Value *argPtr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, argArray, i - 1); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); + Value *argPtr = emit_ptrgep(ctx, argArray, (i - 1) * ctx.types().sizeof_ptr); theArg = ai.decorateInst(maybe_mark_load_dereferenceable( ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, argPtr, Align(sizeof(void*))), false, ty)); } - if (!isboxed) { - theArg = decay_derived(ctx, theArg); - if (!lty->isAggregateType()) // keep "aggregate" type values in place as pointers - theArg = ctx.builder.CreateAlignedLoad(lty, theArg, Align(julia_alignment(ty))); - } - assert(!isa(theArg)); - args[idx] = theArg; - idx++; + argv[i] = mark_julia_type(ctx, theArg, true, ty); } - CallInst *call = ctx.builder.CreateCall(f.decl, args); - call->setAttributes(f.attrs); - if (gcstack_arg) - call->setCallingConv(CallingConv::Swift); - jl_cgval_t retval; + jl_cgval_t retval = emit_call_specfun_other(ctx, is_opaque_closure, lam->specTypes, jlretty, f, nullptr, argv, nargs); if (retarg != -1) { Value *theArg; if (retarg == 0) theArg = funcArg; else theArg = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, - ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, argArray, retarg - 1), + emit_ptrgep(ctx, argArray, (retarg - 1) * ctx.types().sizeof_ptr), Align(sizeof(void*))); retval = mark_julia_type(ctx, theArg, true, jl_any_type); } - else { - switch (f.cc) { - case jl_returninfo_t::Boxed: - retval = mark_julia_type(ctx, call, true, jlretty); - break; - case jl_returninfo_t::Register: - retval = mark_julia_type(ctx, call, false, jlretty); - break; - case jl_returninfo_t::SRet: - retval = mark_julia_slot(result, jlretty, NULL, ctx.tbaa().tbaa_stack); - break; - case jl_returninfo_t::Union: - // result is technically not right here, but `boxed` will only look at it - // for the unboxed values, so it's ok. 
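/* Editor's note (not part of the patch): the hand-rolled specsig call and
 * return-value unpacking being removed in this hunk is exactly what the deleted
 * "TODO: replace this with emit_call_specfun_other?" asked for; the '+' lines
 * further down collapse it into a single emit_call_specfun_other call, and
 * gen_cfun_wrapper receives the same cleanup in the hunk above. */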
- retval = mark_julia_slot(result, - jlretty, - ctx.builder.CreateExtractValue(call, 1), - ctx.tbaa().tbaa_stack); - retval.Vboxed = ctx.builder.CreateExtractValue(call, 0); - assert(retval.Vboxed->getType() == ctx.types().T_prjlvalue); - break; - case jl_returninfo_t::Ghosts: - retval = mark_julia_slot(NULL, jlretty, call, ctx.tbaa().tbaa_stack); - break; - } - } - ctx.builder.CreateRet(boxed(ctx, retval)); - return w; + if (retval.typ == jl_bottom_type) + CreateTrap(ctx.builder, false); + else + ctx.builder.CreateRet(boxed(ctx, retval)); } -static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value *fval, StringRef name, jl_value_t *sig, jl_value_t *jlrettype, bool is_opaque_closure, bool gcstack_arg, BitVector *used_arguments, size_t *arg_offset) +static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value *fval, StringRef name, jl_value_t *sig, jl_value_t *jlrettype, bool is_opaque_closure, bool gcstack_arg, + ArrayRef ArgNames, unsigned nreq) { jl_returninfo_t props = {}; - SmallVector fsig; - SmallVector argnames; + SmallVector fsig; + SmallVector argnames; Type *rt = NULL; Type *srt = NULL; if (jlrettype == (jl_value_t*)jl_bottom_type) { @@ -7917,9 +8132,13 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value if (rt != getVoidTy(ctx.builder.getContext()) && deserves_sret(jlrettype, rt)) { auto tracked = CountTrackedPointers(rt, true); assert(!tracked.derived); - if (tracked.count && !tracked.all) + if (tracked.count && !tracked.all) { props.return_roots = tracked.count; + assert(props.return_roots == ((jl_datatype_t*)jlrettype)->layout->npointers); + } props.cc = jl_returninfo_t::SRet; + props.union_bytes = jl_datatype_size(jlrettype); + props.union_align = props.union_minalign = julia_alignment(jlrettype); // sret is always passed from alloca assert(M); fsig.push_back(rt->getPointerTo(M->getDataLayout().getAllocaAddrSpace())); @@ -7961,7 +8180,7 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value param.addAttribute(Attribute::NoCapture); param.addAttribute(Attribute::NoUndef); attrs.push_back(AttributeSet::get(ctx.builder.getContext(), param)); - fsig.push_back(get_returnroots_type(ctx, props.return_roots)->getPointerTo(0)); + fsig.push_back(ctx.types().T_ptr); argnames.push_back("return_roots"); } @@ -7974,33 +8193,25 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value argnames.push_back("pgcstack_arg"); } - if (arg_offset) - *arg_offset = fsig.size(); size_t nparams = jl_nparams(sig); - if (used_arguments) - used_arguments->resize(nparams); - for (size_t i = 0; i < nparams; i++) { jl_value_t *jt = jl_tparam(sig, i); bool isboxed = false; - Type *ty = NULL; - if (i == 0 && is_opaque_closure) { - ty = PointerType::get(ctx.types().T_jlvalue, AddressSpace::Derived); - isboxed = true; // true-ish anyway - we might not have the type tag - } - else { + Type *et = nullptr; + if (i != 0 || !is_opaque_closure) { // special token for OC argument if (is_uniquerep_Type(jt)) continue; isboxed = deserves_argbox(jt); - ty = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jt); + et = isboxed ? 
ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, jt); + if (type_is_ghost(et)) + continue; } - if (type_is_ghost(ty)) - continue; AttrBuilder param(ctx.builder.getContext()); - if (ty->isAggregateType()) { // aggregate types are passed by pointer + Type *ty = et; + if (et == nullptr || et->isAggregateType()) { // aggregate types are passed by pointer param.addAttribute(Attribute::NoCapture); param.addAttribute(Attribute::ReadOnly); - ty = PointerType::get(ty, AddressSpace::Derived); + ty = ctx.builder.getPtrTy(AddressSpace::Derived); } else if (isboxed && jl_is_immutable_datatype(jt)) { param.addAttribute(Attribute::ReadOnly); @@ -8012,8 +8223,26 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value } attrs.push_back(AttributeSet::get(ctx.builder.getContext(), param)); fsig.push_back(ty); - if (used_arguments) - used_arguments->set(i); + size_t argno = i < nreq ? i : nreq; + std::string genname; + if (!ArgNames.empty()) { + genname = ArgNames[argno]; + if (genname.empty()) + genname = (StringRef("#") + Twine(argno + 1)).str(); + if (i >= nreq) + genname += (StringRef("[") + Twine(i - nreq + 1) + StringRef("]")).str(); + const char *arg_typename = jl_is_datatype(jt) ? jl_symbol_name(((jl_datatype_t*)jt)->name->name) : ""; + argnames.push_back((genname + StringRef("::") + arg_typename).str()); + } + if (et && et->isAggregateType()) { + auto tracked = CountTrackedPointers(et); + if (tracked.count && !tracked.all) { + attrs.push_back(AttributeSet::get(ctx.builder.getContext(), param)); + fsig.push_back(ctx.builder.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); + if (!genname.empty()) + argnames.push_back((Twine(".roots.") + genname).str()); + } + } } AttributeSet FnAttrs; @@ -8061,12 +8290,6 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value return props; } -static void emit_sret_roots(jl_codectx_t &ctx, bool isptr, Value *Src, Type *T, Value *Shadow, Type *ShadowT, unsigned count) -{ - unsigned emitted = TrackWithShadow(Src, T, isptr, Shadow, ShadowT, ctx.builder); //This comes from Late-GC-Lowering?? 
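/* Editor's note (not part of the patch): emit_sret_roots, deleted here, copied the
 * GC-tracked pointers of an sret value into the separate return-roots argument via
 * TrackWithShadow. Its former callers now appear to rely on split_value_into, which
 * writes the untracked bytes to the sret slot and the tracked pointers to the roots
 * array in one step (see the SRet paths in emit_cfunc_invalidate and in the
 * function-epilogue return handling elsewhere in this patch). */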
- assert(emitted == count); (void)emitted; (void)count; -} - static DISubroutineType * get_specsig_di(jl_codectx_t &ctx, jl_debugcache_t &debuginfo, jl_value_t *rt, jl_value_t *sig, DIBuilder &dbuilder) { @@ -8182,6 +8405,7 @@ static jl_llvm_functions_t if (lam && jl_is_method(lam->def.method)) { toplineno = lam->def.method->line; ctx.file = jl_symbol_name(lam->def.method->file); + ctx.line = lam->def.method->line; } else if ((jl_value_t*)src->debuginfo != jl_nothing) { // look for the file and line info of the original start of this block, as reported by lowering @@ -8190,6 +8414,7 @@ static jl_llvm_functions_t debuginfo = debuginfo->linetable; ctx.file = jl_debuginfo_file(debuginfo); struct jl_codeloc_t lineidx = jl_uncompress1_codeloc(debuginfo->codelocs, 0); + ctx.line = lineidx.line; toplineno = std::max((int32_t)0, lineidx.line); } if (ctx.file.empty()) @@ -8279,49 +8504,26 @@ static jl_llvm_functions_t Function *f = NULL; bool has_sret = false; if (specsig) { // assumes !va and !needsparams - BitVector used_args; - size_t args_begin; - returninfo = get_specsig_function(ctx, M, NULL, declarations.specFunctionObject, lam->specTypes, - jlrettype, ctx.is_opaque_closure, JL_FEAT_TEST(ctx,gcstack_arg), &used_args, &args_begin); - f = cast(returninfo.decl.getCallee()); - has_sret = (returninfo.cc == jl_returninfo_t::SRet || returninfo.cc == jl_returninfo_t::Union); - jl_init_function(f, ctx.emission_context.TargetTriple); + SmallVector ArgNames(0); if (ctx.emission_context.debug_level >= 2) { - auto arg_typename = [&](size_t i) JL_NOTSAFEPOINT { - auto tp = jl_tparam(lam->specTypes, i); - return jl_is_datatype(tp) ? jl_symbol_name(((jl_datatype_t*)tp)->name->name) : ""; - }; - size_t nreal = 0; - for (size_t i = 0; i < std::min(nreq, static_cast(used_args.size())); i++) { + ArgNames.resize(ctx.nargs, ""); + for (int i = 0; i < ctx.nargs; i++) { jl_sym_t *argname = slot_symbol(ctx, i); if (argname == jl_unused_sym) continue; - if (used_args.test(i)) { - auto &arg = *f->getArg(args_begin++); - nreal++; - auto name = jl_symbol_name(argname); - if (!name[0]) { - arg.setName(StringRef("#") + Twine(nreal) + StringRef("::") + arg_typename(i)); - } else { - arg.setName(name + StringRef("::") + arg_typename(i)); - } - } - } - if (va && ctx.vaSlot != -1) { - size_t vidx = 0; - for (size_t i = nreq; i < used_args.size(); i++) { - if (used_args.test(i)) { - auto &arg = *f->getArg(args_begin++); - auto type = arg_typename(i); - const char *name = jl_symbol_name(slot_symbol(ctx, ctx.vaSlot)); - if (!name[0]) - name = "..."; - vidx++; - arg.setName(name + StringRef("[") + Twine(vidx) + StringRef("]::") + type); - } - } + const char *name = jl_symbol_name(argname); + if (name[0] == '\0' && ctx.vaSlot == i) + ArgNames[i] = "..."; + else + ArgNames[i] = name; } } + returninfo = get_specsig_function(ctx, M, NULL, declarations.specFunctionObject, lam->specTypes, + jlrettype, ctx.is_opaque_closure, JL_FEAT_TEST(ctx,gcstack_arg), + ArgNames, nreq); + f = cast(returninfo.decl.getCallee()); + has_sret = (returninfo.cc == jl_returninfo_t::SRet || returninfo.cc == jl_returninfo_t::Union); + jl_init_function(f, ctx.emission_context.TargetTriple); // common pattern: see if all return statements are an argument in that // case the apply-generic call can re-use the original box for the return @@ -8350,7 +8552,8 @@ static jl_llvm_functions_t std::string wrapName; raw_string_ostream(wrapName) << "jfptr_" << ctx.name << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); declarations.functionObject = 
wrapName; - (void)gen_invoke_wrapper(lam, jlrettype, returninfo, retarg, declarations.functionObject, M, ctx.emission_context); + size_t nparams = jl_nparams(lam->specTypes); + gen_invoke_wrapper(lam, jlrettype, returninfo, nparams, retarg, declarations.functionObject, M, ctx.emission_context); // TODO: add attributes: maybe_mark_argument_dereferenceable(Arg, argType) // TODO: add attributes: dereferenceable // TODO: (if needsparams) add attributes: dereferenceable, readonly, nocapture @@ -8521,14 +8724,16 @@ static jl_llvm_functions_t allocate_gc_frame(ctx, b0); Value *last_age = NULL; auto world_age_field = get_tls_world_age_field(ctx); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); - last_age = ai.decorateInst(ctx.builder.CreateAlignedLoad( - ctx.types().T_size, world_age_field, ctx.types().alignof_ptr)); - ctx.world_age_at_entry = last_age; // Load world age for use in get_tls_world_age + { // scope + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe); + last_age = ai.decorateInst(ctx.builder.CreateAlignedLoad( + ctx.types().T_size, world_age_field, ctx.types().alignof_ptr)); + ctx.world_age_at_entry = last_age; // Load world age for use in get_tls_world_age + } // step 7. allocate local variables slots // must be in the first basic block for the llvm mem2reg pass to work - auto allocate_local = [&ctx, &dbuilder, &debugcache, topdebugloc, va, debug_enabled, M](jl_varinfo_t &varinfo, jl_sym_t *s, int i) { + auto allocate_local = [&ctx, &dbuilder, &debugcache, topdebugloc, va, debug_enabled](jl_varinfo_t &varinfo, jl_sym_t *s, int i) { jl_value_t *jt = varinfo.value.typ; assert(!varinfo.boxroot); // variables shouldn't have memory locs already if (varinfo.value.constant) { @@ -8548,13 +8753,13 @@ static jl_llvm_functions_t if (lv) { lv->setName(jl_symbol_name(s)); varinfo.value = mark_julia_slot(lv, jt, NULL, ctx.tbaa().tbaa_stack); - varinfo.pTIndex = emit_static_alloca(ctx, getInt8Ty(ctx.builder.getContext())); + varinfo.pTIndex = emit_static_alloca(ctx, 1, Align(1)); setName(ctx.emission_context, varinfo.pTIndex, "tindex"); // TODO: attach debug metadata to this variable } else if (allunbox) { // all ghost values just need a selector allocated - AllocaInst *lv = emit_static_alloca(ctx, getInt8Ty(ctx.builder.getContext())); + AllocaInst *lv = emit_static_alloca(ctx, 1, Align(1)); lv->setName(jl_symbol_name(s)); varinfo.pTIndex = lv; varinfo.value.tbaa = NULL; @@ -8567,30 +8772,25 @@ static jl_llvm_functions_t return; } else if (deserves_stack(jt)) { - bool isboxed; - Type *vtype = julia_type_to_llvm(ctx, jt, &isboxed); - assert(!isboxed); - assert(!type_is_ghost(vtype) && "constants should already be handled"); - Value *lv = new AllocaInst(vtype, M->getDataLayout().getAllocaAddrSpace(), nullptr, Align(jl_datatype_align(jt)), jl_symbol_name(s), /*InsertBefore*/ctx.topalloca); - if (CountTrackedPointers(vtype).count) { - StoreInst *SI = new StoreInst(Constant::getNullValue(vtype), lv, false, Align(sizeof(void*))); - SI->insertAfter(ctx.topalloca); - } - varinfo.value = mark_julia_slot(lv, jt, NULL, ctx.tbaa().tbaa_stack); + auto sizes = split_value_size((jl_datatype_t*)jt); + AllocaInst *bits = sizes.first > 0 ? emit_static_alloca(ctx, sizes.first, Align(julia_alignment(jt))) : nullptr; + AllocaInst *roots = sizes.second > 0 ? 
emit_static_roots(ctx, sizes.second) : nullptr; + if (bits) bits->setName(jl_symbol_name(s)); + if (roots) roots->setName(StringRef(".roots.") + jl_symbol_name(s)); + varinfo.value = mark_julia_slot(bits, jt, NULL, ctx.tbaa().tbaa_stack, None); + varinfo.inline_roots = roots; alloc_def_flag(ctx, varinfo); if (debug_enabled && varinfo.dinfo) { assert((Metadata*)varinfo.dinfo->getType() != debugcache.jl_pvalue_dillvmt); - dbuilder.insertDeclare(lv, varinfo.dinfo, dbuilder.createExpression(), + dbuilder.insertDeclare(bits ? bits : roots, varinfo.dinfo, dbuilder.createExpression(), topdebugloc, ctx.builder.GetInsertBlock()); } return; } // otherwise give it a boxroot in this function - AllocaInst *av = new AllocaInst(ctx.types().T_prjlvalue, M->getDataLayout().getAllocaAddrSpace(), - nullptr, Align(sizeof(jl_value_t*)), jl_symbol_name(s), /*InsertBefore*/ctx.topalloca); - StoreInst *SI = new StoreInst(Constant::getNullValue(ctx.types().T_prjlvalue), av, false, Align(sizeof(void*))); - SI->insertAfter(ctx.topalloca); + AllocaInst *av = emit_static_roots(ctx, 1); + av->setName(jl_symbol_name(s)); varinfo.boxroot = av; if (debug_enabled && varinfo.dinfo) { SmallVector addr; @@ -8677,12 +8877,18 @@ static jl_llvm_functions_t ++AI; AttrBuilder param(ctx.builder.getContext(), f->getAttributes().getParamAttrs(Arg->getArgNo())); jl_cgval_t theArg; - if (llvmArgType->isAggregateType()) { + if (!isboxed && llvmArgType->isAggregateType()) { maybe_mark_argument_dereferenceable(param, argType); - theArg = mark_julia_slot(Arg, argType, NULL, ctx.tbaa().tbaa_const); // this argument is by-pointer + SmallVector roots; + auto tracked = CountTrackedPointers(llvmArgType); + if (tracked.count && !tracked.all) { + roots = load_gc_roots(ctx, &*AI, tracked.count); + ++AI; + } + theArg = mark_julia_slot(Arg, argType, NULL, ctx.tbaa().tbaa_const, roots); // this argument is by-pointer } else { - if (isboxed) // e.g. is-pointer + if (isboxed) maybe_mark_argument_dereferenceable(param, argType); theArg = mark_julia_type(ctx, Arg, isboxed, argType); if (theArg.tbaa == ctx.tbaa().tbaa_immut) @@ -8739,99 +8945,92 @@ static jl_llvm_functions_t bool isboxed = deserves_argbox(argType); Type *llvmArgType = NULL; if (i == 0 && ctx.is_opaque_closure) { - isboxed = true; - llvmArgType = PointerType::get(ctx.types().T_jlvalue, AddressSpace::Derived); + isboxed = false; + llvmArgType = ctx.builder.getPtrTy(AddressSpace::Derived); argType = (jl_value_t*)jl_any_type; } else { llvmArgType = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, argType); } - if (s == jl_unused_sym) { - if (specsig && !type_is_ghost(llvmArgType) && !is_uniquerep_Type(argType)) - ++AI; - continue; - } jl_varinfo_t &vi = ctx.slots[i]; - jl_cgval_t theArg; if (s == jl_unused_sym || vi.value.constant) { assert(vi.boxroot == NULL); - if (specsig && !type_is_ghost(llvmArgType) && !is_uniquerep_Type(argType)) + if (specsig && !type_is_ghost(llvmArgType) && !is_uniquerep_Type(argType)) { ++AI; + auto tracked = CountTrackedPointers(llvmArgType); + if (tracked.count && !tracked.all) + ++AI; + } + continue; + } + jl_cgval_t theArg; + // If this is an opaque closure, implicitly load the env and switch + // the world age. 
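/* Editor's note (not part of the patch): like most address computations touched by
 * this patch, the loads below use emit_ptrgep(ctx, base, byte_offset[, name]), which
 * takes a plain byte offset, instead of CreateInBoundsGEP indexed in units of the
 * GEP element type. For example, from get_tls_world_age_field earlier in the diff:
 *
 *   // before:
 *   ctx.builder.CreateInBoundsGEP(ctx.types().T_size, ct,
 *       ConstantInt::get(ctx.types().T_size,
 *           offsetof(jl_task_t, world_age) / ctx.types().sizeof_ptr), "world_age");
 *   // after:
 *   emit_ptrgep(ctx, ct, offsetof(jl_task_t, world_age), "world_age");
 */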
+ if (i == 0 && ctx.is_opaque_closure) { + // Load closure world + Value *oc_this = decay_derived(ctx, &*AI++); + Value *argaddr = oc_this; + Value *worldaddr = emit_ptrgep(ctx, argaddr, offsetof(jl_opaque_closure_t, world)); + + jl_cgval_t closure_world = typed_load(ctx, worldaddr, NULL, (jl_value_t*)jl_long_type, + nullptr, nullptr, false, AtomicOrdering::NotAtomic, false, ctx.types().alignof_ptr.value()); + ctx.world_age_at_entry = closure_world.V; // The tls world in a OC is the world of the closure + emit_unbox_store(ctx, closure_world, world_age_field, ctx.tbaa().tbaa_gcframe, ctx.types().alignof_ptr); + + // Load closure env + Value *envaddr = emit_ptrgep(ctx, argaddr, offsetof(jl_opaque_closure_t, captures)); + + jl_cgval_t closure_env = typed_load(ctx, envaddr, NULL, (jl_value_t*)jl_any_type, + nullptr, nullptr, true, AtomicOrdering::NotAtomic, false, sizeof(void*)); + theArg = update_julia_type(ctx, closure_env, vi.value.typ); + } + else if (specsig) { + theArg = get_specsig_arg(argType, llvmArgType, isboxed); } else { - // If this is an opaque closure, implicitly load the env and switch - // the world age. - if (i == 0 && ctx.is_opaque_closure) { - // Load closure world - Value *oc_this = decay_derived(ctx, &*AI++); - Value *argaddr = oc_this; - Value *worldaddr = ctx.builder.CreateInBoundsGEP( - getInt8Ty(ctx.builder.getContext()), argaddr, - ConstantInt::get(ctx.types().T_size, offsetof(jl_opaque_closure_t, world))); - - jl_cgval_t closure_world = typed_load(ctx, worldaddr, NULL, (jl_value_t*)jl_long_type, - nullptr, nullptr, false, AtomicOrdering::NotAtomic, false, ctx.types().alignof_ptr.value()); - ctx.world_age_at_entry = closure_world.V; // The tls world in a OC is the world of the closure - emit_unbox_store(ctx, closure_world, world_age_field, ctx.tbaa().tbaa_gcframe, ctx.types().alignof_ptr); - - // Load closure env - Value *envaddr = ctx.builder.CreateInBoundsGEP( - getInt8Ty(ctx.builder.getContext()), argaddr, - ConstantInt::get(ctx.types().T_size, offsetof(jl_opaque_closure_t, captures))); - - jl_cgval_t closure_env = typed_load(ctx, envaddr, NULL, (jl_value_t*)jl_any_type, - nullptr, nullptr, true, AtomicOrdering::NotAtomic, false, sizeof(void*)); - theArg = update_julia_type(ctx, closure_env, vi.value.typ); - } - else if (specsig) { - theArg = get_specsig_arg(argType, llvmArgType, isboxed); + if (i == 0) { + // first (function) arg is separate in jlcall + theArg = mark_julia_type(ctx, fArg, true, vi.value.typ); } else { - if (i == 0) { - // first (function) arg is separate in jlcall - theArg = mark_julia_type(ctx, fArg, true, vi.value.typ); - } - else { - Value *argPtr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_prjlvalue, argArray, i - 1); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); - Value *load = ai.decorateInst(maybe_mark_load_dereferenceable( - ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, argPtr, Align(sizeof(void*))), - false, vi.value.typ)); - theArg = mark_julia_type(ctx, load, true, vi.value.typ); - if (debug_enabled && vi.dinfo && !vi.boxroot) { - SmallVector addr; + Value *argPtr = emit_ptrgep(ctx, argArray, (i - 1) * ctx.types().sizeof_ptr); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); + Value *load = ai.decorateInst(maybe_mark_load_dereferenceable( + ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, argPtr, Align(sizeof(void*))), + false, vi.value.typ)); + theArg = mark_julia_type(ctx, load, true, vi.value.typ); + if (debug_enabled && vi.dinfo && !vi.boxroot) { + 
SmallVector addr; + addr.push_back(llvm::dwarf::DW_OP_deref); + addr.push_back(llvm::dwarf::DW_OP_plus_uconst); + addr.push_back((i - 1) * sizeof(void*)); + if ((Metadata*)vi.dinfo->getType() != debugcache.jl_pvalue_dillvmt) addr.push_back(llvm::dwarf::DW_OP_deref); - addr.push_back(llvm::dwarf::DW_OP_plus_uconst); - addr.push_back((i - 1) * sizeof(void*)); - if ((Metadata*)vi.dinfo->getType() != debugcache.jl_pvalue_dillvmt) - addr.push_back(llvm::dwarf::DW_OP_deref); - dbuilder.insertDeclare(pargArray, vi.dinfo, dbuilder.createExpression(addr), - topdebugloc, - ctx.builder.GetInsertBlock()); - } + dbuilder.insertDeclare(pargArray, vi.dinfo, dbuilder.createExpression(addr), + topdebugloc, + ctx.builder.GetInsertBlock()); } } + } - - if (vi.boxroot == NULL) { - assert(vi.value.V == NULL && "unexpected variable slot created for argument"); - // keep track of original (possibly boxed) value to avoid re-boxing or moving - vi.value = theArg; - if (debug_enabled && vi.dinfo && theArg.V) { - if (theArg.ispointer()) { - dbuilder.insertDeclare(theArg.V, vi.dinfo, dbuilder.createExpression(), - topdebugloc, ctx.builder.GetInsertBlock()); - } - else { - dbuilder.insertDbgValueIntrinsic(theArg.V, vi.dinfo, dbuilder.createExpression(), - topdebugloc, ctx.builder.GetInsertBlock()); - } + if (vi.boxroot == nullptr) { + assert(vi.value.V == nullptr && vi.inline_roots == nullptr && "unexpected variable slot created for argument"); + // keep track of original (possibly boxed) value to avoid re-boxing or moving + vi.value = theArg; + if (debug_enabled && vi.dinfo && theArg.V) { + if (!theArg.inline_roots.empty() || theArg.ispointer()) { + dbuilder.insertDeclare(theArg.V, vi.dinfo, dbuilder.createExpression(), + topdebugloc, ctx.builder.GetInsertBlock()); + } + else { + dbuilder.insertDbgValueIntrinsic(theArg.V, vi.dinfo, dbuilder.createExpression(), + topdebugloc, ctx.builder.GetInsertBlock()); } } - else { - Value *argp = boxed(ctx, theArg); - ctx.builder.CreateStore(argp, vi.boxroot); - } + } + else { + Value *argp = boxed(ctx, theArg); + ctx.builder.CreateStore(argp, vi.boxroot); } } // step 9. 
allocate rest argument @@ -8868,10 +9067,8 @@ static jl_llvm_functions_t restTuple = ctx.builder.CreateCall(F, { Constant::getNullValue(ctx.types().T_prjlvalue), - ctx.builder.CreateInBoundsGEP(ctx.types().T_prjlvalue, argArray, - ConstantInt::get(ctx.types().T_size, nreq - 1)), - ctx.builder.CreateSub(argCount, - ConstantInt::get(getInt32Ty(ctx.builder.getContext()), nreq - 1)) }); + emit_ptrgep(ctx, argArray, (nreq - 1) * sizeof(jl_value_t*)), + ctx.builder.CreateSub(argCount, ctx.builder.getInt32(nreq - 1)) }); restTuple->setAttributes(F->getAttributes()); ctx.builder.CreateStore(restTuple, vi.boxroot); } @@ -9308,29 +9505,31 @@ static jl_llvm_functions_t break; } if (sret) { - if (retvalinfo.ispointer()) { - if (returninfo.return_roots) { - Type *store_ty = julia_type_to_llvm(ctx, retvalinfo.typ); - emit_sret_roots(ctx, true, data_pointer(ctx, retvalinfo), store_ty, f->arg_begin() + 1, get_returnroots_type(ctx, returninfo.return_roots), returninfo.return_roots); - } + Align align(returninfo.union_align); + if (!returninfo.return_roots && !retvalinfo.inline_roots.empty()) { + assert(retvalinfo.V == nullptr); + assert(returninfo.cc == jl_returninfo_t::SRet); + split_value_into(ctx, retvalinfo, align, nullptr, align, + jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), sret, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe)); + } + else if (returninfo.return_roots) { + assert(returninfo.cc == jl_returninfo_t::SRet); + Value *return_roots = f->arg_begin() + 1; + split_value_into(ctx, retvalinfo, align, sret, align, + jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), return_roots, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_gcframe)); + } + else if (retvalinfo.ispointer()) { if (returninfo.cc == jl_returninfo_t::SRet) { assert(jl_is_concrete_type(jlrettype)); - Align alignment(julia_alignment(jlrettype)); emit_memcpy(ctx, sret, jl_aliasinfo_t::fromTBAA(ctx, nullptr), retvalinfo, - jl_datatype_size(jlrettype), alignment, alignment); + jl_datatype_size(jlrettype), align, align); } else { // must be jl_returninfo_t::Union emit_unionmove(ctx, sret, nullptr, retvalinfo, /*skip*/isboxed_union); } } else { - Type *store_ty = retvalinfo.V->getType(); - Value *Val = retvalinfo.V; - if (returninfo.return_roots) { - assert(julia_type_to_llvm(ctx, retvalinfo.typ) == store_ty); - emit_sret_roots(ctx, false, Val, store_ty, f->arg_begin() + 1, get_returnroots_type(ctx, returninfo.return_roots), returninfo.return_roots); - } - ctx.builder.CreateAlignedStore(Val, sret, Align(julia_alignment(retvalinfo.typ))); + ctx.builder.CreateAlignedStore(retvalinfo.V, sret, align); assert(retvalinfo.TIndex == NULL && "unreachable"); // unimplemented representation } } @@ -9439,11 +9638,7 @@ static jl_llvm_functions_t ctx.builder.CreateBr(handlr); } ctx.builder.SetInsertPoint(tryblk); - auto ehptr = ctx.builder.CreateInBoundsGEP( - ctx.types().T_ptr, - ct, - ConstantInt::get(ctx.types().T_size, offsetof(jl_task_t, eh) / ctx.types().sizeof_ptr), - "eh"); + auto ehptr = emit_ptrgep(ctx, ct, offsetof(jl_task_t, eh)); ctx.builder.CreateAlignedStore(ehbuf, ehptr, ctx.types().alignof_ptr); } } @@ -9471,8 +9666,9 @@ static jl_llvm_functions_t PHINode *VN; jl_value_t *r; AllocaInst *dest; + SmallVector roots; BasicBlock *PhiBB; - std::tie(phi_result, PhiBB, dest, VN, r) = tup; + std::tie(phi_result, PhiBB, dest, VN, roots, r) = tup; jl_value_t *phiType = phi_result.typ; jl_array_t *edges = (jl_array_t*)jl_fieldref_noalloc(r, 0); jl_array_t *values = (jl_array_t*)jl_fieldref_noalloc(r, 1); @@ -9530,6 +9726,7 @@ 
static jl_llvm_functions_t val = mark_julia_const(ctx, val.constant); // be over-conservative at making sure `.typ` is set concretely, not tindex if (!jl_is_uniontype(phiType) || !TindexN) { if (VN) { + assert(roots.empty() && !dest); Value *V; if (val.typ == (jl_value_t*)jl_bottom_type) { V = undef_value_for_type(VN->getType()); @@ -9550,14 +9747,34 @@ static jl_llvm_functions_t VN->addIncoming(V, ctx.builder.GetInsertBlock()); assert(!TindexN); } - else if (dest && val.typ != (jl_value_t*)jl_bottom_type) { + else if ((dest || !roots.empty()) && val.typ != (jl_value_t*)jl_bottom_type) { // must be careful to emit undef here (rather than a bitcast or // load of val) if the runtime type of val isn't phiType + auto tracked = split_value_size((jl_datatype_t*)phiType).second; Value *isvalid = emit_isa_and_defined(ctx, val, phiType); - emit_guarded_test(ctx, isvalid, nullptr, [&] { - emit_unbox_store(ctx, update_julia_type(ctx, val, phiType), dest, ctx.tbaa().tbaa_stack, Align(julia_alignment(phiType))); - return nullptr; + assert(roots.size() == tracked && isvalid != nullptr); + SmallVector incomingroots(0); + if (tracked) + incomingroots.resize(tracked, Constant::getNullValue(ctx.types().T_prjlvalue)); + emit_guarded_test(ctx, isvalid, incomingroots, [&] { + jl_cgval_t typedval = update_julia_type(ctx, val, phiType); + SmallVector mayberoots(tracked, Constant::getNullValue(ctx.types().T_prjlvalue)); + if (typedval.typ != jl_bottom_type) { + Align align(julia_alignment(phiType)); + if (tracked) + split_value_into(ctx, typedval, align, dest, align, jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack), mayberoots); + else + emit_unbox_store(ctx, typedval, dest, ctx.tbaa().tbaa_stack, align); + } + return mayberoots; }); + for (size_t nr = 0; nr < tracked; nr++) + roots[nr]->addIncoming(incomingroots[nr], ctx.builder.GetInsertBlock()); + } + else if (!roots.empty()) { + Value *V = Constant::getNullValue(ctx.types().T_prjlvalue); + for (size_t nr = 0; nr < roots.size(); nr++) + roots[nr]->addIncoming(V, ctx.builder.GetInsertBlock()); } } else { @@ -9566,6 +9783,7 @@ static jl_llvm_functions_t // `V` is always initialized when it is used. 
// Ref https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96629 Value *V = nullptr; + assert(roots.empty()); if (val.typ == (jl_value_t*)jl_bottom_type) { if (VN) V = undef_value_for_type(VN->getType()); @@ -9656,11 +9874,10 @@ static jl_llvm_functions_t } if (TindexN) TindexN->addIncoming(RTindex, FromBB); - if (dest) { + if (dest) ctx.builder.CreateLifetimeStart(dest); - if (CountTrackedPointers(dest->getAllocatedType()).count) - ctx.builder.CreateStore(Constant::getNullValue(dest->getAllocatedType()), dest); - } + for (size_t nr = 0; nr < roots.size(); nr++) + roots[nr]->addIncoming(Constant::getNullValue(ctx.types().T_prjlvalue), FromBB); ctx.builder.ClearInsertionPoint(); } } @@ -9707,15 +9924,19 @@ static jl_llvm_functions_t if (ctx.vaSlot > 0) { // remove VA allocation if we never referenced it + assert(ctx.slots[ctx.vaSlot].isSA && ctx.slots[ctx.vaSlot].isArgument); Instruction *root = cast_or_null(ctx.slots[ctx.vaSlot].boxroot); if (root) { - Instruction *store_value = NULL; bool have_real_use = false; for (Use &U : root->uses()) { User *RU = U.getUser(); if (StoreInst *SRU = dyn_cast(RU)) { - if (!store_value) - store_value = dyn_cast(SRU->getValueOperand()); + assert(isa(SRU->getValueOperand()) || SRU->getValueOperand() == restTuple); + (void)SRU; + } + else if (MemSetInst *MSI = dyn_cast(RU)) { + assert(MSI->getValue() == ctx.builder.getInt8(0)); + (void)MSI; } else if (isa(RU)) { } @@ -9737,7 +9958,6 @@ static jl_llvm_functions_t if (use) use->eraseFromParent(); root->eraseFromParent(); - assert(!store_value || store_value == restTuple); restTuple->eraseFromParent(); } } @@ -9965,7 +10185,7 @@ void jl_compile_workqueue( if (it == params.compiled_functions.end()) { // Reinfer the function. The JIT came along and removed the inferred // method body. 
See #34993 - if (policy != CompilationPolicy::Default && + if ((policy != CompilationPolicy::Default || params.params->trim) && jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { // XXX: SOURCE_MODE_FORCE_SOURCE is wrong here (neither sufficient nor necessary) codeinst = jl_type_infer(codeinst->def, jl_atomic_load_relaxed(&codeinst->max_world), SOURCE_MODE_FORCE_SOURCE); @@ -9996,6 +10216,16 @@ void jl_compile_workqueue( if (proto.specsig) { // expected specsig if (!preal_specsig) { + if (params.params->trim) { + auto it = params.compiled_functions.find(codeinst); //TODO: What to do about this + errs() << "Bailed out to invoke when compiling:"; + jl_(codeinst->def); + if (it != params.compiled_functions.end()) { + errs() << it->second.second.functionObject << "\n"; + errs() << it->second.second.specFunctionObject << "\n"; + } else + errs() << "codeinst not in compile_functions\n"; + } // emit specsig-to-(jl)invoke conversion StringRef invokeName; if (invoke != NULL) @@ -10141,6 +10371,7 @@ static void init_jit_functions(void) #ifdef _OS_WINDOWS_ #if defined(_CPU_X86_64_) + add_named_global("__julia_personality", &__julia_personality); #if defined(_COMPILER_GCC_) add_named_global("___chkstk_ms", &___chkstk_ms); #else @@ -10166,7 +10397,7 @@ static void init_jit_functions(void) } #ifdef JL_USE_INTEL_JITEVENTS -char jl_using_intel_jitevents; // Non-zero if running under Intel VTune Amplifier +char jl_using_intel_jitevents = 0; // Non-zero if running under Intel VTune Amplifier #endif #ifdef JL_USE_OPROFILE_JITEVENTS @@ -10185,6 +10416,22 @@ int jl_opaque_ptrs_set = 0; extern "C" void jl_init_llvm(void) { + jl_default_cgparams = { + /* track_allocations */ 1, + /* code_coverage */ 1, + /* prefer_specsig */ 0, +#ifdef _OS_WINDOWS_ + /* gnu_pubnames */ 0, +#else + /* gnu_pubnames */ 1, +#endif + /* debug_info_kind */ (int) DICompileUnit::DebugEmissionKind::FullDebug, + /* debug_info_level */ (int) jl_options.debug_level, + /* safepoint_on_entry */ 1, + /* gcstack_arg */ 1, + /* use_jlplt*/ 1, + /* trim */ 0, + /* lookup */ jl_rettype_inferred_addr }; jl_page_size = jl_getpagesize(); jl_default_debug_info_kind = (int) DICompileUnit::DebugEmissionKind::FullDebug; jl_default_cgparams.debug_info_level = (int) jl_options.debug_level; @@ -10264,9 +10511,6 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_INTEL_JITEVENTS) || \ defined(JL_USE_OPROFILE_JITEVENTS) || \ defined(JL_USE_PERF_JITEVENTS) -#ifdef JL_USE_JITLINK -#pragma message("JIT profiling support (JL_USE_*_JITEVENTS) not yet available on platforms that use JITLink") -#else const char *jit_profiling = getenv("ENABLE_JITPROFILING"); #if defined(JL_USE_INTEL_JITEVENTS) @@ -10283,24 +10527,23 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_PERF_JITEVENTS) if (jit_profiling && atoi(jit_profiling)) { - jl_using_perf_jitevents= 1; + jl_using_perf_jitevents = 1; } #endif #ifdef JL_USE_INTEL_JITEVENTS if (jl_using_intel_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createIntelJITEventListener()); + jl_ExecutionEngine->enableIntelJITEventListener(); #endif #ifdef JL_USE_OPROFILE_JITEVENTS if (jl_using_oprofile_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createOProfileJITEventListener()); + jl_ExecutionEngine->enableOProfileJITEventListener(); #endif #ifdef JL_USE_PERF_JITEVENTS if (jl_using_perf_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); -#endif + jl_ExecutionEngine->enablePerfJITEventListener(); 
#endif #endif diff --git a/src/datatype.c b/src/datatype.c index 1157c1d425cb2..c78b00fdd2245 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -20,23 +20,21 @@ extern "C" { // allocating TypeNames ----------------------------------------------------------- -static int is10digit(char c) JL_NOTSAFEPOINT -{ - return (c >= '0' && c <= '9'); -} - static jl_sym_t *jl_demangle_typename(jl_sym_t *s) JL_NOTSAFEPOINT { char *n = jl_symbol_name(s); if (n[0] != '#') return s; - char *end = strrchr(n, '#'); + char *end = strchr(&n[1], '#'); + // handle `#f...##...#...` + if (end != NULL && end[1] == '#') + end = strchr(&end[2], '#'); int32_t len; - if (end == n || end == n+1) + if (end == NULL || end == n+1) len = strlen(n) - 1; else len = (end-n) - 1; // extract `f` from `#f#...` - if (is10digit(n[1])) + if (isdigit(n[1]) || is_canonicalized_anonfn_typename(n)) return _jl_symbol(n, len+1); return _jl_symbol(&n[1], len); } diff --git a/src/debug-registry.h b/src/debug-registry.h index 85a94245ce6aa..4c9e13d8cd72d 100644 --- a/src/debug-registry.h +++ b/src/debug-registry.h @@ -145,8 +145,7 @@ class JITDebugInfoRegistry void add_code_in_flight(llvm::StringRef name, jl_code_instance_t *codeinst, const llvm::DataLayout &DL) JL_NOTSAFEPOINT; jl_method_instance_t *lookupLinfo(size_t pointer) JL_NOTSAFEPOINT; void registerJITObject(const llvm::object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress); + std::function getLoadAddress); objectmap_t& getObjectMap() JL_NOTSAFEPOINT; void add_image_info(image_info_t info) JL_NOTSAFEPOINT; bool get_image_info(uint64_t base, image_info_t *info) const JL_NOTSAFEPOINT; diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp index 84550811072fe..cfaf8d4c70ee9 100644 --- a/src/debuginfo.cpp +++ b/src/debuginfo.cpp @@ -223,11 +223,21 @@ static void create_PRUNTIME_FUNCTION(uint8_t *Code, size_t Size, StringRef fnnam #endif void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress) + std::function getLoadAddress) { object::section_iterator EndSection = Object.section_end(); + bool anyfunctions = false; + for (const object::SymbolRef &sym_iter : Object.symbols()) { + object::SymbolRef::Type SymbolType = cantFail(sym_iter.getType()); + if (SymbolType != object::SymbolRef::ST_Function) + continue; + anyfunctions = true; + break; + } + if (!anyfunctions) + return; + #ifdef _CPU_ARM_ // ARM does not have/use .eh_frame uint64_t arm_exidx_addr = 0; @@ -281,14 +291,13 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, #if defined(_OS_WINDOWS_) uint64_t SectionAddrCheck = 0; uint64_t SectionLoadCheck = 0; (void)SectionLoadCheck; - uint64_t SectionWriteCheck = 0; (void)SectionWriteCheck; uint8_t *UnwindData = NULL; #if defined(_CPU_X86_64_) uint8_t *catchjmp = NULL; for (const object::SymbolRef &sym_iter : Object.symbols()) { StringRef sName = cantFail(sym_iter.getName()); if (sName.equals("__UnwindData") || sName.equals("__catchjmp")) { - uint64_t Addr = cantFail(sym_iter.getAddress()); + uint64_t Addr = cantFail(sym_iter.getAddress()); // offset into object (including section offset) auto Section = cantFail(sym_iter.getSection()); assert(Section != EndSection && Section->isText()); uint64_t SectionAddr = Section->getAddress(); @@ -300,10 +309,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, SectionLoadCheck == SectionLoadAddr); SectionAddrCheck = SectionAddr; SectionLoadCheck = 
SectionLoadAddr; - SectionWriteCheck = SectionLoadAddr; - if (lookupWriteAddress) - SectionWriteCheck = (uintptr_t)lookupWriteAddress((void*)SectionLoadAddr); - Addr += SectionWriteCheck - SectionLoadCheck; + Addr += SectionLoadAddr - SectionAddr; if (sName.equals("__UnwindData")) { UnwindData = (uint8_t*)Addr; } @@ -314,25 +320,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, } assert(catchjmp); assert(UnwindData); - assert(SectionAddrCheck); assert(SectionLoadCheck); - assert(!memcmp(catchjmp, "\0\0\0\0\0\0\0\0\0\0\0\0", 12) && - !memcmp(UnwindData, "\0\0\0\0\0\0\0\0\0\0\0\0", 12)); - catchjmp[0] = 0x48; - catchjmp[1] = 0xb8; // mov RAX, QWORD PTR [&__julia_personality] - *(uint64_t*)(&catchjmp[2]) = (uint64_t)&__julia_personality; - catchjmp[10] = 0xff; - catchjmp[11] = 0xe0; // jmp RAX - UnwindData[0] = 0x09; // version info, UNW_FLAG_EHANDLER - UnwindData[1] = 4; // size of prolog (bytes) - UnwindData[2] = 2; // count of unwind codes (slots) - UnwindData[3] = 0x05; // frame register (rbp) = rsp - UnwindData[4] = 4; // second instruction - UnwindData[5] = 0x03; // mov RBP, RSP - UnwindData[6] = 1; // first instruction - UnwindData[7] = 0x50; // push RBP - *(DWORD*)&UnwindData[8] = (DWORD)(catchjmp - (uint8_t*)SectionWriteCheck); // relative location of catchjmp - UnwindData -= SectionWriteCheck - SectionLoadCheck; #endif // defined(_OS_X86_64_) #endif // defined(_OS_WINDOWS_) @@ -353,7 +341,7 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, uint64_t SectionAddr = Section->getAddress(); StringRef secName = cantFail(Section->getName()); uint64_t SectionLoadAddr = getLoadAddress(secName); - Addr -= SectionAddr - SectionLoadAddr; + Addr += SectionLoadAddr - SectionAddr; StringRef sName = cantFail(sym_iter.getName()); uint64_t SectionSize = Section->getSize(); size_t Size = sym_size.second; @@ -404,10 +392,9 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, } void jl_register_jit_object(const object::ObjectFile &Object, - std::function getLoadAddress, - std::function lookupWriteAddress) + std::function getLoadAddress) { - getJITDebugRegistry().registerJITObject(Object, getLoadAddress, lookupWriteAddress); + getJITDebugRegistry().registerJITObject(Object, getLoadAddress); } // TODO: convert the safe names from aotcomile.cpp:makeSafeName back into symbols diff --git a/src/flisp/flisp.h b/src/flisp/flisp.h index 669753a9f5302..f8dd1cfd81ed0 100644 --- a/src/flisp/flisp.h +++ b/src/flisp/flisp.h @@ -158,7 +158,7 @@ value_t fl_cons(fl_context_t *fl_ctx, value_t a, value_t b) JL_NOTSAFEPOINT; value_t fl_list2(fl_context_t *fl_ctx, value_t a, value_t b) JL_NOTSAFEPOINT; value_t fl_listn(fl_context_t *fl_ctx, size_t n, ...) 
JL_NOTSAFEPOINT; value_t symbol(fl_context_t *fl_ctx, const char *str) JL_NOTSAFEPOINT; -char *symbol_name(fl_context_t *fl_ctx, value_t v); +char *symbol_name(fl_context_t *fl_ctx, value_t v) JL_NOTSAFEPOINT; int fl_is_keyword_name(const char *str, size_t len); value_t alloc_vector(fl_context_t *fl_ctx, size_t n, int init); size_t llength(value_t v); diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index fcda11dad4f8a..72eb17115f4c7 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -618,38 +618,32 @@ void final_serialize_heap_snapshot(ios_t *json, ios_t *strings, HeapSnapshot &sn { // mimicking https://github.com/nodejs/node/blob/5fd7a72e1c4fbaf37d3723c4c81dce35c149dc84/deps/v8/src/profiler/heap-snapshot-generator.cc#L2567-L2567 // also https://github.com/microsoft/vscode-v8-heap-tools/blob/c5b34396392397925ecbb4ecb904a27a2754f2c1/v8-heap-parser/src/decoder.rs#L43-L51 - ios_printf(json, "{\"snapshot\":{"); + ios_printf(json, "{\"snapshot\":{\n"); - ios_printf(json, "\"meta\":{"); - ios_printf(json, "\"node_fields\":[\"type\",\"name\",\"id\",\"self_size\",\"edge_count\",\"trace_node_id\",\"detachedness\"],"); - ios_printf(json, "\"node_types\":["); + ios_printf(json, " \"meta\":{\n"); + ios_printf(json, " \"node_fields\":[\"type\",\"name\",\"id\",\"self_size\",\"edge_count\",\"trace_node_id\",\"detachedness\"],\n"); + ios_printf(json, " \"node_types\":["); snapshot.node_types.print_json_array(json, false); ios_printf(json, ","); - ios_printf(json, "\"string\", \"number\", \"number\", \"number\", \"number\", \"number\"],"); - ios_printf(json, "\"edge_fields\":[\"type\",\"name_or_index\",\"to_node\"],"); - ios_printf(json, "\"edge_types\":["); + ios_printf(json, "\"string\", \"number\", \"number\", \"number\", \"number\", \"number\"],\n"); + ios_printf(json, " \"edge_fields\":[\"type\",\"name_or_index\",\"to_node\"],\n"); + ios_printf(json, " \"edge_types\":["); snapshot.edge_types.print_json_array(json, false); ios_printf(json, ","); - ios_printf(json, "\"string_or_number\",\"from_node\"],"); + ios_printf(json, "\"string_or_number\",\"from_node\"],\n"); // not used. Required by microsoft/vscode-v8-heap-tools - ios_printf(json, "\"trace_function_info_fields\":[\"function_id\",\"name\",\"script_name\",\"script_id\",\"line\",\"column\"],"); - ios_printf(json, "\"trace_node_fields\":[\"id\",\"function_info_index\",\"count\",\"size\",\"children\"],"); - ios_printf(json, "\"sample_fields\":[\"timestamp_us\",\"last_assigned_id\"],"); - ios_printf(json, "\"location_fields\":[\"object_index\",\"script_id\",\"line\",\"column\"]"); + ios_printf(json, " \"trace_function_info_fields\":[\"function_id\",\"name\",\"script_name\",\"script_id\",\"line\",\"column\"],\n"); + ios_printf(json, " \"trace_node_fields\":[\"id\",\"function_info_index\",\"count\",\"size\",\"children\"],\n"); + ios_printf(json, " \"sample_fields\":[\"timestamp_us\",\"last_assigned_id\"],\n"); + ios_printf(json, " \"location_fields\":[\"object_index\",\"script_id\",\"line\",\"column\"]\n"); // end not used - ios_printf(json, "},\n"); // end "meta" + ios_printf(json, " },\n"); // end "meta" - ios_printf(json, "\"node_count\":%zu,", snapshot.num_nodes); - ios_printf(json, "\"edge_count\":%zu,", snapshot.num_edges); - ios_printf(json, "\"trace_function_count\":0"); // not used. Required by microsoft/vscode-v8-heap-tools - ios_printf(json, "},\n"); // end "snapshot" - - // not used. 
Required by microsoft/vscode-v8-heap-tools - ios_printf(json, "\"trace_function_infos\":[],"); - ios_printf(json, "\"trace_tree\":[],"); - ios_printf(json, "\"samples\":[],"); - ios_printf(json, "\"locations\":[]"); - // end not used + ios_printf(json, " \"node_count\":%zu,\n", snapshot.num_nodes); + ios_printf(json, " \"edge_count\":%zu,\n", snapshot.num_edges); + ios_printf(json, " \"trace_function_count\":0\n"); // not used. Required by microsoft/vscode-v8-heap-tools + ios_printf(json, "}\n"); // end "snapshot" + // this } is removed by the julia reassembler in Profile ios_printf(json, "}"); } diff --git a/src/gc-stock.c b/src/gc-stock.c index d61d9387fe143..c732a60610dfb 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -40,6 +40,8 @@ uv_sem_t gc_sweep_assists_needed; uv_mutex_t gc_queue_observer_lock; // Tag for sentinel nodes in bigval list uintptr_t gc_bigval_sentinel_tag; +// Table recording number of full GCs due to each reason +JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS]; // Flag that tells us whether we need to support conservative marking // of objects. @@ -3043,10 +3045,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // we either free some space or get an OOM error. if (gc_sweep_always_full) { sweep_full = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL); } if (collection == JL_GC_FULL && !prev_sweep_full) { sweep_full = 1; recollect = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_FORCED_FULL_SWEEP); } if (sweep_full) { // these are the difference between the number of gc-perm bytes scanned @@ -3182,10 +3186,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } double old_ratio = (double)promoted_bytes/(double)heap_size; - if (heap_size > user_max || old_ratio > 0.15) + if (heap_size > user_max) { next_sweep_full = 1; - else + gc_count_full_sweep_reason(FULL_SWEEP_REASON_USER_MAX_EXCEEDED); + } + else if (old_ratio > 0.15) { + next_sweep_full = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_LARGE_PROMOTION_RATE); + } + else { next_sweep_full = 0; + } if (heap_size > user_max || thrashing) under_pressure = 1; // sweeping is over diff --git a/src/gc-stock.h b/src/gc-stock.h index 45c93bf4289ae..46f7d3e11e105 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -505,6 +505,20 @@ FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL sentinel_node->next = node; } +// Must be kept in sync with `base/timing.jl` +#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0) +#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1) +#define FULL_SWEEP_REASON_USER_MAX_EXCEEDED (2) +#define FULL_SWEEP_REASON_LARGE_PROMOTION_RATE (3) +#define FULL_SWEEP_NUM_REASONS (4) + +extern JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS]; +STATIC_INLINE void gc_count_full_sweep_reason(int reason) JL_NOTSAFEPOINT +{ + assert(reason >= 0 && reason < FULL_SWEEP_NUM_REASONS); + jl_full_sweep_reasons[reason]++; +} + extern uv_mutex_t gc_perm_lock; extern uv_mutex_t gc_threads_lock; extern uv_cond_t gc_threads_cond; diff --git a/src/gf.c b/src/gf.c index 95bab0d0f832e..fc2e62ebff96b 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1360,8 +1360,7 @@ static inline jl_typemap_entry_t *lookup_leafcache(jl_genericmemory_t *leafcache } return NULL; } - -static jl_method_instance_t *cache_method( +jl_method_instance_t *cache_method( jl_methtable_t *mt, _Atomic(jl_typemap_t*) *cache, jl_value_t *parent JL_PROPAGATES_ROOT, jl_tupletype_t *tt, // the original tupletype of the signature 
jl_method_t *definition, @@ -1707,7 +1706,7 @@ static void method_overwrite(jl_typemap_entry_t *newentry, jl_method_t *oldvalue jl_printf(s, ".\n"); jl_uv_flush(s); } - if (jl_generating_output()) { + if (jl_generating_output() && jl_options.incremental) { jl_printf(JL_STDERR, "ERROR: Method overwriting is not permitted during Module precompilation. Use `__precompile__(false)` to opt-out of precompilation.\n"); jl_throw(jl_precompilable_error); } @@ -2335,7 +2334,7 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method JL_GC_POP(); } -static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args, size_t world) +static void JL_NORETURN jl_method_error_bare(jl_value_t *f, jl_value_t *args, size_t world) { if (jl_methoderror_type) { jl_value_t *e = jl_new_struct_uninit(jl_methoderror_type); @@ -2360,7 +2359,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args, // not reached } -void JL_NORETURN jl_method_error(jl_function_t *f, jl_value_t **args, size_t na, size_t world) +void JL_NORETURN jl_method_error(jl_value_t *f, jl_value_t **args, size_t na, size_t world) { jl_value_t *argtup = jl_f_tuple(NULL, args, na - 1); JL_GC_PUSH1(&argtup); @@ -2411,7 +2410,7 @@ JL_DLLEXPORT jl_method_instance_t *jl_method_lookup(jl_value_t **args, size_t na // spvals is any matched static parameter values, m is the Method, // full is a boolean indicating if that method fully covers the input // -// lim is the max # of methods to return. if there are more, returns jl_false. +// lim is the max # of methods to return. if there are more, returns jl_nothing. // Negative values stand for no limit. // Unless lim == -1, remove matches that are unambiguously covered by earlier ones JL_DLLEXPORT jl_value_t *jl_matching_methods(jl_tupletype_t *types, jl_value_t *mt, int lim, int include_ambiguous, @@ -2431,7 +2430,7 @@ JL_DLLEXPORT jl_value_t *jl_matching_methods(jl_tupletype_t *types, jl_value_t * return ml_matches((jl_methtable_t*)mt, types, lim, include_ambiguous, 1, world, 1, min_valid, max_valid, ambig); } -jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT) +JL_DLLEXPORT jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT) { // one unspecialized version of a function can be shared among all cached specializations if (!jl_is_method(def) || def->source == NULL) { @@ -2514,12 +2513,32 @@ jl_code_instance_t *jl_method_inferred_with_abi(jl_method_instance_t *mi JL_PROP jl_mutex_t precomp_statement_out_lock; -static void record_precompile_statement(jl_method_instance_t *mi, double compilation_time) +_Atomic(uint8_t) jl_force_trace_compile_timing_enabled = 0; + +/** + * @brief Enable force trace compile to stderr with timing. + */ +JL_DLLEXPORT void jl_force_trace_compile_timing_enable(void) +{ + // Increment the flag to allow reentrant callers to `@trace_compile`. + jl_atomic_fetch_add(&jl_force_trace_compile_timing_enabled, 1); +} +/** + * @brief Disable force trace compile to stderr with timing. + */ +JL_DLLEXPORT void jl_force_trace_compile_timing_disable(void) +{ + // Increment the flag to allow reentrant callers to `@trace_compile`. 
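/* Editor's note (not part of the patch): the comment above is copied from the
 * _enable function; the jl_atomic_fetch_add below passes -1, i.e. it decrements the
 * counter so that nested enable/disable pairs stay balanced. A minimal usage sketch
 * (assumption, not taken from the patch):
 *
 *   jl_force_trace_compile_timing_enable();
 *   // ... run code; any compilation it triggers is reported to stderr with timings ...
 *   jl_force_trace_compile_timing_disable();
 */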
+ jl_atomic_fetch_add(&jl_force_trace_compile_timing_enabled, -1); +} + +static void record_precompile_statement(jl_method_instance_t *mi, double compilation_time, int is_recompile) { static ios_t f_precompile; static JL_STREAM* s_precompile = NULL; jl_method_t *def = mi->def.method; - if (jl_options.trace_compile == NULL) + uint8_t force_trace_compile = jl_atomic_load_relaxed(&jl_force_trace_compile_timing_enabled); + if (force_trace_compile == 0 && jl_options.trace_compile == NULL) return; if (!jl_is_method(def)) return; @@ -2529,7 +2548,7 @@ static void record_precompile_statement(jl_method_instance_t *mi, double compila JL_LOCK(&precomp_statement_out_lock); if (s_precompile == NULL) { const char *t = jl_options.trace_compile; - if (!strncmp(t, "stderr", 6)) { + if (force_trace_compile || !strncmp(t, "stderr", 6)) { s_precompile = JL_STDERR; } else { @@ -2539,17 +2558,80 @@ static void record_precompile_statement(jl_method_instance_t *mi, double compila } } if (!jl_has_free_typevars(mi->specTypes)) { - if (jl_options.trace_compile_timing) + if (is_recompile && s_precompile == JL_STDERR && jl_options.color != JL_OPTIONS_COLOR_OFF) + jl_printf(s_precompile, "\e[33m"); + if (force_trace_compile || jl_options.trace_compile_timing) jl_printf(s_precompile, "#= %6.1f ms =# ", compilation_time / 1e6); jl_printf(s_precompile, "precompile("); jl_static_show(s_precompile, mi->specTypes); - jl_printf(s_precompile, ")\n"); + jl_printf(s_precompile, ")"); + if (is_recompile) { + if (s_precompile == JL_STDERR && jl_options.color != JL_OPTIONS_COLOR_OFF) { + jl_printf(s_precompile, "\e[0m"); + } + else { + jl_printf(s_precompile, " # recompile"); + } + } + jl_printf(s_precompile, "\n"); if (s_precompile != JL_STDERR) ios_flush(&f_precompile); } JL_UNLOCK(&precomp_statement_out_lock); } +jl_mutex_t dispatch_statement_out_lock; + +_Atomic(uint8_t) jl_force_trace_dispatch_enabled = 0; + +/** + * @brief Enable force trace dispatch to stderr. + */ +JL_DLLEXPORT void jl_force_trace_dispatch_enable(void) +{ + // Increment the flag to allow reentrant callers to `@trace_dispatch`. + jl_atomic_fetch_add(&jl_force_trace_dispatch_enabled, 1); +} +/** + * @brief Disable force trace dispatch to stderr. + */ +JL_DLLEXPORT void jl_force_trace_dispatch_disable(void) +{ + // Increment the flag to allow reentrant callers to `@trace_dispatch`. + jl_atomic_fetch_add(&jl_force_trace_dispatch_enabled, -1); +} + +static void record_dispatch_statement(jl_method_instance_t *mi) +{ + static ios_t f_dispatch; + static JL_STREAM* s_dispatch = NULL; + jl_method_t *def = mi->def.method; + if (!jl_is_method(def)) + return; + + uint8_t force_trace_dispatch = jl_atomic_load_relaxed(&jl_force_trace_dispatch_enabled); + JL_LOCK(&dispatch_statement_out_lock); + if (s_dispatch == NULL) { + const char *t = jl_options.trace_dispatch; + if (force_trace_dispatch || !strncmp(t, "stderr", 6)) { + s_dispatch = JL_STDERR; + } + else { + if (ios_file(&f_dispatch, t, 1, 1, 1, 1) == NULL) + jl_errorf("cannot open dispatch statement file \"%s\" for writing", t); + s_dispatch = (JL_STREAM*) &f_dispatch; + } + } + if (!jl_has_free_typevars(mi->specTypes)) { + jl_printf(s_dispatch, "precompile("); + jl_static_show(s_dispatch, mi->specTypes); + jl_printf(s_dispatch, ")\n"); + if (s_dispatch != JL_STDERR) + ios_flush(&f_dispatch); + } + JL_UNLOCK(&dispatch_statement_out_lock); +} + // If waitcompile is 0, this will return NULL if compiling is on-going in the JIT. 
This is // useful for the JIT itself, since it just doesn't cause redundant work or missed updates, // but merely causes it to look into the current JIT worklist. @@ -2674,7 +2756,7 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t // unspec is probably not specsig, but might be using specptr jl_atomic_store_relaxed(&codeinst->specsigflags, specsigflags & ~0b1); // clear specsig flag jl_mi_cache_insert(mi, codeinst); - record_precompile_statement(mi, 0); + record_precompile_statement(mi, 0, 0); return codeinst; } } @@ -2691,7 +2773,7 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t 0, 1, ~(size_t)0, 0, jl_nothing, 0, NULL); jl_atomic_store_release(&codeinst->invoke, jl_fptr_interpret_call); jl_mi_cache_insert(mi, codeinst); - record_precompile_statement(mi, 0); + record_precompile_statement(mi, 0, 0); return codeinst; } if (compile_option == JL_OPTIONS_COMPILE_OFF) { @@ -2740,7 +2822,7 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t codeinst = NULL; } else if (did_compile && codeinst->owner == jl_nothing) { - record_precompile_statement(mi, compile_time); + record_precompile_statement(mi, compile_time, is_recompile); } JL_GC_POP(); } @@ -2899,7 +2981,7 @@ jl_method_instance_t *jl_normalize_to_compilable_mi(jl_method_instance_t *mi JL_ } // return a MethodInstance for a compileable method_match -jl_method_instance_t *jl_method_match_to_mi(jl_method_match_t *match, size_t world, size_t min_valid, size_t max_valid, int mt_cache) +JL_DLLEXPORT jl_method_instance_t *jl_method_match_to_mi(jl_method_match_t *match, size_t world, size_t min_valid, size_t max_valid, int mt_cache) { jl_method_t *m = match->method; jl_svec_t *env = match->sparams; @@ -3057,7 +3139,8 @@ static void jl_compile_now(jl_method_instance_t *mi) JL_DLLEXPORT void jl_compile_method_instance(jl_method_instance_t *mi, jl_tupletype_t *types, size_t world) { size_t tworld = jl_typeinf_world; - jl_atomic_store_relaxed(&mi->precompiled, 1); + uint8_t miflags = jl_atomic_load_relaxed(&mi->flags) | JL_MI_FLAGS_MASK_PRECOMPILED; + jl_atomic_store_relaxed(&mi->flags, miflags); if (jl_generating_output()) { jl_compile_now(mi); // In addition to full compilation of the compilation-signature, if `types` is more specific (e.g. 
due to nospecialize), @@ -3072,7 +3155,8 @@ JL_DLLEXPORT void jl_compile_method_instance(jl_method_instance_t *mi, jl_tuplet types2 = jl_type_intersection_env((jl_value_t*)types, (jl_value_t*)mi->def.method->sig, &tpenv2); jl_method_instance_t *mi2 = jl_specializations_get_linfo(mi->def.method, (jl_value_t*)types2, tpenv2); JL_GC_POP(); - jl_atomic_store_relaxed(&mi2->precompiled, 1); + miflags = jl_atomic_load_relaxed(&mi2->flags) | JL_MI_FLAGS_MASK_PRECOMPILED; + jl_atomic_store_relaxed(&mi2->flags, miflags); if (jl_rettype_inferred_native(mi2, world, world) == jl_nothing) (void)jl_type_infer(mi2, world, SOURCE_MODE_NOT_REQUIRED); if (jl_typeinf_func && jl_atomic_load_relaxed(&mi->def.method->primary_world) <= tworld) { @@ -3101,6 +3185,21 @@ JL_DLLEXPORT int jl_compile_hint(jl_tupletype_t *types) return 1; } +JL_DLLEXPORT int jl_add_entrypoint(jl_tupletype_t *types) +{ + size_t world = jl_atomic_load_acquire(&jl_world_counter); + size_t min_valid = 0; + size_t max_valid = ~(size_t)0; + jl_method_instance_t *mi = jl_get_compile_hint_specialization(types, world, &min_valid, &max_valid, 1); + if (mi == NULL) + return 0; + JL_GC_PROMISE_ROOTED(mi); + if (jl_generating_output() && jl_options.trim) { + arraylist_push(jl_entrypoint_mis, mi); + } + return 1; +} + // add type of `f` to front of argument tuple type jl_value_t *jl_argtype_with_function(jl_value_t *f, jl_value_t *types0) { @@ -3333,6 +3432,17 @@ STATIC_INLINE jl_method_instance_t *jl_lookup_generic_(jl_value_t *F, jl_value_t jl_method_error(F, args, nargs, world); // unreachable } + // mfunc is about to be dispatched + uint8_t force_trace_dispatch = jl_atomic_load_relaxed(&jl_force_trace_dispatch_enabled); + if (force_trace_dispatch || jl_options.trace_dispatch != NULL) { + uint8_t miflags = jl_atomic_load_relaxed(&mfunc->flags); + uint8_t was_dispatched = miflags & JL_MI_FLAGS_MASK_DISPATCHED; + if (!was_dispatched) { + miflags |= JL_MI_FLAGS_MASK_DISPATCHED; + jl_atomic_store_relaxed(&mfunc->flags, miflags); + record_dispatch_statement(mfunc); + } + } } #ifdef JL_TRACE @@ -3455,6 +3565,16 @@ jl_value_t *jl_gf_invoke_by_method(jl_method_t *method, jl_value_t *gf, jl_value jl_gc_sync_total_bytes(last_alloc); // discard allocation count from compilation } JL_GC_PROMISE_ROOTED(mfunc); + uint8_t force_trace_dispatch = jl_atomic_load_relaxed(&jl_force_trace_dispatch_enabled); + if (force_trace_dispatch || jl_options.trace_dispatch != NULL) { + uint8_t miflags = jl_atomic_load_relaxed(&mfunc->flags); + uint8_t was_dispatched = miflags & JL_MI_FLAGS_MASK_DISPATCHED; + if (!was_dispatched) { + miflags |= JL_MI_FLAGS_MASK_DISPATCHED; + jl_atomic_store_relaxed(&mfunc->flags, miflags); + record_dispatch_statement(mfunc); + } + } size_t world = jl_current_task->world_age; return _jl_invoke(gf, args, nargs - 1, mfunc, world); } diff --git a/src/init.c b/src/init.c index 1d466a0a736f9..413d4e8055e54 100644 --- a/src/init.c +++ b/src/init.c @@ -44,6 +44,7 @@ extern BOOL (WINAPI *hSymRefreshModuleList)(HANDLE); // list of modules being deserialized with __init__ methods jl_array_t *jl_module_init_order; +arraylist_t *jl_entrypoint_mis; JL_DLLEXPORT size_t jl_page_size; @@ -67,6 +68,7 @@ void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi) # if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) pthread_attr_t attr; #if defined(_OS_FREEBSD_) + pthread_attr_init(&attr); pthread_attr_get_np(pthread_self(), &attr); #else pthread_getattr_np(pthread_self(), &attr); @@ -720,6 +722,7 @@ static void restore_fp_env(void) static NOINLINE void 
_finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct); JL_DLLEXPORT int jl_default_debug_info_kind; +JL_DLLEXPORT jl_cgparams_t jl_default_cgparams; static void init_global_mutexes(void) { JL_MUTEX_INIT(&jl_modules_mutex, "jl_modules_mutex"); @@ -840,8 +843,10 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ JL_TIMING(JULIA_INIT, JULIA_INIT); jl_resolve_sysimg_location(rel); // loads sysimg if available, and conditionally sets jl_options.cpu_target - if (rel == JL_IMAGE_IN_MEMORY) + if (rel == JL_IMAGE_IN_MEMORY) { jl_set_sysimg_so(jl_exe_handle); + jl_options.image_file = jl_options.julia_bin; + } else if (jl_options.image_file) jl_preload_sysimg_so(jl_options.image_file); if (jl_options.cpu_target == NULL) @@ -898,6 +903,11 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ JL_GC_POP(); } + if (jl_options.trim) { + jl_entrypoint_mis = (arraylist_t *)malloc_s(sizeof(arraylist_t)); + arraylist_new(jl_entrypoint_mis, 0); + } + if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON) jl_install_sigint_handler(); } diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 4bfe3f184d24b..09916297e16ff 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -405,10 +405,11 @@ static Value *emit_unboxed_coercion(jl_codectx_t &ctx, Type *to, Value *unboxed) } else if (!ty->isIntOrPtrTy() && !ty->isFloatingPointTy()) { assert(DL.getTypeSizeInBits(ty) == DL.getTypeSizeInBits(to)); - AllocaInst *cast = emit_static_alloca(ctx, ty); + Align align = std::max(DL.getPrefTypeAlign(ty), DL.getPrefTypeAlign(to)); + AllocaInst *cast = emit_static_alloca(ctx, ty, align); setName(ctx.emission_context, cast, "coercion"); - ctx.builder.CreateStore(unboxed, cast); - unboxed = ctx.builder.CreateLoad(to, cast); + ctx.builder.CreateAlignedStore(unboxed, cast, align); + unboxed = ctx.builder.CreateAlignedLoad(to, cast, align); } else if (frompointer) { Type *INTT_to = INTT(to, DL); @@ -440,14 +441,14 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va // up being dead code, and type inference knows that the other // branch's type is the only one that matters. if (type_is_ghost(to)) { - return NULL; + return nullptr; } CreateTrap(ctx.builder); return UndefValue::get(to); // type mismatch error } - Constant *c = x.constant ? julia_const_to_llvm(ctx, x.constant) : NULL; - if (!x.ispointer() || c) { // already unboxed, but sometimes need conversion + Constant *c = x.constant ? julia_const_to_llvm(ctx, x.constant) : nullptr; + if ((x.inline_roots.empty() && !x.ispointer()) || c != nullptr) { // already unboxed, but sometimes need conversion Value *unboxed = c ? c : x.V; return emit_unboxed_coercion(ctx, to, unboxed); } @@ -472,28 +473,17 @@ static Value *emit_unbox(jl_codectx_t &ctx, Type *to, const jl_cgval_t &x, jl_va } unsigned alignment = julia_alignment(jt); - Type *ptype = to->getPointerTo(); - if (p->getType() != ptype && isa(p)) { - // LLVM's mem2reg can't handle coercion if the load/store type does - // not match the type of the alloca. As such, it is better to - // perform the load using the alloca's type and then perform the - // appropriate coercion manually. 
- AllocaInst *AI = cast(p); - Type *AllocType = AI->getAllocatedType(); - const DataLayout &DL = jl_Module->getDataLayout(); - if (!AI->isArrayAllocation() && - (AllocType->isFloatingPointTy() || AllocType->isIntegerTy() || AllocType->isPointerTy()) && - (to->isFloatingPointTy() || to->isIntegerTy() || to->isPointerTy()) && - DL.getTypeSizeInBits(AllocType) == DL.getTypeSizeInBits(to)) { - Instruction *load = ctx.builder.CreateAlignedLoad(AllocType, p, Align(alignment)); - setName(ctx.emission_context, load, p->getName() + ".unbox"); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); - return emit_unboxed_coercion(ctx, to, ai.decorateInst(load)); - } + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); + if (!x.inline_roots.empty()) { + assert(x.typ == jt); + AllocaInst *combined = emit_static_alloca(ctx, to, Align(alignment)); + auto combined_ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); + recombine_value(ctx, x, combined, combined_ai, Align(alignment), false); + p = combined; + ai = combined_ai; } Instruction *load = ctx.builder.CreateAlignedLoad(to, p, Align(alignment)); setName(ctx.emission_context, load, p->getName() + ".unbox"); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); return ai.decorateInst(load); } @@ -507,18 +497,25 @@ static void emit_unbox_store(jl_codectx_t &ctx, const jl_cgval_t &x, Value *dest return; } + auto dest_ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa_dest); + + if (!x.inline_roots.empty()) { + recombine_value(ctx, x, dest, dest_ai, alignment, isVolatile); + return; + } + if (!x.ispointer()) { // already unboxed, but sometimes need conversion (e.g. f32 -> i32) assert(x.V); Value *unboxed = zext_struct(ctx, x.V); StoreInst *store = ctx.builder.CreateAlignedStore(unboxed, dest, alignment); store->setVolatile(isVolatile); - jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa_dest); - ai.decorateInst(store); + dest_ai.decorateInst(store); return; } Value *src = data_pointer(ctx, x); - emit_memcpy(ctx, dest, jl_aliasinfo_t::fromTBAA(ctx, tbaa_dest), src, jl_aliasinfo_t::fromTBAA(ctx, x.tbaa), jl_datatype_size(x.typ), Align(alignment), Align(julia_alignment(x.typ)), isVolatile); + auto src_ai = jl_aliasinfo_t::fromTBAA(ctx, x.tbaa); + emit_memcpy(ctx, dest, dest_ai, src, src_ai, jl_datatype_size(x.typ), Align(alignment), Align(julia_alignment(x.typ)), isVolatile); } static jl_datatype_t *staticeval_bitstype(const jl_cgval_t &targ) @@ -692,10 +689,11 @@ static jl_cgval_t generic_cast( // understood that everything is implicitly rounded to 23 bits, // but if we start looking at more bits we need to actually do the // rounding first instead of carrying around incorrect low bits. 
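  // note: the change below keeps this force-to-memory round trip, but the temporary
  // slot is now created with an explicit alignment taken from the destination Julia
  // type (julia_alignment(jlto)) and is accessed with matching aligned store/load,
  // rather than relying on the alloca's default alignment.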
- Value *jlfloattemp_var = emit_static_alloca(ctx, from->getType()); + Align align(julia_alignment((jl_value_t*)jlto)); + Value *jlfloattemp_var = emit_static_alloca(ctx, from->getType(), align); setName(ctx.emission_context, jlfloattemp_var, "rounding_slot"); - ctx.builder.CreateStore(from, jlfloattemp_var); - from = ctx.builder.CreateLoad(from->getType(), jlfloattemp_var, /*force this to load from the stack*/true); + ctx.builder.CreateAlignedStore(from, jlfloattemp_var, align); + from = ctx.builder.CreateAlignedLoad(from->getType(), jlfloattemp_var, align, /*force this to load from the stack*/true); setName(ctx.emission_context, from, "rounded"); } } @@ -767,7 +765,7 @@ static jl_cgval_t emit_pointerref(jl_codectx_t &ctx, ArrayRef argv) LLT_ALIGN(size, jl_datatype_align(ety)))); setName(ctx.emission_context, im1, "pointerref_offset"); Value *thePtr = emit_unbox(ctx, getPointerTy(ctx.builder.getContext()), e, e.typ); - thePtr = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), thePtr, im1); + thePtr = emit_ptrgep(ctx, thePtr, im1); setName(ctx.emission_context, thePtr, "pointerref_src"); MDNode *tbaa = best_tbaa(ctx.tbaa(), ety); emit_memcpy(ctx, strct, jl_aliasinfo_t::fromTBAA(ctx, tbaa), thePtr, jl_aliasinfo_t::fromTBAA(ctx, nullptr), size, Align(sizeof(jl_value_t*)), Align(align_nb)); @@ -830,10 +828,9 @@ static jl_cgval_t emit_pointerset(jl_codectx_t &ctx, ArrayRef argv) Value *im1 = ctx.builder.CreateSub(idx, ConstantInt::get(ctx.types().T_size, 1)); setName(ctx.emission_context, im1, "pointerset_idx"); - Value *thePtr; + Value *thePtr = emit_unbox(ctx, getPointerTy(ctx.builder.getContext()), e, e.typ); if (ety == (jl_value_t*)jl_any_type) { // unsafe_store to Ptr{Any} is allowed to implicitly drop GC roots. - thePtr = emit_unbox(ctx, ctx.types().T_size->getPointerTo(), e, e.typ); auto gep = ctx.builder.CreateInBoundsGEP(ctx.types().T_size, thePtr, im1); setName(ctx.emission_context, gep, "pointerset_ptr"); auto val = ctx.builder.CreatePtrToInt(emit_pointer_from_objref(ctx, boxed(ctx, x)), ctx.types().T_size); @@ -842,13 +839,15 @@ static jl_cgval_t emit_pointerset(jl_codectx_t &ctx, ArrayRef argv) jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_data); ai.decorateInst(store); } + else if (!x.inline_roots.empty()) { + recombine_value(ctx, e, thePtr, jl_aliasinfo_t(), Align(align_nb), false); + } else if (x.ispointer()) { - thePtr = emit_unbox(ctx, getPointerTy(ctx.builder.getContext()), e, e.typ); uint64_t size = jl_datatype_size(ety); im1 = ctx.builder.CreateMul(im1, ConstantInt::get(ctx.types().T_size, LLT_ALIGN(size, jl_datatype_align(ety)))); setName(ctx.emission_context, im1, "pointerset_offset"); - auto gep = ctx.builder.CreateInBoundsGEP(getInt8Ty(ctx.builder.getContext()), thePtr, im1); + auto gep = emit_ptrgep(ctx, thePtr, im1); setName(ctx.emission_context, gep, "pointerset_ptr"); emit_memcpy(ctx, gep, jl_aliasinfo_t::fromTBAA(ctx, nullptr), x, size, Align(align_nb), Align(julia_alignment(ety))); } @@ -857,7 +856,6 @@ static jl_cgval_t emit_pointerset(jl_codectx_t &ctx, ArrayRef argv) Type *ptrty = julia_type_to_llvm(ctx, ety, &isboxed); assert(!isboxed); if (!type_is_ghost(ptrty)) { - thePtr = emit_unbox(ctx, ptrty->getPointerTo(), e, e.typ); thePtr = ctx.builder.CreateInBoundsGEP(ptrty, thePtr, im1); typed_store(ctx, thePtr, x, jl_cgval_t(), ety, ctx.tbaa().tbaa_data, nullptr, nullptr, isboxed, AtomicOrdering::NotAtomic, AtomicOrdering::NotAtomic, align_nb, nullptr, true, false, false, false, false, false, nullptr, "atomic_pointerset", 
nullptr, nullptr); diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 442103c91be0f..d1757cadee05c 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -14,6 +14,15 @@ #include #include #include +#if JL_LLVM_VERSION >= 180000 +#include +#include +#include +#endif +#if JL_LLVM_VERSION >= 190000 +#include +#include +#endif #include #include #include @@ -142,13 +151,14 @@ void jl_dump_llvm_opt_impl(void *s) **jl_ExecutionEngine->get_dump_llvm_opt_stream() = (ios_t*)s; } +#ifndef JL_USE_JITLINK static int jl_add_to_ee( orc::ThreadSafeModule &M, const StringMap &NewExports, DenseMap &Queued, SmallVectorImpl &Stack) JL_NOTSAFEPOINT; +#endif static void jl_decorate_module(Module &M) JL_NOTSAFEPOINT; -static uint64_t getAddressForFunction(StringRef fname) JL_NOTSAFEPOINT; void jl_link_global(GlobalVariable *GV, void *addr) JL_NOTSAFEPOINT { @@ -177,23 +187,6 @@ void jl_jit_globals(std::map &globals) JL_NOTSAFEPOINT } } -// used for image_codegen, where we keep all the gvs external -// so we can't jit them directly into each module -static orc::ThreadSafeModule jl_get_globals_module(orc::ThreadSafeContext &ctx, const DataLayout &DL, const Triple &T, std::map &globals) JL_NOTSAFEPOINT -{ - auto lock = ctx.getLock(); - auto GTSM = jl_create_ts_module("globals", ctx, DL, T); - auto GM = GTSM.getModuleUnlocked(); - for (auto &global : globals) { - auto GV = global.second; - auto GV2 = new GlobalVariable(*GM, GV->getValueType(), GV->isConstant(), GlobalValue::ExternalLinkage, literal_static_pointer_val(global.first, GV->getValueType()), GV->getName(), nullptr, GV->getThreadLocalMode(), GV->getAddressSpace(), false); - GV2->copyAttributesFrom(GV); - GV2->setDSOLocal(true); - GV2->setAlignment(GV->getAlign()); - } - return GTSM; -} - // this generates llvm code for the lambda info // and adds the result to the jitlayers // (and the shadow module), @@ -238,8 +231,21 @@ static jl_callptr_t _jl_compile_codeinst( // to ensure that the globals are defined when they are compiled. if (params.imaging_mode) { // Won't contain any PLT/dlsym calls, so no need to optimize those - jl_ExecutionEngine->addModule(jl_get_globals_module(params.tsctx, params.DL, params.TargetTriple, params.global_targets)); - } else { + if (!params.global_targets.empty()) { + void **globalslots = new void*[params.global_targets.size()]; + void **slot = globalslots; + for (auto &global : params.global_targets) { + auto GV = global.second; + *slot = global.first; + jl_ExecutionEngine->addGlobalMapping(GV->getName(), (uintptr_t)slot); + slot++; + } +#ifdef __clang_analyzer__ + static void **leaker = globalslots; // for the purpose of the analyzer, we need to expressly leak this variable or it thinks we forgot to free it +#endif + } + } + else { StringMap NewGlobals; for (auto &global : params.global_targets) { NewGlobals[global.second->getName()] = global.first; @@ -255,6 +261,7 @@ static jl_callptr_t _jl_compile_codeinst( } } +#ifndef JL_USE_JITLINK // Collect the exported functions from the params.compiled_functions modules, // which form dependencies on which functions need to be // compiled first. Cycles of functions are compiled together. 
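As a rough illustration of the imaging-mode change above, where the separate "globals" module is replaced by per-global pointer slots registered through addGlobalMapping, the sketch below shows the slot-indirection pattern in isolation. It is a minimal, self-contained approximation under stated assumptions: `global_target_t`, `jit_define_absolute`, and `publish_global_slots` are hypothetical names used only for this sketch, not APIs from this patch.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        const char *name;   /* mangled name of the global variable */
        void *runtime_addr; /* address the JIT'd code should ultimately reach */
    } global_target_t;

    /* hypothetical stand-in for the JIT's addGlobalMapping(name, addr) */
    static void jit_define_absolute(const char *name, void *addr)
    {
        printf("map %s -> slot %p\n", name, addr);
    }

    static void publish_global_slots(const global_target_t *targets, size_t n)
    {
        /* one slot per global; intentionally never freed, since emitted code
           keeps loading through these slots for the lifetime of the session */
        void **slots = malloc(n * sizeof *slots);
        for (size_t i = 0; i < n; i++) {
            slots[i] = targets[i].runtime_addr;              /* slot holds the real pointer */
            jit_define_absolute(targets[i].name, &slots[i]); /* symbol resolves to the slot */
        }
    }

    int main(void)
    {
        static int some_runtime_value = 42;
        global_target_t targets[] = { { "jl_global#example", &some_runtime_value } };
        publish_global_slots(targets, 1);
        return 0;
    }

The leak mirrors the intent of the patch: the slots must outlive any code that was linked against them, which is also why the original marks them for the static analyzer as deliberately retained.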
@@ -281,18 +288,40 @@ static jl_callptr_t _jl_compile_codeinst( jl_add_to_ee(M, NewExports, Queued, Stack); assert(Queued.empty() && Stack.empty() && !M); } +#else + for (auto &def : params.compiled_functions) { + // Add the results to the execution engine now + orc::ThreadSafeModule &M = std::get<0>(def.second); + if (M) + jl_ExecutionEngine->addModule(std::move(M)); + } +#endif ++CompiledCodeinsts; MaxWorkqueueSize.updateMax(params.compiled_functions.size()); IndirectCodeinsts += params.compiled_functions.size() - 1; } + // batch compile job for all new functions + SmallVector NewDefs; + for (auto &def : params.compiled_functions) { + jl_llvm_functions_t &decls = std::get<1>(def.second); + if (decls.functionObject != "jl_fptr_args" && + decls.functionObject != "jl_fptr_sparam" && + decls.functionObject != "jl_f_opaque_closure_call") + NewDefs.push_back(decls.functionObject); + if (!decls.specFunctionObject.empty()) + NewDefs.push_back(decls.specFunctionObject); + } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + size_t i = 0; + size_t nextaddr = 0; for (auto &def : params.compiled_functions) { jl_code_instance_t *this_code = def.first; if (i < jl_timing_print_limit) jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_DEFAULT_BLOCK); - jl_llvm_functions_t decls = std::get<1>(def.second); + jl_llvm_functions_t &decls = std::get<1>(def.second); jl_callptr_t addr; bool isspecsig = false; if (decls.functionObject == "jl_fptr_args") { @@ -305,12 +334,16 @@ static jl_callptr_t _jl_compile_codeinst( addr = jl_f_opaque_closure_call_addr; } else { - addr = (jl_callptr_t)getAddressForFunction(decls.functionObject); + assert(NewDefs[nextaddr] == decls.functionObject); + addr = (jl_callptr_t)Addrs[nextaddr++]; + assert(addr); isspecsig = true; } if (!decls.specFunctionObject.empty()) { void *prev_specptr = NULL; - auto spec = (void*)getAddressForFunction(decls.specFunctionObject); + assert(NewDefs[nextaddr] == decls.specFunctionObject); + void *spec = (void*)Addrs[nextaddr++]; + assert(spec); if (jl_atomic_cmpswap_acqrel(&this_code->specptr.fptr, &prev_specptr, spec)) { // only set specsig and invoke if we were the first to set specptr jl_atomic_store_relaxed(&this_code->specsigflags, (uint8_t) isspecsig); @@ -601,48 +634,6 @@ static auto countBasicBlocks(const Function &F) JL_NOTSAFEPOINT static constexpr size_t N_optlevels = 4; -static Expected validateExternRelocations(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { -#if !defined(JL_NDEBUG) && !defined(JL_USE_JITLINK) - auto isIntrinsicFunction = [](GlobalObject &GO) JL_NOTSAFEPOINT { - auto F = dyn_cast(&GO); - if (!F) - return false; - return F->isIntrinsic() || F->getName().starts_with("julia."); - }; - // validate the relocations for M (only for RuntimeDyld, JITLink performs its own symbol validation) - auto Err = TSM.withModuleDo([isIntrinsicFunction](Module &M) JL_NOTSAFEPOINT { - Error Err = Error::success(); - for (auto &GO : make_early_inc_range(M.global_objects())) { - if (!GO.isDeclarationForLinker()) - continue; - if (GO.use_empty()) { - GO.eraseFromParent(); - continue; - } - if (isIntrinsicFunction(GO)) - continue; - auto sym = jl_ExecutionEngine->findUnmangledSymbol(GO.getName()); - if (sym) - continue; - // TODO have we ever run into this check? 
It's been guaranteed to not - // fire in an assert build, since previously LLVM would abort due to - // not handling the error if we didn't find the unmangled symbol - if (SectionMemoryManager::getSymbolAddressInProcess( - jl_ExecutionEngine->getMangledName(GO.getName()))) { - consumeError(sym.takeError()); - continue; - } - Err = joinErrors(std::move(Err), sym.takeError()); - } - return Err; - }); - if (Err) { - return std::move(Err); - } -#endif - return std::move(TSM); -} - static Expected selectOptLevel(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) { TSM.withModuleDo([](Module &M) { size_t opt_level = std::max(static_cast(jl_options.opt_level), 0); @@ -673,21 +664,8 @@ static Expected selectOptLevel(orc::ThreadSafeModule TSM, return std::move(TSM); } -static void recordDebugTSM(orc::MaterializationResponsibility &, orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { - auto ptr = TSM.withModuleDo([](Module &M) JL_NOTSAFEPOINT { - auto md = M.getModuleFlag("julia.__jit_debug_tsm_addr"); - if (!md) - return static_cast(nullptr); - return reinterpret_cast(cast(cast(md)->getValue())->getZExtValue()); - }); - if (ptr) { - *ptr = std::move(TSM); - } -} - void jl_register_jit_object(const object::ObjectFile &debugObj, - std::function getLoadAddress, - std::function lookupWriteAddress); + std::function getLoadAddress); namespace { @@ -747,7 +725,7 @@ class JLDebuginfoPlugin : public ObjectLinkingLayer::Plugin { return result->second; }; - jl_register_jit_object(*NewInfo->Object, getLoadAddress, nullptr); + jl_register_jit_object(*NewInfo->Object, getLoadAddress); PendingObjs.erase(&MR); } @@ -978,10 +956,6 @@ class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { }; -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) -void *lookupWriteAddressFor(RTDyldMemoryManager *MemMgr, void *rt_addr); -#endif - void registerRTDyldJITObject(const object::ObjectFile &Object, const RuntimeDyld::LoadedObjectInfo &L, const std::shared_ptr &MemMgr) @@ -1004,14 +978,7 @@ void registerRTDyldJITObject(const object::ObjectFile &Object, }; auto DebugObject = L.getObjectForDebug(Object); // ELF requires us to make a copy to mutate the header with the section load addresses. On other platforms this is a no-op. - jl_register_jit_object(DebugObject.getBinary() ? *DebugObject.getBinary() : Object, - getLoadAddress, -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) - [MemMgr](void *p) { return lookupWriteAddressFor(MemMgr.get(), p); } -#else - nullptr -#endif - ); + jl_register_jit_object(DebugObject.getBinary() ? 
*DebugObject.getBinary() : Object, getLoadAddress); } namespace { static std::unique_ptr createTargetMachine() JL_NOTSAFEPOINT { @@ -1302,8 +1269,9 @@ namespace { } // Windows needs some inline asm to help - // build unwind tables - jl_decorate_module(M); + // build unwind tables, if they have any functions to decorate + if (!M.functions().empty()) + jl_decorate_module(M); }); return std::move(TSM); } @@ -1584,6 +1552,7 @@ JuliaOJIT::JuliaOJIT() #ifdef JL_USE_JITLINK MemMgr(createJITLinkMemoryManager()), ObjectLayer(ES, *MemMgr), + CompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), #else MemMgr(createRTDyldMemoryManager()), ObjectLayer( @@ -1593,15 +1562,12 @@ JuliaOJIT::JuliaOJIT() return result; } ), -#endif LockLayer(ObjectLayer), CompileLayer(ES, LockLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), +#endif JITPointersLayer(ES, CompileLayer, orc::IRTransformLayer::TransformFunction(JITPointersT(SharedBytes, RLST_mutex))), OptimizeLayer(ES, JITPointersLayer, orc::IRTransformLayer::TransformFunction(OptimizerT(*TM, PrintLLVMTimers, llvm_printing_mutex))), - OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)), - DepsVerifyLayer(ES, OptSelLayer, orc::IRTransformLayer::TransformFunction(validateExternRelocations)), - ExternalCompileLayer(ES, LockLayer, - std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)) + OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)) { JL_MUTEX_INIT(&this->jitlock, "JuliaOJIT"); #ifdef JL_USE_JITLINK @@ -1625,7 +1591,6 @@ JuliaOJIT::JuliaOJIT() registerRTDyldJITObject(Object, LO, MemMgr); }); #endif - CompileLayer.setNotifyCompiled(recordDebugTSM); std::string ErrorStr; @@ -1786,8 +1751,8 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { JL_TIMING(LLVM_JIT, JIT_Total); ++ModulesAdded; +#ifndef JL_USE_JITLINK orc::SymbolLookupSet NewExports; - orc::ThreadSafeModule CurrentlyCompiling; TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { for (auto &F : M.global_values()) { if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { @@ -1796,42 +1761,24 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) } } assert(!verifyLLVMIR(M)); - auto jit_debug_tsm_addr = ConstantInt::get(Type::getIntNTy(M.getContext(), sizeof(void*) * CHAR_BIT), (uintptr_t) &CurrentlyCompiling); - M.addModuleFlag(Module::Error, "julia.__jit_debug_tsm_addr", jit_debug_tsm_addr); }); +#endif - // TODO: what is the performance characteristics of this? 
- auto Err = DepsVerifyLayer.add(JD, std::move(TSM)); + auto Err = OptSelLayer.add(JD, std::move(TSM)); if (Err) { ES.reportError(std::move(Err)); errs() << "Failed to add module to JIT!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } abort(); } +#ifndef JL_USE_JITLINK // force eager compilation (for now), due to memory management specifics // (can't handle compilation recursion) auto Lookups = ES.lookup({{&JD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly}}, NewExports); if (!Lookups) { ES.reportError(Lookups.takeError()); errs() << "Failed to lookup symbols in module!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } - } - for (auto &Sym : *Lookups) { - #if JL_LLVM_VERSION >= 170000 - assert(Sym.second.getAddress()); - #else - assert(Sym.second); - #endif - (void) Sym; } +#endif } Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, bool ShouldOptimize) @@ -1850,12 +1797,33 @@ Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, return Error::success(); })) return Err; - return ExternalCompileLayer.add(JD.getDefaultResourceTracker(), std::move(TSM)); + return CompileLayer.add(JD.getDefaultResourceTracker(), std::move(TSM)); } Error JuliaOJIT::addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) { assert(Obj && "Can not add null object"); +#ifdef JL_USE_JITLINK + return ObjectLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); +#else return LockLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); +#endif +} + +SmallVector JuliaOJIT::findSymbols(ArrayRef Names) +{ + DenseMap Unmangled; + orc::SymbolLookupSet Exports; + for (StringRef Name : Names) { + auto Mangled = ES.intern(getMangledName(Name)); + Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); + Exports.add(std::move(Mangled)); + } + SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SmallVector Addrs(Names.size()); + for (auto it : Syms) { + Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); + } + return Addrs; } #if JL_LLVM_VERSION >= 170000 @@ -1887,7 +1855,7 @@ uint64_t JuliaOJIT::getGlobalValueAddress(StringRef Name) consumeError(addr.takeError()); return 0; } - return cantFail(std::move(addr)).getAddress().getValue(); + return addr->getAddress().getValue(); } uint64_t JuliaOJIT::getFunctionAddress(StringRef Name) @@ -1897,7 +1865,7 @@ uint64_t JuliaOJIT::getFunctionAddress(StringRef Name) consumeError(addr.takeError()); return 0; } - return cantFail(std::move(addr)).getAddress().getValue(); + return addr->getAddress().getValue(); } #else JL_JITSymbol JuliaOJIT::findSymbol(StringRef Name, bool ExportedSymbolsOnly) @@ -1973,41 +1941,92 @@ StringRef JuliaOJIT::getFunctionAtAddress(uint64_t Addr, jl_callptr_t invoke, jl return *fname; } - #ifdef JL_USE_JITLINK -extern "C" orc::shared::CWrapperFunctionResult -llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size); +#if JL_LLVM_VERSION >= 170000 +#define addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = {ExecutorAddr::fromPtr(&name), JITSymbolFlags::Exported | JITSymbolFlags::Callable}, orc::ExecutorAddr::fromPtr(&name)) +#else +#define 
addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = JITEvaluatedSymbol::fromPointer(&name, JITSymbolFlags::Exported | JITSymbolFlags::Callable), orc::ExecutorAddr::fromPtr(&name)) +#endif void JuliaOJIT::enableJITDebuggingSupport() { orc::SymbolMap GDBFunctions; - #if JL_LLVM_VERSION >= 170000 - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBAllocAction), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - #else - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBAllocAction, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBWrapper, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - #endif + addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBAllocAction); + auto registerJITLoaderGDBWrapper = addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBWrapper); cantFail(JD.define(orc::absoluteSymbols(GDBFunctions))); if (TM->getTargetTriple().isOSBinFormatMachO()) ObjectLayer.addPlugin(cantFail(orc::GDBJITDebugInfoRegistrationPlugin::Create(ES, JD, TM->getTargetTriple()))); #ifndef _COMPILER_ASAN_ENABLED_ // TODO: Fix duplicated sections spam #51794 else if (TM->getTargetTriple().isOSBinFormatELF()) //EPCDebugObjectRegistrar doesn't take a JITDylib, so we have to directly provide the call address - ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, orc::ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper)))); + ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, registerJITLoaderGDBWrapper))); +#endif +} + +void JuliaOJIT::enableIntelJITEventListener() +{ +#if JL_LLVM_VERSION >= 190000 + if (TT.isOSBinFormatELF()) { + orc::SymbolMap VTuneFunctions; + auto RegisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_registerVTuneImpl); + auto UnregisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_unregisterVTuneImpl); + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(VTuneSupportPlugin::Create(ES.getExecutorProcessControl(), + // JD, /*EmitDebugInfo=*/true, + // /*TestMode=*/false))); + bool EmitDebugInfo = true; + ObjectLayer.addPlugin(std::make_unique( + ES.getExecutorProcessControl(), RegisterImplAddr, UnregisterImplAddr, EmitDebugInfo)); + } +#endif +} + +void JuliaOJIT::enableOProfileJITEventListener() +{ + // implement when available in LLVM +} + +void JuliaOJIT::enablePerfJITEventListener() +{ +#if JL_LLVM_VERSION >= 180000 + orc::SymbolMap PerfFunctions; + auto StartAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfStart); + auto EndAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfEnd); + auto ImplAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfImpl); + cantFail(JD.define(orc::absoluteSymbols(PerfFunctions))); + if (TM->getTargetTriple().isOSBinFormatELF()) { + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(PerfSupportPlugin::Create( + // ES.getExecutorProcessControl(), *JD, true, true))); + bool EmitDebugInfo = true, EmitUnwindInfo = true; + ObjectLayer.addPlugin(std::make_unique( + 
ES.getExecutorProcessControl(), StartAddr, EndAddr, ImplAddr, EmitDebugInfo, EmitUnwindInfo)); + } #endif } #else +void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) +{ + if (L) + ObjectLayer.registerJITEventListener(*L); +} void JuliaOJIT::enableJITDebuggingSupport() { RegisterJITEventListener(JITEventListener::createGDBRegistrationListener()); } - -void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) +void JuliaOJIT::enableIntelJITEventListener() { - if (!L) - return; - this->ObjectLayer.registerJITEventListener(*L); + RegisterJITEventListener(JITEventListener::createIntelJITEventListener()); +} +void JuliaOJIT::enableOProfileJITEventListener() +{ + RegisterJITEventListener(JITEventListener::createOProfileJITEventListener()); +} +void JuliaOJIT::enablePerfJITEventListener() +{ + RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); } #endif @@ -2225,32 +2244,67 @@ static void jl_decorate_module(Module &M) { if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) { // Add special values used by debuginfo to build the UnwindData table registration for Win64 // This used to be GV, but with https://reviews.llvm.org/D100944 we no longer can emit GV into `.text` - // TODO: The data is set in debuginfo.cpp but it should be okay to actually emit it here. - std::string inline_asm = "\ - .section "; - inline_asm += + // and with JITLink it became difficult to change the content afterwards, but we + // would prefer that this simple content wasn't recompiled in every single module, + // so we emit the necessary PLT trampoline as inline assembly. + // This is somewhat duplicated with the .pdata section, but we haven't been able to + // use that yet due to relocation issues. +#define ASM_USES_ELF // use ELF or COFF syntax based on FORCE_ELF + StringRef inline_asm( + ".section" #if JL_LLVM_VERSION >= 180000 - ".ltext,\"ax\",@progbits"; + " .ltext,\"ax\",@progbits\n" #else - ".text"; + " .text\n" #endif - inline_asm += "\n\ - .type __UnwindData,@object \n\ - .p2align 2, 0x90 \n\ - __UnwindData: \n\ - .zero 12 \n\ - .size __UnwindData, 12 \n\ - \n\ - .type __catchjmp,@object \n\ - .p2align 2, 0x90 \n\ - __catchjmp: \n\ - .zero 12 \n\ - .size __catchjmp, 12"; - + ".globl __julia_personality\n" + "\n" +#ifdef ASM_USES_ELF + ".type __UnwindData,@object\n" +#else + ".def __UnwindData\n" + ".scl 2\n" + ".type 0\n" + ".endef\n" +#endif + ".p2align 2, 0x90\n" + "__UnwindData:\n" + " .byte 0x09;\n" // version info, UNW_FLAG_EHANDLER + " .byte 4;\n" // size of prolog (bytes) + " .byte 2;\n" // count of unwind codes (slots) + " .byte 0x05;\n" // frame register (rbp) = rsp + " .byte 4;\n" // second instruction + " .byte 0x03;\n" // mov RBP, RSP + " .byte 1;\n" // first instruction + " .byte 0x50;\n" // push RBP + " .int __catchjmp - " +#if JL_LLVM_VERSION >= 180000 + ".ltext;\n" // Section-relative offset (if using COFF and JITLink, this can be relative to __ImageBase instead, though then we could possibly use pdata/xdata directly then) +#else + ".text;\n" +#endif + ".size __UnwindData, 12\n" + "\n" +#ifdef ASM_USES_ELF + ".type __catchjmp,@function\n" +#else + ".def __catchjmp\n" + ".scl 2\n" + ".type 32\n" + ".endef\n" +#endif + ".p2align 2, 0x90\n" + "__catchjmp:\n" + " movabsq $__julia_personality, %rax\n" + " jmpq *%rax\n" + ".size __catchjmp, . 
- __catchjmp\n" + "\n"); M.appendModuleInlineAsm(inline_asm); } +#undef ASM_USES_ELF } +#ifndef JL_USE_JITLINK // Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable static int jl_add_to_ee( orc::ThreadSafeModule &M, @@ -2316,13 +2370,7 @@ static int jl_add_to_ee( jl_ExecutionEngine->addModule(std::move(M)); return 0; } - -static uint64_t getAddressForFunction(StringRef fname) -{ - auto addr = jl_ExecutionEngine->getFunctionAddress(fname); - assert(addr); - return addr; -} +#endif // helper function for adding a DLLImport (dlsym) address to the execution engine void add_named_global(StringRef name, void *addr) diff --git a/src/jitlayers.h b/src/jitlayers.h index 107782e354d4a..3353a4093bd27 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -29,7 +29,7 @@ #include "llvm-version.h" #include #include - +#include // As of LLVM 13, there are two runtime JIT linker implementations, the older // RuntimeDyld (used via orc::RTDyldObjectLinkingLayer) and the newer JITLink @@ -47,7 +47,7 @@ // and feature support (e.g. Windows, JITEventListeners for various profilers, // etc.). Thus, we currently only use JITLink where absolutely required, that is, // for Mac/aarch64 and Linux/aarch64. -// #define JL_FORCE_JITLINK +//#define JL_FORCE_JITLINK #if defined(_COMPILER_ASAN_ENABLED_) || defined(_COMPILER_MSAN_ENABLED_) || defined(_COMPILER_TSAN_ENABLED_) # define HAS_SANITIZER @@ -65,6 +65,7 @@ using namespace llvm; extern "C" jl_cgparams_t jl_default_cgparams; +extern arraylist_t new_invokes; DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeContext, LLVMOrcThreadSafeContextRef) DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeModule, LLVMOrcThreadSafeModuleRef) @@ -211,7 +212,7 @@ struct jl_codegen_call_target_t { typedef SmallVector, 0> jl_workqueue_t; // TODO DenseMap? typedef std::map> jl_compiled_functions_t; - +typedef std::list> CallFrames; struct jl_codegen_params_t { orc::ThreadSafeContext tsctx; orc::ThreadSafeContext::Lock tsctx_lock; @@ -230,6 +231,7 @@ struct jl_codegen_params_t { std::map ditypes; std::map llvmtypes; DenseMap mergedConstants; + llvm::MapVector> enqueuers; // Map from symbol name (in a certain library) to its GV in sysimg and the // DL handle address in the current session. StringMap> libMapGV; @@ -361,7 +363,6 @@ class JuliaOJIT { typedef orc::ObjectLinkingLayer ObjLayerT; #else typedef orc::RTDyldObjectLinkingLayer ObjLayerT; -#endif struct LockLayerT : public orc::ObjectLayer { LockLayerT(orc::ObjectLayer &BaseLayer) JL_NOTSAFEPOINT : orc::ObjectLayer(BaseLayer.getExecutionSession()), BaseLayer(BaseLayer) {} @@ -379,11 +380,11 @@ class JuliaOJIT { orc::ObjectLayer &BaseLayer; std::mutex EmissionMutex; }; +#endif typedef orc::IRCompileLayer CompileLayerT; typedef orc::IRTransformLayer JITPointersLayerT; typedef orc::IRTransformLayer OptimizeLayerT; typedef orc::IRTransformLayer OptSelLayerT; - typedef orc::IRTransformLayer DepsVerifyLayerT; typedef object::OwningBinary OwningObj; template - void registerObject(const ObjT &Obj, const LoadResult &LO); +#ifndef JL_USE_JITLINK + void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; +#endif public: @@ -509,10 +509,9 @@ class JuliaOJIT { ~JuliaOJIT() JL_NOTSAFEPOINT; void enableJITDebuggingSupport() JL_NOTSAFEPOINT; -#ifndef JL_USE_JITLINK - // JITLink doesn't support old JITEventListeners (yet). 
- void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; -#endif + void enableIntelJITEventListener() JL_NOTSAFEPOINT; + void enableOProfileJITEventListener() JL_NOTSAFEPOINT; + void enablePerfJITEventListener() JL_NOTSAFEPOINT; orc::SymbolStringPtr mangle(StringRef Name) JL_NOTSAFEPOINT; void addGlobalMapping(StringRef Name, uint64_t Addr) JL_NOTSAFEPOINT; @@ -523,7 +522,7 @@ class JuliaOJIT { bool ShouldOptimize = false) JL_NOTSAFEPOINT; Error addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) JL_NOTSAFEPOINT; - orc::IRCompileLayer &getIRCompileLayer() JL_NOTSAFEPOINT { return ExternalCompileLayer; }; + orc::IRCompileLayer &getIRCompileLayer() JL_NOTSAFEPOINT { return CompileLayer; }; orc::ExecutionSession &getExecutionSession() JL_NOTSAFEPOINT { return ES; } orc::JITDylib &getExternalJITDylib() JL_NOTSAFEPOINT { return ExternalJD; } @@ -531,6 +530,7 @@ class JuliaOJIT { Expected findSymbol(StringRef Name, bool ExportedSymbolsOnly) JL_NOTSAFEPOINT; Expected findUnmangledSymbol(StringRef Name) JL_NOTSAFEPOINT; Expected findExternalJDSymbol(StringRef Name, bool ExternalJDOnly) JL_NOTSAFEPOINT; + SmallVector findSymbols(ArrayRef Names) JL_NOTSAFEPOINT; #else JITEvaluatedSymbol findSymbol(StringRef Name, bool ExportedSymbolsOnly) JL_NOTSAFEPOINT; JITEvaluatedSymbol findUnmangledSymbol(StringRef Name) JL_NOTSAFEPOINT; @@ -614,13 +614,13 @@ class JuliaOJIT { const std::unique_ptr MemMgr; #endif ObjLayerT ObjectLayer; +#ifndef JL_USE_JITLINK LockLayerT LockLayer; +#endif CompileLayerT CompileLayer; JITPointersLayerT JITPointersLayer; OptimizeLayerT OptimizeLayer; OptSelLayerT OptSelLayer; - DepsVerifyLayerT DepsVerifyLayer; - CompileLayerT ExternalCompileLayer; }; extern JuliaOJIT *jl_ExecutionEngine; std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT; diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 11cc8ee6fddd9..a00a0171d23b7 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -316,6 +316,7 @@ XX(jl_module_name) \ XX(jl_module_names) \ XX(jl_module_parent) \ + XX(jl_module_getloc) \ XX(jl_module_public) \ XX(jl_module_public_p) \ XX(jl_module_use) \ @@ -344,6 +345,8 @@ XX(jl_new_typevar) \ XX(jl_next_from_addrinfo) \ XX(jl_normalize_to_compilable_sig) \ + XX(jl_method_match_to_mi) \ + XX(jl_get_unspecialized) \ XX(jl_no_exc_handler) \ XX(jl_object_id) \ XX(jl_object_id_) \ @@ -420,7 +423,6 @@ XX(jl_set_zero_subnormals) \ XX(jl_sigatomic_begin) \ XX(jl_sigatomic_end) \ - XX(jl_sig_throw) \ XX(jl_spawn) \ XX(jl_specializations_get_linfo) \ XX(jl_specializations_lookup) \ @@ -453,6 +455,8 @@ XX(jl_test_cpu_feature) \ XX(jl_threadid) \ XX(jl_threadpoolid) \ + XX(jl_get_ptls_rng) \ + XX(jl_set_ptls_rng) \ XX(jl_throw) \ XX(jl_throw_out_of_memory_error) \ XX(jl_too_few_args) \ @@ -521,6 +525,7 @@ YY(jl_dump_native) \ YY(jl_get_llvm_gvs) \ YY(jl_get_llvm_external_fns) \ + YY(jl_get_llvm_mis) \ YY(jl_dump_function_asm) \ YY(jl_LLVMCreateDisasm) \ YY(jl_LLVMDisasmInstruction) \ diff --git a/src/jlfrontend.scm b/src/jlfrontend.scm index 2c5f42eda5ce8..463e39c41d00a 100644 --- a/src/jlfrontend.scm +++ b/src/jlfrontend.scm @@ -211,11 +211,11 @@ (block ,@loc (call (core eval) ,name ,x))) - (= (call include ,x) + (= (call include (:: ,x (top AbstractString))) (block ,@loc (call (core _call_latest) (top include) ,name ,x))) - (= (call include (:: ,mex (top Function)) ,x) + (= (call include 
(:: ,mex (top Function)) (:: ,x (top AbstractString))) (block ,@loc (call (core _call_latest) (top include) ,mex ,name ,x))))) diff --git a/src/jloptions.c b/src/jloptions.c index fd0d5f95d3344..71b0368edd1c4 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -125,6 +125,7 @@ JL_DLLEXPORT void jl_init_options(void) 1, // can_inline JL_OPTIONS_POLLY_ON, // polly NULL, // trace_compile + NULL, // trace_dispatch JL_OPTIONS_FAST_MATH_DEFAULT, 0, // worker NULL, // cookie @@ -149,6 +150,7 @@ JL_DLLEXPORT void jl_init_options(void) 0, // permalloc_pkgimg 0, // heap-size-hint 0, // trace_compile_timing + 0, // trim }; jl_options_initialized = 1; } @@ -299,18 +301,27 @@ static const char opts_hidden[] = " --strip-ir Remove IR (intermediate representation) of compiled\n" " functions\n\n" - // compiler debugging (see the devdocs for tips on using these options) + // compiler debugging and experimental (see the devdocs for tips on using these options) " --output-unopt-bc Generate unoptimized LLVM bitcode (.bc)\n" " --output-bc Generate LLVM bitcode (.bc)\n" " --output-asm Generate an assembly file (.s)\n" " --output-incremental={yes|no*} Generate an incremental output file (rather than\n" " complete)\n" " --trace-compile={stderr|name} Print precompile statements for methods compiled\n" - " during execution or save to a path\n" + " during execution or save to stderr or a path. Methods that\n" + " were recompiled are printed in yellow or with a trailing\n" + " comment if color is not supported\n" " --trace-compile-timing If --trace-compile is enabled show how long each took to\n" " compile in ms\n" " --image-codegen Force generate code in imaging mode\n" " --permalloc-pkgimg={yes|no*} Copy the data section of package images into memory\n" + " --trim={no*|safe|unsafe|unsafe-warn}\n" + " Build a sysimage including only code provably reachable\n" + " from methods marked by calling `entrypoint`. In unsafe\n" + " mode, the resulting binary might be missing needed code\n" + " and can throw errors. With unsafe-warn warnings will be\n" + " printed for dynamic call sites that might lead to such\n" + " errors. 
In safe mode compile-time errors are given instead.\n" ; JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) @@ -332,6 +343,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_polly, opt_trace_compile, opt_trace_compile_timing, + opt_trace_dispatch, opt_math_mode, opt_worker, opt_bind_to, @@ -357,7 +369,8 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_strip_ir, opt_heap_size_hint, opt_gc_threads, - opt_permalloc_pkgimg + opt_permalloc_pkgimg, + opt_trim, }; static const char* const shortopts = "+vhqH:e:E:L:J:C:it:p:O:g:m:"; static const struct option longopts[] = { @@ -409,6 +422,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "polly", required_argument, 0, opt_polly }, { "trace-compile", required_argument, 0, opt_trace_compile }, { "trace-compile-timing", no_argument, 0, opt_trace_compile_timing }, + { "trace-dispatch", required_argument, 0, opt_trace_dispatch }, { "math-mode", required_argument, 0, opt_math_mode }, { "handle-signals", required_argument, 0, opt_handle_signals }, // hidden command line options @@ -421,6 +435,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "strip-ir", no_argument, 0, opt_strip_ir }, { "permalloc-pkgimg",required_argument, 0, opt_permalloc_pkgimg }, { "heap-size-hint", required_argument, 0, opt_heap_size_hint }, + { "trim", optional_argument, 0, opt_trim }, { 0, 0, 0, 0 } }; @@ -864,6 +879,11 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) case opt_trace_compile_timing: jl_options.trace_compile_timing = 1; break; + case opt_trace_dispatch: + jl_options.trace_dispatch = strdup(optarg); + if (!jl_options.trace_dispatch) + jl_errorf("fatal error: failed to allocate memory: %s", strerror(errno)); + break; case opt_math_mode: if (!strcmp(optarg,"ieee")) jl_options.fast_math = JL_OPTIONS_FAST_MATH_OFF; @@ -938,6 +958,18 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_trim: + if (optarg == NULL || !strcmp(optarg,"safe")) + jl_options.trim = JL_TRIM_SAFE; + else if (!strcmp(optarg,"no")) + jl_options.trim = JL_TRIM_NO; + else if (!strcmp(optarg,"unsafe")) + jl_options.trim = JL_TRIM_UNSAFE; + else if (!strcmp(optarg,"unsafe-warn")) + jl_options.trim = JL_TRIM_UNSAFE_WARN; + else + jl_errorf("julia: invalid argument to --trim={safe|no|unsafe|unsafe-warn} (%s)", optarg); + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index aac2a64a373a8..e58797caace3c 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -38,6 +38,7 @@ typedef struct { int8_t can_inline; int8_t polly; const char *trace_compile; + const char *trace_dispatch; int8_t fast_math; int8_t worker; const char *cookie; @@ -62,6 +63,7 @@ typedef struct { int8_t permalloc_pkgimg; uint64_t heap_size_hint; int8_t trace_compile_timing; + int8_t trim; } jl_options_t; #endif diff --git a/src/jltypes.c b/src/jltypes.c index adf39162cc7f0..11f1d11a14edc 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -1607,6 +1607,118 @@ jl_value_t *jl_rewrap_unionall_(jl_value_t *t, jl_value_t *u) return t; } +// Create a copy of type expression t where any occurrence of data type x is replaced by y. +// If x does not occur in t, return t without any copy. +// For example, jl_substitute_datatype(Foo{Bar}, Foo{T}, Qux{S}) is Qux{Bar}, with T and S +// free type variables. 
+// To substitute type variables, use jl_substitute_var instead. +jl_value_t *jl_substitute_datatype(jl_value_t *t, jl_datatype_t * x, jl_datatype_t * y) +{ + if jl_is_datatype(t) { + jl_datatype_t *typ = (jl_datatype_t*)t; + // For datatypes call itself recursively on the parameters to form new parameters. + // Then, if typename(t) == typename(x), rewrap the wrapper of y around the new + // parameters. Otherwise, do the same around the wrapper of t. + // This ensures that the types and supertype are properly set. + // Start by check whether there is a parameter that needs replacing. + long i_firstnewparam = -1; + size_t nparams = jl_svec_len(typ->parameters); + jl_value_t *firstnewparam = NULL; + JL_GC_PUSH1(&firstnewparam); + for (size_t i = 0; i < nparams; i++) { + jl_value_t *param = NULL; + JL_GC_PUSH1(¶m); + param = jl_svecref(typ->parameters, i); + firstnewparam = jl_substitute_datatype(param, x, y); + if (param != firstnewparam) { + i_firstnewparam = i; + JL_GC_POP(); + break; + } + JL_GC_POP(); + } + // If one of the parameters needs to be updated, or if the type name is that to + // substitute, create a new datataype + if (i_firstnewparam != -1 || typ->name == x->name) { + jl_datatype_t *uw = typ->name == x->name ? y : typ; // substitution occurs here + jl_value_t *wrapper = uw->name->wrapper; + jl_datatype_t *w = (jl_datatype_t*)jl_unwrap_unionall(wrapper); + jl_svec_t *sv = jl_alloc_svec_uninit(jl_svec_len(uw->parameters)); + JL_GC_PUSH1(&sv); + jl_value_t **vals = jl_svec_data(sv); + // no JL_GC_PUSHARGS(vals, ...) since GC is already aware of sv + for (long i = 0; i < i_firstnewparam; i++) { // copy the identical parameters + vals[i] = jl_svecref(typ->parameters, i); // value + } + if (i_firstnewparam != -1) { // insert the first non-identical parameter + vals[i_firstnewparam] = firstnewparam; + } + for (size_t i = i_firstnewparam+1; i < nparams; i++) { // insert the remaining parameters + vals[i] = jl_substitute_datatype(jl_svecref(typ->parameters, i), x, y); + } + if (jl_is_tuple_type(wrapper)) { + // special case for tuples, since the wrapper (Tuple) does not have as + // many parameters as t (it only has a Vararg instead). 
+ t = jl_apply_tuple_type(sv, 0); + } else { + t = jl_instantiate_type_in_env((jl_value_t*)w, (jl_unionall_t*)wrapper, vals); + } + JL_GC_POP(); + } + JL_GC_POP(); + } + else if jl_is_unionall(t) { // recursively call itself on body and var bounds + jl_unionall_t* ut = (jl_unionall_t*)t; + jl_value_t *lb = NULL; + jl_value_t *ub = NULL; + jl_value_t *body = NULL; + JL_GC_PUSH3(&lb, &ub, &body); + lb = jl_substitute_datatype(ut->var->lb, x, y); + ub = jl_substitute_datatype(ut->var->ub, x, y); + body = jl_substitute_datatype(ut->body, x, y); + if (lb != ut->var->lb || ub != ut->var->ub) { + jl_tvar_t *newtvar = jl_new_typevar(ut->var->name, lb, ub); + JL_GC_PUSH1(&newtvar); + body = jl_substitute_var(body, ut->var, (jl_value_t*)newtvar); + t = jl_new_struct(jl_unionall_type, newtvar, body); + JL_GC_POP(); + } + else if (body != ut->body) { + t = jl_new_struct(jl_unionall_type, ut->var, body); + } + JL_GC_POP(); + } + else if jl_is_uniontype(t) { // recursively call itself on a and b + jl_uniontype_t *u = (jl_uniontype_t*)t; + jl_value_t *a = NULL; + jl_value_t *b = NULL; + JL_GC_PUSH2(&a, &b); + a = jl_substitute_datatype(u->a, x, y); + b = jl_substitute_datatype(u->b, x, y); + if (a != u->a || b != u->b) { + t = jl_new_struct(jl_uniontype_type, a, b); + } + JL_GC_POP(); + } + else if jl_is_vararg(t) { // recursively call itself on T + jl_vararg_t *vt = (jl_vararg_t*)t; + if (vt->T) { // vt->T could be NULL + jl_value_t *rT = NULL; + JL_GC_PUSH1(&rT); + rT = jl_substitute_datatype(vt->T, x, y); + if (rT != vt->T) { + jl_task_t *ct = jl_current_task; + t = jl_gc_alloc(ct->ptls, sizeof(jl_vararg_t), jl_vararg_type); + jl_set_typetagof((jl_vararg_t *)t, jl_vararg_tag, 0); + ((jl_vararg_t *)t)->T = rT; + ((jl_vararg_t *)t)->N = vt->N; + } + JL_GC_POP(); + } + } + return t; +} + static jl_value_t *lookup_type_stack(jl_typestack_t *stack, jl_datatype_t *tt, size_t ntp, jl_value_t **iparams) { @@ -3505,7 +3617,7 @@ void jl_init_types(void) JL_GC_DISABLED "backedges", "cache", "cache_with_orig", - "precompiled"), + "flags"), jl_svec(7, jl_new_struct(jl_uniontype_type, jl_method_type, jl_module_type), jl_any_type, diff --git a/src/julia-syntax.scm b/src/julia-syntax.scm index a2d3ffdd66f67..f1acb9c3250e1 100644 --- a/src/julia-syntax.scm +++ b/src/julia-syntax.scm @@ -225,6 +225,19 @@ (if lb (list lb ub) (list ub)) (if lb (list lb '(core Any)) '()))))) +(define (is-method? x) + (if (and (pair? x) (eq? (car x) 'method)) + (let ((name (cadr x))) + (if (and (pair? name) (eq? (car name) 'globalref)) + (let ((name (caddr name))) + (if (symbol? name) + #t + #f)) + (if (symbol? name) + #t + #f))) + #f)) + (define (method-expr-name m) (let ((name (cadr m))) (cond ((globalref? name) (caddr name)) @@ -372,7 +385,7 @@ (generator (if (expr-contains-p if-generated? body (lambda (x) (not (function-def? x)))) (let* ((gen (generated-version body)) (nongen (non-generated-version body)) - (gname (symbol (string (gensy) "#" (current-julia-module-counter)))) + (gname (symbol (string (gensy) "#" (current-julia-module-counter '())))) (gf (make-generator-function gname names anames gen))) (set! body (insert-after-meta nongen @@ -512,7 +525,7 @@ "" "#") (or und '_) "#" - (string (current-julia-module-counter))))))) + (string (current-julia-module-counter '()))))))) ;; this is a hack: nest these statements inside a call so they get closure ;; converted together, allowing all needed types to be defined before any methods. 
`(call (core ifelse) (false) (false) (block @@ -1251,7 +1264,7 @@ (list a))) ;; TODO: always use a specific special name like #anon# or _, then ignore ;; this as a local variable name. - (name (symbol (string "#" (current-julia-module-counter))))) + (name (symbol (string "#" (current-julia-module-counter '()))))) (expand-forms `(block (local ,name) (function @@ -3537,9 +3550,9 @@ f(x) = yt(x) (define (clear-capture-bits vinfos) (map vinfo:not-capt vinfos)) -(define (convert-lambda lam fname interp capt-sp opaq) +(define (convert-lambda lam fname interp capt-sp opaq parsed-method-stack) (let ((body (add-box-inits-to-body - lam (cl-convert (cadddr lam) fname lam (table) (table) #f interp opaq (table) (vinfo-to-table (car (lam:vinfo lam))))))) + lam (cl-convert (cadddr lam) fname lam (table) (table) #f interp opaq parsed-method-stack (table) (vinfo-to-table (car (lam:vinfo lam))))))) `(lambda ,(lam:args lam) (,(clear-capture-bits (car (lam:vinfo lam))) () @@ -3614,7 +3627,7 @@ f(x) = yt(x) ;; declared types. ;; when doing this, the original value needs to be preserved, to ;; ensure the expression `a=b` always returns exactly `b`. -(define (convert-assignment var rhs0 fname lam interp opaq globals locals) +(define (convert-assignment var rhs0 fname lam interp opaq parsed-method-stack globals locals) (cond ((symbol? var) (let* ((vi (get locals var #f)) @@ -3632,7 +3645,7 @@ f(x) = yt(x) (equal? rhs0 '(the_exception))) rhs0 (make-ssavalue))) - (rhs (convert-for-type-decl rhs1 (cl-convert vt fname lam #f #f #f interp opaq (table) locals) #t lam)) + (rhs (convert-for-type-decl rhs1 (cl-convert vt fname lam #f #f #f interp opaq parsed-method-stack (table) locals) #t lam)) (ex (cond (closed `(call (core setfield!) ,(if interp `($ ,var) @@ -3916,17 +3929,17 @@ f(x) = yt(x) (define (toplevel-preserving? e) (and (pair? e) (memq (car e) '(if elseif block trycatch tryfinally trycatchelse)))) -(define (map-cl-convert exprs fname lam namemap defined toplevel interp opaq (globals (table)) (locals (table))) +(define (map-cl-convert exprs fname lam namemap defined toplevel interp opaq parsed-method-stack (globals (table)) (locals (table))) (if toplevel (map (lambda (x) (let ((tl (lift-toplevel (cl-convert x fname lam namemap defined (and toplevel (toplevel-preserving? x)) - interp opaq globals locals)))) + interp opaq parsed-method-stack globals locals)))) (if (null? (cdr tl)) (car tl) `(block ,@(cdr tl) ,(car tl))))) exprs) - (map (lambda (x) (cl-convert x fname lam namemap defined #f interp opaq globals locals)) exprs))) + (map (lambda (x) (cl-convert x fname lam namemap defined #f interp opaq parsed-method-stack globals locals)) exprs))) (define (prepare-lambda! lam) ;; mark all non-arguments as assigned, since locals that are never assigned @@ -3935,11 +3948,17 @@ f(x) = yt(x) (list-tail (car (lam:vinfo lam)) (length (lam:args lam)))) (lambda-optimize-vars! lam)) -(define (cl-convert e fname lam namemap defined toplevel interp opaq (globals (table)) (locals (table))) +;; must start with a hash and second character must be numeric +(define (anon-function-name? str) + (and (>= (string-length str) 2) + (char=? (string.char str 0) #\#) + (char-numeric? (string.char str 1)))) + +(define (cl-convert- e fname lam namemap defined toplevel interp opaq parsed-method-stack (globals (table)) (locals (table))) (if (and (not lam) (not (and (pair? e) (memq (car e) '(lambda method macro opaque_closure))))) (if (atom? 
e) e - (cons (car e) (map-cl-convert (cdr e) fname lam namemap defined toplevel interp opaq globals locals))) + (cons (car e) (map-cl-convert (cdr e) fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals))) (cond ((symbol? e) (define (new-undef-var name) @@ -3958,7 +3977,10 @@ f(x) = yt(x) (val (if (equal? typ '(core Any)) val `(call (core typeassert) ,val - ,(cl-convert typ fname lam namemap defined toplevel interp opaq globals locals))))) + ,(let ((convt (cl-convert typ fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals))) + (if (or (symbol-like? convt) (quoted? convt)) + convt + (renumber-assigned-ssavalues convt))))))) `(block ,@(if (eq? box access) '() `((= ,access ,box))) ,undefcheck @@ -3990,8 +4012,8 @@ f(x) = yt(x) e) ((=) (let ((var (cadr e)) - (rhs (cl-convert (caddr e) fname lam namemap defined toplevel interp opaq globals locals))) - (convert-assignment var rhs fname lam interp opaq globals locals))) + (rhs (cl-convert (caddr e) fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals))) + (convert-assignment var rhs fname lam interp opaq parsed-method-stack globals locals))) ((local-def) ;; make new Box for local declaration of defined variable (let ((vi (get locals (cadr e) #f))) (if (and vi (vinfo:asgn vi) (vinfo:capt vi)) @@ -4043,7 +4065,7 @@ f(x) = yt(x) cvs))) `(new_opaque_closure ,(cadr e) ,(or (caddr e) '(call (core apply_type) (core Union))) ,(or (cadddr e) '(core Any)) ,allow-partial - (opaque_closure_method (null) ,nargs ,isva ,functionloc ,(convert-lambda lam2 (car (lam:args lam2)) #f '() (symbol-to-idx-map cvs))) + (opaque_closure_method (null) ,nargs ,isva ,functionloc ,(convert-lambda lam2 (car (lam:args lam2)) #f '() (symbol-to-idx-map cvs) parsed-method-stack)) ,@var-exprs)))) ((method) (let* ((name (method-expr-name e)) @@ -4057,7 +4079,7 @@ f(x) = yt(x) (sp-inits (if (or short (not (eq? (car sig) 'block))) '() (map-cl-convert (butlast (cdr sig)) - fname lam namemap defined toplevel interp opaq globals locals))) + fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals))) (sig (and sig (if (eq? 
(car sig) 'block) (last sig) sig)))) @@ -4084,22 +4106,22 @@ f(x) = yt(x) ;; anonymous functions with keyword args generate global ;; functions that refer to the type of a local function (rename-sig-types sig namemap) - fname lam namemap defined toplevel interp opaq globals locals) + fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals) ,(let ((body (add-box-inits-to-body lam2 - (cl-convert (cadddr lam2) 'anon lam2 (table) (table) #f interp opaq (table) + (cl-convert (cadddr lam2) 'anon lam2 (table) (table) #f interp opaq parsed-method-stack (table) (vinfo-to-table (car (lam:vinfo lam2))))))) `(lambda ,(cadr lam2) (,(clear-capture-bits (car vis)) ,@(cdr vis)) ,body))))) (else - (let* ((exprs (lift-toplevel (convert-lambda lam2 '|#anon| #t '() #f))) + (let* ((exprs (lift-toplevel (convert-lambda lam2 '|#anon| #t '() #f parsed-method-stack))) (top-stmts (cdr exprs)) (newlam (compact-and-renumber (linearize (car exprs)) 'none 0))) `(toplevel-butfirst (block ,@sp-inits - (method ,(cadr e) ,(cl-convert sig fname lam namemap defined toplevel interp opaq globals locals) + (method ,(cadr e) ,(cl-convert sig fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals) ,(julia-bq-macro newlam))) ,@top-stmts)))) @@ -4108,9 +4130,11 @@ f(x) = yt(x) (type-name (or (get namemap name #f) (and name (symbol (string (if (= (string.char (string name) 0) #\#) - "" - "#") - name "#" (current-julia-module-counter)))))) + (if (anon-function-name? (string name)) + (string "#" (current-julia-module-counter parsed-method-stack)) + name) + (string "#" name)) + "#" (current-julia-module-counter parsed-method-stack)))))) (alldefs (expr-find-all (lambda (ex) (and (length> ex 2) (eq? (car ex) 'method) (not (eq? ex e)) @@ -4202,12 +4226,12 @@ f(x) = yt(x) (append (map (lambda (gs tvar) (make-assignment gs `(call (core TypeVar) ',tvar (core Any)))) closure-param-syms closure-param-names) - `((method #f ,(cl-convert arg-defs fname lam namemap defined toplevel interp opaq globals locals) + `((method #f ,(cl-convert arg-defs fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals) ,(convert-lambda lam2 (if iskw (caddr (lam:args lam2)) (car (lam:args lam2))) - #f closure-param-names #f))))))) + #f closure-param-names #f parsed-method-stack))))))) (mk-closure ;; expression to make the closure (let* ((var-exprs (map (lambda (v) (let ((cv (assq v (cadr (lam:vinfo lam))))) @@ -4241,7 +4265,7 @@ f(x) = yt(x) (begin (put! defined name #t) `(toplevel-butfirst - ,(convert-assignment name mk-closure fname lam interp opaq globals locals) + ,(convert-assignment name mk-closure fname lam interp opaq parsed-method-stack globals locals) ,@typedef ,@(map (lambda (v) `(moved-local ,v)) moved-vars) ,@sp-inits @@ -4255,14 +4279,14 @@ f(x) = yt(x) (table) (table) (null? (cadr e)) ;; only toplevel thunks have 0 args - interp opaq globals (vinfo-to-table (car (lam:vinfo e)))))) + interp opaq parsed-method-stack globals (vinfo-to-table (car (lam:vinfo e)))))) `(lambda ,(cadr e) (,(clear-capture-bits (car (lam:vinfo e))) () ,@(cddr (lam:vinfo e))) (block ,@body)))) ;; remaining `::` expressions are type assertions ((|::|) - (cl-convert `(call (core typeassert) ,@(cdr e)) fname lam namemap defined toplevel interp opaq globals locals)) + (cl-convert `(call (core typeassert) ,@(cdr e)) fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals)) ;; remaining `decl` expressions are only type assertions if the ;; argument is global or a non-symbol. 
((decl) @@ -4280,13 +4304,20 @@ f(x) = yt(x) (globaldecl ,ref ,(caddr e)) (null))) `(call (core typeassert) ,@(cdr e)))) - fname lam namemap defined toplevel interp opaq globals locals)))) + fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals)))) ;; `with-static-parameters` expressions can be removed now; used only by analyze-vars ((with-static-parameters) - (cl-convert (cadr e) fname lam namemap defined toplevel interp opaq globals locals)) + (cl-convert (cadr e) fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals)) (else (cons (car e) - (map-cl-convert (cdr e) fname lam namemap defined toplevel interp opaq globals locals)))))))) + (map-cl-convert (cdr e) fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals)))))))) + +;; wrapper for `cl-convert-` +(define (cl-convert e fname lam namemap defined toplevel interp opaq (parsed-method-stack '()) (globals (table)) (locals (table))) + (if (is-method? e) + (let ((name (method-expr-name e))) + (cl-convert- e fname lam namemap defined toplevel interp opaq (cons name parsed-method-stack) globals locals)) + (cl-convert- e fname lam namemap defined toplevel interp opaq parsed-method-stack globals locals))) (define (closure-convert e) (cl-convert e #f #f (table) (table) #f #f #f)) @@ -4970,6 +5001,10 @@ f(x) = yt(x) ((≔ ⩴ ≕ :=) (error (string "unsupported assignment operator \"" (deparse (car e)) "\""))) + ;; bare :escape + ((escape) + (error (string "\"esc(...)\" used outside of macro expansion"))) + ((error) (error (cadr e))) (else diff --git a/src/julia.expmap.in b/src/julia.expmap.in index e5f9ee890205f..29366f6296a85 100644 --- a/src/julia.expmap.in +++ b/src/julia.expmap.in @@ -5,8 +5,8 @@ asprintf; bitvector_*; ios_*; - arraylist_grow; - small_arraylist_grow; + arraylist_*; + small_arraylist_*; jl_*; ijl_*; _jl_mutex_*; diff --git a/src/julia.h b/src/julia.h index f09afc8479564..b8d2eb16f36a0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -234,6 +234,7 @@ JL_DLLEXPORT extern const jl_callptr_t jl_f_opaque_closure_call_addr; JL_DLLEXPORT extern const jl_callptr_t jl_fptr_wait_for_compiled_addr; typedef struct _jl_line_info_node_t { + JL_DATA_TYPE struct _jl_module_t *module; jl_value_t *method; // may contain a jl_symbol, jl_method_t, or jl_method_instance_t jl_sym_t *file; @@ -281,6 +282,7 @@ typedef union __jl_purity_overrides_t { // This type describes a single function body typedef struct _jl_code_info_t { + JL_DATA_TYPE // ssavalue-indexed arrays of properties: jl_array_t *code; // Any array of statements jl_debuginfo_t *debuginfo; // Table of edge data for each statement @@ -408,8 +410,14 @@ struct _jl_method_instance_t { jl_array_t *backedges; // list of method-instances which call this method-instance; `invoke` records (invokesig, caller) pairs _Atomic(struct _jl_code_instance_t*) cache; uint8_t cache_with_orig; // !cache_with_specTypes - _Atomic(uint8_t) precompiled; // true if this instance was generated by an explicit `precompile(...)` call + + // flags for this method instance + // bit 0: generated by an explicit `precompile(...)` + // bit 1: dispatched + _Atomic(uint8_t) flags; }; +#define JL_MI_FLAGS_MASK_PRECOMPILED 0x01 +#define JL_MI_FLAGS_MASK_DISPATCHED 0x02 // OpaqueClosure typedef struct _jl_opaque_closure_t { @@ -702,6 +710,8 @@ typedef struct _jl_module_t { struct _jl_module_t *parent; _Atomic(jl_svec_t*) bindings; _Atomic(jl_genericmemory_t*) bindingkeyset; // index lookup by name into bindings + jl_sym_t *file; + int32_t line; // 
hidden fields: arraylist_t usings; // modules with all bindings potentially imported jl_uuid_t build_id; @@ -718,6 +728,7 @@ typedef struct _jl_module_t { } jl_module_t; struct _jl_globalref_t { + JL_DATA_TYPE jl_module_t *mod; jl_sym_t *name; jl_binding_t *binding; @@ -2310,7 +2321,6 @@ JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT; JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSAFEPOINT; JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED); JL_DLLEXPORT void JL_NORETURN jl_rethrow(void); -JL_DLLEXPORT void JL_NORETURN jl_sig_throw(void); JL_DLLEXPORT void JL_NORETURN jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED); JL_DLLEXPORT void JL_NORETURN jl_no_exc_handler(jl_value_t *e, jl_task_t *ct); JL_DLLEXPORT JL_CONST_FUNC jl_gcframe_t **(jl_get_pgcstack)(void) JL_GLOBALLY_ROOTED JL_NOTSAFEPOINT; @@ -2579,6 +2589,11 @@ JL_DLLEXPORT int jl_generating_output(void) JL_NOTSAFEPOINT; #define JL_OPTIONS_USE_PKGIMAGES_YES 1 #define JL_OPTIONS_USE_PKGIMAGES_NO 0 +#define JL_TRIM_NO 0 +#define JL_TRIM_SAFE 1 +#define JL_TRIM_UNSAFE 2 +#define JL_TRIM_UNSAFE_WARN 3 + // Version information #include // Generated file @@ -2626,10 +2641,12 @@ typedef struct { int gcstack_arg; // Pass the ptls value as an argument with swiftself int use_jlplt; // Whether to use the Julia PLT mechanism or emit symbols directly + int trim; // can we emit dynamic dispatches? // Cache access. Default: jl_rettype_inferred_native. jl_codeinstance_lookup_t lookup; } jl_cgparams_t; extern JL_DLLEXPORT int jl_default_debug_info_kind; +extern JL_DLLEXPORT jl_cgparams_t jl_default_cgparams; typedef struct { int emit_metadata; diff --git a/src/julia_internal.h b/src/julia_internal.h index 652aae54860b5..20d90fede3d5e 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -13,6 +13,7 @@ #include "support/strtod.h" #include "gc-alloc-profiler.h" #include "support/rle.h" +#include #include #include #include @@ -663,8 +664,9 @@ JL_DLLEXPORT jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t JL_DLLEXPORT jl_code_instance_t *jl_get_method_inferred( jl_method_instance_t *mi JL_PROPAGATES_ROOT, jl_value_t *rettype, size_t min_world, size_t max_world, jl_debuginfo_t *edges); -jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT); +JL_DLLEXPORT jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT); JL_DLLEXPORT void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; +JL_DLLEXPORT jl_method_instance_t *jl_method_match_to_mi(jl_method_match_t *match, size_t world, size_t min_valid, size_t max_valid, int mt_cache); JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_uninit(jl_method_instance_t *mi, jl_value_t *owner); JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst( @@ -682,6 +684,7 @@ JL_DLLEXPORT const char *jl_debuginfo_name(jl_value_t *func) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_compile_method_instance(jl_method_instance_t *mi, jl_tupletype_t *types, size_t world); JL_DLLEXPORT int jl_compile_hint(jl_tupletype_t *types); +JL_DLLEXPORT int jl_add_entrypoint(jl_tupletype_t *types); jl_code_info_t *jl_code_for_interpreter(jl_method_instance_t *lam JL_PROPAGATES_ROOT, size_t world); jl_value_t *jl_code_or_ci_for_interpreter(jl_method_instance_t *lam JL_PROPAGATES_ROOT, size_t world); int jl_code_requires_compiler(jl_code_info_t *src, int include_force_compile); @@ -704,7 +707,7 @@ int 
jl_valid_type_param(jl_value_t *v); JL_DLLEXPORT jl_value_t *jl_apply_2va(jl_value_t *f, jl_value_t **args, uint32_t nargs); -void JL_NORETURN jl_method_error(jl_function_t *f, jl_value_t **args, size_t na, size_t world); +void JL_NORETURN jl_method_error(jl_value_t *F, jl_value_t **args, size_t na, size_t world); JL_DLLEXPORT jl_value_t *jl_get_exceptionf(jl_datatype_t *exception_type, const char *fmt, ...); JL_DLLEXPORT void jl_typeassert(jl_value_t *x, jl_value_t *t); @@ -769,6 +772,7 @@ jl_unionall_t *jl_rename_unionall(jl_unionall_t *u); JL_DLLEXPORT jl_value_t *jl_unwrap_unionall(jl_value_t *v JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_rewrap_unionall(jl_value_t *t, jl_value_t *u); JL_DLLEXPORT jl_value_t *jl_rewrap_unionall_(jl_value_t *t, jl_value_t *u); +jl_value_t *jl_substitute_datatype(jl_value_t *t, jl_datatype_t *x, jl_datatype_t *y); int jl_count_union_components(jl_value_t *v); JL_DLLEXPORT jl_value_t *jl_nth_union_component(jl_value_t *v JL_PROPAGATES_ROOT, int i) JL_NOTSAFEPOINT; int jl_find_union_component(jl_value_t *haystack, jl_value_t *needle, unsigned *nth) JL_NOTSAFEPOINT; @@ -851,6 +855,7 @@ extern htable_t jl_current_modules JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT jl_module_t *jl_precompile_toplevel_module JL_GLOBALLY_ROOTED; extern jl_genericmemory_t *jl_global_roots_list JL_GLOBALLY_ROOTED; extern jl_genericmemory_t *jl_global_roots_keyset JL_GLOBALLY_ROOTED; +extern arraylist_t *jl_entrypoint_mis; JL_DLLEXPORT int jl_is_globally_rooted(jl_value_t *val JL_MAYBE_UNROOTED) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_as_global_root(jl_value_t *val, int insert) JL_GLOBALLY_ROOTED; @@ -946,12 +951,31 @@ STATIC_INLINE jl_ptr_kind_union_t jl_walk_binding_inplace(jl_binding_t **bnd, jl } #endif +STATIC_INLINE int is10digit(char c) JL_NOTSAFEPOINT +{ + return (c >= '0' && c <= '9'); +} + STATIC_INLINE int is_anonfn_typename(char *name) { if (name[0] != '#' || name[1] == '#') return 0; char *other = strrchr(name, '#'); - return other > &name[1] && other[1] > '0' && other[1] <= '9'; + return other > &name[1] && is10digit(other[1]); +} + +// Returns true for typenames of anonymous functions that have been canonicalized (i.e. +// we mangled the name of the outermost enclosing function in their name).
+STATIC_INLINE int is_canonicalized_anonfn_typename(char *name) JL_NOTSAFEPOINT +{ + char *delim = strchr(&name[1], '#'); + if (delim == NULL) + return 0; + if (delim[1] != '#') + return 0; + if (!is10digit(delim[2])) + return 0; + return 1; } // Each tuple can exist in one of 4 Vararg states: @@ -1147,6 +1171,12 @@ JL_DLLEXPORT jl_code_instance_t *jl_cache_uninferred(jl_method_instance_t *mi, j JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_for_uninferred(jl_method_instance_t *mi, jl_code_info_t *src); JL_DLLEXPORT extern jl_value_t *(*const jl_rettype_inferred_addr)(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t min_world, size_t max_world) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_force_trace_compile_timing_enable(void); +JL_DLLEXPORT void jl_force_trace_compile_timing_disable(void); + +JL_DLLEXPORT void jl_force_trace_dispatch_enable(void); +JL_DLLEXPORT void jl_force_trace_dispatch_disable(void); + uint32_t jl_module_next_counter(jl_module_t *m) JL_NOTSAFEPOINT; jl_tupletype_t *arg_type_tuple(jl_value_t *arg1, jl_value_t **args, size_t nargs); @@ -1881,7 +1911,7 @@ JL_DLLIMPORT void jl_get_function_id(void *native_code, jl_code_instance_t *ncod int32_t *func_idx, int32_t *specfunc_idx); JL_DLLIMPORT void jl_register_fptrs(uint64_t image_base, const struct _jl_image_fptrs_t *fptrs, jl_method_instance_t **linfos, size_t n); - +JL_DLLIMPORT void jl_get_llvm_mis(void *native_code, arraylist_t* MIs); JL_DLLIMPORT void jl_init_codegen(void); JL_DLLIMPORT void jl_teardown_codegen(void) JL_NOTSAFEPOINT; JL_DLLIMPORT int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index e56ff5edd6176..b697a0bf030ed 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -18,6 +18,8 @@ extern "C" { JL_DLLEXPORT int16_t jl_threadid(void); JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT; +JL_DLLEXPORT uint64_t jl_get_ptls_rng(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_set_ptls_rng(uint64_t new_seed) JL_NOTSAFEPOINT; // JULIA_ENABLE_THREADING may be controlled by altering JULIA_THREADS in Make.user @@ -187,6 +189,9 @@ typedef struct _jl_tls_states_t { // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). 
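Since the `julia.h` hunk above folds the old `precompiled` byte into a `flags` bitfield on `jl_method_instance_t` (bit 0 for explicit `precompile(...)` requests, bit 1 for dynamic dispatch, per the `JL_MI_FLAGS_MASK_*` masks), here is a hedged sketch of how such a field is queried and updated. The helper names are invented, and a real writer would use an atomic read-modify-write rather than the load/store pair shown.

```c
#include <julia.h>

// Illustrative helpers only; the names are hypothetical, not part of the patch.
static int mi_was_precompiled(jl_method_instance_t *mi)
{
    return (jl_atomic_load_relaxed(&mi->flags) & JL_MI_FLAGS_MASK_PRECOMPILED) != 0;
}

static void mi_mark_dispatched(jl_method_instance_t *mi)
{
    // the runtime would want an atomic fetch-or here; a plain load/store pair is
    // shown to stay within the atomics helpers that already appear in this patch
    uint8_t flags = jl_atomic_load_relaxed(&mi->flags);
    jl_atomic_store_relaxed(&mi->flags, flags | JL_MI_FLAGS_MASK_DISPATCHED);
}
```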
struct _jl_value_t *previous_exception; +#ifdef _OS_DARWIN_ + jl_jmp_buf *volatile safe_restore; +#endif // currently-held locks, to be released when an exception is thrown small_arraylist_t locks; diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp index 5984ad55d221c..188955fd50972 100644 --- a/src/llvm-alloc-opt.cpp +++ b/src/llvm-alloc-opt.cpp @@ -770,26 +770,7 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF user->replaceUsesOfWith(orig_i, replace); } else if (isa(user) || isa(user)) { - #if JL_LLVM_VERSION >= 170000 - #ifndef JL_NDEBUG - auto cast_t = PointerType::get(user->getType(), new_i->getType()->getPointerAddressSpace()); - Type *new_t = new_i->getType(); - assert(cast_t == new_t); - #endif - auto replace_i = new_i; - #else - auto cast_t = PointerType::getWithSamePointeeType(cast(user->getType()), new_i->getType()->getPointerAddressSpace()); - auto replace_i = new_i; - Type *new_t = new_i->getType(); - if (cast_t != new_t) { - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(cast_t->getContext().supportsTypedPointers()); - replace_i = new BitCastInst(replace_i, cast_t, "", user); - replace_i->setDebugLoc(user->getDebugLoc()); - replace_i->takeName(user); - } - #endif - push_frame(user, replace_i); + push_frame(user, new_i); } else if (auto gep = dyn_cast(user)) { SmallVector IdxOperands(gep->idx_begin(), gep->idx_end()); diff --git a/src/llvm-codegen-shared.h b/src/llvm-codegen-shared.h index 242dab021f101..a99e18f3e3762 100644 --- a/src/llvm-codegen-shared.h +++ b/src/llvm-codegen-shared.h @@ -125,7 +125,6 @@ struct CountTrackedPointers { CountTrackedPointers(llvm::Type *T, bool ignore_loaded=false); }; -unsigned TrackWithShadow(llvm::Value *Src, llvm::Type *T, bool isptr, llvm::Value *Dst, llvm::Type *DTy, llvm::IRBuilder<> &irbuilder); llvm::SmallVector ExtractTrackedValues(llvm::Value *Src, llvm::Type *STy, bool isptr, llvm::IRBuilder<> &irbuilder, llvm::ArrayRef perm_offsets={}); static inline void llvm_dump(llvm::Value *v) @@ -187,45 +186,39 @@ static inline llvm::Instruction *tbaa_decorate(llvm::MDNode *md, llvm::Instructi } // Get PTLS through current task. -static inline llvm::Value *get_current_task_from_pgcstack(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *pgcstack) +static inline llvm::Value *get_current_task_from_pgcstack(llvm::IRBuilder<> &builder, llvm::Value *pgcstack) { using namespace llvm; - auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); + auto i8 = builder.getInt8Ty(); const int pgcstack_offset = offsetof(jl_task_t, gcstack); - return builder.CreateInBoundsGEP( - T_pjlvalue, pgcstack, - ConstantInt::get(T_size, -(pgcstack_offset / sizeof(void *))), - "current_task"); + return builder.CreateConstInBoundsGEP1_32(i8, pgcstack, -pgcstack_offset, "current_task"); } // Get PTLS through current task. 
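The `llvm-codegen-shared.h` change just above stops scaling GEP indices by a pointee type and instead emits byte offsets against `i8`, which is the natural form with opaque pointers. In plain C the addresses being computed are just `offsetof` arithmetic; the fragment below restates that for orientation (illustration only, the `example` function is not part of the patch).

```c
#include <stddef.h>
#include <julia.h>

void example(void)
{
    jl_task_t *ct = jl_current_task;
    // field address "task -> gcstack": an i8 GEP of +offsetof(jl_task_t, gcstack)
    jl_gcframe_t **pgcstack = (jl_gcframe_t**)((char*)ct + offsetof(jl_task_t, gcstack));
    // the inverse used by get_current_task_from_pgcstack: an i8 GEP of -offsetof
    jl_task_t *recovered = (jl_task_t*)((char*)pgcstack - offsetof(jl_task_t, gcstack));
    (void)recovered;
}
```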
-static inline llvm::Value *get_current_ptls_from_task(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *current_task, llvm::MDNode *tbaa) +static inline llvm::Value *get_current_ptls_from_task(llvm::IRBuilder<> &builder, llvm::Value *current_task, llvm::MDNode *tbaa) { using namespace llvm; - auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); + auto i8 = builder.getInt8Ty(); + auto T_ptr = builder.getPtrTy(); const int ptls_offset = offsetof(jl_task_t, ptls); - llvm::Value *pptls = builder.CreateInBoundsGEP( - T_pjlvalue, current_task, - ConstantInt::get(T_size, ptls_offset / sizeof(void *)), - "ptls_field"); - LoadInst *ptls_load = builder.CreateAlignedLoad(T_pjlvalue, - pptls, Align(sizeof(void *)), "ptls_load"); + llvm::Value *pptls = builder.CreateConstInBoundsGEP1_32(i8, current_task, ptls_offset, "ptls_field"); + LoadInst *ptls_load = builder.CreateAlignedLoad(T_ptr, pptls, Align(sizeof(void *)), "ptls_load"); // Note: Corresponding store (`t->ptls = ptls`) happens in `ctx_switch` of tasks.c. tbaa_decorate(tbaa, ptls_load); return ptls_load; } // Get signal page through current task. -static inline llvm::Value *get_current_signal_page_from_ptls(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::MDNode *tbaa) +static inline llvm::Value *get_current_signal_page_from_ptls(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) { using namespace llvm; // return builder.CreateCall(prepare_call(reuse_signal_page_func)); - auto T_psize = T_size->getPointerTo(); - int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); - llvm::Value *psafepoint = builder.CreateInBoundsGEP( - T_psize, ptls, ConstantInt::get(T_size, nthfield)); + auto T_ptr = builder.getPtrTy(); + auto i8 = builder.getInt8Ty(); + int nthfield = offsetof(jl_tls_states_t, safepoint); + llvm::Value *psafepoint = builder.CreateConstInBoundsGEP1_32(i8, ptls, nthfield); LoadInst *ptls_load = builder.CreateAlignedLoad( - T_psize, psafepoint, Align(sizeof(void *)), "safepoint"); + T_ptr, psafepoint, Align(sizeof(void *)), "safepoint"); tbaa_decorate(tbaa, ptls_load); return ptls_load; } @@ -239,7 +232,7 @@ static inline void emit_signal_fence(llvm::IRBuilder<> &builder) static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::MDNode *tbaa, bool final = false) { using namespace llvm; - llvm::Value *signal_page = get_current_signal_page_from_ptls(builder, T_size, ptls, tbaa); + llvm::Value *signal_page = get_current_signal_page_from_ptls(builder, ptls, tbaa); emit_signal_fence(builder); Module *M = builder.GetInsertBlock()->getModule(); LLVMContext &C = builder.getContext(); @@ -250,8 +243,7 @@ static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Type *T_s else { Function *F = M->getFunction("julia.safepoint"); if (!F) { - auto T_psize = T_size->getPointerTo(); - FunctionType *FT = FunctionType::get(Type::getVoidTy(C), {T_psize}, false); + FunctionType *FT = FunctionType::get(Type::getVoidTy(C), {T_size->getPointerTo()}, false); F = Function::Create(FT, Function::ExternalLinkage, "julia.safepoint", M); #if JL_LLVM_VERSION >= 160000 F->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); @@ -268,8 +260,8 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T { using namespace llvm; Type *T_int8 = state->getType(); - Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = 
builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); + unsigned offset = offsetof(jl_tls_states_t, gc_state); + Value *gc_state = builder.CreateConstInBoundsGEP1_32(T_int8, ptls, offset, "gc_state"); if (old_state == nullptr) { old_state = builder.CreateLoad(T_int8, gc_state); cast(old_state)->setOrdering(AtomicOrdering::Monotonic); diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 0605098bec361..76dcd944890ab 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -202,9 +202,9 @@ bool FinalLowerGC::runOnFunction(Function &F) } while (0) LOWER_INTRINSIC(newGCFrame, lowerNewGCFrame); + LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot); LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame); LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame); - LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot); LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes); LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot); LOWER_INTRINSIC(safepoint, lowerSafepoint); diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index cb485751d407b..d33567e887118 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -312,7 +312,6 @@ struct State { SmallVector> CalleeRoots; // We don't bother doing liveness on Allocas that were not mem2reg'ed. // they just get directly sunk into the root array. - SmallVector Allocas; DenseMap ArrayAllocas; DenseMap ShadowAllocas; SmallVector, 0> TrackedStores; @@ -332,9 +331,9 @@ struct LateLowerGCFrame: private JuliaPassContext { void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector()); - void NoteUse(State &S, BBState &BBS, Value *V, LargeSparseBitVector &Uses); - void NoteUse(State &S, BBState &BBS, Value *V) { - NoteUse(S, BBS, V, BBS.UpExposedUses); + void NoteUse(State &S, BBState &BBS, Value *V, LargeSparseBitVector &Uses, Function &F); + void NoteUse(State &S, BBState &BBS, Value *V, Function &F) { + NoteUse(S, BBS, V, BBS.UpExposedUses, F); } void LiftPhi(State &S, PHINode *Phi); @@ -348,7 +347,7 @@ struct LateLowerGCFrame: private JuliaPassContext { SmallVector NumberAll(State &S, Value *V); SmallVector NumberAllBase(State &S, Value *Base); - void NoteOperandUses(State &S, BBState &BBS, User &UI); + void NoteOperandUses(State &S, BBState &BBS, Instruction &UI); void MaybeTrackDst(State &S, MemTransferInst *MI); void MaybeTrackStore(State &S, StoreInst *I); State LocalScan(Function &F); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index e08f08860dfaf..1d390a5115207 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -350,15 +350,7 @@ void LateLowerGCFrame::LiftSelect(State &S, SelectInst *SI) { ConstantInt::get(Type::getInt32Ty(Cond->getContext()), i), "", SI); } - #if JL_LLVM_VERSION >= 170000 assert(FalseElem->getType() == TrueElem->getType()); - #else - if (FalseElem->getType() != TrueElem->getType()) { - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(FalseElem->getContext().supportsTypedPointers()); - FalseElem = new BitCastInst(FalseElem, TrueElem->getType(), "", SI); - } - #endif SelectInst *SelectBase = SelectInst::Create(Cond, TrueElem, FalseElem, "gclift", SI); int Number = ++S.MaxPtrNumber; S.AllPtrNumbering[SelectBase] = Number; @@ -427,33 +419,7 @@ void LateLowerGCFrame::LiftPhi(State &S, PHINode *Phi) { BaseElem = Base; else BaseElem = IncomingBases[i]; - #if JL_LLVM_VERSION >= 170000 
assert(BaseElem->getType() == T_prjlvalue); - #else - if (BaseElem->getType() != T_prjlvalue) { - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(BaseElem->getContext().supportsTypedPointers()); - auto &remap = CastedRoots[i][BaseElem]; - if (!remap) { - if (auto constant = dyn_cast(BaseElem)) { - remap = ConstantExpr::getBitCast(constant, T_prjlvalue, ""); - } else { - Instruction *InsertBefore; - if (auto arg = dyn_cast(BaseElem)) { - InsertBefore = &*arg->getParent()->getEntryBlock().getFirstInsertionPt(); - } else { - assert(isa(BaseElem) && "Unknown value type detected!"); - InsertBefore = cast(BaseElem)->getNextNonDebugInstruction(); - } - while (isa(InsertBefore)) { - InsertBefore = InsertBefore->getNextNonDebugInstruction(); - } - remap = new BitCastInst(BaseElem, T_prjlvalue, "", InsertBefore); - } - } - BaseElem = remap; - } - #endif lift->addIncoming(BaseElem, IncomingBB); } } @@ -729,8 +695,15 @@ static int NoteSafepoint(State &S, BBState &BBS, CallInst *CI, SmallVectorImpl(V->getType())) { +// if (isSpecialPtr(V->getType())) +// if (isa(V) && !isa(V)) +// F.dump(); +// } +//#endif if (isa(V)) return; if (isa(V->getType())) { @@ -752,9 +725,9 @@ void LateLowerGCFrame::NoteUse(State &S, BBState &BBS, Value *V, LargeSparseBitV } } -void LateLowerGCFrame::NoteOperandUses(State &S, BBState &BBS, User &UI) { +void LateLowerGCFrame::NoteOperandUses(State &S, BBState &BBS, Instruction &UI) { for (Use &U : UI.operands()) { - NoteUse(S, BBS, U); + NoteUse(S, BBS, U, *UI.getFunction()); } } @@ -1411,7 +1384,7 @@ State LateLowerGCFrame::LocalScan(Function &F) { unsigned nIncoming = Phi->getNumIncomingValues(); for (unsigned i = 0; i < nIncoming; ++i) { BBState &IncomingBBS = S.BBStates[Phi->getIncomingBlock(i)]; - NoteUse(S, IncomingBBS, Phi->getIncomingValue(i), IncomingBBS.PhiOuts); + NoteUse(S, IncomingBBS, Phi->getIncomingValue(i), IncomingBBS.PhiOuts, F); } } else if (tracked.count) { // We need to insert extra phis for the GC roots @@ -1437,7 +1410,7 @@ State LateLowerGCFrame::LocalScan(Function &F) { } else if (auto *AI = dyn_cast(&I)) { Type *ElT = AI->getAllocatedType(); if (AI->isStaticAlloca() && isa(ElT) && ElT->getPointerAddressSpace() == AddressSpace::Tracked) { - S.Allocas.push_back(AI); + S.ArrayAllocas[AI] = cast(AI->getArraySize())->getZExtValue(); } } } @@ -1528,21 +1501,17 @@ SmallVector ExtractTrackedValues(Value *Src, Type *STy, bool isptr, I return Ptrs; } -unsigned TrackWithShadow(Value *Src, Type *STy, bool isptr, Value *Dst, Type *DTy, IRBuilder<> &irbuilder) { - auto Ptrs = ExtractTrackedValues(Src, STy, isptr, irbuilder); - for (unsigned i = 0; i < Ptrs.size(); ++i) { - Value *Elem = Ptrs[i];// Dst has type `[n x {}*]*` - Value *Slot = irbuilder.CreateConstInBoundsGEP2_32(DTy, Dst, 0, i); - #if JL_LLVM_VERSION < 170000 - assert(cast(Dst->getType())->isOpaqueOrPointeeTypeMatches(DTy)); - #endif - StoreInst *shadowStore = irbuilder.CreateAlignedStore(Elem, Slot, Align(sizeof(void*))); - shadowStore->setOrdering(AtomicOrdering::NotAtomic); - // TODO: shadowStore->setMetadata(LLVMContext::MD_tbaa, tbaa_gcframe); - } - return Ptrs.size(); -} - +//static unsigned TrackWithShadow(Value *Src, Type *STy, bool isptr, Value *Dst, IRBuilder<> &irbuilder) { +// auto Ptrs = ExtractTrackedValues(Src, STy, isptr, irbuilder); +// for (unsigned i = 0; i < Ptrs.size(); ++i) { +// Value *Elem = Ptrs[i]; +// Value *Slot = irbuilder.CreateConstInBoundsGEP1_32(irbuilder.getInt8Ty(), Dst, i * sizeof(void*)); +// StoreInst *shadowStore = 
irbuilder.CreateAlignedStore(Elem, Slot, Align(sizeof(void*))); +// shadowStore->setOrdering(AtomicOrdering::NotAtomic); +// // TODO: shadowStore->setMetadata(LLVMContext::MD_tbaa, tbaa_gcframe); +// } +// return Ptrs.size(); +//} // turn a memcpy into a set of loads void LateLowerGCFrame::MaybeTrackDst(State &S, MemTransferInst *MI) { @@ -2133,7 +2102,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // the type tag. (Note that if the size is not a constant, it will call // gc_alloc_obj, and will redundantly set the tag.) auto allocBytesIntrinsic = getOrDeclare(jl_intrinsics::GCAllocBytes); - auto ptls = get_current_ptls_from_task(builder, T_size, CI->getArgOperand(0), tbaa_gcframe); + auto ptls = get_current_ptls_from_task(builder, CI->getArgOperand(0), tbaa_gcframe); auto newI = builder.CreateCall( allocBytesIntrinsic, { @@ -2319,15 +2288,7 @@ void LateLowerGCFrame::PlaceGCFrameStore(State &S, unsigned R, unsigned MinColor // Pointee types don't have semantics, so the optimizer is // free to rewrite them if convenient. We need to change // it back here for the store. - #if JL_LLVM_VERSION >= 170000 assert(Val->getType() == T_prjlvalue); - #else - if (Val->getType() != T_prjlvalue) { - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(Val->getContext().supportsTypedPointers()); - Val = new BitCastInst(Val, T_prjlvalue, "", InsertBefore); - } - #endif new StoreInst(Val, slotAddress, InsertBefore); } @@ -2366,7 +2327,7 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St MaxColor = C; // Insert instructions for the actual gc frame - if (MaxColor != -1 || !S.Allocas.empty() || !S.ArrayAllocas.empty() || !S.TrackedStores.empty()) { + if (MaxColor != -1 || !S.ArrayAllocas.empty() || !S.TrackedStores.empty()) { // Create and push a GC frame. 
auto gcframe = CallInst::Create( getOrDeclare(jl_intrinsics::newGCFrame), @@ -2379,6 +2340,43 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St {gcframe, ConstantInt::get(T_int32, 0)}); pushGcframe->insertAfter(pgcstack); + // we don't run memsetopt after this, so run a basic approximation of it + // that removes any redundant memset calls in the prologue since getGCFrameSlot already includes the null store + Instruction *toerase = nullptr; + for (auto &I : F->getEntryBlock()) { + if (toerase) + toerase->eraseFromParent(); + toerase = nullptr; + Value *ptr; + Value *value; + bool isvolatile; + if (auto *SI = dyn_cast(&I)) { + ptr = SI->getPointerOperand(); + value = SI->getValueOperand(); + isvolatile = SI->isVolatile(); + } + else if (auto *MSI = dyn_cast(&I)) { + ptr = MSI->getDest(); + value = MSI->getValue(); + isvolatile = MSI->isVolatile(); + } + else { + continue; + } + ptr = ptr->stripInBoundsOffsets(); + AllocaInst *AI = dyn_cast(ptr); + if (isa(ptr)) + break; + if (!S.ArrayAllocas.count(AI)) + continue; + if (isvolatile || !isa(value) || !cast(value)->isNullValue()) + break; // stop once we reach a pointer operation that couldn't be analyzed or isn't a null store + toerase = &I; + } + if (toerase) + toerase->eraseFromParent(); + toerase = nullptr; + // Replace Allocas unsigned AllocaSlot = 2; // first two words are metadata auto replace_alloca = [this, gcframe, &AllocaSlot, T_int32](AllocaInst *&AI) { @@ -2407,27 +2405,11 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St for (CallInst *II : ToDelete) { II->eraseFromParent(); } - #if JL_LLVM_VERSION >= 170000 assert(slotAddress->getType() == AI->getType()); - #else - if (slotAddress->getType() != AI->getType()) { - // If we're replacing an ArrayAlloca, the pointer element type may need to be fixed up - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(slotAddress->getContext().supportsTypedPointers()); - auto BCI = new BitCastInst(slotAddress, AI->getType()); - BCI->insertAfter(slotAddress); - slotAddress = BCI; - } - #endif AI->replaceAllUsesWith(slotAddress); AI->eraseFromParent(); AI = NULL; }; - for (AllocaInst *AI : S.Allocas) { - auto ns = cast(AI->getArraySize())->getZExtValue(); - replace_alloca(AI); - AllocaSlot += ns; - } for (auto AI : S.ArrayAllocas) { replace_alloca(AI.first); AllocaSlot += AI.second; @@ -2443,15 +2425,7 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St slotAddress->insertAfter(gcframe); auto ValExpr = std::make_pair(Base, isa(Base->getType()) ? 
-1 : i); auto Elem = MaybeExtractScalar(S, ValExpr, SI); - #if JL_LLVM_VERSION >= 170000 assert(Elem->getType() == T_prjlvalue); - #else - if (Elem->getType() != T_prjlvalue) { - // Shouldn't get here when using opaque pointers, so the new BitCastInst is fine - assert(Elem->getContext().supportsTypedPointers()); - Elem = new BitCastInst(Elem, T_prjlvalue, "", SI); - } - #endif //auto Idxs = ArrayRef(Tracked[i]); //Value *Elem = ExtractScalar(Base, true, Idxs, SI); Value *shadowStore = new StoreInst(Elem, slotAddress, SI); diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index 9e49aa5ba2f39..488dd46cade21 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -128,7 +128,7 @@ Instruction *LowerPTLS::emit_pgcstack_tp(Value *offset, Instruction *insertBefor offset = ConstantInt::getSigned(T_size, jl_tls_offset); auto tp = InlineAsm::get(FunctionType::get(PointerType::get(builder.getContext(), 0), false), asm_str, "=r", false); tls = builder.CreateCall(tp, {}, "thread_ptr"); - tls = builder.CreateGEP(Type::getInt8Ty(builder.getContext()), tls, {offset}, "tls_ppgcstack"); + tls = builder.CreateInBoundsGEP(Type::getInt8Ty(builder.getContext()), tls, {offset}, "tls_ppgcstack"); } return builder.CreateLoad(T_pppjlvalue, tls, "tls_pgcstack"); } @@ -191,7 +191,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, builder.SetInsertPoint(fastTerm->getParent()); fastTerm->removeFromParent(); MDNode *tbaa = tbaa_gcframe; - Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_ptls_from_task(builder, T_size, get_current_task_from_pgcstack(builder, T_size, pgcstack), tbaa), true); + Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa), true); builder.Insert(fastTerm); phi->addIncoming(pgcstack, fastTerm->getParent()); // emit pre-return cleanup @@ -203,7 +203,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, for (auto &BB : *pgcstack->getParent()->getParent()) { if (isa(BB.getTerminator())) { builder.SetInsertPoint(BB.getTerminator()); - emit_gc_unsafe_leave(builder, T_size, get_current_ptls_from_task(builder, T_size, get_current_task_from_pgcstack(builder, T_size, phi), tbaa), last_gc_state, true); + emit_gc_unsafe_leave(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state, true); } } } diff --git a/src/method.c b/src/method.c index d4457b1549353..6aba60e7fe12c 100644 --- a/src/method.c +++ b/src/method.c @@ -629,7 +629,7 @@ JL_DLLEXPORT jl_method_instance_t *jl_new_method_instance_uninit(void) mi->backedges = NULL; jl_atomic_store_relaxed(&mi->cache, NULL); mi->cache_with_orig = 0; - jl_atomic_store_relaxed(&mi->precompiled, 0); + jl_atomic_store_relaxed(&mi->flags, 0); return mi; } diff --git a/src/module.c b/src/module.c index 7f03fc7e66a30..f4da7e1e994de 100644 --- a/src/module.c +++ b/src/module.c @@ -52,6 +52,8 @@ JL_DLLEXPORT jl_module_t *jl_new_module_(jl_sym_t *name, jl_module_t *parent, ui m->compile = -1; m->infer = -1; m->max_methods = -1; + m->file = name; // Using the name as a placeholder is better than nothing + m->line = 0; m->hash = parent == NULL ? 
bitmix(name->hash, jl_module_type->hash) : bitmix(name->hash, parent->hash); JL_MUTEX_INIT(&m->lock, "module->lock"); @@ -384,7 +386,7 @@ static inline jl_module_t *module_usings_getidx(jl_module_t *m JL_PROPAGATES_ROO static int eq_bindings(jl_binding_partition_t *owner, jl_binding_t *alias, size_t world) { jl_ptr_kind_union_t owner_pku = jl_atomic_load_relaxed(&owner->restriction); - assert(decode_restriction_kind(owner_pku) == BINDING_KIND_GLOBAL || + assert(decode_restriction_kind(owner_pku) == BINDING_KIND_GLOBAL || decode_restriction_kind(owner_pku) == BINDING_KIND_DECLARED || jl_bkind_is_some_constant(decode_restriction_kind(owner_pku))); jl_binding_partition_t *alias_bpart = jl_get_binding_partition(alias, world); if (owner == alias_bpart) @@ -419,7 +421,7 @@ static jl_binding_t *using_resolve_binding(jl_module_t *m JL_PROPAGATES_ROOT, jl continue; jl_binding_partition_t *tempbpart = jl_get_binding_partition(tempb, jl_current_task->world_age); jl_ptr_kind_union_t tempb_pku = jl_atomic_load_relaxed(&tempbpart->restriction); - assert(decode_restriction_kind(tempb_pku) == BINDING_KIND_GLOBAL || jl_bkind_is_some_constant(decode_restriction_kind(tempb_pku))); + assert(decode_restriction_kind(tempb_pku) == BINDING_KIND_GLOBAL || decode_restriction_kind(tempb_pku) == BINDING_KIND_DECLARED || jl_bkind_is_some_constant(decode_restriction_kind(tempb_pku))); (void)tempb_pku; if (bpart != NULL && !tempb->deprecated && !b->deprecated && !eq_bindings(tempbpart, b, jl_current_task->world_age)) { if (warn) { @@ -663,7 +665,7 @@ static void module_import_(jl_module_t *to, jl_module_t *from, jl_sym_t *asname, else { jl_binding_partition_t *bpart = jl_get_binding_partition(b, jl_current_task->world_age); jl_ptr_kind_union_t pku = jl_atomic_load_relaxed(&bpart->restriction); - assert(decode_restriction_kind(pku) == BINDING_KIND_GLOBAL || jl_bkind_is_some_constant(decode_restriction_kind(pku))); + assert(decode_restriction_kind(pku) == BINDING_KIND_GLOBAL || decode_restriction_kind(pku) == BINDING_KIND_DECLARED || jl_bkind_is_some_constant(decode_restriction_kind(pku))); (void)pku; if (b->deprecated) { if (jl_get_binding_value(b) == jl_nothing) { @@ -856,7 +858,7 @@ JL_DLLEXPORT int jl_binding_resolved_p(jl_module_t *m, jl_sym_t *var) return kind == BINDING_KIND_DECLARED || !jl_bkind_is_some_guard(kind); } -static uint_t bindingkey_hash(size_t idx, jl_value_t *data) +uint_t bindingkey_hash(size_t idx, jl_value_t *data) { jl_binding_t *b = (jl_binding_t*)jl_svecref(data, idx); // This must always happen inside the lock jl_sym_t *var = b->globalref->name; @@ -1179,6 +1181,14 @@ jl_module_t *jl_module_root(jl_module_t *m) } } +JL_DLLEXPORT jl_sym_t *jl_module_getloc(jl_module_t *m, int32_t *line) +{ + if (line) { + *line = m->line; + } + return m->file; +} + JL_DLLEXPORT jl_uuid_t jl_module_build_id(jl_module_t *m) { return m->build_id; } JL_DLLEXPORT jl_uuid_t jl_module_uuid(jl_module_t* m) { return m->uuid; } diff --git a/src/precompile.c b/src/precompile.c index c40e867ea699e..c21cf5367fba6 100644 --- a/src/precompile.c +++ b/src/precompile.c @@ -39,9 +39,17 @@ void write_srctext(ios_t *f, jl_array_t *udeps, int64_t srctextpos) { static jl_value_t *replace_depot_func = NULL; if (!replace_depot_func) replace_depot_func = jl_get_global(jl_base_module, jl_symbol("replace_depot_path")); + static jl_value_t *normalize_depots_func = NULL; + if (!normalize_depots_func) + normalize_depots_func = jl_get_global(jl_base_module, jl_symbol("normalize_depots_for_relocation")); ios_t srctext; - jl_value_t *deptuple = 
NULL; - JL_GC_PUSH2(&deptuple, &udeps); + jl_value_t *deptuple = NULL, *depots = NULL; + JL_GC_PUSH3(&deptuple, &udeps, &depots); + jl_task_t *ct = jl_current_task; + size_t last_age = ct->world_age; + ct->world_age = jl_atomic_load_acquire(&jl_world_counter); + depots = jl_apply(&normalize_depots_func, 1); + ct->world_age = last_age; for (size_t i = 0; i < len; i++) { deptuple = jl_array_ptr_ref(udeps, i); jl_value_t *depmod = jl_fieldref(deptuple, 0); // module @@ -60,13 +68,14 @@ void write_srctext(ios_t *f, jl_array_t *udeps, int64_t srctextpos) { } jl_value_t **replace_depot_args; - JL_GC_PUSHARGS(replace_depot_args, 2); + JL_GC_PUSHARGS(replace_depot_args, 3); replace_depot_args[0] = replace_depot_func; replace_depot_args[1] = abspath; + replace_depot_args[2] = depots; jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_atomic_load_acquire(&jl_world_counter); - jl_value_t *depalias = (jl_value_t*)jl_apply(replace_depot_args, 2); + jl_value_t *depalias = (jl_value_t*)jl_apply(replace_depot_args, 3); ct->world_age = last_age; JL_GC_POP(); @@ -116,14 +125,16 @@ JL_DLLEXPORT void jl_write_compiler_output(void) if (f) { jl_array_ptr_1d_push(jl_module_init_order, m); int setting = jl_get_module_compile((jl_module_t*)m); - if (setting != JL_OPTIONS_COMPILE_OFF && - setting != JL_OPTIONS_COMPILE_MIN) { + if ((setting != JL_OPTIONS_COMPILE_OFF && (jl_options.trim || + (setting != JL_OPTIONS_COMPILE_MIN)))) { // TODO: this would be better handled if moved entirely to jl_precompile // since it's a slightly duplication of effort jl_value_t *tt = jl_is_type(f) ? (jl_value_t*)jl_wrap_Type(f) : jl_typeof(f); JL_GC_PUSH1(&tt); tt = jl_apply_tuple_type_v(&tt, 1); jl_compile_hint((jl_tupletype_t*)tt); + if (jl_options.trim) + jl_add_entrypoint((jl_tupletype_t*)tt); JL_GC_POP(); } } @@ -188,6 +199,10 @@ JL_DLLEXPORT void jl_write_compiler_output(void) jl_printf(JL_STDERR, "\n ** incremental compilation may be broken for this module **\n\n"); } } + if (jl_options.trim) { + exit(0); // Some finalizers need to run and we've blown up the bindings table + // TODO: Is this still needed + } JL_GC_POP(); jl_gc_enable_finalizers(ct, 1); } diff --git a/src/precompile_utils.c b/src/precompile_utils.c index 5a4f599d1f0eb..a78d1e66dbb51 100644 --- a/src/precompile_utils.c +++ b/src/precompile_utils.c @@ -321,3 +321,83 @@ static void *jl_precompile_worklist(jl_array_t *worklist, jl_array_t *extext_met JL_GC_POP(); return native_code; } + +static int enq_ccallable_entrypoints_(jl_typemap_entry_t *def, void *closure) +{ + jl_method_t *m = def->func.method; + if (m->external_mt) + return 1; + if (m->ccallable) + jl_add_entrypoint((jl_tupletype_t*)jl_svecref(m->ccallable, 1)); + return 1; +} + +static int enq_ccallable_entrypoints(jl_methtable_t *mt, void *env) +{ + return jl_typemap_visitor(jl_atomic_load_relaxed(&mt->defs), enq_ccallable_entrypoints_, env); +} + +JL_DLLEXPORT void jl_add_ccallable_entrypoints(void) +{ + jl_foreach_reachable_mtable(enq_ccallable_entrypoints, NULL); +} + +static void *jl_precompile_trimmed(size_t world) +{ + // array of MethodInstances and ccallable aliases to include in the output + jl_array_t *m = jl_alloc_vec_any(0); + jl_value_t *ccallable = NULL; + JL_GC_PUSH2(&m, &ccallable); + jl_method_instance_t *mi; + while (1) + { + mi = (jl_method_instance_t*)arraylist_pop(jl_entrypoint_mis); + if (mi == NULL) + break; + assert(jl_is_method_instance(mi)); + + jl_array_ptr_1d_push(m, (jl_value_t*)mi); + ccallable = (jl_value_t *)mi->def.method->ccallable; + if 
(ccallable) + jl_array_ptr_1d_push(m, ccallable); + } + + jl_cgparams_t params = jl_default_cgparams; + params.trim = jl_options.trim; + void *native_code = jl_create_native(m, NULL, &params, 0, /* imaging */ 1, 0, + world); + JL_GC_POP(); + return native_code; +} + +static void jl_rebuild_methtables(arraylist_t* MIs, htable_t* mtables) +{ + size_t i; + for (i = 0; i < MIs->len; i++) { + jl_method_instance_t *mi = (jl_method_instance_t*)MIs->items[i]; + jl_method_t *m = mi->def.method; + jl_methtable_t *old_mt = jl_method_get_table(m); + if ((jl_value_t *)old_mt == jl_nothing) + continue; + jl_sym_t *name = old_mt->name; + if (!ptrhash_has(mtables, old_mt)) + ptrhash_put(mtables, old_mt, jl_new_method_table(name, m->module)); + jl_methtable_t *mt = (jl_methtable_t*)ptrhash_get(mtables, old_mt); + size_t world = jl_atomic_load_acquire(&jl_world_counter); + jl_value_t * lookup = jl_methtable_lookup(mt, m->sig, world); + // Check if the method is already in the new table, if not then insert it there + if (lookup == jl_nothing || (jl_method_t*)lookup != m) { + //TODO: should this be a function like unsafe_insert_method? + size_t min_world = jl_atomic_load_relaxed(&m->primary_world); + size_t max_world = jl_atomic_load_relaxed(&m->deleted_world); + jl_atomic_store_relaxed(&m->primary_world, ~(size_t)0); + jl_atomic_store_relaxed(&m->deleted_world, 1); + jl_typemap_entry_t *newentry = jl_method_table_add(mt, m, NULL); + jl_atomic_store_relaxed(&m->primary_world, min_world); + jl_atomic_store_relaxed(&m->deleted_world, max_world); + jl_atomic_store_relaxed(&newentry->min_world, min_world); + jl_atomic_store_relaxed(&newentry->max_world, max_world); + } + } + +} diff --git a/src/rtutils.c b/src/rtutils.c index a6a7fd5614de0..85a9be5e0b1da 100644 --- a/src/rtutils.c +++ b/src/rtutils.c @@ -269,10 +269,11 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_task_t *ct, jl_handler_t *eh) // `eh` may be not equal to `ct->eh`. See `jl_pop_handler` // This function should **NOT** have any safepoint before the ones at the // end. - sig_atomic_t old_defer_signal = ct->ptls->defer_signal; + jl_ptls_t ptls = ct->ptls; + sig_atomic_t old_defer_signal = ptls->defer_signal; ct->eh = eh->prev; ct->gcstack = eh->gcstack; - small_arraylist_t *locks = &ct->ptls->locks; + small_arraylist_t *locks = &ptls->locks; int unlocks = locks->len > eh->locks_len; if (unlocks) { for (size_t i = locks->len; i > eh->locks_len; i--) @@ -280,14 +281,26 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_task_t *ct, jl_handler_t *eh) locks->len = eh->locks_len; } ct->world_age = eh->world_age; - ct->ptls->defer_signal = eh->defer_signal; - int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state); + ptls->defer_signal = eh->defer_signal; + int8_t old_gc_state = jl_atomic_load_relaxed(&ptls->gc_state); if (old_gc_state != eh->gc_state) - jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state); + jl_atomic_store_release(&ptls->gc_state, eh->gc_state); if (!old_gc_state || !eh->gc_state) // it was or is unsafe now - jl_gc_safepoint_(ct->ptls); + jl_gc_safepoint_(ptls); + jl_value_t *exception = ptls->sig_exception; + if (exception) { + int8_t oldstate = jl_gc_unsafe_enter(ptls); + /* The temporary ptls->bt_data is rooted by special purpose code in the + GC. This exists only for the purpose of preserving bt_data until we + set ptls->bt_size=0 below.
*/ + jl_push_excstack(ct, &ct->excstack, exception, + ptls->bt_data, ptls->bt_size); + ptls->bt_size = 0; + ptls->sig_exception = NULL; + jl_gc_unsafe_leave(ptls, oldstate); + } if (old_defer_signal && !eh->defer_signal) - jl_sigint_safepoint(ct->ptls); + jl_sigint_safepoint(ptls); if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers) && unlocks && eh->locks_len == 0) { jl_gc_run_pending_finalizers(ct); diff --git a/src/scheduler.c b/src/scheduler.c index bd7da13aa42e3..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -84,15 +84,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) diff --git a/src/signals-mach.c b/src/signals-mach.c index c31b6d506b4e6..a939e4df71ae0 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -222,38 +222,92 @@ typedef arm_exception_state64_t host_exception_state_t; #define HOST_EXCEPTION_STATE_COUNT ARM_EXCEPTION_STATE64_COUNT #endif -static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state, - void (*fptr)(void)) +// create a fake function that describes the variable manipulations in jl_call_in_state +__attribute__((naked)) static void fake_stack_pop(void) { #ifdef _CPU_X86_64_ - uintptr_t rsp = state->__rsp; + __asm__ volatile ( + " .cfi_signal_frame\n" + " .cfi_def_cfa %rsp, 0\n" // CFA here uses %rsp directly + " .cfi_offset %rip, 0\n" // previous value of %rip at CFA + " .cfi_offset %rsp, 8\n" // previous value of %rsp at CFA + " nop\n" + ); #elif defined(_CPU_AARCH64_) - uintptr_t rsp = state->__sp; + __asm__ volatile ( + " .cfi_signal_frame\n" + " .cfi_def_cfa sp, 0\n" // use sp as fp here + " .cfi_offset lr, 0\n" + " .cfi_offset sp, 8\n" + // Anything else got smashed, since we didn't explicitly copy all of the + // state object to the stack (to build a real sigreturn frame). + // This is also not quite valid, since the AArch64 DWARF spec lacks the ability to define how to restore the LR register correctly, + // so normally libunwind implementations on linux detect this function specially and hack around the invalid info: + // https://github.com/llvm/llvm-project/commit/c82deed6764cbc63966374baf9721331901ca958 + " nop\n" + ); #else -#error "julia: throw-in-context not supported on this platform" +CFI_NORETURN #endif - if (ptls2 == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) { - rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment - } - else { - rsp = (uintptr_t)ptls2->signal_stack + (ptls2->signal_stack_size ? 
ptls2->signal_stack_size : sig_stack_size); - } - assert(rsp % 16 == 0); - rsp -= 16; +} +static void jl_call_in_state(host_thread_state_t *state, void (*fptr)(void)) +{ #ifdef _CPU_X86_64_ - rsp -= sizeof(void*); - state->__rsp = rsp; // set stack pointer + uintptr_t sp = state->__rsp; +#elif defined(_CPU_AARCH64_) + uintptr_t sp = state->__sp; +#endif + sp = (sp - 256) & ~(uintptr_t)15; // redzone and re-alignment + assert(sp % 16 == 0); + sp -= 16; +#ifdef _CPU_X86_64_ + // set return address to NULL + *(uintptr_t*)sp = 0; + // pushq %sp + sp -= sizeof(void*); + *(uintptr_t*)sp = state->__rsp; + // pushq %rip + sp -= sizeof(void*); + *(uintptr_t*)sp = state->__rip; + // pushq .fake_stack_pop + 1; aka call from fake_stack_pop + sp -= sizeof(void*); + *(uintptr_t*)sp = (uintptr_t)&fake_stack_pop + 1; + state->__rsp = sp; // set stack pointer state->__rip = (uint64_t)fptr; // "call" the function #elif defined(_CPU_AARCH64_) - state->__sp = rsp; - state->__pc = (uint64_t)fptr; - state->__lr = 0; + // push {%sp, %pc + 4} + sp -= sizeof(void*); + *(uintptr_t*)sp = state->__sp; + sp -= sizeof(void*); + *(uintptr_t*)sp = (uintptr_t)state->__pc; + state->__sp = sp; // x31 + state->__pc = (uint64_t)fptr; // pc + state->__lr = (uintptr_t)&fake_stack_pop + 4; // x30 #else #error "julia: throw-in-context not supported on this platform" #endif } +static void jl_longjmp_in_state(host_thread_state_t *state, jl_jmp_buf jmpbuf) +{ + + if (!jl_simulate_longjmp(jmpbuf, (bt_context_t*)state)) { + // for sanitizer builds, fallback to calling longjmp on the original stack + // (this will fail for stack overflow, but that is hardly sanitizer-legal anyways) +#ifdef _CPU_X86_64_ + state->__rdi = (uintptr_t)jmpbuf; + state->__rsi = 1; +#elif defined(_CPU_AARCH64_) + state->__x[0] = (uintptr_t)jmpbuf; + state->__x[1] = 1; +#else +#error "julia: jl_longjmp_in_state not supported on this platform" +#endif + jl_call_in_state(state, (void (*)(void))longjmp); + } +} + #ifdef _CPU_X86_64_ int is_write_fault(host_exception_state_t exc_state) { return exc_reg_is_write_fault(exc_state.__err); @@ -275,14 +329,26 @@ static void jl_throw_in_thread(jl_ptls_t ptls2, mach_port_t thread, jl_value_t * host_thread_state_t state; kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count); HANDLE_MACH_ERROR("thread_get_state", ret); - if (1) { // XXX: !jl_has_safe_restore(ptls2) + if (ptls2->safe_restore) { + jl_longjmp_in_state(&state, *ptls2->safe_restore); + } + else { assert(exception); ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, (bt_context_t *)&state, - NULL /*current_task?*/); + NULL /*current_task?*/); ptls2->sig_exception = exception; + ptls2->io_wait = 0; + jl_task_t *ct = ptls2->current_task; + jl_handler_t *eh = ct->eh; + if (eh != NULL) { + asan_unpoison_task_stack(ct, &eh->eh_ctx); + jl_longjmp_in_state(&state, eh->eh_ctx); + } + else { + jl_no_exc_handler(exception, ct); + } } - jl_call_in_state(ptls2, &state, &jl_sig_throw); ret = thread_set_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, count); HANDLE_MACH_ERROR("thread_set_state", ret); } @@ -290,14 +356,15 @@ static void jl_throw_in_thread(jl_ptls_t ptls2, mach_port_t thread, jl_value_t * static void segv_handler(int sig, siginfo_t *info, void *context) { assert(sig == SIGSEGV || sig == SIGBUS); - if (jl_get_safe_restore()) { // restarting jl_ or jl_unwind_stepn - jl_task_t *ct = jl_get_current_task(); - jl_ptls_t ptls = ct == NULL ? 
NULL : ct->ptls; - jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw); + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) { // restarting jl_ or jl_unwind_stepn + jl_longjmp_in_state((host_thread_state_t*)jl_to_bt_context(context), *saferestore); return; } jl_task_t *ct = jl_get_current_task(); - if ((sig != SIGBUS || info->si_code == BUS_ADRERR) && is_addr_on_stack(ct, info->si_addr)) { // stack overflow and not a BUS_ADRALN (alignment error) + if ((sig != SIGBUS || info->si_code == BUS_ADRERR) && + !(ct == NULL || ct->ptls == NULL || jl_atomic_load_relaxed(&ct->ptls->gc_state) == JL_GC_STATE_WAITING || ct->eh == NULL) + && is_addr_on_stack(ct, info->si_addr)) { // stack overflow and not a BUS_ADRALN (alignment error) stack_overflow_warning(); } sigdie_handler(sig, info, context); @@ -352,12 +419,10 @@ kern_return_t catch_mach_exception_raise( jl_safe_printf("ERROR: Exception handler triggered on unmanaged thread.\n"); return KERN_INVALID_ARGUMENT; } - // XXX: jl_throw_in_thread or segv_handler will eventually check this, but - // we would like to avoid some of this work if we could detect this earlier - // if (jl_has_safe_restore(ptls2)) { - // jl_throw_in_thread(ptls2, thread, NULL); - // return KERN_SUCCESS; - // } + if (ptls2->safe_restore) { + jl_throw_in_thread(ptls2, thread, NULL); + return KERN_SUCCESS; + } if (jl_atomic_load_acquire(&ptls2->gc_state) == JL_GC_STATE_WAITING) return KERN_FAILURE; if (exception == EXC_ARITHMETIC) { @@ -516,7 +581,6 @@ static void jl_try_deliver_sigint(void) static void JL_NORETURN jl_exit_thread0_cb(int signo) { -CFI_NORETURN jl_critical_error(signo, 0, NULL, jl_current_task); jl_atexit_hook(128); jl_raise(signo); @@ -548,7 +612,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size) #else #error Fill in first integer argument here #endif - jl_call_in_state(ptls2, &state, (void (*)(void))&jl_exit_thread0_cb); + jl_call_in_state(&state, (void (*)(void))&jl_exit_thread0_cb); unsigned int count = MACH_THREAD_STATE_COUNT; ret = thread_set_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, count); HANDLE_MACH_ERROR("thread_set_state", ret); diff --git a/src/signals-unix.c b/src/signals-unix.c index 005422bea03d3..f99eca31730b6 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -44,7 +44,7 @@ static const size_t sig_stack_size = 8 * 1024 * 1024; // helper function for returning the unw_context_t inside a ucontext_t // (also used by stackwalk.c) -bt_context_t *jl_to_bt_context(void *sigctx) +bt_context_t *jl_to_bt_context(void *sigctx) JL_NOTSAFEPOINT { #ifdef __APPLE__ return (bt_context_t*)&((ucontext64_t*)sigctx)->uc_mcontext64->__ss; @@ -62,7 +62,11 @@ bt_context_t *jl_to_bt_context(void *sigctx) static int thread0_exit_count = 0; static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size); -static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void *_ctx) +int jl_simulate_longjmp(jl_jmp_buf mctx, bt_context_t *c) JL_NOTSAFEPOINT; +static void jl_longjmp_in_ctx(int sig, void *_ctx, jl_jmp_buf jmpbuf); + +#if !defined(_OS_DARWIN_) +static inline uintptr_t jl_get_rsp_from_ctx(const void *_ctx) { #if defined(_OS_LINUX_) && defined(_CPU_X86_64_) const ucontext_t *ctx = (const ucontext_t*)_ctx; @@ -76,12 +80,6 @@ static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void * #elif defined(_OS_LINUX_) && defined(_CPU_ARM_) const ucontext_t *ctx = (const ucontext_t*)_ctx; return ctx->uc_mcontext.arm_sp; 
-#elif defined(_OS_DARWIN_) && defined(_CPU_X86_64_) - const ucontext64_t *ctx = (const ucontext64_t*)_ctx; - return ctx->uc_mcontext64->__ss.__rsp; -#elif defined(_OS_DARWIN_) && defined(_CPU_AARCH64_) - const ucontext64_t *ctx = (const ucontext64_t*)_ctx; - return ctx->uc_mcontext64->__ss.__sp; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) const ucontext_t *ctx = (const ucontext_t*)_ctx; return ctx->uc_mcontext.mc_rsp; @@ -97,7 +95,7 @@ static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void * #endif } -static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr) +static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr) JL_NOTSAFEPOINT { // One guard page for signal_stack. return ptls->signal_stack == NULL || @@ -105,10 +103,8 @@ static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr) (char*)ptr <= (char*)ptls->signal_stack + (ptls->signal_stack_size ? ptls->signal_stack_size : sig_stack_size)); } -// Modify signal context `_ctx` so that `fptr` will execute when the signal -// returns. `fptr` will execute on the signal stack, and must not return. -// jl_call_in_ctx is also currently executing on that signal stack, -// so be careful not to smash it +// Modify signal context `_ctx` so that `fptr` will execute when the signal returns +// The function `fptr` itself must not return. JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_ctx) { // Modifying the ucontext should work but there is concern that @@ -118,44 +114,36 @@ JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int si // checks that the syscall is made in the signal handler and that // the ucontext address is valid. Hopefully the value of the ucontext // will not be part of the validation... - if (!ptls) { - sigset_t sset; - sigemptyset(&sset); - sigaddset(&sset, sig); - pthread_sigmask(SIG_UNBLOCK, &sset, NULL); - fptr(); - return; - } uintptr_t rsp = jl_get_rsp_from_ctx(_ctx); - if (is_addr_on_sigstack(ptls, (void*)rsp)) - rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment - else - rsp = (uintptr_t)ptls->signal_stack + (ptls->signal_stack_size ? 
ptls->signal_stack_size : sig_stack_size); - assert(rsp % 16 == 0); - rsp -= 16; + rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment #if defined(_OS_LINUX_) && defined(_CPU_X86_64_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); + *(uintptr_t*)rsp = 0; ctx->uc_mcontext.gregs[REG_RSP] = rsp; ctx->uc_mcontext.gregs[REG_RIP] = (uintptr_t)fptr; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); + *(uintptr_t*)rsp = 0; ctx->uc_mcontext.mc_rsp = rsp; ctx->uc_mcontext.mc_rip = (uintptr_t)fptr; #elif defined(_OS_LINUX_) && defined(_CPU_X86_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); + *(uintptr_t*)rsp = 0; ctx->uc_mcontext.gregs[REG_ESP] = rsp; ctx->uc_mcontext.gregs[REG_EIP] = (uintptr_t)fptr; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_) ucontext_t *ctx = (ucontext_t*)_ctx; rsp -= sizeof(void*); + *(uintptr_t*)rsp = 0; ctx->uc_mcontext.mc_esp = rsp; ctx->uc_mcontext.mc_eip = (uintptr_t)fptr; #elif defined(_OS_OPENBSD_) && defined(_CPU_X86_64_) struct sigcontext *ctx = (struct sigcontext *)_ctx; rsp -= sizeof(void*); + *(uintptr_t*)rsp = 0; ctx->sc_rsp = rsp; ctx->sc_rip = fptr; #elif defined(_OS_LINUX_) && defined(_CPU_AARCH64_) @@ -187,22 +175,6 @@ JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int si ctx->uc_mcontext.arm_sp = rsp; ctx->uc_mcontext.arm_lr = 0; // Clear link register ctx->uc_mcontext.arm_pc = target; -#elif defined(_OS_DARWIN_) && (defined(_CPU_X86_64_) || defined(_CPU_AARCH64_)) - // Only used for SIGFPE. - // This doesn't seems to be reliable when the SIGFPE is generated - // from a divide-by-zero exception, which is now handled by - // `catch_exception_raise`. It works fine when a signal is received - // due to `kill`/`raise` though. - ucontext64_t *ctx = (ucontext64_t*)_ctx; -#if defined(_CPU_X86_64_) - rsp -= sizeof(void*); - ctx->uc_mcontext64->__ss.__rsp = rsp; - ctx->uc_mcontext64->__ss.__rip = (uintptr_t)fptr; -#else - ctx->uc_mcontext64->__ss.__sp = rsp; - ctx->uc_mcontext64->__ss.__pc = (uintptr_t)fptr; - ctx->uc_mcontext64->__ss.__lr = 0; -#endif #else #pragma message("julia: throw-in-context not supported on this platform") // TODO Add support for PowerPC(64)? 
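
For reference, the redirection that jl_call_in_ctx performs above can be exercised in isolation. The following standalone sketch is illustrative only (not part of the patch) and assumes Linux/x86_64 with glibc: a SIGSEGV handler rewrites the saved ucontext so that, when the handler returns, the kernel's sigreturn resumes the thread in a recovery function instead of re-executing the faulting instruction.

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ucontext.h>

static void recover(void)
{
    // Must not return: the fake return address planted below is NULL.
    puts("resumed in recover() after the fault");
    exit(0);
}

static void handler(int sig, siginfo_t *info, void *uctx)
{
    (void)sig; (void)info;
    ucontext_t *ctx = (ucontext_t*)uctx;
    uintptr_t rsp = ctx->uc_mcontext.gregs[REG_RSP];
    rsp = (rsp - 256) & ~(uintptr_t)15;  // skip the red zone and re-align, as above
    rsp -= sizeof(void*);
    *(uintptr_t*)rsp = 0;                // NULL return address for recover()
    ctx->uc_mcontext.gregs[REG_RSP] = (greg_t)rsp;
    ctx->uc_mcontext.gregs[REG_RIP] = (greg_t)(uintptr_t)&recover;
    // Returning from the handler "calls" recover() via sigreturn.
}

int main(void)
{
    static char sigstk[1 << 16];
    stack_t ss = { .ss_sp = sigstk, .ss_size = sizeof(sigstk), .ss_flags = 0 };
    sigaltstack(&ss, NULL);              // keep the handler off the faulting stack
    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_sigaction = handler;
    act.sa_flags = SA_SIGINFO | SA_ONSTACK;
    sigaction(SIGSEGV, &act, NULL);
    *(volatile int*)0 = 42;              // deliberate fault; execution resumes in recover()
    return 1;
}
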
@@ -213,22 +185,30 @@ JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int si fptr(); #endif } +#endif static void jl_throw_in_ctx(jl_task_t *ct, jl_value_t *e, int sig, void *sigctx) { jl_ptls_t ptls = ct->ptls; - if (!jl_get_safe_restore()) { - ptls->bt_size = - rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, jl_to_bt_context(sigctx), - ct->gcstack); - ptls->sig_exception = e; + assert(!jl_get_safe_restore()); + ptls->bt_size = + rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, jl_to_bt_context(sigctx), + ct->gcstack); + ptls->sig_exception = e; + ptls->io_wait = 0; + jl_handler_t *eh = ct->eh; + if (eh != NULL) { + asan_unpoison_task_stack(ct, &eh->eh_ctx); + jl_longjmp_in_ctx(sig, sigctx, eh->eh_ctx); + } + else { + jl_no_exc_handler(e, ct); } - jl_call_in_ctx(ptls, &jl_sig_throw, sig, sigctx); } static pthread_t signals_thread; -static int is_addr_on_stack(jl_task_t *ct, void *addr) +static int is_addr_on_stack(jl_task_t *ct, void *addr) JL_NOTSAFEPOINT { if (ct->ctx.copy_stack) { jl_ptls_t ptls = ct->ptls; @@ -379,7 +359,7 @@ int is_write_fault(void *context) { } #endif -static int jl_is_on_sigstack(jl_ptls_t ptls, void *ptr, void *context) +static int jl_is_on_sigstack(jl_ptls_t ptls, void *ptr, void *context) JL_NOTSAFEPOINT { return (ptls->signal_stack != NULL && is_addr_on_sigstack(ptls, ptr) && @@ -389,8 +369,9 @@ static int jl_is_on_sigstack(jl_ptls_t ptls, void *ptr, void *context) JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context) { assert(sig == SIGSEGV || sig == SIGBUS); - if (jl_get_safe_restore()) { // restarting jl_ or profile - jl_call_in_ctx(NULL, &jl_sig_throw, sig, context); + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) { // restarting jl_ or profile + jl_longjmp_in_ctx(sig, context, *saferestore); return; } jl_task_t *ct = jl_get_current_task(); @@ -445,9 +426,11 @@ pthread_mutex_t in_signal_lock; // shared with jl_delete_thread static bt_context_t *signal_context; // protected by in_signal_lock static int exit_signal_cond = -1; static int signal_caught_cond = -1; +static int signals_inflight = 0; int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) { + int err; pthread_mutex_lock(&in_signal_lock); jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_task_t *ct2 = ptls2 ? 
jl_atomic_load_relaxed(&ptls2->current_task) : NULL; @@ -456,24 +439,45 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) pthread_mutex_unlock(&in_signal_lock); return 0; } - sig_atomic_t request = 0; - if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) { + while (signals_inflight) { // something is wrong, or there is already a usr2 in flight elsewhere - pthread_mutex_unlock(&in_signal_lock); - return 0; + // try to wait for it to finish or wait for timeout + struct pollfd event = {signal_caught_cond, POLLIN, 0}; + do { + err = poll(&event, 1, timeout * 1000); + } while (err == -1 && errno == EINTR); + if (err == -1 || (event.revents & POLLIN) == 0) { + // not ready after timeout: cancel this request + pthread_mutex_unlock(&in_signal_lock); + return 0; + } + // consume it before continuing + eventfd_t got; + do { + err = read(signal_caught_cond, &got, sizeof(eventfd_t)); + } while (err == -1 && errno == EINTR); + if (err != sizeof(eventfd_t)) abort(); + assert(signals_inflight >= got); + signals_inflight -= got; } + signals_inflight++; + sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1); + assert(request == 0 || request == -1); request = 1; - int err = pthread_kill(ptls2->system_id, SIGUSR2); - // wait for thread to acknowledge or timeout - struct pollfd event = {signal_caught_cond, POLLIN, 0}; + err = pthread_kill(ptls2->system_id, SIGUSR2); if (err == 0) { + // wait for thread to acknowledge or timeout + struct pollfd event = {signal_caught_cond, POLLIN, 0}; do { err = poll(&event, 1, timeout * 1000); } while (err == -1 && errno == EINTR); + if (err != 1 || (event.revents & POLLIN) == 0) + err = -1; } - if ((event.revents & POLLIN) == 0) { + if (err == -1) { // not ready after timeout: try to cancel this request if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) { + signals_inflight--; pthread_mutex_unlock(&in_signal_lock); return 0; } @@ -483,11 +487,13 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) err = read(signal_caught_cond, &got, sizeof(eventfd_t)); } while (err == -1 && errno == EINTR); if (err != sizeof(eventfd_t)) abort(); - assert(got == 1); (void) got; + assert(signals_inflight >= got); + signals_inflight -= got; + signals_inflight++; // Now the other thread is waiting on exit_signal_cond (verify that here by // checking it is 0, and add an acquire barrier for good measure) request = jl_atomic_load_acquire(&ptls2->signal_request); - assert(request == 0); (void) request; + assert(request == 0 || request == -1); (void) request; jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this *ctx = *signal_context; return 1; @@ -510,6 +516,7 @@ static void jl_try_deliver_sigint(void) jl_safepoint_enable_sigint(); jl_wake_libuv(); pthread_mutex_lock(&in_signal_lock); + signals_inflight++; jl_atomic_store_release(&ptls2->signal_request, 2); // This also makes sure `sleep` is aborted. 
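
The suspend handshake above relies on an eventfd counter that one side bumps and the other side drains after a poll with timeout. A minimal standalone sketch of that post/poll/consume pattern (assuming Linux eventfd and pthreads; all names are illustrative, not part of the patch):

#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int caught_cond;                  // plays the role of signal_caught_cond

static void *worker(void *arg)
{
    (void)arg;
    // ... perform the work the other thread is waiting to hear about ...
    eventfd_t one = 1;
    if (write(caught_cond, &one, sizeof(one)) != sizeof(one))  // post one acknowledgment
        abort();
    return NULL;
}

static int wait_for_ack(int timeout_ms)
{
    struct pollfd event = {caught_cond, POLLIN, 0};
    int err;
    do {
        err = poll(&event, 1, timeout_ms);
    } while (err == -1 && errno == EINTR);
    if (err != 1 || (event.revents & POLLIN) == 0)
        return 0;                        // timed out: the caller can cancel its request
    eventfd_t got;
    do {
        err = read(caught_cond, &got, sizeof(got));  // drain the counter (may exceed 1)
    } while (err == -1 && errno == EINTR);
    if (err != sizeof(got))
        abort();
    return (int)got;
}

int main(void)
{
    caught_cond = eventfd(0, EFD_CLOEXEC);
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);
    printf("acknowledgments consumed: %d\n", wait_for_ack(1000));
    pthread_join(t, NULL);
    return 0;
}
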
pthread_kill(ptls2->system_id, SIGUSR2); @@ -546,6 +553,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size) } // request: +// -1: processing // 0: nothing [not from here] // 1: get state & wait for request // 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold @@ -561,22 +569,36 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx) if (ptls == NULL) return; int errno_save = errno; - // acknowledge that we saw the signal_request - sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0); + sig_atomic_t request = jl_atomic_load(&ptls->signal_request); + if (request == 0) + return; + if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1)) + return; if (request == 1) { signal_context = jl_to_bt_context(ctx); + // acknowledge that we saw the signal_request and set signal_context int err; eventfd_t got = 1; err = write(signal_caught_cond, &got, sizeof(eventfd_t)); if (err != sizeof(eventfd_t)) abort(); + sig_atomic_t processing = -1; + jl_atomic_cmpswap(&ptls->signal_request, &processing, 0); + // wait for exit signal do { err = read(exit_signal_cond, &got, sizeof(eventfd_t)); } while (err == -1 && errno == EINTR); if (err != sizeof(eventfd_t)) abort(); assert(got == 1); - request = jl_atomic_exchange(&ptls->signal_request, 0); + request = jl_atomic_exchange(&ptls->signal_request, -1); + signal_context = NULL; assert(request == 2 || request == 3 || request == 4); } + int err; + eventfd_t got = 1; + err = write(signal_caught_cond, &got, sizeof(eventfd_t)); + if (err != sizeof(eventfd_t)) abort(); + sig_atomic_t processing = -1; + jl_atomic_cmpswap(&ptls->signal_request, &processing, 0); if (request == 2) { int force = jl_check_force_sigint(); if (force || (!ptls->defer_signal && ptls->io_wait)) { @@ -585,7 +607,11 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx) jl_safe_printf("WARNING: Force throwing a SIGINT\n"); // Force a throw jl_clear_force_sigint(); - jl_throw_in_ctx(ct, jl_interrupt_exception, sig, ctx); + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) // restarting jl_ or profile + jl_longjmp_in_ctx(sig, ctx, *saferestore); + else + jl_throw_in_ctx(ct, jl_interrupt_exception, sig, ctx); } } else if (request == 3) { @@ -1055,8 +1081,9 @@ void restore_signals(void) static void fpe_handler(int sig, siginfo_t *info, void *context) { (void)info; - if (jl_get_safe_restore()) { // restarting jl_ or profile - jl_call_in_ctx(NULL, &jl_sig_throw, sig, context); + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) { // restarting jl_ or profile + jl_longjmp_in_ctx(sig, context, *saferestore); return; } jl_task_t *ct = jl_get_current_task(); @@ -1066,6 +1093,21 @@ static void fpe_handler(int sig, siginfo_t *info, void *context) jl_throw_in_ctx(ct, jl_diverror_exception, sig, context); } +static void jl_longjmp_in_ctx(int sig, void *_ctx, jl_jmp_buf jmpbuf) +{ +#if defined(_OS_DARWIN_) + jl_longjmp_in_state((host_thread_state_t*)jl_to_bt_context(_ctx), jmpbuf); +#else + if (jl_simulate_longjmp(jmpbuf, jl_to_bt_context(_ctx))) + return; + sigset_t sset; + sigemptyset(&sset); + sigaddset(&sset, sig); + pthread_sigmask(SIG_UNBLOCK, &sset, NULL); + jl_longjmp(jmpbuf, 1); +#endif +} + static void sigint_handler(int sig) { jl_sigint_passed = 1; diff --git a/src/signals-win.c b/src/signals-win.c index d7288b5d365d8..b5f8dd8bd79d9 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -86,9 +86,13 @@ void __cdecl crt_sig_handler(int sig, int num) } break; default: // SIGSEGV, SIGTERM, 
SIGILL, SIGABRT - if (sig == SIGSEGV && jl_get_safe_restore()) { - signal(sig, (void (__cdecl *)(int))crt_sig_handler); - jl_sig_throw(); + if (sig == SIGSEGV) { // restarting jl_ or profile + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) { + signal(sig, (void (__cdecl *)(int))crt_sig_handler); + jl_longjmp(*saferestore, 1); + return; + } } memset(&Context, 0, sizeof(Context)); RtlCaptureContext(&Context); @@ -109,6 +113,8 @@ static jl_ptls_t stkerror_ptls; static int have_backtrace_fiber; static void JL_NORETURN start_backtrace_fiber(void) { + // print the warning (this mysteriously needs a lot of stack for the WriteFile syscall) + stack_overflow_warning(); // collect the backtrace stkerror_ptls->bt_size = rec_backtrace_ctx(stkerror_ptls->bt_data, JL_MAX_BT_SIZE, stkerror_ctx, @@ -124,41 +130,41 @@ void restore_signals(void) SetConsoleCtrlHandler(NULL, 0); } -void jl_throw_in_ctx(jl_task_t *ct, jl_value_t *excpt, PCONTEXT ctxThread) +int jl_simulate_longjmp(jl_jmp_buf mctx, bt_context_t *c); + +static void jl_throw_in_ctx(jl_task_t *ct, jl_value_t *excpt, PCONTEXT ctxThread) { -#if defined(_CPU_X86_64_) - DWORD64 Rsp = (ctxThread->Rsp & (DWORD64)-16) - 8; -#elif defined(_CPU_X86_) - DWORD32 Esp = (ctxThread->Esp & (DWORD32)-16) - 4; -#else -#error WIN16 not supported :P -#endif - if (ct && !jl_get_safe_restore()) { - assert(excpt != NULL); - jl_ptls_t ptls = ct->ptls; - ptls->bt_size = 0; - if (excpt != jl_stackovf_exception) { - ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, ctxThread, - ct->gcstack); - } - else if (have_backtrace_fiber) { - uv_mutex_lock(&backtrace_lock); - stkerror_ctx = ctxThread; - stkerror_ptls = ptls; - jl_swapcontext(&error_return_fiber, &collect_backtrace_fiber); - uv_mutex_unlock(&backtrace_lock); - } - ptls->sig_exception = excpt; + jl_jmp_buf *saferestore = jl_get_safe_restore(); + if (saferestore) { // restarting jl_ or profile + if (!jl_simulate_longjmp(*saferestore, ctxThread)) + abort(); + return; + } + assert(ct && excpt); + jl_ptls_t ptls = ct->ptls; + ptls->bt_size = 0; + if (excpt != jl_stackovf_exception) { + ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, ctxThread, + ct->gcstack); + } + else if (have_backtrace_fiber) { + uv_mutex_lock(&backtrace_lock); + stkerror_ctx = ctxThread; + stkerror_ptls = ptls; + jl_swapcontext(&error_return_fiber, &collect_backtrace_fiber); + uv_mutex_unlock(&backtrace_lock); + } + ptls->sig_exception = excpt; + ptls->io_wait = 0; + jl_handler_t *eh = ct->eh; + if (eh != NULL) { + asan_unpoison_task_stack(ct, &eh->eh_ctx); + if (!jl_simulate_longjmp(eh->eh_ctx, ctxThread)) + abort(); + } + else { + jl_no_exc_handler(excpt, ct); } -#if defined(_CPU_X86_64_) - *(DWORD64*)Rsp = 0; - ctxThread->Rsp = Rsp; - ctxThread->Rip = (DWORD64)&jl_sig_throw; -#elif defined(_CPU_X86_) - *(DWORD32*)Esp = 0; - ctxThread->Esp = Esp; - ctxThread->Eip = (DWORD)&jl_sig_throw; -#endif } HANDLE hMainThread = INVALID_HANDLE_VALUE; @@ -244,7 +250,6 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) case EXCEPTION_STACK_OVERFLOW: if (ct->eh != NULL) { ptls->needs_resetstkoflw = 1; - stack_overflow_warning(); jl_throw_in_ctx(ct, jl_stackovf_exception, ExceptionInfo->ContextRecord); return EXCEPTION_CONTINUE_EXECUTION; } diff --git a/src/stackwalk.c b/src/stackwalk.c index 15a9fddeac9a4..6aa36fa8b499c 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -919,7 +919,273 @@ _os_ptr_munge(uintptr_t ptr) JL_NOTSAFEPOINT #endif -extern bt_context_t *jl_to_bt_context(void 
*sigctx);
+extern bt_context_t *jl_to_bt_context(void *sigctx) JL_NOTSAFEPOINT;
+
+// Some notes: this simulates a longjmp call occurring in context `c`, as if the
+// user was to set the PC in `c` to call longjmp and the PC in the longjmp to
+// return here. This helps work around many cases where siglongjmp out of a
+// signal handler is not supported (e.g. missing a _sigunaltstack call).
+// Additionally note that this doesn't restore the MXCSR or FP control word
+// (which some, but not most, longjmp implementations do). It also doesn't
+// support shadow stacks, so if those are in use, you might need to use a direct
+// jl_longjmp to leave the signal frame instead of relying on simulating it and
+// attempting to return normally.
+int jl_simulate_longjmp(jl_jmp_buf mctx, bt_context_t *c) JL_NOTSAFEPOINT
+{
+#if (defined(_COMPILER_ASAN_ENABLED_) || defined(_COMPILER_TSAN_ENABLED_))
+    // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/hwasan/hwasan_interceptors.cpp
+    return 0;
+#elif defined(_OS_WINDOWS_)
+    _JUMP_BUFFER* _ctx = (_JUMP_BUFFER*)mctx;
+    #if defined(_CPU_X86_64_)
+    c->Rbx = _ctx->Rbx;
+    c->Rsp = _ctx->Rsp;
+    c->Rbp = _ctx->Rbp;
+    c->Rsi = _ctx->Rsi;
+    c->Rdi = _ctx->Rdi;
+    c->R12 = _ctx->R12;
+    c->R13 = _ctx->R13;
+    c->R14 = _ctx->R14;
+    c->R15 = _ctx->R15;
+    c->Rip = _ctx->Rip;
+    memcpy(&c->Xmm6, &_ctx->Xmm6, 10 * sizeof(_ctx->Xmm6)); // Xmm6-Xmm15
+    // c->MxCsr = _ctx->MxCsr;
+    // c->FloatSave.ControlWord = _ctx->FpCsr;
+    // c->SegGS[0] = _ctx->Frame;
+    c->Rax = 1;
+    c->Rsp += sizeof(void*);
+    assert(c->Rsp % 16 == 0);
+    return 1;
+    #elif defined(_CPU_X86_)
+    c->Ebp = _ctx->Ebp;
+    c->Ebx = _ctx->Ebx;
+    c->Edi = _ctx->Edi;
+    c->Esi = _ctx->Esi;
+    c->Esp = _ctx->Esp;
+    c->Eip = _ctx->Eip;
+    // c->SegFS[0] = _ctx->Registration;
+    // c->FloatSave.ControlWord = _ctx->FpCsr;
+    c->Eax = 1;
+    c->Esp += sizeof(void*);
+    assert(c->Esp % 16 == 0);
+    return 1;
+    #else
+    #error Windows is currently only supported on x86 and x86_64
+    #endif
+#elif defined(_OS_LINUX_) && defined(__GLIBC__)
+    __jmp_buf *_ctx = &mctx->__jmpbuf;
+    mcontext_t *mc = &c->uc_mcontext;
+    #if defined(_CPU_X86_)
+    // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S
+    // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h
+    // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s
+    mc->gregs[REG_EBX] = (*_ctx)[0];
+    mc->gregs[REG_ESI] = (*_ctx)[1];
+    mc->gregs[REG_EDI] = (*_ctx)[2];
+    mc->gregs[REG_EBP] = (*_ctx)[3];
+    mc->gregs[REG_ESP] = (*_ctx)[4];
+    mc->gregs[REG_EIP] = (*_ctx)[5];
+    // ifdef PTR_DEMANGLE ?
+    mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]);
+    mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]);
+    mc->gregs[REG_EAX] = 1;
+    assert(mc->gregs[REG_ESP] % 16 == 0);
+    return 1;
+    #elif defined(_CPU_X86_64_)
+    // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S
+    // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h
+    // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s
+    mc->gregs[REG_RBX] = (*_ctx)[0];
+    mc->gregs[REG_RBP] = (*_ctx)[1];
+    mc->gregs[REG_R12] = (*_ctx)[2];
+    mc->gregs[REG_R13] = (*_ctx)[3];
+    mc->gregs[REG_R14] = (*_ctx)[4];
+    mc->gregs[REG_R15] = (*_ctx)[5];
+    mc->gregs[REG_RSP] = (*_ctx)[6];
+    mc->gregs[REG_RIP] = (*_ctx)[7];
+    // ifdef PTR_DEMANGLE ?
+ mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); + mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); + mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); + mc->gregs[REG_RAX] = 1; + assert(mc->gregs[REG_RSP] % 16 == 0); + return 1; + #elif defined(_CPU_ARM_) + // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S + // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h + // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S + mc->arm_sp = (*_ctx)[0]; + mc->arm_lr = (*_ctx)[1]; + mc->arm_r4 = (*_ctx)[2]; // aka v1 + mc->arm_r5 = (*_ctx)[3]; // aka v2 + mc->arm_r6 = (*_ctx)[4]; // aka v3 + mc->arm_r7 = (*_ctx)[5]; // aka v4 + mc->arm_r8 = (*_ctx)[6]; // aka v5 + mc->arm_r9 = (*_ctx)[7]; // aka v6 aka sb + mc->arm_r10 = (*_ctx)[8]; // aka v7 aka sl + mc->arm_fp = (*_ctx)[10]; // aka v8 aka r11 + // ifdef PTR_DEMANGLE ? + mc->arm_sp = ptr_demangle(mc->arm_sp); + mc->arm_lr = ptr_demangle(mc->arm_lr); + mc->arm_pc = mc->arm_lr; + mc->arm_r0 = 1; + assert(mc->arm_sp % 16 == 0); + return 1; + #elif defined(_CPU_AARCH64_) + // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S + // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h + // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s + // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62 + unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved; + mc->regs[19] = (*_ctx)[0]; + mc->regs[20] = (*_ctx)[1]; + mc->regs[21] = (*_ctx)[2]; + mc->regs[22] = (*_ctx)[3]; + mc->regs[23] = (*_ctx)[4]; + mc->regs[24] = (*_ctx)[5]; + mc->regs[25] = (*_ctx)[6]; + mc->regs[26] = (*_ctx)[7]; + mc->regs[27] = (*_ctx)[8]; + mc->regs[28] = (*_ctx)[9]; + mc->regs[29] = (*_ctx)[10]; // aka fp + mc->regs[30] = (*_ctx)[11]; // aka lr + // Yes, they did skip 12 why writing the code originally; and, no, I do not know why. + mc->sp = (*_ctx)[13]; + mcfp->vregs[7] = (*_ctx)[14]; // aka d8 + mcfp->vregs[8] = (*_ctx)[15]; // aka d9 + mcfp->vregs[9] = (*_ctx)[16]; // aka d10 + mcfp->vregs[10] = (*_ctx)[17]; // aka d11 + mcfp->vregs[11] = (*_ctx)[18]; // aka d12 + mcfp->vregs[12] = (*_ctx)[19]; // aka d13 + mcfp->vregs[13] = (*_ctx)[20]; // aka d14 + mcfp->vregs[14] = (*_ctx)[21]; // aka d15 + // ifdef PTR_DEMANGLE ? 
+ mc->sp = ptr_demangle(mc->sp); + mc->regs[30] = ptr_demangle(mc->regs[30]); + mc->pc = mc->regs[30]; + mc->regs[0] = 1; + assert(mc->sp % 16 == 0); + return 1; + #else + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux") + (void)mc; + (void)mctx; + return 0; + #endif +#elif defined(_OS_DARWIN_) + #if defined(_CPU_X86_64_) + // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s + x86_thread_state64_t *mc = (x86_thread_state64_t*)c; + mc->__rbx = ((uint64_t*)mctx)[0]; + mc->__rbp = ((uint64_t*)mctx)[1]; + mc->__rsp = ((uint64_t*)mctx)[2]; + mc->__r12 = ((uint64_t*)mctx)[3]; + mc->__r13 = ((uint64_t*)mctx)[4]; + mc->__r14 = ((uint64_t*)mctx)[5]; + mc->__r15 = ((uint64_t*)mctx)[6]; + mc->__rip = ((uint64_t*)mctx)[7]; + // added in libsystem_platform 177.200.16 (macOS Mojave 10.14.3) + // prior to that _os_ptr_munge_token was (hopefully) typically 0, + // so x ^ 0 == x and this is a no-op + mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp); + mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp); + mc->__rip = _OS_PTR_UNMUNGE(mc->__rip); + mc->__rax = 1; + assert(mc->__rsp % 16 == 0); + return 1; + #elif defined(_CPU_AARCH64_) + // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s + // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h + // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64 + arm_thread_state64_t *mc = (arm_thread_state64_t*)c; + mc->__x[19] = ((uint64_t*)mctx)[0]; + mc->__x[20] = ((uint64_t*)mctx)[1]; + mc->__x[21] = ((uint64_t*)mctx)[2]; + mc->__x[22] = ((uint64_t*)mctx)[3]; + mc->__x[23] = ((uint64_t*)mctx)[4]; + mc->__x[24] = ((uint64_t*)mctx)[5]; + mc->__x[25] = ((uint64_t*)mctx)[6]; + mc->__x[26] = ((uint64_t*)mctx)[7]; + mc->__x[27] = ((uint64_t*)mctx)[8]; + mc->__x[28] = ((uint64_t*)mctx)[9]; + mc->__x[10] = ((uint64_t*)mctx)[10]; + mc->__x[11] = ((uint64_t*)mctx)[11]; + mc->__x[12] = ((uint64_t*)mctx)[12]; + // 13 is reserved/unused + double *mcfp = (double*)&mc[1]; + mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8 + mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9 + mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10 + mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11 + mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12 + mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13 + mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14 + mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15 + mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]); + mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]); + mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]); + mc->__sp = mc->__x[12]; + // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either + mc->__pc = ptrauth_strip(mc->__lr, 0); + mc->__pad = 0; // aka __ra_sign_state = not signed + mc->__x[0] = 1; + assert(mc->__sp % 16 == 0); + return 1; + #else + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown darwin") + (void)mctx; + return 0; +#endif +#elif defined(_OS_FREEBSD_) + mcontext_t *mc = &c->uc_mcontext; + #if defined(_CPU_X86_64_) + // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S + mc->mc_rip = ((long*)mctx)[0]; + mc->mc_rbx = ((long*)mctx)[1]; + mc->mc_rsp = ((long*)mctx)[2]; + mc->mc_rbp = ((long*)mctx)[3]; + mc->mc_r12 = ((long*)mctx)[4]; + mc->mc_r13 = ((long*)mctx)[5]; + mc->mc_r14 = ((long*)mctx)[6]; + mc->mc_r15 = ((long*)mctx)[7]; + mc->mc_rax = 1; + mc->mc_rsp += sizeof(void*); + assert(mc->mc_rsp % 16 == 0); + 
return 1; + #elif defined(_CPU_AARCH64_) + mc->mc_gpregs.gp_x[19] = ((long*)mctx)[0]; + mc->mc_gpregs.gp_x[20] = ((long*)mctx)[1]; + mc->mc_gpregs.gp_x[21] = ((long*)mctx)[2]; + mc->mc_gpregs.gp_x[22] = ((long*)mctx)[3]; + mc->mc_gpregs.gp_x[23] = ((long*)mctx)[4]; + mc->mc_gpregs.gp_x[24] = ((long*)mctx)[5]; + mc->mc_gpregs.gp_x[25] = ((long*)mctx)[6]; + mc->mc_gpregs.gp_x[26] = ((long*)mctx)[7]; + mc->mc_gpregs.gp_x[27] = ((long*)mctx)[8]; + mc->mc_gpregs.gp_x[28] = ((long*)mctx)[9]; + mc->mc_gpregs.gp_x[29] = ((long*)mctx)[10]; + mc->mc_gpregs.gp_lr = ((long*)mctx)[11]; + mc->mc_gpregs.gp_sp = ((long*)mctx)[12]; + mc->mc_fpregs.fp_q[7] = ((long*)mctx)[13]; + mc->mc_fpregs.fp_q[8] = ((long*)mctx)[14]; + mc->mc_fpregs.fp_q[9] = ((long*)mctx)[15]; + mc->mc_fpregs.fp_q[10] = ((long*)mctx)[16]; + mc->mc_fpregs.fp_q[11] = ((long*)mctx)[17]; + mc->mc_fpregs.fp_q[12] = ((long*)mctx)[18]; + mc->mc_fpregs.fp_q[13] = ((long*)mctx)[19]; + mc->mc_fpregs.fp_q[14] = ((long*)mctx)[20]; + mc->mc_gpregs.gp_x[0] = 1; + assert(mc->mc_gpregs.gp_sp % 16 == 0); + return 1; + #else + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown freebsd") + (void)mctx; + return 0; + #endif +#else +return 0; +#endif +} JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT { @@ -955,234 +1221,19 @@ JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, } if (context == NULL && (!t->ctx.copy_stack && t->ctx.started && t->ctx.ctx != NULL)) { // need to read the context from the task stored state + jl_jmp_buf *mctx = &t->ctx.ctx->uc_mcontext; #if defined(_OS_WINDOWS_) memset(&c, 0, sizeof(c)); - _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx->uc_mcontext; -#if defined(_CPU_X86_64_) - c.Rbx = mctx->Rbx; - c.Rsp = mctx->Rsp; - c.Rbp = mctx->Rbp; - c.Rsi = mctx->Rsi; - c.Rdi = mctx->Rdi; - c.R12 = mctx->R12; - c.R13 = mctx->R13; - c.R14 = mctx->R14; - c.R15 = mctx->R15; - c.Rip = mctx->Rip; - memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 -#elif defined(_CPU_X86_) - c.Eip = mctx->Eip; - c.Esp = mctx->Esp; - c.Ebp = mctx->Ebp; -#else - #error Windows is currently only supported on x86 and x86_64 -#endif - context = &c; + if (jl_simulate_longjmp(*mctx, &c)) + context = &c; #elif defined(JL_HAVE_UNW_CONTEXT) context = t->ctx.ctx; #elif defined(JL_HAVE_UCONTEXT) context = jl_to_bt_context(t->ctx.ctx); #elif defined(JL_HAVE_ASM) memset(&c, 0, sizeof(c)); - #if defined(_OS_LINUX_) && defined(__GLIBC__) - __jmp_buf *mctx = &t->ctx.ctx->uc_mcontext->__jmpbuf; - mcontext_t *mc = &c.uc_mcontext; - #if defined(_CPU_X86_) - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s - mc->gregs[REG_EBX] = (*mctx)[0]; - mc->gregs[REG_ESI] = (*mctx)[1]; - mc->gregs[REG_EDI] = (*mctx)[2]; - mc->gregs[REG_EBP] = (*mctx)[3]; - mc->gregs[REG_ESP] = (*mctx)[4]; - mc->gregs[REG_EIP] = (*mctx)[5]; - // ifdef PTR_DEMANGLE ? 
- mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]); - mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]); - context = &c; - #elif defined(_CPU_X86_64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s - mc->gregs[REG_RBX] = (*mctx)[0]; - mc->gregs[REG_RBP] = (*mctx)[1]; - mc->gregs[REG_R12] = (*mctx)[2]; - mc->gregs[REG_R13] = (*mctx)[3]; - mc->gregs[REG_R14] = (*mctx)[4]; - mc->gregs[REG_R15] = (*mctx)[5]; - mc->gregs[REG_RSP] = (*mctx)[6]; - mc->gregs[REG_RIP] = (*mctx)[7]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); - mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); - mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); - context = &c; - #elif defined(_CPU_ARM_) - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h - // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S - mc->arm_sp = (*mctx)[0]; - mc->arm_lr = (*mctx)[1]; - mc->arm_r4 = (*mctx)[2]; // aka v1 - mc->arm_r5 = (*mctx)[3]; // aka v2 - mc->arm_r6 = (*mctx)[4]; // aka v3 - mc->arm_r7 = (*mctx)[5]; // aka v4 - mc->arm_r8 = (*mctx)[6]; // aka v5 - mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb - mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl - mc->arm_fp = (*mctx)[10]; // aka v8 aka r11 - // ifdef PTR_DEMANGLE ? - mc->arm_sp = ptr_demangle(mc->arm_sp); - mc->arm_lr = ptr_demangle(mc->arm_lr); - mc->arm_pc = mc->arm_lr; - context = &c; - #elif defined(_CPU_AARCH64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s - // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62 - unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved; - mc->regs[19] = (*mctx)[0]; - mc->regs[20] = (*mctx)[1]; - mc->regs[21] = (*mctx)[2]; - mc->regs[22] = (*mctx)[3]; - mc->regs[23] = (*mctx)[4]; - mc->regs[24] = (*mctx)[5]; - mc->regs[25] = (*mctx)[6]; - mc->regs[26] = (*mctx)[7]; - mc->regs[27] = (*mctx)[8]; - mc->regs[28] = (*mctx)[9]; - mc->regs[29] = (*mctx)[10]; // aka fp - mc->regs[30] = (*mctx)[11]; // aka lr - // Yes, they did skip 12 why writing the code originally; and, no, I do not know why. - mc->sp = (*mctx)[13]; - mcfp->vregs[7] = (*mctx)[14]; // aka d8 - mcfp->vregs[8] = (*mctx)[15]; // aka d9 - mcfp->vregs[9] = (*mctx)[16]; // aka d10 - mcfp->vregs[10] = (*mctx)[17]; // aka d11 - mcfp->vregs[11] = (*mctx)[18]; // aka d12 - mcfp->vregs[12] = (*mctx)[19]; // aka d13 - mcfp->vregs[13] = (*mctx)[20]; // aka d14 - mcfp->vregs[14] = (*mctx)[21]; // aka d15 - // ifdef PTR_DEMANGLE ? 
- mc->sp = ptr_demangle(mc->sp); - mc->regs[30] = ptr_demangle(mc->regs[30]); - mc->pc = mc->regs[30]; - context = &c; - #else - #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux") - (void)mc; - (void)c; - (void)mctx; - #endif - #elif defined(_OS_DARWIN_) - sigjmp_buf *mctx = &t->ctx.ctx->uc_mcontext; - #if defined(_CPU_X86_64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s - x86_thread_state64_t *mc = (x86_thread_state64_t*)&c; - mc->__rbx = ((uint64_t*)mctx)[0]; - mc->__rbp = ((uint64_t*)mctx)[1]; - mc->__rsp = ((uint64_t*)mctx)[2]; - mc->__r12 = ((uint64_t*)mctx)[3]; - mc->__r13 = ((uint64_t*)mctx)[4]; - mc->__r14 = ((uint64_t*)mctx)[5]; - mc->__r15 = ((uint64_t*)mctx)[6]; - mc->__rip = ((uint64_t*)mctx)[7]; - // added in libsystem_platform 177.200.16 (macOS Mojave 10.14.3) - // prior to that _os_ptr_munge_token was (hopefully) typically 0, - // so x ^ 0 == x and this is a no-op - mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp); - mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp); - mc->__rip = _OS_PTR_UNMUNGE(mc->__rip); - context = &c; - #elif defined(_CPU_AARCH64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s - // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h - // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64 - arm_thread_state64_t *mc = (arm_thread_state64_t*)&c; - mc->__x[19] = ((uint64_t*)mctx)[0]; - mc->__x[20] = ((uint64_t*)mctx)[1]; - mc->__x[21] = ((uint64_t*)mctx)[2]; - mc->__x[22] = ((uint64_t*)mctx)[3]; - mc->__x[23] = ((uint64_t*)mctx)[4]; - mc->__x[24] = ((uint64_t*)mctx)[5]; - mc->__x[25] = ((uint64_t*)mctx)[6]; - mc->__x[26] = ((uint64_t*)mctx)[7]; - mc->__x[27] = ((uint64_t*)mctx)[8]; - mc->__x[28] = ((uint64_t*)mctx)[9]; - mc->__x[10] = ((uint64_t*)mctx)[10]; - mc->__x[11] = ((uint64_t*)mctx)[11]; - mc->__x[12] = ((uint64_t*)mctx)[12]; - // 13 is reserved/unused - double *mcfp = (double*)&mc[1]; - mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8 - mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9 - mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10 - mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11 - mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12 - mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13 - mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14 - mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15 - mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]); - mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]); - mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]); - mc->__sp = mc->__x[12]; - // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either - mc->__pc = ptrauth_strip(mc->__lr, 0); - mc->__pad = 0; // aka __ra_sign_state = not signed - context = &c; - #else - #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown darwin") - (void)mctx; - (void)c; - #endif - #elif defined(_OS_FREEBSD_) - sigjmp_buf *mctx = &t->ctx.ctx->uc_mcontext; - mcontext_t *mc = &c.uc_mcontext; - #if defined(_CPU_X86_64_) - // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S - mc->mc_rip = ((long*)mctx)[0]; - mc->mc_rbx = ((long*)mctx)[1]; - mc->mc_rsp = ((long*)mctx)[2]; - mc->mc_rbp = ((long*)mctx)[3]; - mc->mc_r12 = ((long*)mctx)[4]; - mc->mc_r13 = ((long*)mctx)[5]; - mc->mc_r14 = ((long*)mctx)[6]; - mc->mc_r15 = ((long*)mctx)[7]; - context = &c; - #elif defined(_CPU_AARCH64_) - mc->mc_gpregs.gp_x[19] = ((long*)mctx)[0]; - mc->mc_gpregs.gp_x[20] 
= ((long*)mctx)[1]; - mc->mc_gpregs.gp_x[21] = ((long*)mctx)[2]; - mc->mc_gpregs.gp_x[22] = ((long*)mctx)[3]; - mc->mc_gpregs.gp_x[23] = ((long*)mctx)[4]; - mc->mc_gpregs.gp_x[24] = ((long*)mctx)[5]; - mc->mc_gpregs.gp_x[25] = ((long*)mctx)[6]; - mc->mc_gpregs.gp_x[26] = ((long*)mctx)[7]; - mc->mc_gpregs.gp_x[27] = ((long*)mctx)[8]; - mc->mc_gpregs.gp_x[28] = ((long*)mctx)[9]; - mc->mc_gpregs.gp_x[29] = ((long*)mctx)[10]; - mc->mc_gpregs.gp_lr = ((long*)mctx)[11]; - mc->mc_gpregs.gp_sp = ((long*)mctx)[12]; - mc->mc_fpregs.fp_q[7] = ((long*)mctx)[13]; - mc->mc_fpregs.fp_q[8] = ((long*)mctx)[14]; - mc->mc_fpregs.fp_q[9] = ((long*)mctx)[15]; - mc->mc_fpregs.fp_q[10] = ((long*)mctx)[16]; - mc->mc_fpregs.fp_q[11] = ((long*)mctx)[17]; - mc->mc_fpregs.fp_q[12] = ((long*)mctx)[18]; - mc->mc_fpregs.fp_q[13] = ((long*)mctx)[19]; - mc->mc_fpregs.fp_q[14] = ((long*)mctx)[20]; - context = &c; - #else - #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown freebsd") - (void)mctx; - (void)c; - #endif - #else - #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown system") - (void)c; - #endif + if (jl_simulate_longjmp(*mctx, &c)) + context = &c; #else #pragma message("jl_record_backtrace not defined for unknown task system") #endif diff --git a/src/staticdata.c b/src/staticdata.c index 6dfe5e91a9c55..0a8cbe6db7c67 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -100,7 +100,7 @@ extern "C" { // TODO: put WeakRefs on the weak_refs list during deserialization // TODO: handle finalizers -#define NUM_TAGS 192 +#define NUM_TAGS 193 // An array of references that need to be restored from the sysimg // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C. @@ -312,6 +312,7 @@ jl_value_t **const*const get_tags(void) { INSERT_TAG(jl_builtin_modifyglobal); INSERT_TAG(jl_builtin_replaceglobal); INSERT_TAG(jl_builtin_setglobalonce); + INSERT_TAG(jl_builtin_current_scope); // n.b. must update NUM_TAGS when you add something here #undef INSERT_TAG assert(i == NUM_TAGS - 1); @@ -362,6 +363,9 @@ static void *to_seroder_entry(size_t idx) return (void*)((char*)HT_NOTFOUND + 1 + idx); } +static htable_t new_methtables; +static size_t precompilation_world; + static int ptr_cmp(const void *l, const void *r) { uintptr_t left = *(const uintptr_t*)l; @@ -497,7 +501,7 @@ static htable_t relocatable_ext_cis; // (reverse of fptr_to_id) // This is a manually constructed dual of the fvars array, which would be produced by codegen for Julia code, for C. 
static const jl_fptr_args_t id_to_fptrs[] = { - &jl_f_throw, &jl_f_is, &jl_f_typeof, &jl_f_issubtype, &jl_f_isa, + &jl_f_throw, &jl_f_throw_methoderror, &jl_f_is, &jl_f_typeof, &jl_f_issubtype, &jl_f_isa, &jl_f_typeassert, &jl_f__apply_iterate, &jl_f__apply_pure, &jl_f__call_latest, &jl_f__call_in_world, &jl_f__call_in_world_total, &jl_f_isdefined, &jl_f_tuple, &jl_f_svec, &jl_f_intrinsic_call, @@ -769,22 +773,41 @@ static uintptr_t jl_fptr_id(void *fptr) #define jl_queue_for_serialization(s, v) jl_queue_for_serialization_((s), (jl_value_t*)(v), 1, 0) static void jl_queue_for_serialization_(jl_serializer_state *s, jl_value_t *v, int recursive, int immediate) JL_GC_DISABLED; - static void jl_queue_module_for_serialization(jl_serializer_state *s, jl_module_t *m) JL_GC_DISABLED { jl_queue_for_serialization(s, m->name); jl_queue_for_serialization(s, m->parent); - jl_queue_for_serialization(s, jl_atomic_load_relaxed(&m->bindings)); + if (jl_options.trim) { + jl_queue_for_serialization_(s, (jl_value_t*)jl_atomic_load_relaxed(&m->bindings), 0, 1); + } else { + jl_queue_for_serialization(s, jl_atomic_load_relaxed(&m->bindings)); + } jl_queue_for_serialization(s, jl_atomic_load_relaxed(&m->bindingkeyset)); - if (jl_options.strip_metadata) { + if (jl_options.strip_metadata || jl_options.trim) { jl_svec_t *table = jl_atomic_load_relaxed(&m->bindings); for (size_t i = 0; i < jl_svec_len(table); i++) { jl_binding_t *b = (jl_binding_t*)jl_svecref(table, i); if ((void*)b == jl_nothing) break; - jl_sym_t *name = b->globalref->name; - if (name == jl_docmeta_sym && jl_get_binding_value(b)) - record_field_change((jl_value_t**)&b->value, jl_nothing); + if (jl_options.strip_metadata) { + jl_sym_t *name = b->globalref->name; + if (name == jl_docmeta_sym && jl_get_binding_value(b)) + record_field_change((jl_value_t**)&b->value, jl_nothing); + } + if (jl_options.trim) { + jl_value_t *val = jl_get_binding_value(b); + // keep binding objects that are defined and ... + if (val && + // ... point to modules ... + (jl_is_module(val) || + // ... or point to __init__ methods ... + !strcmp(jl_symbol_name(b->globalref->name), "__init__") || + // ... 
or point to Base functions accessed by the runtime + (m == jl_base_module && (!strcmp(jl_symbol_name(b->globalref->name), "wait") || + !strcmp(jl_symbol_name(b->globalref->name), "task_done_hook"))))) { + jl_queue_for_serialization(s, b); + } + } } } @@ -943,6 +966,23 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ jl_queue_for_serialization_(s, get_replaceable_field((jl_value_t**)&bpart->next, 0), 1, immediate); } else if (layout->nfields > 0) { + if (jl_options.trim) { + if (jl_is_method(v)) { + jl_method_t *m = (jl_method_t *)v; + if (jl_is_svec(jl_atomic_load_relaxed(&m->specializations))) + jl_queue_for_serialization_(s, (jl_value_t*)jl_atomic_load_relaxed(&m->specializations), 0, 1); + } + else if (jl_typetagis(v, jl_typename_type)) { + jl_typename_t *tn = (jl_typename_t*)v; + if (tn->mt != NULL && !tn->mt->frozen) { + jl_methtable_t * new_methtable = (jl_methtable_t *)ptrhash_get(&new_methtables, tn->mt); + if (new_methtable != HT_NOTFOUND) + record_field_change((jl_value_t **)&tn->mt, (jl_value_t*)new_methtable); + else + record_field_change((jl_value_t **)&tn->mt, NULL); + } + } + } char *data = (char*)jl_data_ptr(v); size_t i, np = layout->npointers; for (i = 0; i < np; i++) { @@ -988,6 +1028,7 @@ done_fields: ; } } + static void jl_queue_for_serialization_(jl_serializer_state *s, jl_value_t *v, int recursive, int immediate) JL_GC_DISABLED { if (!jl_needs_serialization(s, v)) @@ -1218,6 +1259,9 @@ static void jl_write_module(jl_serializer_state *s, uintptr_t item, jl_module_t jl_atomic_store_relaxed(&newm->bindingkeyset, NULL); arraylist_push(&s->relocs_list, (void*)(reloc_offset + offsetof(jl_module_t, bindingkeyset))); arraylist_push(&s->relocs_list, (void*)backref_id(s, jl_atomic_load_relaxed(&m->bindingkeyset), s->link_ids_relocs)); + newm->file = NULL; + arraylist_push(&s->relocs_list, (void*)(reloc_offset + offsetof(jl_module_t, file))); + arraylist_push(&s->relocs_list, (void*)backref_id(s, m->file, s->link_ids_relocs)); // write out the usings list memset(&newm->usings._space, 0, sizeof(newm->usings._space)); @@ -1677,7 +1721,7 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED else if (jl_is_method_instance(v)) { assert(f == s->s); jl_method_instance_t *newmi = (jl_method_instance_t*)&f->buf[reloc_offset]; - jl_atomic_store_relaxed(&newmi->precompiled, 0); + jl_atomic_store_relaxed(&newmi->flags, 0); } else if (jl_is_code_instance(v)) { assert(f == s->s); @@ -2406,6 +2450,53 @@ static void jl_prune_type_cache_linear(jl_svec_t *cache) jl_svecset(cache, ins++, jl_nothing); } +uint_t bindingkey_hash(size_t idx, jl_value_t *data); + +static void jl_prune_module_bindings(jl_module_t * m) JL_GC_DISABLED +{ + jl_svec_t * bindings = jl_atomic_load_relaxed(&m->bindings); + size_t l = jl_svec_len(bindings), i; + arraylist_t bindings_list; + arraylist_new(&bindings_list, 0); + if (l == 0) + return; + for (i = 0; i < l; i++) { + jl_value_t *ti = jl_svecref(bindings, i); + if (ti == jl_nothing) + continue; + jl_binding_t *ref = ((jl_binding_t*)ti); + if (!((ptrhash_get(&serialization_order, ref) == HT_NOTFOUND) && + (ptrhash_get(&serialization_order, ref->globalref) == HT_NOTFOUND))) { + jl_svecset(bindings, i, jl_nothing); + arraylist_push(&bindings_list, ref); + } + } + jl_genericmemory_t* bindingkeyset = jl_atomic_load_relaxed(&m->bindingkeyset); + _Atomic(jl_genericmemory_t*)bindingkeyset2; + jl_atomic_store_relaxed(&bindingkeyset2,(jl_genericmemory_t*)jl_an_empty_memory_any); + jl_svec_t *bindings2 = 
jl_alloc_svec_uninit(bindings_list.len); + for (i = 0; i < bindings_list.len; i++) { + jl_binding_t *ref = (jl_binding_t*)bindings_list.items[i]; + jl_svecset(bindings2, i, ref); + jl_smallintset_insert(&bindingkeyset2, (jl_value_t*)m, bindingkey_hash, i, (jl_value_t*)bindings2); + } + void *idx = ptrhash_get(&serialization_order, bindings); + assert(idx != HT_NOTFOUND && idx != (void*)(uintptr_t)-1); + assert(serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] == bindings); + ptrhash_put(&serialization_order, bindings2, idx); + serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] = bindings2; + + idx = ptrhash_get(&serialization_order, bindingkeyset); + assert(idx != HT_NOTFOUND && idx != (void*)(uintptr_t)-1); + assert(serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] == bindingkeyset); + ptrhash_put(&serialization_order, jl_atomic_load_relaxed(&bindingkeyset2), idx); + serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] = jl_atomic_load_relaxed(&bindingkeyset2); + jl_atomic_store_relaxed(&m->bindings, bindings2); + jl_atomic_store_relaxed(&m->bindingkeyset, jl_atomic_load_relaxed(&bindingkeyset2)); + jl_gc_wb(m, bindings2); + jl_gc_wb(m, jl_atomic_load_relaxed(&bindingkeyset2)); +} + static void strip_slotnames(jl_array_t *slotnames) { // replace slot names with `?`, except unused_sym since the compiler looks at it @@ -2472,7 +2563,7 @@ static int strip_all_codeinfos__(jl_typemap_entry_t *def, void *_env) if (m->source) { int stripped_ir = 0; if (jl_options.strip_ir) { - int should_strip_ir = 0; + int should_strip_ir = jl_options.trim; if (!should_strip_ir) { if (jl_atomic_load_relaxed(&m->unspecialized)) { jl_code_instance_t *unspec = jl_atomic_load_relaxed(&jl_atomic_load_relaxed(&m->unspecialized)->cache); @@ -2674,8 +2765,46 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // strip metadata and IR when requested if (jl_options.strip_metadata || jl_options.strip_ir) jl_strip_all_codeinfos(); + // collect needed methods and replace method tables that are in the tags array + htable_new(&new_methtables, 0); + arraylist_t MIs; + arraylist_new(&MIs, 0); + arraylist_t gvars; + arraylist_new(&gvars, 0); + arraylist_t external_fns; + arraylist_new(&external_fns, 0); int en = jl_gc_enable(0); + if (native_functions) { + jl_get_llvm_gvs(native_functions, &gvars); + jl_get_llvm_external_fns(native_functions, &external_fns); + if (jl_options.trim) + jl_get_llvm_mis(native_functions, &MIs); + } + if (jl_options.trim) { + jl_rebuild_methtables(&MIs, &new_methtables); + jl_methtable_t *mt = (jl_methtable_t *)ptrhash_get(&new_methtables, jl_type_type_mt); + JL_GC_PROMISE_ROOTED(mt); + if (mt != HT_NOTFOUND) + jl_type_type_mt = mt; + else + jl_type_type_mt = jl_new_method_table(jl_type_type_mt->name, jl_type_type_mt->module); + + mt = (jl_methtable_t *)ptrhash_get(&new_methtables, jl_kwcall_mt); + JL_GC_PROMISE_ROOTED(mt); + if (mt != HT_NOTFOUND) + jl_kwcall_mt = mt; + else + jl_kwcall_mt = jl_new_method_table(jl_kwcall_mt->name, jl_kwcall_mt->module); + + mt = (jl_methtable_t *)ptrhash_get(&new_methtables, jl_nonfunction_mt); + JL_GC_PROMISE_ROOTED(mt); + if (mt != HT_NOTFOUND) + jl_nonfunction_mt = mt; + else + jl_nonfunction_mt = jl_new_method_table(jl_nonfunction_mt->name, jl_nonfunction_mt->module); + } + nsym_tag = 0; htable_new(&symbol_table, 0); htable_new(&fptr_to_id, sizeof(id_to_fptrs) / sizeof(*id_to_fptrs)); @@ -2721,14 +2850,6 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, 
htable_new(&s.callers_with_edges, 0); jl_value_t **const*const tags = get_tags(); // worklist == NULL ? get_tags() : NULL; - arraylist_t gvars; - arraylist_t external_fns; - arraylist_new(&gvars, 0); - arraylist_new(&external_fns, 0); - if (native_functions) { - jl_get_llvm_gvs(native_functions, &gvars); - jl_get_llvm_external_fns(native_functions, &external_fns); - } if (worklist == NULL) { // empty!(Core.ARGS) @@ -2787,6 +2908,8 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // step 1.2: ensure all gvars are part of the sysimage too record_gvars(&s, &gvars); record_external_fns(&s, &external_fns); + if (jl_options.trim) + record_gvars(&s, &MIs); jl_serialize_reachable(&s); // step 1.3: prune (garbage collect) special weak references from the jl_global_roots_list if (worklist == NULL) { @@ -2807,8 +2930,30 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, // step 1.4: prune (garbage collect) some special weak references from // built-in type caches too for (i = 0; i < serialization_queue.len; i++) { - jl_typename_t *tn = (jl_typename_t*)serialization_queue.items[i]; - if (jl_is_typename(tn)) { + jl_value_t *v = (jl_value_t*)serialization_queue.items[i]; + if (jl_options.trim) { + if (jl_is_method(v)){ + jl_method_t *m = (jl_method_t*)v; + jl_value_t *specializations_ = jl_atomic_load_relaxed(&m->specializations); + if (!jl_is_svec(specializations_)) + continue; + + jl_svec_t *specializations = (jl_svec_t *)specializations_; + size_t l = jl_svec_len(specializations), i; + for (i = 0; i < l; i++) { + jl_value_t *mi = jl_svecref(specializations, i); + if (mi == jl_nothing) + continue; + if (ptrhash_get(&serialization_order, mi) == HT_NOTFOUND) + jl_svecset(specializations, i, jl_nothing); + } + } else if (jl_is_module(v)) { + jl_prune_module_bindings((jl_module_t*)v); + } + } + // Not else + if (jl_is_typename(v)) { + jl_typename_t *tn = (jl_typename_t*)v; jl_atomic_store_relaxed(&tn->cache, jl_prune_type_cache_hash(jl_atomic_load_relaxed(&tn->cache))); jl_gc_wb(tn, jl_atomic_load_relaxed(&tn->cache)); @@ -2917,7 +3062,9 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_write_value(&s, global_roots_keyset); jl_write_value(&s, s.ptls->root_task->tls); write_uint32(f, jl_get_gs_ctr()); - write_uint(f, jl_atomic_load_acquire(&jl_world_counter)); + size_t world = jl_atomic_load_acquire(&jl_world_counter); + // assert(world == precompilation_world); // This triggers on a normal build of julia + write_uint(f, world); write_uint(f, jl_typeinf_world); } else { @@ -2970,6 +3117,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, htable_free(&nullptrs); htable_free(&symbol_table); htable_free(&fptr_to_id); + htable_free(&new_methtables); nsym_tag = 0; jl_gc_enable(en); @@ -2999,6 +3147,10 @@ static void jl_write_header_for_incremental(ios_t *f, jl_array_t *worklist, jl_a JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *worklist, bool_t emit_split, ios_t **s, ios_t **z, jl_array_t **udeps, int64_t *srctextpos) { + if (jl_options.strip_ir || jl_options.trim) { + // make sure this is precompiled for jl_foreach_reachable_mtable + jl_get_loaded_modules(); + } jl_gc_collect(JL_GC_FULL); jl_gc_collect(JL_GC_INCREMENTAL); // sweep finalizers JL_TIMING(SYSIMG_DUMP, SYSIMG_DUMP); @@ -3048,7 +3200,11 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli } } else if (_native_data != NULL) { - *_native_data = 
jl_precompile(jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL); + precompilation_world = jl_atomic_load_acquire(&jl_world_counter); + if (jl_options.trim) + *_native_data = jl_precompile_trimmed(precompilation_world); + else + *_native_data = jl_precompile(jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL); } // Make sure we don't run any Julia code concurrently after this point diff --git a/src/staticdata_utils.c b/src/staticdata_utils.c index f39e5357c6782..8eb223d3cfbde 100644 --- a/src/staticdata_utils.c +++ b/src/staticdata_utils.c @@ -159,7 +159,8 @@ static int has_backedge_to_worklist(jl_method_instance_t *mi, htable_t *visited, if (jl_is_method(mod)) mod = ((jl_method_t*)mod)->module; assert(jl_is_module(mod)); - if (jl_atomic_load_relaxed(&mi->precompiled) || !jl_object_in_image((jl_value_t*)mod) || type_in_worklist(mi->specTypes)) { + uint8_t is_precompiled = jl_atomic_load_relaxed(&mi->flags) & JL_MI_FLAGS_MASK_PRECOMPILED; + if (is_precompiled || !jl_object_in_image((jl_value_t*)mod) || type_in_worklist(mi->specTypes)) { return 1; } if (!mi->backedges) { @@ -752,6 +753,16 @@ static int64_t write_dependency_list(ios_t *s, jl_array_t* worklist, jl_array_t static jl_value_t *replace_depot_func = NULL; if (!replace_depot_func) replace_depot_func = jl_get_global(jl_base_module, jl_symbol("replace_depot_path")); + static jl_value_t *normalize_depots_func = NULL; + if (!normalize_depots_func) + normalize_depots_func = jl_get_global(jl_base_module, jl_symbol("normalize_depots_for_relocation")); + + jl_value_t *depots = NULL, *prefs_hash = NULL, *prefs_list = NULL; + JL_GC_PUSH2(&depots, &prefs_list); + last_age = ct->world_age; + ct->world_age = jl_atomic_load_acquire(&jl_world_counter); + depots = jl_apply(&normalize_depots_func, 1); + ct->world_age = last_age; // write a placeholder for total size so that we can quickly seek past all of the // dependencies if we don't need them @@ -764,13 +775,14 @@ static int64_t write_dependency_list(ios_t *s, jl_array_t* worklist, jl_array_t if (replace_depot_func) { jl_value_t **replace_depot_args; - JL_GC_PUSHARGS(replace_depot_args, 2); + JL_GC_PUSHARGS(replace_depot_args, 3); replace_depot_args[0] = replace_depot_func; replace_depot_args[1] = deppath; + replace_depot_args[2] = depots; ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_atomic_load_acquire(&jl_world_counter); - deppath = (jl_value_t*)jl_apply(replace_depot_args, 2); + deppath = (jl_value_t*)jl_apply(replace_depot_args, 3); ct->world_age = last_age; JL_GC_POP(); } @@ -803,9 +815,6 @@ static int64_t write_dependency_list(ios_t *s, jl_array_t* worklist, jl_array_t write_int32(s, 0); // terminator, for ease of reading // Calculate Preferences hash for current package. 
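
The depot-path rewriting above follows the runtime's usual recipe for calling back into Julia from C: look the function up, root the argument vector, and raise the task's world age around jl_apply. A condensed sketch of that recipe (illustrative only, not part of the patch; it assumes julia.h and a running runtime, and `helper_name` is a placeholder):

// Look up a Base function by name and call it with one argument.
static jl_value_t *call_base_helper(const char *helper_name, jl_value_t *arg)
{
    jl_value_t *func = jl_get_global(jl_base_module, jl_symbol(helper_name));
    if (func == NULL)
        return NULL;
    jl_value_t **args;
    JL_GC_PUSHARGS(args, 2);        // roots func and arg while the callee may allocate
    args[0] = func;
    args[1] = arg;
    jl_task_t *ct = jl_current_task;
    size_t last_age = ct->world_age;
    ct->world_age = jl_atomic_load_acquire(&jl_world_counter);  // make the newest methods callable
    jl_value_t *ret = jl_apply(args, 2);
    ct->world_age = last_age;       // restore the caller's world
    JL_GC_POP();
    return ret;                     // caller is responsible for rooting ret
}
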
- jl_value_t *prefs_hash = NULL; - jl_value_t *prefs_list = NULL; - JL_GC_PUSH1(&prefs_list); if (jl_base_module) { // Toplevel module is the module we're currently compiling, use it to get our preferences hash jl_value_t * toplevel = (jl_value_t*)jl_get_global(jl_base_module, jl_symbol("__toplevel__")); @@ -852,7 +861,7 @@ static int64_t write_dependency_list(ios_t *s, jl_array_t* worklist, jl_array_t write_int32(s, 0); write_uint64(s, 0); } - JL_GC_POP(); // for prefs_list + JL_GC_POP(); // for depots, prefs_list // write a dummy file position to indicate the beginning of the source-text pos = ios_pos(s); diff --git a/src/subtype.c b/src/subtype.c index 4118bbeab649b..65ee4d5916bce 100644 --- a/src/subtype.c +++ b/src/subtype.c @@ -1304,6 +1304,7 @@ static int subtype_tuple(jl_datatype_t *xd, jl_datatype_t *yd, jl_stenv_t *e, in } static int try_subtype_by_bounds(jl_value_t *a, jl_value_t *b, jl_stenv_t *e); +static int has_exists_typevar(jl_value_t *x, jl_stenv_t *e) JL_NOTSAFEPOINT; // `param` means we are currently looking at a parameter of a type constructor // (as opposed to being outside any type constructor, or comparing variable bounds). @@ -1314,7 +1315,7 @@ static int subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param) if (jl_is_uniontype(x)) { if (obviously_egal(x, y)) return 1; - if (e->Runions.depth == 0 && jl_is_typevar(y) && !jl_has_free_typevars(x) && !jl_has_free_typevars(((jl_tvar_t*)y)->ub)) { + if (e->Runions.depth == 0 && jl_is_typevar(y) && !jl_has_free_typevars(x)) { // Similar to fast path for repeated elements: if there have been no outer // unions on the right, and the right side is a typevar, then we can handle the // typevar first before picking a union element, under the theory that it may @@ -1325,7 +1326,17 @@ static int subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param) // free typevars, since the typevars presence might lead to those elements // getting eliminated (omit_bad_union) or degenerate (Union{Ptr{T}, Ptr}) or // combined (Union{T, S} where {T, S <: T}). - return subtype_var((jl_tvar_t*)y, x, e, 1, param); + jl_tvar_t *yvar = (jl_tvar_t *)y; + jl_varbinding_t *yb = lookup(e, yvar); + while (e->intersection && yb != NULL && yb->lb == yb->ub && jl_is_typevar(yb->lb)) { + yvar = (jl_tvar_t *)yb->lb; + yb = lookup(e, yvar); + } + // Note: `x <: ∃y` performs a local ∀-∃ check between `x` and `yb->ub`. + // We need to ensure that there's no ∃ typevar as otherwise that check + // might cause false alarm due to the accumulated env change. + if (yb == NULL || yb->right == 0 || !has_exists_typevar(yb->ub, e)) + return subtype_var(yvar, x, e, 1, param); } x = pick_union_element(x, e, 0); } @@ -2412,24 +2423,47 @@ static jl_value_t *intersect_aside(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, if (obviously_egal(x, y)) return x; + jl_varbinding_t *vars = NULL; + jl_varbinding_t *bbprev = NULL; + jl_varbinding_t *xb = jl_is_typevar(x) ? lookup(e, (jl_tvar_t *)x) : NULL; + jl_varbinding_t *yb = jl_is_typevar(y) ? lookup(e, (jl_tvar_t *)y) : NULL; + int simple_x = !jl_has_free_typevars(!jl_is_typevar(x) ? x : xb ? xb->ub : ((jl_tvar_t *)x)->ub); + int simple_y = !jl_has_free_typevars(!jl_is_typevar(y) ? y : yb ? yb->ub : ((jl_tvar_t *)y)->ub); + if (simple_x && simple_y && !(xb && yb)) { + vars = e->vars; + e->vars = xb ? 
xb : yb; + if (e->vars != NULL) { + bbprev = e->vars->prev; + e->vars->prev = NULL; + } + } jl_saved_unionstate_t oldRunions; push_unionstate(&oldRunions, &e->Runions); int savedepth = e->invdepth; e->invdepth = depth; jl_value_t *res = intersect_all(x, y, e); e->invdepth = savedepth; pop_unionstate(&e->Runions, &oldRunions); + if (bbprev) e->vars->prev = bbprev; + if (vars) e->vars = vars; return res; } static jl_value_t *intersect_union(jl_value_t *x, jl_uniontype_t *u, jl_stenv_t *e, int8_t R, int param) { - if (param == 2 || (!jl_has_free_typevars(x) && !jl_has_free_typevars((jl_value_t*)u))) { + int no_free = !jl_has_free_typevars(x) && !jl_has_free_typevars((jl_value_t*)u); + if (param == 2 || no_free) { jl_value_t *a=NULL, *b=NULL; JL_GC_PUSH2(&a, &b); + jl_varbinding_t *vars = NULL; + if (no_free) { + vars = e->vars; + e->vars = NULL; + } jl_saved_unionstate_t oldRunions; push_unionstate(&oldRunions, &e->Runions); a = R ? intersect_all(x, u->a, e) : intersect_all(u->a, x, e); b = R ? intersect_all(x, u->b, e) : intersect_all(u->b, x, e); pop_unionstate(&e->Runions, &oldRunions); + if (vars) e->vars = vars; jl_value_t *i = simple_join(a,b); JL_GC_POP(); return i; @@ -4127,9 +4161,13 @@ static jl_value_t *intersect_all(jl_value_t *x, jl_value_t *y, jl_stenv_t *e) save_env(e, &se, 1); int niter = 0, total_iter = 0; is[0] = intersect(x, y, e, 0); // root - if (is[0] != jl_bottom_type) + if (is[0] == jl_bottom_type) { + restore_env(e, &se, 1); + } + else if (!e->emptiness_only && has_next_union_state(e, 1)) { niter = merge_env(e, &me, &se, niter); - restore_env(e, &se, 1); + restore_env(e, &se, 1); + } while (next_union_state(e, 1)) { if (e->emptiness_only && is[0] != jl_bottom_type) break; @@ -4137,9 +4175,16 @@ static jl_value_t *intersect_all(jl_value_t *x, jl_value_t *y, jl_stenv_t *e) e->Runions.more = 0; is[1] = intersect(x, y, e, 0); - if (is[1] != jl_bottom_type) + if (is[1] == jl_bottom_type) { + restore_env(e, &se, 1); + } + else if (niter > 0 || (!e->emptiness_only && has_next_union_state(e, 1))) { niter = merge_env(e, &me, &se, niter); - restore_env(e, &se, 1); + restore_env(e, &se, 1); + } + else { + assert(is[0] == jl_bottom_type); + } if (is[0] == jl_bottom_type) is[0] = is[1]; else if (is[1] != jl_bottom_type) { diff --git a/src/support/arraylist.h b/src/support/arraylist.h index 6ad2f0e2f28c9..a83bd2808756c 100644 --- a/src/support/arraylist.h +++ b/src/support/arraylist.h @@ -20,11 +20,11 @@ typedef struct { void *_space[AL_N_INLINE]; } arraylist_t; -arraylist_t *arraylist_new(arraylist_t *a, size_t size) JL_NOTSAFEPOINT; -void arraylist_free(arraylist_t *a) JL_NOTSAFEPOINT; +JL_DLLEXPORT arraylist_t *arraylist_new(arraylist_t *a, size_t size) JL_NOTSAFEPOINT; +JL_DLLEXPORT void arraylist_free(arraylist_t *a) JL_NOTSAFEPOINT; -void arraylist_push(arraylist_t *a, void *elt) JL_NOTSAFEPOINT; -void *arraylist_pop(arraylist_t *a) JL_NOTSAFEPOINT; +JL_DLLEXPORT void arraylist_push(arraylist_t *a, void *elt) JL_NOTSAFEPOINT; +JL_DLLEXPORT void *arraylist_pop(arraylist_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void arraylist_grow(arraylist_t *a, size_t n) JL_NOTSAFEPOINT; typedef struct { @@ -34,11 +34,12 @@ typedef struct { void *_space[SMALL_AL_N_INLINE]; } small_arraylist_t; -small_arraylist_t *small_arraylist_new(small_arraylist_t *a, uint32_t size) JL_NOTSAFEPOINT; -void small_arraylist_free(small_arraylist_t *a) JL_NOTSAFEPOINT; -void small_arraylist_push(small_arraylist_t *a, void *elt) JL_NOTSAFEPOINT; -void *small_arraylist_pop(small_arraylist_t *a) JL_NOTSAFEPOINT; 
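The subtype.c hunks above adjust how union elements and existential type variables are matched and how the intersection environment is saved and restored. These internal routines are what ultimately answer ordinary `<:` and `Base.typeintersect` queries; a few representative queries, purely as orientation (nothing below is specific to this patch):

```julia
# Judgments of the general shape handled by subtype() / intersect_all().
Tuple{Union{Int,String}} <: (Tuple{T} where {T})              # true
Base.typeintersect(Union{Int,String}, AbstractString)         # String
Base.typeintersect(Tuple{Int,String}, Tuple{T,T} where {T})   # Union{} (diagonal rule)
```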
+JL_DLLEXPORT small_arraylist_t *small_arraylist_new(small_arraylist_t *a, uint32_t size) JL_NOTSAFEPOINT; +JL_DLLEXPORT void small_arraylist_free(small_arraylist_t *a) JL_NOTSAFEPOINT; + +JL_DLLEXPORT void small_arraylist_push(small_arraylist_t *a, void *elt) JL_NOTSAFEPOINT; +JL_DLLEXPORT void *small_arraylist_pop(small_arraylist_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void small_arraylist_grow(small_arraylist_t *a, uint32_t n) JL_NOTSAFEPOINT; #ifdef __cplusplus diff --git a/src/sys.c b/src/sys.c index 107a8f7637763..fa9054bb93e9a 100644 --- a/src/sys.c +++ b/src/sys.c @@ -102,7 +102,6 @@ JL_DLLEXPORT int32_t jl_nb_available(ios_t *s) // --- dir/file stuff --- -JL_DLLEXPORT int jl_sizeof_uv_fs_t(void) { return sizeof(uv_fs_t); } JL_DLLEXPORT char *jl_uv_fs_t_ptr(uv_fs_t *req) { return (char*)req->ptr; } JL_DLLEXPORT char *jl_uv_fs_t_path(uv_fs_t *req) { return (char*)req->path; } @@ -478,25 +477,10 @@ JL_DLLEXPORT int jl_cpu_threads(void) JL_NOTSAFEPOINT JL_DLLEXPORT int jl_effective_threads(void) JL_NOTSAFEPOINT { - int cpu = jl_cpu_threads(); - int masksize = uv_cpumask_size(); - if (masksize < 0 || jl_running_under_rr(0)) - return cpu; - uv_thread_t tid = uv_thread_self(); - char *cpumask = (char *)calloc(masksize, sizeof(char)); - int err = uv_thread_getaffinity(&tid, cpumask, masksize); - if (err) { - free(cpumask); - jl_safe_printf("WARNING: failed to get thread affinity (%s %d)\n", uv_err_name(err), - err); - return cpu; - } - int n = 0; - for (size_t i = 0; i < masksize; i++) { - n += cpumask[i]; - } - free(cpumask); - return n < cpu ? n : cpu; + // We want the more conservative estimate of the two. + int cpu_threads = jl_cpu_threads(); + int available_parallelism = uv_available_parallelism(); + return available_parallelism < cpu_threads ? available_parallelism : cpu_threads; } @@ -787,26 +771,11 @@ JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT JL_DLLEXPORT size_t jl_maxrss(void) { -#if defined(_OS_WINDOWS_) - PROCESS_MEMORY_COUNTERS counter; - GetProcessMemoryInfo( GetCurrentProcess( ), &counter, sizeof(counter) ); - return (size_t)counter.PeakWorkingSetSize; - -// FIXME: `rusage` is available on OpenBSD, DragonFlyBSD and NetBSD as well. -// All of them return `ru_maxrss` in kilobytes. -#elif defined(_OS_LINUX_) || defined(_OS_DARWIN_) || defined (_OS_FREEBSD_) || defined (_OS_OPENBSD_) - struct rusage rusage; - getrusage( RUSAGE_SELF, &rusage ); - -#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) || defined (_OS_OPENBSD_) - return (size_t)(rusage.ru_maxrss * 1024); -#else - return (size_t)rusage.ru_maxrss; -#endif - -#else - return (size_t)0; -#endif + uv_rusage_t rusage; + if (uv_getrusage(&rusage) == 0) { + return rusage.ru_maxrss * 1024; + } + return 0; } // Simple `rand()` like function, with global seed and added thread-safety diff --git a/src/task.c b/src/task.c index 86acac23a186a..f86e0ab3a880d 100644 --- a/src/task.c +++ b/src/task.c @@ -771,48 +771,31 @@ JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e, jl_task_t *ct) #define pop_timings_stack() /* Nothing */ #endif -#define throw_internal_body(altstack) \ - assert(!jl_get_safe_restore()); \ - jl_ptls_t ptls = ct->ptls; \ - ptls->io_wait = 0; \ - jl_gc_unsafe_enter(ptls); \ - if (exception) { \ - /* The temporary ptls->bt_data is rooted by special purpose code in the\ - GC. This exists only for the purpose of preserving bt_data until we \ - set ptls->bt_size=0 below. 
*/ \ - jl_push_excstack(ct, &ct->excstack, exception, \ - ptls->bt_data, ptls->bt_size); \ - ptls->bt_size = 0; \ - } \ - assert(ct->excstack && ct->excstack->top); \ - jl_handler_t *eh = ct->eh; \ - if (eh != NULL) { \ - if (altstack) ptls->sig_exception = NULL; \ - pop_timings_stack() \ - asan_unpoison_task_stack(ct, &eh->eh_ctx); \ - jl_longjmp(eh->eh_ctx, 1); \ - } \ - else { \ - jl_no_exc_handler(exception, ct); \ - } \ - assert(0); - static void JL_NORETURN throw_internal(jl_task_t *ct, jl_value_t *exception JL_MAYBE_UNROOTED) { -CFI_NORETURN JL_GC_PUSH1(&exception); - throw_internal_body(0); - jl_unreachable(); -} - -/* On the signal stack, we don't want to create any asan frames, but we do on the - normal, stack, so we split this function in two, depending on which context - we're calling it in. This also lets us avoid making a GC frame on the altstack, - which might end up getting corrupted if we recur here through another signal. */ -JL_NO_ASAN static void JL_NORETURN throw_internal_altstack(jl_task_t *ct, jl_value_t *exception) -{ -CFI_NORETURN - throw_internal_body(1); + jl_ptls_t ptls = ct->ptls; + ptls->io_wait = 0; + jl_gc_unsafe_enter(ptls); + if (exception) { + /* The temporary ptls->bt_data is rooted by special purpose code in the\ + GC. This exists only for the purpose of preserving bt_data until we + set ptls->bt_size=0 below. */ + jl_push_excstack(ct, &ct->excstack, exception, + ptls->bt_data, ptls->bt_size); + ptls->bt_size = 0; + } + assert(ct->excstack && ct->excstack->top); + jl_handler_t *eh = ct->eh; + if (eh != NULL) { + pop_timings_stack() + asan_unpoison_task_stack(ct, &eh->eh_ctx); + jl_longjmp(eh->eh_ctx, 1); + } + else { + jl_no_exc_handler(exception, ct); + } + assert(0); jl_unreachable(); } @@ -842,24 +825,6 @@ JL_DLLEXPORT void jl_rethrow(void) throw_internal(ct, NULL); } -// Special case throw for errors detected inside signal handlers. This is not -// (cannot be) called directly in the signal handler itself, but is returned to -// after the signal handler exits. 
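The `jl_effective_threads` rewrite above takes the more conservative of the logical CPU count and libuv's `uv_available_parallelism()`, which accounts for affinity masks and similar limits. Because the function is `JL_DLLEXPORT`ed, Julia code can query it directly; the LinearAlgebra hunk later in this patch does exactly that when choosing a BLAS thread default. A minimal illustration:

```julia
# Ask libjulia for the conservative thread estimate and compare it with the
# raw logical CPU count reported by Julia.
effective = Int(@ccall jl_effective_threads()::Cint)
effective <= Sys.CPU_THREADS   # the effective count should not exceed the logical count
```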
-JL_DLLEXPORT JL_NO_ASAN void JL_NORETURN jl_sig_throw(void) -{ -CFI_NORETURN - jl_jmp_buf *safe_restore = jl_get_safe_restore(); - jl_task_t *ct = jl_current_task; - if (safe_restore) { - asan_unpoison_task_stack(ct, safe_restore); - jl_longjmp(*safe_restore, 1); - } - jl_ptls_t ptls = ct->ptls; - jl_value_t *e = ptls->sig_exception; - JL_GC_PROMISE_ROOTED(e); - throw_internal_altstack(ct, e); -} - JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED) { // TODO: Should uses of `rethrow(exc)` be replaced with a normal throw, now diff --git a/src/threading.c b/src/threading.c index a6050ace01833..44b1192528531 100644 --- a/src/threading.c +++ b/src/threading.c @@ -74,6 +74,16 @@ JL_DLLEXPORT jl_jmp_buf *jl_get_safe_restore(void) JL_DLLEXPORT void jl_set_safe_restore(jl_jmp_buf *sr) { +#ifdef _OS_DARWIN_ + jl_task_t *ct = jl_get_current_task(); + if (ct != NULL && ct->ptls) { + if (sr == NULL) + pthread_setspecific(jl_safe_restore_key, (void*)sr); + ct->ptls->safe_restore = sr; + if (sr == NULL) + return; + } +#endif pthread_setspecific(jl_safe_restore_key, (void*)sr); } #endif @@ -304,6 +314,18 @@ JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT return -1; // everything else uses threadpool -1 (does not belong to any threadpool) } +// get thread local rng +JL_DLLEXPORT uint64_t jl_get_ptls_rng(void) JL_NOTSAFEPOINT +{ + return jl_current_task->ptls->rngseed; +} + +// get thread local rng +JL_DLLEXPORT void jl_set_ptls_rng(uint64_t new_seed) JL_NOTSAFEPOINT +{ + jl_current_task->ptls->rngseed = new_seed; +} + jl_ptls_t jl_init_threadtls(int16_t tid) { #ifndef _OS_WINDOWS_ diff --git a/src/toplevel.c b/src/toplevel.c index 5d17a3fcf89a7..8caa8b086ec00 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -213,6 +213,10 @@ static jl_value_t *jl_eval_module_expr(jl_module_t *parent_module, jl_expr_t *ex form = NULL; } + newm->file = jl_symbol(filename); + jl_gc_wb_knownold(newm, newm->file); + newm->line = lineno; + for (int i = 0; i < jl_array_nrows(exprs); i++) { // process toplevel form ct->world_age = jl_atomic_load_acquire(&jl_world_counter); diff --git a/stdlib/Artifacts/src/Artifacts.jl b/stdlib/Artifacts/src/Artifacts.jl index bfc884cc30634..9bca72f6c7a14 100644 --- a/stdlib/Artifacts/src/Artifacts.jl +++ b/stdlib/Artifacts/src/Artifacts.jl @@ -175,13 +175,11 @@ function load_overrides(;force::Bool = false)::Dict{Symbol, Any} end end - overrides = Dict{Symbol,Any}( - # Overrides by UUID - :UUID => overrides_uuid, - - # Overrides by hash - :hash => overrides_hash - ) + overrides = Dict{Symbol,Any}() + # Overrides by UUID + overrides[:UUID] = overrides_uuid + # Overrides by hash + overrides[:hash] = overrides_hash ARTIFACT_OVERRIDES[] = overrides return overrides @@ -351,7 +349,7 @@ function process_overrides(artifact_dict::Dict, pkg_uuid::Base.UUID) # If we've got a platform-specific friend, override all hashes: artifact_dict_name = artifact_dict[name] - if isa(artifact_dict_name, Array) + if isa(artifact_dict_name, Vector{Any}) for entry in artifact_dict_name entry = entry::Dict{String,Any} hash = SHA1(entry["git-tree-sha1"]::String) @@ -544,7 +542,7 @@ function jointail(dir, tail) end end -function _artifact_str(__module__, artifacts_toml, name, path_tail, artifact_dict, hash, platform, @nospecialize(lazyartifacts)) +function _artifact_str(__module__, artifacts_toml, name, path_tail, artifact_dict, hash, platform, ::Val{LazyArtifacts}) where LazyArtifacts pkg = Base.PkgId(__module__) if pkg.uuid !== nothing # Process overrides for this UUID, if we know what 
it is @@ -563,11 +561,11 @@ function _artifact_str(__module__, artifacts_toml, name, path_tail, artifact_dic # If not, try determining what went wrong: meta = artifact_meta(name, artifact_dict, artifacts_toml; platform) if meta !== nothing && get(meta, "lazy", false) - if lazyartifacts isa Module && isdefined(lazyartifacts, :ensure_artifact_installed) - if nameof(lazyartifacts) in (:Pkg, :Artifacts) + if LazyArtifacts isa Module && isdefined(LazyArtifacts, :ensure_artifact_installed) + if nameof(LazyArtifacts) in (:Pkg, :Artifacts) Base.depwarn("using Pkg instead of using LazyArtifacts is deprecated", :var"@artifact_str", force=true) end - return jointail(lazyartifacts.ensure_artifact_installed(string(name), meta, artifacts_toml; platform), path_tail) + return jointail(LazyArtifacts.ensure_artifact_installed(string(name), meta, artifacts_toml; platform), path_tail) end error("Artifact $(repr(name)) is a lazy artifact; package developers must call `using LazyArtifacts` in $(__module__) before using lazy artifacts.") end @@ -699,10 +697,10 @@ macro artifact_str(name, platform=nothing) # Check if the user has provided `LazyArtifacts`, and thus supports lazy artifacts # If not, check to see if `Pkg` or `Pkg.Artifacts` has been imported. - lazyartifacts = nothing + LazyArtifacts = nothing for module_name in (:LazyArtifacts, :Pkg, :Artifacts) if isdefined(__module__, module_name) - lazyartifacts = GlobalRef(__module__, module_name) + LazyArtifacts = GlobalRef(__module__, module_name) break end end @@ -714,7 +712,7 @@ macro artifact_str(name, platform=nothing) platform = HostPlatform() artifact_name, artifact_path_tail, hash = artifact_slash_lookup(name, artifact_dict, artifacts_toml, platform) return quote - Base.invokelatest(_artifact_str, $(__module__), $(artifacts_toml), $(artifact_name), $(artifact_path_tail), $(artifact_dict), $(hash), $(platform), $(lazyartifacts))::String + Base.invokelatest(_artifact_str, $(__module__), $(artifacts_toml), $(artifact_name), $(artifact_path_tail), $(artifact_dict), $(hash), $(platform), Val($(LazyArtifacts)))::String end else if platform === nothing @@ -723,7 +721,7 @@ macro artifact_str(name, platform=nothing) return quote local platform = $(esc(platform)) local artifact_name, artifact_path_tail, hash = artifact_slash_lookup($(esc(name)), $(artifact_dict), $(artifacts_toml), platform) - Base.invokelatest(_artifact_str, $(__module__), $(artifacts_toml), artifact_name, artifact_path_tail, $(artifact_dict), hash, platform, $(lazyartifacts))::String + Base.invokelatest(_artifact_str, $(__module__), $(artifacts_toml), artifact_name, artifact_path_tail, $(artifact_dict), hash, platform, Val($(LazyArtifacts)))::String end end end diff --git a/stdlib/Dates/test/types.jl b/stdlib/Dates/test/types.jl index f5284b376ca4a..29395ccf3a271 100644 --- a/stdlib/Dates/test/types.jl +++ b/stdlib/Dates/test/types.jl @@ -263,7 +263,11 @@ end end @testset "issue #31524" begin - dt1 = Libc.strptime("%Y-%M-%dT%H:%M:%SZ", "2018-11-16T10:26:14Z") + # Ensure the result doesn't depend on local timezone, especially on macOS + # where an extra internal call to `mktime` is affected by timezone settings. 
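On the Artifacts change above: passing the provider as `Val($(LazyArtifacts))` lifts the module (or `nothing`) into the type domain, so `_artifact_str` gets a separate specialization per provider instead of branching on a runtime value. A generic sketch of that pattern, using symbols rather than the actual Artifacts internals:

```julia
# Hypothetical names; only meant to show Val-based specialization.
provider_desc(::Val{nothing}) = "no lazy-artifact provider loaded"
provider_desc(::Val{name}) where {name} = "lazy artifacts handled by $name"

provider_desc(Val(nothing))         # "no lazy-artifact provider loaded"
provider_desc(Val(:LazyArtifacts))  # "lazy artifacts handled by LazyArtifacts"
```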
+ dt1 = withenv("TZ" => "UTC") do + Libc.strptime("%Y-%m-%dT%H:%M:%SZ", "2018-11-16T10:26:14Z") + end dt2 = Libc.TmStruct(14, 30, 5, 10, 1, 99, 3, 40, 0) time = Time(dt1) diff --git a/stdlib/Downloads.version b/stdlib/Downloads.version index cb041d86d7f66..b539771fbdb47 100644 --- a/stdlib/Downloads.version +++ b/stdlib/Downloads.version @@ -1,4 +1,4 @@ DOWNLOADS_BRANCH = master -DOWNLOADS_SHA1 = 1061ecc377a053fce0df94e1a19e5260f7c030f5 +DOWNLOADS_SHA1 = 89d3c7dded535a77551e763a437a6d31e4d9bf84 DOWNLOADS_GIT_URL := https://github.com/JuliaLang/Downloads.jl.git DOWNLOADS_TAR_URL = https://api.github.com/repos/JuliaLang/Downloads.jl/tarball/$1 diff --git a/stdlib/FileWatching/docs/src/index.md b/stdlib/FileWatching/docs/src/index.md index 1b2212fcc5a28..15d4e39a45117 100644 --- a/stdlib/FileWatching/docs/src/index.md +++ b/stdlib/FileWatching/docs/src/index.md @@ -5,11 +5,17 @@ EditURL = "https://github.com/JuliaLang/julia/blob/master/stdlib/FileWatching/do # [File Events](@id lib-filewatching) ```@docs -FileWatching.poll_fd -FileWatching.poll_file -FileWatching.watch_file -FileWatching.watch_folder -FileWatching.unwatch_folder +poll_fd +poll_file +watch_file +watch_folder +unwatch_folder +``` +```@docs +FileMonitor +FolderMonitor +PollingFileWatcher +FDWatcher ``` # Pidfile diff --git a/stdlib/FileWatching/src/FileWatching.jl b/stdlib/FileWatching/src/FileWatching.jl index 0c987ad01c828..7c743ce634193 100644 --- a/stdlib/FileWatching/src/FileWatching.jl +++ b/stdlib/FileWatching/src/FileWatching.jl @@ -6,7 +6,7 @@ Utilities for monitoring files and file descriptors for events. module FileWatching export - # one-shot API (returns results): + # one-shot API (returns results, race-y): watch_file, # efficient for small numbers of files watch_folder, # efficient for large numbers of files unwatch_folder, @@ -22,11 +22,11 @@ export trymkpidlock import Base: @handle_as, wait, close, eventloop, notify_error, IOError, - _sizeof_uv_poll, _sizeof_uv_fs_poll, _sizeof_uv_fs_event, _uv_hook_close, uv_error, _UVError, - iolock_begin, iolock_end, associate_julia_struct, disassociate_julia_struct, - preserve_handle, unpreserve_handle, isreadable, iswritable, isopen, - |, getproperty, propertynames -import Base.Filesystem.StatStruct + uv_req_data, uv_req_set_data, associate_julia_struct, disassociate_julia_struct, + _sizeof_uv_poll, _sizeof_uv_fs, _sizeof_uv_fs_event, _uv_hook_close, uv_error, _UVError, + iolock_begin, iolock_end, preserve_handle, unpreserve_handle, + isreadable, iswritable, isopen, |, getproperty, propertynames +import Base.Filesystem: StatStruct, uv_fs_req_cleanup if Sys.iswindows() import Base.WindowsRawSocket end @@ -38,13 +38,13 @@ const UV_CHANGE = Int32(2) struct FileEvent renamed::Bool changed::Bool - timedout::Bool + timedout::Bool # aka canceled FileEvent(r::Bool, c::Bool, t::Bool) = new(r, c, t) end FileEvent() = FileEvent(false, false, true) FileEvent(flags::Integer) = FileEvent((flags & UV_RENAME) != 0, (flags & UV_CHANGE) != 0, - false) + iszero(flags)) |(a::FileEvent, b::FileEvent) = FileEvent(a.renamed | b.renamed, a.changed | b.changed, @@ -78,34 +78,183 @@ isreadable(f::FDEvent) = f.readable iswritable(f::FDEvent) = f.writable |(a::FDEvent, b::FDEvent) = FDEvent(getfield(a, :events) | getfield(b, :events)) +# Callback functions + +function uv_fseventscb_file(handle::Ptr{Cvoid}, filename::Ptr, events::Int32, status::Int32) + t = @handle_as handle FileMonitor + lock(t.notify) + try + if status != 0 + t.ioerrno = status + notify_error(t.notify, _UVError("FileMonitor", status)) 
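The Dates test fix above pins `TZ` so the parse cannot be skewed by the machine's timezone. The same pattern in isolation, with the format and input string taken from that test:

```julia
using Dates
# With TZ pinned, Libc.strptime (and the extra mktime call it performs on macOS)
# yields the same fields on every machine.
tm = withenv("TZ" => "UTC") do
    Libc.strptime("%Y-%m-%dT%H:%M:%SZ", "2018-11-16T10:26:14Z")
end
Time(tm)   # expected to be 10:26:14 independent of the local timezone
```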
+ uvfinalize(t) + elseif events != t.events + events = t.events |= events + notify(t.notify, all=false) + end + finally + unlock(t.notify) + end + nothing +end + +function uv_fseventscb_folder(handle::Ptr{Cvoid}, filename::Ptr, events::Int32, status::Int32) + t = @handle_as handle FolderMonitor + lock(t.notify) + try + if status != 0 + notify_error(t.notify, _UVError("FolderMonitor", status)) + else + fname = (filename == C_NULL) ? "" : unsafe_string(convert(Cstring, filename)) + push!(t.channel, fname => FileEvent(events)) + notify(t.notify) + end + finally + unlock(t.notify) + end + nothing +end + +function uv_pollcb(handle::Ptr{Cvoid}, status::Int32, events::Int32) + t = @handle_as handle _FDWatcher + lock(t.notify) + try + if status != 0 + notify_error(t.notify, _UVError("FDWatcher", status)) + else + t.events |= events + if t.active[1] || t.active[2] + if isempty(t.notify) + # if we keep hearing about events when nobody appears to be listening, + # stop the poll to save cycles + t.active = (false, false) + ccall(:uv_poll_stop, Int32, (Ptr{Cvoid},), t.handle) + end + end + notify(t.notify, events) + end + finally + unlock(t.notify) + end + nothing +end + +function uv_fspollcb(req::Ptr{Cvoid}) + pfw = unsafe_pointer_to_objref(uv_req_data(req))::PollingFileWatcher + pfw.active = false + unpreserve_handle(pfw) + @assert pointer(pfw.stat_req) == req + r = Int32(ccall(:uv_fs_get_result, Cssize_t, (Ptr{Cvoid},), req)) + statbuf = ccall(:uv_fs_get_statbuf, Ptr{UInt8}, (Ptr{Cvoid},), req) + curr_stat = StatStruct(pfw.file, statbuf, r) + uv_fs_req_cleanup(req) + lock(pfw.notify) + try + if !isempty(pfw.notify) # must discard the update if nobody watching + if pfw.ioerrno != r || (r == 0 && pfw.prev_stat != curr_stat) + if r == 0 + pfw.prev_stat = curr_stat + end + pfw.ioerrno = r + notify(pfw.notify, true) + end + pfw.timer = Timer(pfw.interval) do t + # async task + iolock_begin() + lock(pfw.notify) + try + if pfw.timer === t # use identity check to test if this callback is stale by the time we got the lock + pfw.timer = nothing + @assert !pfw.active + if isopen(pfw) && !isempty(pfw.notify) + preserve_handle(pfw) + uv_jl_fspollcb = @cfunction(uv_fspollcb, Cvoid, (Ptr{Cvoid},)) + err = ccall(:uv_fs_stat, Cint, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, Ptr{Cvoid}), + eventloop(), pfw.stat_req, pfw.file, uv_jl_fspollcb::Ptr{Cvoid}) + err == 0 || notify(pfw.notify, _UVError("PollingFileWatcher (start)", err), error=true) # likely just ENOMEM + pfw.active = true + end + end + finally + unlock(pfw.notify) + end + iolock_end() + nothing + end + end + finally + unlock(pfw.notify) + end + nothing +end + +# Types + +""" + FileMonitor(path::AbstractString) + +Watch file or directory `path` (which must exist) for changes until a change occurs. This +function does not poll the file system and instead uses platform-specific functionality to +receive notifications from the operating system (e.g. via inotify on Linux). See the NodeJS +documentation linked below for details. + +`fm = FileMonitor(path)` acts like an auto-reset Event, so `wait(fm)` blocks until there has +been at least one event in the file originally at the given path and then returns an object +with boolean fields `renamed`, `changed`, `timedout` summarizing all changes that have +occurred since the last call to `wait` returned. + +This behavior of this function varies slightly across platforms. See + for more detailed information. 
+""" mutable struct FileMonitor @atomic handle::Ptr{Cvoid} - file::String - notify::Base.ThreadSynchronizer - events::Int32 - active::Bool + const file::String + const notify::Base.ThreadSynchronizer + events::Int32 # accumulator for events that occurred since the last wait call, similar to Event with autoreset + ioerrno::Int32 # record the error, if any occurs (unlikely) FileMonitor(file::AbstractString) = FileMonitor(String(file)) function FileMonitor(file::String) handle = Libc.malloc(_sizeof_uv_fs_event) - this = new(handle, file, Base.ThreadSynchronizer(), 0, false) + this = new(handle, file, Base.ThreadSynchronizer(), 0, 0) associate_julia_struct(handle, this) iolock_begin() err = ccall(:uv_fs_event_init, Cint, (Ptr{Cvoid}, Ptr{Cvoid}), eventloop(), handle) if err != 0 Libc.free(handle) - throw(_UVError("FileMonitor", err)) + uv_error("FileMonitor", err) end - iolock_end() finalizer(uvfinalize, this) + uv_jl_fseventscb_file = @cfunction(uv_fseventscb_file, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32)) + uv_error("FileMonitor (start)", + ccall(:uv_fs_event_start, Int32, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, Int32), + this.handle, uv_jl_fseventscb_file::Ptr{Cvoid}, file, 0)) + iolock_end() return this end end + +""" + FolderMonitor(folder::AbstractString) + +Watch a file or directory `path` for changes until a change has occurred. This function does +not poll the file system and instead uses platform-specific functionality to receive +notifications from the operating system (e.g. via inotify on Linux). See the NodeJS +documentation linked below for details. + +This acts similar to a Channel, so calling `take!` (or `wait`) blocks until some change has +occurred. The `wait` function will return a pair where the first field is the name of the +changed file (if available) and the second field is an object with boolean fields `renamed` +and `changed`, giving the event that occurred on it. + +This behavior of this function varies slightly across platforms. See + for more detailed information. +""" mutable struct FolderMonitor @atomic handle::Ptr{Cvoid} # notify::Channel{Any} # eltype = Union{Pair{String, FileEvent}, IOError} - notify::Base.ThreadSynchronizer - channel::Vector{Any} # eltype = Pair{String, FileEvent} + const notify::Base.ThreadSynchronizer + const channel::Vector{Any} # eltype = Pair{String, FileEvent} FolderMonitor(folder::AbstractString) = FolderMonitor(String(folder)) function FolderMonitor(folder::String) handle = Libc.malloc(_sizeof_uv_fs_event) @@ -118,6 +267,7 @@ mutable struct FolderMonitor throw(_UVError("FolderMonitor", err)) end finalizer(uvfinalize, this) + uv_jl_fseventscb_folder = @cfunction(uv_fseventscb_folder, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32)) uv_error("FolderMonitor (start)", ccall(:uv_fs_event_start, Int32, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, Int32), handle, uv_jl_fseventscb_folder::Ptr{Cvoid}, folder, 0)) @@ -126,36 +276,55 @@ mutable struct FolderMonitor end end +# this is similar to uv_fs_poll, but strives to avoid the design mistakes that make it unsuitable for any usable purpose +# https://github.com/libuv/libuv/issues/4543 +""" + PollingFileWatcher(path::AbstractString, interval_s::Real=5.007) + +Monitor a file for changes by polling `stat` every `interval_s` seconds until a change +occurs or `timeout_s` seconds have elapsed. The `interval_s` should be a long period; the +default is 5.007 seconds. Call `stat` on it to get the most recent, but old, result. 
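A small usage sketch to go with the rewritten `PollingFileWatcher` above; the file name and interval are made up, and the behavior shown (a `(previous, current)` pair from `wait`, `EOFError` on `close`) follows the docstrings added in this hunk:

```julia
using FileWatching
pfw = FileWatching.PollingFileWatcher("config.toml", 1.0)  # poll stat roughly every second
prev, curr = wait(pfw)   # blocks until the stat of "config.toml" changes
close(pfw)               # a wait that is still pending returns (prev, EOFError())
```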
+ +This acts like an auto-reset Event, so calling `wait` blocks until the `stat` result has +changed since the previous value captured upon entry to the `wait` call. The `wait` function +will return a pair of status objects `(previous, current)` once any `stat` change is +detected since the previous time that `wait` was called. The `previous` status is always a +`StatStruct`, but it may have all of the fields zeroed (indicating the file didn't +previously exist, or wasn't previously accessible). + +The `current` status object may be a `StatStruct`, an `EOFError` (if the wait is canceled by +closing this object), or some other `Exception` subtype (if the `stat` operation failed: for +example, if the path is removed). Note that `stat` value may be outdated if the file has +changed again multiple times. + +Using [`FileMonitor`](@ref) for this operation is preferred, since it is more reliable and +efficient, although in some situations it may not be available. +""" mutable struct PollingFileWatcher - @atomic handle::Ptr{Cvoid} file::String - interval::UInt32 - notify::Base.ThreadSynchronizer - active::Bool - curr_error::Int32 - curr_stat::StatStruct + interval::Float64 + const notify::Base.ThreadSynchronizer # lock protects all fields which can be changed (including interval and file, if you really must) + timer::Union{Nothing,Timer} + const stat_req::Memory{UInt8} + active::Bool # whether there is already an uv_fspollcb in-flight, so to speak + closed::Bool # whether the user has explicitly destroyed this + ioerrno::Int32 # the stat errno as of the last result + prev_stat::StatStruct # the stat as of the last successful result PollingFileWatcher(file::AbstractString, interval::Float64=5.007) = PollingFileWatcher(String(file), interval) function PollingFileWatcher(file::String, interval::Float64=5.007) # same default as nodejs - handle = Libc.malloc(_sizeof_uv_fs_poll) - this = new(handle, file, round(UInt32, interval * 1000), Base.ThreadSynchronizer(), false, 0, StatStruct()) - associate_julia_struct(handle, this) - iolock_begin() - err = ccall(:uv_fs_poll_init, Int32, (Ptr{Cvoid}, Ptr{Cvoid}), eventloop(), handle) - if err != 0 - Libc.free(handle) - throw(_UVError("PollingFileWatcher", err)) - end - finalizer(uvfinalize, this) - iolock_end() + stat_req = Memory{UInt8}(undef, Int(_sizeof_uv_fs)) + this = new(file, interval, Base.ThreadSynchronizer(), nothing, stat_req, false, false, 0, StatStruct()) + uv_req_set_data(stat_req, this) + wait(this) # initialize with the current stat before return return this end end mutable struct _FDWatcher @atomic handle::Ptr{Cvoid} - fdnum::Int # this is NOT the file descriptor + const fdnum::Int # this is NOT the file descriptor refcount::Tuple{Int, Int} - notify::Base.ThreadSynchronizer + const notify::Base.ThreadSynchronizer events::Int32 active::Tuple{Bool, Bool} @@ -274,9 +443,28 @@ mutable struct _FDWatcher end end +""" + FDWatcher(fd::Union{RawFD,WindowsRawSocket}, readable::Bool, writable::Bool) + +Monitor a file descriptor `fd` for changes in the read or write availability. + +The keyword arguments determine which of read and/or write status should be monitored; at +least one of them must be set to `true`. + +The returned value is an object with boolean fields `readable`, `writable`, and `timedout`, +giving the result of the polling. 
+ +This acts like a level-set event, so calling `wait` blocks until one of those conditions is +met, but then continues to return without blocking until the condition is cleared (either +there is no more to read, or no more space in the write buffer, or both). + +!!! warning + You must call `close` manually, when finished with this object, before the fd + argument is closed. Failure to do so risks serious crashes. +""" mutable struct FDWatcher # WARNING: make sure `close` has been manually called on this watcher before closing / destroying `fd` - watcher::_FDWatcher + const watcher::_FDWatcher mask::FDEvent function FDWatcher(fd::RawFD, readable::Bool, writable::Bool) return FDWatcher(fd, FDEvent(readable, writable, false, false)) @@ -327,7 +515,7 @@ function close(t::FDWatcher) close(t.watcher, mask) end -function uvfinalize(uv::Union{FileMonitor, FolderMonitor, PollingFileWatcher}) +function uvfinalize(uv::Union{FileMonitor, FolderMonitor}) iolock_begin() if uv.handle != C_NULL disassociate_julia_struct(uv) # close (and free) without notify @@ -336,7 +524,7 @@ function uvfinalize(uv::Union{FileMonitor, FolderMonitor, PollingFileWatcher}) iolock_end() end -function close(t::Union{FileMonitor, FolderMonitor, PollingFileWatcher}) +function close(t::Union{FileMonitor, FolderMonitor}) iolock_begin() if t.handle != C_NULL ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), t.handle) @@ -344,6 +532,21 @@ function close(t::Union{FileMonitor, FolderMonitor, PollingFileWatcher}) iolock_end() end +function close(pfw::PollingFileWatcher) + timer = nothing + lock(pfw.notify) + try + pfw.closed = true + notify(pfw.notify, false) + timer = pfw.timer + pfw.timer = nothing + finally + unlock(pfw.notify) + end + timer === nothing || close(timer) + nothing +end + function _uv_hook_close(uv::_FDWatcher) # fyi: jl_atexit_hook can cause this to get called too Libc.free(@atomicswap :monotonic uv.handle = C_NULL) @@ -351,24 +554,11 @@ function _uv_hook_close(uv::_FDWatcher) nothing end -function _uv_hook_close(uv::PollingFileWatcher) - lock(uv.notify) - try - uv.active = false - Libc.free(@atomicswap :monotonic uv.handle = C_NULL) - notify(uv.notify, StatStruct()) - finally - unlock(uv.notify) - end - nothing -end - function _uv_hook_close(uv::FileMonitor) lock(uv.notify) try - uv.active = false Libc.free(@atomicswap :monotonic uv.handle = C_NULL) - notify(uv.notify, FileEvent()) + notify(uv.notify) finally unlock(uv.notify) end @@ -388,179 +578,11 @@ end isopen(fm::FileMonitor) = fm.handle != C_NULL isopen(fm::FolderMonitor) = fm.handle != C_NULL -isopen(pfw::PollingFileWatcher) = pfw.handle != C_NULL +isopen(pfw::PollingFileWatcher) = !pfw.closed isopen(pfw::_FDWatcher) = pfw.refcount != (0, 0) isopen(pfw::FDWatcher) = !pfw.mask.timedout -function uv_fseventscb_file(handle::Ptr{Cvoid}, filename::Ptr, events::Int32, status::Int32) - t = @handle_as handle FileMonitor - lock(t.notify) - try - if status != 0 - notify_error(t.notify, _UVError("FileMonitor", status)) - else - t.events |= events - notify(t.notify, FileEvent(events)) - end - finally - unlock(t.notify) - end - nothing -end - -function uv_fseventscb_folder(handle::Ptr{Cvoid}, filename::Ptr, events::Int32, status::Int32) - t = @handle_as handle FolderMonitor - lock(t.notify) - try - if status != 0 - notify_error(t.notify, _UVError("FolderMonitor", status)) - else - fname = (filename == C_NULL) ? 
"" : unsafe_string(convert(Cstring, filename)) - push!(t.channel, fname => FileEvent(events)) - notify(t.notify) - end - finally - unlock(t.notify) - end - nothing -end - -function uv_pollcb(handle::Ptr{Cvoid}, status::Int32, events::Int32) - t = @handle_as handle _FDWatcher - lock(t.notify) - try - if status != 0 - notify_error(t.notify, _UVError("FDWatcher", status)) - else - t.events |= events - if t.active[1] || t.active[2] - if isempty(t.notify) - # if we keep hearing about events when nobody appears to be listening, - # stop the poll to save cycles - t.active = (false, false) - ccall(:uv_poll_stop, Int32, (Ptr{Cvoid},), t.handle) - end - end - notify(t.notify, events) - end - finally - unlock(t.notify) - end - nothing -end - -function uv_fspollcb(handle::Ptr{Cvoid}, status::Int32, prev::Ptr, curr::Ptr) - t = @handle_as handle PollingFileWatcher - old_status = t.curr_error - t.curr_error = status - if status == 0 - t.curr_stat = StatStruct(convert(Ptr{UInt8}, curr)) - end - if status == 0 || status != old_status - prev_stat = StatStruct(convert(Ptr{UInt8}, prev)) - lock(t.notify) - try - notify(t.notify, prev_stat) - finally - unlock(t.notify) - end - end - nothing -end - -global uv_jl_pollcb::Ptr{Cvoid} -global uv_jl_fspollcb::Ptr{Cvoid} -global uv_jl_fseventscb_file::Ptr{Cvoid} -global uv_jl_fseventscb_folder::Ptr{Cvoid} - -function __init__() - global uv_jl_pollcb = @cfunction(uv_pollcb, Cvoid, (Ptr{Cvoid}, Cint, Cint)) - global uv_jl_fspollcb = @cfunction(uv_fspollcb, Cvoid, (Ptr{Cvoid}, Cint, Ptr{Cvoid}, Ptr{Cvoid})) - global uv_jl_fseventscb_file = @cfunction(uv_fseventscb_file, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32)) - global uv_jl_fseventscb_folder = @cfunction(uv_fseventscb_folder, Cvoid, (Ptr{Cvoid}, Ptr{Int8}, Int32, Int32)) - - Base.mkpidlock_hook = mkpidlock - Base.trymkpidlock_hook = trymkpidlock - Base.parse_pidfile_hook = Pidfile.parse_pidfile - - nothing -end - -function start_watching(t::_FDWatcher) - iolock_begin() - t.handle == C_NULL && throw(ArgumentError("FDWatcher is closed")) - readable = t.refcount[1] > 0 - writable = t.refcount[2] > 0 - if t.active[1] != readable || t.active[2] != writable - # make sure the READABLE / WRITEABLE state is updated - uv_error("FDWatcher (start)", - ccall(:uv_poll_start, Int32, (Ptr{Cvoid}, Int32, Ptr{Cvoid}), - t.handle, - (readable ? UV_READABLE : 0) | (writable ? 
UV_WRITABLE : 0), - uv_jl_pollcb::Ptr{Cvoid})) - t.active = (readable, writable) - end - iolock_end() - nothing -end - -function start_watching(t::PollingFileWatcher) - iolock_begin() - t.handle == C_NULL && throw(ArgumentError("PollingFileWatcher is closed")) - if !t.active - uv_error("PollingFileWatcher (start)", - ccall(:uv_fs_poll_start, Int32, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, UInt32), - t.handle, uv_jl_fspollcb::Ptr{Cvoid}, t.file, t.interval)) - t.active = true - end - iolock_end() - nothing -end - -function stop_watching(t::PollingFileWatcher) - iolock_begin() - lock(t.notify) - try - if t.active && isempty(t.notify) - t.active = false - uv_error("PollingFileWatcher (stop)", - ccall(:uv_fs_poll_stop, Int32, (Ptr{Cvoid},), t.handle)) - end - finally - unlock(t.notify) - end - iolock_end() - nothing -end - -function start_watching(t::FileMonitor) - iolock_begin() - t.handle == C_NULL && throw(ArgumentError("FileMonitor is closed")) - if !t.active - uv_error("FileMonitor (start)", - ccall(:uv_fs_event_start, Int32, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, Int32), - t.handle, uv_jl_fseventscb_file::Ptr{Cvoid}, t.file, 0)) - t.active = true - end - iolock_end() - nothing -end - -function stop_watching(t::FileMonitor) - iolock_begin() - lock(t.notify) - try - if t.active && isempty(t.notify) - t.active = false - uv_error("FileMonitor (stop)", - ccall(:uv_fs_event_stop, Int32, (Ptr{Cvoid},), t.handle)) - end - finally - unlock(t.notify) - end - iolock_end() - nothing -end +Base.stat(pfw::PollingFileWatcher) = Base.checkstat(@lock pfw.notify pfw.prev_stat) # n.b. this _wait may return spuriously early with a timedout event function _wait(fdw::_FDWatcher, mask::FDEvent) @@ -572,7 +594,20 @@ function _wait(fdw::_FDWatcher, mask::FDEvent) if !isopen(fdw) # !open throw(EOFError()) elseif events.timedout - start_watching(fdw) # make sure the poll is active + fdw.handle == C_NULL && throw(ArgumentError("FDWatcher is closed")) + # start_watching to make sure the poll is active + readable = fdw.refcount[1] > 0 + writable = fdw.refcount[2] > 0 + if fdw.active[1] != readable || fdw.active[2] != writable + # make sure the READABLE / WRITEABLE state is updated + uv_jl_pollcb = @cfunction(uv_pollcb, Cvoid, (Ptr{Cvoid}, Cint, Cint)) + uv_error("FDWatcher (start)", + ccall(:uv_poll_start, Int32, (Ptr{Cvoid}, Int32, Ptr{Cvoid}), + fdw.handle, + (readable ? UV_READABLE : 0) | (writable ? 
UV_WRITABLE : 0), + uv_jl_pollcb::Ptr{Cvoid})) + fdw.active = (readable, writable) + end iolock_end() return FDEvent(wait(fdw.notify)::Int32) else @@ -640,52 +675,88 @@ end function wait(pfw::PollingFileWatcher) iolock_begin() - preserve_handle(pfw) lock(pfw.notify) - local prevstat + prevstat = pfw.prev_stat + havechange = false + timer = nothing try - start_watching(pfw) + # we aren't too strict about the first interval after `wait`, but rather always + # check right away to see if it had immediately changed again, and then repeatedly + # after interval again until success + pfw.closed && throw(ArgumentError("PollingFileWatcher is closed")) + timer = pfw.timer + pfw.timer = nothing # disable Timer callback + # start_watching + if !pfw.active + preserve_handle(pfw) + uv_jl_fspollcb = @cfunction(uv_fspollcb, Cvoid, (Ptr{Cvoid},)) + err = ccall(:uv_fs_stat, Cint, (Ptr{Cvoid}, Ptr{Cvoid}, Cstring, Ptr{Cvoid}), + eventloop(), pfw.stat_req, pfw.file, uv_jl_fspollcb::Ptr{Cvoid}) + err == 0 || uv_error("PollingFileWatcher (start)", err) # likely just ENOMEM + pfw.active = true + end iolock_end() - prevstat = wait(pfw.notify)::StatStruct + havechange = wait(pfw.notify)::Bool unlock(pfw.notify) iolock_begin() - lock(pfw.notify) - finally - unlock(pfw.notify) - unpreserve_handle(pfw) + catch + # stop_watching: cleanup any timers from before or after starting this wait before it failed, if there are no other watchers + latetimer = nothing + try + if isempty(pfw.notify) + latetimer = pfw.timer + pfw.timer = nothing + end + finally + unlock(pfw.notify) + end + if timer !== nothing || latetimer !== nothing + iolock_end() + timer === nothing || close(timer) + latetimer === nothing || close(latetimer) + iolock_begin() + end + rethrow() end - stop_watching(pfw) iolock_end() - if pfw.handle == C_NULL + timer === nothing || close(timer) # cleanup resources so we don't hang on exit + if !havechange # user canceled by calling close return prevstat, EOFError() - elseif pfw.curr_error != 0 - return prevstat, _UVError("PollingFileWatcher", pfw.curr_error) + end + # grab the most up-to-date stat result as of this time, even if it was a bit newer than + # the notify call (unlikely, as there would need to be a concurrent call to wait) + lock(pfw.notify) + currstat = pfw.prev_stat + ioerrno = pfw.ioerrno + unlock(pfw.notify) + if ioerrno == 0 + @assert currstat.ioerrno == 0 + return prevstat, currstat + elseif ioerrno in (Base.UV_ENOENT, Base.UV_ENOTDIR, Base.UV_EINVAL) + return prevstat, StatStruct(pfw.file, Ptr{UInt8}(0), ioerrno) else - return prevstat, pfw.curr_stat + return prevstat, _UVError("PollingFileWatcher", ioerrno) end end function wait(m::FileMonitor) - iolock_begin() + m.handle == C_NULL && throw(EOFError()) preserve_handle(m) lock(m.notify) - local events try - start_watching(m) - iolock_end() - events = wait(m.notify)::FileEvent - events |= FileEvent(m.events) - m.events = 0 - unlock(m.notify) - iolock_begin() - lock(m.notify) + while true + m.handle == C_NULL && throw(EOFError()) + events = @atomicswap :not_atomic m.events = 0 + events == 0 || return FileEvent(events) + if m.ioerrno != 0 + uv_error("FileMonitor", m.ioerrno) + end + wait(m.notify) + end finally unlock(m.notify) unpreserve_handle(m) end - stop_watching(m) - iolock_end() - return events end function wait(m::FolderMonitor) @@ -704,6 +775,7 @@ function wait(m::FolderMonitor) end return evt::Pair{String, FileEvent} end +Base.take!(m::FolderMonitor) = wait(m) # Channel-like API """ @@ -717,6 +789,10 @@ least one of them must be set to 
`true`. The returned value is an object with boolean fields `readable`, `writable`, and `timedout`, giving the result of the polling. + +This is a thin wrapper over calling `wait` on a [`FDWatcher`](@ref), which implements the +functionality but requires the user to call `close` manually when finished with it, or risk +serious crashes. """ function poll_fd(s::Union{RawFD, Sys.iswindows() ? WindowsRawSocket : Union{}}, timeout_s::Real=-1; readable=false, writable=false) mask = FDEvent(readable, writable, false, false) @@ -774,6 +850,15 @@ giving the result of watching the file. This behavior of this function varies slightly across platforms. See for more detailed information. + +This is a thin wrapper over calling `wait` on a [`FileMonitor`](@ref). This function has a +small race window between consecutive calls to `watch_file` where the file might change +without being detected. To avoid this race, use + + fm = FileMonitor(path) + wait(fm) + +directly, re-using the same `fm` each time you `wait`. """ function watch_file(s::String, timeout_s::Float64=-1.0) fm = FileMonitor(s) @@ -784,7 +869,12 @@ function watch_file(s::String, timeout_s::Float64=-1.0) close(fm) end end - return wait(fm) + try + return wait(fm) + catch ex + ex isa EOFError && return FileEvent() + rethrow() + end finally close(fm) @isdefined(timer) && close(timer) @@ -795,7 +885,7 @@ watch_file(s::AbstractString, timeout_s::Real=-1) = watch_file(String(s), Float6 """ watch_folder(path::AbstractString, timeout_s::Real=-1) -Watches a file or directory `path` for changes until a change has occurred or `timeout_s` +Watch a file or directory `path` for changes until a change has occurred or `timeout_s` seconds have elapsed. This function does not poll the file system and instead uses platform-specific functionality to receive notifications from the operating system (e.g. via inotify on Linux). See the NodeJS documentation linked below for details. @@ -809,10 +899,12 @@ giving the event. This behavior of this function varies slightly across platforms. See for more detailed information. + +This function is a thin wrapper over calling `wait` on a [`FolderMonitor`](@ref), with added timeout support. """ watch_folder(s::AbstractString, timeout_s::Real=-1) = watch_folder(String(s), timeout_s) function watch_folder(s::String, timeout_s::Real=-1) - fm = get!(watched_folders, s) do + fm = @lock watched_folders get!(watched_folders[], s) do return FolderMonitor(s) end local timer @@ -859,12 +951,12 @@ It is not recommended to do this while another task is waiting for """ unwatch_folder(s::AbstractString) = unwatch_folder(String(s)) function unwatch_folder(s::String) - fm = pop!(watched_folders, s, nothing) + fm = @lock watched_folders pop!(watched_folders[], s, nothing) fm === nothing || close(fm) nothing end -const watched_folders = Dict{String, FolderMonitor}() +const watched_folders = Lockable(Dict{String, FolderMonitor}()) """ poll_file(path::AbstractString, interval_s::Real=5.007, timeout_s::Real=-1) -> (previous::StatStruct, current) @@ -878,11 +970,15 @@ The `previous` status is always a `StatStruct`, but it may have all of the field (indicating the file didn't previously exist, or wasn't previously accessible). The `current` status object may be a `StatStruct`, an `EOFError` (indicating the timeout elapsed), -or some other `Exception` subtype (if the `stat` operation failed - for example, if the path does not exist). +or some other `Exception` subtype (if the `stat` operation failed: for example, if the path does not exist). 
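The `watch_file` docstring above recommends re-using one `FileMonitor` to avoid the race window between consecutive calls; spelled out as a loop (the path is an example and must already exist):

```julia
using FileWatching
fm = FileWatching.FileMonitor("data/log.txt")
try
    for _ in 1:3
        ev = wait(fm)    # blocks until at least one change/rename has occurred
        ev.changed && @info "contents changed"
        ev.renamed && @info "file renamed or replaced"
    end
finally
    close(fm)
end
```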
+ +To determine when a file was modified, compare `!(current isa StatStruct && prev == current)` to detect +notification of changes to the mtime or inode. However, using [`watch_file`](@ref) for this operation +is preferred, since it is more reliable and efficient, although in some situations it may not be available. -To determine when a file was modified, compare `current isa StatStruct && mtime(prev) != mtime(current)` to detect -notification of changes. However, using [`watch_file`](@ref) for this operation is preferred, since -it is more reliable and efficient, although in some situations it may not be available. +This is a thin wrapper over calling `wait` on a [`PollingFileWatcher`](@ref), which implements +the functionality, but this function has a small race window between consecutive calls to +`poll_file` where the file might change without being detected. """ function poll_file(s::AbstractString, interval_seconds::Real=5.007, timeout_s::Real=-1) pfw = PollingFileWatcher(s, Float64(interval_seconds)) @@ -893,12 +989,7 @@ function poll_file(s::AbstractString, interval_seconds::Real=5.007, timeout_s::R close(pfw) end end - statdiff = wait(pfw) - if isa(statdiff[2], IOError) - # file didn't initially exist, continue watching for it to be created (or the error to change) - statdiff = wait(pfw) - end - return statdiff + return wait(pfw) finally close(pfw) @isdefined(timer) && close(timer) @@ -908,4 +999,11 @@ end include("pidfile.jl") import .Pidfile: mkpidlock, trymkpidlock +function __init__() + Base.mkpidlock_hook = mkpidlock + Base.trymkpidlock_hook = trymkpidlock + Base.parse_pidfile_hook = Pidfile.parse_pidfile + nothing +end + end diff --git a/stdlib/FileWatching/src/pidfile.jl b/stdlib/FileWatching/src/pidfile.jl index 4c821a3d897e4..95b8f20face29 100644 --- a/stdlib/FileWatching/src/pidfile.jl +++ b/stdlib/FileWatching/src/pidfile.jl @@ -4,14 +4,14 @@ module Pidfile export mkpidlock, trymkpidlock using Base: - IOError, UV_EEXIST, UV_ESRCH, + IOError, UV_EEXIST, UV_ESRCH, UV_ENOENT, Process using Base.Filesystem: File, open, JL_O_CREAT, JL_O_RDWR, JL_O_RDONLY, JL_O_EXCL, rename, samefile, path_separator -using ..FileWatching: watch_file +using ..FileWatching: FileMonitor using Base.Sys: iswindows """ @@ -256,19 +256,43 @@ function open_exclusive(path::String; end end # fall-back: wait for the lock - + watch = Lockable(Core.Box(nothing)) while true - # start the file-watcher prior to checking for the pidfile existence - t = @async try - watch_file(path, poll_interval) + # now try again to create it + # try to start the file-watcher prior to checking for the pidfile existence + watch = try + FileMonitor(path) catch ex isa(ex, IOError) || rethrow(ex) - sleep(poll_interval) # if the watch failed, convert to just doing a sleep + ex.code != UV_ENOENT # if the file was deleted in the meantime, don't sleep at all, even if the lock fails + end + timeout = nothing + if watch isa FileMonitor && stale_age > 0 + let watch = watch + timeout = Timer(stale_age) do t + close(watch) + end + end + end + try + file = tryopen_exclusive(path, mode) + file === nothing || return file + if watch isa FileMonitor + try + Base.wait(watch) # will time-out after stale_age passes + catch ex + isa(ex, EOFError) || isa(ex, IOError) || rethrow(ex) + end + end + if watch === true # if the watch failed, convert to just doing a sleep + sleep(poll_interval) + end + finally + # something changed about the path, so watch is now possibly monitoring the wrong file handle + # it will need to be recreated just before the 
next tryopen_exclusive attempt + timeout isa Timer && close(timeout) + watch isa FileMonitor && close(watch) end - # now try again to create it - file = tryopen_exclusive(path, mode) - file === nothing || return file - Base.wait(t) # sleep for a bit before trying again if stale_age > 0 && stale_pidfile(path, stale_age, refresh) # if the file seems stale, try to remove it before attempting again # set stale_age to zero so we won't attempt again, even if the attempt fails diff --git a/stdlib/FileWatching/test/runtests.jl b/stdlib/FileWatching/test/runtests.jl index 2592aea024386..def555154264d 100644 --- a/stdlib/FileWatching/test/runtests.jl +++ b/stdlib/FileWatching/test/runtests.jl @@ -2,6 +2,7 @@ using Test, FileWatching using Base: uv_error, Experimental +using Base.Filesystem: StatStruct @testset "FileWatching" begin @@ -168,12 +169,13 @@ file = joinpath(dir, "afile.txt") # initialize a watch_folder instance and create afile.txt function test_init_afile() - @test isempty(FileWatching.watched_folders) + watched_folders = FileWatching.watched_folders + @test @lock watched_folders isempty(watched_folders[]) @test(watch_folder(dir, 0) == ("" => FileWatching.FileEvent())) @test @elapsed(@test(watch_folder(dir, 0) == ("" => FileWatching.FileEvent()))) <= 0.5 - @test length(FileWatching.watched_folders) == 1 + @test @lock(watched_folders, length(FileWatching.watched_folders[])) == 1 @test unwatch_folder(dir) === nothing - @test isempty(FileWatching.watched_folders) + @test @lock watched_folders isempty(watched_folders[]) @test 0.002 <= @elapsed(@test(watch_folder(dir, 0.004) == ("" => FileWatching.FileEvent()))) @test 0.002 <= @elapsed(@test(watch_folder(dir, 0.004) == ("" => FileWatching.FileEvent()))) <= 0.5 @test unwatch_folder(dir) === nothing @@ -203,7 +205,7 @@ function test_init_afile() @test unwatch_folder(dir) === nothing @test(watch_folder(dir, 0) == ("" => FileWatching.FileEvent())) @test 0.9 <= @elapsed(@test(watch_folder(dir, 1) == ("" => FileWatching.FileEvent()))) - @test length(FileWatching.watched_folders) == 1 + @test @lock(watched_folders, length(FileWatching.watched_folders[])) == 1 nothing end @@ -218,7 +220,7 @@ function test_timeout(tval) @async test_file_poll(channel, 10, tval) tr = take!(channel) end - @test tr[1] === Base.Filesystem.StatStruct() && tr[2] === EOFError() + @test ispath(tr[1]::StatStruct) && tr[2] === EOFError() @test tval <= t_elapsed end @@ -231,7 +233,7 @@ function test_touch(slval) write(f, "Hello World\n") close(f) tr = take!(channel) - @test ispath(tr[1]) && ispath(tr[2]) + @test ispath(tr[1]::StatStruct) && ispath(tr[2]::StatStruct) fetch(t) end @@ -435,11 +437,11 @@ end @test_throws(Base._UVError("FolderMonitor (start)", Base.UV_ENOENT), watch_folder("____nonexistent_file", 10)) @test(@elapsed( - @test(poll_file("____nonexistent_file", 1, 3.1) === - (Base.Filesystem.StatStruct(), EOFError()))) > 3) + @test(poll_file("____nonexistent_file", 1, 3.1) == + (StatStruct(), EOFError()))) > 3) unwatch_folder(dir) -@test isempty(FileWatching.watched_folders) +@test @lock FileWatching.watched_folders isempty(FileWatching.watched_folders[]) rm(file) rm(dir) @@ -450,10 +452,6 @@ rm(dir) include("pidfile.jl") end -@testset "Docstrings" begin - undoc = Docs.undocumented_names(FileWatching) - @test_broken isempty(undoc) - @test undoc == [:FDWatcher, :FileMonitor, :FolderMonitor, :PollingFileWatcher] -end +@test isempty(Docs.undocumented_names(FileWatching)) end # testset diff --git a/stdlib/InteractiveUtils/docs/src/index.md 
b/stdlib/InteractiveUtils/docs/src/index.md index dbfb42b9a931d..69b68a27e4e81 100644 --- a/stdlib/InteractiveUtils/docs/src/index.md +++ b/stdlib/InteractiveUtils/docs/src/index.md @@ -33,5 +33,7 @@ InteractiveUtils.@code_llvm InteractiveUtils.code_native InteractiveUtils.@code_native InteractiveUtils.@time_imports +InteractiveUtils.@trace_compile +InteractiveUtils.@trace_dispatch InteractiveUtils.clipboard ``` diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index 835988ddf149f..f3c1ff7fba59f 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -11,7 +11,7 @@ Base.Experimental.@optlevel 1 export apropos, edit, less, code_warntype, code_llvm, code_native, methodswith, varinfo, versioninfo, subtypes, supertypes, @which, @edit, @less, @functionloc, @code_warntype, - @code_typed, @code_lowered, @code_llvm, @code_native, @time_imports, clipboard + @code_typed, @code_lowered, @code_llvm, @code_native, @time_imports, clipboard, @trace_compile, @trace_dispatch import Base.Docs.apropos diff --git a/stdlib/InteractiveUtils/src/macros.jl b/stdlib/InteractiveUtils/src/macros.jl index bb56c47b4f9ca..211687df47954 100644 --- a/stdlib/InteractiveUtils/src/macros.jl +++ b/stdlib/InteractiveUtils/src/macros.jl @@ -256,6 +256,28 @@ macro time_imports(ex) end end +macro trace_compile(ex) + quote + try + ccall(:jl_force_trace_compile_timing_enable, Cvoid, ()) + $(esc(ex)) + finally + ccall(:jl_force_trace_compile_timing_disable, Cvoid, ()) + end + end +end + +macro trace_dispatch(ex) + quote + try + ccall(:jl_force_trace_dispatch_enable, Cvoid, ()) + $(esc(ex)) + finally + ccall(:jl_force_trace_dispatch_disable, Cvoid, ()) + end + end +end + """ @functionloc @@ -409,3 +431,36 @@ julia> @time_imports using CSV """ :@time_imports + +""" + @trace_compile + +A macro to execute an expression and show any methods that were compiled (or recompiled in yellow), +like the julia args `--trace-compile=stderr --trace-compile-timing` but specifically for a call. + +```julia-repl +julia> @trace_compile rand(2,2) * rand(2,2) +#= 39.1 ms =# precompile(Tuple{typeof(Base.rand), Int64, Int64}) +#= 102.0 ms =# precompile(Tuple{typeof(Base.:(*)), Array{Float64, 2}, Array{Float64, 2}}) +2×2 Matrix{Float64}: + 0.421704 0.864841 + 0.211262 0.444366 +``` + +!!! compat "Julia 1.12" + This macro requires at least Julia 1.12 + +""" +:@trace_compile + +""" + @trace_dispatch + +A macro to execute an expression and report methods that were compiled via dynamic dispatch, +like the julia arg `--trace-dispatch=stderr` but specifically for a call. + +!!! 
compat "Julia 1.12" + This macro requires at least Julia 1.12 + +""" +:@trace_dispatch diff --git a/stdlib/InteractiveUtils/test/runtests.jl b/stdlib/InteractiveUtils/test/runtests.jl index 424564b70384c..8e7090cb53020 100644 --- a/stdlib/InteractiveUtils/test/runtests.jl +++ b/stdlib/InteractiveUtils/test/runtests.jl @@ -708,7 +708,7 @@ let length((@code_lowered sum(1:10)).code) end -@testset "@time_imports" begin +@testset "@time_imports, @trace_compile, @trace_dispatch" begin mktempdir() do dir cd(dir) do try @@ -717,7 +717,16 @@ end write(foo_file, """ module Foo3242 - foo() = 1 + function foo() + Base.Experimental.@force_compile + foo(1) + end + foo(x) = x + function bar() + Base.Experimental.@force_compile + bar(1) + end + bar(x) = x end """) @@ -734,6 +743,27 @@ end @test occursin("ms Foo3242", String(buf)) + fname = tempname() + f = open(fname, "w") + redirect_stderr(f) do + @trace_compile @eval Foo3242.foo() + end + close(f) + buf = read(fname) + rm(fname) + + @test occursin("ms =# precompile(", String(buf)) + + fname = tempname() + f = open(fname, "w") + redirect_stderr(f) do + @trace_dispatch @eval Foo3242.bar() + end + close(f) + buf = read(fname) + rm(fname) + + @test occursin("precompile(", String(buf)) finally filter!((≠)(dir), LOAD_PATH) end diff --git a/stdlib/LLVMLibUnwind_jll/Project.toml b/stdlib/LLVMLibUnwind_jll/Project.toml index 36c24111d4d31..0cb0fe5440066 100644 --- a/stdlib/LLVMLibUnwind_jll/Project.toml +++ b/stdlib/LLVMLibUnwind_jll/Project.toml @@ -1,6 +1,6 @@ name = "LLVMLibUnwind_jll" uuid = "47c5dbc3-30ba-59ef-96a6-123e260183d9" -version = "12.0.1+0" +version = "14.0.6+0" [deps] Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/stdlib/LibUV_jll/Project.toml b/stdlib/LibUV_jll/Project.toml index 7c61fdf89df70..fb03c6b996048 100644 --- a/stdlib/LibUV_jll/Project.toml +++ b/stdlib/LibUV_jll/Project.toml @@ -1,6 +1,6 @@ name = "LibUV_jll" uuid = "183b4373-6708-53ba-ad28-60e28bb38547" -version = "2.0.1+16" +version = "2.0.1+18" [deps] Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index be59516f086ab..17216845b350c 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -673,7 +673,9 @@ matprod_dest(A::Diagonal, B::Diagonal, TS) = _matprod_dest_diag(B, TS) _matprod_dest_diag(A, TS) = similar(A, TS) function _matprod_dest_diag(A::SymTridiagonal, TS) n = size(A, 1) - Tridiagonal(similar(A, TS, n-1), similar(A, TS, n), similar(A, TS, n-1)) + ev = similar(A, TS, max(0, n-1)) + dv = similar(A, TS, n) + Tridiagonal(ev, dv, similar(ev)) end # Special handling for adj/trans vec @@ -843,9 +845,9 @@ function __init__() # https://github.com/xianyi/OpenBLAS/blob/c43ec53bdd00d9423fc609d7b7ecb35e7bf41b85/README.md#setting-the-number-of-threads-using-environment-variables if !haskey(ENV, "OPENBLAS_NUM_THREADS") && !haskey(ENV, "GOTO_NUM_THREADS") && !haskey(ENV, "OMP_NUM_THREADS") @static if Sys.isapple() && Base.BinaryPlatforms.arch(Base.BinaryPlatforms.HostPlatform()) == "aarch64" - BLAS.set_num_threads(max(1, Sys.CPU_THREADS)) + BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint))) else - BLAS.set_num_threads(max(1, Sys.CPU_THREADS ÷ 2)) + BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint) ÷ 2)) end end end diff --git a/stdlib/LinearAlgebra/src/bidiag.jl b/stdlib/LinearAlgebra/src/bidiag.jl index d86bad7e41435..e5482cbba5595 100644 --- a/stdlib/LinearAlgebra/src/bidiag.jl +++ 
b/stdlib/LinearAlgebra/src/bidiag.jl @@ -166,10 +166,11 @@ end end @inline function getindex(A::Bidiagonal{T}, b::BandIndex) where T - @boundscheck checkbounds(A, _cartinds(b)) + @boundscheck checkbounds(A, b) if b.band == 0 return @inbounds A.dv[b.index] - elseif b.band == _offdiagind(A.uplo) + elseif b.band ∈ (-1,1) && b.band == _offdiagind(A.uplo) + # we explicitly compare the possible bands as b.band may be constant-propagated return @inbounds A.ev[b.index] else return bidiagzero(A, Tuple(_cartinds(b))...) @@ -441,6 +442,32 @@ end -(A::Bidiagonal)=Bidiagonal(-A.dv,-A.ev,A.uplo) *(A::Bidiagonal, B::Number) = Bidiagonal(A.dv*B, A.ev*B, A.uplo) *(B::Number, A::Bidiagonal) = Bidiagonal(B*A.dv, B*A.ev, A.uplo) +function rmul!(B::Bidiagonal, x::Number) + if size(B,1) > 1 + isupper = B.uplo == 'U' + row, col = 1 + isupper, 1 + !isupper + # ensure that zeros are preserved on scaling + y = B[row,col] * x + iszero(y) || throw(ArgumentError(LazyString(lazy"cannot set index ($row, $col) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + @. B.dv *= x + @. B.ev *= x + return B +end +function lmul!(x::Number, B::Bidiagonal) + if size(B,1) > 1 + isupper = B.uplo == 'U' + row, col = 1 + isupper, 1 + !isupper + # ensure that zeros are preserved on scaling + y = x * B[row,col] + iszero(y) || throw(ArgumentError(LazyString(lazy"cannot set index ($row, $col) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + @. B.dv = x * B.dv + @. B.ev = x * B.ev + return B +end /(A::Bidiagonal, B::Number) = Bidiagonal(A.dv/B, A.ev/B, A.uplo) \(B::Number, A::Bidiagonal) = Bidiagonal(B\A.dv, B\A.ev, A.uplo) @@ -557,7 +584,8 @@ end # function to get the internally stored vectors for Bidiagonal and [Sym]Tridiagonal # to avoid allocations in _mul! below (#24324, #24578) _diag(A::Tridiagonal, k) = k == -1 ? A.dl : k == 0 ? A.d : A.du -_diag(A::SymTridiagonal, k) = k == 0 ? A.dv : A.ev +_diag(A::SymTridiagonal{<:Number}, k) = k == 0 ? A.dv : A.ev +_diag(A::SymTridiagonal, k) = k == 0 ? view(A, diagind(A, IndexStyle(A))) : view(A, diagind(A, 1, IndexStyle(A))) function _diag(A::Bidiagonal, k) if k == 0 return A.dv @@ -577,12 +605,45 @@ function _bibimul!(C, A, B, _add) check_A_mul_B!_sizes(size(C), size(A), size(B)) n = size(A,1) iszero(n) && return C - n <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta) # We use `_rmul_or_fill!` instead of `_modify!` here since using # `_modify!` in the following loop will not update the # off-diagonal elements for non-zero beta. 
_rmul_or_fill!(C, _add.beta) iszero(_add.alpha) && return C + if n <= 3 + # naive multiplication + for I in CartesianIndices(C) + C[I] += _add(sum(A[I[1], k] * B[k, I[2]] for k in axes(A,2))) + end + return C + end + @inbounds begin + # first column of C + C[1,1] += _add(A[1,1]*B[1,1] + A[1, 2]*B[2,1]) + C[2,1] += _add(A[2,1]*B[1,1] + A[2,2]*B[2,1]) + C[3,1] += _add(A[3,2]*B[2,1]) + # second column of C + C[1,2] += _add(A[1,1]*B[1,2] + A[1,2]*B[2,2]) + C[2,2] += _add(A[2,1]*B[1,2] + A[2,2]*B[2,2] + A[2,3]*B[3,2]) + C[3,2] += _add(A[3,2]*B[2,2] + A[3,3]*B[3,2]) + C[4,2] += _add(A[4,3]*B[3,2]) + end # inbounds + # middle columns + __bibimul!(C, A, B, _add) + @inbounds begin + C[n-3,n-1] += _add(A[n-3,n-2]*B[n-2,n-1]) + C[n-2,n-1] += _add(A[n-2,n-2]*B[n-2,n-1] + A[n-2,n-1]*B[n-1,n-1]) + C[n-1,n-1] += _add(A[n-1,n-2]*B[n-2,n-1] + A[n-1,n-1]*B[n-1,n-1] + A[n-1,n]*B[n,n-1]) + C[n, n-1] += _add(A[n,n-1]*B[n-1,n-1] + A[n,n]*B[n,n-1]) + # last column of C + C[n-2, n] += _add(A[n-2,n-1]*B[n-1,n]) + C[n-1, n] += _add(A[n-1,n-1]*B[n-1,n ] + A[n-1,n]*B[n,n ]) + C[n, n] += _add(A[n,n-1]*B[n-1,n ] + A[n,n]*B[n,n ]) + end # inbounds + C +end +function __bibimul!(C, A, B, _add) + n = size(A,1) Al = _diag(A, -1) Ad = _diag(A, 0) Au = _diag(A, 1) @@ -590,44 +651,198 @@ function _bibimul!(C, A, B, _add) Bd = _diag(B, 0) Bu = _diag(B, 1) @inbounds begin - # first row of C - C[1,1] += _add(A[1,1]*B[1,1] + A[1, 2]*B[2, 1]) - C[1,2] += _add(A[1,1]*B[1,2] + A[1,2]*B[2,2]) - C[1,3] += _add(A[1,2]*B[2,3]) - # second row of C - C[2,1] += _add(A[2,1]*B[1,1] + A[2,2]*B[2,1]) - C[2,2] += _add(A[2,1]*B[1,2] + A[2,2]*B[2,2] + A[2,3]*B[3,2]) - C[2,3] += _add(A[2,2]*B[2,3] + A[2,3]*B[3,3]) - C[2,4] += _add(A[2,3]*B[3,4]) for j in 3:n-2 - Ajj₋1 = Al[j-1] - Ajj = Ad[j] + Aj₋2j₋1 = Au[j-2] + Aj₋1j = Au[j-1] Ajj₊1 = Au[j] - Bj₋1j₋2 = Bl[j-2] - Bj₋1j₋1 = Bd[j-1] + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Ajj₋1 = Al[j-1] + Aj₊1j = Al[j] + Aj₊2j₊1 = Al[j+1] Bj₋1j = Bu[j-1] - Bjj₋1 = Bl[j-1] Bjj = Bd[j] - Bjj₊1 = Bu[j] Bj₊1j = Bl[j] - Bj₊1j₊1 = Bd[j+1] - Bj₊1j₊2 = Bu[j+1] - C[j,j-2] += _add( Ajj₋1*Bj₋1j₋2) - C[j, j-1] += _add(Ajj₋1*Bj₋1j₋1 + Ajj*Bjj₋1) - C[j, j ] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj + Ajj₊1*Bj₊1j) - C[j, j+1] += _add(Ajj *Bjj₊1 + Ajj₊1*Bj₊1j₊1) - C[j, j+2] += _add(Ajj₊1*Bj₊1j₊2) + + C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j) + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj) + C[j, j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj + Ajj₊1*Bj₊1j) + C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j) + C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j) end - # row before last of C - C[n-1,n-3] += _add(A[n-1,n-2]*B[n-2,n-3]) - C[n-1,n-2] += _add(A[n-1,n-1]*B[n-1,n-2] + A[n-1,n-2]*B[n-2,n-2]) - C[n-1,n-1] += _add(A[n-1,n-2]*B[n-2,n-1] + A[n-1,n-1]*B[n-1,n-1] + A[n-1,n]*B[n,n-1]) - C[n-1,n ] += _add(A[n-1,n-1]*B[n-1,n ] + A[n-1, n]*B[n ,n ]) - # last row of C - C[n,n-2] += _add(A[n,n-1]*B[n-1,n-2]) - C[n,n-1] += _add(A[n,n-1]*B[n-1,n-1] + A[n,n]*B[n,n-1]) - C[n,n ] += _add(A[n,n-1]*B[n-1,n ] + A[n,n]*B[n,n ]) - end # inbounds + end + C +end +function __bibimul!(C, A, B::Bidiagonal, _add) + n = size(A,1) + Al = _diag(A, -1) + Ad = _diag(A, 0) + Au = _diag(A, 1) + Bd = _diag(B, 0) + if B.uplo == 'U' + Bu = _diag(B, 1) + @inbounds begin + for j in 3:n-2 + Aj₋2j₋1 = Au[j-2] + Aj₋1j = Au[j-1] + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Ajj₋1 = Al[j-1] + Aj₊1j = Al[j] + Bj₋1j = Bu[j-1] + Bjj = Bd[j] + + C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j) + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj) + C[j, j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj) + C[j+1, j] += _add(Aj₊1j*Bjj) + end + end + else # B.uplo == 
'L' + Bl = _diag(B, -1) + @inbounds begin + for j in 3:n-2 + Aj₋1j = Au[j-1] + Ajj₊1 = Au[j] + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Aj₊1j = Al[j] + Aj₊2j₊1 = Al[j+1] + Bjj = Bd[j] + Bj₊1j = Bl[j] + + C[j-1, j] += _add(Aj₋1j*Bjj) + C[j, j] += _add(Ajj*Bjj + Ajj₊1*Bj₊1j) + C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j) + C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j) + end + end + end + C +end +function __bibimul!(C, A::Bidiagonal, B, _add) + n = size(A,1) + Bl = _diag(B, -1) + Bd = _diag(B, 0) + Bu = _diag(B, 1) + Ad = _diag(A, 0) + if A.uplo == 'U' + Au = _diag(A, 1) + @inbounds begin + for j in 3:n-2 + Aj₋2j₋1 = Au[j-2] + Aj₋1j = Au[j-1] + Ajj₊1 = Au[j] + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Bj₋1j = Bu[j-1] + Bjj = Bd[j] + Bj₊1j = Bl[j] + + C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j) + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj) + C[j, j] += _add(Ajj*Bjj + Ajj₊1*Bj₊1j) + C[j+1, j] += _add(Aj₊1j₊1*Bj₊1j) + end + end + else # A.uplo == 'L' + Al = _diag(A, -1) + @inbounds begin + for j in 3:n-2 + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Ajj₋1 = Al[j-1] + Aj₊1j = Al[j] + Aj₊2j₊1 = Al[j+1] + Bj₋1j = Bu[j-1] + Bjj = Bd[j] + Bj₊1j = Bl[j] + + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j) + C[j, j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj) + C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j) + C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j) + end + end + end + C +end +function __bibimul!(C, A::Bidiagonal, B::Bidiagonal, _add) + n = size(A,1) + Ad = _diag(A, 0) + Bd = _diag(B, 0) + if A.uplo == 'U' && B.uplo == 'U' + Au = _diag(A, 1) + Bu = _diag(B, 1) + @inbounds begin + for j in 3:n-2 + Aj₋2j₋1 = Au[j-2] + Aj₋1j = Au[j-1] + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Bj₋1j = Bu[j-1] + Bjj = Bd[j] + + C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j) + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj) + C[j, j] += _add(Ajj*Bjj) + end + end + elseif A.uplo == 'U' && B.uplo == 'L' + Au = _diag(A, 1) + Bl = _diag(B, -1) + @inbounds begin + for j in 3:n-2 + Aj₋1j = Au[j-1] + Ajj₊1 = Au[j] + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Bjj = Bd[j] + Bj₊1j = Bl[j] + + C[j-1, j] += _add(Aj₋1j*Bjj) + C[j, j] += _add(Ajj*Bjj + Ajj₊1*Bj₊1j) + C[j+1, j] += _add(Aj₊1j₊1*Bj₊1j) + end + end + elseif A.uplo == 'L' && B.uplo == 'U' + Al = _diag(A, -1) + Bu = _diag(B, 1) + @inbounds begin + for j in 3:n-2 + Aj₋1j₋1 = Ad[j-1] + Ajj = Ad[j] + Ajj₋1 = Al[j-1] + Aj₊1j = Al[j] + Bj₋1j = Bu[j-1] + Bjj = Bd[j] + + C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j) + C[j, j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj) + C[j+1, j] += _add(Aj₊1j*Bjj) + end + end + else # A.uplo == 'L' && B.uplo == 'L' + Al = _diag(A, -1) + Bl = _diag(B, -1) + @inbounds begin + for j in 3:n-2 + Ajj = Ad[j] + Aj₊1j₊1 = Ad[j+1] + Aj₊1j = Al[j] + Aj₊2j₊1 = Al[j+1] + Bjj = Bd[j] + Bj₊1j = Bl[j] + + C[j, j] += _add(Ajj*Bjj) + C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j) + C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j) + end + end + end C end @@ -744,7 +959,52 @@ function _mul!(C::AbstractVecOrMat, A::BiTriSym, B::AbstractVecOrMat, _add::MulA nB = size(B,2) (iszero(nA) || iszero(nB)) && return C iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta) - nA <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta) + if nA <= 3 + # naive multiplication + for I in CartesianIndices(C) + col = Base.tail(Tuple(I)) + _modify!(_add, sum(A[I[1], k] * B[k, col...] 
for k in axes(A,2)), C, I) + end + return C + end + _mul_bitrisym!(C, A, B, _add) +end +function _mul_bitrisym!(C::AbstractVecOrMat, A::Bidiagonal, B::AbstractVecOrMat, _add::MulAddMul) + nA = size(A,1) + nB = size(B,2) + d = A.dv + if A.uplo == 'U' + u = A.ev + @inbounds begin + for j = 1:nB + b₀, b₊ = B[1, j], B[2, j] + _modify!(_add, d[1]*b₀ + u[1]*b₊, C, (1, j)) + for i = 2:nA - 1 + b₀, b₊ = b₊, B[i + 1, j] + _modify!(_add, d[i]*b₀ + u[i]*b₊, C, (i, j)) + end + _modify!(_add, d[nA]*b₊, C, (nA, j)) + end + end + else + l = A.ev + @inbounds begin + for j = 1:nB + b₀, b₊ = B[1, j], B[2, j] + _modify!(_add, d[1]*b₀, C, (1, j)) + for i = 2:nA - 1 + b₋, b₀, b₊ = b₀, b₊, B[i + 1, j] + _modify!(_add, l[i - 1]*b₋ + d[i]*b₀, C, (i, j)) + end + _modify!(_add, l[nA - 1]*b₀ + d[nA]*b₊, C, (nA, j)) + end + end + end + C +end +function _mul_bitrisym!(C::AbstractVecOrMat, A::TriSym, B::AbstractVecOrMat, _add::MulAddMul) + nA = size(A,1) + nB = size(B,2) l = _diag(A, -1) d = _diag(A, 0) u = _diag(A, 1) @@ -767,10 +1027,10 @@ function _mul!(C::AbstractMatrix, A::AbstractMatrix, B::TriSym, _add::MulAddMul) check_A_mul_B!_sizes(size(C), size(A), size(B)) n = size(A,1) m = size(B,2) - (iszero(m) || iszero(n)) && return C - iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta) - if n <= 3 || m <= 1 - return mul!(C, Array(A), Array(B), _add.alpha, _add.beta) + (iszero(_add.alpha) || iszero(m)) && return _rmul_or_fill!(C, _add.beta) + if m == 1 + B11 = B[1,1] + return mul!(C, A, B11, _add.alpha, _add.beta) end Bl = _diag(B, -1) Bd = _diag(B, 0) @@ -804,21 +1064,18 @@ function _mul!(C::AbstractMatrix, A::AbstractMatrix, B::Bidiagonal, _add::MulAdd m, n = size(A) (iszero(m) || iszero(n)) && return C iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta) - if size(A, 1) <= 3 || size(B, 2) <= 1 - return mul!(C, Array(A), Array(B), _add.alpha, _add.beta) - end @inbounds if B.uplo == 'U' + for j in n:-1:2, i in 1:m + _modify!(_add, A[i,j] * B.dv[j] + A[i,j-1] * B.ev[j-1], C, (i, j)) + end for i in 1:m - for j in n:-1:2 - _modify!(_add, A[i,j] * B.dv[j] + A[i,j-1] * B.ev[j-1], C, (i, j)) - end _modify!(_add, A[i,1] * B.dv[1], C, (i, 1)) end else # uplo == 'L' + for j in 1:n-1, i in 1:m + _modify!(_add, A[i,j] * B.dv[j] + A[i,j+1] * B.ev[j], C, (i, j)) + end for i in 1:m - for j in 1:n-1 - _modify!(_add, A[i,j] * B.dv[j] + A[i,j+1] * B.ev[j], C, (i, j)) - end _modify!(_add, A[i,n] * B.dv[n], C, (i, n)) end end @@ -834,9 +1091,17 @@ function _dibimul!(C, A, B, _add) check_A_mul_B!_sizes(size(C), size(A), size(B)) n = size(A,1) iszero(n) && return C - n <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta) - _rmul_or_fill!(C, _add.beta) # see the same use above + # ensure that we fill off-band elements in the destination + _rmul_or_fill!(C, _add.beta) iszero(_add.alpha) && return C + if n <= 3 + # For simplicity, use a naive multiplication for small matrices + # that loops over all elements. 
+ for I in CartesianIndices(C) + C[I] += _add(A.diag[I[1]] * B[I[1], I[2]]) + end + return C + end Ad = A.diag Bl = _diag(B, -1) Bd = _diag(B, 0) @@ -870,7 +1135,8 @@ function _dibimul!(C::AbstractMatrix, A::Diagonal, B::Bidiagonal, _add) check_A_mul_B!_sizes(size(C), size(A), size(B)) n = size(A,1) iszero(n) && return C - _rmul_or_fill!(C, _add.beta) # see the same use above + # ensure that we fill off-band elements in the destination + _rmul_or_fill!(C, _add.beta) iszero(_add.alpha) && return C Ad = A.diag Bdv, Bev = B.dv, B.ev diff --git a/stdlib/LinearAlgebra/src/blas.jl b/stdlib/LinearAlgebra/src/blas.jl index 413b7866c5444..3c15630091162 100644 --- a/stdlib/LinearAlgebra/src/blas.jl +++ b/stdlib/LinearAlgebra/src/blas.jl @@ -159,7 +159,7 @@ function check() interface = USE_BLAS64 ? :ilp64 : :lp64 if !any(lib.interface == interface for lib in config.loaded_libs) interfacestr = uppercase(string(interface)) - @error("No loaded BLAS libraries were built with $interfacestr support.") + println(Core.stderr, "No loaded BLAS libraries were built with $interfacestr support.") exit(1) end end diff --git a/stdlib/LinearAlgebra/src/dense.jl b/stdlib/LinearAlgebra/src/dense.jl index 62096cbb172f2..aacc5479bfa9d 100644 --- a/stdlib/LinearAlgebra/src/dense.jl +++ b/stdlib/LinearAlgebra/src/dense.jl @@ -110,7 +110,7 @@ norm2(x::Union{Array{T},StridedVector{T}}) where {T<:BlasFloat} = # Conservative assessment of types that have zero(T) defined for themselves haszero(::Type) = false haszero(::Type{T}) where {T<:Number} = isconcretetype(T) -@propagate_inbounds _zero(M::AbstractArray{T}, i, j) where {T} = haszero(T) ? zero(T) : zero(M[i,j]) +@propagate_inbounds _zero(M::AbstractArray{T}, inds...) where {T} = haszero(T) ? zero(T) : zero(M[inds...]) """ triu!(M, k::Integer) diff --git a/stdlib/LinearAlgebra/src/diagonal.jl b/stdlib/LinearAlgebra/src/diagonal.jl index 526ec20ddafa1..0a95bac5ffb93 100644 --- a/stdlib/LinearAlgebra/src/diagonal.jl +++ b/stdlib/LinearAlgebra/src/diagonal.jl @@ -227,7 +227,6 @@ Base._reverse(A::Diagonal, dims) = reverse!(Matrix(A); dims) Base._reverse(A::Diagonal, ::Colon) = Diagonal(reverse(A.diag)) Base._reverse!(A::Diagonal, ::Colon) = (reverse!(A.diag); A) -ishermitian(D::Diagonal{<:Real}) = true ishermitian(D::Diagonal{<:Number}) = isreal(D.diag) ishermitian(D::Diagonal) = all(ishermitian, D.diag) issymmetric(D::Diagonal{<:Number}) = true @@ -272,23 +271,28 @@ end (+)(Da::Diagonal, Db::Diagonal) = Diagonal(Da.diag + Db.diag) (-)(Da::Diagonal, Db::Diagonal) = Diagonal(Da.diag - Db.diag) -for f in (:+, :-) - @eval function $f(D::Diagonal{<:Number}, S::Symmetric) - return Symmetric($f(D, S.data), sym_uplo(S.uplo)) - end - @eval function $f(S::Symmetric, D::Diagonal{<:Number}) - return Symmetric($f(S.data, D), sym_uplo(S.uplo)) - end - @eval function $f(D::Diagonal{<:Real}, H::Hermitian) - return Hermitian($f(D, H.data), sym_uplo(H.uplo)) +(*)(x::Number, D::Diagonal) = Diagonal(x * D.diag) +(*)(D::Diagonal, x::Number) = Diagonal(D.diag * x) +function lmul!(x::Number, D::Diagonal) + if size(D,1) > 1 + # ensure that zeros are preserved on scaling + y = D[2,1] * x + iszero(y) || throw(ArgumentError(LazyString("cannot set index (2, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) end - @eval function $f(H::Hermitian, D::Diagonal{<:Real}) - return Hermitian($f(H.data, D), sym_uplo(H.uplo)) + @. 
D.diag = x * D.diag + return D +end +function rmul!(D::Diagonal, x::Number) + if size(D,1) > 1 + # ensure that zeros are preserved on scaling + y = x * D[2,1] + iszero(y) || throw(ArgumentError(LazyString("cannot set index (2, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) end + @. D.diag *= x + return D end - -(*)(x::Number, D::Diagonal) = Diagonal(x * D.diag) -(*)(D::Diagonal, x::Number) = Diagonal(D.diag * x) (/)(D::Diagonal, x::Number) = Diagonal(D.diag / x) (\)(x::Number, D::Diagonal) = Diagonal(x \ D.diag) (^)(D::Diagonal, a::Number) = Diagonal(D.diag .^ a) diff --git a/stdlib/LinearAlgebra/src/generic.jl b/stdlib/LinearAlgebra/src/generic.jl index f60016125d2e9..e5f23b4981616 100644 --- a/stdlib/LinearAlgebra/src/generic.jl +++ b/stdlib/LinearAlgebra/src/generic.jl @@ -2014,20 +2014,12 @@ function copytrito!(B::AbstractMatrix, A::AbstractMatrix, uplo::AbstractChar) m1,n1 = size(B) A = Base.unalias(B, A) if uplo == 'U' - if n < m - (m1 < n || n1 < n) && throw(DimensionMismatch(lazy"B of size ($m1,$n1) should have at least size ($n,$n)")) - else - (m1 < m || n1 < n) && throw(DimensionMismatch(lazy"B of size ($m1,$n1) should have at least size ($m,$n)")) - end + LAPACK.lacpy_size_check((m1, n1), (n < m ? n : m, n)) for j in 1:n, i in 1:min(j,m) @inbounds B[i,j] = A[i,j] end else # uplo == 'L' - if m < n - (m1 < m || n1 < m) && throw(DimensionMismatch(lazy"B of size ($m1,$n1) should have at least size ($m,$m)")) - else - (m1 < m || n1 < n) && throw(DimensionMismatch(lazy"B of size ($m1,$n1) should have at least size ($m,$n)")) - end + LAPACK.lacpy_size_check((m1, n1), (m, m < n ? m : n)) for j in 1:n, i in j:m @inbounds B[i,j] = A[i,j] end diff --git a/stdlib/LinearAlgebra/src/lbt.jl b/stdlib/LinearAlgebra/src/lbt.jl index 606ddedbe1343..81d10f930c8c5 100644 --- a/stdlib/LinearAlgebra/src/lbt.jl +++ b/stdlib/LinearAlgebra/src/lbt.jl @@ -17,7 +17,7 @@ end macro get_warn(map, key) return quote if !haskey($(esc(map)), $(esc(key))) - @warn(string("[LBT] Unknown key into ", $(string(map)), ": ", $(esc(key)), ", defaulting to :unknown")) + println(Core.stderr, string("Warning: [LBT] Unknown key into ", $(string(map)), ": ", $(esc(key)), ", defaulting to :unknown")) # All the unknown values share a common value: `-1` $(esc(map))[$(esc(LBT_INTERFACE_UNKNOWN))] else @@ -132,7 +132,7 @@ struct LBTConfig if str_ptr != C_NULL push!(exported_symbols, unsafe_string(str_ptr)) else - @error("NULL string in lbt_config.exported_symbols[$(sym_idx)]") + println(Core.stderr, "Error: NULL string in lbt_config.exported_symbols[$(sym_idx)]") end end diff --git a/stdlib/LinearAlgebra/src/special.jl b/stdlib/LinearAlgebra/src/special.jl index 5fea1e32460ff..32a5476842933 100644 --- a/stdlib/LinearAlgebra/src/special.jl +++ b/stdlib/LinearAlgebra/src/special.jl @@ -112,6 +112,8 @@ for op in (:+, :-) end end +(*)(Da::Diagonal, A::BandedMatrix, Db::Diagonal) = _tri_matmul(Da, A, Db) + # disambiguation between triangular and banded matrices, banded ones "dominate" _mul!(C::AbstractMatrix, A::AbstractTriangular, B::BandedMatrix, alpha::Number, beta::Number) = @stable_muladdmul _mul!(C, A, B, MulAddMul(alpha, beta)) @@ -288,6 +290,25 @@ function (-)(A::UniformScaling, B::Diagonal) Diagonal(Ref(A) .- B.diag) end +for f in (:+, :-) + @eval function $f(D::Diagonal{<:Number}, S::Symmetric) + uplo = sym_uplo(S.uplo) + return Symmetric(parentof_applytri($f, Symmetric(D, uplo), S), uplo) + end + @eval function $f(S::Symmetric, D::Diagonal{<:Number}) + uplo = sym_uplo(S.uplo) + return 
Symmetric(parentof_applytri($f, S, Symmetric(D, uplo)), uplo) + end + @eval function $f(D::Diagonal{<:Real}, H::Hermitian) + uplo = sym_uplo(H.uplo) + return Hermitian(parentof_applytri($f, Hermitian(D, uplo), H), uplo) + end + @eval function $f(H::Hermitian, D::Diagonal{<:Real}) + uplo = sym_uplo(H.uplo) + return Hermitian(parentof_applytri($f, H, Hermitian(D, uplo)), uplo) + end +end + ## Diagonal construction from UniformScaling Diagonal{T}(s::UniformScaling, m::Integer) where {T} = Diagonal{T}(fill(T(s.λ), m)) Diagonal(s::UniformScaling, m::Integer) = Diagonal{eltype(s)}(s, m) @@ -567,3 +588,7 @@ function cholesky(S::RealHermSymComplexHerm{<:Real,<:SymTridiagonal}, ::NoPivot B = Bidiagonal{T}(diag(S, 0), diag(S, S.uplo == 'U' ? 1 : -1), sym_uplo(S.uplo)) cholesky!(Hermitian(B, sym_uplo(S.uplo)), NoPivot(); check = check) end + +# istriu/istril for triangular wrappers of structured matrices +_istril(A::LowerTriangular{<:Any, <:BandedMatrix}, k) = istril(parent(A), k) +_istriu(A::UpperTriangular{<:Any, <:BandedMatrix}, k) = istriu(parent(A), k) diff --git a/stdlib/LinearAlgebra/src/structuredbroadcast.jl b/stdlib/LinearAlgebra/src/structuredbroadcast.jl index 21f6a7414d872..0c06f84116fc7 100644 --- a/stdlib/LinearAlgebra/src/structuredbroadcast.jl +++ b/stdlib/LinearAlgebra/src/structuredbroadcast.jl @@ -96,7 +96,7 @@ structured_broadcast_alloc(bc, ::Type{UnitLowerTriangular}, ::Type{ElType}, n) w structured_broadcast_alloc(bc, ::Type{UnitUpperTriangular}, ::Type{ElType}, n) where {ElType} = UnitUpperTriangular(Array{ElType}(undef, n, n)) structured_broadcast_alloc(bc, ::Type{Matrix}, ::Type{ElType}, n) where {ElType} = - Matrix(Array{ElType}(undef, n, n)) + Array{ElType}(undef, n, n) # A _very_ limited list of structure-preserving functions known at compile-time. This list is # derived from the formerly-implemented `broadcast` methods in 0.6. Note that this must @@ -199,6 +199,8 @@ function Broadcast.newindex(A::StructuredMatrix, b::BandIndex) # and we apply newindex to both the axes at once to obtain the result size(A,1) > 1 ? b : BandIndex(0, 1) end +# All structured matrices are square, and therefore they only broadcast out if they are size (1, 1) +Broadcast.newindex(D::StructuredMatrix, I::CartesianIndex{2}) = size(D) == (1,1) ? 
CartesianIndex(1,1) : I function copyto!(dest::Diagonal, bc::Broadcasted{<:StructuredMatrixStyle}) isvalidstructbc(dest, bc) || return copyto!(dest, convert(Broadcasted{Nothing}, bc)) diff --git a/stdlib/LinearAlgebra/src/symmetric.jl b/stdlib/LinearAlgebra/src/symmetric.jl index ab7b5ee031260..a7739596a73bb 100644 --- a/stdlib/LinearAlgebra/src/symmetric.jl +++ b/stdlib/LinearAlgebra/src/symmetric.jl @@ -687,10 +687,10 @@ for f in (:+, :-) @eval begin $f(A::Hermitian, B::Symmetric{<:Real}) = $f(A, Hermitian(parent(B), sym_uplo(B.uplo))) $f(A::Symmetric{<:Real}, B::Hermitian) = $f(Hermitian(parent(A), sym_uplo(A.uplo)), B) - $f(A::SymTridiagonal, B::Symmetric) = Symmetric($f(A, B.data), sym_uplo(B.uplo)) - $f(A::Symmetric, B::SymTridiagonal) = Symmetric($f(A.data, B), sym_uplo(A.uplo)) - $f(A::SymTridiagonal{<:Real}, B::Hermitian) = Hermitian($f(A, B.data), sym_uplo(B.uplo)) - $f(A::Hermitian, B::SymTridiagonal{<:Real}) = Hermitian($f(A.data, B), sym_uplo(A.uplo)) + $f(A::SymTridiagonal, B::Symmetric) = $f(Symmetric(A, sym_uplo(B.uplo)), B) + $f(A::Symmetric, B::SymTridiagonal) = $f(A, Symmetric(B, sym_uplo(A.uplo))) + $f(A::SymTridiagonal{<:Real}, B::Hermitian) = $f(Hermitian(A, sym_uplo(B.uplo)), B) + $f(A::Hermitian, B::SymTridiagonal{<:Real}) = $f(A, Hermitian(B, sym_uplo(A.uplo))) end end diff --git a/stdlib/LinearAlgebra/src/symmetriceigen.jl b/stdlib/LinearAlgebra/src/symmetriceigen.jl index fee524a702187..68a1b29f5dbc7 100644 --- a/stdlib/LinearAlgebra/src/symmetriceigen.jl +++ b/stdlib/LinearAlgebra/src/symmetriceigen.jl @@ -20,13 +20,6 @@ function eigen!(A::RealHermSymComplexHerm{<:BlasReal,<:StridedMatrix}, alg::Algo throw(ArgumentError("Unsupported value for `alg` keyword.")) end end -function eigen(A::RealHermSymComplexHerm{Float16}; sortby::Union{Function,Nothing}=nothing) - S = eigtype(eltype(A)) - E = eigen!(eigencopy_oftype(A, S), sortby=sortby) - values = convert(AbstractVector{Float16}, E.values) - vectors = convert(AbstractMatrix{isreal(E.vectors) ? Float16 : Complex{Float16}}, E.vectors) - return Eigen(values, vectors) -end """ eigen(A::Union{Hermitian, Symmetric}, alg::Algorithm = default_eigen_alg(A)) -> Eigen @@ -53,10 +46,22 @@ The default `alg` used may change in the future. The following functions are available for `Eigen` objects: [`inv`](@ref), [`det`](@ref), and [`isposdef`](@ref). """ function eigen(A::RealHermSymComplexHerm, alg::Algorithm = default_eigen_alg(A); sortby::Union{Function,Nothing}=nothing) + _eigen(A, alg; sortby) +end + +# we dispatch on the eltype in an internal method to avoid ambiguities +function _eigen(A::RealHermSymComplexHerm, alg::Algorithm; sortby) S = eigtype(eltype(A)) eigen!(eigencopy_oftype(A, S), alg; sortby) end +function _eigen(A::RealHermSymComplexHerm{Float16}, alg::Algorithm; sortby::Union{Function,Nothing}=nothing) + S = eigtype(eltype(A)) + E = eigen!(eigencopy_oftype(A, S), alg, sortby=sortby) + values = convert(AbstractVector{Float16}, E.values) + vectors = convert(AbstractMatrix{isreal(E.vectors) ? Float16 : Complex{Float16}}, E.vectors) + return Eigen(values, vectors) +end eigen!(A::RealHermSymComplexHerm{<:BlasReal,<:StridedMatrix}, irange::UnitRange) = Eigen(LAPACK.syevr!('V', 'I', A.uplo, A.data, 0.0, 0.0, irange.start, irange.stop, -1.0)...) 
diff --git a/stdlib/LinearAlgebra/src/triangular.jl b/stdlib/LinearAlgebra/src/triangular.jl index 923e13e488c85..e1d61e4035966 100644 --- a/stdlib/LinearAlgebra/src/triangular.jl +++ b/stdlib/LinearAlgebra/src/triangular.jl @@ -236,6 +236,20 @@ Base.isstored(A::UpperTriangular, i::Int, j::Int) = @propagate_inbounds getindex(A::UpperTriangular, i::Int, j::Int) = i <= j ? A.data[i,j] : _zero(A.data,j,i) +# these specialized getindex methods enable constant-propagation of the band +Base.@constprop :aggressive @propagate_inbounds function getindex(A::UnitLowerTriangular{T}, b::BandIndex) where {T} + b.band < 0 ? A.data[b] : ifelse(b.band == 0, oneunit(T), zero(T)) +end +Base.@constprop :aggressive @propagate_inbounds function getindex(A::LowerTriangular, b::BandIndex) + b.band <= 0 ? A.data[b] : _zero(A.data, b) +end +Base.@constprop :aggressive @propagate_inbounds function getindex(A::UnitUpperTriangular{T}, b::BandIndex) where {T} + b.band > 0 ? A.data[b] : ifelse(b.band == 0, oneunit(T), zero(T)) +end +Base.@constprop :aggressive @propagate_inbounds function getindex(A::UpperTriangular, b::BandIndex) + b.band >= 0 ? A.data[b] : _zero(A.data, b) +end + _zero_triangular_half_str(::Type{<:UpperOrUnitUpperTriangular}) = "lower" _zero_triangular_half_str(::Type{<:LowerOrUnitLowerTriangular}) = "upper" @@ -330,14 +344,32 @@ function Base.replace_in_print_matrix(A::Union{LowerTriangular,UnitLowerTriangul return i >= j ? s : Base.replace_with_centered_mark(s) end -Base.@constprop :aggressive function istril(A::Union{LowerTriangular,UnitLowerTriangular}, k::Integer=0) +istril(A::UnitLowerTriangular, k::Integer=0) = k >= 0 +istriu(A::UnitUpperTriangular, k::Integer=0) = k <= 0 +Base.@constprop :aggressive function istril(A::LowerTriangular, k::Integer=0) k >= 0 && return true return _istril(A, k) end -Base.@constprop :aggressive function istriu(A::Union{UpperTriangular,UnitUpperTriangular}, k::Integer=0) +@inline function _istril(A::LowerTriangular, k) + P = parent(A) + m = size(A, 1) + for j in max(1, k + 2):m + all(iszero, view(P, j:min(j - k - 1, m), j)) || return false + end + return true +end +Base.@constprop :aggressive function istriu(A::UpperTriangular, k::Integer=0) k <= 0 && return true return _istriu(A, k) end +@inline function _istriu(A::UpperTriangular, k) + P = parent(A) + m = size(A, 1) + for j in 1:min(m, m + k - 1) + all(iszero, view(P, max(1, j - k + 1):j, j)) || return false + end + return true +end istril(A::Adjoint, k::Integer=0) = istriu(A.parent, -k) istril(A::Transpose, k::Integer=0) = istriu(A.parent, -k) istriu(A::Adjoint, k::Integer=0) = istril(A.parent, -k) @@ -832,51 +864,90 @@ fillstored!(A::UpperTriangular, x) = (fillband!(A.data, x, 0, size(A,2)-1); fillstored!(A::UnitUpperTriangular, x) = (fillband!(A.data, x, 1, size(A,2)-1); A) # Binary operations -+(A::UpperTriangular, B::UpperTriangular) = UpperTriangular(A.data + B.data) -+(A::LowerTriangular, B::LowerTriangular) = LowerTriangular(A.data + B.data) -+(A::UpperTriangular, B::UnitUpperTriangular) = UpperTriangular(A.data + triu(B.data, 1) + I) -+(A::LowerTriangular, B::UnitLowerTriangular) = LowerTriangular(A.data + tril(B.data, -1) + I) -+(A::UnitUpperTriangular, B::UpperTriangular) = UpperTriangular(triu(A.data, 1) + B.data + I) -+(A::UnitLowerTriangular, B::LowerTriangular) = LowerTriangular(tril(A.data, -1) + B.data + I) -+(A::UnitUpperTriangular, B::UnitUpperTriangular) = UpperTriangular(triu(A.data, 1) + triu(B.data, 1) + 2I) -+(A::UnitLowerTriangular, B::UnitLowerTriangular) = LowerTriangular(tril(A.data, 
-1) + tril(B.data, -1) + 2I) +# use broadcasting if the parents are strided, where we loop only over the triangular part +function +(A::UpperTriangular, B::UpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + UpperTriangular(A.data + B.data) +end +function +(A::LowerTriangular, B::LowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + LowerTriangular(A.data + B.data) +end +function +(A::UpperTriangular, B::UnitUpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + UpperTriangular(A.data + triu(B.data, 1) + I) +end +function +(A::LowerTriangular, B::UnitLowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + LowerTriangular(A.data + tril(B.data, -1) + I) +end +function +(A::UnitUpperTriangular, B::UpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + UpperTriangular(triu(A.data, 1) + B.data + I) +end +function +(A::UnitLowerTriangular, B::LowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + LowerTriangular(tril(A.data, -1) + B.data + I) +end +function +(A::UnitUpperTriangular, B::UnitUpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + UpperTriangular(triu(A.data, 1) + triu(B.data, 1) + 2I) +end +function +(A::UnitLowerTriangular, B::UnitLowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .+ B + LowerTriangular(tril(A.data, -1) + tril(B.data, -1) + 2I) +end +(A::AbstractTriangular, B::AbstractTriangular) = copyto!(similar(parent(A)), A) + copyto!(similar(parent(B)), B) --(A::UpperTriangular, B::UpperTriangular) = UpperTriangular(A.data - B.data) --(A::LowerTriangular, B::LowerTriangular) = LowerTriangular(A.data - B.data) --(A::UpperTriangular, B::UnitUpperTriangular) = UpperTriangular(A.data - triu(B.data, 1) - I) --(A::LowerTriangular, B::UnitLowerTriangular) = LowerTriangular(A.data - tril(B.data, -1) - I) --(A::UnitUpperTriangular, B::UpperTriangular) = UpperTriangular(triu(A.data, 1) - B.data + I) --(A::UnitLowerTriangular, B::LowerTriangular) = LowerTriangular(tril(A.data, -1) - B.data + I) --(A::UnitUpperTriangular, B::UnitUpperTriangular) = UpperTriangular(triu(A.data, 1) - triu(B.data, 1)) --(A::UnitLowerTriangular, B::UnitLowerTriangular) = LowerTriangular(tril(A.data, -1) - tril(B.data, -1)) --(A::AbstractTriangular, B::AbstractTriangular) = copyto!(similar(parent(A)), A) - copyto!(similar(parent(B)), B) - -# use broadcasting if the parents are strided, where we loop only over the triangular part -for op in (:+, :-) - for TM1 in (:LowerTriangular, :UnitLowerTriangular), TM2 in (:LowerTriangular, :UnitLowerTriangular) - @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) - end - for TM1 in (:UpperTriangular, :UnitUpperTriangular), TM2 in (:UpperTriangular, :UnitUpperTriangular) - @eval $op(A::$TM1{<:Any, <:StridedMaybeAdjOrTransMat}, B::$TM2{<:Any, <:StridedMaybeAdjOrTransMat}) = broadcast($op, A, B) - end +function -(A::UpperTriangular, B::UpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + UpperTriangular(A.data - B.data) +end +function -(A::LowerTriangular, B::LowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + LowerTriangular(A.data - B.data) +end +function 
-(A::UpperTriangular, B::UnitUpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + UpperTriangular(A.data - triu(B.data, 1) - I) +end +function -(A::LowerTriangular, B::UnitLowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + LowerTriangular(A.data - tril(B.data, -1) - I) end +function -(A::UnitUpperTriangular, B::UpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + UpperTriangular(triu(A.data, 1) - B.data + I) +end +function -(A::UnitLowerTriangular, B::LowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + LowerTriangular(tril(A.data, -1) - B.data + I) +end +function -(A::UnitUpperTriangular, B::UnitUpperTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + UpperTriangular(triu(A.data, 1) - triu(B.data, 1)) +end +function -(A::UnitLowerTriangular, B::UnitLowerTriangular) + (parent(A) isa StridedMatrix || parent(B) isa StridedMatrix) && return A .- B + LowerTriangular(tril(A.data, -1) - tril(B.data, -1)) +end +-(A::AbstractTriangular, B::AbstractTriangular) = copyto!(similar(parent(A)), A) - copyto!(similar(parent(B)), B) -function kron(A::UpperTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, B::UpperTriangular{<:Number,<:StridedMaybeAdjOrTransMat}) - C = UpperTriangular(Matrix{promote_op(*, eltype(A), eltype(B))}(undef, _kronsize(A, B))) +function kron(A::UpperTriangular{T,<:StridedMaybeAdjOrTransMat}, B::UpperTriangular{S,<:StridedMaybeAdjOrTransMat}) where {T,S} + C = UpperTriangular(Matrix{promote_op(*, T, S)}(undef, _kronsize(A, B))) return kron!(C, A, B) end -function kron(A::LowerTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, B::LowerTriangular{<:Number,<:StridedMaybeAdjOrTransMat}) - C = LowerTriangular(Matrix{promote_op(*, eltype(A), eltype(B))}(undef, _kronsize(A, B))) +function kron(A::LowerTriangular{T,<:StridedMaybeAdjOrTransMat}, B::LowerTriangular{S,<:StridedMaybeAdjOrTransMat}) where {T,S} + C = LowerTriangular(Matrix{promote_op(*, T, S)}(undef, _kronsize(A, B))) return kron!(C, A, B) end -function kron!(C::UpperTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, A::UpperTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, B::UpperTriangular{<:Number,<:StridedMaybeAdjOrTransMat}) +function kron!(C::UpperTriangular{<:Any,<:StridedMaybeAdjOrTransMat}, A::UpperTriangular{<:Any,<:StridedMaybeAdjOrTransMat}, B::UpperTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) size(C) == _kronsize(A, B) || throw(DimensionMismatch("kron!")) _triukron!(C.data, A.data, B.data) return C end -function kron!(C::LowerTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, A::LowerTriangular{<:Number,<:StridedMaybeAdjOrTransMat}, B::LowerTriangular{<:Number,<:StridedMaybeAdjOrTransMat}) +function kron!(C::LowerTriangular{<:Any,<:StridedMaybeAdjOrTransMat}, A::LowerTriangular{<:Any,<:StridedMaybeAdjOrTransMat}, B::LowerTriangular{<:Any,<:StridedMaybeAdjOrTransMat}) size(C) == _kronsize(A, B) || throw(DimensionMismatch("kron!")) _trilkron!(C.data, A.data, B.data) return C @@ -895,7 +966,7 @@ function _triukron!(C, A, B) C[inB+k, jnB+l] = Aij * B[k, l] end for k = 1:(l-1) - C[inB+l, jnB+k] = zero(eltype(C)) + C[inB+l, jnB+k] = zero(C[inB+k, jnB+l]) end end end @@ -927,7 +998,7 @@ function _trilkron!(C, A, B) C[inB+k, jnB+l] = Aij * B[k, l] end for k = (l+1):n_B - C[inB+l, jnB+k] = zero(eltype(C)) + C[inB+l, jnB+k] = zero(C[inB+k, jnB+l]) end end end @@ -953,8 +1024,6 @@ 
isunit_char(::UnitUpperTriangular) = 'U' isunit_char(::LowerTriangular) = 'N' isunit_char(::UnitLowerTriangular) = 'U' -lmul!(A::Tridiagonal, B::AbstractTriangular) = A*full!(B) - # generic fallback for AbstractTriangular matrices outside of the four subtypes provided here _trimul!(C::AbstractVecOrMat, A::AbstractTriangular, B::AbstractVector) = lmul!(A, copyto!(C, B)) @@ -2766,6 +2835,14 @@ end # Generic eigensystems eigvals(A::AbstractTriangular) = diag(A) +# fallback for unknown types +function eigvecs(A::AbstractTriangular{<:BlasFloat}) + if istriu(A) + eigvecs(UpperTriangular(Matrix(A))) + else # istril(A) + eigvecs(LowerTriangular(Matrix(A))) + end +end function eigvecs(A::AbstractTriangular{T}) where T TT = promote_type(T, Float32) if TT <: BlasFloat diff --git a/stdlib/LinearAlgebra/src/tridiag.jl b/stdlib/LinearAlgebra/src/tridiag.jl index 3f8eb5da9fc9d..ca61eb8519d42 100644 --- a/stdlib/LinearAlgebra/src/tridiag.jl +++ b/stdlib/LinearAlgebra/src/tridiag.jl @@ -228,6 +228,29 @@ end -(A::SymTridiagonal) = SymTridiagonal(-A.dv, -A.ev) *(A::SymTridiagonal, B::Number) = SymTridiagonal(A.dv*B, A.ev*B) *(B::Number, A::SymTridiagonal) = SymTridiagonal(B*A.dv, B*A.ev) +function rmul!(A::SymTridiagonal, x::Number) + if size(A,1) > 2 + # ensure that zeros are preserved on scaling + y = A[3,1] * x + iszero(y) || throw(ArgumentError(LazyString("cannot set index (3, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + A.dv .*= x + _evview(A) .*= x + return A +end +function lmul!(x::Number, B::SymTridiagonal) + if size(B,1) > 2 + # ensure that zeros are preserved on scaling + y = x * B[3,1] + iszero(y) || throw(ArgumentError(LazyString("cannot set index (3, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + @. B.dv = x * B.dv + ev = _evview(B) + @. 
ev = x * ev + return B +end /(A::SymTridiagonal, B::Number) = SymTridiagonal(A.dv/B, A.ev/B) \(B::Number, A::SymTridiagonal) = SymTridiagonal(B\A.dv, B\A.ev) ==(A::SymTridiagonal{<:Number}, B::SymTridiagonal{<:Number}) = @@ -296,8 +319,6 @@ eigmax(A::SymTridiagonal) = eigvals(A, size(A, 1):size(A, 1))[1] eigmin(A::SymTridiagonal) = eigvals(A, 1:1)[1] #Compute selected eigenvectors only corresponding to particular eigenvalues -eigvecs(A::SymTridiagonal) = eigen(A).vectors - """ eigvecs(A::SymTridiagonal[, eigvals]) -> Matrix @@ -372,7 +393,7 @@ function tril!(M::SymTridiagonal{T}, k::Integer=0) where T return Tridiagonal(M.ev,M.dv,zero(M.ev)) elseif k == 0 return Tridiagonal(M.ev,M.dv,zero(M.ev)) - elseif k >= 1 + else # if k >= 1 return Tridiagonal(M.ev,M.dv,copy(M.ev)) end end @@ -391,7 +412,7 @@ function triu!(M::SymTridiagonal{T}, k::Integer=0) where T return Tridiagonal(zero(M.ev),M.dv,M.ev) elseif k == 0 return Tridiagonal(zero(M.ev),M.dv,M.ev) - elseif k <= -1 + else # if k <= -1 return Tridiagonal(M.ev,M.dv,copy(M.ev)) end end @@ -836,6 +857,30 @@ tr(M::Tridiagonal) = sum(M.d) -(A::Tridiagonal) = Tridiagonal(-A.dl, -A.d, -A.du) *(A::Tridiagonal, B::Number) = Tridiagonal(A.dl*B, A.d*B, A.du*B) *(B::Number, A::Tridiagonal) = Tridiagonal(B*A.dl, B*A.d, B*A.du) +function rmul!(T::Tridiagonal, x::Number) + if size(T,1) > 2 + # ensure that zeros are preserved on scaling + y = T[3,1] * x + iszero(y) || throw(ArgumentError(LazyString("cannot set index (3, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + T.dl .*= x + T.d .*= x + T.du .*= x + return T +end +function lmul!(x::Number, T::Tridiagonal) + if size(T,1) > 2 + # ensure that zeros are preserved on scaling + y = x * T[3,1] + iszero(y) || throw(ArgumentError(LazyString("cannot set index (3, 1) off ", + lazy"the tridiagonal band to a nonzero value ($y)"))) + end + @. T.dl = x * T.dl + @. T.d = x * T.d + @. T.du = x * T.du + return T +end /(A::Tridiagonal, B::Number) = Tridiagonal(A.dl/B, A.d/B, A.du/B) \(B::Number, A::Tridiagonal) = Tridiagonal(B\A.dl, B\A.d, B\A.du) diff --git a/stdlib/LinearAlgebra/src/uniformscaling.jl b/stdlib/LinearAlgebra/src/uniformscaling.jl index b75886b8d99fb..472ea53078f87 100644 --- a/stdlib/LinearAlgebra/src/uniformscaling.jl +++ b/stdlib/LinearAlgebra/src/uniformscaling.jl @@ -403,6 +403,16 @@ function copyto!(A::Tridiagonal, J::UniformScaling) return A end +""" + copy!(dest::AbstractMatrix, src::UniformScaling) + +Copies a [`UniformScaling`](@ref) onto a matrix. + +!!! compat "Julia 1.12" + This method is available as of Julia 1.12. +""" +Base.copy!(A::AbstractMatrix, J::UniformScaling) = copyto!(A, J) + function cond(J::UniformScaling{T}) where T onereal = inv(one(real(J.λ))) return J.λ ≠ zero(T) ? 
onereal : oftype(onereal, Inf) diff --git a/stdlib/LinearAlgebra/test/addmul.jl b/stdlib/LinearAlgebra/test/addmul.jl index 3fff8289242f7..208fa930e8ee1 100644 --- a/stdlib/LinearAlgebra/test/addmul.jl +++ b/stdlib/LinearAlgebra/test/addmul.jl @@ -164,8 +164,7 @@ end Bc = Matrix(B) returned_mat = mul!(C, A, B, α, β) @test returned_mat === C - # This test is skipped because it is flakey, but should be fixed and put back (see #49966) - @test_skip collect(returned_mat) ≈ α * Ac * Bc + β * Cc rtol=rtol + @test collect(returned_mat) ≈ α * Ac * Bc + β * Cc rtol=rtol y = C[:, 1] x = B[:, 1] @@ -190,8 +189,7 @@ end returned_mat = mul!(C, Af, Bf, α, β) @test returned_mat === C - # This test is skipped because it is flakey, but should be fixed and put back (see #49966) - @test_skip collect(returned_mat) ≈ α * Ac * Bc + β * Cc rtol=rtol + @test collect(returned_mat) ≈ α * Ac * Bc + β * Cc rtol=rtol end end end @@ -203,8 +201,7 @@ end Bc = Matrix(B) returned_mat = mul!(C, A, B, α, zero(eltype(C))) @test returned_mat === C - # This test is skipped because it is flakey, but should be fixed and put back (see #49966) - @test_skip collect(returned_mat) ≈ α * Ac * Bc rtol=rtol + @test collect(returned_mat) ≈ α * Ac * Bc rtol=rtol end end @@ -220,4 +217,26 @@ end end end +@testset "issue #55727" begin + C = zeros(1,1) + @testset "$(nameof(typeof(A)))" for A in Any[Diagonal([NaN]), + Bidiagonal([NaN], Float64[], :U), + Bidiagonal([NaN], Float64[], :L), + SymTridiagonal([NaN], Float64[]), + Tridiagonal(Float64[], [NaN], Float64[]), + ] + @testset "$(nameof(typeof(B)))" for B in Any[ + Diagonal([1.0]), + Bidiagonal([1.0], Float64[], :U), + Bidiagonal([1.0], Float64[], :L), + SymTridiagonal([1.0], Float64[]), + Tridiagonal(Float64[], [1.0], Float64[]), + ] + C .= 0 + @test mul!(C, A, B, 0.0, false)[] === 0.0 + @test mul!(C, B, A, 0.0, false)[] === 0.0 + end + end +end + end # module diff --git a/stdlib/LinearAlgebra/test/bidiag.jl b/stdlib/LinearAlgebra/test/bidiag.jl index ef50658a642fb..d633a99a2390e 100644 --- a/stdlib/LinearAlgebra/test/bidiag.jl +++ b/stdlib/LinearAlgebra/test/bidiag.jl @@ -969,6 +969,19 @@ end end end +@testset "rmul!/lmul! 
with numbers" begin + for T in (Bidiagonal(rand(4), rand(3), :U), Bidiagonal(rand(4), rand(3), :L)) + @test rmul!(copy(T), 0.2) ≈ rmul!(Array(T), 0.2) + @test lmul!(0.2, copy(T)) ≈ lmul!(0.2, Array(T)) + @test_throws ArgumentError rmul!(T, NaN) + @test_throws ArgumentError lmul!(NaN, T) + end + for T in (Bidiagonal(rand(1), rand(0), :U), Bidiagonal(rand(1), rand(0), :L)) + @test all(isnan, rmul!(copy(T), NaN)) + @test all(isnan, lmul!(NaN, copy(T))) + end +end + @testset "mul with Diagonal" begin for n in 0:4 dv, ev = rand(n), rand(max(n-1,0)) @@ -1048,4 +1061,71 @@ end @test mul!(similar(D), B, D) == mul!(similar(D), D, B) == B * D end +@testset "mul for small matrices" begin + @testset for n in 0:6 + D = Diagonal(rand(n)) + v = rand(n) + @testset for uplo in (:L, :U) + B = Bidiagonal(rand(n), rand(max(n-1,0)), uplo) + M = Matrix(B) + + @test B * v ≈ M * v + @test mul!(similar(v), B, v) ≈ M * v + @test mul!(ones(size(v)), B, v, 2, 3) ≈ M * v * 2 .+ 3 + + @test B * B ≈ M * M + @test mul!(similar(B, size(B)), B, B) ≈ M * M + @test mul!(ones(size(B)), B, B, 2, 4) ≈ M * M * 2 .+ 4 + + for m in 0:6 + AL = rand(m,n) + AR = rand(n,m) + @test AL * B ≈ AL * M + @test B * AR ≈ M * AR + @test mul!(similar(AL), AL, B) ≈ AL * M + @test mul!(similar(AR), B, AR) ≈ M * AR + @test mul!(ones(size(AL)), AL, B, 2, 4) ≈ AL * M * 2 .+ 4 + @test mul!(ones(size(AR)), B, AR, 2, 4) ≈ M * AR * 2 .+ 4 + end + + @test B * D ≈ M * D + @test D * B ≈ D * M + @test mul!(similar(B), B, D) ≈ M * D + @test mul!(similar(B), B, D) ≈ M * D + @test mul!(similar(B, size(B)), D, B) ≈ D * M + @test mul!(similar(B, size(B)), B, D) ≈ M * D + @test mul!(ones(size(B)), D, B, 2, 4) ≈ D * M * 2 .+ 4 + @test mul!(ones(size(B)), B, D, 2, 4) ≈ M * D * 2 .+ 4 + end + BL = Bidiagonal(rand(n), rand(max(0, n-1)), :L) + ML = Matrix(BL) + BU = Bidiagonal(rand(n), rand(max(0, n-1)), :U) + MU = Matrix(BU) + T = Tridiagonal(zeros(max(0, n-1)), zeros(n), zeros(max(0, n-1))) + @test mul!(T, BL, BU) ≈ ML * MU + @test mul!(T, BU, BL) ≈ MU * ML + T = Tridiagonal(ones(max(0, n-1)), ones(n), ones(max(0, n-1))) + @test mul!(copy(T), BL, BU, 2, 3) ≈ ML * MU * 2 + T * 3 + @test mul!(copy(T), BU, BL, 2, 3) ≈ MU * ML * 2 + T * 3 + end + + n = 4 + arr = SizedArrays.SizedArray{(2,2)}(reshape([1:4;],2,2)) + for B in ( + Bidiagonal(fill(arr,n), fill(arr,n-1), :L), + Bidiagonal(fill(arr,n), fill(arr,n-1), :U), + ) + @test B * B ≈ Matrix(B) * Matrix(B) + BL = Bidiagonal(fill(arr,n), fill(arr,n-1), :L) + BU = Bidiagonal(fill(arr,n), fill(arr,n-1), :U) + @test BL * B ≈ Matrix(BL) * Matrix(B) + @test BU * B ≈ Matrix(BU) * Matrix(B) + @test B * BL ≈ Matrix(B) * Matrix(BL) + @test B * BU ≈ Matrix(B) * Matrix(BU) + D = Diagonal(fill(arr,n)) + @test D * B ≈ Matrix(D) * Matrix(B) + @test B * D ≈ Matrix(B) * Matrix(D) + end +end + end # module TestBidiagonal diff --git a/stdlib/LinearAlgebra/test/diagonal.jl b/stdlib/LinearAlgebra/test/diagonal.jl index 83d5e4fcdf170..98f5498c71033 100644 --- a/stdlib/LinearAlgebra/test/diagonal.jl +++ b/stdlib/LinearAlgebra/test/diagonal.jl @@ -1265,6 +1265,17 @@ end @test *(Diagonal(ones(n)), Diagonal(1:n), Diagonal(ones(n)), Diagonal(1:n)) isa Diagonal end +@testset "triple multiplication with a sandwiched BandedMatrix" begin + D = Diagonal(StepRangeLen(NaN, 0, 4)); + B = Bidiagonal(1:4, 1:3, :U) + C = D * B * D + @test iszero(diag(C, 2)) + # test associativity + C1 = (D * B) * D + C2 = D * (B * D) + @test diag(C,2) == diag(C1,2) == diag(C2,2) +end + @testset "diagind" begin D = Diagonal(1:4) M = Matrix(D) @@ -1345,6 +1356,17 @@ end end 
end +@testset "rmul!/lmul! with numbers" begin + D = Diagonal(rand(4)) + @test rmul!(copy(D), 0.2) ≈ rmul!(Array(D), 0.2) + @test lmul!(0.2, copy(D)) ≈ lmul!(0.2, Array(D)) + @test_throws ArgumentError rmul!(D, NaN) + @test_throws ArgumentError lmul!(NaN, D) + D = Diagonal(rand(1)) + @test all(isnan, rmul!(copy(D), NaN)) + @test all(isnan, lmul!(NaN, copy(D))) +end + @testset "+/- with block Symmetric/Hermitian" begin for p in ([1 2; 3 4], [1 2+im; 2-im 4+2im]) m = SizedArrays.SizedArray{(2,2)}(p) diff --git a/stdlib/LinearAlgebra/test/special.jl b/stdlib/LinearAlgebra/test/special.jl index 8d3733e6b1289..4b91bcfc1a4d5 100644 --- a/stdlib/LinearAlgebra/test/special.jl +++ b/stdlib/LinearAlgebra/test/special.jl @@ -790,6 +790,19 @@ end end end +@testset "Partly filled Hermitian and Diagonal algebra" begin + D = Diagonal([1,2]) + for S in (Symmetric, Hermitian), uplo in (:U, :L) + M = Matrix{BigInt}(undef, 2, 2) + M[1,1] = M[2,2] = M[1+(uplo == :L), 1 + (uplo == :U)] = 3 + H = S(M, uplo) + HM = Matrix(H) + @test H + D == D + H == HM + D + @test H - D == HM - D + @test D - H == D - HM + end +end + @testset "block SymTridiagonal" begin m = SizedArrays.SizedArray{(2,2)}(reshape([1:4;;],2,2)) S = SymTridiagonal(fill(m,4), fill(m,3)) diff --git a/stdlib/LinearAlgebra/test/symmetric.jl b/stdlib/LinearAlgebra/test/symmetric.jl index 939e677039dc7..7a51ab9d454af 100644 --- a/stdlib/LinearAlgebra/test/symmetric.jl +++ b/stdlib/LinearAlgebra/test/symmetric.jl @@ -1135,4 +1135,29 @@ end end end +@testset "partly iniitalized matrices" begin + a = Matrix{BigFloat}(undef, 2,2) + a[1] = 1; a[3] = 1; a[4] = 1 + h = Hermitian(a) + s = Symmetric(a) + d = Diagonal([1,1]) + symT = SymTridiagonal([1 1;1 1]) + @test h+d == Array(h) + Array(d) + @test h+symT == Array(h) + Array(symT) + @test s+d == Array(s) + Array(d) + @test s+symT == Array(s) + Array(symT) + @test h-d == Array(h) - Array(d) + @test h-symT == Array(h) - Array(symT) + @test s-d == Array(s) - Array(d) + @test s-symT == Array(s) - Array(symT) + @test d+h == Array(d) + Array(h) + @test symT+h == Array(symT) + Array(h) + @test d+s == Array(d) + Array(s) + @test symT+s == Array(symT) + Array(s) + @test d-h == Array(d) - Array(h) + @test symT-h == Array(symT) - Array(h) + @test d-s == Array(d) - Array(s) + @test symT-s == Array(symT) - Array(s) +end + end # module TestSymmetric diff --git a/stdlib/LinearAlgebra/test/symmetriceigen.jl b/stdlib/LinearAlgebra/test/symmetriceigen.jl index d55d1deb6bf33..71087ae4d8d24 100644 --- a/stdlib/LinearAlgebra/test/symmetriceigen.jl +++ b/stdlib/LinearAlgebra/test/symmetriceigen.jl @@ -171,6 +171,11 @@ end @test D isa Eigen{ComplexF16, Float16, Matrix{ComplexF16}, Vector{Float16}} @test D.values ≈ D32.values @test D.vectors ≈ D32.vectors + + # ensure that different algorithms dispatch correctly + λ, V = eigen(C, LinearAlgebra.QRIteration()) + @test λ isa Vector{Float16} + @test C * V ≈ V * Diagonal(λ) end @testset "complex Symmetric" begin diff --git a/stdlib/LinearAlgebra/test/triangular.jl b/stdlib/LinearAlgebra/test/triangular.jl index 5f0a829f9cdda..ec9a3079e2643 100644 --- a/stdlib/LinearAlgebra/test/triangular.jl +++ b/stdlib/LinearAlgebra/test/triangular.jl @@ -6,7 +6,7 @@ debug = false using Test, LinearAlgebra, Random using LinearAlgebra: BlasFloat, errorbounds, full!, transpose!, UnitUpperTriangular, UnitLowerTriangular, - mul!, rdiv!, rmul!, lmul! 
+ mul!, rdiv!, rmul!, lmul!, BandIndex const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") @@ -442,8 +442,6 @@ Base.getindex(A::MyTriangular, i::Int, j::Int) = A.data[i,j] debug && println("elty1: $elty1, A1: $t1, B: $eltyB") - Tri = Tridiagonal(rand(eltyB,n-1),rand(eltyB,n),rand(eltyB,n-1)) - @test lmul!(Tri,copy(A1)) ≈ Tri*M1 Tri = Tridiagonal(rand(eltyB,n-1),rand(eltyB,n),rand(eltyB,n-1)) C = Matrix{promote_type(elty1,eltyB)}(undef, n, n) mul!(C, Tri, A1) @@ -1050,6 +1048,9 @@ end @test 2\L == 2\B @test real(L) == real(B) @test imag(L) == imag(B) + if MT == LowerTriangular + @test isa(kron(L,L), MT) + end @test kron(L,L) == kron(B,B) @test transpose!(MT(copy(A))) == transpose(L) broken=!(A isa Matrix) @test adjoint!(MT(copy(A))) == adjoint(L) broken=!(A isa Matrix) @@ -1072,6 +1073,9 @@ end @test 2\U == 2\B @test real(U) == real(B) @test imag(U) == imag(B) + if MT == UpperTriangular + @test isa(kron(U,U), MT) + end @test kron(U,U) == kron(B,B) @test transpose!(MT(copy(A))) == transpose(U) broken=!(A isa Matrix) @test adjoint!(MT(copy(A))) == adjoint(U) broken=!(A isa Matrix) @@ -1083,10 +1087,20 @@ end for T in (UpperTriangular, LowerTriangular) t = T(fill(ones(2,2), 2, 2)) m = Matrix(t) + @test isa(kron(t,t), T) @test kron(t, t) ≈ kron(m, m) end end +@testset "kron with triangular matrices of mixed eltypes" begin + for T in (UpperTriangular, LowerTriangular) + U = T(Matrix{Union{Missing,Int}}(fill(2, 2, 2))) + U[1, 1] = missing + @test kron(U, U)[2, 3] == 0 + @test kron(U, U)[3, 2] == 0 + end +end + @testset "copyto! tests" begin @testset "copyto! with aliasing (#39460)" begin M = Matrix(reshape(1:36, 6, 6)) @@ -1200,6 +1214,22 @@ end end end +@testset "eigvecs for AbstractTriangular" begin + S = SizedArrays.SizedArray{(3,3)}(reshape(1:9,3,3)) + for T in (UpperTriangular, UnitUpperTriangular, + LowerTriangular, UnitLowerTriangular) + U = T(S) + V = eigvecs(U) + λ = eigvals(U) + @test U * V ≈ V * Diagonal(λ) + + MU = MyTriangular(U) + V = eigvecs(U) + λ = eigvals(U) + @test MU * V ≈ V * Diagonal(λ) + end +end + @testset "(l/r)mul! and (l/r)div! for generic triangular" begin @testset for T in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular) M = MyTriangular(T(rand(4,4))) @@ -1228,4 +1258,68 @@ end end end +@testset "istriu/istril forwards to parent" begin + @testset "$(nameof(typeof(M)))" for M in [Tridiagonal(rand(n-1), rand(n), rand(n-1)), + Tridiagonal(zeros(n-1), zeros(n), zeros(n-1)), + Diagonal(randn(n)), + Diagonal(zeros(n)), + ] + @testset for TriT in (UpperTriangular, UnitUpperTriangular, LowerTriangular, UnitLowerTriangular) + U = TriT(M) + A = Array(U) + for k in -n:n + @test istriu(U, k) == istriu(A, k) + @test istril(U, k) == istril(A, k) + end + end + end + z = zeros(n,n) + @testset for TriT in (UpperTriangular, UnitUpperTriangular, LowerTriangular, UnitLowerTriangular) + P = Matrix{BigFloat}(undef, n, n) + copytrito!(P, z, TriT <: Union{UpperTriangular, UnitUpperTriangular} ? 
'U' : 'L') + U = TriT(P) + A = Array(U) + @testset for k in -n:n + @test istriu(U, k) == istriu(A, k) + @test istril(U, k) == istril(A, k) + end + end +end + +@testset "indexing with a BandIndex" begin + # these tests should succeed even if the linear index along + # the band isn't a constant, or type-inferred at all + M = rand(Int,2,2) + f(A,j, v::Val{n}) where {n} = Val(A[BandIndex(n,j)]) + function common_tests(M, ind) + j = ind[] + @test @inferred(f(UpperTriangular(M), j, Val(-1))) == Val(0) + @test @inferred(f(UnitUpperTriangular(M), j, Val(-1))) == Val(0) + @test @inferred(f(UnitUpperTriangular(M), j, Val(0))) == Val(1) + @test @inferred(f(LowerTriangular(M), j, Val(1))) == Val(0) + @test @inferred(f(UnitLowerTriangular(M), j, Val(1))) == Val(0) + @test @inferred(f(UnitLowerTriangular(M), j, Val(0))) == Val(1) + end + common_tests(M, Any[1]) + + M = Diagonal([1,2]) + common_tests(M, Any[1]) + # extra tests for banded structure of the parent + for T in (UpperTriangular, UnitUpperTriangular) + @test @inferred(f(T(M), 1, Val(1))) == Val(0) + end + for T in (LowerTriangular, UnitLowerTriangular) + @test @inferred(f(T(M), 1, Val(-1))) == Val(0) + end + + M = Tridiagonal([1,2], [1,2,3], [1,2]) + common_tests(M, Any[1]) + for T in (UpperTriangular, UnitUpperTriangular) + @test @inferred(f(T(M), 1, Val(2))) == Val(0) + end + for T in (LowerTriangular, UnitLowerTriangular) + @test @inferred(f(T(M), 1, Val(-2))) == Val(0) + end +end + end # module TestTriangular diff --git a/stdlib/LinearAlgebra/test/tridiag.jl b/stdlib/LinearAlgebra/test/tridiag.jl index 759d692f8bc68..826a6e62355d0 100644 --- a/stdlib/LinearAlgebra/test/tridiag.jl +++ b/stdlib/LinearAlgebra/test/tridiag.jl @@ -135,27 +135,43 @@ end @test_throws ArgumentError tril!(SymTridiagonal(d, dl), n) @test_throws ArgumentError tril!(Tridiagonal(dl, d, du), -n - 2) @test_throws ArgumentError tril!(Tridiagonal(dl, d, du), n) - @test tril(SymTridiagonal(d,dl)) == Tridiagonal(dl,d,zerosdl) - @test tril(SymTridiagonal(d,dl),1) == Tridiagonal(dl,d,dl) - @test tril(SymTridiagonal(d,dl),-1) == Tridiagonal(dl,zerosd,zerosdl) - @test tril(SymTridiagonal(d,dl),-2) == Tridiagonal(zerosdl,zerosd,zerosdl) - @test tril(Tridiagonal(dl,d,du)) == Tridiagonal(dl,d,zerosdu) - @test tril(Tridiagonal(dl,d,du),1) == Tridiagonal(dl,d,du) - @test tril(Tridiagonal(dl,d,du),-1) == Tridiagonal(dl,zerosd,zerosdu) - @test tril(Tridiagonal(dl,d,du),-2) == Tridiagonal(zerosdl,zerosd,zerosdu) + @test @inferred(tril(SymTridiagonal(d,dl))) == Tridiagonal(dl,d,zerosdl) + @test @inferred(tril(SymTridiagonal(d,dl),1)) == Tridiagonal(dl,d,dl) + @test @inferred(tril(SymTridiagonal(d,dl),-1)) == Tridiagonal(dl,zerosd,zerosdl) + @test @inferred(tril(SymTridiagonal(d,dl),-2)) == Tridiagonal(zerosdl,zerosd,zerosdl) + @test @inferred(tril(Tridiagonal(dl,d,du))) == Tridiagonal(dl,d,zerosdu) + @test @inferred(tril(Tridiagonal(dl,d,du),1)) == Tridiagonal(dl,d,du) + @test @inferred(tril(Tridiagonal(dl,d,du),-1)) == Tridiagonal(dl,zerosd,zerosdu) + @test @inferred(tril(Tridiagonal(dl,d,du),-2)) == Tridiagonal(zerosdl,zerosd,zerosdu) + @test @inferred(tril!(copy(SymTridiagonal(d,dl)))) == Tridiagonal(dl,d,zerosdl) + @test @inferred(tril!(copy(SymTridiagonal(d,dl)),1)) == Tridiagonal(dl,d,dl) + @test @inferred(tril!(copy(SymTridiagonal(d,dl)),-1)) == Tridiagonal(dl,zerosd,zerosdl) + @test @inferred(tril!(copy(SymTridiagonal(d,dl)),-2)) == Tridiagonal(zerosdl,zerosd,zerosdl) + @test @inferred(tril!(copy(Tridiagonal(dl,d,du)))) == Tridiagonal(dl,d,zerosdu) + @test 
@inferred(tril!(copy(Tridiagonal(dl,d,du)),1)) == Tridiagonal(dl,d,du) + @test @inferred(tril!(copy(Tridiagonal(dl,d,du)),-1)) == Tridiagonal(dl,zerosd,zerosdu) + @test @inferred(tril!(copy(Tridiagonal(dl,d,du)),-2)) == Tridiagonal(zerosdl,zerosd,zerosdu) @test_throws ArgumentError triu!(SymTridiagonal(d, dl), -n) @test_throws ArgumentError triu!(SymTridiagonal(d, dl), n + 2) @test_throws ArgumentError triu!(Tridiagonal(dl, d, du), -n) @test_throws ArgumentError triu!(Tridiagonal(dl, d, du), n + 2) - @test triu(SymTridiagonal(d,dl)) == Tridiagonal(zerosdl,d,dl) - @test triu(SymTridiagonal(d,dl),-1) == Tridiagonal(dl,d,dl) - @test triu(SymTridiagonal(d,dl),1) == Tridiagonal(zerosdl,zerosd,dl) - @test triu(SymTridiagonal(d,dl),2) == Tridiagonal(zerosdl,zerosd,zerosdl) - @test triu(Tridiagonal(dl,d,du)) == Tridiagonal(zerosdl,d,du) - @test triu(Tridiagonal(dl,d,du),-1) == Tridiagonal(dl,d,du) - @test triu(Tridiagonal(dl,d,du),1) == Tridiagonal(zerosdl,zerosd,du) - @test triu(Tridiagonal(dl,d,du),2) == Tridiagonal(zerosdl,zerosd,zerosdu) + @test @inferred(triu(SymTridiagonal(d,dl))) == Tridiagonal(zerosdl,d,dl) + @test @inferred(triu(SymTridiagonal(d,dl),-1)) == Tridiagonal(dl,d,dl) + @test @inferred(triu(SymTridiagonal(d,dl),1)) == Tridiagonal(zerosdl,zerosd,dl) + @test @inferred(triu(SymTridiagonal(d,dl),2)) == Tridiagonal(zerosdl,zerosd,zerosdl) + @test @inferred(triu(Tridiagonal(dl,d,du))) == Tridiagonal(zerosdl,d,du) + @test @inferred(triu(Tridiagonal(dl,d,du),-1)) == Tridiagonal(dl,d,du) + @test @inferred(triu(Tridiagonal(dl,d,du),1)) == Tridiagonal(zerosdl,zerosd,du) + @test @inferred(triu(Tridiagonal(dl,d,du),2)) == Tridiagonal(zerosdl,zerosd,zerosdu) + @test @inferred(triu!(copy(SymTridiagonal(d,dl)))) == Tridiagonal(zerosdl,d,dl) + @test @inferred(triu!(copy(SymTridiagonal(d,dl)),-1)) == Tridiagonal(dl,d,dl) + @test @inferred(triu!(copy(SymTridiagonal(d,dl)),1)) == Tridiagonal(zerosdl,zerosd,dl) + @test @inferred(triu!(copy(SymTridiagonal(d,dl)),2)) == Tridiagonal(zerosdl,zerosd,zerosdl) + @test @inferred(triu!(copy(Tridiagonal(dl,d,du)))) == Tridiagonal(zerosdl,d,du) + @test @inferred(triu!(copy(Tridiagonal(dl,d,du)),-1)) == Tridiagonal(dl,d,du) + @test @inferred(triu!(copy(Tridiagonal(dl,d,du)),1)) == Tridiagonal(zerosdl,zerosd,du) + @test @inferred(triu!(copy(Tridiagonal(dl,d,du)),2)) == Tridiagonal(zerosdl,zerosd,zerosdu) @test !istril(SymTridiagonal(d,dl)) @test istril(SymTridiagonal(d,zerosdl)) @@ -919,6 +935,19 @@ end end end +@testset "rmul!/lmul! 
with numbers" begin + for T in (SymTridiagonal(rand(4), rand(3)), Tridiagonal(rand(3), rand(4), rand(3))) + @test rmul!(copy(T), 0.2) ≈ rmul!(Array(T), 0.2) + @test lmul!(0.2, copy(T)) ≈ lmul!(0.2, Array(T)) + @test_throws ArgumentError rmul!(T, NaN) + @test_throws ArgumentError lmul!(NaN, T) + end + for T in (SymTridiagonal(rand(2), rand(1)), Tridiagonal(rand(1), rand(2), rand(1))) + @test all(isnan, rmul!(copy(T), NaN)) + @test all(isnan, lmul!(NaN, copy(T))) + end +end + @testset "mul with empty arrays" begin A = zeros(5,0) T = Tridiagonal(zeros(0), zeros(0), zeros(0)) @@ -954,4 +983,75 @@ end @test sprint(show, S) == "SymTridiagonal($(repr(diag(S))), $(repr(diag(S,1))))" end +@testset "mul for small matrices" begin + @testset for n in 0:6 + for T in ( + Tridiagonal(rand(max(n-1,0)), rand(n), rand(max(n-1,0))), + SymTridiagonal(rand(n), rand(max(n-1,0))), + ) + M = Matrix(T) + @test T * T ≈ M * M + @test mul!(similar(T, size(T)), T, T) ≈ M * M + @test mul!(ones(size(T)), T, T, 2, 4) ≈ M * M * 2 .+ 4 + + for m in 0:6 + AR = rand(n,m) + AL = rand(m,n) + @test AL * T ≈ AL * M + @test T * AR ≈ M * AR + @test mul!(similar(AL), AL, T) ≈ AL * M + @test mul!(similar(AR), T, AR) ≈ M * AR + @test mul!(ones(size(AL)), AL, T, 2, 4) ≈ AL * M * 2 .+ 4 + @test mul!(ones(size(AR)), T, AR, 2, 4) ≈ M * AR * 2 .+ 4 + end + + v = rand(n) + @test T * v ≈ M * v + @test mul!(similar(v), T, v) ≈ M * v + + D = Diagonal(rand(n)) + @test T * D ≈ M * D + @test D * T ≈ D * M + @test mul!(Tridiagonal(similar(T)), D, T) ≈ D * M + @test mul!(Tridiagonal(similar(T)), T, D) ≈ M * D + @test mul!(similar(T, size(T)), D, T) ≈ D * M + @test mul!(similar(T, size(T)), T, D) ≈ M * D + @test mul!(ones(size(T)), D, T, 2, 4) ≈ D * M * 2 .+ 4 + @test mul!(ones(size(T)), T, D, 2, 4) ≈ M * D * 2 .+ 4 + + for uplo in (:U, :L) + B = Bidiagonal(rand(n), rand(max(0, n-1)), uplo) + @test T * B ≈ M * B + @test B * T ≈ B * M + if n <= 2 + @test mul!(Tridiagonal(similar(T)), B, T) ≈ B * M + @test mul!(Tridiagonal(similar(T)), T, B) ≈ M * B + end + @test mul!(similar(T, size(T)), B, T) ≈ B * M + @test mul!(similar(T, size(T)), T, B) ≈ M * B + @test mul!(ones(size(T)), B, T, 2, 4) ≈ B * M * 2 .+ 4 + @test mul!(ones(size(T)), T, B, 2, 4) ≈ M * B * 2 .+ 4 + end + end + end + + n = 4 + arr = SizedArrays.SizedArray{(2,2)}(reshape([1:4;],2,2)) + for T in ( + SymTridiagonal(fill(arr,n), fill(arr,n-1)), + Tridiagonal(fill(arr,n-1), fill(arr,n), fill(arr,n-1)), + ) + @test T * T ≈ Matrix(T) * Matrix(T) + BL = Bidiagonal(fill(arr,n), fill(arr,n-1), :L) + BU = Bidiagonal(fill(arr,n), fill(arr,n-1), :U) + @test BL * T ≈ Matrix(BL) * Matrix(T) + @test BU * T ≈ Matrix(BU) * Matrix(T) + @test T * BL ≈ Matrix(T) * Matrix(BL) + @test T * BU ≈ Matrix(T) * Matrix(BU) + D = Diagonal(fill(arr,n)) + @test D * T ≈ Matrix(D) * Matrix(T) + @test T * D ≈ Matrix(T) * Matrix(D) + end +end + end # module TestTridiagonal diff --git a/stdlib/LinearAlgebra/test/uniformscaling.jl b/stdlib/LinearAlgebra/test/uniformscaling.jl index 92547e8648d8a..d335cd6f63521 100644 --- a/stdlib/LinearAlgebra/test/uniformscaling.jl +++ b/stdlib/LinearAlgebra/test/uniformscaling.jl @@ -226,6 +226,13 @@ let @test copyto!(B, J) == [λ zero(λ)] end + @testset "copy!" 
begin + A = Matrix{Int}(undef, (3,3)) + @test copy!(A, I) == one(A) + B = Matrix{ComplexF64}(undef, (1,2)) + @test copy!(B, J) == [λ zero(λ)] + end + @testset "binary ops with vectors" begin v = complex.(randn(3), randn(3)) # As shown in #20423@GitHub, vector acts like x1 matrix when participating in linear algebra diff --git a/stdlib/Manifest.toml b/stdlib/Manifest.toml index c9d2086432a85..f9fb307190838 100644 --- a/stdlib/Manifest.toml +++ b/stdlib/Manifest.toml @@ -68,12 +68,12 @@ version = "1.11.0" [[deps.JuliaSyntaxHighlighting]] deps = ["StyledStrings"] uuid = "dc6e5ff7-fb65-4e79-a425-ec3bc9c03011" -version = "1.11.0" +version = "1.12.0" [[deps.LLD_jll]] deps = ["Artifacts", "Libdl", "Zlib_jll", "libLLVM_jll"] uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" -version = "16.0.6+4" +version = "18.1.7+2" [[deps.LLVMLibUnwind_jll]] deps = ["Artifacts", "Libdl"] @@ -113,12 +113,12 @@ version = "1.11.0+1" [[deps.LibUV_jll]] deps = ["Artifacts", "Libdl"] uuid = "183b4373-6708-53ba-ad28-60e28bb38547" -version = "2.0.1+16" +version = "2.0.1+17" [[deps.LibUnwind_jll]] deps = ["Artifacts", "Libdl"] uuid = "745a5e78-f969-53e9-954f-d19f2f74f4e3" -version = "1.8.1+0" +version = "1.8.1+1" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -163,7 +163,7 @@ version = "1.2.0" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.26+2" +version = "0.3.28+2" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] @@ -190,6 +190,7 @@ uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" version = "1.11.0" [[deps.Profile]] +deps = ["StyledStrings"] uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" version = "1.11.0" @@ -223,7 +224,7 @@ version = "1.11.0" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -version = "1.11.0" +version = "1.12.0" [[deps.Statistics]] deps = ["LinearAlgebra"] @@ -242,7 +243,7 @@ version = "1.11.0" [[deps.SuiteSparse_jll]] deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "7.7.0+0" +version = "7.8.0+0" [[deps.TOML]] deps = ["Dates"] @@ -281,12 +282,12 @@ version = "2.2.5+0" [[deps.libLLVM_jll]] deps = ["Artifacts", "Libdl"] uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" -version = "16.0.6+4" +version = "18.1.7+2" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+1" +version = "5.11.0+0" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] diff --git a/stdlib/Markdown/src/Markdown.jl b/stdlib/Markdown/src/Markdown.jl index b9ff56297fe51..0d45d9e534df2 100644 --- a/stdlib/Markdown/src/Markdown.jl +++ b/stdlib/Markdown/src/Markdown.jl @@ -56,7 +56,8 @@ const MARKDOWN_FACES = [ __init__() = foreach(addface!, MARKDOWN_FACES) -parse(markdown::AbstractString; flavor = julia) = parse(IOBuffer(markdown), flavor = flavor) +parse(markdown::String; flavor = julia) = parse(IOBuffer(markdown), flavor = flavor) +parse(markdown::AbstractString; flavor = julia) = parse(String(markdown), flavor = flavor) parse_file(file::AbstractString; flavor = julia) = parse(read(file, String), flavor = flavor) function mdexpr(s, flavor = :julia) @@ -122,4 +123,25 @@ import Base.Docs: catdoc catdoc(md::MD...) = MD(md...) +if Base.generating_output() + # workload to reduce latency + md""" + # H1 + ## H2 + ### H3 + **bold text** + *italicized text* + > blockquote + 1. First item + 2. Second item + 3. 
Third item + - First item + - Second item + - Third item + `code` + Horizontal Rule + --- + """ +end + end diff --git a/stdlib/Markdown/src/render/terminal/render.jl b/stdlib/Markdown/src/render/terminal/render.jl index 619b2c8b8ef4a..a97d273131536 100644 --- a/stdlib/Markdown/src/render/terminal/render.jl +++ b/stdlib/Markdown/src/render/terminal/render.jl @@ -116,7 +116,7 @@ function term(io::AnnotIO, md::Header{l}, columns) where l end function term(io::IO, md::Code, columns) - code = if md.language ∈ ("", "julia") + code = if md.language == "julia" highlight(md.code) elseif md.language == "julia-repl" || Base.startswith(md.language, "jldoctest") hl = AnnotatedString(md.code) diff --git a/stdlib/Markdown/test/runtests.jl b/stdlib/Markdown/test/runtests.jl index ffdb735f3b7cd..35608f75b2426 100644 --- a/stdlib/Markdown/test/runtests.jl +++ b/stdlib/Markdown/test/runtests.jl @@ -1308,3 +1308,7 @@ end # https://github.com/JuliaLang/julia/issues/37757 @test insert_hlines(nothing) === nothing end + +@testset "Lazy Strings" begin + @test Markdown.parse(lazy"foo") == Markdown.parse("foo") +end diff --git a/stdlib/Mmap/src/Mmap.jl b/stdlib/Mmap/src/Mmap.jl index e6987582bf511..7d57bf053940d 100644 --- a/stdlib/Mmap/src/Mmap.jl +++ b/stdlib/Mmap/src/Mmap.jl @@ -86,6 +86,8 @@ grow!(::Anonymous,o::Integer,l::Integer) = return function grow!(io::IO, offset::Integer, len::Integer) pos = position(io) filelen = filesize(io) + # If non-regular file skip trying to grow since we know that will fail the ftruncate syscall + filelen == 0 && !isfile(io) && return if filelen < offset + len failure = ccall(:jl_ftruncate, Cint, (Cint, Int64), fd(io), offset+len) Base.systemerror(:ftruncate, failure != 0) @@ -211,14 +213,12 @@ function mmap(io::IO, szfile = convert(Csize_t, len + offset) requestedSizeLarger = false if !(io isa Mmap.Anonymous) - @static if !Sys.isapple() - requestedSizeLarger = szfile > filesize(io) - end + requestedSizeLarger = szfile > filesize(io) end # platform-specific mmapping @static if Sys.isunix() prot, flags, iswrite = settings(file_desc, shared) - if requestedSizeLarger + if requestedSizeLarger && isfile(io) # add a condition to this line to ensure it only checks files if iswrite if grow grow!(io, offset, len) @@ -229,9 +229,6 @@ function mmap(io::IO, throw(ArgumentError("unable to increase file size to $szfile due to read-only permissions")) end end - @static if Sys.isapple() - iswrite && grow && grow!(io, offset, len) - end # mmap the file ptr = ccall(:jl_mmap, Ptr{Cvoid}, (Ptr{Cvoid}, Csize_t, Cint, Cint, RawFD, Int64), C_NULL, mmaplen, prot, flags, file_desc, offset_page) diff --git a/stdlib/Pkg.version b/stdlib/Pkg.version index 60d2914b7f853..34233c58702b4 100644 --- a/stdlib/Pkg.version +++ b/stdlib/Pkg.version @@ -1,4 +1,4 @@ PKG_BRANCH = master -PKG_SHA1 = 43e7849ce37545493d0da3226cd7449f5f88563e +PKG_SHA1 = 51d4910c114a863d888659cb8962c1e161b2a421 PKG_GIT_URL := https://github.com/JuliaLang/Pkg.jl.git PKG_TAR_URL = https://api.github.com/repos/JuliaLang/Pkg.jl/tarball/$1 diff --git a/stdlib/Profile/Project.toml b/stdlib/Profile/Project.toml index ad0107ecf9404..13cd11f70d9b4 100644 --- a/stdlib/Profile/Project.toml +++ b/stdlib/Profile/Project.toml @@ -2,6 +2,12 @@ name = "Profile" uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" version = "1.11.0" +[deps] +StyledStrings = "f489334b-da3d-4c2e-b8f0-e476e12c162b" + +[compat] +StyledStrings = "1.11.0" + [extras] Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" diff --git 
a/stdlib/Profile/docs/src/index.md b/stdlib/Profile/docs/src/index.md index 5b4db77b9cb16..0b358e5decfa9 100644 --- a/stdlib/Profile/docs/src/index.md +++ b/stdlib/Profile/docs/src/index.md @@ -155,3 +155,8 @@ julia> Profile.HeapSnapshot.assemble_snapshot("snapshot", "snapshot.heapsnapshot The resulting heap snapshot file can be uploaded to chrome devtools to be viewed. For more information, see the [chrome devtools docs](https://developer.chrome.com/docs/devtools/memory-problems/heap-snapshots/#view_snapshots). +An alternative for analyzing Chromium heap snapshots is with the VS Code extension +`ms-vscode.vscode-js-profile-flame`. + +The Firefox heap snapshots are of a different format, and Firefox currently may +*not* be used for viewing the heap snapshots generated by Julia. diff --git a/stdlib/Profile/src/Allocs.jl b/stdlib/Profile/src/Allocs.jl index 31d703a151ad8..9d0b18cb468ca 100644 --- a/stdlib/Profile/src/Allocs.jl +++ b/stdlib/Profile/src/Allocs.jl @@ -321,7 +321,7 @@ end function flat(io::IO, data::Vector{Alloc}, cols::Int, fmt::ProfileFormat) fmt.combine || error(ArgumentError("combine=false")) lilist, n, m, totalbytes = parse_flat(fmt.combine ? StackFrame : UInt64, data, fmt.C) - filenamemap = Dict{Symbol,String}() + filenamemap = Profile.FileNameMap() if isempty(lilist) warning_empty() return true diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 799f23034b9ac..b753c9ca88f24 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -7,7 +7,7 @@ Profiling support. ## CPU profiling - `@profile foo()` to profile a specific call. -- `Profile.print()` to print the report. +- `Profile.print()` to print the report. Paths are clickable links in supported terminals and specialized for JULIA_EDITOR etc. - `Profile.clear()` to clear the buffer. - Send a $(Sys.isbsd() ? "SIGINFO (ctrl-t)" : "SIGUSR1") signal to the process to automatically trigger a profile and print. @@ -38,6 +38,8 @@ public clear, Allocs import Base.StackTraces: lookup, UNKNOWN, show_spec_linfo, StackFrame +import Base: AnnotatedString +using StyledStrings: @styled_str const nmeta = 4 # number of metadata fields per block (threadid, taskid, cpu_cycle_clock, thread_sleeping) @@ -63,10 +65,10 @@ end # An internal function called to show the report after an information request (SIGINFO or SIGUSR1). function _peek_report() - iob = IOBuffer() + iob = Base.AnnotatedIOBuffer() ioc = IOContext(IOContext(iob, stderr), :displaysize=>displaysize(stderr)) print(ioc, groupby = [:thread, :task]) - Base.print(stderr, String(take!(iob))) + Base.print(stderr, read(seekstart(iob), AnnotatedString)) end # This is a ref so that it can be overridden by other profile info consumers. const peek_report = Ref{Function}(_peek_report) @@ -196,7 +198,9 @@ const META_OFFSET_THREADID = 5 Prints profiling results to `io` (by default, `stdout`). If you do not supply a `data` vector, the internal buffer of accumulated backtraces -will be used. +will be used. Paths are clickable links in supported terminals and +specialized for [`JULIA_EDITOR`](@ref) with line numbers, or just file +links if no editor is set. 
The keyword arguments can be any combination of: @@ -266,7 +270,7 @@ function print(io::IO, end any_nosamples = true if format === :tree - Base.print(io, "Overhead ╎ [+additional indent] Count File:Line; Function\n") + Base.print(io, "Overhead ╎ [+additional indent] Count File:Line Function\n") Base.print(io, "=========================================================\n") end if groupby == [:task, :thread] @@ -501,12 +505,23 @@ function flatten(data::Vector, lidict::LineInfoDict) return (newdata, newdict) end +const SRC_DIR = normpath(joinpath(Sys.BUILD_ROOT_PATH, "src")) + # Take a file-system path and try to form a concise representation of it # based on the package ecosystem -function short_path(spath::Symbol, filenamecache::Dict{Symbol, String}) +function short_path(spath::Symbol, filenamecache::Dict{Symbol, Tuple{String,String,String}}) return get!(filenamecache, spath) do path = Base.fixup_stdlib_path(string(spath)) - if isabspath(path) + path_norm = normpath(path) + possible_base_path = normpath(joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia", "base", path)) + lib_dir = abspath(Sys.BINDIR, Base.LIBDIR) + if startswith(path_norm, SRC_DIR) + remainder = only(split(path_norm, SRC_DIR, keepempty=false)) + return (isfile(path_norm) ? path_norm : ""), "@juliasrc", remainder + elseif startswith(path_norm, lib_dir) + remainder = only(split(path_norm, lib_dir, keepempty=false)) + return (isfile(path_norm) ? path_norm : ""), "@julialib", remainder + elseif isabspath(path) if ispath(path) # try to replace the file-system prefix with a short "@Module" one, # assuming that profile came from the current machine @@ -522,20 +537,21 @@ function short_path(spath::Symbol, filenamecache::Dict{Symbol, String}) pkgid = Base.project_file_name_uuid(project_file, "") isempty(pkgid.name) && return path # bad Project file # return the joined the module name prefix and path suffix - path = path[nextind(path, sizeof(root)):end] - return string("@", pkgid.name, path) + _short_path = path[nextind(path, sizeof(root)):end] + return path, string("@", pkgid.name), _short_path end end end end - return path - elseif isfile(joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia", "base", path)) + return path, "", path + elseif isfile(possible_base_path) # do the same mechanic for Base (or Core/Compiler) files as above, # but they start from a relative path - return joinpath("@Base", normpath(path)) + return possible_base_path, "@Base", normpath(path) else # for non-existent relative paths (such as "REPL[1]"), just consider simplifying them - return normpath(path) # drop leading "./" + path = normpath(path) + return "", "", path # drop leading "./" end end end @@ -678,7 +694,7 @@ function add_fake_meta(data; threadid = 1, taskid = 0xf0f0f0f0) !isempty(data) && has_meta(data) && error("input already has metadata") cpu_clock_cycle = UInt64(99) data_with_meta = similar(data, 0) - for i = 1:length(data) + for i in eachindex(data) val = data[i] if iszero(val) # META_OFFSET_THREADID, META_OFFSET_TASKID, META_OFFSET_CPUCYCLECLOCK, META_OFFSET_SLEEPSTATE @@ -756,6 +772,8 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, return (lilist, n, m, totalshots, nsleeping) end +const FileNameMap = Dict{Symbol,Tuple{String,String,String}} + function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) lilist, n, m, totalshots, nsleeping = 
parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) @@ -766,7 +784,7 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo m = m[keep] end util_perc = (1 - (nsleeping / totalshots)) * 100 - filenamemap = Dict{Symbol,String}() + filenamemap = FileNameMap() if isempty(lilist) if is_subsection Base.print(io, "Total snapshots: ") @@ -788,9 +806,43 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo return false end +# make a terminal-clickable link to the file and linenum. +# Similar to `define_default_editors` in `Base.Filesystem` but for creating URIs not commands +function editor_link(path::String, linenum::Int) + # Note: the editor path can include spaces (if escaped) and flags. + editor = nothing + for var in ["JULIA_EDITOR", "VISUAL", "EDITOR"] + str = get(ENV, var, nothing) + str isa String || continue + editor = str + break + end + path_encoded = Base.Filesystem.encode_uri_component(path) + if editor !== nothing + if editor == "code" + return "vscode://file/$path_encoded:$linenum" + elseif editor == "subl" || editor == "sublime_text" + return "subl://open?url=file://$path_encoded&line=$linenum" + elseif editor == "idea" || occursin("idea", editor) + return "idea://open?file=$path_encoded&line=$linenum" + elseif editor == "pycharm" + return "pycharm://open?file=$path_encoded&line=$linenum" + elseif editor == "atom" + return "atom://core/open/file?filename=$path_encoded&line=$linenum" + elseif editor == "emacsclient" || editor == "emacs" + return "emacs://open?file=$path_encoded&line=$linenum" + elseif editor == "vim" || editor == "nvim" + # Note: Vim/Nvim may not support standard URI schemes without specific plugins + return "vim://open?file=$path_encoded&line=$linenum" + end + end + # fallback to generic URI, but line numbers are not supported by generic URI + return Base.Filesystem.uripath(path) +end + function print_flat(io::IO, lilist::Vector{StackFrame}, n::Vector{Int}, m::Vector{Int}, - cols::Int, filenamemap::Dict{Symbol,String}, + cols::Int, filenamemap::FileNameMap, fmt::ProfileFormat) if fmt.sortedby === :count p = sortperm(n) @@ -802,18 +854,18 @@ function print_flat(io::IO, lilist::Vector{StackFrame}, lilist = lilist[p] n = n[p] m = m[p] - filenames = String[short_path(li.file, filenamemap) for li in lilist] + pkgnames_filenames = Tuple{String,String,String}[short_path(li.file, filenamemap) for li in lilist] funcnames = String[string(li.func) for li in lilist] wcounts = max(6, ndigits(maximum(n))) wself = max(9, ndigits(maximum(m))) maxline = 1 maxfile = 6 maxfunc = 10 - for i in 1:length(lilist) + for i in eachindex(lilist) li = lilist[i] maxline = max(maxline, li.line) - maxfunc = max(maxfunc, length(funcnames[i])) - maxfile = max(maxfile, length(filenames[i])) + maxfunc = max(maxfunc, textwidth(funcnames[i])) + maxfile = max(maxfile, sum(textwidth, pkgnames_filenames[i][2:3]) + 1) end wline = max(5, ndigits(maxline)) ntext = max(20, cols - wcounts - wself - wline - 3) @@ -829,7 +881,7 @@ function print_flat(io::IO, lilist::Vector{StackFrame}, rpad("File", wfile, " "), " ", lpad("Line", wline, " "), " Function") println(io, lpad("=====", wcounts, " "), " ", lpad("========", wself, " "), " ", rpad("====", wfile, " "), " ", lpad("====", wline, " "), " ========") - for i = 1:length(n) + for i in eachindex(n) n[i] < fmt.mincount && continue li = lilist[i] Base.print(io, lpad(string(n[i]), wcounts, " "), " ") @@ -841,16 +893,29 @@ function print_flat(io::IO, lilist::Vector{StackFrame}, 
Base.print(io, "[any unknown stackframes]") end else - file = filenames[i] + path, pkgname, file = pkgnames_filenames[i] isempty(file) && (file = "[unknown file]") - Base.print(io, rpad(rtruncto(file, wfile), wfile, " "), " ") + pkgcolor = get!(() -> popfirst!(Base.STACKTRACE_MODULECOLORS), PACKAGE_FIXEDCOLORS, pkgname) + Base.printstyled(io, pkgname, color=pkgcolor) + file_trunc = ltruncate(file, max(1, wfile)) + wpad = wfile - textwidth(pkgname) + if !isempty(pkgname) && !startswith(file_trunc, "/") + Base.print(io, "/") + wpad -= 1 + end + if isempty(path) + Base.print(io, rpad(file_trunc, wpad, " ")) + else + link = editor_link(path, li.line) + Base.print(io, rpad(styled"{link=$link:$file_trunc}", wpad, " ")) + end Base.print(io, lpad(li.line > 0 ? string(li.line) : "?", wline, " "), " ") fname = funcnames[i] if !li.from_c && li.linfo !== nothing fname = sprint(show_spec_linfo, li) end isempty(fname) && (fname = "[unknown function]") - Base.print(io, ltruncto(fname, wfunc)) + Base.print(io, rtruncate(fname, wfunc)) end println(io) end @@ -889,21 +954,24 @@ function indent(depth::Int) return indent end -function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, filenamemap::Dict{Symbol,String}, showpointer::Bool) +# mimics Stacktraces +const PACKAGE_FIXEDCOLORS = Dict{String, Any}("@Base" => :gray, "@Core" => :gray) + +function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, filenamemap::FileNameMap, showpointer::Bool) nindent = min(cols>>1, level) ndigoverhead = ndigits(maxes.overhead) ndigcounts = ndigits(maxes.count) ndigline = ndigits(maximum(frame.frame.line for frame in frames)) + 6 ntext = max(30, cols - ndigoverhead - nindent - ndigcounts - ndigline - 6) widthfile = 2*ntext÷5 # min 12 - strs = Vector{String}(undef, length(frames)) + strs = Vector{AnnotatedString{String}}(undef, length(frames)) showextra = false if level > nindent nextra = level - nindent nindent -= ndigits(nextra) + 2 showextra = true end - for i = 1:length(frames) + for i in eachindex(frames) frame = frames[i] li = frame.frame stroverhead = lpad(frame.overhead > 0 ? string(frame.overhead) : "", ndigoverhead, " ") @@ -924,7 +992,7 @@ function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, ma else fname = string(li.func) end - filename = short_path(li.file, filenamemap) + path, pkgname, filename = short_path(li.file, filenamemap) if showpointer fname = string( "0x", @@ -932,17 +1000,26 @@ function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, ma " ", fname) end - strs[i] = string(stroverhead, "╎", base, strcount, " ", - rtruncto(filename, widthfile), - ":", - li.line == -1 ? "?" : string(li.line), - "; ", - fname) + pkgcolor = get!(() -> popfirst!(Base.STACKTRACE_MODULECOLORS), PACKAGE_FIXEDCOLORS, pkgname) + remaining_path = ltruncate(filename, max(1, widthfile - textwidth(pkgname) - 1)) + linenum = li.line == -1 ? "?" : string(li.line) + slash = (!isempty(pkgname) && !startswith(remaining_path, "/")) ? 
"/" : "" + styled_path = styled"{$pkgcolor:$pkgname}$slash$remaining_path:$linenum" + rich_file = if isempty(path) + styled_path + else + link = editor_link(path, li.line) + styled"{link=$link:$styled_path}" + end + strs[i] = Base.annotatedstring(stroverhead, "╎", base, strcount, " ", rich_file, " ", fname) + if frame.overhead > 0 + strs[i] = styled"{bold:$(strs[i])}" + end end else strs[i] = string(stroverhead, "╎", base, strcount, " [unknown stackframe]") end - strs[i] = ltruncto(strs[i], cols) + strs[i] = rtruncate(strs[i], cols) end return strs end @@ -1101,10 +1178,10 @@ end # avoid stack overflows. function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat, is_subsection::Bool) where T maxes = maxstats(bt) - filenamemap = Dict{Symbol,String}() - worklist = [(bt, 0, 0, "")] + filenamemap = FileNameMap() + worklist = [(bt, 0, 0, AnnotatedString(""))] if !is_subsection - Base.print(io, "Overhead ╎ [+additional indent] Count File:Line; Function\n") + Base.print(io, "Overhead ╎ [+additional indent] Count File:Line Function\n") Base.print(io, "=========================================================\n") end while !isempty(worklist) @@ -1135,7 +1212,7 @@ function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat count = down.count count < fmt.mincount && continue count < noisefloor && continue - str = strs[i] + str = strs[i]::AnnotatedString noisefloor_down = fmt.noisefloor > 0 ? floor(Int, fmt.noisefloor * sqrt(count)) : 0 pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) end @@ -1196,24 +1273,7 @@ function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) return [(v[i], k[i]) for i in p] end -# Utilities -function rtruncto(str::String, w::Int) - if textwidth(str) <= w - return str - else - return string("…", str[prevind(str, end, w-2):end]) - end -end -function ltruncto(str::String, w::Int) - if textwidth(str) <= w - return str - else - return string(str[1:nextind(str, 1, w-2)], "…") - end -end - - -truncto(str::Symbol, w::Int) = truncto(string(str), w) +## Utilities # Order alphabetically (file, function) and then by line number function liperm(lilist::Vector{StackFrame}) diff --git a/stdlib/Profile/src/heapsnapshot_reassemble.jl b/stdlib/Profile/src/heapsnapshot_reassemble.jl index 2413ae538b8ac..b2d86ee1f27b6 100644 --- a/stdlib/Profile/src/heapsnapshot_reassemble.jl +++ b/stdlib/Profile/src/heapsnapshot_reassemble.jl @@ -155,7 +155,8 @@ function assemble_snapshot(in_prefix, io::IO) _write_decimal_number(io, nodes.edge_count[i], _digits_buf) print(io, ",0,0") end - print(io, "],\"edges\":[") + print(io, "],\n") + print(io, "\"edges\":[") e = 1 for n in 1:length(nodes) count = nodes.edge_count[n] @@ -177,6 +178,13 @@ function assemble_snapshot(in_prefix, io::IO) end println(io, "],") + # not used. 
Required by microsoft/vscode-v8-heap-tools + # This order of these fields is required by chrome dev tools otherwise loading fails + println(io, "\"trace_function_infos\":[],") + println(io, "\"trace_tree\":[],") + println(io, "\"samples\":[],") + println(io, "\"locations\":[],") + println(io, "\"strings\":[") open(string(in_prefix, ".strings"), "r") do strings_io first = true diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index 32d628130c4ac..1769cbd12da3e 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -168,7 +168,8 @@ let cmd = Base.julia_cmd() println("done") print(Profile.len_data()) """ - p = open(`$cmd -e $script`) + # use multiple threads here to ensure that profiling works with threading + p = open(`$cmd -t2 -e $script`) t = Timer(120) do t # should be under 10 seconds, so give it 2 minutes then report failure println("KILLING debuginfo registration test BY PROFILE TEST WATCHDOG\n") diff --git a/stdlib/REPL/src/Pkg_beforeload.jl b/stdlib/REPL/src/Pkg_beforeload.jl index ebd0cd255ce19..472fbc924668d 100644 --- a/stdlib/REPL/src/Pkg_beforeload.jl +++ b/stdlib/REPL/src/Pkg_beforeload.jl @@ -88,7 +88,7 @@ function projname(project_file::String) end for depot in Base.DEPOT_PATH envdir = joinpath(depot, "environments") - if startswith(abspath(project_file), abspath(envdir)) + if startswith(safe_realpath(project_file), safe_realpath(envdir)) return "@" * name end end diff --git a/stdlib/REPL/src/REPL.jl b/stdlib/REPL/src/REPL.jl index ddf2f55d0b9f7..272b907165341 100644 --- a/stdlib/REPL/src/REPL.jl +++ b/stdlib/REPL/src/REPL.jl @@ -74,7 +74,17 @@ end function _UndefVarError_warnfor(io::IO, m::Module, var::Symbol) Base.isbindingresolved(m, var) || return false (Base.isexported(m, var) || Base.ispublic(m, var)) || return false - print(io, "\nHint: a global variable of this name also exists in $m.") + active_mod = Base.active_module() + print(io, "\nHint: ") + if isdefined(active_mod, Symbol(m)) + print(io, "a global variable of this name also exists in $m.") + else + if Symbol(m) == var + print(io, "$m is loaded but not imported in the active module $active_mod.") + else + print(io, "a global variable of this name may be made accessible by importing $m in the current active module $active_mod") + end + end return true end @@ -758,11 +768,11 @@ setmodifiers!(c::REPLCompletionProvider, m::LineEdit.Modifiers) = c.modifiers = Set `mod` as the default contextual module in the REPL, both for evaluating expressions and printing them. 
""" -function activate(mod::Module=Main) +function activate(mod::Module=Main; interactive_utils::Bool=true) mistate = (Base.active_repl::LineEditREPL).mistate mistate === nothing && return nothing mistate.active_module = mod - Base.load_InteractiveUtils(mod) + interactive_utils && Base.load_InteractiveUtils(mod) return nothing end diff --git a/stdlib/REPL/src/Terminals.jl b/stdlib/REPL/src/Terminals.jl index 4f3e99f1d206c..0cf6888d248e8 100644 --- a/stdlib/REPL/src/Terminals.jl +++ b/stdlib/REPL/src/Terminals.jl @@ -97,6 +97,7 @@ abstract type UnixTerminal <: TextTerminal end pipe_reader(t::UnixTerminal) = t.in_stream::IO pipe_writer(t::UnixTerminal) = t.out_stream::IO +@nospecialize mutable struct TerminalBuffer <: UnixTerminal out_stream::IO end @@ -107,6 +108,7 @@ mutable struct TTYTerminal <: UnixTerminal out_stream::IO err_stream::IO end +@specialize const CSI = "\x1b[" diff --git a/stdlib/REPL/src/docview.jl b/stdlib/REPL/src/docview.jl index 5086aa0c9485c..3c5e102bb657e 100644 --- a/stdlib/REPL/src/docview.jl +++ b/stdlib/REPL/src/docview.jl @@ -81,7 +81,8 @@ function formatdoc(d::DocStr) for part in d.text formatdoc(buffer, d, part) end - Markdown.MD(Any[Markdown.parse(seekstart(buffer))]) + md = Markdown.MD(Any[Markdown.parse(seekstart(buffer))]) + assume_julia_code!(md) end @noinline formatdoc(buffer, d, part) = print(buffer, part) @@ -95,6 +96,27 @@ function parsedoc(d::DocStr) d.object end +""" + assume_julia_code!(doc::Markdown.MD) -> doc + +Assume that code blocks with no language specified are Julia code. +""" +function assume_julia_code!(doc::Markdown.MD) + assume_julia_code!(doc.content) + doc +end + +function assume_julia_code!(blocks::Vector) + for (i, block) in enumerate(blocks) + if block isa Markdown.Code && block.language == "" + blocks[i] = Markdown.Code("julia", block.code) + elseif block isa Vector || block isa Markdown.MD + assume_julia_code!(block) + end + end + blocks +end + ## Trimming long help ("# Extended help") struct Message # For direct messages to the terminal diff --git a/stdlib/REPL/src/precompile.jl b/stdlib/REPL/src/precompile.jl index a6effb9f013fc..f7961a205e0b1 100644 --- a/stdlib/REPL/src/precompile.jl +++ b/stdlib/REPL/src/precompile.jl @@ -13,7 +13,20 @@ finally Base._track_dependencies[] = true end -let +function repl_workload() + # these are intentionally triggered + allowed_errors = [ + "BoundsError: attempt to access 0-element Vector{Any} at index [1]", + "MethodError: no method matching f(::$Int, ::$Int)", + "Padding of type", # reinterpret docstring has ERROR examples + ] + function check_errors(out) + str = String(out) + if occursin("ERROR:", str) && !any(occursin(e, str) for e in allowed_errors) + @error "Unexpected error (Review REPL precompilation with debug_output on):\n$str" + exit(1) + end + end ## Debugging options # View the code sent to the repl by setting this to `stdout` debug_output = devnull # or stdout @@ -24,6 +37,20 @@ let UP_ARROW = "\e[A" DOWN_ARROW = "\e[B" + # This is notified as soon as the first prompt appears + repl_init_event = Base.Event() + + atreplinit() do repl + # Main is closed so we can't evaluate in it, but atreplinit runs at + # a time that repl.mistate === nothing so REPL.activate fails. So do + # it async and wait for the first prompt to know its ready. 
+ t = @async begin + wait(repl_init_event) + REPL.activate(REPL.Precompile; interactive_utils=false) + end + Base.errormonitor(t) + end + repl_script = """ 2+2 print("") @@ -96,7 +123,7 @@ let repltask = @task try Base.run_std_repl(REPL, false, :yes, true) finally - redirect_stderr(isopen(orig_stderr) ? orig_stderr : devnull) + redirect_stdin(isopen(orig_stdin) ? orig_stdin : devnull) redirect_stdout(isopen(orig_stdout) ? orig_stdout : devnull) close(pts) end @@ -106,17 +133,17 @@ let redirect_stdin(pts) redirect_stdout(pts) redirect_stderr(pts) - REPL.print_qualified_access_warning(Base.Iterators, Base, :minimum) # trigger the warning while stderr is suppressed try - schedule(repltask) - # wait for the definitive prompt before start writing to the TTY - readuntil(output_copy, JULIA_PROMPT) + REPL.print_qualified_access_warning(Base.Iterators, Base, :minimum) # trigger the warning while stderr is suppressed finally redirect_stderr(isopen(orig_stderr) ? orig_stderr : devnull) end + schedule(repltask) + # wait for the definitive prompt before start writing to the TTY + check_errors(readuntil(output_copy, JULIA_PROMPT)) write(debug_output, "\n#### REPL STARTED ####\n") sleep(0.1) - readavailable(output_copy) + check_errors(readavailable(output_copy)) # Input our script precompile_lines = split(repl_script::String, '\n'; keepempty=false) curr = 0 @@ -124,16 +151,16 @@ let sleep(0.1) curr += 1 # consume any other output - bytesavailable(output_copy) > 0 && readavailable(output_copy) + bytesavailable(output_copy) > 0 && check_errors(readavailable(output_copy)) # push our input write(debug_output, "\n#### inputting statement: ####\n$(repr(l))\n####\n") # If the line ends with a CTRL_C, don't write an extra newline, which would # cause a second empty prompt. Our code below expects one new prompt per # input line and can race out of sync with the unexpected second line. endswith(l, CTRL_C) ? write(ptm, l) : write(ptm, l, "\n") - readuntil(output_copy, "\n") + check_errors(readuntil(output_copy, "\n")) # wait for the next prompt-like to appear - readuntil(output_copy, "\n") + check_errors(readuntil(output_copy, "\n")) strbuf = "" while !eof(output_copy) strbuf *= String(readavailable(output_copy)) @@ -143,14 +170,16 @@ let occursin(HELP_PROMPT, strbuf) && break sleep(0.1) end + notify(repl_init_event) + check_errors(strbuf) end write(debug_output, "\n#### COMPLETED - Closing REPL ####\n") write(ptm, "$CTRL_D") wait(repltask) finally - close(pts) redirect_stdin(isopen(orig_stdin) ? orig_stdin : devnull) redirect_stdout(isopen(orig_stdout) ? 
orig_stdout : devnull) + close(pts) end wait(tee) end @@ -158,9 +187,38 @@ let nothing end -precompile(Tuple{typeof(Base.setindex!), Base.Dict{Any, Any}, Any, Int}) -precompile(Tuple{typeof(Base.delete!), Base.Set{Any}, String}) -precompile(Tuple{typeof(Base.:(==)), Char, String}) -precompile(Tuple{typeof(Base.reseteof), Base.TTY}) +# Copied from PrecompileTools.jl +let + function check_edges(node) + parentmi = node.mi_info.mi + for child in node.children + childmi = child.mi_info.mi + if !(isdefined(childmi, :backedges) && parentmi ∈ childmi.backedges) + precompile(childmi.specTypes) + end + check_edges(child) + end + end + + if Base.generating_output() && Base.JLOptions().use_pkgimages != 0 + Core.Compiler.Timings.reset_timings() + Core.Compiler.__set_measure_typeinf(true) + try + repl_workload() + finally + Core.Compiler.__set_measure_typeinf(false) + Core.Compiler.Timings.close_current_timer() + end + roots = Core.Compiler.Timings._timings[1].children + for child in roots + precompile(child.mi_info.mi.specTypes) + check_edges(child) + end + precompile(Tuple{typeof(Base.setindex!), Base.Dict{Any, Any}, Any, Int}) + precompile(Tuple{typeof(Base.delete!), Base.Set{Any}, String}) + precompile(Tuple{typeof(Base.:(==)), Char, String}) + precompile(Tuple{typeof(Base.reseteof), Base.TTY}) + end +end end # Precompile diff --git a/stdlib/REPL/test/repl.jl b/stdlib/REPL/test/repl.jl index f4d594b2a02e1..85a8137fa003e 100644 --- a/stdlib/REPL/test/repl.jl +++ b/stdlib/REPL/test/repl.jl @@ -1966,11 +1966,20 @@ end @testset "Dummy Pkg prompt" begin # do this in an empty depot to test default for new users - withenv("JULIA_DEPOT_PATH" => mktempdir(), "JULIA_LOAD_PATH" => nothing) do + withenv("JULIA_DEPOT_PATH" => mktempdir() * (Sys.iswindows() ? ";" : ":"), "JULIA_LOAD_PATH" => nothing) do prompt = readchomp(`$(Base.julia_cmd()[1]) --startup-file=no -e "using REPL; print(REPL.Pkg_promptf())"`) @test prompt == "(@v$(VERSION.major).$(VERSION.minor)) pkg> " end + # Issue 55850 + tmp_55850 = mktempdir() + tmp_sym_link = joinpath(tmp_55850, "sym") + symlink(tmp_55850, tmp_sym_link; dir_target=true) + withenv("JULIA_DEPOT_PATH" => tmp_sym_link * (Sys.iswindows() ? ";" : ":"), "JULIA_LOAD_PATH" => nothing) do + prompt = readchomp(`$(Base.julia_cmd()[1]) --startup-file=no -e "using REPL; print(REPL.projname(REPL.find_project_file()))"`) + @test prompt == "@v$(VERSION.major).$(VERSION.minor)" + end + get_prompt(proj::String) = readchomp(`$(Base.julia_cmd()[1]) --startup-file=no $(proj) -e "using REPL; print(REPL.Pkg_promptf())"`) @test get_prompt("--project=$(pkgdir(REPL))") == "(REPL) pkg> " diff --git a/stdlib/Random/src/Xoshiro.jl b/stdlib/Random/src/Xoshiro.jl index 5569d6d5c1da5..09a3e386e9a2b 100644 --- a/stdlib/Random/src/Xoshiro.jl +++ b/stdlib/Random/src/Xoshiro.jl @@ -185,8 +185,8 @@ end TaskLocalRNG The `TaskLocalRNG` has state that is local to its task, not its thread. -It is seeded upon task creation, from the state of its parent task. -Therefore, task creation is an event that changes the parent's RNG state. +It is seeded upon task creation, from the state of its parent task, but without +advancing the state of the parent's RNG. As an upside, the `TaskLocalRNG` is pretty fast, and permits reproducible multithreaded simulations (barring race conditions), independent of scheduler @@ -203,6 +203,9 @@ may be any integer. !!! compat "Julia 1.11" Seeding `TaskLocalRNG()` with a negative integer seed requires at least Julia 1.11. + +!!! 
compat "Julia 1.10" + Task creation no longer advances the parent task's RNG state as of Julia 1.10. """ struct TaskLocalRNG <: AbstractRNG end TaskLocalRNG(::Nothing) = TaskLocalRNG() @@ -294,7 +297,7 @@ rand(r::Union{TaskLocalRNG, Xoshiro}, ::SamplerTrivial{UInt52{UInt64}}) = ran rand(r::Union{TaskLocalRNG, Xoshiro}, ::SamplerTrivial{UInt104{UInt128}}) = rand(r, UInt104Raw()) rand(r::Union{TaskLocalRNG, Xoshiro}, ::SamplerTrivial{CloseOpen01{Float16}}) = - Float16(Float32(rand(r, UInt16) >>> 5) * Float32(0x1.0p-11)) + Float16(rand(r, UInt16) >>> 5) * Float16(0x1.0p-11) rand(r::Union{TaskLocalRNG, Xoshiro}, ::SamplerTrivial{CloseOpen01{Float32}}) = Float32(rand(r, UInt32) >>> 8) * Float32(0x1.0p-24) diff --git a/stdlib/Random/src/XoshiroSimd.jl b/stdlib/Random/src/XoshiroSimd.jl index 6d4886f31d22b..1c5f8306cc302 100644 --- a/stdlib/Random/src/XoshiroSimd.jl +++ b/stdlib/Random/src/XoshiroSimd.jl @@ -44,6 +44,17 @@ simdThreshold(::Type{Bool}) = 640 l = Float32(li >>> 8) * Float32(0x1.0p-24) (UInt64(reinterpret(UInt32, u)) << 32) | UInt64(reinterpret(UInt32, l)) end +@inline function _bits2float(x::UInt64, ::Type{Float16}) + i1 = (x>>>48) % UInt16 + i2 = (x>>>32) % UInt16 + i3 = (x>>>16) % UInt16 + i4 = x % UInt16 + f1 = Float16(i1 >>> 5) * Float16(0x1.0p-11) + f2 = Float16(i2 >>> 5) * Float16(0x1.0p-11) + f3 = Float16(i3 >>> 5) * Float16(0x1.0p-11) + f4 = Float16(i4 >>> 5) * Float16(0x1.0p-11) + return (UInt64(reinterpret(UInt16, f1)) << 48) | (UInt64(reinterpret(UInt16, f2)) << 32) | (UInt64(reinterpret(UInt16, f3)) << 16) | UInt64(reinterpret(UInt16, f4)) +end # required operations. These could be written more concisely with `ntuple`, but the compiler # sometimes refuses to properly vectorize. @@ -118,6 +129,18 @@ for N in [4,8,16] ret <$N x i64> %i """ @eval @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x) + + code = """ + %as16 = bitcast <$N x i64> %0 to <$(4N) x i16> + %shiftamt = shufflevector <1 x i16> , <1 x i16> undef, <$(4N) x i32> zeroinitializer + %sh = lshr <$(4N) x i16> %as16, %shiftamt + %f = uitofp <$(4N) x i16> %sh to <$(4N) x half> + %scale = shufflevector <1 x half> , <1 x half> undef, <$(4N) x i32> zeroinitializer + %m = fmul <$(4N) x half> %f, %scale + %i = bitcast <$(4N) x half> %m to <$N x i64> + ret <$N x i64> %i + """ + @eval @inline _bits2float(x::$VT, ::Type{Float16}) = llvmcall($code, $VT, Tuple{$VT}, x) end end @@ -137,7 +160,7 @@ end _id(x, T) = x -@inline function xoshiro_bulk(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, T::Union{Type{UInt8}, Type{Bool}, Type{Float32}, Type{Float64}}, ::Val{N}, f::F = _id) where {N, F} +@inline function xoshiro_bulk(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, T::Union{Type{UInt8}, Type{Bool}, Type{Float16}, Type{Float32}, Type{Float64}}, ::Val{N}, f::F = _id) where {N, F} if len >= simdThreshold(T) written = xoshiro_bulk_simd(rng, dst, len, T, Val(N), f) len -= written @@ -265,13 +288,8 @@ end end -function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float32}, ::SamplerTrivial{CloseOpen01{Float32}}) - GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth(), _bits2float) - dst -end - -function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float64}, ::SamplerTrivial{CloseOpen01{Float64}}) - GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth(), _bits2float) +function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{T}, 
::SamplerTrivial{CloseOpen01{T}}) where {T<:Union{Float16,Float32,Float64}} + GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof(T), T, xoshiroWidth(), _bits2float) dst end diff --git a/stdlib/Random/src/generation.jl b/stdlib/Random/src/generation.jl index d8bb48d2764d2..b605dff9e5d80 100644 --- a/stdlib/Random/src/generation.jl +++ b/stdlib/Random/src/generation.jl @@ -66,7 +66,7 @@ function _rand!(rng::AbstractRNG, z::BigFloat, sp::SamplerBigFloat) limbs[end] |= Limb_high_bit end z.sign = 1 - GC.@preserve limbs unsafe_copyto!(z.d, pointer(limbs), sp.nlimbs) + copyto!(z.d, limbs) randbool end diff --git a/stdlib/TOML/src/TOML.jl b/stdlib/TOML/src/TOML.jl index 94d2808c0bc24..b37a5ca83c251 100644 --- a/stdlib/TOML/src/TOML.jl +++ b/stdlib/TOML/src/TOML.jl @@ -25,7 +25,7 @@ module Internals end # https://github.com/JuliaLang/julia/issues/36605 -readstring(f::AbstractString) = isfile(f) ? read(f, String) : error(repr(f), ": No such file") +_readstring(f::AbstractString) = isfile(f) ? read(f, String) : error(repr(f), ": No such file") """ Parser() @@ -36,12 +36,14 @@ explicitly create a `Parser` but instead one directly use use will however reuse some internal data structures which can be beneficial for performance if a larger number of small files are parsed. """ -const Parser = Internals.Parser +struct Parser + _p::Internals.Parser{Dates} +end # Dates-enabled constructors -Parser() = Parser{Dates}() -Parser(io::IO) = Parser{Dates}(io) -Parser(str::String; filepath=nothing) = Parser{Dates}(str; filepath) +Parser() = Parser(Internals.Parser{Dates}()) +Parser(io::IO) = Parser(Internals.Parser{Dates}(io)) +Parser(str::String; filepath=nothing) = Parser(Internals.Parser{Dates}(str; filepath)) """ parsefile(f::AbstractString) @@ -53,9 +55,9 @@ Parse file `f` and return the resulting table (dictionary). Throw a See also [`TOML.tryparsefile`](@ref). """ parsefile(f::AbstractString) = - Internals.parse(Parser(readstring(f); filepath=abspath(f))) + Internals.parse(Internals.Parser{Dates}(_readstring(f); filepath=abspath(f))) parsefile(p::Parser, f::AbstractString) = - Internals.parse(Internals.reinit!(p, readstring(f); filepath=abspath(f))) + Internals.parse(Internals.reinit!(p._p, _readstring(f); filepath=abspath(f))) """ tryparsefile(f::AbstractString) @@ -67,9 +69,9 @@ Parse file `f` and return the resulting table (dictionary). Return a See also [`TOML.parsefile`](@ref). """ tryparsefile(f::AbstractString) = - Internals.tryparse(Parser(readstring(f); filepath=abspath(f))) + Internals.tryparse(Internals.Parser{Dates}(_readstring(f); filepath=abspath(f))) tryparsefile(p::Parser, f::AbstractString) = - Internals.tryparse(Internals.reinit!(p, readstring(f); filepath=abspath(f))) + Internals.tryparse(Internals.reinit!(p._p, _readstring(f); filepath=abspath(f))) """ parse(x::Union{AbstractString, IO}) @@ -80,10 +82,11 @@ Throw a [`ParserError`](@ref) upon failure. See also [`TOML.tryparse`](@ref). """ +parse(p::Parser) = Internals.parse(p._p) parse(str::AbstractString) = - Internals.parse(Parser(String(str))) + Internals.parse(Internals.Parser{Dates}(String(str))) parse(p::Parser, str::AbstractString) = - Internals.parse(Internals.reinit!(p, String(str))) + Internals.parse(Internals.reinit!(p._p, String(str))) parse(io::IO) = parse(read(io, String)) parse(p::Parser, io::IO) = parse(p, read(io, String)) @@ -96,10 +99,11 @@ Return a [`ParserError`](@ref) upon failure. See also [`TOML.parse`](@ref). 
""" +tryparse(p::Parser) = Internals.tryparse(p._p) tryparse(str::AbstractString) = - Internals.tryparse(Parser(String(str))) + Internals.tryparse(Internals.Parser{Dates}(String(str))) tryparse(p::Parser, str::AbstractString) = - Internals.tryparse(Internals.reinit!(p, String(str))) + Internals.tryparse(Internals.reinit!(p._p, String(str))) tryparse(io::IO) = tryparse(read(io, String)) tryparse(p::Parser, io::IO) = tryparse(p, read(io, String)) @@ -131,4 +135,17 @@ supported type. """ const print = Internals.Printer.print +public Parser, parsefile, tryparsefile, parse, tryparse, ParserError, print + +# These methods are private Base interfaces, but we do our best to support them over +# the TOML stdlib types anyway to minimize downstream breakage. +Base.TOMLCache(p::Parser) = Base.TOMLCache(p._p, Dict{String, Base.CachedTOMLDict}()) +Base.TOMLCache(p::Parser, d::Base.CachedTOMLDict) = Base.TOMLCache(p._p, d) +Base.TOMLCache(p::Parser, d::Dict{String, Dict{String, Any}}) = Base.TOMLCache(p._p, d) + +Internals.reinit!(p::Parser, str::String; filepath::Union{Nothing, String}=nothing) = + Internals.reinit!(p._p, str; filepath) +Internals.parse(p::Parser) = Internals.parse(p._p) +Internals.tryparse(p::Parser) = Internals.tryparse(p._p) + end diff --git a/stdlib/TOML/test/values.jl b/stdlib/TOML/test/values.jl index 4fc49d47fc98d..53be1b04708b3 100644 --- a/stdlib/TOML/test/values.jl +++ b/stdlib/TOML/test/values.jl @@ -172,6 +172,6 @@ end @testset "Array" begin @test testval("[1,2,3]", Int64[1,2,3]) @test testval("[1.0, 2.0, 3.0]", Float64[1.0, 2.0, 3.0]) - @test testval("[1.0, 2.0, 3]", Union{Int64, Float64}[1.0, 2.0, Int64(3)]) + @test testval("[1.0, 2.0, 3]", Any[1.0, 2.0, Int64(3)]) @test testval("[1.0, 2, \"foo\"]", Any[1.0, Int64(2), "foo"]) end diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index b4ada2ce3a9cf..46bc2d8790cec 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -2087,7 +2087,7 @@ function detect_ambiguities(mods::Module...; while !isempty(work) mod = pop!(work) for n in names(mod, all = true) - Base.isdeprecated(mod, n) && continue + (!Base.isbindingresolved(mod, n) || Base.isdeprecated(mod, n)) && continue if !isdefined(mod, n) if is_in_mods(mod, recursive, mods) if allowed_undefineds === nothing || GlobalRef(mod, n) ∉ allowed_undefineds @@ -2158,7 +2158,7 @@ function detect_unbound_args(mods...; while !isempty(work) mod = pop!(work) for n in names(mod, all = true) - Base.isdeprecated(mod, n) && continue + (!Base.isbindingresolved(mod, n) || Base.isdeprecated(mod, n)) && continue if !isdefined(mod, n) if is_in_mods(mod, recursive, mods) if allowed_undefineds === nothing || GlobalRef(mod, n) ∉ allowed_undefineds diff --git a/test/Makefile b/test/Makefile index 1b9cb377c943d..6ebdd3c764fd5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -24,6 +24,8 @@ EMBEDDING_ARGS := "JULIA=$(JULIA_EXECUTABLE)" "BIN=$(SRCDIR)/embedding" "CC=$(CC GCEXT_ARGS := "JULIA=$(JULIA_EXECUTABLE)" "BIN=$(SRCDIR)/gcext" "CC=$(CC)" +TRIMMING_ARGS := "JULIA=$(JULIA_EXECUTABLE)" "BIN=$(JULIAHOME)/usr/bin" "CC=$(CC)" + default: $(TESTS): @@ -66,6 +68,9 @@ embedding: gcext: @$(MAKE) -C $(SRCDIR)/$@ check $(GCEXT_ARGS) +trimming: + @$(MAKE) -C $(SRCDIR)/$@ check $(TRIMMING_ARGS) + clangsa: @$(MAKE) -C $(SRCDIR)/$@ @@ -73,5 +78,6 @@ clean: @$(MAKE) -C embedding $@ $(EMBEDDING_ARGS) @$(MAKE) -C gcext $@ $(GCEXT_ARGS) @$(MAKE) -C llvmpasses $@ + @$(MAKE) -C trimming $@ $(TRIMMING_ARGS) -.PHONY: $(TESTS) $(addprefix revise-, $(TESTS)) relocatedepot revise-relocatedepot 
embedding gcext clangsa clean +.PHONY: $(TESTS) $(addprefix revise-, $(TESTS)) relocatedepot revise-relocatedepot embedding gcext trimming clangsa clean diff --git a/test/ambiguous.jl b/test/ambiguous.jl index acdfdc70ba30c..2f8a4193cf592 100644 --- a/test/ambiguous.jl +++ b/test/ambiguous.jl @@ -162,6 +162,22 @@ end ambs = detect_ambiguities(Ambig48312) @test length(ambs) == 4 +module UnboundAmbig55868 + module B + struct C end + export C + Base.@deprecate_binding D C + end + using .B + export C, D +end +@test !Base.isbindingresolved(UnboundAmbig55868, :C) +@test !Base.isbindingresolved(UnboundAmbig55868, :D) +@test isempty(detect_unbound_args(UnboundAmbig55868)) +@test isempty(detect_ambiguities(UnboundAmbig55868)) +@test !Base.isbindingresolved(UnboundAmbig55868, :C) +@test !Base.isbindingresolved(UnboundAmbig55868, :D) + # Test that Core and Base are free of ambiguities # not using isempty so this prints more information when it fails @testset "detect_ambiguities" begin diff --git a/test/channels.jl b/test/channels.jl index d62c0b581775c..eed7a7ecc0566 100644 --- a/test/channels.jl +++ b/test/channels.jl @@ -12,6 +12,9 @@ using Base: n_avail end @test wait(a) == "success" @test fetch(t) == "finished" + + # Test printing + @test repr(a) == "Condition()" end @testset "wait first behavior of wait on Condition" begin diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index c6720e23739d8..cc3f8950f0dc0 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -787,6 +787,17 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` # tested in test/parallel.jl) @test errors_not_signals(`$exename --worker=true`) + # --trace-compile + let + io = IOBuffer() + v = writereadpipeline( + "foo(x) = begin Base.Experimental.@force_compile; x; end; foo(1)", + `$exename --trace-compile=stderr -i`, + stderr=io) + _stderr = String(take!(io)) + @test occursin("precompile(Tuple{typeof(Main.foo), Int", _stderr) + end + # --trace-compile-timing let io = IOBuffer() @@ -798,6 +809,17 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` @test occursin(" ms =# precompile(Tuple{typeof(Main.foo), Int", _stderr) end + # --trace-dispatch + let + io = IOBuffer() + v = writereadpipeline( + "foo(x) = begin Base.Experimental.@force_compile; x; end; foo(1)", + `$exename --trace-dispatch=stderr -i`, + stderr=io) + _stderr = String(take!(io)) + @test occursin("precompile(Tuple{typeof(Main.foo), Int", _stderr) + end + # test passing arguments mktempdir() do dir testfile, io = mktemp(dir) diff --git a/test/compiler/AbstractInterpreter.jl b/test/compiler/AbstractInterpreter.jl index d95354cefa80c..009128b289ade 100644 --- a/test/compiler/AbstractInterpreter.jl +++ b/test/compiler/AbstractInterpreter.jl @@ -409,15 +409,19 @@ end CC.nsplit_impl(info::NoinlineCallInfo) = CC.nsplit(info.info) CC.getsplit_impl(info::NoinlineCallInfo, idx::Int) = CC.getsplit(info.info, idx) CC.getresult_impl(info::NoinlineCallInfo, idx::Int) = CC.getresult(info.info, idx) +CC.add_uncovered_edges_impl(edges::Vector{Any}, info::NoinlineCallInfo, @nospecialize(atype)) = CC.add_uncovered_edges!(edges, info.info, atype) function CC.abstract_call(interp::NoinlineInterpreter, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) ret = @invoke CC.abstract_call(interp::CC.AbstractInterpreter, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) - if sv.mod in noinline_modules(interp) - return CC.CallMeta(ret.rt, ret.exct, ret.effects, NoinlineCallInfo(ret.info)) + return 
CC.Future{CC.CallMeta}(ret, interp, sv) do ret, interp, sv + if sv.mod in noinline_modules(interp) + (;rt, exct, effects, info) = ret + return CC.CallMeta(rt, exct, effects, NoinlineCallInfo(info)) + end + return ret end - return ret end function CC.src_inlining_policy(interp::NoinlineInterpreter, @nospecialize(src), @nospecialize(info::CallInfo), stmt_flag::UInt32) @@ -431,6 +435,8 @@ end @inline function inlined_usually(x, y, z) return x * y + z end +foo_split(x::Float64) = 1 +foo_split(x::Int) = 2 # check if the inlining algorithm works as expected let src = code_typed1((Float64,Float64,Float64)) do x, y, z @@ -444,6 +450,7 @@ let NoinlineModule = Module() main_func(x, y, z) = inlined_usually(x, y, z) @eval NoinlineModule noinline_func(x, y, z) = $inlined_usually(x, y, z) @eval OtherModule other_func(x, y, z) = $inlined_usually(x, y, z) + @eval NoinlineModule bar_split_error() = $foo_split(Core.compilerbarrier(:type, nothing)) interp = NoinlineInterpreter(Set((NoinlineModule,))) @@ -473,12 +480,12 @@ let NoinlineModule = Module() @test count(isinvoke(:inlined_usually), src.code) == 0 @test count(iscall((src, inlined_usually)), src.code) == 0 end -end -# Make sure that Core.Compiler has enough NamedTuple infrastructure -# to properly give error messages for basic kwargs... -Core.eval(Core.Compiler, quote f(;a=1) = a end) -@test_throws MethodError Core.Compiler.f(;b=2) + let src = code_typed1(NoinlineModule.bar_split_error) + @test count(iscall((src, foo_split)), src.code) == 0 + @test count(iscall((src, Core.throw_methoderror)), src.code) > 0 + end +end # custom inferred data # ==================== diff --git a/test/compiler/EscapeAnalysis/EAUtils.jl b/test/compiler/EscapeAnalysis/EAUtils.jl index 188ec93ebc5be..b8ad4589db626 100644 --- a/test/compiler/EscapeAnalysis/EAUtils.jl +++ b/test/compiler/EscapeAnalysis/EAUtils.jl @@ -116,12 +116,14 @@ CC.get_inference_world(interp::EscapeAnalyzer) = interp.world CC.get_inference_cache(interp::EscapeAnalyzer) = interp.inf_cache CC.cache_owner(::EscapeAnalyzer) = EAToken() -function CC.ipo_dataflow_analysis!(interp::EscapeAnalyzer, ir::IRCode, caller::InferenceResult) +function CC.ipo_dataflow_analysis!(interp::EscapeAnalyzer, opt::OptimizationState, + ir::IRCode, caller::InferenceResult) # run EA on all frames that have been optimized - nargs = let def = caller.linfo.def; isa(def, Method) ? 
Int(def.nargs) : 0; end + nargs = Int(opt.src.nargs) + 𝕃ₒ = CC.optimizer_lattice(interp) get_escape_cache = GetEscapeCache(interp) estate = try - analyze_escapes(ir, nargs, CC.optimizer_lattice(interp), get_escape_cache) + analyze_escapes(ir, nargs, 𝕃ₒ, get_escape_cache) catch err @error "error happened within EA, inspect `Main.failed_escapeanalysis`" Main.failed_escapeanalysis = FailedAnalysis(ir, nargs, get_escape_cache) @@ -133,7 +135,8 @@ function CC.ipo_dataflow_analysis!(interp::EscapeAnalyzer, ir::IRCode, caller::I end record_escapes!(interp, caller, estate, ir) - @invoke CC.ipo_dataflow_analysis!(interp::AbstractInterpreter, ir::IRCode, caller::InferenceResult) + @invoke CC.ipo_dataflow_analysis!(interp::AbstractInterpreter, opt::OptimizationState, + ir::IRCode, caller::InferenceResult) end function record_escapes!(interp::EscapeAnalyzer, diff --git a/test/compiler/EscapeAnalysis/EscapeAnalysis.jl b/test/compiler/EscapeAnalysis/EscapeAnalysis.jl index 31c21f7228014..99bd86228f50a 100644 --- a/test/compiler/EscapeAnalysis/EscapeAnalysis.jl +++ b/test/compiler/EscapeAnalysis/EscapeAnalysis.jl @@ -2240,13 +2240,13 @@ end # accounts for ThrownEscape via potential MethodError # no method error -@noinline identity_if_string(x::SafeRef) = (println("preventing inlining"); nothing) +@noinline identity_if_string(x::SafeRef{<:AbstractString}) = (println("preventing inlining"); nothing) let result = code_escapes((SafeRef{String},)) do x identity_if_string(x) end @test has_no_escape(ignore_argescape(result.state[Argument(2)])) end -let result = code_escapes((Union{SafeRef{String},Nothing},)) do x +let result = code_escapes((SafeRef,)) do x identity_if_string(x) end i = only(findall(iscall((result.ir, identity_if_string)), result.ir.stmts.stmt)) @@ -2299,4 +2299,21 @@ let result = code_escapes((SafeRef{String},Any)) do x, y @test has_all_escape(result.state[Argument(3)]) # y end +@eval function scope_folding() + $(Expr(:tryfinally, + Expr(:block, + Expr(:tryfinally, :(), :(), 2), + :(return Core.current_scope())), + :(), 1)) +end +@eval function scope_folding_opt() + $(Expr(:tryfinally, + Expr(:block, + Expr(:tryfinally, :(), :(), :(Base.inferencebarrier(2))), + :(return Core.current_scope())), + :(), :(Base.inferencebarrier(1)))) +end +@test (@code_escapes scope_folding()) isa EAUtils.EscapeResult +@test (@code_escapes scope_folding_opt()) isa EAUtils.EscapeResult + end # module test_EA diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl index 0260113044a3b..26ae965b35319 100644 --- a/test/compiler/codegen.jl +++ b/test/compiler/codegen.jl @@ -501,10 +501,9 @@ function f37262(x) end end @testset "#37262" begin - str = "store volatile { i8, {}*, {}*, {}*, {}* } zeroinitializer, { i8, {}*, {}*, {}*, {}* }* %phic" - str_opaque = "store volatile { i8, ptr, ptr, ptr, ptr } zeroinitializer, ptr %phic" + str_opaque = "getelementptr inbounds i8, ptr %.roots.phic, i32 8\n store volatile ptr null" llvmstr = get_llvm(f37262, (Bool,), false, false, false) - @test (contains(llvmstr, str) || contains(llvmstr, str_opaque)) || llvmstr + @test contains(llvmstr, str_opaque) @test f37262(Base.inferencebarrier(true)) === nothing end @@ -999,3 +998,8 @@ for (T, StructName) in ((Int128, :Issue55558), (UInt128, :UIssue55558)) @test sizeof($(StructName)) == 48 broken=broken_i128 end end + +@noinline Base.@nospecializeinfer f55768(@nospecialize z::UnionAll) = z === Vector +@test f55768(Vector) +@test f55768(Vector{T} where T) +@test !f55768(Vector{S} where S) diff --git a/test/compiler/effects.jl 
b/test/compiler/effects.jl index 11c30aad0b9a4..8bc5f27e31766 100644 --- a/test/compiler/effects.jl +++ b/test/compiler/effects.jl @@ -1141,6 +1141,14 @@ end @test_broken Core.Compiler.is_effect_free(Base.infer_effects(set_arr_with_unused_arg_2, (Vector{Int},))) @test_broken Core.Compiler.is_effect_free_if_inaccessiblememonly(Base.infer_effects(set_arg_arr!, (Vector{Int},))) +# EA-based refinement of :effect_free +function f_EA_refine(ax, b) + bx = Ref{Any}() + @noinline bx[] = b + return ax[] + b +end +@test Core.Compiler.is_effect_free(Base.infer_effects(f_EA_refine, (Base.RefValue{Int},Int))) + function issue51837(; openquotechar::Char, newlinechar::Char) ncodeunits(openquotechar) == 1 || throw(ArgumentError("`openquotechar` must be a single-byte character")) if !isnothing(newlinechar) diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl index 485ee579abd52..7c7726413004a 100644 --- a/test/compiler/inference.jl +++ b/test/compiler/inference.jl @@ -3887,113 +3887,6 @@ f_apply_cglobal(args...) = cglobal(args...) f37532(T, x) = (Core.bitcast(Ptr{T}, x); x) @test Base.return_types(f37532, Tuple{Any, Int}) == Any[Int] -# PR #37749 -# Helper functions for Core.Compiler.Timings. These are normally accessed via a package - -# usually (SnoopCompileCore). -function time_inference(f) - Core.Compiler.Timings.reset_timings() - Core.Compiler.__set_measure_typeinf(true) - f() - Core.Compiler.__set_measure_typeinf(false) - Core.Compiler.Timings.close_current_timer() - return Core.Compiler.Timings._timings[1] -end -function depth(t::Core.Compiler.Timings.Timing) - maximum(depth.(t.children), init=0) + 1 -end -function flatten_times(t::Core.Compiler.Timings.Timing) - collect(Iterators.flatten([(t.time => t.mi_info,), flatten_times.(t.children)...])) -end -# Some very limited testing of timing the type inference (#37749). -@testset "Core.Compiler.Timings" begin - # Functions that call each other - @eval module M1 - i(x) = x+5 - i2(x) = x+2 - h(a::Array) = i2(a[1]::Integer) + i(a[1]::Integer) + 2 - g(y::Integer, x) = h(Any[y]) + Int(x) - end - timing1 = time_inference() do - @eval M1.g(2, 3.0) - end - @test occursin(r"Core.Compiler.Timings.Timing\(InferenceFrameInfo for Core.Compiler.Timings.ROOT\(\)\) with \d+ children", sprint(show, timing1)) - # The last two functions to be inferred should be `i` and `i2`, inferred at runtime with - # their concrete types. 
- @test sort([mi_info.mi.def.name for (time,mi_info) in flatten_times(timing1)[end-1:end]]) == [:i, :i2] - @test all(child->isa(child.bt, Vector), timing1.children) - @test all(child->child.bt===nothing, timing1.children[1].children) - # Test the stacktrace - @test isa(stacktrace(timing1.children[1].bt), Vector{Base.StackTraces.StackFrame}) - # Test that inference has cached some of the Method Instances - timing2 = time_inference() do - @eval M1.g(2, 3.0) - end - @test length(flatten_times(timing2)) < length(flatten_times(timing1)) - # Printing of InferenceFrameInfo for mi.def isa Module - @eval module M2 - i(x) = x+5 - i2(x) = x+2 - h(a::Array) = i2(a[1]::Integer) + i(a[1]::Integer) + 2 - g(y::Integer, x) = h(Any[y]) + Int(x) - end - # BEGIN LINE NUMBER SENSITIVITY (adjust the line offset below as needed) - timingmod = time_inference() do - @eval @testset "Outer" begin - @testset "Inner" begin - for i = 1:2 M2.g(2, 3.0) end - end - end - end - @test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 6)", string(timingmod.children)) - # END LINE NUMBER SENSITIVITY - - # Recursive function - @eval module _Recursive f(n::Integer) = n == 0 ? 0 : f(n-1) + 1 end - timing = time_inference() do - @eval _Recursive.f(Base.inferencebarrier(5)) - end - @test 2 <= depth(timing) <= 3 # root -> f (-> +) - @test 2 <= length(flatten_times(timing)) <= 3 # root, f, + - - # Functions inferred with multiple constants - @eval module C - i(x) = x === 0 ? 0 : 1 / x - a(x) = i(0) * i(x) - b() = i(0) * i(1) * i(0) - function loopc(n) - s = 0 - for i = 1:n - s += i - end - return s - end - call_loopc() = loopc(5) - myfloor(::Type{T}, x) where T = floor(T, x) - d(x) = myfloor(Int16, x) - end - timing = time_inference() do - @eval C.a(2) - @eval C.b() - @eval C.call_loopc() - @eval C.d(3.2) - end - ft = flatten_times(timing) - @test !isempty(ft) - str = sprint(show, ft) - @test occursin("InferenceFrameInfo for /(1::$Int, ::$Int)", str) # inference constants - @test occursin("InferenceFrameInfo for Core.Compiler.Timings.ROOT()", str) # qualified - # loopc has internal slots, check constant printing in this case - sel = filter(ti -> ti.second.mi.def.name === :loopc, ft) - ifi = sel[end].second - @test length(ifi.slottypes) > ifi.nargs - str = sprint(show, sel) - @test occursin("InferenceFrameInfo for $(@__MODULE__).C.loopc(5::$Int)", str) - # check that types aren't double-printed as `T::Type{T}` - sel = filter(ti -> ti.second.mi.def.name === :myfloor, ft) - str = sprint(show, sel) - @test occursin("InferenceFrameInfo for $(@__MODULE__).C.myfloor(::Type{Int16}, ::Float64)", str) -end - # issue #37638 @test only(Base.return_types(() -> (nothing, Any[]...)[2])) isa Type @@ -6076,3 +5969,82 @@ end fcondvarargs(a, b, c, d) = isa(d, Int64) gcondvarargs(a, x...) = return fcondvarargs(a, x...) ? isa(a, Int64) : !isa(a, Int64) @test Core.Compiler.return_type(gcondvarargs, Tuple{Vararg{Any}}) === Bool + +# JuliaLang/julia#55627: argtypes check in `abstract_call_opaque_closure` +issue55627_make_oc() = Base.Experimental.@opaque (x::Int) -> 2x +@test Base.infer_return_type() do + f = issue55627_make_oc() + return f(1), f() +end == Union{} +@test Base.infer_return_type((Vector{Int},)) do xs + f = issue55627_make_oc() + return f(1), f(xs...) 
+end == Tuple{Int,Int} +@test Base.infer_exception_type() do + f = issue55627_make_oc() + return f(1), f() +end >: MethodError +@test Base.infer_exception_type() do + f = issue55627_make_oc() + return f(1), f('1') +end >: TypeError + +# `exct` modeling for opaque closure +oc_exct_1() = Base.Experimental.@opaque (x) -> x < 0 ? throw(x) : x +@test Base.infer_exception_type((Int,)) do x + oc_exct_1()(x) +end == Int +oc_exct_2() = Base.Experimental.@opaque Tuple{Number}->Number (x) -> '1' +@test Base.infer_exception_type((Int,)) do x + oc_exct_2()(x) +end == TypeError + +# nothrow modeling for `invoke` calls +f_invoke_nothrow(::Number) = :number +f_invoke_nothrow(::Int) = :int +@test Base.infer_effects((Int,)) do x + @invoke f_invoke_nothrow(x::Number) +end |> Core.Compiler.is_nothrow +@test Base.infer_effects((Char,)) do x + @invoke f_invoke_nothrow(x::Number) +end |> !Core.Compiler.is_nothrow +@test Base.infer_effects((Union{Nothing,Int},)) do x + @invoke f_invoke_nothrow(x::Number) +end |> !Core.Compiler.is_nothrow + +# `exct` modeling for `invoke` calls +f_invoke_exct(x::Number) = x < 0 ? throw(x) : x +f_invoke_exct(x::Int) = x +@test Base.infer_exception_type((Int,)) do x + @invoke f_invoke_exct(x::Number) +end == Int +@test Base.infer_exception_type() do + @invoke f_invoke_exct(42::Number) +end == Union{} +@test Base.infer_exception_type((Union{Nothing,Int},)) do x + @invoke f_invoke_exct(x::Number) +end == Union{Int,TypeError} +@test Base.infer_exception_type((Int,)) do x + invoke(f_invoke_exct, Number, x) +end == TypeError +@test Base.infer_exception_type((Char,)) do x + invoke(f_invoke_exct, Tuple{Number}, x) +end == TypeError + +@test Base.infer_exception_type((Vector{Any},)) do args + Core.throw_methoderror(args...) +end == Union{MethodError,ArgumentError} + +# Issue https://github.com/JuliaLang/julia/issues/55751 + +abstract type AbstractGrid55751{T, N} <: AbstractArray{T, N} end +struct Grid55751{T, N, AT} <: AbstractGrid55751{T, N} + axes::AT +end + +t155751 = Union{AbstractArray{UInt8, 4}, Array{Float32, 4}, Grid55751{Float32, 3, _A} where _A} +t255751 = Array{Float32, 3} +@test Core.Compiler.tmerge_types_slow(t155751,t255751) == AbstractArray # shouldn't hang + +issue55882_nfields(x::Union{T,Nothing}) where T<:Number = nfields(x) +@test Base.infer_return_type(issue55882_nfields) <: Int diff --git a/test/compiler/inline.jl b/test/compiler/inline.jl index a8b5fd66dcd0d..2de6d9950d4e4 100644 --- a/test/compiler/inline.jl +++ b/test/compiler/inline.jl @@ -876,7 +876,7 @@ let src = code_typed1((Any,)) do x abstract_unionsplit_fallback(x) end @test count(isinvoke(:abstract_unionsplit_fallback), src.code) == 2 - @test count(iscall((src, abstract_unionsplit_fallback)), src.code) == 1 # fallback dispatch + @test count(iscall((src, Core.throw_methoderror)), src.code) == 1 # fallback method error end let src = code_typed1((Union{Type,Number},)) do x abstract_unionsplit_fallback(x) @@ -912,7 +912,7 @@ let src = code_typed1((Any,)) do x @test count(iscall((src, typeof)), src.code) == 2 @test count(isinvoke(:println), src.code) == 0 @test count(iscall((src, println)), src.code) == 0 - @test count(iscall((src, abstract_unionsplit_fallback)), src.code) == 1 # fallback dispatch + @test count(iscall((src, Core.throw_methoderror)), src.code) == 1 # fallback method error end let src = code_typed1((Union{Type,Number},)) do x abstract_unionsplit_fallback(false, x) @@ -960,8 +960,8 @@ let # aggressive inlining of single, abstract method match end |> only |> first # both callsites should be inlined 
@test count(isinvoke(:has_free_typevars), src.code) == 2 - # `isGoodType(y::Any)` isn't fully covered, thus a runtime type check and fallback dynamic dispatch should be inserted - @test count(iscall((src,isGoodType)), src.code) == 1 + # `isGoodType(y::Any)` isn't fully covered, so the fallback is a method error + @test count(iscall((src, Core.throw_methoderror)), src.code) == 1 # fallback method error end @inline isGoodType2(cnd, @nospecialize x::Type) = @@ -973,8 +973,8 @@ let # aggressive inlining of single, abstract method match (with constant-prop'e # both callsite should be inlined with constant-prop'ed result @test count(isinvoke(:isType), src.code) == 2 @test count(isinvoke(:has_free_typevars), src.code) == 0 - # `isGoodType(y::Any)` isn't fully covered, thus a runtime type check and fallback dynamic dispatch should be inserted - @test count(iscall((src,isGoodType2)), src.code) == 1 + # `isGoodType(y::Any)` isn't fully covered, thus a MethodError gets inserted + @test count(iscall((src, Core.throw_methoderror)), src.code) == 1 # fallback method error end @noinline function checkBadType!(@nospecialize x::Type) @@ -989,8 +989,8 @@ let # aggressive static dispatch of single, abstract method match end |> only |> first # both callsites should be resolved statically @test count(isinvoke(:checkBadType!), src.code) == 2 - # `checkBadType!(y::Any)` isn't fully covered, thus a runtime type check and fallback dynamic dispatch should be inserted - @test count(iscall((src,checkBadType!)), src.code) == 1 + # `checkBadType!(y::Any)` isn't fully covered, thus a MethodError gets inserted + @test count(iscall((src, Core.throw_methoderror)), src.code) == 1 # fallback method error end @testset "late_inline_special_case!" begin @@ -1570,7 +1570,6 @@ let @test get_finalization_count() == 1000 end - function cfg_finalization7(io) for i = -999:1000 o = DoAllocWithField(0) @@ -1597,6 +1596,31 @@ let @test get_finalization_count() == 1000 end +# Load forwarding with `finalizer` elision +let src = code_typed1((Int,)) do x + xs = finalizer(Ref(x)) do obj + @noinline + Base.@assume_effects :nothrow :notaskstate + Core.println("finalizing: ", obj[]) + end + Base.@assume_effects :nothrow @noinline println("xs[] = ", @inline xs[]) + return xs[] + end + @test count(iscall((src, getfield)), src.code) == 0 +end +let src = code_typed1((Int,)) do x + xs = finalizer(Ref(x)) do obj + @noinline + Base.@assume_effects :nothrow :notaskstate + Core.println("finalizing: ", obj[]) + end + Base.@assume_effects :nothrow @noinline println("xs[] = ", @inline xs[]) + xs[] += 1 + return xs[] + end + @test count(iscall((src, getfield)), src.code) == 0 + @test count(iscall((src, setfield!)), src.code) == 1 +end # optimize `[push!|pushfirst!](::Vector{Any}, x...)` @testset "optimize `$f(::Vector{Any}, x...)`" for f = Any[push!, pushfirst!] 
@@ -2004,7 +2028,7 @@ f48397(::Tuple{String,String}) = :ok let src = code_typed1((Union{Bool,Tuple{String,Any}},)) do x f48397(x) end - @test any(iscall((src, f48397)), src.code) + @test any(iscall((src, Core.throw_methoderror)), src.code) # fallback method error) end g48397::Union{Bool,Tuple{String,Any}} = ("48397", 48397) let res = @test_throws MethodError let @@ -2175,11 +2199,6 @@ let src = code_typed1() do @test count(isinvoke(:iterate), src.code) == 0 end -# JuliaLang/julia#53062: proper `joint_effects` for call with empty method matches -let ir = first(only(Base.code_ircode(setproperty!, (Base.RefValue{Int},Symbol,Base.RefValue{Int})))) - i = findfirst(iscall((ir, convert)), ir.stmts.stmt)::Int - @test iszero(ir.stmts.flag[i] & Core.Compiler.IR_FLAG_NOTHROW) -end function issue53062(cond) x = Ref{Int}(0) if cond @@ -2214,3 +2233,19 @@ let ir = Base.code_ircode((Issue52644,); optimize_until="Inlining") do t @test irfunc(Issue52644(Tuple{})) === :DataType @test_throws MethodError irfunc(Issue52644(Tuple{<:Integer})) end + +foo_split(x::Float64) = 1 +foo_split(x::Int) = 2 +bar_inline_error() = foo_split(nothing) +bar_split_error() = foo_split(Core.compilerbarrier(:type,nothing)) + +let src = code_typed1(bar_inline_error, Tuple{}) + # Should inline method errors + @test count(iscall((src, foo_split)), src.code) == 0 + @test count(iscall((src, Core.throw_methoderror)), src.code) > 0 +end +let src = code_typed1(bar_split_error, Tuple{}) + # Should inline method errors + @test count(iscall((src, foo_split)), src.code) == 0 + @test count(iscall((src, Core.throw_methoderror)), src.code) > 0 +end diff --git a/test/compiler/irpasses.jl b/test/compiler/irpasses.jl index 281317ac25bf8..740ac5f4958e4 100644 --- a/test/compiler/irpasses.jl +++ b/test/compiler/irpasses.jl @@ -576,7 +576,6 @@ let # lifting `isa` through Core.ifelse @test count(iscall((src, isa)), src.code) == 0 end - let # lifting `isdefined` through PhiNode src = code_typed1((Bool,Some{Int},)) do c, x y = c ? 
x : nothing @@ -1035,8 +1034,7 @@ exc39508 = ErrorException("expected") end @test test39508() === exc39508 -let - # `typeassert` elimination after SROA +let # `typeassert` elimination after SROA # NOTE we can remove this optimization once inference is able to reason about memory-effects src = @eval Module() begin mutable struct Foo; x; end @@ -1051,8 +1049,7 @@ let @test count(iscall((src, typeassert)), src.code) == 0 end -let - # Test for https://github.com/JuliaLang/julia/issues/43402 +let # Test for https://github.com/JuliaLang/julia/issues/43402 # Ensure that structs required not used outside of the ccall, # still get listed in the ccall_preserves diff --git a/test/copy.jl b/test/copy.jl index d2f555604c4d8..559bf5d3e757a 100644 --- a/test/copy.jl +++ b/test/copy.jl @@ -282,6 +282,8 @@ end @testset "`deepcopy` a `GenericCondition`" begin a = Base.GenericCondition(ReentrantLock()) + # Test printing + @test repr(a) == "Base.GenericCondition(ReentrantLock())" @test !islocked(a.lock) lock(a.lock) @test islocked(a.lock) diff --git a/test/core.jl b/test/core.jl index 74df09bcdfd91..b27832209a835 100644 --- a/test/core.jl +++ b/test/core.jl @@ -34,7 +34,7 @@ for (T, c) in ( (Core.CodeInfo, []), (Core.CodeInstance, [:next, :min_world, :max_world, :inferred, :debuginfo, :ipo_purity_bits, :invoke, :specptr, :specsigflags, :precompile]), (Core.Method, [:primary_world, :deleted_world]), - (Core.MethodInstance, [:cache, :precompiled]), + (Core.MethodInstance, [:cache, :flags]), (Core.MethodTable, [:defs, :leafcache, :cache, :max_args]), (Core.TypeMapEntry, [:next, :min_world, :max_world]), (Core.TypeMapLevel, [:arg1, :targ, :name1, :tname, :list, :any]), @@ -1183,7 +1183,7 @@ end # Make sure that `Module` is not resolved to `Core.Module` during sysimg generation # so that users can define their own binding named `Module` in Main. -@test !Base.isbindingresolved(Main, :Module) +@test success(`$(Base.julia_cmd()) -e '@assert !Base.isbindingresolved(Main, :Module)'`) # Module() constructor @test names(Module(:anonymous), all = true, imported = true) == [:anonymous] @@ -5611,6 +5611,26 @@ end x::Array{T} where T<:Integer end +# issue #54757, type redefinitions with recursive reference in supertype +struct T54757{A>:Int,N} <: AbstractArray{Tuple{X,Tuple{Vararg},Union{T54757{Union{X,Integer}},T54757{A,N}},Vararg{Y,N}} where {X,Y<:T54757}, N} + x::A + y::Union{A,T54757{A,N}} + z::T54757{A} +end + +struct T54757{A>:Int,N} <: AbstractArray{Tuple{X,Tuple{Vararg},Union{T54757{Union{X,Integer}},T54757{A,N}},Vararg{Y,N}} where {X,Y<:T54757}, N} + x::A + y::Union{A,T54757{A,N}} + z::T54757{A} +end + +@test_throws ErrorException struct T54757{A>:Int,N} <: AbstractArray{Tuple{X,Tuple{Vararg},Union{T54757{Union{X,Integer}},T54757{A}},Vararg{Y,N}} where {X,Y<:T54757}, N} + x::A + y::Union{A,T54757{A,N}} + z::T54757{A} +end + + let a = Vector{Core.TypeofBottom}(undef, 2) @test a[1] == Union{} @test a == [Union{}, Union{}] @@ -8268,3 +8288,19 @@ end @test_broken (Tuple{Vararg{T}} where T) === Union{Tuple{T, T, Vararg{T}} where T, Tuple{}, Tuple{T} where T} @test sizeof(Pair{Union{typeof(Union{}),Nothing}, Union{Type{Union{}},Nothing}}(Union{}, Union{})) == 2 + +# Make sure that Core.Compiler has enough NamedTuple infrastructure +# to properly give error messages for basic kwargs... 
+Core.eval(Core.Compiler, quote issue50174(;a=1) = a end) +@test_throws MethodError Core.Compiler.issue50174(;b=2) + +let s = mktemp() do path, io + xxx = 42 + redirect_stdout(io) do + Base.@assume_effects :nothrow @show xxx + end + flush(io) + read(path, String) + end + @test strip(s) == "xxx = 42" +end diff --git a/test/dict.jl b/test/dict.jl index 13c60d5a6a053..909afb3607907 100644 --- a/test/dict.jl +++ b/test/dict.jl @@ -1049,7 +1049,7 @@ Dict(1 => rand(2,3), 'c' => "asdf") # just make sure this does not trigger a dep # issue #26939 d26939 = WeakKeyDict() - (@noinline d -> d[big"1.0" + 1.1] = 1)(d26939) + (@noinline d -> d[big"1" + 1] = 1)(d26939) GC.gc() # primarily to make sure this doesn't segfault @test count(d26939) == 0 @test length(d26939.ht) == 1 diff --git a/test/docs.jl b/test/docs.jl index f62f7f8b63b2c..92d45fe05e397 100644 --- a/test/docs.jl +++ b/test/docs.jl @@ -4,6 +4,7 @@ import Base.Docs: meta, @var, DocStr, parsedoc # check that @doc can work before REPL is loaded @test !startswith(read(`$(Base.julia_cmd()) -E '@doc sin'`, String), "nothing") +@test !startswith(read(`$(Base.julia_cmd()) -E '@doc @time'`, String), "nothing") using Markdown using REPL @@ -574,8 +575,8 @@ end let T = meta(DocVars)[@var(DocVars.T)], S = meta(DocVars)[@var(DocVars.S)], - Tname = Markdown.parse("```\n$(curmod_prefix)DocVars.T\n```"), - Sname = Markdown.parse("```\n$(curmod_prefix)DocVars.S\n```") + Tname = Markdown.parse("```julia\n$(curmod_prefix)DocVars.T\n```"), + Sname = Markdown.parse("```julia\n$(curmod_prefix)DocVars.S\n```") # Splicing the expression directly doesn't work @test docstrings_equal(T.docs[Union{}], doc""" diff --git a/test/errorshow.jl b/test/errorshow.jl index 80352ddeaa9cf..db22fea1131d1 100644 --- a/test/errorshow.jl +++ b/test/errorshow.jl @@ -10,7 +10,8 @@ Base.Experimental.register_error_hint(Base.noncallable_number_hint_handler, Meth Base.Experimental.register_error_hint(Base.string_concatenation_hint_handler, MethodError) Base.Experimental.register_error_hint(Base.methods_on_iterable, MethodError) Base.Experimental.register_error_hint(Base.nonsetable_type_hint_handler, MethodError) -Base.Experimental.register_error_hint(Base.fielderror_hint_handler, FieldError) +Base.Experimental.register_error_hint(Base.fielderror_listfields_hint_handler, FieldError) +Base.Experimental.register_error_hint(Base.fielderror_dict_hint_handler, FieldError) @testset "SystemError" begin err = try; systemerror("reason", Cint(0)); false; catch ex; ex; end::SystemError @@ -738,8 +739,7 @@ end pop!(Base.Experimental._hint_handlers[DomainError]) # order is undefined, don't copy this struct ANumber <: Number end -let err_str - err_str = @except_str ANumber()(3 + 4) MethodError +let err_str = @except_str ANumber()(3 + 4) MethodError @test occursin("objects of type $(curmod_prefix)ANumber are not callable", err_str) @test count(==("Maybe you forgot to use an operator such as *, ^, %, / etc. ?"), split(err_str, '\n')) == 1 # issue 40478 @@ -747,22 +747,25 @@ let err_str @test count(==("Maybe you forgot to use an operator such as *, ^, %, / etc. ?"), split(err_str, '\n')) == 1 end -let err_str - a = [1 2; 3 4]; +let a = [1 2; 3 4]; err_str = @except_str (a[1][2] = 5) MethodError @test occursin("\nAre you trying to index into an array? 
For multi-dimensional arrays, separate the indices with commas: ", err_str) @test occursin("a[1, 2]", err_str) @test occursin("rather than a[1][2]", err_str) end -let err_str - d = Dict +let d = Dict err_str = @except_str (d[1] = 5) MethodError @test occursin("\nYou attempted to index the type Dict, rather than an instance of the type. Make sure you create the type using its constructor: ", err_str) @test occursin("d = Dict([...])", err_str) @test occursin(" rather than d = Dict", err_str) end +let s = Some("foo") + err_str = @except_str (s[] = "bar") MethodError + @test !occursin("You attempted to index the type String", err_str) +end + # Execute backtrace once before checking formatting, see #38858 backtrace() @@ -808,12 +811,13 @@ end @test_throws ArgumentError("invalid index: \"foo\" of type String") [1]["foo"] @test_throws ArgumentError("invalid index: nothing of type Nothing") [1][nothing] -# issue #53618 -@testset "FieldErrorHint" begin +# issue #53618, pr #55165 +@testset "FieldErrorHints" begin struct FieldFoo a::Float32 b::Int end + Base.propertynames(foo::FieldFoo) = (:a, :x, :y) s = FieldFoo(1, 2) @@ -823,7 +827,9 @@ end # Check error message first errorMsg = sprint(Base.showerror, ex) - @test occursin("FieldError: type FieldFoo has no field c", errorMsg) + @test occursin("FieldError: type FieldFoo has no field `c`", errorMsg) + @test occursin("available fields: `a`, `b`", errorMsg) + @test occursin("Available properties: `x`, `y`", errorMsg) d = Dict(s => 1) @@ -840,7 +846,7 @@ end ex = test.value::FieldError errorMsg = sprint(Base.showerror, ex) - @test occursin("FieldError: type Dict has no field c", errorMsg) + @test occursin("FieldError: type Dict has no field `c`", errorMsg) # Check hint message hintExpected = "Did you mean to access dict values using key: `:c` ? 
Consider using indexing syntax dict[:c]\n" @test occursin(hintExpected, errorMsg) @@ -1079,6 +1085,12 @@ let err_str @test occursin("String concatenation is performed with *", err_str) end +# https://github.com/JuliaLang/julia/issues/55745 +let err_str + err_str = @except_str +() MethodError + @test !occursin("String concatenation is performed with *", err_str) +end + struct MissingLength; end struct MissingSize; end Base.IteratorSize(::Type{MissingSize}) = Base.HasShape{2}() diff --git a/test/file.jl b/test/file.jl index 4531cd8e66998..a4262c4eaaa21 100644 --- a/test/file.jl +++ b/test/file.jl @@ -823,6 +823,303 @@ mktempdir() do tmpdir rm(b_tmpdir) end +@testset "rename" begin + # some of the windows specific behavior may be fixed in new versions of julia + mktempdir() do dir + # see if can make symlinks + local can_symlink = try + symlink("foo", joinpath(dir, "link")) + rm(joinpath(dir, "link")) + true + catch + false + end + local f1 = joinpath(dir, "file1") + local f2 = joinpath(dir, "file2") + local d1 = joinpath(dir, "dir1") + local d2 = joinpath(dir, "dir2") + local subd1f1 = joinpath(d1, "file1") + local subd1f2 = joinpath(d1, "file2") + local subd2f1 = joinpath(d2, "file1") + local subd2f2 = joinpath(d2, "file2") + local h1 = joinpath(dir, "hlink1") + local h2 = joinpath(dir, "hlink2") + local s1 = joinpath(dir, "slink1") + local s2 = joinpath(dir, "slink2") + @testset "renaming to non existing newpath in same directory" begin + # file, make sure isexecutable is copied + for mode in (0o644, 0o755) + write(f1, b"data") + chmod(f1, mode) + Base.rename(f1, f2) + @test !isfile(f1) + @test isfile(f2) + @test read(f2) == b"data" + if mode == 0o644 + @test !isexecutable(f2) + else + @test isexecutable(f2) + end + rm(f2) + end + # empty directory + mkdir(d1) + Base.rename(d1, d2) + @test !isdir(d1) + @test isdir(d2) + @test isempty(readdir(d2)) + rm(d2) + # non empty directory + mkdir(d1) + write(subd1f1, b"data") + chmod(subd1f1, 0o644) + write(subd1f2, b"exe") + chmod(subd1f2, 0o755) + Base.rename(d1, d2) + @test !isdir(d1) + @test isdir(d2) + @test read(subd2f1) == b"data" + @test read(subd2f2) == b"exe" + @test !isexecutable(subd2f1) + @test isexecutable(subd2f2) + rm(d2; recursive=true) + # hardlink + write(f1, b"data") + hardlink(f1, h1) + Base.rename(h1, h2) + @test isfile(f1) + @test !isfile(h1) + @test isfile(h2) + @test read(h2) == b"data" + write(h2, b"data2") + @test read(f1) == b"data2" + rm(h2) + rm(f1) + # symlink + if can_symlink + symlink("foo", s1) + Base.rename(s1, s2) + @test !islink(s1) + @test islink(s2) + @test readlink(s2) == "foo" + rm(s2) + end + end + @test isempty(readdir(dir)) # make sure everything got cleaned up + + # Get the error code from failed rename, or nothing if it worked + function rename_errorcodes(oldpath, newpath) + try + Base.rename(oldpath, newpath) + nothing + catch e + e.code + end + end + @testset "errors" begin + # invalid paths + @test_throws ArgumentError Base.rename(f1*"\0", "") + @test Base.UV_ENOENT == rename_errorcodes("", "") + write(f1, b"data") + @test Base.UV_ENOENT == rename_errorcodes(f1, "") + @test read(f1) == b"data" + @test Base.UV_ENOENT == rename_errorcodes("", f1) + @test read(f1) == b"data" + @test Base.UV_ENOENT == rename_errorcodes(f2, f1) + @test read(f1) == b"data" + @test Base.UV_ENOENT == rename_errorcodes(f1, subd1f1) + @test read(f1) == b"data" + rm(f1) + # attempt to make a directory a subdirectory of itself + mkdir(d1) + if Sys.iswindows() + @test rename_errorcodes(d1, joinpath(d1, "subdir")) ∈ 
(Base.UV_EINVAL, Base.UV_EBUSY) + else + @test Base.UV_EINVAL == rename_errorcodes(d1, joinpath(d1, "subdir")) + end + rm(d1) + # rename to child of a file + mkdir(d1) + write(f2, "foo") + if Sys.iswindows() + @test Base.UV_EINVAL == rename_errorcodes(d1, joinpath(f2, "subdir")) + else + @test Base.UV_ENOTDIR == rename_errorcodes(d1, joinpath(f2, "subdir")) + end + # replace a file with a directory + if !Sys.iswindows() + @test Base.UV_ENOTDIR == rename_errorcodes(d1, f2) + else + # this should work on windows + Base.rename(d1, f2) + @test isdir(f2) + @test !ispath(d1) + end + rm(f2; force=true) + rm(d1; force=true) + # symlink loop + if can_symlink + symlink(s1, s2) + symlink(s2, s1) + @test Base.UV_ELOOP == rename_errorcodes(joinpath(s1, "foo"), f2) + write(f2, b"data") + @test Base.UV_ELOOP == rename_errorcodes(f2, joinpath(s1, "foo")) + rm(s1) + rm(s2) + rm(f2) + end + # newpath is a nonempty directory + mkdir(d1) + mkdir(d2) + write(subd2f1, b"data") + write(f1, b"otherdata") + if Sys.iswindows() + @test Base.UV_EACCES == rename_errorcodes(f1, d1) + @test Base.UV_EACCES == rename_errorcodes(f1, d2) + @test Base.UV_EACCES == rename_errorcodes(d1, d2) + @test Base.UV_EACCES == rename_errorcodes(subd2f1, d2) + else + @test Base.UV_EISDIR == rename_errorcodes(f1, d1) + @test Base.UV_EISDIR == rename_errorcodes(f1, d2) + @test rename_errorcodes(d1, d2) ∈ (Base.UV_ENOTEMPTY, Base.UV_EEXIST) + @test rename_errorcodes(subd2f1, d2) ∈ (Base.UV_ENOTEMPTY, Base.UV_EEXIST, Base.UV_EISDIR) + end + rm(f1) + rm(d1) + rm(d2; recursive=true) + end + @test isempty(readdir(dir)) # make sure everything got cleaned up + + @testset "replacing existing file" begin + write(f2, b"olddata") + chmod(f2, 0o755) + write(f1, b"newdata") + chmod(f1, 0o644) + @test isexecutable(f2) + @test !isexecutable(f1) + Base.rename(f1, f2) + @test !ispath(f1) + @test read(f2) == b"newdata" + @test !isexecutable(f2) + rm(f2) + end + + @testset "replacing file with itself" begin + write(f1, b"data") + Base.rename(f1, f1) + @test read(f1) == b"data" + hardlink(f1, h1) + Base.rename(f1, h1) + if Sys.iswindows() + # On Windows f1 gets deleted + @test !ispath(f1) + else + @test read(f1) == b"data" + end + @test read(h1) == b"data" + rm(h1) + rm(f1; force=true) + end + + @testset "replacing existing file in different directories" begin + mkdir(d1) + mkdir(d2) + write(subd2f2, b"olddata") + chmod(subd2f2, 0o755) + write(subd1f1, b"newdata") + chmod(subd1f1, 0o644) + @test isexecutable(subd2f2) + @test !isexecutable(subd1f1) + Base.rename(subd1f1, subd2f2) + @test !ispath(subd1f1) + @test read(subd2f2) == b"newdata" + @test !isexecutable(subd2f2) + @test isdir(d1) + @test isdir(d2) + rm(d1; recursive=true) + rm(d2; recursive=true) + end + + @testset "rename with open files" begin + # both open + write(f2, b"olddata") + write(f1, b"newdata") + open(f1) do handle1 + open(f2) do handle2 + if Sys.iswindows() + # currently this doesn't work on windows + @test Base.UV_EBUSY == rename_errorcodes(f1, f2) + else + Base.rename(f1, f2) + @test !ispath(f1) + @test read(f2) == b"newdata" + end + # rename doesn't break already opened files + @test read(handle1) == b"newdata" + @test read(handle2) == b"olddata" + end + end + rm(f1; force=true) + rm(f2; force=true) + + # oldpath open + write(f2, b"olddata") + write(f1, b"newdata") + open(f1) do handle1 + if Sys.iswindows() + # currently this doesn't work on windows + @test Base.UV_EBUSY == rename_errorcodes(f1, f2) + else + Base.rename(f1, f2) + @test !ispath(f1) + @test read(f2) == b"newdata" + end + 
# rename doesn't break already opened files + @test read(handle1) == b"newdata" + end + rm(f1; force=true) + rm(f2; force=true) + + # newpath open + write(f2, b"olddata") + write(f1, b"newdata") + open(f2) do handle2 + if Sys.iswindows() + # currently this doesn't work on windows + @test Base.UV_EACCES == rename_errorcodes(f1, f2) + else + Base.rename(f1, f2) + @test !ispath(f1) + @test read(f2) == b"newdata" + end + # rename doesn't break already opened files + @test read(handle2) == b"olddata" + end + rm(f1; force=true) + rm(f2; force=true) + end + + @testset "replacing empty directory with directory" begin + mkdir(d1) + mkdir(d2) + write(subd1f1, b"data") + if Sys.iswindows() + # currently this doesn't work on windows + @test Base.UV_EACCES == rename_errorcodes(d1, d2) + rm(d1; recursive=true) + rm(d2) + else + Base.rename(d1, d2) + @test isdir(d2) + @test read(subd2f1) == b"data" + @test !ispath(d1) + rm(d2; recursive=true) + end + end + @test isempty(readdir(dir)) # make sure everything got cleaned up + end +end + # issue #10506 #10434 ## Tests for directories and links to directories if !Sys.iswindows() || Sys.windows_version() >= Sys.WINDOWS_VISTA_VER @@ -1472,7 +1769,7 @@ rm(dir) ################## -# Return values of mkpath, mkdir, cp, mv and touch +# Return values of mkpath, mkdir, cp, mv, rename and touch #################### mktempdir() do dir name1 = joinpath(dir, "apples") @@ -1489,6 +1786,9 @@ mktempdir() do dir @test cp(name2, name1) == name1 @test isfile(name1) @test isfile(name2) + @test Base.rename(name1, name2) == name2 + @test !ispath(name1) + @test isfile(name2) namedir = joinpath(dir, "chalk") namepath = joinpath(dir, "chalk", "cheese", "fresh") @test !ispath(namedir) @@ -1608,6 +1908,26 @@ end end end +@testset "pwd tests" begin + mktempdir() do dir + cd(dir) do + withenv("OLDPWD" => nothing) do + io = IOBuffer() + Base.repl_cmd(@cmd("cd"), io) + Base.repl_cmd(@cmd("cd -"), io) + @test realpath(pwd()) == realpath(dir) + if !Sys.iswindows() + # Delete the working directory and check we can cd out of it + # Cannot delete the working directory on Windows + rm(dir) + @test_throws Base._UVError("pwd()", Base.UV_ENOENT) pwd() + Base.repl_cmd(@cmd("cd \\~"), io) + end + end + end + end +end + @testset "readdir tests" begin ≛(a, b) = sort(a) == sort(b) mktempdir() do dir @@ -1808,6 +2128,16 @@ Base.joinpath(x::URI50890) = URI50890(x.f) @test !isnothing(Base.Filesystem.getusername(s.uid)) @test !isnothing(Base.Filesystem.getgroupname(s.gid)) end + s = Base.Filesystem.StatStruct() + stat_show_str = sprint(show, s) + stat_show_str_multi = sprint(show, MIME("text/plain"), s) + @test startswith(stat_show_str, "StatStruct(\"\" ENOENT: ") && endswith(stat_show_str, ")") + @test startswith(stat_show_str_multi, "StatStruct for \"\"\n ENOENT: ") && !endswith(stat_show_str_multi, r"\s") + s = Base.Filesystem.StatStruct("my/test", Ptr{UInt8}(0), Int32(Base.UV_ENOTDIR)) + stat_show_str = sprint(show, s) + stat_show_str_multi = sprint(show, MIME("text/plain"), s) + @test startswith(stat_show_str, "StatStruct(\"my/test\" ENOTDIR: ") && endswith(stat_show_str, ")") + @test startswith(stat_show_str_multi, "StatStruct for \"my/test\"\n ENOTDIR: ") && !endswith(stat_show_str_multi, r"\s") end @testset "diskstat() works" begin diff --git a/test/floatfuncs.jl b/test/floatfuncs.jl index f33ec75b58322..d5d697634bcfa 100644 --- a/test/floatfuncs.jl +++ b/test/floatfuncs.jl @@ -257,6 +257,35 @@ end end end +@testset "isapprox and unsigned integers" begin + for T in Base.BitUnsigned_types + # Test 
also combinations of different integer types + W = widen(T) + # The order of the operands for difference between unsigned integers is + # very important, test both combinations. + @test isapprox(T(42), T(42); rtol=T(0), atol=0.5) + @test isapprox(T(42), W(42); rtol=T(0), atol=0.5) + @test !isapprox(T(0), T(1); rtol=T(0), atol=0.5) + @test !isapprox(T(1), T(0); rtol=T(0), atol=0.5) + @test isapprox(T(1), T(3); atol=T(2)) + @test isapprox(T(4), T(2); atol=T(2)) + @test isapprox(T(1), W(3); atol=T(2)) + @test isapprox(T(4), W(2); atol=T(2)) + @test isapprox(T(5), T(7); atol=typemax(T)) + @test isapprox(T(8), T(6); atol=typemax(T)) + @test isapprox(T(1), T(2); rtol=1) + @test isapprox(T(6), T(3); rtol=1) + @test isapprox(T(1), W(2); rtol=1) + @test isapprox(T(6), W(3); rtol=1) + @test !isapprox(typemin(T), typemax(T)) + @test !isapprox(typemax(T), typemin(T)) + @test !isapprox(typemin(T), typemax(T); atol=typemax(T)-T(1)) + @test !isapprox(typemax(T), typemin(T); atol=typemax(T)-T(1)) + @test isapprox(typemin(T), typemax(T); atol=typemax(T)) + @test isapprox(typemax(T), typemin(T); atol=typemax(T)) + end +end + @testset "Conversion from floating point to unsigned integer near extremes (#51063)" begin @test_throws InexactError UInt32(4.2949673f9) @test_throws InexactError UInt64(1.8446744f19) diff --git a/test/gc.jl b/test/gc.jl index f924f4952cfb0..c532f17f04eb5 100644 --- a/test/gc.jl +++ b/test/gc.jl @@ -49,6 +49,13 @@ function issue_54275_test() @test !live_bytes_has_grown_too_much end +function full_sweep_reasons_test() + GC.gc() + reasons = Base.full_sweep_reasons() + @test reasons[:FULL_SWEEP_REASON_FORCED_FULL_SWEEP] >= 1 + @test keys(reasons) == Set(Base.FULL_SWEEP_REASONS) +end + # !!! note: # Since we run our tests on 32bit OS as well we confine ourselves # to parameters that allocate about 512MB of objects. 
Max RSS is lower @@ -72,3 +79,21 @@ end @testset "Base.GC docstrings" begin @test isempty(Docs.undocumented_names(GC)) end + +@testset "Full GC reasons" begin + full_sweep_reasons_test() +end + +#testset doesn't work here because this needs to run in top level +#Check that we ensure objects in toplevel exprs are rooted +global dims54422 = [] # allocate the Binding +GC.gc(); GC.gc(); # force the binding to be old +GC.enable(false); # prevent new objects from being old +@eval begin + Base.Experimental.@force_compile # use the compiler + dims54422 = $([]) + nothing +end +GC.enable(true); GC.gc(false) # incremental collection +@test typeof(dims54422) == Vector{Any} +@test isempty(dims54422) diff --git a/test/intfuncs.jl b/test/intfuncs.jl index deb1dd10681e8..6f1bde69dddfe 100644 --- a/test/intfuncs.jl +++ b/test/intfuncs.jl @@ -616,3 +616,20 @@ end @test Base.infer_effects(gcdx, (Int,Int)) |> Core.Compiler.is_foldable @test Base.infer_effects(invmod, (Int,Int)) |> Core.Compiler.is_foldable @test Base.infer_effects(binomial, (Int,Int)) |> Core.Compiler.is_foldable + +@testset "literal power" begin + @testset for T in Base.uniontypes(Base.HWReal) + ns = (T(0), T(1), T(5)) + if T <: AbstractFloat + ns = (ns..., T(3.14), T(-2.71)) + end + for n in ns + @test n ^ 0 === T(1) + @test n ^ 1 === n + @test n ^ 2 === n * n + @test n ^ 3 === n * n * n + @test n ^ -1 ≈ inv(n) + @test n ^ -2 ≈ inv(n) * inv(n) + end + end +end diff --git a/test/llvmpasses/alloc-opt-gcframe.ll b/test/llvmpasses/alloc-opt-gcframe.ll index e8644899f0914..f53a4d5c01df7 100644 --- a/test/llvmpasses/alloc-opt-gcframe.ll +++ b/test/llvmpasses/alloc-opt-gcframe.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; CHECK-NOT: @julia.gc_alloc_obj ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %gcstack, i64 -12 -; OPAQUE: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 +; OPAQUE: [[ptls_field:%.*]] = getelementptr inbounds i8, ptr %current_task, ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 ; OPAQUE-NEXT: %v = call noalias nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_small_alloc(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 diff --git a/test/llvmpasses/late-lower-gc-addrspaces.ll b/test/llvmpasses/late-lower-gc-addrspaces.ll index 702e44b2b0e28..9c041664a9682 100644 --- a/test/llvmpasses/late-lower-gc-addrspaces.ll +++ b/test/llvmpasses/late-lower-gc-addrspaces.ll @@ -1,6 +1,6 @@ ; This file is a part of Julia. 
License is MIT: https://julialang.org/license -; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='function(LateLowerGCFrame)' -S %s | FileCheck %s --check-prefixes=CHECK,OPAQUE +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='function(LateLowerGCFrame)' -S %s | FileCheck %s target triple = "amdgcn-amd-amdhsa" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7-ni:10:11:12:13" @@ -19,28 +19,28 @@ define void @gc_frame_lowering(i64 %a, i64 %b) { top: ; CHECK-LABEL: @gc_frame_lowering -; OPAQUE: %gcframe = call ptr @julia.new_gc_frame(i32 2) -; OPAQUE: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %gcframe = call ptr @julia.new_gc_frame(i32 2) +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() %pgcstack = call {}*** @julia.get_pgcstack() -; OPAQUE-NEXT: call void @julia.push_gc_frame(ptr %gcframe, i32 2) -; OPAQUE-NEXT: call ptr addrspace(10) @jl_box_int64 +; CHECK-NEXT: call void @julia.push_gc_frame(ptr %gcframe, i32 2) +; CHECK-NEXT: call ptr addrspace(10) @jl_box_int64 %aboxed = call {} addrspace(10)* @jl_box_int64(i64 signext %a) -; OPAQUE: [[GEP0:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0:[0-9]+]]) -; OPAQUE-NEXT: store ptr addrspace(10) %aboxed, ptr [[GEP0]] +; CHECK: [[GEP0:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0:[0-9]+]]) +; CHECK-NEXT: store ptr addrspace(10) %aboxed, ptr [[GEP0]] %bboxed = call {} addrspace(10)* @jl_box_int64(i64 signext %b) ; CHECK-NEXT: %bboxed = ; Make sure the same gc slot isn't re-used -; OPAQUE-NOT: call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0]]) -; OPAQUE: [[GEP1:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT1:[0-9]+]]) -; OPAQUE-NEXT: store ptr addrspace(10) %bboxed, ptr [[GEP1]] +; CHECK-NOT: call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0]]) +; CHECK: [[GEP1:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT1:[0-9]+]]) +; CHECK-NEXT: store ptr addrspace(10) %bboxed, ptr [[GEP1]] ; CHECK-NEXT: call void @boxed_simple call void @boxed_simple({} addrspace(10)* %aboxed, {} addrspace(10)* %bboxed) -; OPAQUE-NEXT: call void @julia.pop_gc_frame(ptr %gcframe) +; CHECK-NEXT: call void @julia.pop_gc_frame(ptr %gcframe) ret void } @@ -51,14 +51,14 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 -; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 -; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) -; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 -; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 +; CHECK: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds i8, ptr %current_task, +; CHECK-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 +; CHECK-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) +; CHECK-NEXT: 
[[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 +; CHECK-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) -; OPAQUE-NEXT: ret ptr addrspace(10) %v +; CHECK-NEXT: ret ptr addrspace(10) %v ret {} addrspace(10)* %v } @@ -74,20 +74,20 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 -; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 -; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) -; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 -; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 +; CHECK: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds i8, ptr %current_task, +; CHECK-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 +; CHECK-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) +; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 +; CHECK-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) -; OPAQUE-NEXT: %v64 = bitcast ptr addrspace(10) %v to ptr addrspace(10) +; CHECK-NEXT: %v64 = bitcast ptr addrspace(10) %v to ptr addrspace(10) %v64 = bitcast {} addrspace(10)* %v to i64 addrspace(10)* -; OPAQUE-NEXT: %loadedval = load i64, ptr addrspace(10) %v64, align 8, !range !7 +; CHECK-NEXT: %loadedval = load i64, ptr addrspace(10) %v64, align 8, !range !7 %loadedval = load i64, i64 addrspace(10)* %v64, align 8, !range !0, !invariant.load !1 -; OPAQUE-NEXT: store i64 %loadedval, ptr addrspace(10) %v64, align 8, !noalias !8 +; CHECK-NEXT: store i64 %loadedval, ptr addrspace(10) %v64, align 8, !noalias !8 store i64 %loadedval, i64 addrspace(10)* %v64, align 8, !noalias !2 -; OPAQUE-NEXT: %lv2 = load i64, ptr addrspace(10) %v64, align 8, !tbaa !11, !range !7 +; CHECK-NEXT: %lv2 = load i64, ptr addrspace(10) %v64, align 8, !tbaa !11, !range !7 %lv2 = load i64, i64 addrspace(10)* %v64, align 8, !range !0, !tbaa !4 ; CHECK-NEXT: ret void ret void diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index 093cab1358141..d294847db8f9d 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -1,6 +1,6 @@ ; This file is a part of Julia. 
License is MIT: https://julialang.org/license -; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='function(LateLowerGCFrame)' -S %s | FileCheck %s --check-prefixes=CHECK,OPAQUE +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='function(LateLowerGCFrame)' -S %s | FileCheck %s @tag = external addrspace(10) global {}, align 16 @@ -16,28 +16,28 @@ define void @gc_frame_lowering(i64 %a, i64 %b) { top: ; CHECK-LABEL: @gc_frame_lowering -; OPAQUE: %gcframe = call ptr @julia.new_gc_frame(i32 2) -; OPAQUE: %pgcstack = call ptr @julia.get_pgcstack() +; CHECK: %gcframe = call ptr @julia.new_gc_frame(i32 2) +; CHECK: %pgcstack = call ptr @julia.get_pgcstack() %pgcstack = call {}*** @julia.get_pgcstack() -; OPAQUE-NEXT: call void @julia.push_gc_frame(ptr %gcframe, i32 2) -; OPAQUE-NEXT: call ptr addrspace(10) @jl_box_int64 +; CHECK-NEXT: call void @julia.push_gc_frame(ptr %gcframe, i32 2) +; CHECK-NEXT: call ptr addrspace(10) @jl_box_int64 %aboxed = call {} addrspace(10)* @jl_box_int64(i64 signext %a) -; OPAQUE: [[GEP0:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0:[0-9]+]]) -; OPAQUE-NEXT: store ptr addrspace(10) %aboxed, ptr [[GEP0]] +; CHECK: [[GEP0:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0:[0-9]+]]) +; CHECK-NEXT: store ptr addrspace(10) %aboxed, ptr [[GEP0]] %bboxed = call {} addrspace(10)* @jl_box_int64(i64 signext %b) ; CHECK-NEXT: %bboxed = ; Make sure the same gc slot isn't re-used -; OPAQUE-NOT: call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0]]) -; OPAQUE: [[GEP1:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT1:[0-9]+]]) -; OPAQUE-NEXT: store ptr addrspace(10) %bboxed, ptr [[GEP1]] +; CHECK-NOT: call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT0]]) +; CHECK: [[GEP1:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 [[GEPSLOT1:[0-9]+]]) +; CHECK-NEXT: store ptr addrspace(10) %bboxed, ptr [[GEP1]] ; CHECK-NEXT: call void @boxed_simple call void @boxed_simple({} addrspace(10)* %aboxed, {} addrspace(10)* %bboxed) -; OPAQUE-NEXT: call void @julia.pop_gc_frame(ptr %gcframe) +; CHECK-NEXT: call void @julia.pop_gc_frame(ptr %gcframe) ret void } @@ -48,14 +48,14 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 -; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 -; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) -; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 -; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 +; CHECK: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds i8, ptr %current_task, +; CHECK-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 +; CHECK-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) +; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 +; CHECK-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 
%v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) -; OPAQUE-NEXT: ret ptr addrspace(10) %v +; CHECK-NEXT: ret ptr addrspace(10) %v ret {} addrspace(10)* %v } @@ -71,20 +71,20 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 -; OPAQUE-NEXT: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 -; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) -; OPAQUE-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 -; OPAQUE-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 +; CHECK: %current_task = getelementptr inbounds ptr, ptr %0, i64 -12 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds i8, ptr %current_task, +; CHECK-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 +; CHECK-NEXT: %v = call noalias nonnull ptr addrspace(10) @julia.gc_alloc_bytes(ptr [[ptls_load]], [[SIZE_T:i.[0-9]+]] 8, i64 {{.*}} @tag {{.*}}) +; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr inbounds ptr addrspace(10), ptr addrspace(10) %v, i64 -1 +; CHECK-NEXT: store atomic ptr addrspace(10) @tag, ptr addrspace(10) [[V_HEADROOM]] unordered, align 8, !tbaa !4 %v = call noalias {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task, i64 8, {} addrspace(10)* @tag) -; OPAQUE-NEXT: %v64 = bitcast ptr addrspace(10) %v to ptr addrspace(10) +; CHECK-NEXT: %v64 = bitcast ptr addrspace(10) %v to ptr addrspace(10) %v64 = bitcast {} addrspace(10)* %v to i64 addrspace(10)* -; OPAQUE-NEXT: %loadedval = load i64, ptr addrspace(10) %v64, align 8, !range !7 +; CHECK-NEXT: %loadedval = load i64, ptr addrspace(10) %v64, align 8, !range !7 %loadedval = load i64, i64 addrspace(10)* %v64, align 8, !range !0, !invariant.load !1 -; OPAQUE-NEXT: store i64 %loadedval, ptr addrspace(10) %v64, align 8, !noalias !8 +; CHECK-NEXT: store i64 %loadedval, ptr addrspace(10) %v64, align 8, !noalias !8 store i64 %loadedval, i64 addrspace(10)* %v64, align 8, !noalias !2 -; OPAQUE-NEXT: %lv2 = load i64, ptr addrspace(10) %v64, align 8, !tbaa !11, !range !7 +; CHECK-NEXT: %lv2 = load i64, ptr addrspace(10) %v64, align 8, !tbaa !11, !range !7 %lv2 = load i64, i64 addrspace(10)* %v64, align 8, !range !0, !tbaa !4 ; CHECK-NEXT: ret void ret void @@ -162,13 +162,13 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) { ; CHECK-LABEL: @decayar -; OPAQUE: %gcframe = call ptr @julia.new_gc_frame(i32 2) -; OPAQUE: [[gc_slot_addr_:%.*]]1 = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 1) -; OPAQUE: store ptr addrspace(10) %l0, ptr [[gc_slot_addr_:%.*]], align 8 -; OPAQUE: [[gc_slot_addr_:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 0) -; OPAQUE: store ptr addrspace(10) %l1, ptr [[gc_slot_addr_:%.*]], align 8 -; OPAQUE: %r = call i32 @callee_root(ptr addrspace(10) %l0, ptr addrspace(10) %l1) -; OPAQUE: call void @julia.pop_gc_frame(ptr %gcframe) +; CHECK: %gcframe = call ptr @julia.new_gc_frame(i32 2) +; CHECK: [[gc_slot_addr_:%.*]]1 = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 1) +; CHECK: store ptr addrspace(10) %l0, ptr [[gc_slot_addr_:%.*]], align 8 +; CHECK: [[gc_slot_addr_:%.*]] = call ptr 
@julia.get_gc_frame_slot(ptr %gcframe, i32 0) +; CHECK: store ptr addrspace(10) %l1, ptr [[gc_slot_addr_:%.*]], align 8 +; CHECK: %r = call i32 @callee_root(ptr addrspace(10) %l0, ptr addrspace(10) %l1) +; CHECK: call void @julia.pop_gc_frame(ptr %gcframe) !0 = !{i64 0, i64 23} !1 = !{!1} diff --git a/test/llvmpasses/names.jl b/test/llvmpasses/names.jl index fe692d0fab787..1ab2204044804 100644 --- a/test/llvmpasses/names.jl +++ b/test/llvmpasses/names.jl @@ -135,7 +135,8 @@ emit(f2, Float64, Float64, Float64, Float64, Float64, Float64, Float64) # CHECK: define {{(swiftcc )?}}nonnull ptr @julia_f5 # CHECK-SAME: %"a::A" -# CHECK: %"a::A.b_ptr.c_ptr.d +# CHECK: %"a::A.d +# COM: this text check relies on our LLVM code emission being relatively poor, which is not always the case emit(f5, A) # CHECK: define {{(swiftcc )?}}nonnull ptr @julia_f6 diff --git a/test/loading.jl b/test/loading.jl index 51e0c45d2faf1..1674a9f59a0c3 100644 --- a/test/loading.jl +++ b/test/loading.jl @@ -1,10 +1,10 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -original_depot_path = copy(Base.DEPOT_PATH) - using Test # Tests for @__LINE__ inside and outside of macros +# NOTE: the __LINE__ numbers for these first couple tests are significant, so +# adding any lines here will make those tests fail @test (@__LINE__) == 8 macro macro_caller_lineno() @@ -33,6 +33,9 @@ end @test @nested_LINE_expansion() == ((@__LINE__() - 4, @__LINE__() - 12), @__LINE__()) @test @nested_LINE_expansion2() == ((@__LINE__() - 5, @__LINE__() - 9), @__LINE__()) +original_depot_path = copy(Base.DEPOT_PATH) +include("precompile_utils.jl") + loaded_files = String[] push!(Base.include_callbacks, (mod::Module, fn::String) -> push!(loaded_files, fn)) include("test_sourcepath.jl") @@ -793,6 +796,17 @@ import .Foo28190.Libdl; import Libdl end end +@testset "`::AbstractString` constraint on the path argument to `include`" begin + for m ∈ (NotPkgModule, evalfile("testhelpers/just_module.jl")) + let i = m.include + @test !applicable(i, (nothing,)) + @test !applicable(i, (identity, nothing,)) + @test !hasmethod(i, Tuple{Nothing}) + @test !hasmethod(i, Tuple{Function,Nothing}) + end + end +end + @testset "`Base.project_names` and friends" begin # Some functions in Pkg assumes that these tuples have the same length n = length(Base.project_names) @@ -855,22 +869,6 @@ end end end -@testset "error message loading pkg bad module name" begin - mktempdir() do tmp - old_loadpath = copy(LOAD_PATH) - try - push!(LOAD_PATH, tmp) - write(joinpath(tmp, "BadCase.jl"), "module badcase end") - @test_logs (:warn, r"The call to compilecache failed.*") match_mode=:any begin - @test_throws ErrorException("package `BadCase` did not define the expected module `BadCase`, \ - check for typos in package module name") (@eval using BadCase) - end - finally - copy!(LOAD_PATH, old_loadpath) - end - end -end - @testset "Preferences loading" begin mktempdir() do dir this_uuid = uuid4() @@ -1034,6 +1032,16 @@ end end @testset "Extensions" begin + test_ext = """ + function test_ext(parent::Module, ext::Symbol) + _ext = Base.get_extension(parent, ext) + _ext isa Module || error("expected extension \$ext to be loaded") + _pkgdir = pkgdir(_ext) + _pkgdir == pkgdir(parent) != nothing || error("unexpected extension \$ext pkgdir path: \$_pkgdir") + _pkgversion = pkgversion(_ext) + _pkgversion == pkgversion(parent) || error("unexpected extension \$ext version: \$_pkgversion") + end + """ depot_path = mktempdir() try proj = joinpath(@__DIR__, "project", "Extensions", 
"HasDepWithExtensions.jl") @@ -1044,6 +1052,7 @@ end cmd = """ $load_distr begin + $ew $test_ext $ew push!(empty!(DEPOT_PATH), $(repr(depot_path))) using HasExtensions $ew using HasExtensions @@ -1051,6 +1060,7 @@ end $ew HasExtensions.ext_loaded && error("ext_loaded set") using HasDepWithExtensions $ew using HasDepWithExtensions + $ew test_ext(HasExtensions, :Extension) $ew Base.get_extension(HasExtensions, :Extension).extvar == 1 || error("extvar in Extension not set") $ew HasExtensions.ext_loaded || error("ext_loaded not set") $ew HasExtensions.ext_folder_loaded && error("ext_folder_loaded set") @@ -1102,13 +1112,14 @@ end test_ext_proj = """ begin + $test_ext using HasExtensions using ExtDep - Base.get_extension(HasExtensions, :Extension) isa Module || error("expected extension to load") + test_ext(HasExtensions, :Extension) using ExtDep2 - Base.get_extension(HasExtensions, :ExtensionFolder) isa Module || error("expected extension to load") + test_ext(HasExtensions, :ExtensionFolder) using ExtDep3 - Base.get_extension(HasExtensions, :ExtensionDep) isa Module || error("expected extension to load") + test_ext(HasExtensions, :ExtensionDep) end """ for compile in (`--compiled-modules=no`, ``) @@ -1147,6 +1158,19 @@ end finally copy!(LOAD_PATH, old_load_path) end + + # Extension with cycles in dependencies + code = """ + using CyclicExtensions + Base.get_extension(CyclicExtensions, :ExtA) isa Module || error("expected extension to load") + Base.get_extension(CyclicExtensions, :ExtB) isa Module || error("expected extension to load") + CyclicExtensions.greet() + """ + proj = joinpath(@__DIR__, "project", "Extensions", "CyclicExtensions") + cmd = `$(Base.julia_cmd()) --startup-file=no -e $code` + cmd = addenv(cmd, "JULIA_LOAD_PATH" => proj) + @test occursin("Hello Cycles!", String(read(cmd))) + finally try rm(depot_path, force=true, recursive=true) @@ -1255,96 +1279,6 @@ end @test success(`$(Base.julia_cmd()) --startup-file=no -e 'using Statistics'`) end -@testset "checking srcpath modules" begin - p = Base.PkgId("Dummy") - fpath, _ = mktemp() - @testset "valid" begin - write(fpath, """ - module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - baremodule Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - \"\"\" - Foo - using Foo - \"\"\" - module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - \"\"\" Foo \"\"\" - module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - \"\"\" - Foo - \"\"\" module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - @doc let x = 1 - x - end module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - # using foo - module Foo - using Bar - end - """) - @test Base.check_src_module_wrap(p, fpath) - end - @testset "invalid" begin - write(fpath, """ - # module Foo - using Bar - # end - """) - @test_throws ErrorException Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - using Bar - module Foo - end - """) - @test_throws ErrorException Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - using Bar - """) - @test_throws ErrorException Base.check_src_module_wrap(p, fpath) - - write(fpath, """ - x = 1 - """) - @test_throws ErrorException Base.check_src_module_wrap(p, fpath) - end -end - @testset "relocatable upgrades #51989" begin mktempdir() do depot # realpath is needed because 
Pkg is used for one of the precompile paths below, and Pkg calls realpath on the @@ -1584,6 +1518,7 @@ end @testset "-m" begin rot13proj = joinpath(@__DIR__, "project", "Rot13") @test readchomp(`$(Base.julia_cmd()) --startup-file=no --project=$rot13proj -m Rot13 --project nowhere ABJURER`) == "--cebwrpg abjurer NOWHERE " + @test readchomp(`$(Base.julia_cmd()) --startup-file=no --project=$rot13proj -m Rot13.Rot26 --project nowhere ABJURER`) == "--project nowhere ABJURER " end @testset "workspace loading" begin @@ -1671,3 +1606,32 @@ end copy!(LOAD_PATH, old_load_path) end end + +@testset "require_stdlib loading duplication" begin + depot_path = mktempdir() + oldBase64 = nothing + try + push!(empty!(DEPOT_PATH), depot_path) + Base64_key = Base.PkgId(Base.UUID("2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"), "Base64") + oldBase64 = Base.unreference_module(Base64_key) + cc = Base.compilecache(Base64_key) + @test Base.isprecompiled(Base64_key, cachepaths=String[cc[1]]) + empty!(DEPOT_PATH) + Base.require_stdlib(Base64_key) + push!(DEPOT_PATH, depot_path) + append!(DEPOT_PATH, original_depot_path) + oldloaded = @lock(Base.require_lock, length(get(Base.loaded_precompiles, Base64_key, Module[]))) + Base.require(Base64_key) + @test @lock(Base.require_lock, length(get(Base.loaded_precompiles, Base64_key, Module[]))) == oldloaded + Base.unreference_module(Base64_key) + empty!(DEPOT_PATH) + push!(DEPOT_PATH, depot_path) + Base.require(Base64_key) + @test @lock(Base.require_lock, length(get(Base.loaded_precompiles, Base64_key, Module[]))) == oldloaded + 1 + Base.unreference_module(Base64_key) + finally + oldBase64 === nothing || Base.register_root_module(oldBase64) + copy!(DEPOT_PATH, original_depot_path) + rm(depot_path, force=true, recursive=true) + end +end diff --git a/test/misc.jl b/test/misc.jl index 87605d685fb3e..e089395ce4557 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -159,6 +159,16 @@ let @test @lock(lockable2, lockable2[]["foo"]) == "hello" end +@testset "`show` for ReentrantLock" begin + l = ReentrantLock() + @test repr(l) == "ReentrantLock()" + @test repr("text/plain", l) == "ReentrantLock() (unlocked)" + @lock l begin + @test startswith(repr("text/plain", l), "ReentrantLock() (locked by current Task (") + end + @test repr("text/plain", l) == "ReentrantLock() (unlocked)" +end + for l in (Threads.SpinLock(), ReentrantLock()) @test get_finalizers_inhibited() == 0 @test lock(get_finalizers_inhibited, l) == 1 @@ -350,6 +360,15 @@ let foo() = 1 @test @timev foo() true end +# this is internal, but used for easy testing +@test sprint(Base.time_print, 1e9) == " 1.000000 seconds" +@test sprint(Base.time_print, 1e9, 111, 0, 222) == " 1.000000 seconds (222 allocations: 111 bytes)" +@test sprint(Base.time_print, 1e9, 111, 0.5e9, 222) == " 1.000000 seconds (222 allocations: 111 bytes, 50.00% gc time)" +@test sprint(Base.time_print, 1e9, 111, 0, 222, 333) == " 1.000000 seconds (222 allocations: 111 bytes, 333 lock conflicts)" +@test sprint(Base.time_print, 1e9, 0, 0, 0, 333) == " 1.000000 seconds (333 lock conflicts)" +@test sprint(Base.time_print, 1e9, 111, 0, 222, 333, 0.25e9) == " 1.000000 seconds (222 allocations: 111 bytes, 333 lock conflicts, 25.00% compilation time)" +@test sprint(Base.time_print, 1e9, 111, 0.5e9, 222, 333, 0.25e9, 0.175e9) == " 1.000000 seconds (222 allocations: 111 bytes, 50.00% gc time, 333 lock conflicts, 25.00% compilation time: 70% of which was recompilation)" + # @showtime @test @showtime true let foo() = true diff --git a/test/mpfr.jl b/test/mpfr.jl index 
9a9698ba72c2c..c212bdfc92821 100644 --- a/test/mpfr.jl +++ b/test/mpfr.jl @@ -1088,3 +1088,12 @@ end clear_flags() end end + +@testset "BigFloatData truncation OOB read" begin + @testset "T: $T" for T ∈ (UInt8, UInt16, UInt32, UInt64, UInt128) + v = Base.MPFR.BigFloatData{T}(fill(typemax(T), 1 + Base.MPFR.offset_p_limbs)) + @testset "bit_count: $bit_count" for bit_count ∈ (0:10:80) + @test Base.MPFR.truncated(UInt128, v, bit_count) isa Any + end + end +end diff --git a/test/numbers.jl b/test/numbers.jl index 34e775f9b2eea..fc3dc2c06bb7c 100644 --- a/test/numbers.jl +++ b/test/numbers.jl @@ -1158,6 +1158,8 @@ end end @testset "Irrationals compared with Rationals and Floats" begin + @test pi != Float64(pi) + @test Float64(pi) != pi @test Float64(pi,RoundDown) < pi @test Float64(pi,RoundUp) > pi @test !(Float64(pi,RoundDown) > pi) @@ -1176,6 +1178,7 @@ end @test nextfloat(big(pi)) > pi @test !(prevfloat(big(pi)) > pi) @test !(nextfloat(big(pi)) < pi) + @test big(typeof(pi)) == BigFloat @test 2646693125139304345//842468587426513207 < pi @test !(2646693125139304345//842468587426513207 > pi) diff --git a/test/path.jl b/test/path.jl index 2f4f2d0983a58..4c2c7034577d5 100644 --- a/test/path.jl +++ b/test/path.jl @@ -311,6 +311,19 @@ test_relpath() end + @testset "uripath" begin + host = if Sys.iswindows() "" else gethostname() end + sysdrive, uridrive = if Sys.iswindows() "C:\\", "C:/" else "/", "" end + @test Base.Filesystem.uripath("$(sysdrive)some$(sep)file.txt") == "file://$host/$(uridrive)some/file.txt" + @test Base.Filesystem.uripath("$(sysdrive)another$(sep)$(sep)folder$(sep)file.md") == "file://$host/$(uridrive)another/folder/file.md" + @test Base.Filesystem.uripath("$(sysdrive)some file with ^odd% chars") == "file://$host/$(uridrive)some%20file%20with%20%5Eodd%25%20chars" + @test Base.Filesystem.uripath("$(sysdrive)weird chars like @#&()[]{}") == "file://$host/$(uridrive)weird%20chars%20like%20%40%23%26%28%29%5B%5D%7B%7D" + @test Base.Filesystem.uripath("$sysdrive") == "file://$host/$uridrive" + @test Base.Filesystem.uripath(".") == Base.Filesystem.uripath(pwd()) + @test Base.Filesystem.uripath("$(sysdrive)unicode$(sep)Δεδομένα") == "file://$host/$(uridrive)unicode/%CE%94%CE%B5%CE%B4%CE%BF%CE%BC%CE%AD%CE%BD%CE%B1" + @test Base.Filesystem.uripath("$(sysdrive)unicode$(sep)🧮🐛🔨") == "file://$host/$(uridrive)unicode/%F0%9F%A7%AE%F0%9F%90%9B%F0%9F%94%A8" + end + if Sys.iswindows() @testset "issue #23646" begin @test lowercase(relpath("E:\\a\\b", "C:\\c")) == "e:\\a\\b" diff --git a/test/precompile.jl b/test/precompile.jl index bc738e557bb51..7a6e41061f9b1 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -2093,4 +2093,78 @@ precompile_test_harness("Binding Unique") do load_path @test UniqueBinding2.thebinding2 === ccall(:jl_get_module_binding, Ref{Core.Binding}, (Any, Any, Cint), UniqueBinding2, :thebinding, true) end +precompile_test_harness("Detecting importing outside of a package module") do load_path + io = IOBuffer() + write(joinpath(load_path, "ImportBeforeMod.jl"), + """ + import Printf + module ImportBeforeMod + end #module + """) + @test_throws r"Failed to precompile ImportBeforeMod" Base.compilecache(Base.identify_package("ImportBeforeMod"), io, io) + @test occursin( + "`using/import Printf` outside of a Module detected. 
Importing a package outside of a module is not allowed during package precompilation.", + String(take!(io))) + + + write(joinpath(load_path, "HarmlessComments.jl"), + """ + # import Printf + #= + import Printf + =# + module HarmlessComments + end #module + # import Printf + #= + import Printf + =# + """) + Base.compilecache(Base.identify_package("HarmlessComments")) + + + write(joinpath(load_path, "ImportAfterMod.jl"), """ + module ImportAfterMod + end #module + import Printf + """) + @test_throws r"Failed to precompile ImportAfterMod" Base.compilecache(Base.identify_package("ImportAfterMod"), io, io) + @test occursin( + "`using/import Printf` outside of a Module detected. Importing a package outside of a module is not allowed during package precompilation.", + String(take!(io))) +end + +precompile_test_harness("No package module") do load_path + io = IOBuffer() + write(joinpath(load_path, "NoModule.jl"), + """ + 1 + """) + @test_throws r"Failed to precompile NoModule" Base.compilecache(Base.identify_package("NoModule"), io, io) + @test occursin( + "NoModule [top-level] did not define the expected module `NoModule`, check for typos in package module name", + String(take!(io))) + + + write(joinpath(load_path, "WrongModuleName.jl"), + """ + module DifferentName + x = 1 + end #module + """) + @test_throws r"Failed to precompile WrongModuleName" Base.compilecache(Base.identify_package("WrongModuleName"), io, io) + @test occursin( + "WrongModuleName [top-level] did not define the expected module `WrongModuleName`, check for typos in package module name", + String(take!(io))) + + + write(joinpath(load_path, "NoModuleWithImport.jl"), """ + import Printf + """) + @test_throws r"Failed to precompile NoModuleWithImport" Base.compilecache(Base.identify_package("NoModuleWithImport"), io, io) + @test occursin( + "`using/import Printf` outside of a Module detected. 
Importing a package outside of a module is not allowed during package precompilation.", + String(take!(io))) +end + finish_precompile_test!() diff --git a/test/project/Extensions/CyclicExtensions/Manifest.toml b/test/project/Extensions/CyclicExtensions/Manifest.toml new file mode 100644 index 0000000000000..a506825cf7995 --- /dev/null +++ b/test/project/Extensions/CyclicExtensions/Manifest.toml @@ -0,0 +1,21 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.10.4" +manifest_format = "2.0" +project_hash = "ec25ff8df3a5e2212a173c3de2c7d716cc47cd36" + +[[deps.ExtDep]] +deps = ["SomePackage"] +path = "../ExtDep.jl" +uuid = "fa069be4-f60b-4d4c-8b95-f8008775090c" +version = "0.1.0" + +[[deps.ExtDep2]] +path = "../ExtDep2" +uuid = "55982ee5-2ad5-4c40-8cfe-5e9e1b01500d" +version = "0.1.0" + +[[deps.SomePackage]] +path = "../SomePackage" +uuid = "678608ae-7bb3-42c7-98b1-82102067a3d8" +version = "0.1.0" diff --git a/test/project/Extensions/CyclicExtensions/Project.toml b/test/project/Extensions/CyclicExtensions/Project.toml new file mode 100644 index 0000000000000..08d539dcc40ae --- /dev/null +++ b/test/project/Extensions/CyclicExtensions/Project.toml @@ -0,0 +1,13 @@ +name = "CyclicExtensions" +uuid = "17d4f0df-b55c-4714-ac4b-55fa23f7355c" +version = "0.1.0" + +[deps] +ExtDep = "fa069be4-f60b-4d4c-8b95-f8008775090c" + +[weakdeps] +SomePackage = "678608ae-7bb3-42c7-98b1-82102067a3d8" + +[extensions] +ExtA = ["SomePackage"] +ExtB = ["SomePackage"] diff --git a/test/project/Extensions/CyclicExtensions/ext/ExtA.jl b/test/project/Extensions/CyclicExtensions/ext/ExtA.jl new file mode 100644 index 0000000000000..fa0c0961633cb --- /dev/null +++ b/test/project/Extensions/CyclicExtensions/ext/ExtA.jl @@ -0,0 +1,6 @@ +module ExtA + +using CyclicExtensions +using SomePackage + +end diff --git a/test/project/Extensions/CyclicExtensions/ext/ExtB.jl b/test/project/Extensions/CyclicExtensions/ext/ExtB.jl new file mode 100644 index 0000000000000..8f6da556d39b8 --- /dev/null +++ b/test/project/Extensions/CyclicExtensions/ext/ExtB.jl @@ -0,0 +1,6 @@ +module ExtB + +using CyclicExtensions +using SomePackage + +end diff --git a/test/project/Extensions/CyclicExtensions/src/CyclicExtensions.jl b/test/project/Extensions/CyclicExtensions/src/CyclicExtensions.jl new file mode 100644 index 0000000000000..f1c2ec2077562 --- /dev/null +++ b/test/project/Extensions/CyclicExtensions/src/CyclicExtensions.jl @@ -0,0 +1,7 @@ +module CyclicExtensions + +using ExtDep + +greet() = print("Hello Cycles!") + +end # module CyclicExtensions diff --git a/test/project/Rot13/src/Rot13.jl b/test/project/Rot13/src/Rot13.jl index 1d19cbbe6df91..66f077812d878 100644 --- a/test/project/Rot13/src/Rot13.jl +++ b/test/project/Rot13/src/Rot13.jl @@ -12,4 +12,17 @@ function (@main)(args) return 0 end +module Rot26 # LOL + +import ..rot13 + +rot26(str::AbstractString) = map(rot13 ∘ rot13, str) + +function (@main)(args) + foreach(arg -> print(rot26(arg), " "), args) + return 0 +end + +end + end # module Rot13 diff --git a/test/ranges.jl b/test/ranges.jl index 16b2c6bf7b77b..86cd1c3f2345c 100644 --- a/test/ranges.jl +++ b/test/ranges.jl @@ -292,15 +292,10 @@ end rand_twiceprecision(::Type{T}) where {T<:Number} = Base.TwicePrecision{T}(rand(widen(T))) - rand_twiceprecision_is_ok(::Type{T}) where {T<:Number} = @test !iszero(rand_twiceprecision(T).lo) - # For this test the `BigFloat` mantissa needs to be just a bit # larger than the `Float64` mantissa setprecision(BigFloat, 70) do n = 10 - @testset "rand 
twiceprecision is ok" for T ∈ (Float32, Float64), i ∈ 1:n - rand_twiceprecision_is_ok(T) - end @testset "twiceprecision roundtrip is not lossy 1" for i ∈ 1:n twiceprecision_roundtrip_is_not_lossy(Float64, rand(BigFloat)) end diff --git a/test/rational.jl b/test/rational.jl index c6f81372de0b9..20a0971068876 100644 --- a/test/rational.jl +++ b/test/rational.jl @@ -801,3 +801,20 @@ end @test rationalize(Int64, nextfloat(0.1) * im; tol=0) == precise_next * im @test rationalize(0.1im; tol=eps(0.1)) == rationalize(0.1im) end + +@testset "complex numerator, denominator" begin + z = complex(3*3, 2*3*5) + @test z === numerator(z) === numerator(z // 2) === numerator(z // 5) + @test complex(3, 2*5) === numerator(z // 3) + @test isone(denominator(z)) + @test 2 === denominator(z // 2) + @test 1 === denominator(z // 3) + @test 5 === denominator(z // 5) + for den ∈ 1:10 + q = z // den + @test q === (numerator(q)//denominator(q)) + end + @testset "do not overflow silently" begin + @test_throws OverflowError numerator(Int8(1)//Int8(31) + Int8(8)im//Int8(3)) + end +end diff --git a/test/relocatedepot.jl b/test/relocatedepot.jl index 039d422c35e25..2ef6dec90dbc1 100644 --- a/test/relocatedepot.jl +++ b/test/relocatedepot.jl @@ -1,3 +1,5 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + using Test @@ -26,16 +28,38 @@ end if !test_relocated_depot - @testset "insert @depot tag in path" begin + @testset "edge cases when inserting @depot tag in path" begin + # insert @depot only once for first match test_harness() do mktempdir() do dir pushfirst!(DEPOT_PATH, dir) - path = dir*dir - @test Base.replace_depot_path(path) == "@depot"*dir + if Sys.iswindows() + # dirs start with a drive letter instead of a path separator + path = dir*Base.Filesystem.pathsep()*dir + @test Base.replace_depot_path(path) == "@depot"*Base.Filesystem.pathsep()*dir + else + path = dir*dir + @test Base.replace_depot_path(path) == "@depot"*dir + end + end + + # 55340 + empty!(DEPOT_PATH) + mktempdir() do dir + jlrc = joinpath(dir, "julia-rc2") + jl = joinpath(dir, "julia") + mkdir(jl) + push!(DEPOT_PATH, jl) + @test Base.replace_depot_path(jl) == "@depot" + @test Base.replace_depot_path(string(jl,Base.Filesystem.pathsep())) == + string("@depot",Base.Filesystem.pathsep()) + @test Base.replace_depot_path(jlrc) != "@depot-rc2" + @test Base.replace_depot_path(jlrc) == jlrc end end + # deal with and without trailing path separators test_harness() do mktempdir() do dir pushfirst!(DEPOT_PATH, dir) @@ -43,9 +67,9 @@ if !test_relocated_depot if isdirpath(DEPOT_PATH[1]) DEPOT_PATH[1] = dirname(DEPOT_PATH[1]) # strip trailing pathsep end - tag = joinpath("@depot", "") # append a pathsep + tag = string("@depot", Base.Filesystem.pathsep()) @test startswith(Base.replace_depot_path(path), tag) - DEPOT_PATH[1] = joinpath(DEPOT_PATH[1], "") # append a pathsep + DEPOT_PATH[1] = string(DEPOT_PATH[1], Base.Filesystem.pathsep()) @test startswith(Base.replace_depot_path(path), tag) popfirst!(DEPOT_PATH) @test !startswith(Base.replace_depot_path(path), tag) diff --git a/test/rounding.jl b/test/rounding.jl index 76b15ec1d9118..6fad6f62e8dfe 100644 --- a/test/rounding.jl +++ b/test/rounding.jl @@ -470,3 +470,28 @@ end @test prevfloat(f) < i end end + +@testset "π to `BigFloat` with `setrounding`" begin + function irrational_to_big_float(c::AbstractIrrational) + BigFloat(c) + end + + function irrational_to_big_float_with_rounding_mode(c::AbstractIrrational, rm::RoundingMode) + f = () -> irrational_to_big_float(c) + setrounding(f, 
BigFloat, rm) + end + + function irrational_to_big_float_with_rounding_mode_and_precision(c::AbstractIrrational, rm::RoundingMode, prec::Int) + f = () -> irrational_to_big_float_with_rounding_mode(c, rm) + setprecision(f, BigFloat, prec) + end + + for c ∈ (π, MathConstants.γ, MathConstants.catalan) + for p ∈ 1:40 + @test ( + irrational_to_big_float_with_rounding_mode_and_precision(c, RoundDown, p) < c < + irrational_to_big_float_with_rounding_mode_and_precision(c, RoundUp, p) + ) + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index c46472ac93fa8..e48e896f4069e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -438,9 +438,9 @@ cd(@__DIR__) do # o_ts.verbose = true # set to true to show all timings when successful Test.print_test_results(o_ts, 1) if !o_ts.anynonpass - println(" \033[32;1mSUCCESS\033[0m") + printstyled(" SUCCESS\n"; bold=true, color=:green) else - println(" \033[31;1mFAILURE\033[0m\n") + printstyled(" FAILURE\n\n"; bold=true, color=:red) skipped > 0 && println("$skipped test", skipped > 1 ? "s were" : " was", " skipped due to failure.") println("The global RNG seed was 0x$(string(seed, base = 16)).\n") diff --git a/test/show.jl b/test/show.jl index 63663152d9d91..d9c3585b7c1df 100644 --- a/test/show.jl +++ b/test/show.jl @@ -755,6 +755,69 @@ end @test startswith(sprint(show, typeof(x->x), context = :module=>@__MODULE__), "var\"") +# PR 53719 +module M53719 + f = x -> x + 1 + function foo(x) + function bar(y) + function baz(z) + return x + y + z + end + return baz + end + return bar + end + function foo2(x) + function bar2(y) + return z -> x + y + z + end + return bar2 + end + lambda1 = (x)->begin + function foo(y) + return x + y + end + return foo + end + lambda2 = (x)->begin + y -> x + y + end +end + +@testset "PR 53719 function names" begin + # M53719.f should be printed as var"#[0-9]+" + @test occursin(r"var\"#[0-9]+", sprint(show, M53719.f, context = :module=>M53719)) + # M53719.foo(1) should be printed as var"#bar" + @test occursin(r"var\"#bar", sprint(show, M53719.foo(1), context = :module=>M53719)) + # M53719.foo(1)(2) should be printed as var"#baz" + @test occursin(r"var\"#baz", sprint(show, M53719.foo(1)(2), context = :module=>M53719)) + # M53719.foo2(1) should be printed as var"#bar2" + @test occursin(r"var\"#bar2", sprint(show, M53719.foo2(1), context = :module=>M53719)) + # M53719.foo2(1)(2) should be printed as var"#foo2##[0-9]+" + @test occursin(r"var\"#foo2##[0-9]+", sprint(show, M53719.foo2(1)(2), context = :module=>M53719)) + # M53719.lambda1(1) should be printed as var"#foo" + @test occursin(r"var\"#foo", sprint(show, M53719.lambda1(1), context = :module=>M53719)) + # M53719.lambda2(1) should be printed as var"#[0-9]+" + @test occursin(r"var\"#[0-9]+", sprint(show, M53719.lambda2(1), context = :module=>M53719)) +end + +@testset "PR 53719 function types" begin + # typeof(M53719.f) should be printed as var"#[0-9]+#[0-9]+" + @test occursin(r"var\"#[0-9]+#[0-9]+", sprint(show, typeof(M53719.f), context = :module=>M53719)) + #typeof(M53719.foo(1)) should be printed as var"#bar#foo##[0-9]+" + @test occursin(r"var\"#bar#foo##[0-9]+", sprint(show, typeof(M53719.foo(1)), context = :module=>M53719)) + #typeof(M53719.foo(1)(2)) should be printed as var"#baz#foo##[0-9]+" + @test occursin(r"var\"#baz#foo##[0-9]+", sprint(show, typeof(M53719.foo(1)(2)), context = :module=>M53719)) + #typeof(M53719.foo2(1)) should be printed as var"#bar2#foo2##[0-9]+" + @test occursin(r"var\"#bar2#foo2##[0-9]+", sprint(show, typeof(M53719.foo2(1)), context = 
:module=>M53719)) + #typeof(M53719.foo2(1)(2)) should be printed as var"#foo2##[0-9]+#foo2##[0-9]+" + @test occursin(r"var\"#foo2##[0-9]+#foo2##[0-9]+", sprint(show, typeof(M53719.foo2(1)(2)), context = :module=>M53719)) + #typeof(M53719.lambda1(1)) should be printed as var"#foo#[0-9]+" + @test occursin(r"var\"#foo#[0-9]+", sprint(show, typeof(M53719.lambda1(1)), context = :module=>M53719)) + #typeof(M53719.lambda2(1)) should be printed as var"#[0-9]+#[0-9]+" + @test occursin(r"var\"#[0-9]+#[0-9]+", sprint(show, typeof(M53719.lambda2(1)), context = :module=>M53719)) +end + #test methodshow.jl functions @test Base.inbase(Base) @test !Base.inbase(LinearAlgebra) @@ -2705,3 +2768,8 @@ let topmi = ccall(:jl_new_method_instance_uninit, Ref{Core.MethodInstance}, ()); topmi.def = Main @test contains(repr(topmi), "Toplevel MethodInstance") end + +@testset "show() no trailing whitespace" begin + do_expr1 = :(foo() do; bar(); end) + @test !contains(sprint(show, do_expr1), " \n") +end diff --git a/test/staged.jl b/test/staged.jl index aec4a3bf135d3..1b28144639f97 100644 --- a/test/staged.jl +++ b/test/staged.jl @@ -381,11 +381,17 @@ let @test length(ir.cfg.blocks) == 1 end +function generate_lambda_ex(world::UInt, source::LineNumberNode, + argnames, spnames, @nospecialize body) + stub = Core.GeneratedFunctionStub(identity, Core.svec(argnames...), Core.svec(spnames...)) + return stub(world, source, body) +end + # Test that `Core.CachedGenerator` works as expected struct Generator54916 <: Core.CachedGenerator end function (::Generator54916)(world::UInt, source::LineNumberNode, args...) - stub = Core.GeneratedFunctionStub(identity, Core.svec(:doit54916, :func, :arg), Core.svec()) - return stub(world, source, :(func(arg))) + return generate_lambda_ex(world, source, + (:doit54916, :func, :arg), (), :(func(arg))) end @eval function doit54916(func, arg) $(Expr(:meta, :generated, Generator54916())) @@ -412,8 +418,8 @@ function generator49715(world, source, self, f, tt) sig = Tuple{f, tt.parameters...} mi = Base._which(sig; world) error("oh no") - stub = Core.GeneratedFunctionStub(identity, Core.svec(:methodinstance, :ctx, :x, :f), Core.svec()) - stub(world, source, :(nothing)) + return generate_lambda_ex(world, source, + (:doit49715, :f, :tt), (), nothing) end @eval function doit49715(f, tt) $(Expr(:meta, :generated, generator49715)) @@ -426,9 +432,10 @@ function overdubbee54341(a, b) a + b end const overdubee_codeinfo54341 = code_lowered(overdubbee54341, Tuple{Any, Any})[1] -function overdub_generator54341(world::UInt, source::LineNumberNode, args...) 
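For orientation on the staged-function changes in this hunk, a minimal sketch of how the new `generate_lambda_ex` helper is meant to be called from a generator; the generator and symbol names below are hypothetical, not taken from the patch.

# Sketch only: `world` and `source` are supplied by the runtime when the
# generated function is expanded; the helper wraps the argument names in a
# Core.GeneratedFunctionStub and returns the expansion of `body`.
function example_generator(world::UInt, source::LineNumberNode, self, argtype)
    # (:example_fn, :arg) name the generated function and its argument,
    # the empty tuple means no static parameters, and the last argument is the body.
    return generate_lambda_ex(world, source, (:example_fn, :arg), (), :(arg + 1))
end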
- if length(args) != 2 - :(error("Wrong number of arguments")) +function overdub_generator54341(world::UInt, source::LineNumberNode, selftype, fargtypes) + if length(fargtypes) != 2 + return generate_lambda_ex(world, source, + (:overdub54341, :args), (), :(error("Wrong number of arguments"))) else return copy(overdubee_codeinfo54341) end @@ -438,3 +445,7 @@ end $(Expr(:meta, :generated_only)) end @test overdub54341(1, 2) == 3 +# check if the inlining pass handles `nargs`/`isva` correctly +@test first(only(code_typed((Int,Int)) do x, y; @inline overdub54341(x, y); end)) isa Core.CodeInfo +@test first(only(code_typed((Int,)) do x; @inline overdub54341(x, 1); end)) isa Core.CodeInfo +@test_throws "Wrong number of arguments" overdub54341(1, 2, 3) diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl index 90aaadd6ede24..ee53c3d5846eb 100644 --- a/test/strings/annotated.jl +++ b/test/strings/annotated.jl @@ -5,14 +5,22 @@ @test str == Base.AnnotatedString(str.string, Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]) @test length(str) == 11 @test ncodeunits(str) == 11 + @test codeunits(str) == codeunits("some string") + @test codeunit(str) == UInt8 + @test codeunit(str, 1) == codeunit("some string", 1) + @test firstindex(str) == firstindex("some string") @test convert(Base.AnnotatedString, str) === str @test eltype(str) == Base.AnnotatedChar{eltype(str.string)} @test first(str) == Base.AnnotatedChar(first(str.string), Pair{Symbol, Any}[]) @test str[1:4] isa SubString{typeof(str)} @test str[1:4] == Base.AnnotatedString("some") + big_byte_str = Base.AnnotatedString("आख") + @test_throws StringIndexError big_byte_str[5] @test "a" * str == Base.AnnotatedString("asome string") @test str * "a" == Base.AnnotatedString("some stringa") @test str * str == Base.AnnotatedString("some stringsome string") + @test cmp(str, "some stringy thingy") == -1 + @test cmp("some stringy thingy", str) == 1 @test str[3:4] == SubString("me") @test SubString("me") == str[3:4] Base.annotate!(str, 1:4, :thing => 0x01) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index a7266f52f16fc..874607f3c1b20 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1398,9 +1398,14 @@ end str_2 = "αβγ" # string starting with a 3 byte UTF-8 character str_3 = "आख" - @testset for str in (str_1, str_2, str_3) + # string starting with a 4 byte UTF-8 character + str_4 = "𒃵𒃰" + @testset for str in (str_1, str_2, str_3, str_4) + @test transcode(String, str) === str @test transcode(String, transcode(UInt16, str)) == str @test transcode(String, transcode(UInt16, transcode(UInt8, str))) == str + @test transcode(String, transcode(Int32, transcode(UInt8, str))) == str + @test transcode(String, transcode(UInt32, transcode(UInt8, str))) == str @test transcode(String, transcode(UInt8, transcode(UInt16, str))) == str end end diff --git a/test/strings/search.jl b/test/strings/search.jl index 692286359868d..c43327fe2971b 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -4,26 +4,27 @@ astr = "Hello, world.\n" u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" -# I think these should give error on 4 also, and "" is not treated -# consistently with SubString("",1,1), nor with Char[] -for ind in (0, 5) - @test_throws BoundsError findnext(SubString("",1,1), "foo", ind) - @test_throws BoundsError findprev(SubString("",1,1), "foo", ind) -end +@testset "BoundsError for findnext/findprev" begin + # I think these should give error on 4 also, and "" is not treated + # consistently with SubString("",1,1), nor with 
Char[] + for ind in (0, 5) + @test_throws BoundsError findnext(SubString("",1,1), "foo", ind) + @test_throws BoundsError findprev(SubString("",1,1), "foo", ind) + end -# Note: the commented out test will be enabled after fixes to make -# sure that findnext/findprev are consistent -# no matter what type of AbstractString the second argument is -@test_throws BoundsError findnext(isequal('a'), "foo", 0) -@test_throws BoundsError findnext(in(Char[]), "foo", 5) -# @test_throws BoundsError findprev(in(Char[]), "foo", 0) -@test_throws BoundsError findprev(in(Char[]), "foo", 5) + # Note: the commented out test will be enabled after fixes to make + # sure that findnext/findprev are consistent + # no matter what type of AbstractString the second argument is + @test_throws BoundsError findnext(isequal('a'), "foo", 0) + @test_throws BoundsError findnext(in(Char[]), "foo", 5) + # @test_throws BoundsError findprev(in(Char[]), "foo", 0) + @test_throws BoundsError findprev(in(Char[]), "foo", 5) -# @test_throws ErrorException in("foobar","bar") -@test_throws BoundsError findnext(isequal(0x1),b"\x1\x2",0) + # @test_throws ErrorException in("foobar","bar") + @test_throws BoundsError findnext(isequal(0x1),b"\x1\x2",0) +end -# ascii forward search -for str in [astr, GenericString(astr)] +@testset "ascii forward search $(typeof(str))" for str in [astr, GenericString(astr)] @test_throws BoundsError findnext(isequal('z'), str, 0) @test_throws BoundsError findnext(isequal('∀'), str, 0) @test findfirst(isequal('x'), str) === nothing @@ -41,9 +42,7 @@ for str in [astr, GenericString(astr)] @test findnext(isequal('\n'), str, 15) === nothing @test_throws BoundsError findnext(isequal('ε'), str, nextind(str,lastindex(str))+1) @test_throws BoundsError findnext(isequal('a'), str, nextind(str,lastindex(str))+1) -end -for str in [astr, GenericString(astr)] @test_throws BoundsError findnext('z', str, 0) @test_throws BoundsError findnext('∀', str, 0) @test findfirst('x', str) === nothing @@ -65,8 +64,8 @@ for str in [astr, GenericString(astr)] @test_throws BoundsError findnext('a', str, nextind(str,lastindex(str))+1) end -# ascii backward search -for str in [astr] +@testset "ascii backward search" begin + str = astr @test findlast(isequal('x'), str) === nothing @test findlast(isequal('\0'), str) === nothing @test findlast(isequal('\u80'), str) === nothing @@ -81,9 +80,7 @@ for str in [astr] @test findlast(isequal(','), str) == 6 @test findprev(isequal(','), str, 5) === nothing @test findlast(isequal('\n'), str) == 14 -end -for str in [astr] @test findlast('x', str) === nothing @test findlast('\0', str) === nothing @test findlast('\u80', str) === nothing @@ -102,8 +99,7 @@ for str in [astr] @test findlast('\n', str) == 14 end -# utf-8 forward search -for str in (u8str, GenericString(u8str)) +@testset "utf-8 forward search $(typeof(str))" for str in (u8str, GenericString(u8str)) @test_throws BoundsError findnext(isequal('z'), str, 0) @test_throws BoundsError findnext(isequal('∀'), str, 0) @test findfirst(isequal('z'), str) === nothing @@ -132,8 +128,8 @@ for str in (u8str, GenericString(u8str)) @test_throws BoundsError findnext(isequal('a'), str, nextind(str,lastindex(str))+1) end -# utf-8 backward search -for str in [u8str] +@testset "utf-8 backward search" begin + str = u8str @test findlast(isequal('z'), str) === nothing @test findlast(isequal('\0'), str) === nothing @test findlast(isequal('\u80'), str) === nothing @@ -155,238 +151,262 @@ for str in [u8str] @test findprev(isequal('ε'), str, 4) === nothing end -# string 
forward search with a single-char string -@test findfirst("x", astr) === nothing -@test findfirst("H", astr) == 1:1 -@test findnext("H", astr, 2) === nothing -@test findfirst("l", astr) == 3:3 -@test findnext("l", astr, 4) == 4:4 -@test findnext("l", astr, 5) == 11:11 -@test findnext("l", astr, 12) === nothing -@test findfirst("\n", astr) == 14:14 -@test findnext("\n", astr, 15) === nothing - -@test findfirst("z", u8str) === nothing -@test findfirst("∄", u8str) === nothing -@test findfirst("∀", u8str) == 1:1 -@test findnext("∀", u8str, 4) === nothing -@test findfirst("∃", u8str) == 13:13 -@test findnext("∃", u8str, 16) === nothing -@test findfirst("x", u8str) == 26:26 -@test findnext("x", u8str, 27) == 43:43 -@test findnext("x", u8str, 44) === nothing -@test findfirst("ε", u8str) == 5:5 -@test findnext("ε", u8str, 7) == 54:54 -@test findnext("ε", u8str, 56) === nothing - -# strifindprev backward search with a single-char string -@test findlast("x", astr) === nothing -@test findlast("H", astr) == 1:1 -@test findprev("H", astr, 2) == 1:1 -@test findprev("H", astr, 0) === nothing -@test findlast("l", astr) == 11:11 -@test findprev("l", astr, 10) == 4:4 -@test findprev("l", astr, 4) == 4:4 -@test findprev("l", astr, 3) == 3:3 -@test findprev("l", astr, 2) === nothing -@test findlast("\n", astr) == 14:14 -@test findprev("\n", astr, 13) === nothing - -@test findlast("z", u8str) === nothing -@test findlast("∄", u8str) === nothing -@test findlast("∀", u8str) == 1:1 -@test findprev("∀", u8str, 0) === nothing -#TODO: setting the limit in the middle of a wide char -# makes findnext fail but findprev succeed. -# Should findprev fail as well? -#@test findprev("∀", u8str, 2) === nothing # gives 1:3 -@test findlast("∃", u8str) == 13:13 -@test findprev("∃", u8str, 12) === nothing -@test findlast("x", u8str) == 43:43 -@test findprev("x", u8str, 42) == 26:26 -@test findprev("x", u8str, 25) === nothing -@test findlast("ε", u8str) == 54:54 -@test findprev("ε", u8str, 53) == 5:5 -@test findprev("ε", u8str, 4) === nothing - -# string forward search with a single-char regex -@test findfirst(r"x", astr) === nothing -@test findfirst(r"H", astr) == 1:1 -@test findnext(r"H", astr, 2) === nothing -@test findfirst(r"l", astr) == 3:3 -@test findnext(r"l", astr, 4) == 4:4 -@test findnext(r"l", astr, 5) == 11:11 -@test findnext(r"l", astr, 12) === nothing -@test findfirst(r"\n", astr) == 14:14 -@test findnext(r"\n", astr, 15) === nothing -@test findfirst(r"z", u8str) === nothing -@test findfirst(r"∄", u8str) === nothing -@test findfirst(r"∀", u8str) == 1:1 -@test findnext(r"∀", u8str, 4) === nothing -@test findfirst(r"∀", u8str) == findfirst(r"\u2200", u8str) -@test findnext(r"∀", u8str, 4) == findnext(r"\u2200", u8str, 4) -@test findfirst(r"∃", u8str) == 13:13 -@test findnext(r"∃", u8str, 16) === nothing -@test findfirst(r"x", u8str) == 26:26 -@test findnext(r"x", u8str, 27) == 43:43 -@test findnext(r"x", u8str, 44) === nothing -@test findfirst(r"ε", u8str) == 5:5 -@test findnext(r"ε", u8str, 7) == 54:54 -@test findnext(r"ε", u8str, 56) === nothing -for i = 1:lastindex(astr) - @test findnext(r"."s, astr, i) == i:i +@testset "string forward search with a single-char string" begin + @test findfirst("x", astr) === nothing + @test findfirst("H", astr) == 1:1 + @test findnext("H", astr, 2) === nothing + @test findfirst("l", astr) == 3:3 + @test findnext("l", astr, 4) == 4:4 + @test findnext("l", astr, 5) == 11:11 + @test findnext("l", astr, 12) === nothing + @test findfirst("\n", astr) == 14:14 + @test findnext("\n", astr, 
15) === nothing + + @test findfirst("z", u8str) === nothing + @test findfirst("∄", u8str) === nothing + @test findfirst("∀", u8str) == 1:1 + @test findnext("∀", u8str, 4) === nothing + @test findfirst("∃", u8str) == 13:13 + @test findnext("∃", u8str, 16) === nothing + @test findfirst("x", u8str) == 26:26 + @test findnext("x", u8str, 27) == 43:43 + @test findnext("x", u8str, 44) === nothing + @test findfirst("ε", u8str) == 5:5 + @test findnext("ε", u8str, 7) == 54:54 + @test findnext("ε", u8str, 56) === nothing end -for i = 1:lastindex(u8str) - if isvalid(u8str,i) - @test findnext(r"."s, u8str, i) == i:i + +@testset "findprev backward search with a single-char string" begin + @test findlast("x", astr) === nothing + @test findlast("H", astr) == 1:1 + @test findprev("H", astr, 2) == 1:1 + @test findprev("H", astr, 0) === nothing + @test findlast("l", astr) == 11:11 + @test findprev("l", astr, 10) == 4:4 + @test findprev("l", astr, 4) == 4:4 + @test findprev("l", astr, 3) == 3:3 + @test findprev("l", astr, 2) === nothing + @test findlast("\n", astr) == 14:14 + @test findprev("\n", astr, 13) === nothing + + @test findlast("z", u8str) === nothing + @test findlast("∄", u8str) === nothing + @test findlast("∀", u8str) == 1:1 + @test findprev("∀", u8str, 0) === nothing + #TODO: setting the limit in the middle of a wide char + # makes findnext fail but findprev succeed. + # Should findprev fail as well? + #@test findprev("∀", u8str, 2) === nothing # gives 1:3 + @test findlast("∃", u8str) == 13:13 + @test findprev("∃", u8str, 12) === nothing + @test findlast("x", u8str) == 43:43 + @test findprev("x", u8str, 42) == 26:26 + @test findprev("x", u8str, 25) === nothing + @test findlast("ε", u8str) == 54:54 + @test findprev("ε", u8str, 53) == 5:5 + @test findprev("ε", u8str, 4) === nothing +end + +@testset "string forward search with a single-char regex" begin + @test findfirst(r"x", astr) === nothing + @test findfirst(r"H", astr) == 1:1 + @test findnext(r"H", astr, 2) === nothing + @test findfirst(r"l", astr) == 3:3 + @test findnext(r"l", astr, 4) == 4:4 + @test findnext(r"l", astr, 5) == 11:11 + @test findnext(r"l", astr, 12) === nothing + @test findfirst(r"\n", astr) == 14:14 + @test findnext(r"\n", astr, 15) === nothing + @test findfirst(r"z", u8str) === nothing + @test findfirst(r"∄", u8str) === nothing + @test findfirst(r"∀", u8str) == 1:1 + @test findnext(r"∀", u8str, 4) === nothing + @test findfirst(r"∀", u8str) == findfirst(r"\u2200", u8str) + @test findnext(r"∀", u8str, 4) == findnext(r"\u2200", u8str, 4) + @test findfirst(r"∃", u8str) == 13:13 + @test findnext(r"∃", u8str, 16) === nothing + @test findfirst(r"x", u8str) == 26:26 + @test findnext(r"x", u8str, 27) == 43:43 + @test findnext(r"x", u8str, 44) === nothing + @test findfirst(r"ε", u8str) == 5:5 + @test findnext(r"ε", u8str, 7) == 54:54 + @test findnext(r"ε", u8str, 56) === nothing + for i = 1:lastindex(astr) + @test findnext(r"."s, astr, i) == i:i + end + for i = 1:lastindex(u8str) + if isvalid(u8str,i) + @test findnext(r"."s, u8str, i) == i:i + end end end -# string forward search with a zero-char string -for i = 1:lastindex(astr) - @test findnext("", astr, i) == i:i-1 +@testset "string forward search with a zero-char string" begin + for i = 1:lastindex(astr) + @test findnext("", astr, i) == i:i-1 + end + for i = 1:lastindex(u8str) + @test findnext("", u8str, i) == i:i-1 + end + @test findfirst("", "") === 1:0 end -for i = 1:lastindex(u8str) - @test findnext("", u8str, i) == i:i-1 + +@testset "string backward search with a zero-char 
string" begin + for i = 1:lastindex(astr) + @test findprev("", astr, i) == i:i-1 + end + for i = 1:lastindex(u8str) + @test findprev("", u8str, i) == i:i-1 + end + @test findlast("", "") === 1:0 end -@test findfirst("", "") === 1:0 -# string backward search with a zero-char string -for i = 1:lastindex(astr) - @test findprev("", astr, i) == i:i-1 +@testset "string forward search with a zero-char regex" begin + for i = 1:lastindex(astr) + @test findnext(r"", astr, i) == i:i-1 + end + for i = 1:lastindex(u8str) + # TODO: should regex search fast-forward invalid indices? + if isvalid(u8str,i) + @test findnext(r"", u8str, i) == i:i-1 + end + end end -for i = 1:lastindex(u8str) - @test findprev("", u8str, i) == i:i-1 + +# See the comments in #54579 +@testset "Search for invalid chars" begin + @test findfirst(==('\xff'), "abc\xffde") == 4 + @test findprev(isequal('\xa6'), "abc\xa69", 5) == 4 + @test isnothing(findfirst(==('\xff'), "abcdeæd")) + + @test isnothing(findnext(==('\xa6'), "æ", 1)) + @test isnothing(findprev(==('\xa6'), "æa", 2)) end -@test findlast("", "") === 1:0 -# string forward search with a zero-char regex -for i = 1:lastindex(astr) - @test findnext(r"", astr, i) == i:i-1 +@testset "string forward search with a two-char string literal" begin + @test findfirst("xx", "foo,bar,baz") === nothing + @test findfirst("fo", "foo,bar,baz") == 1:2 + @test findnext("fo", "foo,bar,baz", 3) === nothing + @test findfirst("oo", "foo,bar,baz") == 2:3 + @test findnext("oo", "foo,bar,baz", 4) === nothing + @test findfirst("o,", "foo,bar,baz") == 3:4 + @test findnext("o,", "foo,bar,baz", 5) === nothing + @test findfirst(",b", "foo,bar,baz") == 4:5 + @test findnext(",b", "foo,bar,baz", 6) == 8:9 + @test findnext(",b", "foo,bar,baz", 10) === nothing + @test findfirst("az", "foo,bar,baz") == 10:11 + @test findnext("az", "foo,bar,baz", 12) === nothing end -for i = 1:lastindex(u8str) - # TODO: should regex search fast-forward invalid indices? 
- if isvalid(u8str,i) - @test findnext(r"", u8str, i) == i:i-1 - end + +@testset "issue #9365" begin + # string forward search with a two-char UTF-8 (2 byte) string literal + @test findfirst("éé", "ééé") == 1:3 + @test findnext("éé", "ééé", 1) == 1:3 + # string forward search with a two-char UTF-8 (3 byte) string literal + @test findfirst("€€", "€€€") == 1:4 + @test findnext("€€", "€€€", 1) == 1:4 + # string forward search with a two-char UTF-8 (4 byte) string literal + @test findfirst("\U1f596\U1f596", "\U1f596\U1f596\U1f596") == 1:5 + @test findnext("\U1f596\U1f596", "\U1f596\U1f596\U1f596", 1) == 1:5 + + # string forward search with a two-char UTF-8 (2 byte) string literal + @test findfirst("éé", "éé") == 1:3 + @test findnext("éé", "éé", 1) == 1:3 + # string forward search with a two-char UTF-8 (3 byte) string literal + @test findfirst("€€", "€€") == 1:4 + @test findnext("€€", "€€", 1) == 1:4 + # string forward search with a two-char UTF-8 (4 byte) string literal + @test findfirst("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 + @test findnext("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 + + # string backward search with a two-char UTF-8 (2 byte) string literal + @test findlast("éé", "ééé") == 3:5 + @test findprev("éé", "ééé", lastindex("ééé")) == 3:5 + # string backward search with a two-char UTF-8 (3 byte) string literal + @test findlast("€€", "€€€") == 4:7 + @test findprev("€€", "€€€", lastindex("€€€")) == 4:7 + # string backward search with a two-char UTF-8 (4 byte) string literal + @test findlast("\U1f596\U1f596", "\U1f596\U1f596\U1f596") == 5:9 + @test findprev("\U1f596\U1f596", "\U1f596\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 5:9 + + # string backward search with a two-char UTF-8 (2 byte) string literal + @test findlast("éé", "éé") == 1:3 # should really be 1:4! + @test findprev("éé", "éé", lastindex("ééé")) == 1:3 + # string backward search with a two-char UTF-8 (3 byte) string literal + @test findlast("€€", "€€") == 1:4 # should really be 1:6! + @test findprev("€€", "€€", lastindex("€€€")) == 1:4 + # string backward search with a two-char UTF-8 (4 byte) string literal + @test findlast("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8! 
+ @test findprev("\U1f596\U1f596", "\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 1:5 +end + +@testset "string backward search with a two-char string literal" begin + @test findlast("xx", "foo,bar,baz") === nothing + @test findlast("fo", "foo,bar,baz") == 1:2 + @test findprev("fo", "foo,bar,baz", 1) === nothing + @test findlast("oo", "foo,bar,baz") == 2:3 + @test findprev("oo", "foo,bar,baz", 2) === nothing + @test findlast("o,", "foo,bar,baz") == 3:4 + @test findprev("o,", "foo,bar,baz", 1) === nothing + @test findlast(",b", "foo,bar,baz") == 8:9 + @test findprev(",b", "foo,bar,baz", 6) == 4:5 + @test findprev(",b", "foo,bar,baz", 3) === nothing + @test findlast("az", "foo,bar,baz") == 10:11 + @test findprev("az", "foo,bar,baz", 10) === nothing end -# string forward search with a two-char string literal -@test findfirst("xx", "foo,bar,baz") === nothing -@test findfirst("fo", "foo,bar,baz") == 1:2 -@test findnext("fo", "foo,bar,baz", 3) === nothing -@test findfirst("oo", "foo,bar,baz") == 2:3 -@test findnext("oo", "foo,bar,baz", 4) === nothing -@test findfirst("o,", "foo,bar,baz") == 3:4 -@test findnext("o,", "foo,bar,baz", 5) === nothing -@test findfirst(",b", "foo,bar,baz") == 4:5 -@test findnext(",b", "foo,bar,baz", 6) == 8:9 -@test findnext(",b", "foo,bar,baz", 10) === nothing -@test findfirst("az", "foo,bar,baz") == 10:11 -@test findnext("az", "foo,bar,baz", 12) === nothing - -# issue #9365 -# string forward search with a two-char UTF-8 (2 byte) string literal -@test findfirst("éé", "ééé") == 1:3 -@test findnext("éé", "ééé", 1) == 1:3 -# string forward search with a two-char UTF-8 (3 byte) string literal -@test findfirst("€€", "€€€") == 1:4 -@test findnext("€€", "€€€", 1) == 1:4 -# string forward search with a two-char UTF-8 (4 byte) string literal -@test findfirst("\U1f596\U1f596", "\U1f596\U1f596\U1f596") == 1:5 -@test findnext("\U1f596\U1f596", "\U1f596\U1f596\U1f596", 1) == 1:5 - -# string forward search with a two-char UTF-8 (2 byte) string literal -@test findfirst("éé", "éé") == 1:3 -@test findnext("éé", "éé", 1) == 1:3 -# string forward search with a two-char UTF-8 (3 byte) string literal -@test findfirst("€€", "€€") == 1:4 -@test findnext("€€", "€€", 1) == 1:4 -# string forward search with a two-char UTF-8 (4 byte) string literal -@test findfirst("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 -@test findnext("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 - -# string backward search with a two-char UTF-8 (2 byte) string literal -@test findlast("éé", "ééé") == 3:5 -@test findprev("éé", "ééé", lastindex("ééé")) == 3:5 -# string backward search with a two-char UTF-8 (3 byte) string literal -@test findlast("€€", "€€€") == 4:7 -@test findprev("€€", "€€€", lastindex("€€€")) == 4:7 -# string backward search with a two-char UTF-8 (4 byte) string literal -@test findlast("\U1f596\U1f596", "\U1f596\U1f596\U1f596") == 5:9 -@test findprev("\U1f596\U1f596", "\U1f596\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 5:9 - -# string backward search with a two-char UTF-8 (2 byte) string literal -@test findlast("éé", "éé") == 1:3 # should really be 1:4! -@test findprev("éé", "éé", lastindex("ééé")) == 1:3 -# string backward search with a two-char UTF-8 (3 byte) string literal -@test findlast("€€", "€€") == 1:4 # should really be 1:6! -@test findprev("€€", "€€", lastindex("€€€")) == 1:4 -# string backward search with a two-char UTF-8 (4 byte) string literal -@test findlast("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8! 
-@test findprev("\U1f596\U1f596", "\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 1:5 - -# string backward search with a two-char string literal -@test findlast("xx", "foo,bar,baz") === nothing -@test findlast("fo", "foo,bar,baz") == 1:2 -@test findprev("fo", "foo,bar,baz", 1) === nothing -@test findlast("oo", "foo,bar,baz") == 2:3 -@test findprev("oo", "foo,bar,baz", 2) === nothing -@test findlast("o,", "foo,bar,baz") == 3:4 -@test findprev("o,", "foo,bar,baz", 1) === nothing -@test findlast(",b", "foo,bar,baz") == 8:9 -@test findprev(",b", "foo,bar,baz", 6) == 4:5 -@test findprev(",b", "foo,bar,baz", 3) === nothing -@test findlast("az", "foo,bar,baz") == 10:11 -@test findprev("az", "foo,bar,baz", 10) === nothing - -# string search with a two-char regex -@test findfirst(r"xx", "foo,bar,baz") === nothing -@test findfirst(r"fo", "foo,bar,baz") == 1:2 -@test findnext(r"fo", "foo,bar,baz", 3) === nothing -@test findfirst(r"oo", "foo,bar,baz") == 2:3 -@test findnext(r"oo", "foo,bar,baz", 4) === nothing -@test findfirst(r"o,", "foo,bar,baz") == 3:4 -@test findnext(r"o,", "foo,bar,baz", 5) === nothing -@test findfirst(r",b", "foo,bar,baz") == 4:5 -@test findnext(r",b", "foo,bar,baz", 6) == 8:9 -@test findnext(r",b", "foo,bar,baz", 10) === nothing -@test findfirst(r"az", "foo,bar,baz") == 10:11 -@test findnext(r"az", "foo,bar,baz", 12) === nothing - -# occursin with a String and Char needle -@test occursin("o", "foo") -@test occursin('o', "foo") -# occursin in curried form -@test occursin("foo")("o") -@test occursin("foo")('o') - -# contains -@test contains("foo", "o") -@test contains("foo", 'o') -# contains in curried form -@test contains("o")("foo") -@test contains('o')("foo") - -@test_throws ErrorException "ab" ∈ "abc" - -# issue #15723 -@test findfirst(isequal('('), "⨳(") == 4 -@test findnext(isequal('('), "(⨳(", 2) == 5 -@test findlast(isequal('('), "(⨳(") == 5 -@test findprev(isequal('('), "(⨳(", 2) == 1 - -@test @inferred findall(isequal('a'), "éa") == [3] -@test @inferred findall(isequal('€'), "€€") == [1, 4] -@test @inferred isempty(findall(isequal('é'), "")) - -# issue #18109 -s_18109 = "fooα🐨βcd3" -@test findlast(isequal('o'), s_18109) == 3 -@test findfirst(isequal('d'), s_18109) == 13 - -# findall (issue #31788) -@testset "findall" begin +@testset "string search with a two-char regex" begin + @test findfirst(r"xx", "foo,bar,baz") === nothing + @test findfirst(r"fo", "foo,bar,baz") == 1:2 + @test findnext(r"fo", "foo,bar,baz", 3) === nothing + @test findfirst(r"oo", "foo,bar,baz") == 2:3 + @test findnext(r"oo", "foo,bar,baz", 4) === nothing + @test findfirst(r"o,", "foo,bar,baz") == 3:4 + @test findnext(r"o,", "foo,bar,baz", 5) === nothing + @test findfirst(r",b", "foo,bar,baz") == 4:5 + @test findnext(r",b", "foo,bar,baz", 6) == 8:9 + @test findnext(r",b", "foo,bar,baz", 10) === nothing + @test findfirst(r"az", "foo,bar,baz") == 10:11 + @test findnext(r"az", "foo,bar,baz", 12) === nothing +end + +@testset "occursin/contains" begin + # occursin with a String and Char needle + @test occursin("o", "foo") + @test occursin('o', "foo") + # occursin in curried form + @test occursin("foo")("o") + @test occursin("foo")('o') + + # contains + @test contains("foo", "o") + @test contains("foo", 'o') + # contains in curried form + @test contains("o")("foo") + @test contains('o')("foo") + + @test_throws ErrorException "ab" ∈ "abc" +end + +@testset "issue #15723" begin + @test findfirst(isequal('('), "⨳(") == 4 + @test findnext(isequal('('), "(⨳(", 2) == 5 + @test findlast(isequal('('), 
"(⨳(") == 5 + @test findprev(isequal('('), "(⨳(", 2) == 1 + + @test @inferred findall(isequal('a'), "éa") == [3] + @test @inferred findall(isequal('€'), "€€") == [1, 4] + @test @inferred isempty(findall(isequal('é'), "")) +end + + +@testset "issue #18109" begin + s_18109 = "fooα🐨βcd3" + @test findlast(isequal('o'), s_18109) == 3 + @test findfirst(isequal('d'), s_18109) == 13 +end + +@testset "findall (issue #31788)" begin @test findall("fooo", "foo") == UnitRange{Int}[] @test findall("ing", "Spinning laughing dancing") == [6:8, 15:17, 23:25] @test all(findall("", "foo") .=== [1:0, 2:1, 3:2, 4:3]) # use === to compare empty ranges @@ -445,6 +465,45 @@ end @test_throws BoundsError findprev(pattern, A, -3) end end + + @test findall([0x01, 0x02], [0x03, 0x01, 0x02, 0x01, 0x02, 0x06]) == [2:3, 4:5] + @test isempty(findall([0x04, 0x05], [0x03, 0x04, 0x06])) +end + +# Issue 54578 +@testset "No conflation of Int8 and UInt8" begin + # Work for mixed types if the values are the same + @test findfirst(==(Int8(1)), [0x01]) == 1 + @test findnext(iszero, Int8[0, -2, 0, -3], 2) == 3 + @test findfirst(Int8[1,4], UInt8[0, 2, 4, 1, 8, 1, 4, 2]) == 6:7 + @test findprev(UInt8[5, 6], Int8[1, 9, 2, 5, 6, 3], 6) == 4:5 + + # Returns nothing for the same methods if the values are different, + # even if the bitpatterns are the same + @test isnothing(findfirst(==(Int8(-1)), [0xff])) + @test isnothing(findnext(isequal(0xff), Int8[-1, -2, -1], 2)) + @test isnothing(findfirst(UInt8[0xff, 0xfe], Int8[0, -1, -2, 1, 8, 1, 4, 2])) + @test isnothing(findprev(UInt8[0xff, 0xfe], Int8[1, 9, 2, -1, -2, 3], 6)) +end + +@testset "DenseArray with offsets" begin + isdefined(Main, :OffsetDenseArrays) || @eval Main include("../testhelpers/OffsetDenseArrays.jl") + OffsetDenseArrays = Main.OffsetDenseArrays + + A = OffsetDenseArrays.OffsetDenseArray(collect(0x61:0x69), 100) + @test findfirst(==(0x61), A) == 101 + @test findlast(==(0x61), A) == 101 + @test findfirst(==(0x00), A) === nothing + + @test findfirst([0x62, 0x63, 0x64], A) == 102:104 + @test findlast([0x63, 0x64], A) == 103:104 + @test findall([0x62, 0x63], A) == [102:103] + + @test findfirst(iszero, A) === nothing + A = OffsetDenseArrays.OffsetDenseArray([0x01, 0x02, 0x00, 0x03], -100) + @test findfirst(iszero, A) == -97 + @test findnext(==(0x02), A, -99) == -98 + @test findnext(==(0x02), A, -97) === nothing end # issue 32568 diff --git a/test/strings/types.jl b/test/strings/types.jl index dbcf65b1d843b..c09652c3a608d 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -2,196 +2,211 @@ ## SubString and Cstring tests ## -## SubString tests ## -u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" -u8str2 = u8str^2 -len_u8str = length(u8str) -slen_u8str = length(u8str) -len_u8str2 = length(u8str2) -slen_u8str2 = length(u8str2) - -@test len_u8str2 == 2 * len_u8str -@test slen_u8str2 == 2 * slen_u8str - -u8str2plain = String(u8str2) - -for i1 = 1:length(u8str2) - if !isvalid(u8str2, i1); continue; end - for i2 = i1:length(u8str2) - if !isvalid(u8str2, i2); continue; end - @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) - @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) - @test u8str2[i1:i2] == u8str2plain[i1:i2] +@testset "SubString" begin + u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" + u8str2 = u8str^2 + len_u8str = length(u8str) + slen_u8str = length(u8str) + len_u8str2 = length(u8str2) + slen_u8str2 = length(u8str2) + + @test len_u8str2 == 2 * len_u8str + @test slen_u8str2 == 2 * slen_u8str + + u8str2plain = String(u8str2) + 
@test !isascii(u8str2) + @test cmp(u8str2, u8str^3) == -1 + @test cmp(u8str2, u8str2) == 0 + @test cmp(u8str^3, u8str2) == 1 + @test codeunit(u8str2) == codeunit(u8str2plain) + + @test convert(Union{String, SubString{String}}, u8str2) === u8str2 + @test convert(Union{String, SubString{String}}, u8str2plain) === u8str2plain + + for i1 = 1:ncodeunits(u8str2) + if !isvalid(u8str2, i1); continue; end + for i2 = i1:ncodeunits(u8str2) + if !isvalid(u8str2, i2); continue; end + @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) + @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) + @test u8str2[i1:i2] == u8str2plain[i1:i2] + end end -end -# tests that SubString of a single multibyte `Char` string, like "∀" which takes 3 bytes -# gives the same result as `getindex` (except that it is a view not a copy) -for idx in 0:1 - @test SubString("∀", 1, idx) == "∀"[1:idx] -end + # tests that SubString of a single multibyte `Char` string, like "∀" which takes 3 bytes + # gives the same result as `getindex` (except that it is a view not a copy) + for idx in 0:1 + @test SubString("∀", 1, idx) == "∀"[1:idx] + end -# Substring provided with invalid end index throws BoundsError -@test_throws StringIndexError SubString("∀", 1, 2) -@test_throws StringIndexError SubString("∀", 1, 3) -@test_throws BoundsError SubString("∀", 1, 4) - -# Substring provided with invalid start index throws BoundsError -@test SubString("∀∀", 1:1) == "∀" -@test SubString("∀∀", 1:4) == "∀∀" -@test SubString("∀∀", 4:4) == "∀" -@test_throws StringIndexError SubString("∀∀", 1:2) -@test_throws StringIndexError SubString("∀∀", 1:5) -@test_throws StringIndexError SubString("∀∀", 2:4) -@test_throws BoundsError SubString("∀∀", 0:1) -@test_throws BoundsError SubString("∀∀", 0:4) -@test_throws BoundsError SubString("∀∀", 1:7) -@test_throws BoundsError SubString("∀∀", 4:7) - -# tests for SubString of more than one multibyte `Char` string -# we are consistent with `getindex` for `String` -for idx in [0, 1, 4] - @test SubString("∀∀", 1, idx) == "∀∀"[1:idx] - @test SubString("∀∀", 4, idx) == "∀∀"[4:idx] -end + @testset "invalid end index" begin + # Substring provided with invalid end index throws BoundsError + @test_throws StringIndexError SubString("∀", 1, 2) + @test_throws StringIndexError SubString("∀", 1, 3) + @test_throws BoundsError SubString("∀", 1, 4) + end -# index beyond lastindex("∀∀") -for idx in [2:3; 5:6] - @test_throws StringIndexError SubString("∀∀", 1, idx) -end -for idx in 7:8 - @test_throws BoundsError SubString("∀∀", 1, idx) -end + @testset "invalid start index" begin + # Substring provided with invalid start index throws BoundsError + @test SubString("∀∀", 1:1) == "∀" + @test SubString("∀∀", 1:4) == "∀∀" + @test SubString("∀∀", 4:4) == "∀" + @test_throws StringIndexError SubString("∀∀", 1:2) + @test_throws StringIndexError SubString("∀∀", 1:5) + @test_throws StringIndexError SubString("∀∀", 2:4) + @test_throws BoundsError SubString("∀∀", 0:1) + @test_throws BoundsError SubString("∀∀", 0:4) + @test_throws BoundsError SubString("∀∀", 1:7) + @test_throws BoundsError SubString("∀∀", 4:7) + end -let str="tempus fugit" #length(str)==12 - ss=SubString(str,1,lastindex(str)) #match source string - @test length(ss)==length(str) + # tests for SubString of more than one multibyte `Char` string + # we are consistent with `getindex` for `String` + for idx in [0, 1, 4] + @test SubString("∀∀", 1, idx) == "∀∀"[1:idx] + @test SubString("∀∀", 4, idx) == "∀∀"[4:idx] + end - ss=SubString(str,1:lastindex(str)) - @test length(ss)==length(str) 
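A compact recap of the two failure modes the SubString constructor tests above distinguish, using the same 3-byte character as the tests:

ncodeunits("∀")    # 3 -- the single character occupies codeunits 1 through 3
isvalid("∀", 2)    # false: 2 is inside the character, not a character boundary
# Hence SubString("∀", 1, 2) and SubString("∀", 1, 3) throw StringIndexError
# (the end index is not a character boundary), while SubString("∀", 1, 4)
# throws BoundsError (the index is past the end of the string).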
+ @testset "index beyond lastindex(\"∀∀\")" begin + for idx in [2:3; 5:6] + @test_throws StringIndexError SubString("∀∀", 1, idx) + end + for idx in 7:8 + @test_throws BoundsError SubString("∀∀", 1, idx) + end + end - ss=SubString(str,1,0) #empty SubString - @test length(ss)==0 + let str="tempus fugit" #length(str)==12 + ss=SubString(str,1,lastindex(str)) #match source string + @test length(ss)==length(str) - ss=SubString(str,1:0) - @test length(ss)==0 + ss=SubString(str,1:lastindex(str)) + @test length(ss)==length(str) - @test_throws BoundsError SubString(str, 14, 20) #start indexing beyond source string length - @test_throws BoundsError SubString(str, 10, 16) #end indexing beyond source string length + ss=SubString(str,1,0) #empty SubString + @test length(ss)==0 - @test_throws BoundsError SubString("", 1, 4) #empty source string - @test_throws BoundsError SubString("", 1, 1) #empty source string, identical start and end index - @test_throws BoundsError SubString("", 10, 12) - @test SubString("", 12, 10) == "" -end + ss=SubString(str,1:0) + @test length(ss)==0 -@test SubString("foobar", big(1), big(3)) == "foo" - -let str = "aa\u2200\u2222bb" - u = SubString(str, 3, 6) - @test length(u) == 2 - b = IOBuffer() - write(b, u) - @test String(take!(b)) == "\u2200\u2222" - - @test_throws StringIndexError SubString(str, 4, 5) - @test_throws BoundsError iterate(u, 0) - @test_throws BoundsError iterate(u, 8) - @test_throws BoundsError getindex(u, 0) - @test_throws BoundsError getindex(u, 7) - @test_throws BoundsError getindex(u, 0:1) - @test_throws BoundsError getindex(u, 7:7) - @test reverseind(u, 1) == 4 - @test typeof(Base.cconvert(Ptr{Int8}, u)) == SubString{String} - @test Base.cconvert(Ptr{Int8}, u) == u -end + @test_throws BoundsError SubString(str, 14, 20) #start indexing beyond source string length + @test_throws BoundsError SubString(str, 10, 16) #end indexing beyond source string length -let str = "føøbar" - @test_throws BoundsError SubString(str, 10, 10) - u = SubString(str, 4, 3) - @test length(u) == 0 - b = IOBuffer() - write(b, u) - @test String(take!(b)) == "" -end + @test_throws BoundsError SubString("", 1, 4) #empty source string + @test_throws BoundsError SubString("", 1, 1) #empty source string, identical start and end index + @test_throws BoundsError SubString("", 10, 12) + @test SubString("", 12, 10) == "" + end -# search and SubString (issue #5679) -let str = "Hello, world!" - u = SubString(str, 1, 5) - @test findlast("World", u) === nothing - @test findlast(isequal('z'), u) === nothing - @test findlast("ll", u) == 3:4 -end + @test SubString("foobar", big(1), big(3)) == "foo" + + let str = "aa\u2200\u2222bb" + u = SubString(str, 3, 6) + @test length(u) == 2 + b = IOBuffer() + write(b, u) + @test String(take!(b)) == "\u2200\u2222" + + @test_throws StringIndexError SubString(str, 4, 5) + @test_throws BoundsError iterate(u, 0) + @test_throws BoundsError iterate(u, 8) + @test_throws BoundsError getindex(u, 0) + @test_throws BoundsError getindex(u, 7) + @test_throws BoundsError getindex(u, 0:1) + @test_throws BoundsError getindex(u, 7:7) + @test reverseind(u, 1) == 4 + @test typeof(Base.cconvert(Ptr{Int8}, u)) == SubString{String} + @test Base.cconvert(Ptr{Int8}, u) == u + end -# SubString created from SubString -let str = "Hello, world!" 
- u = SubString(str, 2, 5) - for idx in 1:4 - @test SubString(u, 2, idx) == u[2:idx] - @test SubString(u, 2:idx) == u[2:idx] + let str = "føøbar" + @test_throws BoundsError SubString(str, 10, 10) + u = SubString(str, 4, 3) + @test length(u) == 0 + b = IOBuffer() + write(b, u) + @test String(take!(b)) == "" end - @test_throws BoundsError SubString(u, 1, 10) - @test_throws BoundsError SubString(u, 1:10) - @test_throws BoundsError SubString(u, 20:30) - @test SubString(u, 20:15) == "" - @test_throws BoundsError SubString(u, -1:10) - @test SubString(u, -1, -10) == "" - @test SubString(SubString("123", 1, 2), -10, -20) == "" -end -# sizeof -@test sizeof(SubString("abc\u2222def",4,4)) == 3 - -# issue #3710 -@test prevind(SubString("{var}",2,4),4) == 3 - -# issue #4183 -@test split(SubString("x", 2, 0), "y") == [""] - -# issue #6772 -@test parse(Float64, SubString("10",1,1)) === 1.0 -@test parse(Float64, SubString("1 0",1,1)) === 1.0 -@test parse(Float32, SubString("10",1,1)) === 1.0f0 - -# issue #5870 -@test !occursin(Regex("aa"), SubString("",1,0)) -@test occursin(Regex(""), SubString("",1,0)) - -# isvalid, length, prevind, nextind for SubString{String} -let s = "lorem ipsum", sdict = Dict( - SubString(s, 1, 11) => "lorem ipsum", - SubString(s, 1, 6) => "lorem ", - SubString(s, 1, 0) => "", - SubString(s, 2, 4) => "ore", - SubString(s, 2, 11) => "orem ipsum", - SubString(s, 15, 14) => "", -) - for (ss, s) in sdict - @test ncodeunits(ss) == ncodeunits(s) - for i in -2:13 - @test isvalid(ss, i) == isvalid(s, i) - end - for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss) - @test length(ss, i, j) == length(s, i, j) + @testset "search and SubString (issue #5679)" begin + str = "Hello, world!" + u = SubString(str, 1, 5) + @test findlast("World", u) === nothing + @test findlast(isequal('z'), u) === nothing + @test findlast("ll", u) == 3:4 + end + + @testset "SubString created from SubString" begin + str = "Hello, world!" 
+        u = SubString(str, 2, 5)
+        for idx in 1:4
+            @test SubString(u, 2, idx) == u[2:idx]
+            @test SubString(u, 2:idx) == u[2:idx]
     end
+        @test_throws BoundsError SubString(u, 1, 10)
+        @test_throws BoundsError SubString(u, 1:10)
+        @test_throws BoundsError SubString(u, 20:30)
+        @test SubString(u, 20:15) == ""
+        @test_throws BoundsError SubString(u, -1:10)
+        @test SubString(u, -1, -10) == ""
+        @test SubString(SubString("123", 1, 2), -10, -20) == ""
+    end
+
+    # sizeof
+    @test sizeof(SubString("abc\u2222def",4,4)) == 3
+
+    # issue #3710
+    @test prevind(SubString("{var}",2,4),4) == 3
+
+    # issue #4183
+    @test split(SubString("x", 2, 0), "y") == [""]
+
+    @testset "issue #6772" begin
+        @test parse(Float64, SubString("10",1,1)) === 1.0
+        @test parse(Float64, SubString("1 0",1,1)) === 1.0
+        @test parse(Float32, SubString("10",1,1)) === 1.0f0
     end
-    for (ss, s) in sdict
-        @test length(ss) == length(s)
-        for i in 0:ncodeunits(ss), j = 0:length(ss)+1
-            @test prevind(ss, i+1, j) == prevind(s, i+1, j)
-            @test nextind(ss, i, j) == nextind(s, i, j)
+
+    @testset "issue #5870" begin
+        @test !occursin(Regex("aa"), SubString("",1,0))
+        @test occursin(Regex(""), SubString("",1,0))
+    end
+    @testset "isvalid, length, prevind, nextind for SubString{String}" begin
+        s = "lorem ipsum"
+        sdict = Dict(
+            SubString(s, 1, 11) => "lorem ipsum",
+            SubString(s, 1, 6) => "lorem ",
+            SubString(s, 1, 0) => "",
+            SubString(s, 2, 4) => "ore",
+            SubString(s, 2, 11) => "orem ipsum",
+            SubString(s, 15, 14) => "",
+        )
+        for (ss, s) in sdict
+            @test ncodeunits(ss) == ncodeunits(s)
+            for i in -2:13
+                @test isvalid(ss, i) == isvalid(s, i)
+            end
+            for i in 1:ncodeunits(ss), j = i-1:ncodeunits(ss)
+                @test length(ss, i, j) == length(s, i, j)
+            end
+        end
+        for (ss, s) in sdict
+            @test length(ss) == length(s)
+            for i in 0:ncodeunits(ss), j = 0:length(ss)+1
+                @test prevind(ss, i+1, j) == prevind(s, i+1, j)
+                @test nextind(ss, i, j) == nextind(s, i, j)
+            end
+            @test_throws BoundsError prevind(s, 0)
+            @test_throws BoundsError prevind(ss, 0)
+            @test_throws BoundsError nextind(s, ncodeunits(ss)+1)
+            @test_throws BoundsError nextind(ss, ncodeunits(ss)+1)
         end
-        @test_throws BoundsError prevind(s, 0)
-        @test_throws BoundsError prevind(ss, 0)
-        @test_throws BoundsError nextind(s, ncodeunits(ss)+1)
-        @test_throws BoundsError nextind(ss, ncodeunits(ss)+1)
+    end
 end
-end
-# proper nextind/prevind/thisind for SubString{String}
-let rng = MersenneTwister(1), strs = ["∀∃∀"*String(rand(rng, UInt8, 40))*"∀∃∀",
+    rng = MersenneTwister(1)
+    strs = ["∀∃∀"*String(rand(rng, UInt8, 40))*"∀∃∀",
            String(rand(rng, UInt8, 50))]
-    for s in strs
+    @testset "proper nextind/prevind/thisind for SubString{String}: $(repr(s))" for s in strs
         a = 0
         while a <= ncodeunits(s)
             a = nextind(s, a)
@@ -223,111 +238,115 @@ let rng = MersenneTwister(1), strs = ["∀∃∀"*String(rand(rng, UInt8, 40))*"
             end
         end
     end
-end
-# for isvalid(SubString{String})
-let s = "Σx + βz - 2"
-    for i in -1:ncodeunits(s)+2
-        if checkbounds(Bool, s, i)
-            if isvalid(s, i)
-                ss = SubString(s, 1, i)
-                for j = 1:ncodeunits(ss)
-                    @test isvalid(ss, j) == isvalid(s, j)
-                end
-            else
-                @test_throws StringIndexError SubString(s, 1, i)
-            end
-        elseif i > 0
-            @test_throws BoundsError SubString(s, 1, i)
-        else
-            @test SubString(s, 1, i) == ""
-        end
-    end
-end
+    # for isvalid(SubString{String})
+    let s = "Σx + βz - 2"
+        for i in -1:ncodeunits(s)+2
+            if checkbounds(Bool, s, i)
+                if isvalid(s, i)
+                    ss = SubString(s, 1, i)
+                    for j = 1:ncodeunits(ss)
+                        @test isvalid(ss, j) == isvalid(s, j)
+                    end
+                else
+                    @test_throws StringIndexError SubString(s, 1, i)
+                end
+            elseif i > 0
+                @test_throws BoundsError SubString(s, 1, i)
+            else
+                @test SubString(s, 1, i) == ""
+            end
+        end
+    end

-let ss = SubString("hello", 1, 5)
-    @test length(ss, 1, 0) == 0
-    @test_throws BoundsError length(ss, 1, -1)
-    @test_throws BoundsError length(ss, 1, 6)
-    @test_throws BoundsError length(ss, 1, 10)
-    @test_throws BoundsError prevind(ss, 0, 1)
-    @test prevind(ss, 1, 1) == 0
-    @test prevind(ss, 6, 1) == 5
-    @test_throws BoundsError prevind(ss, 7, 1)
-    @test_throws BoundsError nextind(ss, -1, 1)
-    @test nextind(ss, 0, 1) == 1
-    @test nextind(ss, 5, 1) == 6
-    @test_throws BoundsError nextind(ss, 6, 1)
-end
+    let ss = SubString("hello", 1, 5)
+        @test length(ss, 1, 0) == 0
+        @test_throws BoundsError length(ss, 1, -1)
+        @test_throws BoundsError length(ss, 1, 6)
+        @test_throws BoundsError length(ss, 1, 10)
+        @test_throws BoundsError prevind(ss, 0, 1)
+        @test prevind(ss, 1, 1) == 0
+        @test prevind(ss, 6, 1) == 5
+        @test_throws BoundsError prevind(ss, 7, 1)
+        @test_throws BoundsError nextind(ss, -1, 1)
+        @test nextind(ss, 0, 1) == 1
+        @test nextind(ss, 5, 1) == 6
+        @test_throws BoundsError nextind(ss, 6, 1)
+    end

-# length(SubString{String}) performance specialization
-let s = "|η(α)-ϕ(κ)| < ε"
-    @test length(SubString(s, 1, 0)) == length(s[1:0])
-    @test length(SubString(s, 4, 4)) == length(s[4:4])
-    @test length(SubString(s, 1, 7)) == length(s[1:7])
-    @test length(SubString(s, 4, 11)) == length(s[4:11])
-end
+    # length(SubString{String}) performance specialization
+    let s = "|η(α)-ϕ(κ)| < ε"
+        @test length(SubString(s, 1, 0)) == length(s[1:0])
+        @test length(SubString(s, 4, 4)) == length(s[4:4])
+        @test length(SubString(s, 1, 7)) == length(s[1:7])
+        @test length(SubString(s, 4, 11)) == length(s[4:11])
+    end

-@testset "reverseind" for T in (String, SubString, GenericString)
-    for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
-        for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
-            for c in ('X', 'δ', '\U0001d6a5')
-                s = convert(T, string(prefix, c, suffix))
-                r = reverse(s)
-                ri = findfirst(isequal(c), r)
-                @test c == s[reverseind(s, ri)] == r[ri]
-                s = convert(T, string(prefix, prefix, c, suffix, suffix))
-                pre = convert(T, prefix)
-                sb = SubString(s, nextind(pre, lastindex(pre)),
-                               lastindex(convert(T, string(prefix, prefix, c, suffix))))
-                r = reverse(sb)
-                ri = findfirst(isequal(c), r)
-                @test c == sb[reverseind(sb, ri)] == r[ri]
+    @testset "reverseind" for T in (String, SubString, GenericString)
+        for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
+            for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
+                for c in ('X', 'δ', '\U0001d6a5')
+                    s = convert(T, string(prefix, c, suffix))
+                    r = reverse(s)
+                    ri = findfirst(isequal(c), r)
+                    @test c == s[reverseind(s, ri)] == r[ri]
+                    s = convert(T, string(prefix, prefix, c, suffix, suffix))
+                    pre = convert(T, prefix)
+                    sb = SubString(s, nextind(pre, lastindex(pre)),
+                                   lastindex(convert(T, string(prefix, prefix, c, suffix))))
+                    r = reverse(sb)
+                    ri = findfirst(isequal(c), r)
+                    @test c == sb[reverseind(sb, ri)] == r[ri]
+                end
             end
         end
     end
-end

-@testset "reverseind of empty strings" begin
-    for s in ("",
-              SubString("", 1, 0),
-              SubString("ab", 1, 0),
-              SubString("ab", 2, 1),
-              SubString("ab", 3, 2),
-              GenericString(""))
-        @test reverseind(s, 0) == 1
-        @test reverseind(s, 1) == 0
+    @testset "reverseind of empty strings" begin
+        for s in ("",
+                  SubString("", 1, 0),
+                  SubString("ab", 1, 0),
+                  SubString("ab", 2, 1),
+                  SubString("ab", 3, 2),
+                  GenericString(""))
+            @test reverseind(s, 0) == 1
+            @test reverseind(s, 1) == 0
+        end
     end
 end

-## Cstring tests ##
-
-# issue #13974: comparison against pointers
-let
-    str = String("foobar")
-    ptr = pointer(str)
-    cstring = Cstring(ptr)
-    @test ptr == cstring
-    @test cstring == ptr
-
-    # convenient NULL string creation from Ptr{Cvoid}
-    nullstr = Cstring(C_NULL)
-
-    # Comparisons against NULL strings
-    @test ptr != nullstr
-    @test nullstr != ptr
+@testset "Cstring" begin
+    @testset "issue #13974: comparison against pointers" begin
+        str = String("foobar")
+        ptr = pointer(str)
+        cstring = Cstring(ptr)
+        @test ptr == cstring
+        @test cstring == ptr
+
+        # convenient NULL string creation from Ptr{Cvoid}
+        nullstr = Cstring(C_NULL)
+
+        # Comparisons against NULL strings
+        @test ptr != nullstr
+        @test nullstr != ptr
+
+        # Short-hand comparison against C_NULL
+        @test nullstr == C_NULL
+        @test C_NULL == nullstr
+        @test cstring != C_NULL
+        @test C_NULL != cstring
+    end

-    # Short-hand comparison against C_NULL
-    @test nullstr == C_NULL
-    @test C_NULL == nullstr
-    @test cstring != C_NULL
-    @test C_NULL != cstring
+    @testset "issue #31381: eltype(Cstring) != Cchar" begin
+        s = Cstring(C_NULL)
+        @test eltype(Cstring) == Cchar
+        @test eltype(s) == Cchar
+        @test pointer(s) isa Ptr{Cchar}
+    end
 end

-# issue #31381: eltype(Cstring) != Cchar
-let
-    s = Cstring(C_NULL)
-    @test eltype(Cstring) == Cchar
-    @test eltype(s) == Cchar
-    @test pointer(s) isa Ptr{Cchar}
+@testset "Codeunits" begin
+    s = "I'm a string!"
+    @test codeunit(s) == UInt8
+    @test codeunit(s, Int8(1)) == codeunit(s, 1)
 end
diff --git a/test/syntax.jl b/test/syntax.jl
index da69bd98dc010..c19721b5c54b3 100644
--- a/test/syntax.jl
+++ b/test/syntax.jl
@@ -3975,3 +3975,15 @@ module UsingFailedExplicit
 using .A: x as x
 @test x === 1
 end
+
+# issue #45494
+begin
+    local b::Tuple{<:Any} = (0,)
+    function f45494()
+        b = b
+        b
+    end
+end
+@test f45494() === (0,)
+
+@test_throws "\"esc(...)\" used outside of macro expansion" eval(esc(:(const x=1)))
diff --git a/test/testhelpers/OffsetDenseArrays.jl b/test/testhelpers/OffsetDenseArrays.jl
new file mode 100644
index 0000000000000..44a1b8d627800
--- /dev/null
+++ b/test/testhelpers/OffsetDenseArrays.jl
@@ -0,0 +1,31 @@
+"""
+    module OffsetDenseArrays
+
+A minimal implementation of an offset array which is also <: DenseArray.
+""" +module OffsetDenseArrays + +struct OffsetDenseArray{A <: DenseVector, T} <: DenseVector{T} + x::A + offset::Int +end +OffsetDenseArray(x::AbstractVector{T}, i::Integer) where {T} = OffsetDenseArray{typeof(x), T}(x, Int(i)) + +Base.size(x::OffsetDenseArray) = size(x.x) +Base.pointer(x::OffsetDenseArray) = pointer(x.x) + +function Base.getindex(x::OffsetDenseArray, i::Integer) + @boundscheck checkbounds(x.x, i - x.offset) + x.x[i - x.offset] +end + +function Base.setindex(x::OffsetDenseArray, v, i::Integer) + @boundscheck checkbounds(x.x, i - x.offset) + x.x[i - x.offset] = v +end + +IndexStyle(::Type{<:OffsetDenseArray}) = Base.IndexLinear() +Base.axes(x::OffsetDenseArray) = (x.offset + 1 : x.offset + length(x.x),) +Base.keys(x::OffsetDenseArray) = only(axes(x)) + +end # module diff --git a/test/testhelpers/just_module.jl b/test/testhelpers/just_module.jl new file mode 100644 index 0000000000000..71bd87e660eae --- /dev/null +++ b/test/testhelpers/just_module.jl @@ -0,0 +1 @@ +@__MODULE__ diff --git a/test/threads.jl b/test/threads.jl index 2832f2a0e972c..6265368c2ac79 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -359,3 +359,18 @@ end @test jl_setaffinity(0, mask, cpumasksize) == 0 end end + +# Make sure default number of BLAS threads respects CPU affinity: issue #55572. +@testset "LinearAlgebra number of default threads" begin + if AFFINITY_SUPPORTED + allowed_cpus = findall(uv_thread_getaffinity()) + cmd = addenv(`$(Base.julia_cmd()) --startup-file=no -E 'using LinearAlgebra; BLAS.get_num_threads()'`, + # Remove all variables which could affect the default number of threads + "OPENBLAS_NUM_THREADS"=>nothing, + "GOTO_NUM_THREADS"=>nothing, + "OMP_NUM_THREADS"=>nothing) + for n in 1:min(length(allowed_cpus), 8) # Cap to 8 to avoid too many tests on large systems + @test readchomp(setcpuaffinity(cmd, allowed_cpus[1:n])) == string(max(1, n ÷ 2)) + end + end +end diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 595f8991d58d7..ac54dd009390c 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -1235,6 +1235,7 @@ end @testset "throw=true" begin tasks, event = create_tasks() push!(tasks, Threads.@spawn error("Error")) + wait(tasks[end]; throw=false) @test_throws CompositeException begin waitany(convert_tasks(tasks_type, tasks); throw=true) diff --git a/test/trimming/Makefile b/test/trimming/Makefile new file mode 100644 index 0000000000000..c6e105d637013 --- /dev/null +++ b/test/trimming/Makefile @@ -0,0 +1,55 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +# This Makefile template requires the following variables to be set +# in the environment or on the command-line: +# JULIA: path to julia[.exe] executable +# BIN: binary build directory + +ifndef JULIA + $(error "Please pass JULIA=[path of target julia binary], or set as environment variable!") +endif +ifndef BIN + $(error "Please pass BIN=[path of build directory], or set as environment variable!") +endif + +#============================================================================= +# location of test source +SRCDIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) +JULIAHOME := $(abspath $(SRCDIR)/../..) +BUILDSCRIPT := $(BIN)/../share/julia/juliac-buildscript.jl +include $(JULIAHOME)/Make.inc + +# get the executable suffix, if any +EXE := $(suffix $(abspath $(JULIA))) + +# get compiler and linker flags. 
diff --git a/test/testhelpers/just_module.jl b/test/testhelpers/just_module.jl
new file mode 100644
index 0000000000000..71bd87e660eae
--- /dev/null
+++ b/test/testhelpers/just_module.jl
@@ -0,0 +1 @@
+@__MODULE__
diff --git a/test/threads.jl b/test/threads.jl
index 2832f2a0e972c..6265368c2ac79 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -359,3 +359,18 @@ end
     @test jl_setaffinity(0, mask, cpumasksize) == 0
     end
 end
+
+# Make sure default number of BLAS threads respects CPU affinity: issue #55572.
+@testset "LinearAlgebra number of default threads" begin
+    if AFFINITY_SUPPORTED
+        allowed_cpus = findall(uv_thread_getaffinity())
+        cmd = addenv(`$(Base.julia_cmd()) --startup-file=no -E 'using LinearAlgebra; BLAS.get_num_threads()'`,
+                     # Remove all variables which could affect the default number of threads
+                     "OPENBLAS_NUM_THREADS"=>nothing,
+                     "GOTO_NUM_THREADS"=>nothing,
+                     "OMP_NUM_THREADS"=>nothing)
+        for n in 1:min(length(allowed_cpus), 8) # Cap to 8 to avoid too many tests on large systems
+            @test readchomp(setcpuaffinity(cmd, allowed_cpus[1:n])) == string(max(1, n ÷ 2))
+        end
+    end
+end
diff --git a/test/threads_exec.jl b/test/threads_exec.jl
index 595f8991d58d7..ac54dd009390c 100644
--- a/test/threads_exec.jl
+++ b/test/threads_exec.jl
@@ -1235,6 +1235,7 @@ end
     @testset "throw=true" begin
         tasks, event = create_tasks()
         push!(tasks, Threads.@spawn error("Error"))
+        wait(tasks[end]; throw=false)

         @test_throws CompositeException begin
             waitany(convert_tasks(tasks_type, tasks); throw=true)
diff --git a/test/trimming/Makefile b/test/trimming/Makefile
new file mode 100644
index 0000000000000..c6e105d637013
--- /dev/null
+++ b/test/trimming/Makefile
@@ -0,0 +1,55 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+# This Makefile template requires the following variables to be set
+# in the environment or on the command-line:
+# JULIA: path to julia[.exe] executable
+# BIN: binary build directory
+
+ifndef JULIA
+  $(error "Please pass JULIA=[path of target julia binary], or set as environment variable!")
+endif
+ifndef BIN
+  $(error "Please pass BIN=[path of build directory], or set as environment variable!")
+endif
+
+#=============================================================================
+# location of test source
+SRCDIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+JULIAHOME := $(abspath $(SRCDIR)/../..)
+BUILDSCRIPT := $(BIN)/../share/julia/juliac-buildscript.jl
+include $(JULIAHOME)/Make.inc
+
+# get the executable suffix, if any
+EXE := $(suffix $(abspath $(JULIA)))
+
+# get compiler and linker flags. (see: `contrib/julia-config.jl`)
+JULIA_CONFIG := $(JULIA) -e 'include(joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia", "julia-config.jl"))' --
+CPPFLAGS_ADD :=
+CFLAGS_ADD = $(shell $(JULIA_CONFIG) --cflags)
+LDFLAGS_ADD = -lm $(shell $(JULIA_CONFIG) --ldflags --ldlibs) -ljulia-internal
+
+#=============================================================================
+
+release: hello$(EXE)
+
+hello.o: $(SRCDIR)/hello.jl $(BUILDSCRIPT)
+	$(JULIA) -t 1 -J $(BIN)/../lib/julia/sys.so --startup-file=no --history-file=no --output-o $@ --output-incremental=no --strip-ir --strip-metadata --trim $(BUILDSCRIPT) $(SRCDIR)/hello.jl --output-exe true
+
+init.o: $(SRCDIR)/init.c
+	$(CC) -c -o $@ $< $(CPPFLAGS_ADD) $(CPPFLAGS) $(CFLAGS_ADD) $(CFLAGS)
+
+hello$(EXE): hello.o init.o
+	$(CC) -o $@ $(WHOLE_ARCHIVE) hello.o $(NO_WHOLE_ARCHIVE) init.o $(CPPFLAGS_ADD) $(CPPFLAGS) $(CFLAGS_ADD) $(CFLAGS) $(LDFLAGS_ADD) $(LDFLAGS)
+
+check: hello$(EXE)
+	$(JULIA) --depwarn=error $(SRCDIR)/../runtests.jl $(SRCDIR)/trimming
+
+clean:
+	-rm -f hello$(EXE) init.o hello.o
+
+.PHONY: release clean check
+
+# Makefile debugging trick:
+# call print-VARIABLE to see the runtime value of any variable
+print-%:
+	@echo '$*=$($*)'
diff --git a/test/trimming/hello.jl b/test/trimming/hello.jl
new file mode 100644
index 0000000000000..307bf820f325b
--- /dev/null
+++ b/test/trimming/hello.jl
@@ -0,0 +1,6 @@
+module MyApp
+Base.@ccallable function main()::Cint
+    println(Core.stdout, "Hello, world!")
+    return 0
+end
+end
diff --git a/test/trimming/init.c b/test/trimming/init.c
new file mode 100644
index 0000000000000..ea1b02f8e5c8f
--- /dev/null
+++ b/test/trimming/init.c
@@ -0,0 +1,9 @@
+#include <julia.h>
+
+__attribute__((constructor)) void static_init(void)
+{
+    if (jl_is_initialized())
+        return;
+    julia_init(JL_IMAGE_IN_MEMORY);
+    jl_exception_clear();
+}
diff --git a/test/trimming/trimming.jl b/test/trimming/trimming.jl
new file mode 100644
index 0000000000000..dfacae7f8e531
--- /dev/null
+++ b/test/trimming/trimming.jl
@@ -0,0 +1,7 @@
+using Test
+
+exe_path = joinpath(@__DIR__, "hello"*splitext(Base.julia_exename())[2])
+
+@test readchomp(`$exe_path`) == "Hello, world!"
+
+@test filesize(exe_path) < filesize(unsafe_string(Base.JLOptions().image_file))/10
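The trimming smoke test above is driven entirely by the new Makefile, which requires `JULIA` and `BIN` and provides `release`, `check`, and `clean` targets. As a rough editorial sketch (not part of the patch), it could be exercised locally along these lines; the `usr/` layout is an assumption about a default in-tree source build:

```julia
# Illustrative only: run the trimming test from the repository root against an in-tree build.
julia_exe = joinpath(pwd(), "usr", "bin", "julia")  # assumed default build layout
bin_dir   = joinpath(pwd(), "usr", "bin")           # passed as BIN; juliac files are found via BIN/../share/julia
run(`make -C test/trimming JULIA=$julia_exe BIN=$bin_dir check`)
```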