From 1dee0006c453874a1f8965d55a27befc2620f253 Mon Sep 17 00:00:00 2001 From: Zentrik Date: Fri, 26 Jul 2024 14:30:46 +0100 Subject: [PATCH] Add BOLT Makefile (#54107) This uses LLVM's BOLT to optimize libLLVM, libjulia-internal and libjulia-codegen. This improves the allinference benchmarks by about 10% largely due to the optimization of libjulia-internal. The example in issue https://github.com/JuliaLang/julia/issues/45395 which stresses LLVM significantly more also sees a ~10% improvement. We see a 20% improvement on ```julia @time for i in 1:100000000 string(i) end ``` When building corecompiler.ji: BOLT gives about a 16% improvement PGO+LTO gives about a 21% improvement PGO+LTO+BOLT gives about a 23% improvement This only requires a single build of LLVM and theoretically none if we change the binary builder script (i.e. we build with relocations and the `-fno-reorder-blocks-and-partition` and then we can use BOLT to get binaries with no relocations and reordered blocks and then ship both binaries?) compared to the 2 in PGO. Also, this theoretically can improve performance of a PGO+LTO build by a couple %. The only reproducible test problem I see is that the BOLT, PGO+LTO and PGO+LTO+BOLT builds all cause `readelf` to emit warnings as part of the `osutils` tests. ``` readelf: Warning: Unrecognised form: 0x22 readelf: Warning: DIE has locviews without loclist readelf: Warning: Unrecognised form: 0x23 readelf: Warning: DIE at offset 0x227399 refers to abbreviation number 14754 which does not exist readelf: Warning: Bogus end-of-siblings marker detected at offset 212aa9 in .debug_info section readelf: Warning: Bogus end-of-siblings marker detected at offset 212ab0 in .debug_info section readelf: Warning: Further warnings about bogus end-of-sibling markers suppressed ``` The unrecognised form warnings seem to be a bug in binutils, https://sourceware.org/bugzilla/show_bug.cgi?id=28981. 
`DIE at offset` warning I believe was fixed in binutils 2.36, https://sourceware.org/bugzilla/show_bug.cgi?id=26808, but `ld -v` says I have 2.38. I assume these are all benign. I also don't see them on CI here https://buildkite.com/julialang/julia-buildkite/builds/1507#018f00e7-0737-4a42-bcd9-d4061dc8c93e so could just be a local issue. --- Make.inc | 13 +- contrib/bolt/.gitignore | 10 + contrib/bolt/Makefile | 134 +++++++++++++ contrib/bolt/README.md | 17 ++ contrib/pgo-lto-bolt/.gitignore | 14 ++ contrib/pgo-lto-bolt/Makefile | 185 ++++++++++++++++++ contrib/pgo-lto-bolt/README.md | 18 ++ deps/BOLT.mk | 118 +++++++++++ deps/BOLT.version | 11 ++ deps/Makefile | 1 + .../md5 | 1 + .../sha512 | 1 + deps/llvm.mk | 5 +- src/Makefile | 4 +- 14 files changed, 525 insertions(+), 7 deletions(-) create mode 100644 contrib/bolt/.gitignore create mode 100644 contrib/bolt/Makefile create mode 100644 contrib/bolt/README.md create mode 100644 contrib/pgo-lto-bolt/.gitignore create mode 100644 contrib/pgo-lto-bolt/Makefile create mode 100644 contrib/pgo-lto-bolt/README.md create mode 100644 deps/BOLT.mk create mode 100644 deps/BOLT.version create mode 100644 deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/md5 create mode 100644 deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/sha512 diff --git a/Make.inc b/Make.inc index d9a131b4d8a98..0da638cfab52e 100644 --- a/Make.inc +++ b/Make.inc @@ -516,6 +516,11 @@ SHIPFLAGS_COMMON := -O3 SHIPFLAGS_CLANG := $(SHIPFLAGS_COMMON) -g SHIPFLAGS_GCC := $(SHIPFLAGS_COMMON) -ggdb2 -falign-functions +BOLT_LDFLAGS := + +BOLT_CFLAGS_GCC := +BOLT_CFLAGS_CLANG := + ifeq ($(OS), Darwin) JCPPFLAGS_CLANG += -D_LARGEFILE_SOURCE -D_DARWIN_USE_64_BIT_INODE=1 endif @@ -532,7 +537,8 @@ JCFLAGS := $(JCFLAGS_GCC) JCPPFLAGS := $(JCPPFLAGS_GCC) JCXXFLAGS := $(JCXXFLAGS_GCC) DEBUGFLAGS := $(DEBUGFLAGS_GCC) -SHIPFLAGS := $(SHIPFLAGS_GCC) +SHIPFLAGS := $(SHIPFLAGS_GCC) $(BOLT_CFLAGS_GCC) +BOLT_CFLAGS := $(BOLT_CFLAGS_GCC) endif ifeq 
($(USECLANG),1) @@ -542,7 +548,8 @@ JCFLAGS := $(JCFLAGS_CLANG) JCPPFLAGS := $(JCPPFLAGS_CLANG) JCXXFLAGS := $(JCXXFLAGS_CLANG) DEBUGFLAGS := $(DEBUGFLAGS_CLANG) -SHIPFLAGS := $(SHIPFLAGS_CLANG) +SHIPFLAGS := $(SHIPFLAGS_CLANG) $(BOLT_CFLAGS_CLANG) +BOLT_CFLAGS := $(BOLT_CFLAGS_CLANG) ifeq ($(OS), Darwin) CC += -mmacosx-version-min=$(MACOSX_VERSION_MIN) @@ -1295,7 +1302,7 @@ CSL_NEXT_GLIBCXX_VERSION=GLIBCXX_3\.4\.33|GLIBCXX_3\.5\.|GLIBCXX_4\. # Note: we explicitly _do not_ define `CSL` here, since it requires some more # advanced techniques to decide whether it should be installed from a BB source # or not. See `deps/csl.mk` for more detail. -BB_PROJECTS := BLASTRAMPOLINE OPENBLAS LLVM LIBSUITESPARSE OPENLIBM GMP MBEDTLS LIBSSH2 NGHTTP2 MPFR CURL LIBGIT2 PCRE LIBUV LIBUNWIND DSFMT OBJCONV ZLIB P7ZIP LLD LIBTRACYCLIENT +BB_PROJECTS := BLASTRAMPOLINE OPENBLAS LLVM LIBSUITESPARSE OPENLIBM GMP MBEDTLS LIBSSH2 NGHTTP2 MPFR CURL LIBGIT2 PCRE LIBUV LIBUNWIND DSFMT OBJCONV ZLIB P7ZIP LLD LIBTRACYCLIENT BOLT define SET_BB_DEFAULT # First, check to see if BB is disabled on a global setting ifeq ($$(USE_BINARYBUILDER),0) diff --git a/contrib/bolt/.gitignore b/contrib/bolt/.gitignore new file mode 100644 index 0000000000000..921d429130268 --- /dev/null +++ b/contrib/bolt/.gitignore @@ -0,0 +1,10 @@ +profiles-bolt* +optimized.build +toolchain + +bolt +bolt_instrument +merge_data +copy_originals +stage0 +stage1 diff --git a/contrib/bolt/Makefile b/contrib/bolt/Makefile new file mode 100644 index 0000000000000..2e911fcbcdc68 --- /dev/null +++ b/contrib/bolt/Makefile @@ -0,0 +1,134 @@ +.PHONY: clean clean_profiles restore_originals + +# Settings taken from https://github.com/rust-lang/rust/blob/master/src/tools/opt-dist/src/bolt.rs +BOLT_ARGS := +# Reorder basic blocks within functions +BOLT_ARGS += -reorder-blocks=ext-tsp +# Reorder functions within the binary +BOLT_ARGS += -reorder-functions=cdsort +# Split function code into hot and code regions +BOLT_ARGS += -split-functions 
+# Split as many basic blocks as possible
+BOLT_ARGS += -split-all-cold
+# Move jump tables to a separate section
+BOLT_ARGS += -jump-tables=move
+# Use regular size pages for code alignment
+BOLT_ARGS += -no-huge-pages
+# Fold functions with identical code
+BOLT_ARGS += -icf=1
+# Split using best available strategy (three-way splitting, Cache-Directed Sort)
+# Disabled for libjulia-internal till https://github.com/llvm/llvm-project/issues/89508 is fixed
+# BOLT_ARGS += -split-strategy=cdsplit
+# Update DWARF debug info in the final binary
+BOLT_ARGS += -update-debug-sections
+# Print optimization statistics
+BOLT_ARGS += -dyno-stats
+# BOLT doesn't fully support computed gotos, https://github.com/llvm/llvm-project/issues/89117
+# Use escaped regex as the name BOLT recognises is often a bit different, e.g. apply_cl/1(*2)
+# This doesn't actually seem to do anything, the actual mitigation is not using --use-old-text
+# which we do in the bolt target
+BOLT_ARGS += -skip-funcs=.\*apply_cl.\*
+
+# -fno-reorder-blocks-and-partition is needed on gcc >= 8.
+BOLT_FLAGS := $\
+	"BOLT_CFLAGS_GCC+=-fno-reorder-blocks-and-partition" $\
+	"BOLT_LDFLAGS=-Wl,--emit-relocs"
+
+STAGE0_BUILD:=$(CURDIR)/toolchain
+STAGE1_BUILD:=$(CURDIR)/optimized.build
+
+STAGE0_BINARIES:=$(STAGE0_BUILD)/usr/bin/
+
+PROFILE_DIR:=$(CURDIR)/profiles-bolt
+JULIA_ROOT:=$(CURDIR)/../..
+
+LLVM_BOLT:=$(STAGE0_BINARIES)llvm-bolt
+LLVM_MERGEFDATA:=$(STAGE0_BINARIES)merge-fdata
+
+# If you add new files to optimize, you need to add BOLT_LDFLAGS and BOLT_CFLAGS to the build of your new file.
+SYMLINKS_TO_OPTIMIZE := libLLVM.so libjulia-internal.so libjulia-codegen.so
+FILES_TO_OPTIMIZE := $(shell for file in $(SYMLINKS_TO_OPTIMIZE); do readlink $(STAGE1_BUILD)/usr/lib/$$file; done)
+
+AFTER_INSTRUMENT_MESSAGE:='Run `make finish_stage1` to finish off the build. $\
+	You can now optionally collect more profiling data by running Julia with an appropriate workload, $\
+	if you wish, run `make clean_profiles` before doing so to remove any profiling data generated by `make finish_stage1`. $\
+	You should end up with some data in $(PROFILE_DIR). Afterwards run `make merge_data && make bolt`.'
+
+$(STAGE0_BUILD) $(STAGE1_BUILD):
+	$(MAKE) -C $(JULIA_ROOT) O=$@ configure
+
+stage0: | $(STAGE0_BUILD)
+	$(MAKE) -C $(STAGE0_BUILD)/deps install-BOLT && \
+	touch $@
+
+# Build with our custom flags, binary builder doesn't use them so we need to build LLVM for now.
+# We manually skip package image creation so that we can profile it
+$(STAGE1_BUILD): stage0
+stage1: export USE_BINARYBUILDER_LLVM=0
+stage1: | $(STAGE1_BUILD)
+	$(MAKE) -C $(STAGE1_BUILD) $(BOLT_FLAGS) julia-src-release julia-symlink julia-libccalltest \
+		julia-libccalllazyfoo julia-libccalllazybar julia-libllvmcalltest && \
+	touch $@
+
+copy_originals: stage1
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE1_BUILD)/usr/lib/$$file; \
+		cp $$abs_file "$$abs_file.original"; \
+	done && \
+	touch $@
+
+# I don't think there's any particular reason to have -no-huge-pages here, perhaps slightly more accurate profile data
+# as the final build uses -no-huge-pages
+bolt_instrument: copy_originals
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE1_BUILD)/usr/lib/$$file; \
+		$(LLVM_BOLT) "$$abs_file.original" -o $$abs_file --instrument --instrumentation-file-append-pid --instrumentation-file="$(PROFILE_DIR)/$$file-prof" -no-huge-pages; \
+		mkdir -p $$(dirname "$(PROFILE_DIR)/$$file-prof"); \
+		printf "\n"; \
+	done && \
+	touch $@
+	@echo $(AFTER_INSTRUMENT_MESSAGE)
+
+# We don't want to rebuild julia-src as then we lose the bolt instrumentation
+# So we have to manually build the sysimage and package image
+finish_stage1: stage1
+	$(MAKE) -C $(STAGE1_BUILD) julia-base-cache && \
+	$(MAKE) -C $(STAGE1_BUILD) -f sysimage.mk sysimg-release && \
+	$(MAKE) -C $(STAGE1_BUILD) -f pkgimage.mk release
+
+merge_data: bolt_instrument
+	for file in $(FILES_TO_OPTIMIZE); do \
+		profiles=$(PROFILE_DIR)/$$file-prof.*.fdata; \
+		$(LLVM_MERGEFDATA) $$profiles > "$(PROFILE_DIR)/$$file-prof.merged.fdata"; \
+	done && \
+	touch $@
+
+# The --use-old-text saves about 16 MiB of libLLVM.so size.
+# However, the rust folk found it succeeds very non-deterministically for them.
+# It tries to reuse old text segments to reduce binary size
+# BOLT doesn't fully support computed gotos https://github.com/llvm/llvm-project/issues/89117, so we cannot use --use-old-text on libjulia-internal
+# That flag saves less than 1 MiB for libjulia-internal so oh well.
+bolt: merge_data
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE1_BUILD)/usr/lib/$$file; \
+		$(LLVM_BOLT) "$$abs_file.original" -data "$(PROFILE_DIR)/$$file-prof.merged.fdata" -o $$abs_file $(BOLT_ARGS) $$(if [ "$$file" != $(shell readlink $(STAGE1_BUILD)/usr/lib/libjulia-internal.so) ]; then echo "--use-old-text -split-strategy=cdsplit"; fi); \
+	done && \
+	touch $@
+
+clean_profiles:
+	rm -rf $(PROFILE_DIR)
+
+clean:
+	rm -f stage0 stage1 bolt copy_originals merge_data bolt_instrument
+
+restore_originals: copy_originals
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE1_BUILD)/usr/lib/$$file; \
+		cp -P "$$abs_file.original" $$abs_file; \
+	done
+
+delete_originals: copy_originals
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE1_BUILD)/usr/lib/$$file; \
+		rm "$$abs_file.original"; \
+	done
diff --git a/contrib/bolt/README.md b/contrib/bolt/README.md
new file mode 100644
index 0000000000000..8680939ef6276
--- /dev/null
+++ b/contrib/bolt/README.md
@@ -0,0 +1,17 @@
+BOLT only works on x86_64 and aarch64 on Linux.
+
+DO NOT STRIP THE RESULTING .so FILES, https://github.com/llvm/llvm-project/issues/56738.
+If you really need to, try adding `-use-gnu-stack` to `BOLT_ARGS`.
+ +To build a BOLT-optimized version of Julia run the following commands (`cd` into this directory first) +```bash +make stage1 +make copy_originals +make bolt_instrument +make finish_stage1 +make merge_data +make bolt +``` +After these commands finish, the optimized version of Julia will be built in the `optimized.build` directory. + +This doesn't align the code to support huge pages as it doesn't seem that we do that currently, this decreases the size of the .so files by 2-4mb. diff --git a/contrib/pgo-lto-bolt/.gitignore b/contrib/pgo-lto-bolt/.gitignore new file mode 100644 index 0000000000000..1b29279acc0da --- /dev/null +++ b/contrib/pgo-lto-bolt/.gitignore @@ -0,0 +1,14 @@ +stage0* +stage1* +stage2* +bolt +bolt_instrument +merge_data +copy_originals + +profiles +profiles-bolt + +toolchain +pgo-instrumented.build +optimized.build diff --git a/contrib/pgo-lto-bolt/Makefile b/contrib/pgo-lto-bolt/Makefile new file mode 100644 index 0000000000000..6787b3bc4e919 --- /dev/null +++ b/contrib/pgo-lto-bolt/Makefile @@ -0,0 +1,185 @@ +.PHONY: clean clean_profiles restore_originals + +# See the makefiles in contrib/bolt and contrib/pgo-lto for more information. 
+ +# Settings taken from https://github.com/rust-lang/rust/blob/master/src/tools/opt-dist/src/bolt.rs +BOLT_ARGS := +# Reorder basic blocks within functions +BOLT_ARGS += -reorder-blocks=ext-tsp +# Reorder functions within the binary +BOLT_ARGS += -reorder-functions=cdsort +# Split function code into hot and code regions +BOLT_ARGS += -split-functions +# Split as many basic blocks as possible +BOLT_ARGS += -split-all-cold +# Move jump tables to a separate section +BOLT_ARGS += -jump-tables=move +# Use regular size pages for code alignment +BOLT_ARGS += -no-huge-pages +# Fold functions with identical code +BOLT_ARGS += -icf=1 +# Split using best available strategy (three-way splitting, Cache-Directed Sort) +# Disabled for libjulia-internal till https://github.com/llvm/llvm-project/issues/89508 is fixed +# BOLT_ARGS += -split-strategy=cdsplit +# Update DWARF debug info in the final binary +BOLT_ARGS += -update-debug-sections +# Print optimization statistics +BOLT_ARGS += -dyno-stats +# BOLT doesn't fully support computed gotos, https://github.com/llvm/llvm-project/issues/89117 +# Use escaped regex as the name BOLT recognises is often a bit different, e.g. apply_cl/1(*2) +# This doesn't actually seem to do anything, the actual mitigation is not using --use-old-text +# which we do in the bolt target +BOLT_ARGS += -skip-funcs=.\*apply_cl.\* + +# -fno-reorder-blocks-and-partition is needed on gcc >= 8. +BOLT_FLAGS := $\ + "BOLT_CFLAGS_GCC+=-fno-reorder-blocks-and-partition" $\ + "BOLT_LDFLAGS=-Wl,--emit-relocs" + +STAGE0_BUILD:=$(CURDIR)/toolchain +STAGE1_BUILD:=$(CURDIR)/pgo-instrumented.build +STAGE2_BUILD:=$(CURDIR)/optimized.build + +STAGE0_BINARIES:=$(STAGE0_BUILD)/usr/bin/ +STAGE0_TOOLS:=$(STAGE0_BUILD)/usr/tools/ + +BOLT_PROFILE_DIR:=$(CURDIR)/profiles-bolt +PGO_PROFILE_DIR:=$(CURDIR)/profiles +PGO_PROFILE_FILE:=$(PGO_PROFILE_DIR)/merged.prof +PGO_PROFRAW_FILES:=$(wildcard $(PGO_PROFILE_DIR)/*.profraw) +JULIA_ROOT:=$(CURDIR)/../.. 
+
+LLVM_BOLT:=$(STAGE0_BINARIES)llvm-bolt
+LLVM_MERGEFDATA:=$(STAGE0_BINARIES)merge-fdata
+LLVM_CXXFILT:=$(STAGE0_TOOLS)llvm-cxxfilt
+LLVM_PROFDATA:=$(STAGE0_TOOLS)llvm-profdata
+LLVM_OBJCOPY:=$(STAGE0_TOOLS)llvm-objcopy
+
+# If you add new files to optimize, you need to add BOLT_LDFLAGS and BOLT_CFLAGS to the build of your new file.
+SYMLINKS_TO_OPTIMIZE := libLLVM.so libjulia-internal.so libjulia-codegen.so
+FILES_TO_OPTIMIZE := $(shell for file in $(SYMLINKS_TO_OPTIMIZE); do readlink $(STAGE1_BUILD)/usr/lib/$$file; done)
+
+AFTER_INSTRUMENT_MESSAGE:='Run `make finish_stage2` to finish off the build. $\
+	You can now optionally collect more profiling data by running Julia with an appropriate workload, $\
+	if you wish, run `make clean_profiles` before doing so to remove any profiling data generated by `make finish_stage2`. $\
+	You should end up with some data in $(BOLT_PROFILE_DIR). Afterwards run `make merge_data && make bolt`.'
+
+# When building a single libLLVM.so we need to increase -vp-counters-per-site
+# significantly
+COUNTERS_PER_SITE:=6
+# Note: profile counters are not atomic by default, https://discourse.llvm.org/t/profile-guided-optimization-pgo-related-questions-and-suggestions/75232/5
+
+AFTER_STAGE1_MESSAGE:='You can now optionally collect more profiling data for use in PGO by running Julia $\
+	with an appropriate workload. If you wish, run `make clean_profiles` before doing so to remove any profiling data $\
+	generated by building Julia. You should end up with about 15MB of data in $(PGO_PROFILE_DIR). $\
+	Note that running extensive scripts may result in counter overflows, which can be detected by running $\
+	`make top`. Afterwards run `make stage2`.'
+
+TOOLCHAIN_FLAGS = $\
+	"CC=$(STAGE0_TOOLS)clang" $\
+	"CXX=$(STAGE0_TOOLS)clang++" $\
+	"LD=$(STAGE0_TOOLS)ld.lld" $\
+	"AR=$(STAGE0_TOOLS)llvm-ar" $\
+	"RANLIB=$(STAGE0_TOOLS)llvm-ranlib" $\
+	"CFLAGS+=$(PGO_CFLAGS)" $\
+	"CXXFLAGS+=$(PGO_CXXFLAGS)" $\
+	"LDFLAGS+=$(PGO_LDFLAGS)"
+
+$(STAGE0_BUILD) $(STAGE1_BUILD) $(STAGE2_BUILD):
+	$(MAKE) -C $(JULIA_ROOT) O=$@ configure
+
+stage0: export USE_BINARYBUILDER_LLVM=1
+stage0: | $(STAGE0_BUILD)
+	# Turn [cd]tors into init/fini_array sections in libclang_rt, since lld
+	# doesn't do that, and otherwise the profile constructor is not executed
+	$(MAKE) -C $(STAGE0_BUILD)/deps install-clang install-llvm install-lld install-llvm-tools install-BOLT && \
+	find $(STAGE0_BUILD) -name 'libclang_rt.profile-*.a' -exec $(LLVM_OBJCOPY) --rename-section .ctors=.init_array --rename-section .dtors=.fini_array {} + && \
+	touch $@
+
+$(STAGE1_BUILD): stage0
+stage1: PGO_CFLAGS:=-fprofile-generate=$(PGO_PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
+stage1: PGO_CXXFLAGS:=-fprofile-generate=$(PGO_PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
+stage1: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-generate=$(PGO_PROFILE_DIR)
+stage1: export USE_BINARYBUILDER_LLVM=0
+stage1: | $(STAGE1_BUILD)
+	$(MAKE) -C $(STAGE1_BUILD) $(TOOLCHAIN_FLAGS) && touch $@
+	@echo $(AFTER_STAGE1_MESSAGE)
+
+stage2: PGO_CFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
+stage2: PGO_CXXFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
+stage2: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-use=$(PGO_PROFILE_FILE) -Wl,--icf=safe
+stage2: export USE_BINARYBUILDER_LLVM=0
+stage2: $(PGO_PROFILE_FILE) | $(STAGE2_BUILD)
+	$(MAKE) -C $(STAGE2_BUILD) $(TOOLCHAIN_FLAGS) $(BOLT_FLAGS) julia-src-release julia-symlink julia-libccalltest \
+		julia-libccalllazyfoo julia-libccalllazybar julia-libllvmcalltest && \
+	touch $@
+
+copy_originals: stage2
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
+		cp $$abs_file "$$abs_file.original"; \
+	done && \
+	touch $@
+
+# I don't think there's any particular reason to have -no-huge-pages here, perhaps slightly more accurate profile data
+# as the final build uses -no-huge-pages
+bolt_instrument: copy_originals
+	for file in $(FILES_TO_OPTIMIZE); do \
+		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
+		$(LLVM_BOLT) "$$abs_file.original" -o $$abs_file --instrument --instrumentation-file-append-pid --instrumentation-file="$(BOLT_PROFILE_DIR)/$$file-prof" -no-huge-pages; \
+		mkdir -p $$(dirname "$(BOLT_PROFILE_DIR)/$$file-prof"); \
+		printf "\n"; \
+	done && \
+	touch $@
+	@echo $(AFTER_INSTRUMENT_MESSAGE)
+
+# We don't want to rebuild julia-src as then we lose the bolt instrumentation
+# So we have to manually build the sysimage and package image
+finish_stage2: stage2
+	$(MAKE) -C $(STAGE2_BUILD) julia-base-cache && \
+	$(MAKE) -C $(STAGE2_BUILD) -f sysimage.mk sysimg-release && \
+	$(MAKE) -C $(STAGE2_BUILD) -f pkgimage.mk release
+
+merge_data: bolt_instrument
+	for file in $(FILES_TO_OPTIMIZE); do \
+		profiles=$(BOLT_PROFILE_DIR)/$$file-prof.*.fdata; \
+		$(LLVM_MERGEFDATA) $$profiles > "$(BOLT_PROFILE_DIR)/$$file-prof.merged.fdata"; \
+	done && \
+	touch $@
+
+# The --use-old-text saves about 16 MiB of libLLVM.so size.
+# However, the rust folk found it succeeds very non-deterministically for them.
+# It tries to reuse old text segments to reduce binary size
+# BOLT doesn't fully support computed gotos https://github.com/llvm/llvm-project/issues/89117, so we cannot use --use-old-text on libjulia-internal
+# That flag saves less than 1 MiB for libjulia-internal so oh well.
+bolt: merge_data + for file in $(FILES_TO_OPTIMIZE); do \ + abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \ + $(LLVM_BOLT) "$$abs_file.original" -data "$(BOLT_PROFILE_DIR)/$$file-prof.merged.fdata" -o $$abs_file $(BOLT_ARGS) $$(if [ "$$file" != $(shell readlink $(STAGE2_BUILD)/usr/lib/libjulia-internal.so) ]; then echo "--use-old-text -split-strategy=cdsplit"; fi); \ + done && \ + touch $@ + +clean_profiles: + rm -rf $(PGO_PROFILE_DIR) $(BOLT_PROFILE_DIR) + +clean: + rm -f stage0 stage1 stage2 $(PGO_PROFILE_FILE) bolt copy_originals merge_data bolt_instrument + +restore_originals: copy_originals + for file in $(FILES_TO_OPTIMIZE); do \ + abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \ + cp -P "$$abs_file.original" $$abs_file; \ + done + +delete_originals: copy_originals + for file in $(FILES_TO_OPTIMIZE); do \ + abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \ + rm "$$abs_file.original"; \ + done + +$(PGO_PROFILE_FILE): stage1 $(PGO_PROFRAW_FILES) + $(LLVM_PROFDATA) merge -output=$@ $(PGO_PROFRAW_FILES) + +# show top 50 functions +top: $(PGO_PROFILE_FILE) + $(LLVM_PROFDATA) show --topn=50 $< | $(LLVM_CXXFILT) diff --git a/contrib/pgo-lto-bolt/README.md b/contrib/pgo-lto-bolt/README.md new file mode 100644 index 0000000000000..ab574907c292f --- /dev/null +++ b/contrib/pgo-lto-bolt/README.md @@ -0,0 +1,18 @@ +BOLT only works on x86_64 and aarch64 on Linux. + +DO NOT STRIP THE RESULTING .so FILES, https://github.com/llvm/llvm-project/issues/56738. +If you really need to, try adding `-use-gnu-stack` to `BOLT_ARGS`. + +To build a PGO+LTO+BOLT version of Julia run the following commands (`cd` into this directory first) +```bash +make stage1 +make stage2 +make copy_originals +make bolt_instrument +make finish_stage2 +make merge_data +make bolt +``` +After these commands finish, the optimized version of Julia will be built in the `optimized.build` directory. 
+ +This doesn't align the code to support huge pages as it doesn't seem that we do that currently, this decreases the size of the .so files by 2-4mb. diff --git a/deps/BOLT.mk b/deps/BOLT.mk new file mode 100644 index 0000000000000..70c5d03c762ec --- /dev/null +++ b/deps/BOLT.mk @@ -0,0 +1,118 @@ +## BOLT ## +include $(SRCDIR)/BOLT.version + +ifneq ($(USE_BINARYBUILDER_BOLT), 1) +BOLT_GIT_URL:=https://github.com/llvm/llvm-project.git +BOLT_TAR_URL=https://api.github.com/repos/llvm/llvm-project/tarball/$1 +$(eval $(call git-external,BOLT,BOLT,CMakeLists.txt,,$(SRCCACHE))) + +BOLT_BUILDDIR := $(BUILDDIR)/$(BOLT_SRC_DIR)/build + +LLVM_ENABLE_PROJECTS := bolt + +LLVM_CFLAGS := +LLVM_CXXFLAGS := +LLVM_CPPFLAGS := +LLVM_LDFLAGS := +LLVM_CMAKE := + +LLVM_CMAKE += -DLLVM_ENABLE_PROJECTS="$(LLVM_ENABLE_PROJECTS)" + +# Otherwise LLVM will translate \\ to / on mingw +LLVM_CMAKE += -DLLVM_WINDOWS_PREFER_FORWARD_SLASH=False + +# Allow adding LLVM specific flags +LLVM_CFLAGS += $(CFLAGS) +LLVM_CXXFLAGS += $(CXXFLAGS) +LLVM_CXXFLAGS += $(LLVM_CXXFLAGS) +LLVM_CPPFLAGS += $(CPPFLAGS) +LLVM_LDFLAGS += $(LDFLAGS) +LLVM_LDFLAGS += $(LLVM_LDFLAGS) +LLVM_CMAKE += -DLLVM_TARGETS_TO_BUILD:STRING=host -DCMAKE_BUILD_TYPE=Release +LLVM_CMAKE += -DLLVM_ENABLE_LIBXML2=OFF -DLLVM_HOST_TRIPLE="$(or $(XC_HOST),$(BUILD_MACHINE))" +LLVM_CMAKE += -DLLVM_ENABLE_ZLIB=ON -DZLIB_LIBRARY="$(build_prefix)/lib" + +LLVM_CMAKE += -DLLVM_BINDINGS_LIST="" -DLLVM_ENABLE_BINDINGS=OFF -DLLVM_INCLUDE_DOCS=Off -DLLVM_ENABLE_TERMINFO=Off -DHAVE_LIBEDIT=Off + +ifeq ($(OS), WINNT) +LLVM_CPPFLAGS += -D__USING_SJLJ_EXCEPTIONS__ -D__CRT__NO_INLINE +endif # OS == WINNT +ifneq ($(HOSTCC),$(CC)) +LLVM_CMAKE += -DCROSS_TOOLCHAIN_FLAGS_NATIVE="-DCMAKE_C_COMPILER=$$(which $(HOSTCC));-DCMAKE_CXX_COMPILER=$$(which $(HOSTCXX))" + +# Defaults to off when crosscompiling, starting from LLVM 18 +LLVM_CMAKE += -DBOLT_ENABLE_RUNTIME=ON +endif +ifeq ($(OS), emscripten) +LLVM_CMAKE += 
-DCMAKE_TOOLCHAIN_FILE=$(EMSCRIPTEN)/cmake/Modules/Platform/Emscripten.cmake -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_ENABLE_THREADS=OFF -DLLVM_BUILD_UTILS=OFF +endif # OS == emscripten + +ifneq (,$(filter $(ARCH), powerpc64le ppc64le)) +ifeq (${USECLANG},0) +LLVM_CXXFLAGS += -mminimal-toc +endif +endif + +ifeq ($(fPIC),) +LLVM_CMAKE += -DLLVM_ENABLE_PIC=OFF +endif + +LLVM_CMAKE += -DCMAKE_C_FLAGS="$(LLVM_CPPFLAGS) $(LLVM_CFLAGS)" \ + -DCMAKE_CXX_FLAGS="$(LLVM_CPPFLAGS) $(LLVM_CXXFLAGS)" +ifeq ($(OS),Darwin) +# Explicitly use the default for -mmacosx-version-min=10.9 and later +LLVM_CMAKE += -DLLVM_ENABLE_LIBCXX=ON +endif + +LLVM_CMAKE += -DCMAKE_EXE_LINKER_FLAGS="$(LLVM_LDFLAGS)" \ + -DCMAKE_SHARED_LINKER_FLAGS="$(LLVM_LDFLAGS)" + +ifeq ($(USE_SYSTEM_ZLIB), 0) +$(BOLT_BUILDDIR)/build-configured: | $(build_prefix)/manifest/zlib +endif + +$(BOLT_BUILDDIR)/build-configured: $(SRCCACHE)/$(BOLT_SRC_DIR)/source-extracted + mkdir -p $(dir $@) + cd $(dir $@) && \ + $(CMAKE) $(SRCCACHE)/$(BOLT_SRC_DIR)/llvm $(CMAKE_GENERATOR_COMMAND) $(CMAKE_COMMON) $(LLVM_CMAKE) \ + || { echo '*** To install a newer version of cmake, run contrib/download_cmake.sh ***' && false; } + echo 1 > $@ + +$(BOLT_BUILDDIR)/build-compiled: $(BOLT_BUILDDIR)/build-configured + cd $(BOLT_BUILDDIR) && \ + $(if $(filter $(CMAKE_GENERATOR),make), \ + $(MAKE), \ + $(CMAKE) --build . --target bolt) + echo 1 > $@ + +$(BOLT_BUILDDIR)/build-checked: $(BOLT_BUILDDIR)/build-compiled +ifeq ($(OS),$(BUILD_OS)) + cd $(BOLT_BUILDDIR) && \ + $(CMAKE) --build . 
--target check-bolt +endif + echo 1 > $@ + +BOLT_INSTALL = \ + cd $1 && mkdir -p $2$$(build_depsbindir) && \ + $$(CMAKE) -DCMAKE_INSTALL_PREFIX="$2$$(build_prefix)" -P tools/bolt/cmake_install.cmake + +$(eval $(call staged-install, \ + bolt,$$(BOLT_SRC_DIR)/build, \ + BOLT_INSTALL,,,)) + +clean-bolt: + -rm -f $(BOLT_BUILDDIR)/build-configured $(BOLT_BUILDDIR)/build-compiled + -$(MAKE) -C $(BOLT_BUILDDIR) clean + +get-bolt: $(BOLT_SRC_FILE) +extract-bolt: $(SRCCACHE)/$(BOLT_SRC_DIR)/source-extracted +configure-bolt: $(BOLT_BUILDDIR)/build-configured +compile-bolt: $(BOLT_BUILDDIR)/build-compiled +fastcheck-bolt: #none +check-bolt: $(BOLT_BUILDDIR)/build-checked + +else # USE_BINARYBUILDER_BOLT + +$(eval $(call bb-install,BOLT,BOLT,false,true)) + +endif # USE_BINARYBUILDER_BOLT diff --git a/deps/BOLT.version b/deps/BOLT.version new file mode 100644 index 0000000000000..6a785041e163f --- /dev/null +++ b/deps/BOLT.version @@ -0,0 +1,11 @@ +# -*- makefile -*- + +BOLT_VER := 18.1.4 +BOLT_JLL_VER := 18.1.4+0 + +## jll artifact +BOLT_JLL_NAME := BOLT + +## source build +BOLT_BRANCH=llvmorg-$(BOLT_VER) +BOLT_SHA1=e6c3289804a67ea0bb6a86fadbe454dd93b8d855 diff --git a/deps/Makefile b/deps/Makefile index 83493981a40b4..2f9050f448d67 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -231,6 +231,7 @@ distcleanall: $(addprefix distclean-, $(DEP_LIBS_ALL)) rm -rf $(build_prefix) getall: $(addprefix get-, $(DEP_LIBS_ALL)) +include $(SRCDIR)/BOLT.mk include $(SRCDIR)/csl.mk include $(SRCDIR)/sanitizers.mk include $(SRCDIR)/ittapi.mk diff --git a/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/md5 b/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/md5 new file mode 100644 index 0000000000000..62e63ff3174d6 --- /dev/null +++ b/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/md5 @@ -0,0 +1 @@ +c12540d5889cef05bc87183a4ce5a54c diff --git a/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/sha512 
b/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/sha512 new file mode 100644 index 0000000000000..0635e180ac9a5 --- /dev/null +++ b/deps/checksums/BOLT.v18.1.4+0.x86_64-linux-gnu-cxx11.tar.gz/sha512 @@ -0,0 +1 @@ +61cc7cc42b925f37502eed0d31eafadbfdc24a9ebc892c9b8d96a27b004cbccf2e5da7face5c8d9c9db57fac1b5cf662d890a67337436c5d4aa3373256638ab1 diff --git a/deps/llvm.mk b/deps/llvm.mk index 7ead1dab1d925..5e73c18f53289 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -86,10 +86,11 @@ endif LLVM_CMAKE += -DLLVM_WINDOWS_PREFER_FORWARD_SLASH=False # Allow adding LLVM specific flags -LLVM_CFLAGS += $(CFLAGS) -LLVM_CXXFLAGS += $(CXXFLAGS) +LLVM_CFLAGS += $(CFLAGS) $(BOLT_CFLAGS) +LLVM_CXXFLAGS += $(CXXFLAGS) $(BOLT_CFLAGS) LLVM_CPPFLAGS += $(CPPFLAGS) LLVM_LDFLAGS += $(LDFLAGS) +LLVM_LDFLAGS += $(BOLT_LDFLAGS) LLVM_CMAKE += -DLLVM_TARGETS_TO_BUILD:STRING="$(LLVM_TARGETS)" -DCMAKE_BUILD_TYPE="$(LLVM_CMAKE_BUILDTYPE)" LLVM_CMAKE += -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD:STRING="$(LLVM_EXPERIMENTAL_TARGETS)" LLVM_CMAKE += -DLLVM_ENABLE_LIBXML2=OFF -DLLVM_HOST_TRIPLE="$(or $(XC_HOST),$(BUILD_MACHINE))" diff --git a/src/Makefile b/src/Makefile index e29c56a6cba9c..4da44a8cc8d81 100644 --- a/src/Makefile +++ b/src/Makefile @@ -391,7 +391,7 @@ $(BUILDDIR)/julia.expmap: $(SRCDIR)/julia.expmap.in $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \ - $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) + $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ @@ -419,7 +419,7 @@ libjulia-internal-debug libjulia-internal-release: 
$(PUBLIC_HEADER_TARGETS) $(build_shlibdir)/libjulia-codegen.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(CODEGEN_OBJS) $(BUILDDIR)/support/libsupport.a $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(CODEGEN_OBJS) $(RPATH_LIB) -o $@ \ - $(JLDFLAGS) $(JLIBLDFLAGS) $(CG_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-codegen.$(JL_MAJOR_SHLIB_EXT))) + $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(CG_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-codegen.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-codegen.$(SHLIB_EXT) $@ $(DSYMUTIL) $@