From abd7e4526a35d34b6ab088543838c8015ccb5f80 Mon Sep 17 00:00:00 2001
From: scottmcm <scottmcm@users.noreply.github.com>
Date: Mon, 13 Nov 2023 18:06:21 -0800
Subject: [PATCH] Optimize `Entity::eq` (#10519)

(This is my first PR here, so I've probably missed some things. Please
let me know what else I should do to help you as a reviewer!)

# Objective

Due to https://github.com/rust-lang/rust/issues/117800, the `derive`'d
`PartialEq::eq` on `Entity` isn't as good as it could be. Since that's
used in hashtable lookup, let's improve it.

## Solution

The derived `PartialEq::eq` short-circuits if the generation doesn't
match. However, having a branch there is sub-optimal, especially on
64-bit systems like x64 that could just load the whole `Entity` in one
load anyway.

Due to complications around `poison` in LLVM and the exact details of
what unsafe code is allowed to do with references in Rust
(https://github.com/rust-lang/unsafe-code-guidelines/issues/346), LLVM
isn't allowed to completely remove the short-circuiting. `&Entity` is
marked `dereferenceable(8)` so LLVM knows it's allowed to *load* all 8
bytes -- and does so -- but it has to assume that the `index` might be
undef/poison if the `generation` doesn't match, and thus while it finds
a way to do it without needing a branch, it has to do something slightly
more complicated than optimal to combine the results. (LLVM is allowed
to change non-short-circuiting code to use branches, but not the other
way around.)

Here's a link showing the codegen today:
<https://rust.godbolt.org/z/9WzjxrY7c>
```rust
#[no_mangle]
pub fn demo_eq_ref(a: &Entity, b: &Entity) -> bool {
    a == b
}
```
ends up generating the following assembly:
```asm
demo_eq_ref:
        movq    xmm0, qword ptr [rdi]
        movq    xmm1, qword ptr [rsi]
        pcmpeqd xmm1, xmm0
        pshufd  xmm0, xmm1, 80
        movmskpd        eax, xmm0
        cmp     eax, 3
        sete    al
        ret
```
(It's usually not this bad in real uses after inlining and LTO, but it
makes a strong demo.)

This PR manually implements `PartialEq::eq` *without* short-circuiting,
and because that tells LLVM that neither the generations nor the index
can be poison, it doesn't need to be so careful and can generate the
"just compare the two 64-bit values" code you'd have probably already
expected:
```asm
demo_eq_ref:
        mov     rax, qword ptr [rsi]
        cmp     qword ptr [rdi], rax
        sete    al
        ret
```

Since this doesn't change the representation of `Entity`, if it's
instead passed by *value*, then each `Entity` is two `u32` registers,
and the old and the new code do exactly the same thing. (Other
approaches, like changing `Entity` to be `[u32; 2]` or `u64`, affect
this case.)

This should hopefully merge easily with changes like
https://github.com/bevyengine/bevy/pull/9907 that also want to change
`Entity`.

## Benchmarks

I'm not super-confident that I got my machine fully consistent for
benchmarking, but whether I run the old or the new one first I get
reasonably consistent results.

Here's a fairly typical example of the benchmarks I added in this PR:

![image](https://github.com/bevyengine/bevy/assets/18526288/24226308-4616-4082-b0ff-88fc06285ef1)

Building the sets seems to be basically the same. It's usually reported
as noise, but sometimes I see a few percent slower or faster.

But lookup hits in particular -- since a hit checks that the key is
equal -- consistently show around 10% improvement.

`cargo run --example many_cubes --features bevy/trace_tracy --release --
--benchmark` showed as slightly faster with this change, though if I had
to bet I'd probably say it's more noise than meaningful (but at least
it's not worse either):

![image](https://github.com/bevyengine/bevy/assets/18526288/58bb8c96-9c45-487f-a5ab-544bbfe9fba0)

This is my first PR here -- and my first time running Tracy -- so please
let me know what else I should run, or run things on your own more
reliable machines to double-check.

---

## Changelog

(probably not worth including)

Changed: micro-optimized `Entity::eq` to help LLVM slightly.

## Migration Guide

(I really hope nobody was using this on uninitialized entities where
sufficiently tortured `unsafe` could technically notice that this
has changed.)
---
 benches/Cargo.toml                        |  5 ++
 benches/benches/bevy_utils/entity_hash.rs | 75 +++++++++++++++++++++++
 crates/bevy_ecs/src/entity/mod.rs         | 37 ++++++++++-
 3 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 benches/benches/bevy_utils/entity_hash.rs

diff --git a/benches/Cargo.toml b/benches/Cargo.toml
index f060d83ebfda8..b78352a55dabe 100644
--- a/benches/Cargo.toml
+++ b/benches/Cargo.toml
@@ -61,3 +61,8 @@ harness = false
 name = "bezier"
 path = "benches/bevy_math/bezier.rs"
 harness = false
+
+[[bench]]
+name = "utils"
+path = "benches/bevy_utils/entity_hash.rs"
+harness = false
diff --git a/benches/benches/bevy_utils/entity_hash.rs b/benches/benches/bevy_utils/entity_hash.rs
new file mode 100644
index 0000000000000..fa83ee3950d47
--- /dev/null
+++ b/benches/benches/bevy_utils/entity_hash.rs
@@ -0,0 +1,75 @@
+use bevy_ecs::entity::Entity;
+use bevy_utils::EntityHashSet;
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+criterion_group!(benches, entity_set_build_and_lookup,);
+criterion_main!(benches);
+
+const SIZES: [usize; 5] = [100, 316, 1000, 3162, 10000];
+
+fn make_entity(rng: &mut impl Rng, size: usize) -> Entity {
+    // -log₂(1-x) gives an exponential distribution with median 1.0
+    // That lets us get values that are mostly small, but some are quite large
+    // * For ids, half are in [0, size), half are unboundedly larger.
+    // * For generations, half are in [0, 2), half are unboundedly larger.
+
+    let x: f64 = rng.gen();
+    let id = -(1.0 - x).log2() * (size as f64);
+    let x: f64 = rng.gen();
+    let gen = -(1.0 - x).log2() * 2.0;
+
+    // this is not reliable, but we're internal so a hack is ok
+    let bits = ((gen as u64) << 32) | (id as u64);
+    let e = Entity::from_bits(bits);
+    assert_eq!(e.index(), id as u32);
+    assert_eq!(e.generation(), gen as u32);
+    e
+}
+
+fn entity_set_build_and_lookup(c: &mut Criterion) {
+    let mut group = c.benchmark_group("entity_hash");
+    for size in SIZES {
+        // Get some random-but-consistent entities to use for all the benches below.
+        let mut rng = ChaCha8Rng::seed_from_u64(size as u64);
+        let entities = Vec::from_iter(
+            std::iter::repeat_with(|| make_entity(&mut rng, size)).take(size),
+        );
+
+        group.throughput(Throughput::Elements(size as u64));
+        group.bench_function(
+            BenchmarkId::new("entity_set_build", size),
+            |bencher| {
+                bencher.iter_with_large_drop(|| EntityHashSet::from_iter(entities.iter().copied()));
+            },
+        );
+        group.bench_function(
+            BenchmarkId::new("entity_set_lookup_hit", size),
+            |bencher| {
+                let set = EntityHashSet::from_iter(entities.iter().copied());
+                bencher.iter(|| entities.iter().copied().filter(|e| set.contains(e)).count());
+            },
+        );
+        group.bench_function(
+            BenchmarkId::new("entity_set_lookup_miss_id", size),
+            |bencher| {
+                let set = EntityHashSet::from_iter(entities.iter().copied());
+                bencher.iter(|| entities.iter()
+                    .copied()
+                    .map(|e| Entity::from_bits(e.to_bits() + 1))
+                    .filter(|e| set.contains(e)).count());
+            },
+        );
+        group.bench_function(
+            BenchmarkId::new("entity_set_lookup_miss_gen", size),
+            |bencher| {
+                let set = EntityHashSet::from_iter(entities.iter().copied());
+                bencher.iter(|| entities.iter()
+                    .copied()
+                    .map(|e| Entity::from_bits(e.to_bits() + (1 << 32)))
+                    .filter(|e| set.contains(e)).count());
+            },
+        );
+    }
+}
diff --git a/crates/bevy_ecs/src/entity/mod.rs b/crates/bevy_ecs/src/entity/mod.rs
index 677477680f10c..e0bbd568afd80 100644
--- a/crates/bevy_ecs/src/entity/mod.rs
+++ b/crates/bevy_ecs/src/entity/mod.rs
@@ -115,12 +115,21 @@ type IdCursor = isize;
 /// [`EntityCommands`]: crate::system::EntityCommands
 /// [`Query::get`]: crate::system::Query::get
 /// [`World`]: crate::world::World
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Eq, Ord, PartialOrd)]
 pub struct Entity {
     generation: u32,
     index: u32,
 }
 
+// By not short-circuiting in comparisons, we get better codegen.
+// See <https://github.com/rust-lang/rust/issues/117800>
+impl PartialEq for Entity {
+    #[inline]
+    fn eq(&self, other: &Entity) -> bool {
+        (self.generation == other.generation) & (self.index == other.index)
+    }
+}
+
 impl Hash for Entity {
     #[inline]
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
@@ -917,4 +926,30 @@ mod tests {
         assert_eq!(next_entity.index(), entity.index());
         assert!(next_entity.generation > entity.generation + GENERATIONS);
     }
+
+    #[test]
+    fn entity_comparison() {
+        // This is intentionally testing `lt` and `ge` as separate functions.
+        #![allow(clippy::nonminimal_bool)]
+
+        assert!(Entity::new(123, 456) == Entity::new(123, 456));
+        assert!(Entity::new(123, 789) != Entity::new(123, 456));
+        assert!(Entity::new(123, 456) != Entity::new(123, 789));
+        assert!(Entity::new(123, 456) != Entity::new(456, 123));
+
+        // ordering is by generation then by index
+
+        assert!(Entity::new(123, 456) >= Entity::new(123, 456));
+        assert!(Entity::new(123, 456) <= Entity::new(123, 456));
+        assert!(!(Entity::new(123, 456) < Entity::new(123, 456)));
+        assert!(!(Entity::new(123, 456) > Entity::new(123, 456)));
+
+        assert!(Entity::new(9, 1) < Entity::new(1, 9));
+        assert!(Entity::new(1, 9) > Entity::new(9, 1));
+
+        assert!(Entity::new(1, 1) < Entity::new(2, 1));
+        assert!(Entity::new(1, 1) <= Entity::new(2, 1));
+        assert!(Entity::new(2, 2) > Entity::new(1, 2));
+        assert!(Entity::new(2, 2) >= Entity::new(1, 2));
+    }
 }