scx: Branchless implementation of highest_bit #208

vax-r · 2024-05-21T07:30:57Z

Summary

The original implementation of highest_bit() uses fls() to find the most significant set bit of its input parameter flags. Normally the mask can be returned as 1 << (fls(flags) - 1), but fls(flags) returns 0 when flags is 0, which turns the expression into 1 << (-1); shifting by a negative count is undefined behavior in C. The original code therefore uses a branch to check whether fls(flags) returned 0.

The branch can be removed by first shifting 1 left by fls(flags) bits and then shifting the result right by 1 bit. When fls(flags) is 0, the expression is (1 << 0) >> 1, which simply evaluates to 0 without any error. For every non-zero input, the result is the same as 1 << (fls(flags) - 1).

This implementation avoids any possible branch misprediction, and pure shift operations are cheaper than the branch generated for an if-else statement.

Experiments

Correctness

To check the correctness of my implementation, I used the following userspace program to test whether the two implementations produce exactly the same output for every u32 value (u32 is usually unsigned int) from 0x00000000 to 0x88888888. Every value at or above 0x80000000 has bit 31 as its highest set bit, so this range exercises every possible result.

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit in a 32-bit word.
 * @x: value to inspect.
 *
 * Returns the 1-based index of the highest set bit (1 for bit 0, 32 for
 * bit 31), or 0 when @x is 0.
 *
 * The search halves the window on every stage: if the upper part selected by
 * the stage's mask is empty, the value is shifted up by the stage's width and
 * the candidate position is lowered by the same amount.
 */
int generic_fls(unsigned int x)
{
	/* Masks selecting the top 16, 8, 4, 2 and 1 bits, in search order. */
	static const unsigned int upper_mask[] = {
		0xffff0000u, 0xff000000u, 0xf0000000u, 0xc0000000u, 0x80000000u
	};
	int position = 32;
	int span = 16;
	unsigned int stage;

	if (x == 0)
		return 0;

	for (stage = 0; stage < sizeof(upper_mask) / sizeof(upper_mask[0]); stage++) {
		if ((x & upper_mask[stage]) == 0) {
			x <<= span;
			position -= span;
		}
		span /= 2;
	}

	return position;
}

#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of @flags (branch version).
 * @flags: bit mask to inspect.
 *
 * Returns a mask containing only the highest set bit of @flags, or 0 when
 * @flags is 0 (fls() reports 0 in that case, so the shift is skipped).
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * Shift an unsigned 1: with bit == 32 the signed form "1 << 31" does
	 * not fit in int and is undefined behaviour.
	 */
	return bit ? 1U << (bit - 1) : 0;
}

/*
 * highest_bit_new - isolate the most significant set bit of @flags without
 * branching on the fls() result.
 * @flags: bit mask to inspect.
 *
 * Returns a mask containing only the highest set bit of @flags, or 0 when
 * @flags is 0: fls() then yields 0 and (1 << 0) >> 1 evaluates to 0.
 */
static unsigned highest_bit_new(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * The constant must be wider than 32 bits because bit can be 32.
	 * unsigned long is only 32 bits on some targets (e.g. -m32 builds),
	 * where "1UL << 32" is undefined; unsigned long long is always at
	 * least 64 bits wide.
	 */
	return (1ULL << bit) >> 1;
}

/*
 * Compare highest_bit() against highest_bit_new() for every value from 0 up
 * to and including 0x88888888.  A line is printed for each mismatch; nothing
 * is printed when the two implementations agree.
 */
int main(void)
{
	const unsigned last = 0x88888888;
	unsigned value = 0;

	for (;;) {
		unsigned expected = highest_bit(value);
		unsigned actual = highest_bit_new(value);

		if (expected != actual)
			printf("ans : %u, ans_2 : %u\n", expected, actual);

		/* Stop after the final value has been checked. */
		if (value == last)
			break;
		value++;
	}

	return 0;
}

Compile the program and execute

$ gcc -o out main.c
$ ./out

Test passed! The program printed no mismatches, so the two implementations agree on every tested value.

Performance

Change the userspace program a bit so that it only tests a subset of the unsigned range (so that a single execution does not take too much time), and use perf stat to observe the performance of the two implementations.

Branch version

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit in a 32-bit word.
 * @x: value to inspect.
 *
 * Returns the 1-based index of the highest set bit (1 for bit 0, 32 for
 * bit 31), or 0 when @x is 0.  The masks below cover bits 0..31.
 */
int generic_fls(unsigned int x)
{
	/* Start by assuming the top bit (position 32) is set. */
	int r = 32;

	if (!x)
		return 0;
	/* Upper 16 bits empty: move the lower half up, answer drops by 16. */
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	/* Repeat with the top 8 bits of what remains. */
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	/* Top 4 bits. */
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	/* Top 2 bits. */
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	/* Top bit. */
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of @flags (branch version).
 * @flags: bit mask to inspect.
 *
 * Returns a mask containing only the highest set bit of @flags, or 0 when
 * @flags is 0 (fls() reports 0 in that case, so the shift is skipped).
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * Shift an unsigned 1: inputs with bit 31 set give bit == 32, and the
	 * signed form "1 << 31" does not fit in int, which is undefined
	 * behaviour.
	 */
	return bit ? 1U << (bit - 1) : 0;
}

/*
 * Benchmark driver for the branch version: calls highest_bit() once for
 * every value in [0x70000000, 0x80000008) and discards each result.
 */
int main(void) {

    /*
     * NOTE(review): `ans` is never read, so an optimizing build could
     * presumably drop the call entirely - confirm before benchmarking with
     * optimization flags enabled.
     */
    for (unsigned i = 0x70000000; i < 0x80000008; i++) {
	unsigned ans = highest_bit(i);
    }

    return 0;
}

Compile and use perf to observe the performance.

$ gcc -o out main.c
$ sudo perf stat --repeat 5 ./out

 Performance counter stats for './out' (5 runs):

            809.51 msec task-clock                       #    1.000 CPUs utilized               ( +-  2.03% )
                 2      context-switches                 #    2.471 /sec                        ( +- 25.50% )
                 0      cpu-migrations                   #    0.000 /sec                      
                50      page-faults                      #   61.766 /sec                        ( +-  0.40% )
      42,4302,2494      cycles                           #    5.241 GHz                         ( +-  0.04% )  (82.95%)
         1827,2293      stalled-cycles-frontend          #    0.43% frontend cycles idle        ( +-  0.71% )  (83.29%)
          158,1598      stalled-cycles-backend           #    0.04% backend cycles idle         ( +- 27.60% )  (83.45%)
     155,7262,2684      instructions                     #    3.67  insn per cycle            
                                                  #    0.00  stalled cycles per insn     ( +-  0.03% )  (83.46%)
      34,9005,2738      branches                         #    4.311 G/sec                       ( +-  0.04% )  (83.45%)
            5,5183      branch-misses                    #    0.00% of all branches             ( +-  7.62% )  (83.40%)

            0.8099 +- 0.0165 seconds time elapsed  ( +-  2.04% )

Branchless version

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit in a 32-bit word.
 * @x: value to inspect.
 *
 * Returns the 1-based index of the highest set bit (1 for bit 0, 32 for
 * bit 31), or 0 when @x is 0.  The masks below cover bits 0..31.
 */
int generic_fls(unsigned int x)
{
	/* Candidate position; lowered each time an upper window is empty. */
	int r = 32;

	if (!x)
		return 0;
	/* Nothing in the upper 16 bits: shift the value up and subtract 16. */
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	/* Same test on the top 8 bits of the shifted value. */
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	/* Top 4 bits. */
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	/* Top 2 bits. */
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	/* Top bit. */
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of @flags without
 * branching on the fls() result.
 * @flags: bit mask to inspect.
 *
 * Returns a mask containing only the highest set bit of @flags, or 0 when
 * @flags is 0: fls() then yields 0 and (1 << 0) >> 1 evaluates to 0.
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * The constant must be wider than 32 bits because bit can be 32.
	 * unsigned long is only 32 bits on some targets (e.g. -m32 builds),
	 * where "1UL << 32" is undefined; unsigned long long is always at
	 * least 64 bits wide.
	 */
	return (1ULL << bit) >> 1;
}

/*
 * Benchmark driver for the branchless version: calls highest_bit() once for
 * every value in [0x70000000, 0x80000008) and discards each result.
 */
int main(void) {

    /*
     * NOTE(review): `ans` is never read, so an optimizing build could
     * presumably drop the call entirely - confirm before benchmarking with
     * optimization flags enabled.
     */
    for (unsigned i = 0x70000000; i < 0x80000008; i++) {
	unsigned ans = highest_bit(i);
    }

    return 0;
}

Compile and use perf to observe the performance.

$ gcc -o out main.c
$ sudo perf stat --repeat 5 ./out

 Performance counter stats for './out' (5 runs):

            726.41 msec task-clock                       #    1.000 CPUs utilized               ( +-  2.24% )
                 1      context-switches                 #    1.377 /sec                        ( +- 37.42% )
                 0      cpu-migrations                   #    0.000 /sec                      
                50      page-faults                      #   68.832 /sec                        ( +-  0.40% )
      37,8968,1629      cycles                           #    5.217 GHz                         ( +-  0.02% )  (83.25%)
          250,0461      stalled-cycles-frontend          #    0.07% frontend cycles idle        ( +-  2.88% )  (83.25%)
           91,6112      stalled-cycles-backend           #    0.02% backend cycles idle         ( +- 11.66% )  (83.26%)
     147,6344,7563      instructions                     #    3.90  insn per cycle            
                                                  #    0.00  stalled cycles per insn     ( +-  0.01% )  (83.25%)
      29,5408,1409      branches                         #    4.067 G/sec                       ( +-  0.01% )  (83.54%)
            4,8595      branch-misses                    #    0.00% of all branches             ( +-  2.67% )  (83.45%)

            0.7268 +- 0.0163 seconds time elapsed  ( +-  2.25% )

We can see significant improvements in cycles and stalled-cycles-frontend, and especially in branches and branch-misses.
The test ran on an x86_64 AMD Ryzen 7 7700X 8-Core Processor; the operating system is Ubuntu 22.04.4 LTS.

htejun · 2024-05-21T20:24:04Z

Thanks for the thorough PR but this unfortunately would break on 32bit builds as the highest bit would be lost in 32bit unsigned long.

vax-r · 2024-05-22T01:49:48Z

> Thanks for the thorough PR but this unfortunately would break on 32bit builds as the highest bit would be lost in 32bit unsigned long.

Oh, I see. What do you think about converting the integer constant 1 from u32 to u64? If that's acceptable, I'll perform the same experiment and see whether the performance is still better.
I mean something like this:

((u64) 1 << bit) >> 1

htejun · 2024-05-22T16:57:10Z

Yeah, that should work. Can you verify it on 32bit builds too? That's probably going to be slower but as long as it's not drastic, that should be okay.

vax-r · 2024-05-23T04:20:11Z

> Yeah, that should work. Can you verify it on 32bit builds too? That's probably going to be slower but as long as it's not drastic, that should be okay.

Sorry, I don't really understand what you mean by 32-bit builds. Do you mean recompiling the kernel with a 32-bit config and testing the correctness inside the 32-bit kernel?
If you mean testing on a bare-metal 32-bit machine, that's not available to me currently, as I only have an x86_64 machine in my lab.

htejun · 2024-05-23T06:48:55Z

You can just build the test binary with -m32 to verify the correctness on 32bit.

The original implementation of highest_bit() uses "fls()" to find the most significant set bit of its input parameter "flags". Normally the mask can be returned as "1 << (fls(flags) - 1)", but "fls(flags)" returns 0 when "flags" is 0, which turns the expression into "1 << (-1)"; shifting by a negative count is undefined behavior in C. The original code therefore uses a branch to check whether "fls(flags)" returned 0. The branch can be removed by first shifting 1 left by "fls(flags)" bits and then shifting the result right by 1 bit. When "fls(flags)" is 0, the expression simply evaluates to 0 without any error. For every non-zero input, the result is the same as "1 << (fls(flags) - 1)". This implementation avoids any possible branch misprediction, and pure shift operations are cheaper than the branch generated for an if-else statement.

vax-r · 2024-05-23T08:13:02Z

> You can just build the test binary with -m32 to verify the correctness on 32bit.

I just built the proposed implementation with the -m32 option and the build was successful. I also ran the test script .github/workflows/run-schedulers locally without error. Let me know if any further checks are needed, thanks.

vax-r · 2024-05-23T08:28:56Z

I also ran the comparison test again; the results are shown below. By the way, this time I iterated over the u32 values from 0x00000000 to 0x88888888 for a complete check.

Branch version

 Performance counter stats for './out' (5 runs):

         1,2084.96 msec task-clock                       #    1.000 CPUs utilized               ( +-  0.06% )
                43      context-switches                 #    3.558 /sec                        ( +-  5.72% )
                 3      cpu-migrations                   #    0.248 /sec                        ( +- 22.61% )
                53      page-faults                      #    4.386 /sec                        ( +-  1.13% )
     542,6104,9686      cycles                           #    4.490 GHz                         ( +-  0.02% )
    1151,4982,4346      instructions                     #    2.12  insn per cycle              ( +-  0.00% )
     297,9091,6871      branches                         #    2.465 G/sec                       ( +-  0.00% )
           22,3366      branch-misses                    #    0.00% of all branches             ( +-  1.28% )

          12.08519 +- 0.00751 seconds time elapsed  ( +-  0.06% )

Branch-less version

 Performance counter stats for './out' (5 runs):

         1,1723.60 msec task-clock                       #    1.000 CPUs utilized               ( +-  0.07% )
                61      context-switches                 #    5.203 /sec                        ( +-  6.99% )
                 2      cpu-migrations                   #    0.171 /sec                        ( +- 36.74% )
                53      page-faults                      #    4.521 /sec                        ( +-  0.92% )
     525,9809,7637      cycles                           #    4.487 GHz                         ( +-  0.01% )
    1082,7544,6816      instructions                     #    2.06  insn per cycle              ( +-  0.00% )
     252,0922,3734      branches                         #    2.150 G/sec                       ( +-  0.00% )
           20,6333      branch-misses                    #    0.00% of all branches             ( +-  1.77% )

          11.72393 +- 0.00828 seconds time elapsed  ( +-  0.07% )

Improvements can be seen in task-clock, cycles, instructions, branches, and branch-misses, especially in branches and branch-misses, which are of most concern in this case.

vax-r force-pushed the Branchless_highest_bit branch from e24cf06 to 52514a5 Compare May 21, 2024 07:45

vax-r force-pushed the Branchless_highest_bit branch from 52514a5 to baff66e Compare May 23, 2024 08:10

htejun approved these changes May 23, 2024

View reviewed changes

htejun merged commit adadcf3 into sched-ext:sched_ext May 23, 2024
1 check passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

scx: Branchless implementation of highest_bit #208

scx: Branchless implementation of highest_bit #208

vax-r commented May 21, 2024 •

edited

Loading

htejun commented May 21, 2024

vax-r commented May 22, 2024

htejun commented May 22, 2024

vax-r commented May 23, 2024

htejun commented May 23, 2024

vax-r commented May 23, 2024 •

edited

Loading

vax-r commented May 23, 2024

scx: Branchless implementation of highest_bit #208

scx: Branchless implementation of highest_bit #208

Conversation

vax-r commented May 21, 2024 • edited Loading

Summary

Experiments

Correctness

Performance

Branch version

Branchless version

htejun commented May 21, 2024

vax-r commented May 22, 2024

htejun commented May 22, 2024

vax-r commented May 23, 2024

htejun commented May 23, 2024

vax-r commented May 23, 2024 • edited Loading

vax-r commented May 23, 2024

Branch version

Branch-less version

vax-r commented May 21, 2024 •

edited

Loading

vax-r commented May 23, 2024 •

edited

Loading