Skip to content
This repository has been archived by the owner on Jun 18, 2024. It is now read-only.

scx: Branchless implementation of highest_bit #208

Merged
merged 1 commit into from
May 23, 2024

Conversation

vax-r
Copy link
Contributor

@vax-r vax-r commented May 21, 2024

Summary

The original implementation of highest_bit uses fls() to find the most significant set bit of the input parameter flags. Normally we could return the mask as 1 << (fls(flags) - 1), but fls(flags) returns 0 when flags is 0, which would turn the expression into 1 << (-1), and shifting by a negative count is illegal. So the original code uses a branch to check whether fls(flags) returned 0.

We can remove the branch by first left-shifting 1 by fls(flags) bits and then right-shifting the result by 1 bit. When fls(flags) is 0, this expression simply evaluates to 0 without any error. For every non-zero value, the result is the same as 1 << (fls(flags) - 1).

This implementation avoids any possible branch misprediction, and pure shift operations are cheaper than the branch needed for an if-else statement.

Experiments

Correctness

To check the correctness of my implementation, I used the following userspace program to test whether the two implementations produce exactly the same output for every number in the range of u32 (which is usually unsigned int).

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit of a 32-bit value.
 *
 * Returns the 1-based position of the highest set bit in x, so
 * generic_fls(1) == 1 and generic_fls(0x80000000) == 32.
 * Returns 0 when x is 0.
 */
int generic_fls(unsigned int x)
{
	/*
	 * Binary search over the word: each step looks at the top `width`
	 * bits of the 32-bit window and halves the remaining search space.
	 */
	static const struct {
		unsigned int top_mask;
		int width;
	} steps[] = {
		{ 0xffff0000u, 16 },
		{ 0xff000000u,  8 },
		{ 0xf0000000u,  4 },
		{ 0xc0000000u,  2 },
		{ 0x80000000u,  1 },
	};
	int pos = 32;
	unsigned int k;

	if (x == 0)
		return 0;

	for (k = 0; k < sizeof(steps) / sizeof(steps[0]); k++) {
		if (x & steps[k].top_mask)
			continue;
		/* Top bits are clear: slide the value up and lower the answer. */
		x <<= steps[k].width;
		pos -= steps[k].width;
	}
	return pos;
}

/* Route fls() to the portable generic_fls() implementation in this file. */
#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of flags.
 *
 * Returns a mask containing only the highest set bit of flags, or 0 when
 * flags is 0. This is the branch-based reference implementation.
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * fls() yields 0 for flags == 0, and a shift by -1 is undefined, hence
	 * the branch. The shifted constant is unsigned: with a plain int,
	 * bit == 32 would evaluate 1 << 31, which overflows int and is
	 * undefined behavior.
	 */
	return bit ? 1U << (bit - 1) : 0;
}

/*
 * highest_bit_new - branchless variant of highest_bit().
 *
 * Shifts 1 left by fls(flags) and then right by one. For flags == 0 this is
 * (1 << 0) >> 1 == 0; otherwise it equals 1 << (fls(flags) - 1).
 *
 * The intermediate value needs up to 33 bits (bit == 32 when the top bit of
 * flags is set), so the shift is done in unsigned long long, which is at
 * least 64 bits wide. Using unsigned long would be undefined on targets
 * where that type is only 32 bits (e.g. builds with -m32).
 */
static unsigned highest_bit_new(unsigned flags)
{
	int bit = fls(flags);

	return (unsigned)((1ULL << bit) >> 1);
}

/*
 * Compare highest_bit() and highest_bit_new() for every unsigned value,
 * from 0 through UINT_MAX inclusive. Each mismatch is printed; the exit
 * status is non-zero if any mismatch was found, so a silent run with
 * status 0 means the two implementations agree everywhere.
 */
int main(void)
{
	int mismatch_seen = 0;
	unsigned i = 0;

	/*
	 * A do/while with a post-increment test covers UINT_MAX itself
	 * without looping forever: the loop ends right after the iteration
	 * for UINT_MAX (the unsigned wrap of i to 0 is well defined).
	 */
	do {
		unsigned ans = highest_bit(i);
		unsigned ans_2 = highest_bit_new(i);

		if (ans != ans_2) {
			printf("ans : %u, ans_2 : %u\n", ans, ans_2);
			mismatch_seen = 1;
		}
	} while (i++ != UINT_MAX);

	return mismatch_seen ? EXIT_FAILURE : 0;
}

Compile the program and execute

$ gcc -o out main.c
$ ./out

Test passed! The program printed no mismatches, so the two implementations agree on every tested input.

Performance

Change the userspace program a bit so that it only tests a subset of the unsigned range (so a single execution doesn't take too long), and use perf stat to observe the performance of the two implementations.

Branch version

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit of a 32-bit value.
 *
 * Returns the 1-based position of the highest set bit in x, so
 * generic_fls(1) == 1 and generic_fls(0x80000000) == 32.
 * Returns 0 when x is 0.
 */
int generic_fls(unsigned int x)
{
	/*
	 * Binary search over the word: each step looks at the top `width`
	 * bits of the 32-bit window and halves the remaining search space.
	 */
	static const struct {
		unsigned int top_mask;
		int width;
	} steps[] = {
		{ 0xffff0000u, 16 },
		{ 0xff000000u,  8 },
		{ 0xf0000000u,  4 },
		{ 0xc0000000u,  2 },
		{ 0x80000000u,  1 },
	};
	int pos = 32;
	unsigned int k;

	if (x == 0)
		return 0;

	for (k = 0; k < sizeof(steps) / sizeof(steps[0]); k++) {
		if (x & steps[k].top_mask)
			continue;
		/* Top bits are clear: slide the value up and lower the answer. */
		x <<= steps[k].width;
		pos -= steps[k].width;
	}
	return pos;
}

/* Route fls() to the portable generic_fls() implementation in this file. */
#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of flags (branch version).
 *
 * Returns a mask containing only the highest set bit of flags, or 0 when
 * flags is 0.
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	/*
	 * fls() yields 0 for flags == 0, and a shift by -1 is undefined, hence
	 * the branch. The shifted constant is unsigned: with a plain int,
	 * bit == 32 would evaluate 1 << 31, which overflows int and is
	 * undefined behavior.
	 */
	return bit ? 1U << (bit - 1) : 0;
}

/*
 * Benchmark driver: call highest_bit() once for every value in
 * [0x70000000, 0x80000008). The results are intentionally discarded;
 * only the cost of the calls is of interest.
 */
int main(void)
{
	unsigned value = 0x70000000;

	while (value < 0x80000008) {
		unsigned ans = highest_bit(value);

		(void)ans;	/* result unused on purpose */
		value++;
	}

	return 0;
}

Compile and use perf to observe the performance.

$ gcc -o out main.c
$ sudo perf stat --repeat 5 ./out

 Performance counter stats for './out' (5 runs):

            809.51 msec task-clock                       #    1.000 CPUs utilized               ( +-  2.03% )
                 2      context-switches                 #    2.471 /sec                        ( +- 25.50% )
                 0      cpu-migrations                   #    0.000 /sec                      
                50      page-faults                      #   61.766 /sec                        ( +-  0.40% )
      42,4302,2494      cycles                           #    5.241 GHz                         ( +-  0.04% )  (82.95%)
         1827,2293      stalled-cycles-frontend          #    0.43% frontend cycles idle        ( +-  0.71% )  (83.29%)
          158,1598      stalled-cycles-backend           #    0.04% backend cycles idle         ( +- 27.60% )  (83.45%)
     155,7262,2684      instructions                     #    3.67  insn per cycle            
                                                  #    0.00  stalled cycles per insn     ( +-  0.03% )  (83.46%)
      34,9005,2738      branches                         #    4.311 G/sec                       ( +-  0.04% )  (83.45%)
            5,5183      branch-misses                    #    0.00% of all branches             ( +-  7.62% )  (83.40%)

            0.8099 +- 0.0165 seconds time elapsed  ( +-  2.04% )

Branchless version

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>

/*
 * generic_fls - find the last (most significant) set bit of a 32-bit value.
 *
 * Returns the 1-based position of the highest set bit in x, so
 * generic_fls(1) == 1 and generic_fls(0x80000000) == 32.
 * Returns 0 when x is 0.
 */
int generic_fls(unsigned int x)
{
	/*
	 * Binary search over the word: each step looks at the top `width`
	 * bits of the 32-bit window and halves the remaining search space.
	 */
	static const struct {
		unsigned int top_mask;
		int width;
	} steps[] = {
		{ 0xffff0000u, 16 },
		{ 0xff000000u,  8 },
		{ 0xf0000000u,  4 },
		{ 0xc0000000u,  2 },
		{ 0x80000000u,  1 },
	};
	int pos = 32;
	unsigned int k;

	if (x == 0)
		return 0;

	for (k = 0; k < sizeof(steps) / sizeof(steps[0]); k++) {
		if (x & steps[k].top_mask)
			continue;
		/* Top bits are clear: slide the value up and lower the answer. */
		x <<= steps[k].width;
		pos -= steps[k].width;
	}
	return pos;
}

/* Route fls() to the portable generic_fls() implementation in this file. */
#define fls(x) generic_fls(x)

/*
 * highest_bit - isolate the most significant set bit of flags (branchless).
 *
 * Shifts 1 left by fls(flags) and then right by one. For flags == 0 this is
 * (1 << 0) >> 1 == 0; otherwise it equals 1 << (fls(flags) - 1).
 *
 * The intermediate value needs up to 33 bits (bit == 32 when the top bit of
 * flags is set), so the shift is done in unsigned long long, which is at
 * least 64 bits wide. Using unsigned long would be undefined on targets
 * where that type is only 32 bits (e.g. builds with -m32).
 */
static unsigned highest_bit(unsigned flags)
{
	int bit = fls(flags);

	return (unsigned)((1ULL << bit) >> 1);
}

/*
 * Benchmark driver: call highest_bit() once for every value in
 * [0x70000000, 0x80000008). The results are intentionally discarded;
 * only the cost of the calls is of interest.
 */
int main(void)
{
	unsigned value = 0x70000000;

	while (value < 0x80000008) {
		unsigned ans = highest_bit(value);

		(void)ans;	/* result unused on purpose */
		value++;
	}

	return 0;
}

Compile and use perf to observe the performance.

$ gcc -o out main.c
$ sudo perf stat --repeat 5 ./out

 Performance counter stats for './out' (5 runs):

            726.41 msec task-clock                       #    1.000 CPUs utilized               ( +-  2.24% )
                 1      context-switches                 #    1.377 /sec                        ( +- 37.42% )
                 0      cpu-migrations                   #    0.000 /sec                      
                50      page-faults                      #   68.832 /sec                        ( +-  0.40% )
      37,8968,1629      cycles                           #    5.217 GHz                         ( +-  0.02% )  (83.25%)
          250,0461      stalled-cycles-frontend          #    0.07% frontend cycles idle        ( +-  2.88% )  (83.25%)
           91,6112      stalled-cycles-backend           #    0.02% backend cycles idle         ( +- 11.66% )  (83.26%)
     147,6344,7563      instructions                     #    3.90  insn per cycle            
                                                  #    0.00  stalled cycles per insn     ( +-  0.01% )  (83.25%)
      29,5408,1409      branches                         #    4.067 G/sec                       ( +-  0.01% )  (83.54%)
            4,8595      branch-misses                    #    0.00% of all branches             ( +-  2.67% )  (83.45%)

            0.7268 +- 0.0163 seconds time elapsed  ( +-  2.25% )

We can see significant improvements in cycles and stalled-cycles-frontend, and especially in branches and branch-misses.
The test ran on an x86_64 AMD Ryzen 7 7700X 8-Core Processor; the operating system is Ubuntu 22.04.4 LTS.

@htejun
Copy link
Collaborator

htejun commented May 21, 2024

Thanks for the thorough PR but this unfortunately would break on 32bit builds as the highest bit would be lost in 32bit unsigned long.

@vax-r
Copy link
Contributor Author

vax-r commented May 22, 2024

Thanks for the thorough PR but this unfortunately would break on 32bit builds as the highest bit would be lost in 32bit unsigned long.

Oh, I see. What do you think about widening the constant 1 from u32 to u64? If that works, I'll run the same experiment again and see whether the performance is still better.
I mean something like this:

((u64) 1 << bit) >> 1

@htejun
Copy link
Collaborator

htejun commented May 22, 2024

Yeah, that should work. Can you verify it on 32bit builds too? That's probably going to be slower but as long as it's not drastic, that should be okay.

@vax-r
Copy link
Contributor Author

vax-r commented May 23, 2024

Yeah, that should work. Can you verify it on 32bit builds too? That's probably going to be slower but as long as it's not drastic, that should be okay.

Sorry, I don't really understand what you mean by 32-bit builds. Do you mean recompiling the kernel with a 32-bit configuration and testing the correctness inside that 32-bit kernel?
If you mean running on a bare-metal 32-bit machine, that isn't available to me at the moment, as I only have an x86_64 machine in my lab.

@htejun
Copy link
Collaborator

htejun commented May 23, 2024

You can just build the test binary with -m32 to verify the correctness on 32bit.

The original implementation of highest_bit uses "fls()" to find the
most significant set bit of the input parameter "flags". Normally we
could return the mask as "1 << (fls(flags) - 1)", but "fls(flags)"
returns 0 when "flags" is 0, which would turn the expression into
"1 << (-1)", and shifting by a negative count is illegal. So the
original code uses a branch to check whether "fls(flags)" returned 0.

We can remove the branch by first left-shifting 1 by "fls(flags)" bits
and then right-shifting the result by 1 bit. When "fls(flags)" is 0,
this expression simply evaluates to 0 without any error. For every
non-zero value, the result is the same as
"1 << (fls(flags) - 1)".

This implementation avoids any possible branch misprediction, and pure
shift operations are cheaper than the branch needed for an if-else
statement.
@vax-r
Copy link
Contributor Author

vax-r commented May 23, 2024

You can just build the test binary with -m32 to verify the correctness on 32bit.

I just built the proposed implementation with the -m32 option and the build was successful; I also ran the test script .github/workflows/run-schedulers locally without errors. Let me know if any further checks are needed, thanks.

@vax-r
Copy link
Contributor Author

vax-r commented May 23, 2024

I also ran the comparison test again; the results are shown below. By the way, this time I iterated over the range from 0x00000000 to 0x88888888 rather than a small subset, for a more complete check.

Branch version

 Performance counter stats for './out' (5 runs):

         1,2084.96 msec task-clock                       #    1.000 CPUs utilized               ( +-  0.06% )
                43      context-switches                 #    3.558 /sec                        ( +-  5.72% )
                 3      cpu-migrations                   #    0.248 /sec                        ( +- 22.61% )
                53      page-faults                      #    4.386 /sec                        ( +-  1.13% )
     542,6104,9686      cycles                           #    4.490 GHz                         ( +-  0.02% )
    1151,4982,4346      instructions                     #    2.12  insn per cycle              ( +-  0.00% )
     297,9091,6871      branches                         #    2.465 G/sec                       ( +-  0.00% )
           22,3366      branch-misses                    #    0.00% of all branches             ( +-  1.28% )

          12.08519 +- 0.00751 seconds time elapsed  ( +-  0.06% )

Branch-less version

 Performance counter stats for './out' (5 runs):

         1,1723.60 msec task-clock                       #    1.000 CPUs utilized               ( +-  0.07% )
                61      context-switches                 #    5.203 /sec                        ( +-  6.99% )
                 2      cpu-migrations                   #    0.171 /sec                        ( +- 36.74% )
                53      page-faults                      #    4.521 /sec                        ( +-  0.92% )
     525,9809,7637      cycles                           #    4.487 GHz                         ( +-  0.01% )
    1082,7544,6816      instructions                     #    2.06  insn per cycle              ( +-  0.00% )
     252,0922,3734      branches                         #    2.150 G/sec                       ( +-  0.00% )
           20,6333      branch-misses                    #    0.00% of all branches             ( +-  1.77% )

          11.72393 +- 0.00828 seconds time elapsed  ( +-  0.07% )

Improvements can be seen in task-clock, cycles, instructions, branches, and branch-misses, especially in branches and branch-misses, which are the main concern in this case.

@htejun htejun merged commit adadcf3 into sched-ext:sched_ext May 23, 2024
1 check passed
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants