Skip to content

A moderate rustc/gcc performance difference #141853

Open
@leonardo-m

Description

@leonardo-m

Here I report a moderate performance difference, if you think it isn't important or there's nothing worth doing here, then close this issue.

This is a small Rust function foo:

fn foo() -> i64 {
    const LIM: i64 = 30;
    let mut result: i64 = 3;

    let mut i = 4;
    while i <= LIM {
        let n: i64 = (1 << i) - i;
        let n21: i64 = (n - 1) << 1;

        let mut x: i64 = 0;
        let mut y: i64 = n - 1;
        let mut r: i64 = 0;
        let mut count: i64 = 1;
        let mut odd: i64 = 0;
        let mut was: i64 = 0;

        while x < y {
            if (x << 1) < n21 - r {
                r += x;
                x += 1;
                r += x;
                count += 1;
            } else {
                r -= y;
                y -= 1;
                r -= y;

                let new_was: i64;
                if r < 0 {
                    r += x;
                    x += 1;
                    r += x;
                    new_was = 0;
                } else {
                    new_was = 1;
                };
                was += new_was;

                if (count & 1) != 0 {
                    result += (count - 2 * was) << 1;
                } else {
                    result -= (count - 2 * was) << 1;
                }

                odd += count + (count & 1) - was;
                was = new_was;
                count = 1;
            }
        }

        if x == y {
            odd += 1 - was;
            if count > 1 {
                result += 4 * was - 1;
            } else {
                result += 1;
            }
        }

        result += 2 * odd * (n - odd);
        i += 2;
    }
    result
}

fn main() {
    assert_eq!(foo(), 467178235146843549);
}

For testing I've translated it to (hopefully) equivalent C code (but at the end it puts() a string instead of asserting):

#include <stdio.h>
#include <stdint.h>

int64_t foo() {
    const int64_t LIM = 30;
    int64_t result = 3;

    int64_t i = 4;
    while (i <= LIM) {
        const int64_t n = (1 << i) - i;
        const int64_t n21 = (n - 1) << 1;

        int64_t x = 0;
        int64_t y = n - 1;
        int64_t r = 0;
        int64_t count = 1;
        int64_t odd = 0;
        int64_t was = 0;

        while (x < y) {
            if ((x << 1) < n21 - r) {
                r += x;
                x += 1;
                r += x;
                count += 1;
            } else {
                r -= y;
                y -= 1;
                r -= y;

                int64_t new_was;
                if (r < 0) {
                    r += x;
                    x += 1;
                    r += x;
                    new_was = 0;
                } else {
                    new_was = 1;
                }
                was += new_was;

                if ((count & 1) != 0) {
                    result += (count - 2 * was) << 1;
                } else {
                    result -= (count - 2 * was) << 1;
                }

                odd += count + (count & 1) - was;
                was = new_was;
                count = 1;
            }
        }

        if (x == y) {
            odd += 1 - was;
            if (count > 1) {
                result += 4 * was - 1;
            } else {
                result += 1;
            }
        }

        result += 2 * odd * (n - odd);
        i += 2;
    }
    return result;
}

int main() {
    printf("%s\n", foo() == 467178235146843549LL ? "true" : "false");
    return 0;
}

I've compiled both versions with simple compiler switches, for the Rust code:

rustc --edition 2024 -C opt-level=3 -C strip=symbols test1.rs

Using rustc 1.89.0-nightly (4d08223 2025-05-31), the run-time I'm seeing is about 1.84 seconds.

For the C code:

gcc -O3 test2.c -o test2

Using g++ 16.0.0 20250601 (latest trunk), the run-time I'm seeing is about 1.60 seconds.

The asm generated by GCC:

foo:
        pushq   %r15
        movl    $4, %ecx
        movl    $3, %r10d
        movl    $11, %r9d
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbp
        movl    $12, %ebp
        pushq   %rbx
.L13:
        leaq    (%r9,%r9), %r13
        xorl    %r12d, %r12d
        xorl    %ebx, %ebx
        movl    $1, %edi
        xorl    %eax, %eax
        xorl    %edx, %edx
        jmp     .L8
.L18:
        addq    %rdx, %rax
        addq    $1, %rdx
        movq    %rsi, %rdi
        addq    %rdx, %rax
        cmpq    %rdx, %r9
        jle     .L17
.L8:
        movq    %r13, %r11
        leaq    (%rdx,%rdx), %r8
        leaq    1(%rdi), %rsi
        subq    %rax, %r11
        cmpq    %r8, %r11
        jg      .L18
        subq    %r9, %rax
        subq    $1, %r9
        subq    %r9, %rax
        js      .L19
        leaq    1(%r12), %r11
        movl    $1, %r12d
.L7:
        leaq    (%r11,%r11), %r15
        movq    %rdi, %r8
        subq    %r15, %r8
        addq    %r8, %r8
        leaq    (%r10,%r8), %r15
        subq    %r8, %r10
        andl    $1, %edi
        movl    $1, %edi
        cmovne  %r15, %r10
        andq    $-2, %rsi
        subq    %r11, %rsi
        addq    %rsi, %rbx
        cmpq    %rdx, %r9
        jg      .L8
.L17:
        je      .L9
        subq    %rbx, %rbp
        imulq   %rbx, %rbp
        leaq    (%r10,%rbp,2), %r10
.L10:
        addq    $2, %rcx
        cmpq    $32, %rcx
        je      .L1
.L20:
        movl    $1, %ebp
        sall    %cl, %ebp
        movslq  %ebp, %rbp
        subq    %rcx, %rbp
        leaq    -1(%rbp), %r9
        jmp     .L13
.L19:
        addq    %rdx, %rax
        addq    $1, %rdx
        movq    %r12, %r11
        xorl    %r12d, %r12d
        addq    %rdx, %rax
        jmp     .L7
.L9:
        movq    %r12, %rax
        xorq    $1, %rax
        addq    %rbx, %rax
        subq    %rax, %rbp
        imulq   %rax, %rbp
        cmpq    $1, %rdi
        je      .L11
        leaq    -1(%r10,%r12,4), %rax
        addq    $2, %rcx
        leaq    (%rax,%rbp,2), %r10
        cmpq    $32, %rcx
        jne     .L20
.L1:
        popq    %rbx
        movq    %r10, %rax
        popq    %rbp
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        ret
.L11:
        leaq    1(%r10,%rbp,2), %r10
        jmp     .L10

And the asm generated by rustc:

foo::h1870fe44660fcb04:
        pushq   %r15
        pushq   %r14
        pushq   %r12
        pushq   %rbx
        movl    $4, %ecx
        movl    $3, %eax
        jmp     .LBB0_1
.LBB0_9:
        testq   %rdi, %rdi
        je      .LBB0_10
        xorl    %esi, %esi
.LBB0_13:
        subq    %rsi, %rdx
        imulq   %rsi, %rdx
        leaq    (%rax,%rdx,2), %rax
        leaq    2(%rcx), %rdx
        cmpq    $29, %rcx
        movq    %rdx, %rcx
        jae     .LBB0_14
.LBB0_1:
        movl    $1, %edx
        shlq    %cl, %rdx
        subq    %rcx, %rdx
        leaq    -1(%rdx), %rdi
        cmpq    $2, %rdx
        jl      .LBB0_9
        leaq    -2(,%rdx,2), %r9
        movl    $1, %r8d
        xorl    %ebx, %ebx
        xorl    %r11d, %r11d
        xorl    %esi, %esi
        xorl    %r10d, %r10d
        jmp     .LBB0_3
.LBB0_15:
        addq    %rbx, %r11
        addq    %rbx, %r11
        incq    %r11
        incq    %rbx
        incq    %r8
        movq    %rbx, %r14
        cmpq    %rdi, %r14
        jge     .LBB0_6
.LBB0_3:
        leaq    (%rbx,%rbx), %r14
        movq    %r9, %r15
        subq    %r11, %r15
        cmpq    %r15, %r14
        jl      .LBB0_15
        subq    %rdi, %r11
        subq    %rdi, %r11
        leaq    1(%r11), %r12
        leaq    1(%rbx), %r14
        xorl    %r15d, %r15d
        testq   %r12, %r12
        leaq    1(%r11,%rbx), %r11
        leaq    1(%rbx,%r11), %r11
        cmovnsq %r12, %r11
        cmovnsq %rbx, %r14
        setns   %r15b
        addq    %r15, %r10
        movq    %r8, %rbx
        andq    $1, %rbx
        jne     .LBB0_16
        subq    %r8, %rax
        subq    %r8, %rax
        leaq    (%rax,%r10,4), %rax
        jmp     .LBB0_17
.LBB0_16:
        leaq    (%rax,%r8,2), %rax
        leaq    (,%r10,4), %r12
        subq    %r12, %rax
.LBB0_17:
        decq    %rdi
        addq    %rsi, %r8
        addq    %rbx, %r8
        subq    %r10, %r8
        movq    %r15, %r10
        movq    %r8, %rsi
        movl    $1, %r8d
        movq    %r14, %rbx
        cmpq    %rdi, %r14
        jl      .LBB0_3
.LBB0_6:
        jne     .LBB0_13
        subq    %r10, %rsi
        incq    %rsi
        cmpq    $1, %r8
        jle     .LBB0_11
        leaq    (%rax,%r10,4), %rax
        decq    %rax
        jmp     .LBB0_13
.LBB0_10:
        movl    $1, %esi
.LBB0_11:
        incq    %rax
        jmp     .LBB0_13
.LBB0_14:
        popq    %rbx
        popq    %r12
        popq    %r14
        popq    %r15
        retq

My asm skills aren't refined enough to spot where/how the gcc asm is more efficient than the rustc asm. Can you spot it?

Metadata

Metadata

Assignees

No one assigned

    Labels

    C-optimizationCategory: An issue highlighting optimization opportunities or PRs implementing suchI-slowIssue: Problems and improvements with respect to performance of generated code.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions