relational-operators - c++ - assembly language speed - <比<=快吗?

c++ to assembly / c++ / performance / assembly

`if( a < 901 )` 比 `if( a <= 900 )` 快吗?

snoopy



Answer #1
// Illustrates an unsigned loop bound that can wrap around: when size == 0,
// size - 1 wraps to UINT_MAX, and i <= upper_bound is then true for every
// unsigned i, so the loop below cannot exit through its condition.
void foo(unsigned size) {
    unsigned upper_bound = size - 1;  // or any calculation that could produce UINT_MAX
    for(unsigned i=0 ; i <= upper_bound ; i++)
        ...
// Sums the integers 0..n using the loop condition i < n+1.
// In unsigned arithmetic n+1 wraps to 0 when n == UINT_MAX, so in that case
// the loop body runs zero times and the function returns 0.
unsigned sum_1_to_n_finite(unsigned n) {
    unsigned total = 0;
    for (unsigned i = 0 ; i < n+1 ; ++i)  // n+1 may wrap to 0; the loop then runs 0 times
        total += i;
    return total;
}

在Godbolt编译器探索器上从clang7.0和gcc8.2获得x86-64 asm

 # clang7.0 -O3, closed form
    cmp     edi, -1       # n is passed in EDI: x86-64 System V calling convention
    je      .LBB1_1       # if (n == UINT_MAX) return 0;  // the C++ loop runs 0 times
          # else fall through into the closed-form calculation
    mov     ecx, edi         # zero-extend n into RCX
    lea     eax, [rdi - 1]   # n-1
    imul    rax, rcx         # n * (n-1)             # 64-bit
    shr     rax              # n * (n-1) / 2
    add     eax, edi         # n + (stuff / 2) = n * (n+1) / 2   # truncated to 32-bit
    ret          # computed without possible overflow of the product before the right shift
.LBB1_1:
    xor     eax, eax         # return 0
    ret

但是对于朴素(naive)的版本,clang只生成了一个简单的逐次累加循环,而没有化简成闭式公式。

// Sums the integers 0..n using the loop condition i <= n.
// When n == UINT_MAX the condition i <= n holds for every unsigned i, so the
// loop never exits through its condition in that case.
unsigned sum_1_to_n_naive(unsigned n) {
    unsigned total = 0;
    for (unsigned i = 0 ; i<=n ; ++i)  // always true when n == UINT_MAX
        total += i;
    return total;
}
# clang7.0 -O3: plain scalar loop, one addition per iteration
sum_1_to_n(unsigned int):
    xor     ecx, ecx           # i = 0
    xor     eax, eax           # retval = 0
.LBB0_1:                       # do {
    add     eax, ecx             # retval += i
    add     ecx, 1               # ++i
    cmp     ecx, edi             # compare i against n (n is in EDI)
    jbe     .LBB0_1            # } while( i<=n );   jbe = unsigned below-or-equal
    ret
# "naive" inner loop
.L3:
    add     eax, 1       # do {
    paddd   xmm0, xmm1    # vect_total_4.6, vect_vec_iv_.5
    paddd   xmm1, xmm2    # vect_vec_iv_.5, tmp114
    cmp     edx, eax      # bnd.1, ivtmp.14     # bound and induction-variable tmp, I think.
    ja      .L3 # } while( n > i )    (ja = unsigned above)

 # "finite" inner loop
  # Before the loop:
  # xmm0 = 0 = totals
  # xmm1 = {0,1,2,3} = i
  # xmm2 = set1_epi32(4)
 .L13:                # do {
    add     eax, 1       # i++   (scalar counter, one step per vector iteration)
    paddd   xmm0, xmm1    # total[0..3] += i[0..3]
    paddd   xmm1, xmm2    # i[0..3] += 4
    cmp     eax, edx
    jne     .L13      # }while( i != upper_limit );

     then horizontally sum xmm0,
     followed by peeled cleanup for the leftover iterations (up to 3) that don't fill a 4-element vector, or something.