Bizarre optimization of simple array-filling loop in -Os #66652

Open
dzaima opened this issue Sep 18, 2023 · 10 comments · May be fixed by #109289

dzaima commented Sep 18, 2023

This code:

#include<stdint.h>
#include<stddef.h>
void fill_i16(int16_t* a, int16_t v, size_t l) {
  for (size_t i = 0; i < l; i++) a[i] = v;
}

compiled with -Os for either x86-64 or AArch64, produces bizarre code; for example, on x86-64:

bizarre x86-64
.LCPI0_0:
        .quad   6                               # 0x6
        .quad   7                               # 0x7
.LCPI0_1:
        .quad   4                               # 0x4
        .quad   5                               # 0x5
.LCPI0_2:
        .quad   2                               # 0x2
        .quad   3                               # 0x3
.LCPI0_3:
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   1                               # 0x1
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
.LCPI0_4:
        .quad   -9223372034707292160            # 0x8000000080000000
        .quad   -9223372034707292160            # 0x8000000080000000
.LCPI0_5:
        .quad   8                               # 0x8
        .quad   8                               # 0x8
fill_i16:                               # @fill_i16
        test    rdx, rdx
        je      .LBB0_19
        lea     rax, [rdx + 7]
        and     rax, -8
        dec     rdx
        movq    xmm0, rdx
        pshufd  xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
        movdqa  xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [6,7]
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [4,5]
        movdqa  xmm3, xmmword ptr [rip + .LCPI0_2] # xmm3 = [2,3]
        movdqa  xmm4, xmmword ptr [rip + .LCPI0_3] # xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
        xor     ecx, ecx
        movdqa  xmm5, xmmword ptr [rip + .LCPI0_4] # xmm5 = [9223372039002259456,9223372039002259456]
        pxor    xmm0, xmm5
        pcmpeqd xmm6, xmm6
        movdqa  xmm7, xmmword ptr [rip + .LCPI0_5] # xmm7 = [8,8]
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        movdqa  xmm8, xmm4
        pxor    xmm8, xmm5
        movdqa  xmm10, xmm8
        pcmpgtd xmm10, xmm0
        pshufd  xmm9, xmm10, 160                # xmm9 = xmm10[0,0,2,2]
        pshuflw xmm11, xmm9, 232                # xmm11 = xmm9[0,2,2,3,4,5,6,7]
        pcmpeqd xmm8, xmm0
        pshufd  xmm8, xmm8, 245                 # xmm8 = xmm8[1,1,3,3]
        pshuflw xmm12, xmm8, 232                # xmm12 = xmm8[0,2,2,3,4,5,6,7]
        pand    xmm12, xmm11
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        pshuflw xmm11, xmm10, 232               # xmm11 = xmm10[0,2,2,3,4,5,6,7]
        por     xmm11, xmm12
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        movd    edx, xmm11
        test    dl, 1
        je      .LBB0_4
        mov     word ptr [rdi + 2*rcx], si
.LBB0_4:                                #   in Loop: Header=BB0_2 Depth=1
        pand    xmm8, xmm9
        por     xmm8, xmm10
        packssdw        xmm8, xmm8
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        movd    edx, xmm8
        shr     edx, 16
        test    dl, 1
        je      .LBB0_6
        mov     word ptr [rdi + 2*rcx + 2], si
.LBB0_6:                                #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm9, xmm3
        pxor    xmm9, xmm5
        movdqa  xmm10, xmm9
        pcmpgtd xmm10, xmm0
        pshufd  xmm8, xmm10, 160                # xmm8 = xmm10[0,0,2,2]
        pcmpeqd xmm9, xmm0
        pshufd  xmm9, xmm9, 245                 # xmm9 = xmm9[1,1,3,3]
        movdqa  xmm11, xmm9
        pand    xmm11, xmm8
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        por     xmm11, xmm10
        packssdw        xmm11, xmm11
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 2
        test    dl, 1
        je      .LBB0_8
        mov     word ptr [rdi + 2*rcx + 4], si
.LBB0_8:                                #   in Loop: Header=BB0_2 Depth=1
        pshufhw xmm8, xmm8, 132                 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
        pshufhw xmm9, xmm9, 132                 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
        pand    xmm9, xmm8
        pshufhw xmm8, xmm10, 132                # xmm8 = xmm10[0,1,2,3,4,5,4,6]
        por     xmm8, xmm9
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 3
        test    dl, 1
        je      .LBB0_10
        mov     word ptr [rdi + 2*rcx + 6], si
.LBB0_10:                               #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm8, xmm2
        pxor    xmm8, xmm5
        movdqa  xmm10, xmm8
        pcmpgtd xmm10, xmm0
        pshufd  xmm9, xmm10, 160                # xmm9 = xmm10[0,0,2,2]
        pshuflw xmm11, xmm9, 232                # xmm11 = xmm9[0,2,2,3,4,5,6,7]
        pcmpeqd xmm8, xmm0
        pshufd  xmm8, xmm8, 245                 # xmm8 = xmm8[1,1,3,3]
        pshuflw xmm12, xmm8, 232                # xmm12 = xmm8[0,2,2,3,4,5,6,7]
        pand    xmm12, xmm11
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        pshuflw xmm11, xmm10, 232               # xmm11 = xmm10[0,2,2,3,4,5,6,7]
        por     xmm11, xmm12
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 4
        test    dl, 1
        je      .LBB0_12
        mov     word ptr [rdi + 2*rcx + 8], si
.LBB0_12:                               #   in Loop: Header=BB0_2 Depth=1
        pand    xmm8, xmm9
        por     xmm8, xmm10
        packssdw        xmm8, xmm8
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 5
        test    dl, 1
        je      .LBB0_14
        mov     word ptr [rdi + 2*rcx + 10], si
.LBB0_14:                               #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm9, xmm1
        pxor    xmm9, xmm5
        movdqa  xmm10, xmm9
        pcmpgtd xmm10, xmm0
        pshufd  xmm8, xmm10, 160                # xmm8 = xmm10[0,0,2,2]
        pcmpeqd xmm9, xmm0
        pshufd  xmm9, xmm9, 245                 # xmm9 = xmm9[1,1,3,3]
        movdqa  xmm11, xmm9
        pand    xmm11, xmm8
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        por     xmm11, xmm10
        packssdw        xmm11, xmm11
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 6
        test    dl, 1
        je      .LBB0_16
        mov     word ptr [rdi + 2*rcx + 12], si
.LBB0_16:                               #   in Loop: Header=BB0_2 Depth=1
        pshufhw xmm8, xmm8, 132                 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
        pshufhw xmm9, xmm9, 132                 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
        pand    xmm9, xmm8
        pshufhw xmm8, xmm10, 132                # xmm8 = xmm10[0,1,2,3,4,5,4,6]
        por     xmm8, xmm9
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 7
        test    dl, 1
        je      .LBB0_18
        mov     word ptr [rdi + 2*rcx + 14], si
.LBB0_18:                               #   in Loop: Header=BB0_2 Depth=1
        add     rcx, 8
        paddq   xmm4, xmm7
        paddq   xmm3, xmm7
        paddq   xmm2, xmm7
        paddq   xmm1, xmm7
        cmp     rax, rcx
        jne     .LBB0_2
.LBB0_19:
        ret

Compiler Explorer: https://godbolt.org/z/9jfYh8Wz8; the first version with this behavior appears to be Clang 12.

The use of SIMD here (the loop is unrolled, not actually vectorized; the stores are still done one element at a time) is completely unnecessary: the -Oz output, a simple non-unrolled scalar loop, is ~5x faster in a simple test.
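
For reference, a minimal timing sketch along these lines (not the exact test used; the buffer size and repetition count here are arbitrary):

// bench.c: link against fill_i16 built separately at -Os or -Oz and compare.
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <time.h>

void fill_i16(int16_t* a, int16_t v, size_t l);

int main(void) {
    enum { N = 1 << 16, REPS = 10000 };
    static int16_t buf[N];
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        fill_i16(buf, (int16_t)r, N);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("%.1f ns per call\n", ns / REPS);
}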

The "optimized" IR in question:

IR
define dso_local void @fill_i16(ptr nocapture noundef writeonly %a, i16 noundef signext %v, i64 noundef %l) local_unnamed_addr {
entry:
  %cmp3.not = icmp eq i64 %l, 0
  br i1 %cmp3.not, label %for.cond.cleanup, label %vector.ph

vector.ph:
  %n.rnd.up = add i64 %l, 7
  %n.vec = and i64 %n.rnd.up, -8
  %trip.count.minus.1 = add i64 %l, -1
  %broadcast.splatinsert = insertelement <8 x i64> poison, i64 %trip.count.minus.1, i64 0
  %broadcast.splat = shufflevector <8 x i64> %broadcast.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.store.continue18 ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %pred.store.continue18 ]
  %0 = icmp ule <8 x i64> %vec.ind, %broadcast.splat
  %1 = extractelement <8 x i1> %0, i64 0
  br i1 %1, label %pred.store.if, label %pred.store.continue

pred.store.if:
  %2 = getelementptr inbounds i16, ptr %a, i64 %index
  store i16 %v, ptr %2, align 2
  br label %pred.store.continue

pred.store.continue:
  %3 = extractelement <8 x i1> %0, i64 1
  br i1 %3, label %pred.store.if5, label %pred.store.continue6

pred.store.if5:
  %4 = or i64 %index, 1
  %5 = getelementptr inbounds i16, ptr %a, i64 %4
  store i16 %v, ptr %5, align 2
  br label %pred.store.continue6

pred.store.continue6:
  %6 = extractelement <8 x i1> %0, i64 2
  br i1 %6, label %pred.store.if7, label %pred.store.continue8

pred.store.if7:
  %7 = or i64 %index, 2
  %8 = getelementptr inbounds i16, ptr %a, i64 %7
  store i16 %v, ptr %8, align 2
  br label %pred.store.continue8

pred.store.continue8:
  %9 = extractelement <8 x i1> %0, i64 3
  br i1 %9, label %pred.store.if9, label %pred.store.continue10

pred.store.if9:
  %10 = or i64 %index, 3
  %11 = getelementptr inbounds i16, ptr %a, i64 %10
  store i16 %v, ptr %11, align 2
  br label %pred.store.continue10

pred.store.continue10:
  %12 = extractelement <8 x i1> %0, i64 4
  br i1 %12, label %pred.store.if11, label %pred.store.continue12

pred.store.if11:
  %13 = or i64 %index, 4
  %14 = getelementptr inbounds i16, ptr %a, i64 %13
  store i16 %v, ptr %14, align 2
  br label %pred.store.continue12

pred.store.continue12:
  %15 = extractelement <8 x i1> %0, i64 5
  br i1 %15, label %pred.store.if13, label %pred.store.continue14

pred.store.if13:
  %16 = or i64 %index, 5
  %17 = getelementptr inbounds i16, ptr %a, i64 %16
  store i16 %v, ptr %17, align 2
  br label %pred.store.continue14

pred.store.continue14:
  %18 = extractelement <8 x i1> %0, i64 6
  br i1 %18, label %pred.store.if15, label %pred.store.continue16

pred.store.if15:
  %19 = or i64 %index, 6
  %20 = getelementptr inbounds i16, ptr %a, i64 %19
  store i16 %v, ptr %20, align 2
  br label %pred.store.continue16

pred.store.continue16:
  %21 = extractelement <8 x i1> %0, i64 7
  br i1 %21, label %pred.store.if17, label %pred.store.continue18

pred.store.if17:
  %22 = or i64 %index, 7
  %23 = getelementptr inbounds i16, ptr %a, i64 %22
  store i16 %v, ptr %23, align 2
  br label %pred.store.continue18

pred.store.continue18:
  %index.next = add i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %24 = icmp eq i64 %index.next, %n.vec
  br i1 %24, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}
@hiraditya (Collaborator)

Yeah, even with -fno-unroll-loops it still produces this much code: https://godbolt.org/z/aGbs7Yos6

@hiraditya (Collaborator) commented Sep 20, 2023

The RISC-V code, on the other hand: https://godbolt.org/z/5jf97WM3a

@hiraditya (Collaborator)

Trying SVE:

// clang++ -Os -march=armv8-a+sve

fill_i16:                               // @fill_i16
        cbz     x2, .LBB0_3
        cnth    x8
        mov     z0.h, w1
        mov     x10, xzr
        subs    x9, x2, x8
        csel    x9, xzr, x9, lo
        whilelo p0.h, xzr, x2
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        st1h    { z0.h }, p0, [x0, x10, lsl #1]
        whilelo p0.h, x10, x9
        add     x10, x10, x8
        b.mi    .LBB0_2
.LBB0_3:
        ret

@hiraditya (Collaborator)

@ilinpv suggested this could be in the vectorizer, which would explain why -fno-unroll-loops has no effect.

@dzaima (Author) commented Sep 20, 2023

Yeah, my guess that this was unrolling seems to be wrong: #pragma clang loop unroll(disable) has no effect either, but #pragma clang loop vectorize(disable) does disable the problematic behavior. And it is LoopVectorizePass that generates the core branchy IR.
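
For reference, a minimal sketch of the two pragma variants (the pragma has to sit directly on the loop):

#include <stdint.h>
#include <stddef.h>

// No effect on the bizarre codegen:
void fill_i16_nounroll(int16_t* a, int16_t v, size_t l) {
    #pragma clang loop unroll(disable)
    for (size_t i = 0; i < l; i++) a[i] = v;
}

// Produces the plain scalar loop again:
void fill_i16_novec(int16_t* a, int16_t v, size_t l) {
    #pragma clang loop vectorize(disable)
    for (size_t i = 0; i < l; i++) a[i] = v;
}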

@dzaima (Author) commented Sep 20, 2023

Considering the SVE & RVV behavior (and also AVX-512), it's presumably trying to fold the tail into a predicated loop, but as there's no masked store on SSE2/NEON, it falls back to branchy per-element stores without properly considering the cost (and then also happens to do the mask extracts very inefficiently)?

Adding any load to the body makes it fall back to a regular scalar loop, with Compiler Explorer's "Opt viewer" reporting "Missed - the cost-model indicates that vectorization is not beneficial".
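
For example (a hedged sketch consistent with that observation, not taken from the report): a variant whose body also reads memory, such as this in-place add, comes out as a plain scalar loop at -Os:

#include <stdint.h>
#include <stddef.h>

// The load of a[i] makes the cost model reject the predicated-store
// vectorization, so -Os emits an ordinary scalar loop here.
void add_i16(int16_t* a, int16_t v, size_t l) {
    for (size_t i = 0; i < l; i++) a[i] += v;
}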

(For fun, here's an even worse case: AVX2 with int8_t elements makes it consider 32 elements per iteration, targeting 256-bit vectors.)

@davemgreen (Collaborator)

There is a hack in the vectorizer cost model that gives very high costs (3000000) to predicated vector load/store operations, but it only applies if there are loads or more than one store. See useEmulatedMaskMemRefHack.

So this case falls through the cracks and doesn't get as high a cost as expected. The option -vectorize-num-stores-pred=0 would fix the issue by no longer excluding the single-store case in this ticket. I presume in the long run we would want to work on removing the hack.
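
For reference, that LLVM option can be passed through the clang driver with -mllvm; a workaround sketch rather than a recommended default:

// clang -Os -mllvm -vectorize-num-stores-pred=0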

@the8472 commented Sep 21, 2023

On x86 shouldn't this be turned into a rep stosw, especially when optimizing for size?

@hiraditya (Collaborator)

On x86 shouldn't this be turned into a rep stosw, especially when optimizing for size?

Yup, and rep stosw may be quite performant in general these days anyway.
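
For illustration, a hedged sketch of what that would amount to, written as GNU inline assembly in C (x86-64 only; the function name here is made up):

#include <stdint.h>
#include <stddef.h>

// rep stosw stores AX to [rdi] and advances rdi by 2, rcx times; the SysV
// ABI guarantees the direction flag is clear on function entry.
void fill_i16_stosw(int16_t* a, int16_t v, size_t l) {
    __asm__ volatile("rep stosw"
                     : "+D"(a), "+c"(l)   // rdi = destination, rcx = count
                     : "a"(v)             // ax  = value to store
                     : "memory");
}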

@john-brawn-arm (Collaborator)

Looking at this, there seem to be two problems here:

  1. The cost calculation for a scalarized predicated memory instruction is wrong. One thing I spotted is that it adds the cost of one extract and branch, but we actually have an extract and branch for each vector element.
  2. Cost calculation is wrong in general for -Os, as everything is done with CostKind TCK_RecipThroughput when it should be TCK_CodeSize (or maybe TCK_SizeAndLatency). There are also things like the cost being divided by the probability for predicated instructions, which doesn't make sense for code size.

john-brawn-arm added a commit to john-brawn-arm/llvm-project that referenced this issue Sep 19, 2024
…lding

When an instruction is scalarized due to tail folding the cost that
we calculate (which is then returned by getWideningCost and thus
getInstructionCost) is already the scalarized cost, so
computePredInstDiscount shouldn't apply a scalarization discount to
these instructions.

Fixes llvm#66652