1717define void @gelu_fn (ptr noalias %ifm , ptr noalias %ofm , ptr nonnull align 64 dereferenceable (64 ) %params ) {
1818; CHECK-LABEL: gelu_fn:
1919; CHECK: // %bb.0: // %entry
20- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopxm
20+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; nopxm ; nops
2121; CHECK-NEXT: movxm r0, #16544
2222; CHECK-NEXT: vbcst.16 x6, r0
2323; CHECK-NEXT: lda r1, [p2, #0]; movxm r0, #17280
2424; CHECK-NEXT: mova r0, #60; vbcst.16 x2, r0
25- ; CHECK-NEXT: vadd.f dm3 , dm1, dm0, r0
25+ ; CHECK-NEXT: vadd.f dm1 , dm1, dm0, r0
2626; CHECK-NEXT: vconv.fp32.bf16 cml0, x6
2727; CHECK-NEXT: nop
28+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; movxm r2, #15821
29+ ; CHECK-NEXT: movx r4, #1
30+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; movx r2, #255; vbcst.16 x4, r2
31+ ; CHECK-NEXT: vconv.bf16.fp32 x8, cml1; lshl r2, r1, r4; vbcst.16 x0, r2
32+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; movx r2, #828; mov m0, r2; vadd.f dm2, dm2, dm0, r0
33+ ; CHECK-NEXT: vmul.f dm3, x8, x2, r2
34+ ; CHECK-NEXT: vadd.f dm1, dm1, dm0, r0
35+ ; CHECK-NEXT: nop
36+ ; CHECK-NEXT: vadd.f dm2, dm2, dm0, r0
37+ ; CHECK-NEXT: nop
38+ ; CHECK-NEXT: vconv.bf16.fp32 x10, cml2
39+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3
40+ ; CHECK-NEXT: vconv.bf16.fp32 x1, cml1; vmul.f dm3, x10, x2, r2
41+ ; CHECK-NEXT: vmul.f dm4, x8, x4, r2
42+ ; CHECK-NEXT: vconv.bf16.fp32 x7, cml2; vmul.f dm3, x1, x2, r2
43+ ; CHECK-NEXT: vadd.f dm1, dm1, dm0, r0
44+ ; CHECK-NEXT: vmul.f dm3, x7, x2, r2
2845; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64
29- ; CHECK-NEXT: movxm r2, #15821
30- ; CHECK-NEXT: mova r2, #255; movx r4, #1; vbcst.16 x4, r2
31- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vconv.bf16.fp32 x8, cml3; lshl r2, r1, r4; vbcst.16 x0, r2
32- ; CHECK-NEXT: mova r2, #828; mov m0, r2; vadd.f dm3, dm2, dm0, r0
33- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vmul.f dm2, x8, x2, r2
34- ; CHECK-NEXT: nop
35- ; CHECK-NEXT: vadd.f dm3, dm1, dm0, r0
36- ; CHECK-NEXT: nop
37- ; CHECK-NEXT: vadd.f dm3, dm2, dm0, r0
3846; CHECK-NEXT: vconv.bf16.fp32 x10, cml3
39- ; CHECK-NEXT: vconv.bf16.fp32 x8, cml2
40- ; CHECK-NEXT: vmul.f dm1, x10, x2, r2
41- ; CHECK-NEXT: vconv.bf16.fp32 x1, cml3
42- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vmul.f dm4, x8, x4, r2
43- ; CHECK-NEXT: vconv.bf16.fp32 x7, cml3; vmul.f dm2, x1, x2, r2
44- ; CHECK-NEXT: nop
45- ; CHECK-NEXT: vmul.f dm3, x7, x2, r2
46- ; CHECK-NEXT: vconv.bf16.fp32 x10, cml1; vadd.f dm1, dm1, dm0, r0
47- ; CHECK-NEXT: nop
48- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml2, [p0], #64; vconv.bf16.fp32 x8, cml4; movx r3, #0; vmul.f dm4, x10, x4, r2
49- ; CHECK-NEXT: vconv.bf16.fp32 x5, cml2; mov s0, r3
50- ; CHECK-NEXT: vfloor.s32.bf16 x1, wl8, s0
51- ; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2
52- ; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; movxm ls, #.LBB0_1; vadd.f dm2, dm2, dm0, r0
53- ; CHECK-NEXT: mova r4, #-5; nopb ; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0; vmul.f dm3, x5, x4, r2
54- ; CHECK-NEXT: mova r1, #2; nopb ; vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vbcst.16 x6, r3; vmul.f dm4, x7, x2, r2
55- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vshuffle x1, x1, x3, r1
56- ; CHECK-NEXT: vfloor.s32.bf16 x9, wl10, s0; vmin_ge.16 x3, r16, x1, x0, vaddsign1
57- ; CHECK-NEXT: vfloor.s32.bf16 x3, wh10, s0; add.nc lc, r4, #-7
58- ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; nopv
59- ; CHECK-NEXT: padda [p1], m0; nopb ; nops ; nopxm ; nopv
47+ ; CHECK-NEXT: vconv.bf16.fp32 x8, cml4
48+ ; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vmul.f dm4, x10, x4, r2
49+ ; CHECK-NEXT: vconv.bf16.fp32 x7, cml1; vadd.f dm2, dm2, dm0, r0
50+ ; CHECK-NEXT: mova r3, #0; vconv.bf16.fp32 x5, cml3; vmul.f dm4, x5, x4, r2
51+ ; CHECK-NEXT: mov s0, r3; vmul.f dm3, x7, x2, r2
52+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vfloor.s32.bf16 x1, wl8, s0; movxm ls, #.LBB0_1; vmul.f dm4, x5, x4, r2
53+ ; CHECK-NEXT: mova r4, #-5; vfloor.s32.bf16 x3, wh8, s0; movxm le, #.L_LEnd0
54+ ; CHECK-NEXT: vconv.bf16.fp32 x10, cml4; lshl r4, r1, r4; vbcst.16 x6, r3
55+ ; CHECK-NEXT: mova r1, #2; add.nc lc, r4, #-7
56+ ; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x9, wl10, s0; nopx ; vshuffle x1, x1, x3, r1; nopv
57+ ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml4; nopx ; vmin_ge.16 x3, r16, x1, x0, vaddsign1; nopv
58+ ; CHECK-NEXT: padda [p1], m0; nopb ; vfloor.s32.bf16 x3, wh10, s0; nopx ; vmax_lt.16 x11, r16, x3, x6, vaddsign1; nopv
6059; CHECK-NEXT: .LBB0_1: // %for.body
6160; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
62- ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x10, cml2; nopxm ; nopv
63- ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vadd.f dm2, dm4, dm0, r0
64- ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml4; nopx ; vmov cml4, cml1; vmul.f dm4, x10, x2, r2
65- ; CHECK-NEXT: nopa ; nopb ; vst x11, [p1], #64; nopx ; vshuffle x1, x9, x3, r1; nopv
61+ ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vconv.bf16.fp32 x7, cml2; nopxm ; vadd.f dm2, dm2, dm0, r0
62+ ; CHECK-NEXT: nopa ; nopb ; vst x11, [p1], #64; nopx ; vmov cml2, cml1; nopv
63+ ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x10, cml3; nopx ; vshuffle x1, x9, x3, r1; vmul.f dm3, x7, x2, r2
6664; CHECK-NEXT: vfloor.s32.bf16 x3, wh8, s0; vmin_ge.16 x5, r16, x1, x0, vaddsign1
6765; CHECK-NEXT: vfloor.s32.bf16 x9, wl8, s0; vmax_lt.16 x11, r16, x5, x6, vaddsign1
6866; CHECK-NEXT: .L_LEnd0:
69- ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml3 ; nopxm ; vmul.f dm3, x7 , x4, r2
67+ ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml4 ; nopxm ; vmul.f dm4, x10 , x4, r2
7068; CHECK-NEXT: // %bb.2:
7169; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vshuffle x10, x9, x3, r1; nopv
7270; CHECK-NEXT: vmin_ge.16 x10, r16, x10, x0, vaddsign1
@@ -79,15 +77,15 @@ define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 de
7977; CHECK-NEXT: vshuffle x8, x10, x8, r1
8078; CHECK-NEXT: vmin_ge.16 x8, r16, x8, x0, vaddsign1
8179; CHECK-NEXT: vmax_lt.16 x8, r16, x8, x6, vaddsign1
82- ; CHECK-NEXT: vconv.bf16.fp32 x8, cml3
80+ ; CHECK-NEXT: vconv.bf16.fp32 x8, cml4
8381; CHECK-NEXT: vst x8, [p1], #64
8482; CHECK-NEXT: vfloor.s32.bf16 x10, wl8, s0
8583; CHECK-NEXT: vfloor.s32.bf16 x8, wh8, s0
8684; CHECK-NEXT: nop
8785; CHECK-NEXT: vshuffle x8, x10, x8, r1
8886; CHECK-NEXT: vmin_ge.16 x8, r16, x8, x0, vaddsign1
8987; CHECK-NEXT: vmax_lt.16 x8, r16, x8, x6, vaddsign1
90- ; CHECK-NEXT: vconv.bf16.fp32 x8, cml4
88+ ; CHECK-NEXT: vconv.bf16.fp32 x8, cml3
9189; CHECK-NEXT: vst x8, [p1], #64
9290; CHECK-NEXT: vmul.f dm3, x8, x4, r2
9391; CHECK-NEXT: nop
0 commit comments