[X86] Fold EXPAND(X,Y,M) -> SELECT(M,X,Y) when M is a lowest bit mask #179630
Conversation
If an EXPAND node's mask is just the lowest bits, then we can replace it with a more general SELECT node, which can be cheaper and potentially allow predication. Fixes llvm#179008
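To make the equivalence concrete, here is a small standalone sketch (not code from this patch; the element type, array size and helper names are illustrative assumptions) showing that when the mask is an unbroken run of ones starting at bit 0, a reference expand and a per-lane select produce the same result:

// Standalone sketch, not the patch itself: element type, lane count and names
// are assumptions for illustration only. It shows why EXPAND(X, Y, M) equals
// SELECT(M, X, Y) when M is a lowest-bit mask such as 0b00011111.
#include <array>
#include <cstddef>
#include <cstdint>

template <size_t N>
std::array<uint64_t, N> expandRef(const std::array<uint64_t, N> &Src,
                                  const std::array<uint64_t, N> &PassThru,
                                  uint64_t Mask) {
  std::array<uint64_t, N> Res = PassThru;
  size_t SrcIdx = 0;
  for (size_t I = 0; I < N; ++I)
    if (Mask & (1ULL << I))
      Res[I] = Src[SrcIdx++]; // consecutive source elements fill the set lanes
  return Res;
}

template <size_t N>
std::array<uint64_t, N> selectRef(const std::array<uint64_t, N> &Src,
                                  const std::array<uint64_t, N> &PassThru,
                                  uint64_t Mask) {
  std::array<uint64_t, N> Res;
  for (size_t I = 0; I < N; ++I)
    Res[I] = (Mask & (1ULL << I)) ? Src[I] : PassThru[I]; // per-lane blend
  return Res;
}

// With Mask = 0b00011111 every set lane I satisfies SrcIdx == I (the run of
// ones is unbroken from bit 0), so both functions return the same vector.

The patch gates the fold on APInt::isMask(), which is true exactly for these non-empty runs of ones starting at the least significant bit.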
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: If an EXPAND node's mask is just the lowest bits, then we can replace it with a more general SELECT node, which can be cheaper and potentially allow predication. Fixes #179008 Full diff: https://github.com/llvm/llvm-project/pull/179630.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b2fac92676eaa..368cdeb6d0fce 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43476,6 +43476,15 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return SDValue();
}
+ case X86ISD::EXPAND: {
+ SDValue ExpVec = N.getOperand(0);
+ SDValue PassThru = N.getOperand(1);
+ SDValue ExpMask = N.getOperand(2);
+ if (auto *Msk = dyn_cast<ConstantSDNode>(peekThroughBitcasts(ExpMask)))
+ if (Msk->getAPIntValue().isMask())
+ return DAG.getSelect(DL, VT, ExpMask, ExpVec, PassThru);
+ return SDValue();
+ }
case X86ISD::VPERMV: {
// Combine VPERMV to VPERMV3 if the source operand can be freely split.
SmallVector<int, 32> Mask;
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
index 97ca0d88b7d4d..28ff1b3e2da0d 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -243,7 +243,7 @@ define i16 @PR90356(<16 x i1> %a) {
; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; CHECK-NEXT: movb $63, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/legalize-vec-assertzext.ll b/llvm/test/CodeGen/X86/legalize-vec-assertzext.ll
index 2cf37c68b8b40..1799dd3832aad 100644
--- a/llvm/test/CodeGen/X86/legalize-vec-assertzext.ll
+++ b/llvm/test/CodeGen/X86/legalize-vec-assertzext.ll
@@ -23,7 +23,7 @@ define i64 @widen_assertzext(ptr %x) nounwind {
; CHECK-NEXT: callq test2@PLT
; CHECK-NEXT: movb $127, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: popq %rcx
@@ -41,7 +41,7 @@ define i64 @widen_assertzext_range_attr(ptr %x) nounwind {
; CHECK-NEXT: callq test2@PLT
; CHECK-NEXT: movb $127, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: popq %rcx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index ed1d9632d272d..3edc6a33f025f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -1062,7 +1062,7 @@ define <8 x i64> @PR179008(ptr %p0) {
; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: movb $31, %cl
; X86-AVX512F-NEXT: kmovw %ecx, %k1
-; X86-AVX512F-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} {z}
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: PR179008:
@@ -1070,21 +1070,21 @@ define <8 x i64> @PR179008(ptr %p0) {
; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: movb $31, %cl
; X86-AVX512BW-NEXT: kmovd %ecx, %k1
-; X86-AVX512BW-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT: vmovdqu64 (%eax), %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: PR179008:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: movb $31, %al
; X64-AVX512F-NEXT: kmovw %eax, %k1
-; X64-AVX512F-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: PR179008:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: movb $31, %al
; X64-AVX512BW-NEXT: kmovd %eax, %k1
-; X64-AVX512BW-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
%load = load <8 x i64>, ptr %p0, align 1
%shuf = shufflevector <8 x i64> %load, <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
phoebewang left a comment:
LGTM.
 ; CHECK-NEXT: vpmovb2m %xmm0, %k1
 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; CHECK-NEXT: movb $63, %al
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT: kmovd %k0, %eax
 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT: vzeroupper
Tangent: these seem redundant; we could leverage scalar instructions instead, e.g.:
vpmovmskb %xmm0, %eax
andw $4095, %ax
Yes - lower1BitShuffle is missing basic AND blend-masking support that would avoid all the extensions etc., and this should probably eventually simplify to a movmsk.
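For illustration only (this is one reading of the suggestion above, not code from the PR or from lower1BitShuffle; the helper name and the 16-lane width are assumptions): once the i1 lanes are in a GPR via a movmsk-style extraction, blending them with zero so that only the low lanes survive collapses to a single scalar AND.

// Hedged sketch of the "AND blend masking" idea: the function name and lane
// count are assumptions for illustration, not existing lowering code.
#include <cstdint>

// Blend a 16-lane i1 mask with zero, keeping lanes [0, NumLowLanes).
uint16_t blendLowLanesWithZero(uint16_t LaneMask, unsigned NumLowLanes) {
  uint16_t LowMask = static_cast<uint16_t>((1u << NumLowLanes) - 1);
  return LaneMask & LowMask; // the whole <16 x i1> blend becomes one AND
}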