diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bbe32aee10db5..c7e0de350593b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3497,6 +3497,18 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;
 
+  if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
+    // Check whether a vXi8 multiply by a splat constant C can be decomposed
+    // into two shifts plus an add/sub: |C| = 2^(a+b) ± 2^b, so that x * C
+    // becomes ((x << a) ± x) << b. As in DAGCombiner::visitMUL, treat the
+    // constant 2 as decomposable (2 = 2^0 + 1).
+    APInt ShiftedMulC = MulC.abs();
+    unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
+    ShiftedMulC.lshrInPlace(TZeros);
+    if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
+      return true;
+  }
+
   // Find the type this will be legalized too. Otherwise we might prematurely
   // convert this to shl+add/sub and then still have to type legalize those ops.
   // Another choice would be to defer the decision for illegal types until
diff --git a/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
new file mode 100644
index 0000000000000..9b47d7de4b0ed
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
@@ -0,0 +1,1769 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+
+;; Tests vXi8 constant-multiply decomposition into shift/add/sub sequences.
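+;;
+;; A hand-worked sketch of the check added to decomposeMulByConstant above
+;; (illustrative commentary, not part of the autogenerated assertions):
+;; strip the trailing zero bits of |C|, then test whether the remainder is
+;; one away from a power of two.
+;;   C = 24:  24 = 0b11000, TZeros = 3, 24 >> 3 = 3, and 3 - 1 = 2 is a
+;;            power of two, so x * 24 lowers to (x << 4) + (x << 3).
+;;   C = -12: |-12| = 0b1100, TZeros = 2, 12 >> 2 = 3 as above; the
+;;            negation folds into a final subtract from zero.
+;;   C = 11:  0b1011 has no trailing zeros, and neither 10 nor 12 is a
+;;            power of two, so 11 remains a plain vector multiply.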
+;; +;; Examples: +;; 6 = 2^2 + 2^1 = 4 + 2 (or 8 - 2) +;; 10 = 2^3 + 2^1 = 8 + 2 +;; 12 = 2^3 + 2^2 = 8 + 4 (or 16 - 4) +;; 18 = 2^4 + 2^1 = 16 + 2 +;; 20 = 2^4 + 2^2 = 16 + 4 +;; 24 = 2^4 + 2^3 = 16 + 8 (or 32 - 8) +;; +;; To run this test: +;; llvm-lit llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll +;; +;; To regenerate CHECK lines: +;; python llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll + +;; ============================================================================ +;; v16i8 Tests (128-bit vectors) - Sum of two powers of 2 +;; ============================================================================ + +; Test multiply by 6 = 4 + 2 = (1 << 2) + (1 << 1) +define <16 x i8> @mul_v16i8_const6(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 6) + ret <16 x i8> %result +} + +; Test multiply by 10 = 8 + 2 = (1 << 3) + (1 << 1) +define <16 x i8> @mul_v16i8_const10(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const10: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const10: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 10) + ret <16 x i8> %result +} + +; Test multiply by 12 = 8 + 4 = (1 << 3) + (1 << 2) +define <16 x i8> @mul_v16i8_const12(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const12: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $3, 
%xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 12) + ret <16 x i8> %result +} + +; Test multiply by 18 = 16 + 2 = (1 << 4) + (1 << 1) +define <16 x i8> @mul_v16i8_const18(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const18: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const18: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 18) + ret <16 x i8> %result +} + +; Test multiply by 20 = 16 + 4 = (1 << 4) + (1 << 2) +define <16 x i8> @mul_v16i8_const20(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const20: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const20: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 20) + ret <16 x i8> %result +} + +; Test multiply by 24 = 16 + 8 = (1 << 4) + (1 << 3) +define <16 x i8> @mul_v16i8_const24(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 24) + ret <16 x i8> %result +} + +; Test multiply by 34 = 32 + 2 = (1 << 
5) + (1 << 1) +define <16 x i8> @mul_v16i8_const34(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const34: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const34: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const34: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 34) + ret <16 x i8> %result +} + +; Test multiply by 36 = 32 + 4 = (1 << 5) + (1 << 2) +define <16 x i8> @mul_v16i8_const36(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const36: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const36: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const36: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 36) + ret <16 x i8> %result +} + +; Test multiply by 40 = 32 + 8 = (1 << 5) + (1 << 3) +define <16 x i8> @mul_v16i8_const40(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const40: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const40: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const40: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 40) + ret <16 x i8> %result +} + +; Test multiply by 48 = 32 + 16 = (1 << 5) + (1 << 4) +define <16 x i8> @mul_v16i8_const48(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const48: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; 
SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const48: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const48: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 48) + ret <16 x i8> %result +} + +;; ============================================================================ +;; v16i8 Tests (128-bit vectors) - Difference of two powers of 2 +;; ============================================================================ + +; Test multiply by 14 = 16 - 2 = (1 << 4) - (1 << 1) +define <16 x i8> @mul_v16i8_const14(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const14: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const14: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const14: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 14) + ret <16 x i8> %result +} + +; Test multiply by 30 = 32 - 2 = (1 << 5) - (1 << 1) +define <16 x i8> @mul_v16i8_const30(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const30: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $5, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const30: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const30: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 30) + ret <16 x i8> %result +} + +; Test multiply by 60 = 64 - 4 = (1 << 6) - (1 << 2) +define <16 x i8> @mul_v16i8_const60(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const60: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $6, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const60: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vpsllw $2, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const60: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 60) + ret <16 x i8> %result +} + +; Test multiply by 96 = 64 + 32 = (1 << 6) + (1 << 5) +define <16 x i8> @mul_v16i8_const96(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const96: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $6, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const96: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const96: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 96) + ret <16 x i8> %result +} + +; Test multiply by 160 = 128 + 32 = (1 << 7) + (1 << 5) +define <16 x i8> @mul_v16i8_const160(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const160: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $6, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const160: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const160: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $6, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 160) + ret <16 x i8> %result +} + +;; ============================================================================ +;; v16i8 Tests (128-bit vectors) - Negative constants +;; ============================================================================ + +; Test multiply by -6 = -(4 + 2) +define <16 x i8> @mul_v16i8_const_neg6(<16 x 
i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const_neg6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const_neg6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const_neg6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 250) + ret <16 x i8> %result +} + +; Test multiply by -10 = -(8 + 2) +define <16 x i8> @mul_v16i8_const_neg10(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const_neg10: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const_neg10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const_neg10: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 246) + ret <16 x i8> %result +} + +; Test multiply by -12 = -(8+4) +define <16 x i8> @mul_v16i8_const_neg12(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const_neg12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const_neg12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const_neg12: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 244) + ret <16 x i8> %result +} + +; Test multiply by -24 = -(8 + 16) +define <16 x i8> @mul_v16i8_const_neg24(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_const_neg24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_const_neg24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_const_neg24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 232) + ret <16 x i8> %result +} + +;; ============================================================================ +;; v32i8 Tests (256-bit vectors) +;; ============================================================================ + +; Test multiply by 6 = 4 + 2 +define <32 x i8> @mul_v32i8_const6(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 6) + ret <32 x i8> %result +} + +; Test multiply by 10 = 8 + 2 +define <32 x i8> @mul_v32i8_const10(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const10: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllw $3, %xmm2 
+; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const10: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 10) + ret <32 x i8> %result +} + +; Test multiply by 12 = 8 + 4 +define <32 x i8> @mul_v32i8_const12(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const12: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 12) + ret <32 x i8> %result +} + +; Test multiply by 20 = 16 + 4 +define <32 x i8> @mul_v32i8_const20(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const20: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const20: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; 
AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 20) + ret <32 x i8> %result +} + +; Test multiply by 24 = 16 + 8 +define <32 x i8> @mul_v32i8_const24(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllw $3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %ymm0, %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 24) + ret <32 x i8> %result +} + +; Test multiply by -6 = -(4 + 2) +define <32 x i8> @mul_v32i8_const_neg6(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const_neg6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const_neg6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const_neg6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 250) + ret <32 x i8> %result +} + +; Test multiply by -12 = -(8 + 4) +define <32 x i8> @mul_v32i8_const_neg12(<32 x i8> %a) nounwind { +; SSE2-LABEL: mul_v32i8_const_neg12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm4, %xmm2 +; 
SSE2-NEXT: psllw $3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v32i8_const_neg12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v32i8_const_neg12: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = mul <32 x i8> %a, splat (i8 244) + ret <32 x i8> %result +} + +;; ============================================================================ +;; v64i8 Tests (512-bit vectors) +;; ============================================================================ + +; Test multiply by 6 = 4 + 2 +define <64 x i8> @mul_v64i8_const6(<64 x i8> %a) nounwind { +; SSE2-LABEL: mul_v64i8_const6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: paddb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: paddb %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: paddb %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v64i8_const6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpsllw $2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v64i8_const6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm1 +; AVX512-NEXT: vpsllw $2, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %result = mul <64 x i8> %a, splat (i8 6) + ret <64 x i8> %result +} + +; Test multiply 
by 10 = 8 + 2 +define <64 x i8> @mul_v64i8_const10(<64 x i8> %a) nounwind { +; SSE2-LABEL: mul_v64i8_const10: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw $3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: paddb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psllw $3, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psllw $3, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: paddb %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psllw $3, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: paddb %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v64i8_const10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v64i8_const10: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm1 +; AVX512-NEXT: vpsllw $3, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %result = mul <64 x i8> %a, splat (i8 10) + ret <64 x i8> %result +} + +; Test multiply by 20 = 16 + 4 +define <64 x i8> @mul_v64i8_const20(<64 x i8> %a) nounwind { +; SSE2-LABEL: mul_v64i8_const20: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: paddb %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: paddb %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: paddb %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: paddb %xmm5, %xmm3 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v64i8_const20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; 
AVX2-NEXT: vpsllw $2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v64i8_const20: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %zmm0, %zmm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512-NEXT: vpsllw $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %result = mul <64 x i8> %a, splat (i8 20) + ret <64 x i8> %result +} + +; Test multiply by 24 = 16 + 8 +define <64 x i8> @mul_v64i8_const24(<64 x i8> %a) nounwind { +; SSE2-LABEL: mul_v64i8_const24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psllw $3, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: paddb %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psllw $3, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: paddb %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psllw $3, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: paddb %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psllw $3, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: paddb %xmm5, %xmm3 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v64i8_const24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v64i8_const24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $3, %zmm0, %zmm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512-NEXT: vpsllw $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %result = mul <64 x i8> %a, splat (i8 24) + ret <64 x i8> %result +} + +;; ============================================================================ +;; Non-decomposable / edge-case constants +;; ============================================================================ + +; Test multiply by 11 (not decomposable) +define <16 x i8> @mul_v16i8_11(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_11: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[11,u,11,u,11,u,11,u,11,u,11,u,11,u,11,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_11: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_11: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 11) + ret <16 x i8> %result +} + +; Test multiply by 13 (not decomposable) +define <16 x i8> @mul_v16i8_13(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_13: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [13,u,13,u,13,u,13,u,13,u,13,u,13,u,13,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_13: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_13: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> 
%a, splat (i8 13) + ret <16 x i8> %result +} + +; Test multiply by 19 (not decomposable) +define <16 x i8> @mul_v16i8_19(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_19: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [19,u,19,u,19,u,19,u,19,u,19,u,19,u,19,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_19: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_19: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 19) + ret <16 x i8> %result +} + +; Test multiply by 23 (not decomposable) +define <16 x i8> @mul_v16i8_23(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_23: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [23,u,23,u,23,u,23,u,23,u,23,u,23,u,23,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_23: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_23: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 23) + ret <16 x i8> %result +} + +; Test multiply by 29 (not decomposable) +define <16 x i8> @mul_v16i8_29(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_29: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [29,u,29,u,29,u,29,u,29,u,29,u,29,u,29,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_29: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_29: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 29) + ret <16 x i8> %result +} + +; Test multiply by 37 (not decomposable) +define <16 x i8> @mul_v16i8_37(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_37: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [37,u,37,u,37,u,37,u,37,u,37,u,37,u,37,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_37: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_37: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 37) + ret <16 x i8> %result +} + +; Test multiply by 41 (not decomposable) +define <16 x i8> @mul_v16i8_41(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_41: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [41,u,41,u,41,u,41,u,41,u,41,u,41,u,41,u] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_41: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_41: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41] +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, splat (i8 41) + ret <16 x i8> %result +} + +; Test special cases: multiply by 0 and 1 + +; Test multiply by 0 (should be optimized to zero vector) +define <16 x i8> @mul_v16i8_0(<16 x i8> %a) nounwind { +; SSE2-LABEL: mul_v16i8_0: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: mul_v16i8_0: +; AVX2: # %bb.0: +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v16i8_0: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + %result = mul <16 x i8> %a, zeroinitializer + ret <16 x i8> %result +} + +; Test multiply by 1 (should be optimized to identity) +define <16 x i8> @mul_v16i8_1(<16 x i8> %a) nounwind { +; CHECK-LABEL: 
+; Test special cases: multiply by 0 and 1
+
+; Test multiply by 0 (should be optimized to a zero vector)
+define <16 x i8> @mul_v16i8_0(<16 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v16i8_0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, zeroinitializer
+ ret <16 x i8> %result
+}
+
+; Test multiply by 1 (should be optimized to the identity)
+define <16 x i8> @mul_v16i8_1(<16 x i8> %a) nounwind {
+; CHECK-LABEL: mul_v16i8_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+ %result = mul <16 x i8> %a, splat (i8 1)
+ ret <16 x i8> %result
+}
+
+; Test non-uniform (non-splat) vectors - the optimization should only apply to uniform constants
+
+; All constants are individually decomposable (3, 5, 7), but the vector is non-uniform
+define <16 x i8> @mul_v16i8_non_uniform_all_decomposable(<16 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v16i8_non_uniform_all_decomposable:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,u,3,u,5,u,7,u,3,u,5,u,7,u,3,u]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3,u,5,u,7,u,3,u,5,u,7,u,3,u,5,u]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_non_uniform_all_decomposable:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [3,5,7,3,5,7,3,5,7,3,5,7,3,5,7,3]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_non_uniform_all_decomposable:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [3,5,7,3,5,7,3,5,7,3,5,7,3,5,7,3]
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 3, i8 5, i8 7, i8 3, i8 5, i8 7, i8 3, i8 5, i8 7, i8 3, i8 5, i8 7, i8 3, i8 5, i8 7, i8 3>
+ ret <16 x i8> %result
+}
+
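+;; Although each lane above is individually decomposable (3 = 2 + 1,
+;; 5 = 4 + 1, 7 = 8 - 1), the decomposition only fires for splat constants,
+;; so the non-uniform multiply keeps the generic lowering.
+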
+; Mixed decomposable and non-decomposable constants
+define <16 x i8> @mul_v16i8_non_uniform_mixed(<16 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v16i8_non_uniform_mixed:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [15,u,29,u,17,u,37,u,31,u,41,u,33,u,11,u]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3,u,11,u,5,u,13,u,7,u,19,u,9,u,23,u]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_non_uniform_mixed:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [3,11,5,13,7,19,9,23,15,29,17,37,31,41,33,11]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_non_uniform_mixed:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [3,11,5,13,7,19,9,23,15,29,17,37,31,41,33,11]
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 3, i8 11, i8 5, i8 13, i8 7, i8 19, i8 9, i8 23, i8 15, i8 29, i8 17, i8 37, i8 31, i8 41, i8 33, i8 11>
+ ret <16 x i8> %result
+}
+
+; Different powers of 2 (non-uniform)
+define <16 x i8> @mul_v16i8_non_uniform_powers_of_2(<16 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v16i8_non_uniform_powers_of_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,8,u,16,u,32,u,64,u,128,u,2,u,4,u]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,u,4,u,8,u,16,u,32,u,64,u,128,u,2,u]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_non_uniform_powers_of_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,4,8,16,32,64,128,2,4,8,16,32,64,128,2,4]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_non_uniform_powers_of_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,2,3,4,5,6,7,1,2,3,4,5,6,7,1,2]
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 2, i8 4>
+ ret <16 x i8> %result
+}
+
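+;; A non-uniform vector of pure powers of two is still lowered to per-lane
+;; shifts where the target supports them: with AVX-512BW the multiply above
+;; becomes a vpsllvw by [1,2,3,4,5,6,7,...] on the zero-extended words.
+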
+; Test v32i8 non-decomposable cases (AVX2)
+
+; Test v32i8 multiply by 11 (not decomposable)
+define <32 x i8> @mul_v32i8_11(<32 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v32i8_11:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [11,u,11,u,11,u,11,u,11,u,11,u,11,u,11,u]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8_11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11,0,11]
+; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v32i8_11:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: retq
+ %result = mul <32 x i8> %a, splat (i8 11)
+ ret <32 x i8> %result
+}
+
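+;; 11 +/- 1 (10, 12) and 13 +/- 1 (12, 14) are not powers of two, so these
+;; v32i8 splats also keep the widening multiply.
+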
+; Test v32i8 multiply by 13 (not decomposable)
+define <32 x i8> @mul_v32i8_13(<32 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v32i8_13:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [13,u,13,u,13,u,13,u,13,u,13,u,13,u,13,u]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8_13:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13]
+; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v32i8_13:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: retq
+ %result = mul <32 x i8> %a, splat (i8 13)
+ ret <32 x i8> %result
+}
+
+; Test v32i8 multiply by 0
+define <32 x i8> @mul_v32i8_0(<32 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v32i8_0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8_0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v32i8_0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <32 x i8> %a, zeroinitializer
+ ret <32 x i8> %result
+}
+
+; Test v32i8 multiply by 1
+define <32 x i8> @mul_v32i8_1(<32 x i8> %a) nounwind {
+; CHECK-LABEL: mul_v32i8_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+ %result = mul <32 x i8> %a, splat (i8 1)
+ ret <32 x i8> %result
+}
+
+; Test v32i8 non-uniform vector
+define <32 x i8> @mul_v32i8_non_uniform(<32 x i8> %a) nounwind {
+; SSE2-LABEL: mul_v32i8_non_uniform:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3,u,5,u,7,u,11,u,3,u,5,u,7,u,11,u]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8_non_uniform:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,5,0,11,0,5,0,11,0,5,0,11,0,5,0,11,0,5,0,11,0,5,0,11,0,5,0,11,0,5,0,11]
+; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v32i8_non_uniform:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11,3,5,7,11]
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: retq
+ %result = mul <32 x i8> %a, <i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11, i8 3, i8 5, i8 7, i8 11>
+ ret <32 x i8> %result
+}
+
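+;; The constants here repeat <3, 5, 7, 11> per lane; besides being non-splat,
+;; the vector contains 11, which is not decomposable on its own, so this case
+;; remains a full multiply.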