diff --git a/codec/common/inc/intra_pred_common.h b/codec/common/inc/intra_pred_common.h index d2fb2921af..7ea8c55735 100644 --- a/codec/common/inc/intra_pred_common.h +++ b/codec/common/inc/intra_pred_common.h @@ -1,6 +1,7 @@ /*! * \copy * Copyright (c) 2009-2013, Cisco Systems + * Copyright (c) 2026, Richard Ben Aleya * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -56,6 +57,10 @@ extern "C" { //for intra-prediction ASM functions void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +#if defined(HAVE_AVX2) +void WelsI16x16LumaPredV_avx2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredH_avx2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +#endif//HAVE_AVX2 #endif//X86_ASM #if defined(HAVE_NEON) diff --git a/codec/common/x86/intra_pred_com.asm b/codec/common/x86/intra_pred_com.asm index b0bd1318b3..066019011d 100644 --- a/codec/common/x86/intra_pred_com.asm +++ b/codec/common/x86/intra_pred_com.asm @@ -1,6 +1,7 @@ ;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems +;* Copyright (c) 2026, Richard Ben Aleya ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without @@ -115,3 +116,46 @@ WELS_EXTERN WelsI16x16LumaPredV_sse2 ret +%ifdef HAVE_AVX2 +;*********************************************************************** +; void WelsI16x16LumaPredV_avx2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsI16x16LumaPredV_avx2 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + vbroadcasti128 ymm0, [r1] + vmovdqu [r0], ymm0 + vmovdqu [r0+32], ymm0 + vmovdqu [r0+64], ymm0 + vmovdqu [r0+96], ymm0 + vmovdqu [r0+128], ymm0 + vmovdqu [r0+160], ymm0 + vmovdqu [r0+192], ymm0 + vmovdqu [r0+224], ymm0 + vzeroupper + ret + +;*********************************************************************** +; void WelsI16x16LumaPredH_avx2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsI16x16LumaPredH_avx2 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + dec r1 + %assign h_row 0 + %rep 16 + %if h_row > 0 + add r1, r2 + %endif + vpbroadcastb xmm0, [r1] + vmovdqa [r0 + h_row * 16], xmm0 + %assign h_row h_row + 1 + %endrep + vzeroupper + ret + +%endif ; HAVE_AVX2 + diff --git a/codec/encoder/core/inc/get_intra_predictor.h b/codec/encoder/core/inc/get_intra_predictor.h index b5da88b774..b6e0e1558e 100644 --- a/codec/encoder/core/inc/get_intra_predictor.h +++ b/codec/encoder/core/inc/get_intra_predictor.h @@ -1,6 +1,7 @@ /*! * \copy * Copyright (c) 2009-2013, Cisco Systems + * Copyright (c) 2026, Richard Ben Aleya * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -109,6 +110,12 @@ void WelsI4x4LumaPredVR_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStri void WelsI4x4LumaPredHD_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); void WelsI4x4LumaPredVL_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); void WelsI4x4LumaPredHU_mmx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); + +#if defined(HAVE_AVX2) +void WelsI16x16LumaPredDc_avx2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredPlane_avx2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsIChromaPredV_avx2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +#endif//HAVE_AVX2 #endif//X86_ASM #if defined(HAVE_NEON) diff --git a/codec/encoder/core/src/get_intra_predictor.cpp b/codec/encoder/core/src/get_intra_predictor.cpp index 331c2275b2..f7a18588ba 100644 --- a/codec/encoder/core/src/get_intra_predictor.cpp +++ b/codec/encoder/core/src/get_intra_predictor.cpp @@ -1,6 +1,7 @@ /*! * \copy * Copyright (c) 2009-2013, Cisco Systems + * Copyright (c) 2026, Richard Ben Aleya * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -719,6 +720,15 @@ void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuF pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_sse2; pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_sse2; } +#if defined(HAVE_AVX2) + if (kuiCpuFlag & WELS_CPU_AVX2) { + pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_avx2; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_avx2; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_avx2; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_avx2; + pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_avx2; + } +#endif//HAVE_AVX2 #endif #if defined(HAVE_MMI) diff --git a/codec/encoder/core/x86/intra_pred.asm b/codec/encoder/core/x86/intra_pred.asm index edb2cd318a..1c501f77b3 100644 --- a/codec/encoder/core/x86/intra_pred.asm +++ b/codec/encoder/core/x86/intra_pred.asm @@ -1,6 +1,7 @@ ;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems +;* Copyright (c) 2026, Richard Ben Aleya ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without @@ -1127,3 +1128,219 @@ WELS_EXTERN WelsI16x16LumaPredDc_sse2 pop r4 pop r3 ret + +%ifdef HAVE_AVX2 + +;*********************************************************************** +; void WelsI16x16LumaPredDc_avx2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsI16x16LumaPredDc_avx2 + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + vmovdqa xmm0, [r1] + vpxor xmm1, xmm1, xmm1 + vpsadbw xmm0, xmm0, xmm1 + vpsrldq xmm1, xmm0, 8 + vpaddw xmm0, xmm0, xmm1 + + movzx r3, byte [r1+r2-0x01] + movzx r4, byte [r1+2*r2-0x01] + add r3, r4 + lea r1, [r1+r2] + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + add r3, 0x10 + vmovd xmm1, r3d + vpaddw xmm0, xmm0, xmm1 + vpsrld xmm0, xmm0, 5 + vpbroadcastb ymm0, xmm0 + + vmovdqu [r0], ymm0 + vmovdqu [r0+32], ymm0 + vmovdqu [r0+64], ymm0 + vmovdqu [r0+96], ymm0 + vmovdqu [r0+128], ymm0 + vmovdqu [r0+160], ymm0 + vmovdqu [r0+192], ymm0 + vmovdqu [r0+224], ymm0 + vzeroupper + pop r4 + pop r3 + ret + +;*********************************************************************** +; void WelsI16x16LumaPredPlane_avx2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsI16x16LumaPredPlane_avx2 + push r3 + push r4 + %assign push_num 2 + INIT_X86_32_PIC r5 + LOAD_3_PARA + PUSH_XMM 6 + SIGN_EXTENSION r2, r2d + sub r1, 1 + sub r1, r2 + + ; Compute H (horizontal gradient of top row) + vpxor xmm7, xmm7, xmm7 + vmovq xmm0, [r1] + vmovdqa xmm5, [pic(sse2_plane_dec)] + vpunpcklbw xmm0, xmm0, xmm7 + vpmullw xmm0, xmm0, xmm5 + vmovq xmm1, [r1 + 9] + vmovdqa xmm6, [pic(sse2_plane_inc)] + vpunpcklbw xmm1, xmm1, xmm7 + vpmullw xmm1, xmm1, xmm6 + vpsubw xmm1, xmm1, xmm0 + + ; Horizontal sum -> H + vpsrldq xmm0, xmm1, 8 + vpaddw xmm1, xmm1, xmm0 + vpsrldq xmm0, xmm1, 4 + vpaddw xmm1, xmm1, xmm0 + vpsrldq xmm0, xmm1, 2 + vpaddw xmm1, xmm1, xmm0 + + vmovd r3d, xmm1 + movsx r3, r3w + imul r3, 5 + add r3, 32 + sar r3, 6 ; b = (5 * H + 32) >> 6 + vmovd xmm1, r3d + vpbroadcastw ymm1, xmm1 ; ymm1 = b (16 words) + + ; Load top[15] + movzx r4, BYTE [r1+16] + + ; Load upper column (rows 0-7 of left border) using VEX instructions + sub r1, 3 + vmovd xmm0, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm0, xmm0, xmm2 + lea r1, [r1+2*r2] + vmovd xmm3, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm3, xmm3, xmm2 + vpunpcklwd xmm0, xmm0, xmm3 + lea r1, [r1+2*r2] + vmovd xmm4, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm4, xmm4, xmm2 + lea r1, [r1+2*r2] + vmovd xmm3, [r1] + vmovd xmm2, [r1+r2] + lea r1, [r1+2*r2] + vpunpcklbw xmm3, xmm3, xmm2 + vpunpcklwd xmm4, xmm4, xmm3 + vpunpckhdq xmm0, xmm0, xmm4 + + ; Get left[15*stride] + add r1, 3 + movzx r3, BYTE [r1+8*r2] + add r4, r3 + shl r4, 4 ; a = (left[15*stride] + top[15]) << 4 + + ; Load lower column (rows 8-15 of left border) using VEX instructions + sub r1, 3 + add r1, r2 + vmovd xmm7, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm7, xmm7, xmm2 + lea r1, [r1+2*r2] + vmovd xmm3, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm3, xmm3, xmm2 + vpunpcklwd xmm7, xmm7, xmm3 + lea r1, [r1+2*r2] + vmovd xmm4, [r1] + vmovd xmm2, [r1+r2] + vpunpcklbw xmm4, xmm4, xmm2 + lea r1, [r1+2*r2] + vmovd xmm3, [r1] + vmovd xmm2, [r1+r2] + lea r1, [r1+2*r2] + vpunpcklbw xmm3, xmm3, xmm2 + vpunpcklwd xmm4, xmm4, xmm3 + vpunpckhdq xmm7, xmm7, xmm4 + + ; Compute V gradient + vpxor xmm4, xmm4, xmm4 + vpunpckhbw xmm0, xmm0, xmm4 + vpmullw xmm0, xmm0, xmm5 + vpunpckhbw xmm7, xmm7, xmm4 + vpmullw xmm7, xmm7, xmm6 + vpsubw xmm7, xmm7, xmm0 + + ; Horizontal sum -> V + vpsrldq xmm0, xmm7, 8 + vpaddw xmm7, xmm7, xmm0 + vpsrldq xmm0, xmm7, 4 + vpaddw xmm7, xmm7, xmm0 + vpsrldq xmm0, xmm7, 2 + vpaddw xmm7, xmm7, xmm0 + + vmovd r3d, xmm7 + movsx r3, r3w + imul r3, 5 + add r3, 32 + sar r3, 6 ; c = (5 * V + 32) >> 6 + vmovd xmm4, r3d + vpbroadcastw ymm4, xmm4 ; ymm4 = c (16 words) + + ; s = a + 16 + (-7)*c + add r4, 16 + imul r3, -7 + add r3, r4 + vmovd xmm0, r3d + vpbroadcastw ymm0, xmm0 ; ymm0 = s (16 words) + + ; Build 256-bit multiplier: [-7,-6,-5,-4,-3,-2,-1,0 | 1,2,3,4,5,6,7,8] + vmovdqa xmm5, [pic(sse2_plane_inc_minus)] + vinserti128 ymm5, ymm5, [pic(sse2_plane_inc)], 1 + + xor r3, r3 +.loop_plane_avx2: + vpmullw ymm2, ymm1, ymm5 ; b * [-7..8] + vpaddw ymm2, ymm2, ymm0 ; + s + vpsraw ymm2, ymm2, 5 ; >> 5 + vpackuswb ymm2, ymm2, ymm2 ; pack to bytes (within each lane) + vpermq ymm2, ymm2, 0x08 ; gather low qwords from each lane + vmovdqu [r0], xmm2 ; store 16 bytes + vpaddw ymm0, ymm0, ymm4 ; s += c + add r0, 16 + inc r3 + cmp r3, 16 + jnz .loop_plane_avx2 + + vzeroupper + POP_XMM + DEINIT_X86_32_PIC + pop r4 + pop r3 + ret + +;*********************************************************************** +; void WelsIChromaPredV_avx2(uint8_t *pred, uint8_t *pRef, int32_t stride); +;*********************************************************************** +WELS_EXTERN WelsIChromaPredV_avx2 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + vpbroadcastq ymm0, [r1] + vmovdqu [r0], ymm0 + vmovdqu [r0+32], ymm0 + vzeroupper + ret + +%endif ; HAVE_AVX2