From fa97d8308a71197095d8fd35a7dc1ee58184eabc Mon Sep 17 00:00:00 2001 From: Zuoqiang He Date: Mon, 18 May 2026 08:27:33 +0800 Subject: [PATCH] feat: add ARM NEON optimization for startcode prefix search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SIMD fast filter: load 64 bytes, use ext to create adjacent byte pairs, detect consecutive 0x00 bytes via orr+umin+uminv reduction. No candidate pair → skip entire 64B block (~97% filtered); found → fall back to precise C scan for 0x000001. Performance (C: 8268.15 → NEON: 501.80 cycles/iter, ~16.5x) Refactor into pfDetectStartCodePrefix function pointer dispatch, selecting C/NEON at runtime based on CPU capabilities. Add comprehensive decoder unit tests for startcode detection. --- .../core/arm64/au_parser_aarch64_neon.S | 123 +++++++++++++ codec/decoder/core/inc/au_parser.h | 10 +- codec/decoder/core/inc/decoder_context.h | 4 + codec/decoder/core/src/au_parser.cpp | 11 +- codec/decoder/core/src/decoder.cpp | 8 +- codec/decoder/meson.build | 1 + test/PMUTimer.h | 110 ++++++++++++ test/decoder/DecUT_StartCode.cpp | 164 ++++++++++++++++++ test/decoder/meson.build | 1 + 9 files changed, 426 insertions(+), 6 deletions(-) create mode 100644 codec/decoder/core/arm64/au_parser_aarch64_neon.S create mode 100644 test/PMUTimer.h create mode 100644 test/decoder/DecUT_StartCode.cpp diff --git a/codec/decoder/core/arm64/au_parser_aarch64_neon.S b/codec/decoder/core/arm64/au_parser_aarch64_neon.S new file mode 100644 index 0000000000..6e0ec1a3be --- /dev/null +++ b/codec/decoder/core/arm64/au_parser_aarch64_neon.S @@ -0,0 +1,123 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__) +#include "arm_arch64_common_macro.S" + +.extern DetectStartCodePrefixC + +WELS_ASM_AARCH64_FUNC_BEGIN DetectStartCodePrefixNEON + cmp w2, #79 + b.gt .L_init + b DetectStartCodePrefixC + +.L_init: + stp x29, x30, [sp, #-32]! + stp x20, x19, [sp, #16] + mov x29, sp + mov x9, #0 + add x8, x0, w2, sxtw + sub x10, x8, #80 + b .L_simd + +.L_advance: + add x9, x9, #64 + add x8, x0, x9 + cmp x8, x10 + b.hi .L_fallback + +.L_simd: + add x8, x0, x9 + ldp q1, q2, [x8] + ldp q3, q0, [x8, #32] + ext v4.16b, v0.16b, v1.16b, #15 + ext v5.16b, v1.16b, v2.16b, #15 + ext v6.16b, v2.16b, v3.16b, #15 + ext v7.16b, v3.16b, v0.16b, #15 + orr v1.16b, v4.16b, v1.16b + orr v2.16b, v5.16b, v2.16b + orr v3.16b, v6.16b, v3.16b + orr v4.16b, v7.16b, v0.16b + umin v1.16b, v1.16b, v2.16b + umin v2.16b, v3.16b, v4.16b + umin v1.16b, v1.16b, v2.16b + uminv b1, v1.16b + fmov w11, s1 + cbz w11, .L_call_c + + umov w8, v0.b[15] + cbnz w8, .L_advance + + add x8, x0, x9 + ldrb w11, [x8, #64] + cbnz w11, .L_advance + + ldrb w8, [x8, #65] + cmp w8, #1 + b.ne .L_advance + + add w8, w9, #66 + str w8, [x1] + add x0, x0, w8, sxtw + +.L_return: + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + +.L_call_c: + mov x19, x0 + mov x0, x8 + mov x20, x1 + mov w2, #65 + bl DetectStartCodePrefixC + cbnz x0, .L_found + b .L_return + +.L_fallback: + mov x19, x0 + sub x0, x8, #1 + sub w8, w2, w9 + add w2, w8, #1 + mov x20, x1 + bl DetectStartCodePrefixC + cbz x0, .L_return + +.L_found: + sub w8, w0, w19 + str w8, [x20] + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + +WELS_ASM_AARCH64_FUNC_END + +#endif diff --git a/codec/decoder/core/inc/au_parser.h b/codec/decoder/core/inc/au_parser.h index 8a233af90a..ddc9ce50c2 100644 --- a/codec/decoder/core/inc/au_parser.h +++ b/codec/decoder/core/inc/au_parser.h @@ -48,9 +48,10 @@ namespace WelsDec { +extern "C" { /*! ************************************************************************************* - * \brief Start Code Prefix (0x 00 00 00 01) detection + * \brief Start Code Prefix (0x 00 00 00 01) detection - C implementation * * \param pBuf bitstream payload buffer * \param pOffset offset between NAL rbsp and original bitsteam that @@ -62,7 +63,12 @@ namespace WelsDec { * \note N/A ************************************************************************************* */ -uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize); +uint8_t* DetectStartCodePrefixC (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize); + +#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__) +uint8_t* DetectStartCodePrefixNEON (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize); +#endif +} /*! ************************************************************************************* diff --git a/codec/decoder/core/inc/decoder_context.h b/codec/decoder/core/inc/decoder_context.h index fe825058eb..8bf1aa8d51 100644 --- a/codec/decoder/core/inc/decoder_context.h +++ b/codec/decoder/core/inc/decoder_context.h @@ -145,6 +145,8 @@ typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const typedef void (*PGetIntraPred8x8Func) (uint8_t* pPred, const int32_t kiLumaStride, bool bTLAvail, bool bTRAvail); +typedef uint8_t* (*PDetectStartCodePrefixFunc) (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize); + /**/ typedef struct TagRefPic { PPicture pRefList[LIST_A][MAX_DPB_COUNT]; // reference picture marking plus FIFO scheme @@ -465,6 +467,8 @@ typedef struct TagWelsDecoderContext { /* For Block */ SBlockFunc sBlockFunc; + PDetectStartCodePrefixFunc pfDetectStartCodePrefix; + int32_t iCurSeqIntervalTargetDependId; int32_t iCurSeqIntervalMaxPicWidth; int32_t iCurSeqIntervalMaxPicHeight; diff --git a/codec/decoder/core/src/au_parser.cpp b/codec/decoder/core/src/au_parser.cpp index 63a882e19d..347a6f1493 100644 --- a/codec/decoder/core/src/au_parser.cpp +++ b/codec/decoder/core/src/au_parser.cpp @@ -49,9 +49,16 @@ #define _PARSE_NALHRD_VCLHRD_PARAMS_ 1 namespace WelsDec { + +// External declarations for NEON-optimized version +#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__) +extern "C" uint8_t* DetectStartCodePrefixNEON(const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize); +#endif + /*! ************************************************************************************* - * \brief Start Code Prefix (0x 00 00 00 01) detection + * \brief C implementation of Start Code Prefix (0x 00 00 00 01) detection + * (extern "C" to allow calling from assembly) * * \param pBuf bitstream payload buffer * \param pOffset offset between NAL rbsp and original bitsteam that @@ -63,7 +70,7 @@ namespace WelsDec { * \note N/A ************************************************************************************* */ -uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize) { +uint8_t* DetectStartCodePrefixC (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize) { uint8_t* pBits = (uint8_t*)kpBuf; do { diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp index f52f97927e..1a73dd321a 100644 --- a/codec/decoder/core/src/decoder.cpp +++ b/codec/decoder/core/src/decoder.cpp @@ -756,8 +756,8 @@ int32_t WelsDecodeBs (PWelsDecoderContext pCtx, const uint8_t* kpBsBuf, const in uint8_t* pNalPayload = NULL; - if (NULL == DetectStartCodePrefix (kpBsBuf, &iOffset, - kiBsLen)) { //CAN'T find the 00 00 01 start prefix from the source buffer + if (NULL == pCtx->pfDetectStartCodePrefix (kpBsBuf, &iOffset, + kiBsLen)) { //CAN'T find the 00 00 01 start prefix from the source buffer pCtx->iErrorCode |= dsBitstreamError; return dsBitstreamError; } @@ -1054,6 +1054,8 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) { pCtx->pIdctResAddPredFunc8x8 = IdctResAddPred8x8_c; + pCtx->pfDetectStartCodePrefix = DetectStartCodePrefixC; + #if defined(HAVE_NEON) if (uiCpuFlag & WELS_CPU_NEON) { pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; @@ -1108,6 +1110,8 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) { pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_AArch64_neon; pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDc_AArch64_neon; pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_AArch64_neon; + + pCtx->pfDetectStartCodePrefix = DetectStartCodePrefixNEON; } #endif//HAVE_NEON_AARCH64 diff --git a/codec/decoder/meson.build b/codec/decoder/meson.build index 23352ff8dc..02650f7359 100644 --- a/codec/decoder/meson.build +++ b/codec/decoder/meson.build @@ -43,6 +43,7 @@ elif cpu_family == 'aarch64' asm_sources = [ 'core/arm64/block_add_aarch64_neon.S', 'core/arm64/intra_pred_aarch64_neon.S', + 'core/arm64/au_parser_aarch64_neon.S', ] if use_asm_gen objs_asm = asm_gen.process(asm_sources) diff --git a/test/PMUTimer.h b/test/PMUTimer.h new file mode 100644 index 0000000000..26e903ff32 --- /dev/null +++ b/test/PMUTimer.h @@ -0,0 +1,110 @@ +/*! + * \copy + * Copyright (c) 2026, OpenH264 Contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * + * \file PMUTimer.h + * + * \brief PMU performance measurement utility + * + ************************************************************************************* + */ + +#ifndef PMU_TIMER_H +#define PMU_TIMER_H + +#include +#include +#include +#include +#include + +class PMUTimer { +public: + enum Mode { + INSTRUCTIONS, + CPU_CYCLES + }; + + PMUTimer(Mode mode = INSTRUCTIONS) : fd_(-1), mode_(mode) { + init(); + } + + ~PMUTimer() { + if (fd_ >= 0) + close(fd_); + } + + bool is_available() const { return fd_ >= 0; } + + void start() { + if (fd_ >= 0) { + ioctl(fd_, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0); + } + } + + void stop() { + if (fd_ >= 0) { + ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); + } + } + + uint64_t read() { + uint64_t value = 0; + if (fd_ >= 0) { + ::read(fd_, &value, sizeof(value)); + } else { + // Fallback to cntvct_el0 + __asm__ volatile("isb\nmrs %0, cntvct_el0" : "=r"(value) :: "memory"); + } + return value; + } + + const char* mode_name() const { + return mode_ == INSTRUCTIONS ? "instructions" : "cycles"; + } + +private: + void init() { + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .config = mode_ == INSTRUCTIONS ? PERF_COUNT_HW_INSTRUCTIONS : PERF_COUNT_HW_CPU_CYCLES, + .disabled = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + }; + fd_ = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + } + + int fd_; + Mode mode_; +}; + +#endif /* PMU_TIMER_H */ \ No newline at end of file diff --git a/test/decoder/DecUT_StartCode.cpp b/test/decoder/DecUT_StartCode.cpp new file mode 100644 index 0000000000..44ca8c9631 --- /dev/null +++ b/test/decoder/DecUT_StartCode.cpp @@ -0,0 +1,164 @@ +#include +#include +#include +#include "au_parser.h" +#include "PMUTimer.h" + +using namespace WelsDec; + +namespace { + +#define BUF_SIZE 4096 +#define PADDING_SIZE 64 + +void TestThreeByteStartcode(PDetectStartCodePrefixFunc func, const char* impl_name) { + uint8_t buf[BUF_SIZE + PADDING_SIZE]; + int32_t offset; + + for (int i = 0; i < 100; i++) { + memset(buf, 0xFF, BUF_SIZE); + buf[i] = 0x00; + buf[i + 1] = 0x00; + buf[i + 2] = 0x01; + + uint8_t* result = func(buf, &offset, BUF_SIZE); + ASSERT_NE(result, nullptr) << impl_name << ": Failed at offset " << i; + EXPECT_EQ(offset, i + 3) << impl_name << ": Offset mismatch at position " << i; + } +} + +void TestLeadingZerosPatterns(PDetectStartCodePrefixFunc func, const char* impl_name) { + uint8_t buf[BUF_SIZE + PADDING_SIZE]; + int32_t offset; + + for (int leading_zeros = 2; leading_zeros <= 10; leading_zeros++) { + for (int i = 0; i < 50; i++) { + memset(buf, 0xFF, BUF_SIZE); + + for (int z = 0; z < leading_zeros; z++) { + buf[i + z] = 0x00; + } + buf[i + leading_zeros] = 0x01; + + uint8_t* result = func(buf, &offset, BUF_SIZE); + ASSERT_NE(result, nullptr) << impl_name << ": Failed at offset " << i << " with " << leading_zeros << " leading zeros"; + EXPECT_EQ(offset, i + leading_zeros + 1) << impl_name << ": Offset mismatch at position " << i << " with " << leading_zeros << " leading zeros"; + } + } +} + +void TestNoStartcodePatterns(PDetectStartCodePrefixFunc func, const char* impl_name) { + uint8_t buf[BUF_SIZE + PADDING_SIZE]; + int32_t offset; + + // Pattern 1: All zeros except no 0x01 + memset(buf, 0x00, BUF_SIZE); + uint8_t* result = func(buf, &offset, BUF_SIZE); + EXPECT_EQ(result, nullptr) << impl_name << ": Should not find startcode in all zeros"; + + // Pattern 2: 0x00 0x00 0x02 (wrong last byte) + for (int i = 0; i < 50; i++) { + memset(buf, 0xFF, BUF_SIZE); + buf[i] = 0x00; + buf[i + 1] = 0x00; + buf[i + 2] = 0x02; + + result = func(buf, &offset, BUF_SIZE); + EXPECT_EQ(result, nullptr) << impl_name << ": Should not find startcode at offset " << i; + } +} + +void TestSmallBuffers(PDetectStartCodePrefixFunc func, const char* impl_name) { + uint8_t buf[16]; + int32_t offset; + + for (int size = 4; size <= 16; size++) { + memset(buf, 0xFF, 16); + buf[0] = 0x00; + buf[1] = 0x00; + buf[2] = 0x00; + buf[3] = 0x01; + + uint8_t* result = func(buf, &offset, size); + ASSERT_NE(result, nullptr) << impl_name << ": Failed for buffer size " << size; + EXPECT_EQ(offset, 4) << impl_name << ": Offset mismatch for buffer size " << size; + } +} + +void TestMultipleStartcodes(PDetectStartCodePrefixFunc func, const char* impl_name) { + uint8_t buf[BUF_SIZE + PADDING_SIZE]; + int32_t offset; + + for (int first_pos = 10; first_pos < 50; first_pos += 10) { + for (int second_pos = first_pos + 10; second_pos < first_pos + 50; second_pos += 10) { + memset(buf, 0xFF, BUF_SIZE); + + buf[first_pos] = 0x00; + buf[first_pos + 1] = 0x00; + buf[first_pos + 2] = 0x00; + buf[first_pos + 3] = 0x01; + + buf[second_pos] = 0x00; + buf[second_pos + 1] = 0x00; + buf[second_pos + 2] = 0x01; + + uint8_t* result = func(buf, &offset, BUF_SIZE); + ASSERT_NE(result, nullptr) << impl_name << ": Failed at first_pos=" << first_pos << ", second_pos=" << second_pos; + EXPECT_EQ(offset, first_pos + 4) << impl_name << ": Should find first startcode at " << first_pos; + } + } +} + +void TestPerformance(PDetectStartCodePrefixFunc func, const char* impl_name) { + const int buf_size = 4096; + uint8_t buf[buf_size]; + int32_t offset; + uint64_t total_cycles = 0; + + // Performance test with startcode at different positions + PMUTimer timer_cycles(PMUTimer::CPU_CYCLES); + for (int i = 0; i < buf_size - 4; i++) { + memset(buf, 0x2, buf_size); + buf[i] = 0x00; + buf[i + 1] = 0x00; + buf[i + 2] = 0x00; + buf[i + 3] = 0x01; + timer_cycles.start(); + func(buf, &offset, buf_size); + timer_cycles.stop(); + total_cycles += timer_cycles.read(); + } + + double cycles_per_iter = static_cast(total_cycles) / (buf_size - 4); + std::cout << std::endl; + std::cout << "=== Performance Test(" << impl_name << ") ===" << std::endl; + std::cout << "Cycles per iteration: " << std::fixed << std::setprecision(2) << cycles_per_iter << std::endl; +} + +// Macro to run tests for different implementations +#define RUN_DETECT_STARTCODE_TESTS(impl_func, impl_name) \ + TestThreeByteStartcode(impl_func, impl_name); \ + TestLeadingZerosPatterns(impl_func, impl_name); \ + TestNoStartcodePatterns(impl_func, impl_name); \ + TestSmallBuffers(impl_func, impl_name); \ + TestMultipleStartcodes(impl_func, impl_name) + +// Test C implementation +TEST(DetectStartCodePrefixTest, C) { + RUN_DETECT_STARTCODE_TESTS(DetectStartCodePrefixC, "C"); +} + +#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__) +// Test NEON implementation +TEST(DetectStartCodePrefixTest, NEON) { + RUN_DETECT_STARTCODE_TESTS(DetectStartCodePrefixNEON, "NEON"); +} + +// Test performance comparison between C and NEON +TEST(DetectStartCodePrefixTest, Performance) { + TestPerformance(DetectStartCodePrefixC, "C"); + TestPerformance(DetectStartCodePrefixNEON, "NEON"); +} +#endif + +} // namespace \ No newline at end of file diff --git a/test/decoder/meson.build b/test/decoder/meson.build index bf83360b14..c4c9ee2d14 100644 --- a/test/decoder/meson.build +++ b/test/decoder/meson.build @@ -7,6 +7,7 @@ test_sources = [ 'DecUT_IntraPrediction.cpp', 'DecUT_ParseSyntax.cpp', 'DecUT_PredMv.cpp', + 'DecUT_StartCode.cpp', ] e = executable('test_decoder', test_sources,