Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions codec/decoder/core/arm64/au_parser_aarch64_neon.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
#include "arm_arch64_common_macro.S"

.extern DetectStartCodePrefixC

WELS_ASM_AARCH64_FUNC_BEGIN DetectStartCodePrefixNEON
    // uint8_t* DetectStartCodePrefixNEON (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize)
    //   x0 = kpBuf, x1 = pOffset, w2 = iBufSize
    //
    // Strategy: walk the buffer in 64-byte chunks and use NEON only to answer
    // "does this chunk contain two adjacent zero bytes?". Chunks without such a
    // pair are skipped (after a 3-byte check across the chunk boundary); the
    // actual start code matching is delegated to DetectStartCodePrefixC.
    // On success the returned pointer and *pOffset refer to the byte following
    // the 0x01 of the prefix, with *pOffset relative to kpBuf.

    // Buffers shorter than 80 bytes: tail-call the C implementation directly.
    cmp     w2, #79
    b.gt    .L_init
    b       DetectStartCodePrefixC

.L_init:
    stp     x29, x30, [sp, #-32]!
    stp     x20, x19, [sp, #16]
    mov     x29, sp
    mov     x9, #0                      // x9  = byte offset of the current chunk
    add     x8, x0, w2, sxtw            // x8  = kpBuf + iBufSize (end of buffer)
    sub     x10, x8, #80                // x10 = highest chunk start handled by the SIMD loop
    b       .L_simd

.L_advance:
    add     x9, x9, #64
    add     x8, x0, x9
    cmp     x8, x10
    b.hi    .L_fallback                 // too close to the end: finish with the C code

.L_simd:
    add     x8, x0, x9                  // x8 = start of the current chunk
    ldp     q1, q2, [x8]                // chunk bytes  0..31
    ldp     q3, q0, [x8, #32]           // chunk bytes 32..63 (v0 = bytes 48..63)
    // Build "previous byte" vectors: lane i receives chunk byte i-1. Lane 0 of
    // v4 wraps around to chunk byte 63, which can only yield a false positive.
    ext     v4.16b, v0.16b, v1.16b, #15
    ext     v5.16b, v1.16b, v2.16b, #15
    ext     v6.16b, v2.16b, v3.16b, #15
    ext     v7.16b, v3.16b, v0.16b, #15
    // byte | previous byte is zero only where both bytes are zero.
    orr     v1.16b, v4.16b, v1.16b
    orr     v2.16b, v5.16b, v2.16b
    orr     v3.16b, v6.16b, v3.16b
    orr     v4.16b, v7.16b, v0.16b
    // Reduce all 64 lanes to their minimum; zero means a zero pair exists.
    umin    v1.16b, v1.16b, v2.16b
    umin    v2.16b, v3.16b, v4.16b
    umin    v1.16b, v1.16b, v2.16b
    uminv   b1, v1.16b
    fmov    w11, s1
    cbz     w11, .L_call_c

    // No zero pair inside the chunk. A prefix can still straddle the boundary
    // as: chunk byte 63 == 0, next byte == 0, the byte after that == 1.
    umov    w8, v0.b[15]
    cbnz    w8, .L_advance

    add     x8, x0, x9
    ldrb    w11, [x8, #64]
    cbnz    w11, .L_advance

    ldrb    w8, [x8, #65]
    cmp     w8, #1
    b.ne    .L_advance

    // Straddling prefix found; the payload starts at chunk offset 66.
    add     w8, w9, #66
    str     w8, [x1]
    add     x0, x0, w8, sxtw

.L_return:
    ldp     x20, x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

.L_call_c:
    // A zero pair (or a wrap-around false positive) was seen in this chunk.
    // Hand everything from the chunk start to the end of the buffer to the C
    // implementation, so that its result is final. A bounded 65-byte window
    // would make the whole search return NULL after the first zero pair that
    // is not a start code (e.g. 00 00 03) and would miss a prefix whose 0x01
    // lies beyond the window. The loop state in x9/x10/w2 is not preserved
    // across the call, so the SIMD loop is not resumed afterwards.
    // NOTE(review): assumes DetectStartCodePrefixC scans the entire range it
    // is given - confirm against its definition.
    mov     x19, x0                     // keep kpBuf for re-basing the offset
    mov     x0, x8                      // scan from the start of this chunk
    mov     x20, x1                     // keep pOffset
    sub     w2, w2, w9                  // bytes remaining from the chunk start
    bl      DetectStartCodePrefixC
    cbnz    x0, .L_found
    b       .L_return                   // x0 == NULL: nothing in the remainder

.L_fallback:
    // Scalar tail. Start one byte before the current position so that a zero
    // in the last byte of the previous chunk still takes part in the match;
    // the length grows by one to compensate.
    mov     x19, x0
    sub     x0, x8, #1
    sub     w8, w2, w9
    add     w2, w8, #1
    mov     x20, x1
    bl      DetectStartCodePrefixC
    cbz     x0, .L_return

.L_found:
    // The C code stored an offset relative to the sub-buffer it was given;
    // overwrite it with the offset relative to the original kpBuf.
    sub     w8, w0, w19
    str     w8, [x20]
    ldp     x20, x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

WELS_ASM_AARCH64_FUNC_END

#endif
10 changes: 8 additions & 2 deletions codec/decoder/core/inc/au_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@

namespace WelsDec {

extern "C" {
/*!
*************************************************************************************
* \brief Start Code Prefix (0x 00 00 00 01) detection
* \brief Start Code Prefix (0x 00 00 00 01) detection - C implementation
*
* \param pBuf bitstream payload buffer
* \param pOffset offset between NAL rbsp and original bitstream that
Expand All @@ -62,7 +63,12 @@ namespace WelsDec {
* \note N/A
*************************************************************************************
*/
uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);
uint8_t* DetectStartCodePrefixC (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);

#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
uint8_t* DetectStartCodePrefixNEON (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);
#endif
}

/*!
*************************************************************************************
Expand Down
4 changes: 4 additions & 0 deletions codec/decoder/core/inc/decoder_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const

typedef void (*PGetIntraPred8x8Func) (uint8_t* pPred, const int32_t kiLumaStride, bool bTLAvail, bool bTRAvail);

/* Pointer to a start code prefix detection routine. The signature matches
 * DetectStartCodePrefixC and DetectStartCodePrefixNEON, so either one can be
 * stored in a variable of this type. */
typedef uint8_t* (*PDetectStartCodePrefixFunc) (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);

/**/
typedef struct TagRefPic {
PPicture pRefList[LIST_A][MAX_DPB_COUNT]; // reference picture marking plus FIFO scheme
Expand Down Expand Up @@ -465,6 +467,8 @@ typedef struct TagWelsDecoderContext {
/* For Block */
SBlockFunc sBlockFunc;

PDetectStartCodePrefixFunc pfDetectStartCodePrefix;

int32_t iCurSeqIntervalTargetDependId;
int32_t iCurSeqIntervalMaxPicWidth;
int32_t iCurSeqIntervalMaxPicHeight;
Expand Down
11 changes: 9 additions & 2 deletions codec/decoder/core/src/au_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,16 @@
#define _PARSE_NALHRD_VCLHRD_PARAMS_ 1

namespace WelsDec {

// External declarations for NEON-optimized version
#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
extern "C" uint8_t* DetectStartCodePrefixNEON(const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize);
#endif

/*!
*************************************************************************************
* \brief Start Code Prefix (0x 00 00 00 01) detection
* \brief C implementation of Start Code Prefix (0x 00 00 00 01) detection
* (extern "C" to allow calling from assembly)
*
* \param pBuf bitstream payload buffer
* \param pOffset offset between NAL rbsp and original bitstream that
Expand All @@ -63,7 +70,7 @@ namespace WelsDec {
* \note N/A
*************************************************************************************
*/
uint8_t* DetectStartCodePrefix (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize) {
uint8_t* DetectStartCodePrefixC (const uint8_t* kpBuf, int32_t* pOffset, int32_t iBufSize) {
uint8_t* pBits = (uint8_t*)kpBuf;

do {
Expand Down
8 changes: 6 additions & 2 deletions codec/decoder/core/src/decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -756,8 +756,8 @@ int32_t WelsDecodeBs (PWelsDecoderContext pCtx, const uint8_t* kpBsBuf, const in
uint8_t* pNalPayload = NULL;


if (NULL == DetectStartCodePrefix (kpBsBuf, &iOffset,
kiBsLen)) { //CAN'T find the 00 00 01 start prefix from the source buffer
if (NULL == pCtx->pfDetectStartCodePrefix (kpBsBuf, &iOffset,
kiBsLen)) { //CAN'T find the 00 00 01 start prefix from the source buffer
pCtx->iErrorCode |= dsBitstreamError;
return dsBitstreamError;
}
Expand Down Expand Up @@ -1054,6 +1054,8 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) {

pCtx->pIdctResAddPredFunc8x8 = IdctResAddPred8x8_c;

pCtx->pfDetectStartCodePrefix = DetectStartCodePrefixC;

#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
Expand Down Expand Up @@ -1108,6 +1110,8 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) {
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_AArch64_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDc_AArch64_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_AArch64_neon;

pCtx->pfDetectStartCodePrefix = DetectStartCodePrefixNEON;
}
#endif//HAVE_NEON_AARCH64

Expand Down
1 change: 1 addition & 0 deletions codec/decoder/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ elif cpu_family == 'aarch64'
asm_sources = [
'core/arm64/block_add_aarch64_neon.S',
'core/arm64/intra_pred_aarch64_neon.S',
'core/arm64/au_parser_aarch64_neon.S',
]
if use_asm_gen
objs_asm = asm_gen.process(asm_sources)
Expand Down
110 changes: 110 additions & 0 deletions test/PMUTimer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*!
* \copy
* Copyright (c) 2026, OpenH264 Contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file PMUTimer.h
*
* \brief PMU performance measurement utility
*
*************************************************************************************
*/

#ifndef PMU_TIMER_H
#define PMU_TIMER_H

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/*!
 * \brief Wrapper around a single Linux perf_event hardware counter.
 *
 * The counter (retired instructions or CPU cycles, chosen by Mode) is opened
 * in the constructor and closed in the destructor. If the counter cannot be
 * opened, is_available() returns false, start()/stop() do nothing, and read()
 * returns the raw cntvct_el0 value on AArch64 (0 on other architectures).
 * Note that in this fallback read() is NOT relative to start().
 *
 * The object owns a file descriptor and is therefore not copyable.
 */
class PMUTimer {
 public:
  enum Mode {
    INSTRUCTIONS,
    CPU_CYCLES
  };

  /*! \param mode  hardware event to count; defaults to INSTRUCTIONS */
  PMUTimer(Mode mode = INSTRUCTIONS) : fd_(-1), mode_(mode) {
    init();
  }

  ~PMUTimer() {
    if (fd_ >= 0)
      close(fd_);
  }

  /*! \return true when the perf counter was opened successfully */
  bool is_available() const { return fd_ >= 0; }

  /*! Reset the counter to zero and start counting. */
  void start() {
    if (fd_ >= 0) {
      ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
      ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
    }
  }

  /*! Stop counting; the accumulated value stays readable. */
  void stop() {
    if (fd_ >= 0) {
      ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
    }
  }

  /*!
   * \return the current counter value; 0 if the value could not be read.
   *         Without a perf counter: raw cntvct_el0 on AArch64, otherwise 0.
   */
  uint64_t read() {
    uint64_t value = 0;
    if (fd_ >= 0) {
      // A failed or short read must not leave a partially written value.
      const ssize_t got = ::read(fd_, &value, sizeof(value));
      if (got != static_cast<ssize_t>(sizeof(value)))
        value = 0;
    } else {
#if defined(__aarch64__)
      // Fallback to cntvct_el0
      __asm__ volatile("isb\nmrs %0, cntvct_el0" : "=r"(value) :: "memory");
#endif
    }
    return value;
  }

  /*! \return "instructions" or "cycles" depending on the selected mode */
  const char* mode_name() const {
    return mode_ == INSTRUCTIONS ? "instructions" : "cycles";
  }

 private:
  // Copying would make two objects close the same descriptor; declared but
  // intentionally left undefined.
  PMUTimer(const PMUTimer&);
  PMUTimer& operator=(const PMUTimer&);

  /*! Open the counter in the disabled state, excluding kernel and hypervisor. */
  void init() {
    struct perf_event_attr attr = {};   // zero every field, including reserved ones
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);           // tells the kernel which attr layout is in use
    attr.config = mode_ == INSTRUCTIONS ? PERF_COUNT_HW_INSTRUCTIONS : PERF_COUNT_HW_CPU_CYCLES;
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    // Arguments after attr: pid = 0, cpu = -1, group_fd = -1, flags = 0.
    fd_ = static_cast<int>(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0));
  }

  int fd_;
  Mode mode_;
};

#endif /* PMU_TIMER_H */
Loading