Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions src/layer/riscv/elu_riscv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

// Constructor: advertise packing layout support and, when the cpu provides
// the required fp16 extension, fp16 storage support.
ELU_riscv::ELU_riscv()
{
#if __riscv_vector
    // only claim packing when the vector path is compiled in,
    // consistent with Erf_riscv and the other riscv layers
    support_packing = true;
#endif // __riscv_vector
#if NCNN_ZFH
#if __riscv_vector
    // vector build: the fp16s kernel needs the zvfh vector fp16 extension
    support_fp16_storage = cpu_support_riscv_zvfh();
#else
    // scalar build: the scalar zfh fp16 extension is queried instead
    support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}

// Elementwise ELU in place: x stays x for x >= 0, becomes alpha * (exp(x) - 1) otherwise.
// Dispatches to the fp16 storage kernel when the blob is 16-bit and fp16 is enabled.
// Returns 0 on success.
int ELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
#if C906
    // FIXME -O3 leads illegal instruction
    return ELU::forward_inplace(bottom_top_blob, opt);
#endif

#if NCNN_ZFH
    // query elembits only when the fp16 path exists, so non-ZFH builds
    // do not carry an unused variable (matches Erf_riscv::forward_inplace)
    int elembits = bottom_top_blob.elembits();

    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // packed lanes are plain extra elements for an elementwise op
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
#if __riscv_vector
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e32m8(n);
            vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
            // mask of lanes with x < 0: only those take the exponential branch
            vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, 0.f, vl);

            vfloat32m8_t _exp_v = exp_ps(_p, vl);
            _exp_v = __riscv_vfsub_vf_f32m8(_exp_v, 1.f, vl);
            _exp_v = __riscv_vfmul_vf_f32m8(_exp_v, alpha, vl);
            // keep x where mask is clear, alpha*(exp(x)-1) where set
            _p = __riscv_vmerge_vvm_f32m8(_p, _exp_v, _lower, vl);

            __riscv_vse32_v_f32m8(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else
        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < 0.f)
                ptr[i] = alpha * (expf(ptr[i]) - 1.f);
        }
#endif // __riscv_vector
    }
    return 0;
}

} // namespace ncnn
26 changes: 26 additions & 0 deletions src/layer/riscv/elu_riscv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELU_RISCV_H
#define LAYER_ELU_RISCV_H

#include "elu.h"

namespace ncnn {

// RISC-V specialization of the ELU activation layer.
class ELU_riscv : public ELU
{
public:
    // Sets packing / fp16-storage capability flags based on cpu support.
    ELU_riscv();

    // Applies ELU to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ZFH
    // fp16-storage variant; values are widened to fp32 for the exp evaluation.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ELU_RISCV_H
61 changes: 61 additions & 0 deletions src/layer/riscv/elu_riscv_zfh.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
// ELU on an fp16-storage blob, in place. Values are widened to fp32 for the
// exp evaluation and narrowed back to fp16 on store. Returns 0 on success.
int ELU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        // declared inside the guard: the scalar fallback below does not use it,
        // so non-zvfh builds would otherwise warn about an unused variable
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e16m4(n);
            vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
            // mask of lanes with x < 0: only those take the exponential branch
            vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl);

            // Convert to float32 for exp calculation
            vfloat32m8_t _p_f32 = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
            vfloat32m8_t _exp_v = exp_ps(_p_f32, vl);
            _exp_v = __riscv_vfsub_vf_f32m8(_exp_v, 1.f, vl);
            _exp_v = __riscv_vfmul_vf_f32m8(_exp_v, alpha, vl);
            vfloat16m4_t _exp_v_f16 = __riscv_vfncvt_f_f_w_f16m4(_exp_v, vl);

            _p = __riscv_vmerge_vvm_f16m4(_p, _exp_v_f16, _lower, vl);

            __riscv_vse16_v_f16m4(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < (__fp16)0.f)
                ptr[i] = (__fp16)(alpha * (expf((float)ptr[i]) - 1.f));
        }
#endif // __riscv_zvfh
    }
    return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn
80 changes: 80 additions & 0 deletions src/layer/riscv/erf_riscv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "erf_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

// Constructor: advertise packing layout support and, when the cpu provides
// the required fp16 extension, fp16 storage support.
Erf_riscv::Erf_riscv()
{
#if __riscv_vector
    support_packing = true;
#endif // __riscv_vector
#if NCNN_ZFH
#if __riscv_vector
    // vector build: queries the zvfh vector fp16 extension
    support_fp16_storage = cpu_support_riscv_zvfh();
#else
    // scalar build: queries the scalar zfh fp16 extension
    support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}

// Applies the error function erf(x) to every element of the blob in place.
// Dispatches to the fp16 kernels when the blob is stored as 16-bit values.
// Returns 0 on success.
int Erf_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
#if NCNN_ZFH
    int elembits = bottom_top_blob.elembits();

    if (opt.use_fp16_storage && elembits == 16)
    {
        // prefer fp16 arithmetic when the option allows it
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);

        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

    const int channels = bottom_top_blob.c;
    // packed lanes are plain extra elements for an elementwise op
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

#if __riscv_vector
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e32m8(remain);

            vfloat32m8_t _x = __riscv_vle32_v_f32m8(ptr, vl);
            _x = erf_ps(_x, vl);
            __riscv_vse32_v_f32m8(ptr, _x, vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_vector
        for (int i = 0; i < size; i++)
        {
            ptr[i] = erff(ptr[i]);
        }
#endif // __riscv_vector
    }

    return 0;
}

} // namespace ncnn
27 changes: 27 additions & 0 deletions src/layer/riscv/erf_riscv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ERF_RISCV_H
#define LAYER_ERF_RISCV_H

#include "erf.h"

namespace ncnn {

// RISC-V specialization of the Erf (error function) layer.
class Erf_riscv : public Erf
{
public:
    // Sets packing / fp16-storage capability flags based on cpu support.
    Erf_riscv();

    // Applies erf to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ZFH
    // fp16 storage, fp32 arithmetic variant.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage and fp16 arithmetic variant.
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ERF_RISCV_H
96 changes: 96 additions & 0 deletions src/layer/riscv/erf_riscv_zfh.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "erf_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#if __riscv_zvfh
#include "rvv_mathfun_fp16s.h"
#endif
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
// erf on an fp16-storage blob, computed in fp32: each chunk is widened to
// fp32, passed through erf_ps, then narrowed back to fp16 on store.
int Erf_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e16m4(remain);

            // widen fp16 -> fp32, evaluate, narrow fp32 -> fp16
            vfloat16m4_t _h = __riscv_vle16_v_f16m4(ptr, vl);
            vfloat32m8_t _f = __riscv_vfwcvt_f_f_v_f32m8(_h, vl);
            _f = erf_ps(_f, vl);
            __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_f, vl), vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            ptr[i] = (__fp16)erff((float)ptr[i]);
        }
#endif // __riscv_zvfh
    }

    return 0;
}

// erf on an fp16-storage blob with fp16 arithmetic: the whole evaluation
// stays in fp16 vectors (erf_ps fp16 overload), no widening to fp32.
int Erf_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e16m8(remain);

            vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr, vl);
            _x = erf_ps(_x, vl);
            __riscv_vse16_v_f16m8(ptr, _x, vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            ptr[i] = (__fp16)erff((float)ptr[i]);
        }
#endif // __riscv_zvfh
    }

    return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn
Loading
Loading