Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions src/layer/riscv/elu_riscv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

// Constructor: advertise packing layout support and, when the cpu provides
// the required fp16 extension, fp16 storage support.
ELU_riscv::ELU_riscv()
{
#if __riscv_vector
    // only claim packing when the vector path is compiled in,
    // consistent with Erf_riscv and the other riscv layers
    support_packing = true;
#endif // __riscv_vector
#if NCNN_ZFH
#if __riscv_vector
    // vector build: the fp16s kernel needs the zvfh vector fp16 extension
    support_fp16_storage = cpu_support_riscv_zvfh();
#else
    // scalar build: the scalar zfh fp16 extension is queried instead
    support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}

// Elementwise ELU in place: x stays x for x >= 0, becomes alpha * (exp(x) - 1) otherwise.
// Dispatches to the fp16 storage kernel when the blob is 16-bit and fp16 is enabled.
// Returns 0 on success.
int ELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
#if C906
    // FIXME -O3 leads illegal instruction
    return ELU::forward_inplace(bottom_top_blob, opt);
#endif

#if NCNN_ZFH
    // query elembits only when the fp16 path exists, so non-ZFH builds
    // do not carry an unused variable (matches Erf_riscv::forward_inplace)
    int elembits = bottom_top_blob.elembits();

    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // packed lanes are plain extra elements for an elementwise op
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
#if __riscv_vector
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e32m8(n);
            vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
            // mask of lanes with x < 0: only those take the exponential branch
            vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, 0.f, vl);

            vfloat32m8_t _exp_v = exp_ps(_p, vl);
            _exp_v = __riscv_vfsub_vf_f32m8(_exp_v, 1.f, vl);
            _exp_v = __riscv_vfmul_vf_f32m8(_exp_v, alpha, vl);
            // keep x where mask is clear, alpha*(exp(x)-1) where set
            _p = __riscv_vmerge_vvm_f32m8(_p, _exp_v, _lower, vl);

            __riscv_vse32_v_f32m8(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else
        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < 0.f)
                ptr[i] = alpha * (expf(ptr[i]) - 1.f);
        }
#endif // __riscv_vector
    }
    return 0;
}

} // namespace ncnn
26 changes: 26 additions & 0 deletions src/layer/riscv/elu_riscv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELU_RISCV_H
#define LAYER_ELU_RISCV_H

#include "elu.h"

namespace ncnn {

// RISC-V specialization of the ELU activation layer.
class ELU_riscv : public ELU
{
public:
    // Sets packing / fp16-storage capability flags based on cpu support.
    ELU_riscv();

    // Applies ELU to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ZFH
    // fp16-storage variant; values are widened to fp32 for the exp evaluation.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ELU_RISCV_H
61 changes: 61 additions & 0 deletions src/layer/riscv/elu_riscv_zfh.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
// ELU on an fp16-storage blob, in place. Values are widened to fp32 for the
// exp evaluation and narrowed back to fp16 on store. Returns 0 on success.
int ELU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        // declared inside the guard: the scalar fallback below does not use it,
        // so non-zvfh builds would otherwise warn about an unused variable
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e16m4(n);
            vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
            // mask of lanes with x < 0: only those take the exponential branch
            vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl);

            // Convert to float32 for exp calculation
            vfloat32m8_t _p_f32 = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
            vfloat32m8_t _exp_v = exp_ps(_p_f32, vl);
            _exp_v = __riscv_vfsub_vf_f32m8(_exp_v, 1.f, vl);
            _exp_v = __riscv_vfmul_vf_f32m8(_exp_v, alpha, vl);
            vfloat16m4_t _exp_v_f16 = __riscv_vfncvt_f_f_w_f16m4(_exp_v, vl);

            _p = __riscv_vmerge_vvm_f16m4(_p, _exp_v_f16, _lower, vl);

            __riscv_vse16_v_f16m4(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < (__fp16)0.f)
                ptr[i] = (__fp16)(alpha * (expf((float)ptr[i]) - 1.f));
        }
#endif // __riscv_zvfh
    }
    return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn
80 changes: 80 additions & 0 deletions src/layer/riscv/erf_riscv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "erf_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

// Constructor: advertise packing layout support and, when the cpu provides
// the required fp16 extension, fp16 storage support.
Erf_riscv::Erf_riscv()
{
#if __riscv_vector
    support_packing = true;
#endif // __riscv_vector
#if NCNN_ZFH
#if __riscv_vector
    // vector build: queries the zvfh vector fp16 extension
    support_fp16_storage = cpu_support_riscv_zvfh();
#else
    // scalar build: queries the scalar zfh fp16 extension
    support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}

// Applies the error function erf(x) to every element of the blob in place.
// Dispatches to the fp16 kernels when the blob is stored as 16-bit values.
// Returns 0 on success.
int Erf_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
#if NCNN_ZFH
    int elembits = bottom_top_blob.elembits();

    if (opt.use_fp16_storage && elembits == 16)
    {
        // prefer fp16 arithmetic when the option allows it
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);

        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

    const int channels = bottom_top_blob.c;
    // packed lanes are plain extra elements for an elementwise op
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

#if __riscv_vector
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e32m8(remain);

            vfloat32m8_t _x = __riscv_vle32_v_f32m8(ptr, vl);
            _x = erf_ps(_x, vl);
            __riscv_vse32_v_f32m8(ptr, _x, vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_vector
        for (int i = 0; i < size; i++)
        {
            ptr[i] = erff(ptr[i]);
        }
#endif // __riscv_vector
    }

    return 0;
}

} // namespace ncnn
27 changes: 27 additions & 0 deletions src/layer/riscv/erf_riscv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ERF_RISCV_H
#define LAYER_ERF_RISCV_H

#include "erf.h"

namespace ncnn {

// RISC-V specialization of the Erf (error function) layer.
class Erf_riscv : public Erf
{
public:
    // Sets packing / fp16-storage capability flags based on cpu support.
    Erf_riscv();

    // Applies erf to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ZFH
    // fp16 storage, fp32 arithmetic variant.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage and fp16 arithmetic variant.
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ERF_RISCV_H
96 changes: 96 additions & 0 deletions src/layer/riscv/erf_riscv_zfh.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2026 Futz12 <pchar.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "erf_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#if __riscv_zvfh
#include "rvv_mathfun_fp16s.h"
#endif
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
// erf on an fp16-storage blob, computed in fp32: each chunk is widened to
// fp32, passed through erf_ps, then narrowed back to fp16 on store.
int Erf_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e16m4(remain);

            // widen fp16 -> fp32, evaluate, narrow fp32 -> fp16
            vfloat16m4_t _h = __riscv_vle16_v_f16m4(ptr, vl);
            vfloat32m8_t _f = __riscv_vfwcvt_f_f_v_f32m8(_h, vl);
            _f = erf_ps(_f, vl);
            __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_f, vl), vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            ptr[i] = (__fp16)erff((float)ptr[i]);
        }
#endif // __riscv_zvfh
    }

    return 0;
}

// erf on an fp16-storage blob with fp16 arithmetic: the whole evaluation
// stays in fp16 vectors (erf_ps fp16 overload), no widening to fp32.
int Erf_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        int remain = size;
        while (remain > 0)
        {
            size_t vl = __riscv_vsetvl_e16m8(remain);

            vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr, vl);
            _x = erf_ps(_x, vl);
            __riscv_vse16_v_f16m8(ptr, _x, vl);

            ptr += vl;
            remain -= vl;
        }
#else // __riscv_zvfh
        for (int i = 0; i < size; i++)
        {
            ptr[i] = (__fp16)erff((float)ptr[i]);
        }
#endif // __riscv_zvfh
    }

    return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn
Loading
Loading