Skip to content

Commit ca3e8c2

Browse files
RVV1.0 Dequantize Layer with packed and fp16s support
1 parent 71b1a61 commit ca3e8c2

4 files changed

Lines changed: 375 additions & 2 deletions

File tree

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
// Copyright 2026 Tencent
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#include "dequantize_riscv.h"
5+
6+
#if __riscv_vector
7+
#include <riscv_vector.h>
8+
#include "riscv_usability.h"
9+
#endif // __riscv_vector
10+
11+
#include "cpu.h"
12+
13+
namespace ncnn {
14+
15+
// Constructor: advertise the capabilities this implementation supports.
// Packed layout is always available when compiled with the vector extension;
// fp16 storage is gated on a runtime CPU check (zvfh when vectors are
// available, scalar zfh otherwise).
Dequantize_riscv::Dequantize_riscv()
{
#if __riscv_vector
    support_packing = true;
#endif // __riscv_vector
#if NCNN_ZFH
#if __riscv_vector
    support_fp16_storage = cpu_support_riscv_zvfh();
#else
    support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}
28+
29+
static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
30+
{
31+
const int size = elemcount * elempack;
32+
float scale = scale_data[0];
33+
34+
#if __riscv_vector
35+
const size_t vlm1 = __riscv_vsetvlmax_e32m1();
36+
const size_t vlm2 = __riscv_vsetvlmax_e32m2();
37+
vfloat32m8_t _scale;
38+
if (scale_data.w == 1)
39+
{
40+
_scale = __riscv_vfmv_v_f_f32m8(scale, __riscv_vsetvlmax_e32m8());
41+
}
42+
else if (elempack == vlm1)
43+
{
44+
vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_data, vlm1);
45+
_scale = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s);
46+
}
47+
#endif // __riscv_vector
48+
49+
if (bias_data.w == 0)
50+
{
51+
#if __riscv_vector
52+
int n = size;
53+
while (n > 0)
54+
{
55+
size_t vl = __riscv_vsetvl_e32m8(n);
56+
vfloat32m8_t _v = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(intptr, vl), vl);
57+
_v = __riscv_vfmul_vv_f32m8(_v, _scale, vl);
58+
__riscv_vse32_v_f32m8(ptr, _v, vl);
59+
60+
intptr += vl;
61+
ptr += vl;
62+
n -= vl;
63+
}
64+
#else // __riscv_vector
65+
for (int i = 0; i < size; i++)
66+
{
67+
*ptr = *intptr * scale;
68+
intptr++;
69+
ptr++;
70+
}
71+
#endif // __riscv_vector
72+
}
73+
else
74+
{
75+
float bias = bias_data[0];
76+
#if __riscv_vector
77+
vfloat32m8_t _bias;
78+
if (bias_data.w == 1)
79+
{
80+
_bias = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8());
81+
}
82+
else if (elempack == vlm1)
83+
{
84+
vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1);
85+
_bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b);
86+
}
87+
88+
int n = size;
89+
while (n > 0)
90+
{
91+
size_t vl = __riscv_vsetvl_e32m8(n);
92+
vfloat32m8_t _v = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(intptr, vl), vl);
93+
_v = __riscv_vfmacc_vv_f32m8(_bias, _v, _scale, vl);
94+
__riscv_vse32_v_f32m8(ptr, _v, vl);
95+
96+
intptr += vl;
97+
ptr += vl;
98+
n -= vl;
99+
}
100+
#else // __riscv_vector
101+
for (int i = 0; i < size; i++)
102+
{
103+
*ptr = *intptr * scale + bias;
104+
intptr++;
105+
ptr++;
106+
}
107+
#endif // __riscv_vector
108+
}
109+
}
110+
111+
// Dequantize an int32 blob into fp32 (or dispatch to the fp16 path when
// fp16 storage is enabled). Output shape/packing mirrors the input.
// Returns 0 on success, -100 on allocation failure.
int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_ZFH
    // Prefer fp16 output storage when both the CPU and the options allow it.
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 1)
    {
        // Split the 1-D row into roughly equal chunks, one per thread.
        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            float* ptr = (float*)top_blob + i * elempack;
            // Flattened element count for this chunk; passed with elempack=1,
            // so the whole scale/bias tables are used as-is.
            // NOTE(review): assumes scale_data_size == 1 for dims == 1 --
            // no per-chunk range() slicing happens here; confirm callers
            // never use per-channel scales with 1-D blobs.
            const int size = std::min(w - i, wp) * elempack;

            dequantize(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            float* ptr = top_blob.row(i);
            // Per-row scale/bias slice when tables are per-channel,
            // otherwise the shared single-value table.
            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            float* ptr = top_blob.channel(q);

            // Per-channel scale/bias slice, analogous to the dims==2 case.
            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}
179+
} // namespace ncnn

src/layer/riscv/dequantize_riscv.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Copyright 2026 Tencent
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#ifndef LAYER_DEQUANTIZE_RISCV_H
5+
#define LAYER_DEQUANTIZE_RISCV_H
6+
7+
#include "dequantize.h"
8+
9+
namespace ncnn {
10+
11+
// RISC-V optimized Dequantize layer (RVV packed layout + optional fp16
// storage). Overrides the generic Dequantize forward pass.
class Dequantize_riscv : public Dequantize
{
public:
    Dequantize_riscv();

    // Convert int32 input blob to fp32 (or fp16 via forward_fp16s).
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_ZFH
    // fp16-storage variant, used when the CPU supports zfh/zvfh and
    // opt.use_fp16_storage is set.
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};
23+
24+
} // namespace ncnn
25+
26+
#endif // LAYER_DEQUANTIZE_RISCV_H
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// Copyright 2026 Tencent
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#include "dequantize_riscv.h"
5+
6+
#if __riscv_vector
7+
#include <riscv_vector.h>
8+
#include "riscv_usability.h"
9+
#endif // __riscv_vector
10+
11+
#include "cpu.h"
12+
13+
namespace ncnn {
14+
#if NCNN_ZFH
15+
static void dequantize_fp16s(const int* intptr, __fp16* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
16+
{
17+
const int size = elemcount * elempack;
18+
float scale = scale_data[0];
19+
20+
#if __riscv_vector
21+
const size_t vlm1 = __riscv_vsetvlmax_e32m1();
22+
const size_t vlm2 = __riscv_vsetvlmax_e32m2();
23+
vfloat32m8_t _scale;
24+
if (scale_data.w == 1)
25+
{
26+
_scale = __riscv_vfmv_v_f_f32m8(scale, __riscv_vsetvlmax_e32m8());
27+
}
28+
else if (elempack == vlm1)
29+
{
30+
vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_data, vlm1);
31+
_scale = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s);
32+
}
33+
#endif // __riscv_vector
34+
35+
if (bias_data.w == 0)
36+
{
37+
#if __riscv_vector
38+
int n = size;
39+
while (n > 0)
40+
{
41+
size_t vl = __riscv_vsetvl_e16m4(n);
42+
vfloat32m8_t _v = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(intptr, vl), vl);
43+
_v = __riscv_vfmul_vv_f32m8(_v, _scale, vl);
44+
__riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_v, vl), vl);
45+
46+
intptr += vl;
47+
ptr += vl;
48+
n -= vl;
49+
}
50+
#else // __riscv_vector
51+
for (int i = 0; i < size; i++)
52+
{
53+
*ptr = (__fp16)((float)*intptr * scale);
54+
intptr++;
55+
ptr++;
56+
}
57+
#endif // __riscv_vector
58+
}
59+
else
60+
{
61+
float bias = bias_data[0];
62+
#if __riscv_vector
63+
vfloat32m8_t _bias;
64+
if (bias_data.w == 1)
65+
{
66+
_bias = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8());
67+
}
68+
else if (elempack == vlm1)
69+
{
70+
vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1);
71+
_bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b);
72+
}
73+
74+
int n = size;
75+
while (n > 0)
76+
{
77+
size_t vl = __riscv_vsetvl_e16m4(n);
78+
vfloat32m8_t _v = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(intptr, vl), vl);
79+
_v = __riscv_vfmacc_vv_f32m8(_bias, _v, _scale, vl);
80+
__riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_v, vl), vl);
81+
82+
intptr += vl;
83+
ptr += vl;
84+
n -= vl;
85+
}
86+
#else // __riscv_vector
87+
for (int i = 0; i < size; i++)
88+
{
89+
*ptr = (__fp16)((float)*intptr * scale + bias);
90+
intptr++;
91+
ptr++;
92+
}
93+
#endif // __riscv_vector
94+
}
95+
}
96+
97+
// fp16-storage variant of forward(): dequantizes int32 input into half
// precision output (2 bytes per lane). Output keeps the input's packing.
// Returns 0 on success, -100 on allocation failure.
int Dequantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    // 2 bytes per fp16 lane.
    const size_t out_elemsize = elempack * 2u;

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Split the 1-D row into roughly equal chunks, one per thread.
        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;
            const int* intptr = (const int*)bottom_blob + i * elempack;
            __fp16* ptr = (__fp16*)top_blob + i * elempack;
            // NOTE(review): assumes scale_data_size == 1 for dims == 1 --
            // the full tables are passed without per-chunk slicing.
            const int size = std::min(w - i, wp) * elempack;

            dequantize_fp16s(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);
            // Per-row scale/bias slice when tables are per-channel.
            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            __fp16* ptr = top_blob.channel(q);
            // Per-channel scale/bias slice, analogous to the dims==2 case.
            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}
165+
#endif
166+
} // namespace ncnn

tests/test_dequantize.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,10 @@ int main()
140140
SRAND(7767517);
141141

142142
return 0
143+
#ifndef __riscv
144+
|| test_dequantize_3()
145+
#endif
143146
|| test_dequantize_0()
144147
|| test_dequantize_1()
145-
|| test_dequantize_2()
146-
|| test_dequantize_3();
148+
|| test_dequantize_2();
147149
}

0 commit comments

Comments
 (0)