Skip to content
8 changes: 4 additions & 4 deletions docs/how-to-use-and-FAQ/quantized-int8-inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,16 @@ filelist_in2.txt
```
**Here the shape is WHC, because of the order of the arguments to `ncnn::Mat`.**

### 3. Quantize model
For RNN, GRU, LSTM, MultiHeadAttention and Embed layers, ncnn2table also supports tableless quantization.

```shell
./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table
./ncnn2table rnn.param rnn.bin rnn.table method=kl
```

If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file.
### 3. Quantize model

```shell
./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin
./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table
```

## use ncnn int8 inference
Expand Down
265 changes: 112 additions & 153 deletions tools/quantize/ncnn2int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,43 +317,34 @@ int NetQuantize::quantize_rnn()
if (layers[i]->type != "RNN")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// RNN - quantize weight from fp32 to int8
ncnn::RNN* rnn = (ncnn::RNN*)layers[i];

fprintf(stderr, "quantize_rnn %s\n", rnn->name.c_str());

// TODO move to ncnn2table
const int num_directions = rnn->direction == 2 ? 2 : 1;
const int size = rnn->weight_data_size / num_directions / rnn->num_output;

ncnn::Mat weight_xc_data_int8_scales(rnn->num_output * num_directions);
ncnn::Mat weight_hc_data_int8_scales(rnn->num_output * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < rnn->num_output; q++)
{
{
const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = rnn->weight_xc_data.reshape(size, rnn->num_output * num_directions);
Expand Down Expand Up @@ -399,43 +390,34 @@ int NetQuantize::quantize_lstm()
if (layers[i]->type != "LSTM")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// LSTM - quantize weight from fp32 to int8
ncnn::LSTM* lstm = (ncnn::LSTM*)layers[i];

fprintf(stderr, "quantize_lstm %s\n", lstm->name.c_str());

// TODO move to ncnn2table
const int num_directions = lstm->direction == 2 ? 2 : 1;
const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4;

ncnn::Mat weight_xc_data_int8_scales(lstm->hidden_size * 4 * num_directions);
ncnn::Mat weight_hc_data_int8_scales(lstm->hidden_size * 4 * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < lstm->hidden_size * 4; q++)
{
{
const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = lstm->weight_xc_data.reshape(size, lstm->hidden_size * 4 * num_directions);
Expand Down Expand Up @@ -481,43 +463,34 @@ int NetQuantize::quantize_gru()
if (layers[i]->type != "GRU")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// GRU - quantize weight from fp32 to int8
ncnn::GRU* gru = (ncnn::GRU*)layers[i];

fprintf(stderr, "quantize_gru %s\n", gru->name.c_str());

// TODO move to ncnn2table
const int num_directions = gru->direction == 2 ? 2 : 1;
const int size = gru->weight_data_size / num_directions / gru->num_output / 3;

ncnn::Mat weight_xc_data_int8_scales(gru->num_output * 3 * num_directions);
ncnn::Mat weight_hc_data_int8_scales(gru->num_output * 3 * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < gru->num_output * 3; q++)
{
{
const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = gru->weight_xc_data.reshape(size, gru->num_output * 3 * num_directions);
Expand Down Expand Up @@ -563,27 +536,24 @@ int NetQuantize::quantize_embed()
if (layers[i]->type != "Embed")
continue;

char key[256];
snprintf(key, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter = weight_int8scale_table.find(key);
if (iter == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// Embed - quantize weight from fp32 to int8
ncnn::Embed* embed = (ncnn::Embed*)layers[i];

fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());

// TODO move to ncnn2table

const int num_output = embed->num_output;
const int input_dim = embed->input_dim;

ncnn::Mat weight_data_int8_scales(1);
{
const float* ptr = embed->weight_data;
float absmax = 0.f;
for (int i = 0; i < embed->weight_data.w; i++)
{
absmax = std::max(absmax, (float)fabs(ptr[i]));
}

weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
}
ncnn::Mat weight_data_int8_scales = iter->second;

{
ncnn::Mat weight_data_int8;
Expand Down Expand Up @@ -719,29 +689,51 @@ int NetQuantize::quantize_multiheadattention()
if (layers[i]->type != "MultiHeadAttention")
continue;

char key_q[256];
snprintf(key_q, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_q = weight_int8scale_table.find(key_q);
if (iter_q == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_k[256];
snprintf(key_k, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_k = weight_int8scale_table.find(key_k);
if (iter_k == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_v[256];
snprintf(key_v, 256, "%s_param_2", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_v = weight_int8scale_table.find(key_v);
if (iter_v == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_out[256];
snprintf(key_out, 256, "%s_param_3", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_out = weight_int8scale_table.find(key_out);
if (iter_out == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// MultiHeadAttention - quantize weight from fp32 to int8
ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i];

fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str());

// TODO move to ncnn2table

const int qdim = mha->weight_data_size / mha->embed_dim;

{
mha->q_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->q_weight_data + i * qdim;
for (int j = 0; j < qdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->q_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->q_weight_data_int8_scales = iter_q->second;

ncnn::Mat q_weight_data = mha->q_weight_data.reshape(qdim, mha->embed_dim);
ncnn::Mat q_weight_data_int8;
Expand All @@ -757,19 +749,7 @@ int NetQuantize::quantize_multiheadattention()
}

{
mha->k_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->k_weight_data + i * mha->kdim;
for (int j = 0; j < mha->kdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->k_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->k_weight_data_int8_scales = iter_k->second;

ncnn::Mat k_weight_data = mha->k_weight_data.reshape(mha->kdim, mha->embed_dim);
ncnn::Mat k_weight_data_int8;
Expand All @@ -785,19 +765,7 @@ int NetQuantize::quantize_multiheadattention()
}

{
mha->v_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->v_weight_data + i * mha->vdim;
for (int j = 0; j < mha->vdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->v_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->v_weight_data_int8_scales = iter_v->second;

ncnn::Mat v_weight_data = mha->v_weight_data.reshape(mha->vdim, mha->embed_dim);
ncnn::Mat v_weight_data_int8;
Expand All @@ -813,17 +781,8 @@ int NetQuantize::quantize_multiheadattention()
}

{
const float* ptr = mha->out_weight_data;
float absmax = 0.f;
for (int j = 0; j < mha->out_weight_data.w; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->out_weight_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax;

ncnn::Mat out_weight_data_int8_scales(1);
out_weight_data_int8_scales[0] = mha->out_weight_data_int8_scale;
ncnn::Mat out_weight_data_int8_scales = iter_out->second;
mha->out_weight_data_int8_scale = out_weight_data_int8_scales[0];

ncnn::Mat out_weight_data_int8;

Expand Down Expand Up @@ -854,7 +813,7 @@ int NetQuantize::quantize_sdpa()

fprintf(stderr, "quantize_sdpa %s\n", sdpa->name.c_str());

// TODO move to ncnn2table
// SDPA uses dynamic activation quantization in forward_int8

sdpa->int8_scale_term = 2;
}
Expand Down
Loading