diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index bb0b0a7330b7..030215d5be5e 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -89,16 +89,16 @@ filelist_in2.txt ``` **Here shape is WHC, because the order of the arguments to `ncnn::Mat`.** -### 3. Quantize model +For RNN, GRU, LSTM, MultiHeadAttention and Embed layers, ncnn2table also supports tableless quantization. ```shell -./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table +./ncnn2table rnn.param rnn.bin rnn.table method=kl ``` -If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file. +### 3. Quantize model ```shell -./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin +./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table ``` ## use ncnn int8 inference diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 55db8d79c2af..a92305a75324 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -317,43 +317,34 @@ int NetQuantize::quantize_rnn() if (layers[i]->type != "RNN") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + // RNN - quantize weight from fp32 to int8 
ncnn::RNN* rnn = (ncnn::RNN*)layers[i]; fprintf(stderr, "quantize_rnn %s\n", rnn->name.c_str()); - // TODO move to ncnn2table const int num_directions = rnn->direction == 2 ? 2 : 1; const int size = rnn->weight_data_size / num_directions / rnn->num_output; - ncnn::Mat weight_xc_data_int8_scales(rnn->num_output * num_directions); - ncnn::Mat weight_hc_data_int8_scales(rnn->num_output * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < rnn->num_output; q++) - { - { - const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = rnn->weight_xc_data.reshape(size, rnn->num_output * num_directions); @@ -399,43 +390,34 @@ int NetQuantize::quantize_lstm() if (layers[i]->type != "LSTM") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // LSTM - quantize weight from 
fp32 to int8 ncnn::LSTM* lstm = (ncnn::LSTM*)layers[i]; fprintf(stderr, "quantize_lstm %s\n", lstm->name.c_str()); - // TODO move to ncnn2table const int num_directions = lstm->direction == 2 ? 2 : 1; const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; - ncnn::Mat weight_xc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < lstm->hidden_size * 4; q++) - { - { - const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = lstm->weight_xc_data.reshape(size, lstm->hidden_size * 4 * num_directions); @@ -481,43 +463,34 @@ int NetQuantize::quantize_gru() if (layers[i]->type != "GRU") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale 
param!\n"); + return -1; + } + // GRU - quantize weight from fp32 to int8 ncnn::GRU* gru = (ncnn::GRU*)layers[i]; fprintf(stderr, "quantize_gru %s\n", gru->name.c_str()); - // TODO move to ncnn2table const int num_directions = gru->direction == 2 ? 2 : 1; const int size = gru->weight_data_size / num_directions / gru->num_output / 3; - ncnn::Mat weight_xc_data_int8_scales(gru->num_output * 3 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(gru->num_output * 3 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < gru->num_output * 3; q++) - { - { - const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = gru->weight_xc_data.reshape(size, gru->num_output * 3 * num_directions); @@ -563,27 +536,24 @@ int NetQuantize::quantize_embed() if (layers[i]->type != "Embed") continue; + char key[256]; + snprintf(key, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // Embed - quantize weight from fp32 to int8 ncnn::Embed* embed = (ncnn::Embed*)layers[i]; fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); - // TODO move to ncnn2table - const int num_output = embed->num_output; const int input_dim 
= embed->input_dim; - ncnn::Mat weight_data_int8_scales(1); - { - const float* ptr = embed->weight_data; - float absmax = 0.f; - for (int i = 0; i < embed->weight_data.w; i++) - { - absmax = std::max(absmax, (float)fabs(ptr[i])); - } - - weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax; - } + ncnn::Mat weight_data_int8_scales = iter->second; { ncnn::Mat weight_data_int8; @@ -719,29 +689,51 @@ int NetQuantize::quantize_multiheadattention() if (layers[i]->type != "MultiHeadAttention") continue; + char key_q[256]; + snprintf(key_q, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_q = weight_int8scale_table.find(key_q); + if (iter_q == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_k[256]; + snprintf(key_k, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_k = weight_int8scale_table.find(key_k); + if (iter_k == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_v[256]; + snprintf(key_v, 256, "%s_param_2", layers[i]->name.c_str()); + std::map::iterator iter_v = weight_int8scale_table.find(key_v); + if (iter_v == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_out[256]; + snprintf(key_out, 256, "%s_param_3", layers[i]->name.c_str()); + std::map::iterator iter_out = weight_int8scale_table.find(key_out); + if (iter_out == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // MultiHeadAttention - quantize weight from fp32 to int8 ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i]; fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str()); - // TODO move to ncnn2table - const int qdim = mha->weight_data_size / mha->embed_dim; { - 
mha->q_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->q_weight_data + i * qdim; - for (int j = 0; j < qdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->q_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->q_weight_data_int8_scales = iter_q->second; ncnn::Mat q_weight_data = mha->q_weight_data.reshape(qdim, mha->embed_dim); ncnn::Mat q_weight_data_int8; @@ -757,19 +749,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->k_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->k_weight_data + i * mha->kdim; - for (int j = 0; j < mha->kdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->k_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->k_weight_data_int8_scales = iter_k->second; ncnn::Mat k_weight_data = mha->k_weight_data.reshape(mha->kdim, mha->embed_dim); ncnn::Mat k_weight_data_int8; @@ -785,19 +765,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->v_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->v_weight_data + i * mha->vdim; - for (int j = 0; j < mha->vdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->v_weight_data_int8_scales[i] = absmax == 0.f ? 
1.f : 127 / absmax; - } + mha->v_weight_data_int8_scales = iter_v->second; ncnn::Mat v_weight_data = mha->v_weight_data.reshape(mha->vdim, mha->embed_dim); ncnn::Mat v_weight_data_int8; @@ -813,17 +781,8 @@ int NetQuantize::quantize_multiheadattention() } { - const float* ptr = mha->out_weight_data; - float absmax = 0.f; - for (int j = 0; j < mha->out_weight_data.w; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->out_weight_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax; - - ncnn::Mat out_weight_data_int8_scales(1); - out_weight_data_int8_scales[0] = mha->out_weight_data_int8_scale; + ncnn::Mat out_weight_data_int8_scales = iter_out->second; + mha->out_weight_data_int8_scale = out_weight_data_int8_scales[0]; ncnn::Mat out_weight_data_int8; @@ -854,7 +813,7 @@ int NetQuantize::quantize_sdpa() fprintf(stderr, "quantize_sdpa %s\n", sdpa->name.c_str()); - // TODO move to ncnn2table + // SDPA uses dynamic activation quantization in forward_int8 sdpa->int8_scale_term = 2; } diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 7edbdd15128d..55aac995c0e6 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -38,6 +38,11 @@ #include "layer/convolution.h" #include "layer/convolutiondepthwise.h" #include "layer/innerproduct.h" +#include "layer/embed.h" +#include "layer/multiheadattention.h" +#include "layer/rnn.h" +#include "layer/lstm.h" +#include "layer/gru.h" class QuantBlobStat { @@ -61,6 +66,23 @@ class QuantBlobStat std::vector histogram_normed; }; +class QuantMHAStat +{ +public: + ncnn::Mat q_weight_scales; + ncnn::Mat k_weight_scales; + ncnn::Mat v_weight_scales; + float out_weight_scale; +}; + +// rnn, gru, lstm +class QuantRecurrentStat +{ +public: + ncnn::Mat weight_xc_scales; + ncnn::Mat weight_hc_scales; +}; + class QuantNet : public ncnn::Net { public: @@ -91,11 +113,21 @@ class QuantNet : public ncnn::Net std::vector conv_layers; std::vector conv_bottom_blobs; std::vector 
conv_top_blobs; + std::vector embed_layers; + std::vector mha_layers; + std::vector rnn_layers; + std::vector lstm_layers; + std::vector gru_layers; // result std::vector quant_blob_stats; std::vector weight_scales; std::vector bottom_blob_scales; + std::vector embed_weight_scales; + std::vector mha_stats; + std::vector rnn_stats; + std::vector lstm_stats; + std::vector gru_stats; }; QuantNet::QuantNet() @@ -126,14 +158,48 @@ int QuantNet::init() conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } + + // find embed layers + else if (layer->type == "Embed") + { + embed_layers.push_back(i); + } + + // find all mha layers + else if (layer->type == "MultiHeadAttention") + { + mha_layers.push_back(i); + } + else if (layer->type == "RNN") + { + rnn_layers.push_back(i); + } + else if (layer->type == "LSTM") + { + lstm_layers.push_back(i); + } + else if (layer->type == "GRU") + { + gru_layers.push_back(i); + } } const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); quant_blob_stats.resize(conv_bottom_blob_count); weight_scales.resize(conv_layer_count); bottom_blob_scales.resize(conv_bottom_blob_count); + embed_weight_scales.resize(embed_layer_count); + mha_stats.resize(mha_layer_count); + rnn_stats.resize(rnn_layer_count); + lstm_stats.resize(lstm_layer_count); + gru_stats.resize(gru_layer_count); return 0; } @@ -149,6 +215,11 @@ int QuantNet::save_table(const char* tablepath) const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = 
(int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); fprintf(stdout, "param:%d\n", conv_layer_count); @@ -176,6 +247,110 @@ int QuantNet::save_table(const char* tablepath) fprintf(fp, "\n"); } + fprintf(stdout, "param:%d\n", embed_layer_count); + for (int i = 0; i < embed_layer_count; i++) + { + fprintf(fp, "%s_param_0 ", layers[embed_layers[i]]->name.c_str()); + fprintf(fp, "%f ", embed_weight_scales[i]); + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", mha_layer_count); + for (int i = 0; i < mha_layer_count; i++) + { + // q_weight + const ncnn::Mat q_weight_scales = mha_stats[i].q_weight_scales; + fprintf(fp, "%s_param_0 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < q_weight_scales.w; j++) + { + fprintf(fp, "%f ", q_weight_scales[j]); + } + fprintf(fp, "\n"); + + // k_weight + const ncnn::Mat k_weight_scales = mha_stats[i].k_weight_scales; + fprintf(fp, "%s_param_1 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < k_weight_scales.w; j++) + { + fprintf(fp, "%f ", k_weight_scales[j]); + } + fprintf(fp, "\n"); + + // v_weight + const ncnn::Mat v_weight_scales = mha_stats[i].v_weight_scales; + fprintf(fp, "%s_param_2 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < v_weight_scales.w; j++) + { + fprintf(fp, "%f ", v_weight_scales[j]); + } + fprintf(fp, "\n"); + + // out_weight + fprintf(fp, "%s_param_3 ", layers[mha_layers[i]]->name.c_str()); + fprintf(fp, "%f ", mha_stats[i].out_weight_scale); + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", rnn_layer_count); + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = rnn_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const 
ncnn::Mat weight_hc_scales = rnn_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", lstm_layer_count); + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = lstm_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = lstm_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", gru_layer_count); + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = gru_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = gru_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + fclose(fp); fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n"); @@ -302,12 +477,11 @@ int QuantNet::quantize_KL() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - const int file_count = (int)listspaths[0].size(); - - const int num_histogram_bins = 2048; - - std::vector 
blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) @@ -407,6 +581,205 @@ int QuantNet::quantize_KL() } } + // initialize embed weight scales + for (int i = 0; i < embed_layer_count; i++) + { + const ncnn::Layer* layer = layers[embed_layers[i]]; + const ncnn::Embed* embed = (const ncnn::Embed*)layer; + const float* ptr = embed->weight_data; + + float absmax = 0.f; + for (int j = 0; j < embed->weight_data.w; j++) + { + absmax = std::max(absmax, (float)fabs(ptr[j])); + } + embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; + } + + // initialize mha weight scales + for (int i = 0; i < mha_layer_count; i++) + { + const ncnn::Layer* layer = layers[mha_layers[i]]; + const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; + + const int qdim = mha->weight_data_size / mha->embed_dim; + mha_stats[i].q_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float q_absmax = 0.f; + + const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; + for (int k = 0; k < qdim; k++) + { + q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); + } + mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 1.f : 127 / q_absmax; + } + + const int kdim = mha->kdim; + mha_stats[i].k_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float k_absmax = 0.f; + + const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; + for (int k = 0; k < kdim; k++) + { + k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); + } + mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 
1.f : 127 / k_absmax; + } + + const int vdim = mha->vdim; + mha_stats[i].v_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float v_absmax = 0.f; + + const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; + for (int k = 0; k < vdim; k++) + { + v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); + } + mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; + } + + const float* o_ptr = (const float*)mha->out_weight_data; + float o_absmax = 0.f; + for (int k = 0; k < mha->out_weight_data.w; k++) + { + o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); + } + mha_stats[i].out_weight_scale = o_absmax == 0.f ? 1.f : 127 / o_absmax; + } + + // initialize rnn weight scales + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Layer* layer = layers[rnn_layers[i]]; + const ncnn::RNN* rnn = (const ncnn::RNN*)layer; + + const int num_directions = rnn->direction == 2 ? 2 : 1; + const int size = rnn->weight_data_size / num_directions / rnn->num_output; + + rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); + rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < rnn->num_output; q++) + { + { + const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < rnn->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 
1.f : 127 / absmax; + } + } + } + } + + // initialize lstm weight scales + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Layer* layer = layers[lstm_layers[i]]; + const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; + + const int num_directions = lstm->direction == 2 ? 2 : 1; + const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; + + lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); + lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < lstm->hidden_size * 4; q++) + { + { + const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < lstm->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize gru weight scales + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Layer* layer = layers[gru_layers[i]]; + const ncnn::GRU* gru = (const ncnn::GRU*)layer; + + const int num_directions = gru->direction == 2 ? 
2 : 1; + const int size = gru->weight_data_size / num_directions / gru->num_output / 3; + + gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); + gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < gru->num_output * 3; q++) + { + { + const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < gru->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + if (conv_layer_count == 0) + return 0; + + const int file_count = (int)listspaths[0].size(); + + const int num_histogram_bins = 2048; + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -780,10 +1153,11 @@ int QuantNet::quantize_ACIQ() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - const int file_count = (int)listspaths[0].size(); - - std::vector blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); + const int embed_layer_count = (int)embed_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); + const int 
mha_layer_count = (int)mha_layers.size(); // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) @@ -887,6 +1261,203 @@ int QuantNet::quantize_ACIQ() } } + // initialize embed weight scales + for (int i = 0; i < embed_layer_count; i++) + { + const ncnn::Layer* layer = layers[embed_layers[i]]; + const ncnn::Embed* embed = (const ncnn::Embed*)layer; + const float* ptr = embed->weight_data; + + float absmax = 0.f; + for (int j = 0; j < embed->weight_data.w; j++) + { + absmax = std::max(absmax, (float)fabs(ptr[j])); + } + embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; + } + + // initialize rnn weight scales + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Layer* layer = layers[rnn_layers[i]]; + const ncnn::RNN* rnn = (const ncnn::RNN*)layer; + + const int num_directions = rnn->direction == 2 ? 2 : 1; + const int size = rnn->weight_data_size / num_directions / rnn->num_output; + + rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); + rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < rnn->num_output; q++) + { + { + const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < rnn->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 
1.f : 127 / absmax; + } + } + } + } + + // initialize lstm weight scales + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Layer* layer = layers[lstm_layers[i]]; + const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; + + const int num_directions = lstm->direction == 2 ? 2 : 1; + const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; + + lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); + lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < lstm->hidden_size * 4; q++) + { + { + const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < lstm->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize gru weight scales + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Layer* layer = layers[gru_layers[i]]; + const ncnn::GRU* gru = (const ncnn::GRU*)layer; + + const int num_directions = gru->direction == 2 ? 
2 : 1; + const int size = gru->weight_data_size / num_directions / gru->num_output / 3; + + gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); + gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < gru->num_output * 3; q++) + { + { + const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < gru->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize mha weight scales + for (int i = 0; i < mha_layer_count; i++) + { + const ncnn::Layer* layer = layers[mha_layers[i]]; + const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; + + const int qdim = mha->weight_data_size / mha->embed_dim; + mha_stats[i].q_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float q_absmax = 0.f; + + const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; + for (int k = 0; k < qdim; k++) + { + q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); + } + mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 1.f : 127 / q_absmax; + } + + const int kdim = mha->kdim; + mha_stats[i].k_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float k_absmax = 0.f; + + const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; + for (int k = 0; k < kdim; k++) + { + k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); + } + mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 
1.f : 127 / k_absmax; + } + + const int vdim = mha->vdim; + mha_stats[i].v_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float v_absmax = 0.f; + + const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; + for (int k = 0; k < vdim; k++) + { + v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); + } + mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; + } + + const float* o_ptr = (const float*)mha->out_weight_data; + float o_absmax = 0.f; + for (int k = 0; k < mha->out_weight_data.w; k++) + { + o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); + } + mha_stats[i].out_weight_scale = o_absmax == 0.f ? 1.f : 127 / o_absmax; + } + + if (conv_layer_count == 0) + return 0; + + const int file_count = (int)listspaths[0].size(); + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -1112,6 +1683,9 @@ int QuantNet::quantize_EQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + if (conv_layer_count == 0) + return 0; + std::vector blob_allocators(quantize_num_threads); std::vector workspace_allocators(quantize_num_threads); @@ -1661,6 +2235,7 @@ static void print_pixel_type_list(const std::vector& list) static void show_usage() { fprintf(stderr, "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] 
[ncnntable] [(key=value)...]\n"); + fprintf(stderr, " ncnn2table [ncnnparam] [ncnnbin] [ncnntable] [(key=value)...]\n"); fprintf(stderr, " mean=[104.0,117.0,123.0],...\n"); fprintf(stderr, " norm=[1.0,1.0,1.0],...\n"); fprintf(stderr, " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n"); @@ -1671,11 +2246,12 @@ static void show_usage() fprintf(stderr, "Sample usage:\n"); fprintf(stderr, " ncnn2table squeezenet.param squeezenet.bin filelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"); fprintf(stderr, " ncnn2table test.param test.bin filelist.txt squeezenet.table shape=[227,227,3] method=kl type=1\n"); + fprintf(stderr, " ncnn2table rnn.param rnn.bin rnn.table method=kl\n"); } int main(int argc, char** argv) { - if (argc < 5) + if (argc < 4) { show_usage(); return -1; @@ -1692,8 +2268,6 @@ int main(int argc, char** argv) const char* inparam = argv[1]; const char* inbin = argv[2]; - char* lists = argv[3]; - const char* outtable = argv[4]; ncnn::Option opt; opt.num_threads = 1; @@ -1709,13 +2283,47 @@ int main(int argc, char** argv) net.init(); - // load lists - net.listspaths = parse_comma_path_list(lists); + const bool need_calibration_dataset = !net.conv_layers.empty(); + + const char* outtable = 0; + int kv_start = 0; + + if (need_calibration_dataset) + { + if (argc < 5) + { + show_usage(); + return -1; + } + + net.listspaths = parse_comma_path_list(argv[3]); + outtable = argv[4]; + kv_start = 5; + } + else + { + if (argc >= 5 && strchr(argv[4], '=')) + { + outtable = argv[3]; + kv_start = 4; + } + else if (argc >= 5) + { + net.listspaths = parse_comma_path_list(argv[3]); + outtable = argv[4]; + kv_start = 5; + } + else + { + outtable = argv[3]; + kv_start = 4; + } + } std::string method = "kl"; net.file_type = 0; - for (int i = 5; i < argc; i++) + for (int i = kv_start; i < argc; i++) { // key=value char* kv = argv[i]; @@ -1751,27 +2359,27 @@ int main(int argc, char** argv) // sanity 
check const size_t input_blob_count = net.input_blobs.size(); - if (net.listspaths.size() != input_blob_count) + if (need_calibration_dataset && net.listspaths.size() != input_blob_count) { fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size()); return -1; } - if ((0 == net.file_type) && (net.means.size() != input_blob_count)) + if (need_calibration_dataset && (0 == net.file_type) && (net.means.size() != input_blob_count)) { fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size()); return -1; } - if ((0 == net.file_type) && (net.norms.size() != input_blob_count)) + if (need_calibration_dataset && (0 == net.file_type) && (net.norms.size() != input_blob_count)) { fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size()); return -1; } - if (net.shapes.size() != input_blob_count) + if (need_calibration_dataset && net.shapes.size() != input_blob_count) { fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size()); return -1; } - if ((0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) + if (need_calibration_dataset && (0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) { fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size()); return -1;