Skip to content
8 changes: 4 additions & 4 deletions docs/how-to-use-and-FAQ/quantized-int8-inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,16 @@ filelist_in2.txt
```
**Here the shape is WHC, because of the order of the arguments to `ncnn::Mat`.**

### 3. Quantize model
For RNN, GRU, LSTM, MultiHeadAttention and Embed layers, ncnn2table also supports tableless quantization.

```shell
./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table
./ncnn2table rnn.param rnn.bin rnn.table method=kl
```

If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file.
### 3. Quantize model

```shell
./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin
./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table
```

## use ncnn int8 inference
Expand Down
265 changes: 112 additions & 153 deletions tools/quantize/ncnn2int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,43 +317,34 @@ int NetQuantize::quantize_rnn()
if (layers[i]->type != "RNN")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// RNN - quantize weight from fp32 to int8
ncnn::RNN* rnn = (ncnn::RNN*)layers[i];

fprintf(stderr, "quantize_rnn %s\n", rnn->name.c_str());

// TODO move to ncnn2table
const int num_directions = rnn->direction == 2 ? 2 : 1;
const int size = rnn->weight_data_size / num_directions / rnn->num_output;

ncnn::Mat weight_xc_data_int8_scales(rnn->num_output * num_directions);
ncnn::Mat weight_hc_data_int8_scales(rnn->num_output * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < rnn->num_output; q++)
{
{
const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = rnn->weight_xc_data.reshape(size, rnn->num_output * num_directions);
Expand Down Expand Up @@ -399,43 +390,34 @@ int NetQuantize::quantize_lstm()
if (layers[i]->type != "LSTM")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// LSTM - quantize weight from fp32 to int8
ncnn::LSTM* lstm = (ncnn::LSTM*)layers[i];

fprintf(stderr, "quantize_lstm %s\n", lstm->name.c_str());

// TODO move to ncnn2table
const int num_directions = lstm->direction == 2 ? 2 : 1;
const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4;

ncnn::Mat weight_xc_data_int8_scales(lstm->hidden_size * 4 * num_directions);
ncnn::Mat weight_hc_data_int8_scales(lstm->hidden_size * 4 * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < lstm->hidden_size * 4; q++)
{
{
const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = lstm->weight_xc_data.reshape(size, lstm->hidden_size * 4 * num_directions);
Expand Down Expand Up @@ -481,43 +463,34 @@ int NetQuantize::quantize_gru()
if (layers[i]->type != "GRU")
continue;

char key_xc[256];
snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_xc = weight_int8scale_table.find(key_xc);
if (iter_xc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_hc[256];
snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_hc = weight_int8scale_table.find(key_hc);
if (iter_hc == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// GRU - quantize weight from fp32 to int8
ncnn::GRU* gru = (ncnn::GRU*)layers[i];

fprintf(stderr, "quantize_gru %s\n", gru->name.c_str());

// TODO move to ncnn2table
const int num_directions = gru->direction == 2 ? 2 : 1;
const int size = gru->weight_data_size / num_directions / gru->num_output / 3;

ncnn::Mat weight_xc_data_int8_scales(gru->num_output * 3 * num_directions);
ncnn::Mat weight_hc_data_int8_scales(gru->num_output * 3 * num_directions);

for (int d = 0; d < num_directions; d++)
{
for (int q = 0; q < gru->num_output * 3; q++)
{
{
const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i]));
}
weight_xc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax;
}

{
const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q);
float absmax = 0.f;
for (int i = 0; i < size; i++)
{
absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i]));
}
weight_hc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax;
}
}
}
ncnn::Mat weight_xc_data_int8_scales = iter_xc->second;
ncnn::Mat weight_hc_data_int8_scales = iter_hc->second;

{
ncnn::Mat weight_xc_data_r2 = gru->weight_xc_data.reshape(size, gru->num_output * 3 * num_directions);
Expand Down Expand Up @@ -563,27 +536,24 @@ int NetQuantize::quantize_embed()
if (layers[i]->type != "Embed")
continue;

char key[256];
snprintf(key, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter = weight_int8scale_table.find(key);
if (iter == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// Embed - quantize weight from fp32 to int8
ncnn::Embed* embed = (ncnn::Embed*)layers[i];

fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());

// TODO move to ncnn2table

const int num_output = embed->num_output;
const int input_dim = embed->input_dim;

ncnn::Mat weight_data_int8_scales(1);
{
const float* ptr = embed->weight_data;
float absmax = 0.f;
for (int i = 0; i < embed->weight_data.w; i++)
{
absmax = std::max(absmax, (float)fabs(ptr[i]));
}

weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
}
ncnn::Mat weight_data_int8_scales = iter->second;

{
ncnn::Mat weight_data_int8;
Expand Down Expand Up @@ -719,29 +689,51 @@ int NetQuantize::quantize_multiheadattention()
if (layers[i]->type != "MultiHeadAttention")
continue;

char key_q[256];
snprintf(key_q, 256, "%s_param_0", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_q = weight_int8scale_table.find(key_q);
if (iter_q == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_k[256];
snprintf(key_k, 256, "%s_param_1", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_k = weight_int8scale_table.find(key_k);
if (iter_k == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_v[256];
snprintf(key_v, 256, "%s_param_2", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_v = weight_int8scale_table.find(key_v);
if (iter_v == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

char key_out[256];
snprintf(key_out, 256, "%s_param_3", layers[i]->name.c_str());
std::map<std::string, ncnn::Mat>::iterator iter_out = weight_int8scale_table.find(key_out);
if (iter_out == weight_int8scale_table.end())
{
fprintf(stderr, "this layer need to be quantized, but no scale param!\n");
return -1;
}

// MultiHeadAttention - quantize weight from fp32 to int8
ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i];

fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str());

// TODO move to ncnn2table

const int qdim = mha->weight_data_size / mha->embed_dim;

{
mha->q_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->q_weight_data + i * qdim;
for (int j = 0; j < qdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->q_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->q_weight_data_int8_scales = iter_q->second;

ncnn::Mat q_weight_data = mha->q_weight_data.reshape(qdim, mha->embed_dim);
ncnn::Mat q_weight_data_int8;
Expand All @@ -757,19 +749,7 @@ int NetQuantize::quantize_multiheadattention()
}

{
mha->k_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->k_weight_data + i * mha->kdim;
for (int j = 0; j < mha->kdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->k_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->k_weight_data_int8_scales = iter_k->second;

ncnn::Mat k_weight_data = mha->k_weight_data.reshape(mha->kdim, mha->embed_dim);
ncnn::Mat k_weight_data_int8;
Expand All @@ -785,19 +765,7 @@ int NetQuantize::quantize_multiheadattention()
}

{
mha->v_weight_data_int8_scales.create(mha->embed_dim);
for (int i = 0; i < mha->embed_dim; i++)
{
float absmax = 0.f;

const float* ptr = (const float*)mha->v_weight_data + i * mha->vdim;
for (int j = 0; j < mha->vdim; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->v_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
}
mha->v_weight_data_int8_scales = iter_v->second;

ncnn::Mat v_weight_data = mha->v_weight_data.reshape(mha->vdim, mha->embed_dim);
ncnn::Mat v_weight_data_int8;
Expand All @@ -813,17 +781,8 @@ int NetQuantize::quantize_multiheadattention()
}

{
const float* ptr = mha->out_weight_data;
float absmax = 0.f;
for (int j = 0; j < mha->out_weight_data.w; j++)
{
absmax = std::max(absmax, (float)fabs(ptr[j]));
}

mha->out_weight_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax;

ncnn::Mat out_weight_data_int8_scales(1);
out_weight_data_int8_scales[0] = mha->out_weight_data_int8_scale;
ncnn::Mat out_weight_data_int8_scales = iter_out->second;
mha->out_weight_data_int8_scale = out_weight_data_int8_scales[0];

ncnn::Mat out_weight_data_int8;

Expand Down Expand Up @@ -854,7 +813,7 @@ int NetQuantize::quantize_sdpa()

fprintf(stderr, "quantize_sdpa %s\n", sdpa->name.c_str());

// TODO move to ncnn2table
// SDPA uses dynamic activation quantization in forward_int8

sdpa->int8_scale_term = 2;
}
Expand Down
Loading