Oneflow-Inc · xiezipeng-ML · Mar 13, 2023 · Mar 20, 2023 · BBuf · Mar 20, 2023
diff --git a/docs/oneflow2onnx/op_list.md b/docs/oneflow2onnx/op_list.md
@@ -30,4 +30,4 @@
 | 90   | ScalarLogicalLess| 91| ScalarLogicalGreater| 92| Gather  | 93  | Expand             |
 | 94   | fill_      | 95   | GeLU           | 96   | LayerNorm    | 97  | AmpIdentity        |
 | 98   | fast_gelu  | 99   | quick_gelu     | 100  | fused_self_attention |101 |RMSLayerNorm |
-| 102  | RMSNorm    | 103  | fused_bias_add_scale_mask_softmax_dropout |
+| 102  | RMSNorm    | 103  | fused_bias_add_scale_mask_softmax_dropout | 104 | fused_fast_gelu_mul |
diff --git a/examples/oneflow2onnx/nodes/GPU/test_fused_fast_gelu_mul.py b/examples/oneflow2onnx/nodes/GPU/test_fused_fast_gelu_mul.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import tempfile
+import oneflow as flow
+from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
+
+
+class FusedFastGelu(flow.nn.Module):
+    """Module whose forward pass is a single ``flow._C.fused_fast_gelu_mul`` call."""
+
+    def forward(self, x: flow.Tensor, hidden: flow.Tensor) -> flow.Tensor:
+        # Both arguments are tensors and are handed to the fused op in this order.
+        multiplied = flow._C.fused_fast_gelu_mul(x, hidden)
+        return multiplied
+
+
+# Module-level instance placed on CUDA; the graph class below wraps this object.
+fused_fast_gelu = FusedFastGelu().to("cuda")
+
+
+class FusedFastGeluOpGraph(flow.nn.Graph):
+    def __init__(self):
+        super().__init__()
+        # Wrap the module-level instance, the same object whose state_dict the test saves.
+        self.m = fused_fast_gelu
+
+    def build(self, x, hidden):
+        return self.m(x, hidden)
+
+
+def test_fused_fast_gelu_mul():
+    """Compile the graph on two CUDA tensors, then pass it to convert_to_onnx_and_check."""
+    graph = FusedFastGeluOpGraph()
+    graph._compile(flow.randn(4, 3, 2).to("cuda"), flow.randn(4, 3, 2).to("cuda"))
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        flow.save(fused_fast_gelu.state_dict(), tmpdirname, save_as_external_data=True)
+        convert_to_onnx_and_check(graph, onnx_model_path="/tmp", device="gpu", input_tensor_range=[-0.001, 0.001])
+
+
+test_fused_fast_gelu_mul()
diff --git a/oneflow_onnx/oneflow2onnx/handlers/math.py b/oneflow_onnx/oneflow2onnx/handlers/math.py
@@ -1143,3 +1143,32 @@ def Version_1(cls, ctx, node, **kwargs):
         ctx.RemoveNode(node.name)
         ctx.MakeNode("Identity", [softmax_y.output_tensor_names[0]], outputs=[output_name1], op_name_scope=node.name, dtypes=[dtypes[0]])
         ctx.MakeNode("Identity", [softmax_y.output_tensor_names[0]], outputs=[output_name2], op_name_scope=node.name, dtypes=[dtypes[0]])
+
+
+@flow_op("fused_fast_gelu_mul")
+class FusedFastGeluMul:
+    @classmethod
+    def Version_1(cls, ctx, node, **kwargs):
+        """Rebuild the op as 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) * input[1]."""
+        dtypes = node.output_dtypes
+        output_name = node.output_tensor_names[0]
+        x, scope = node.input_tensor_names[0], node.name
+        np_dtype = util.Onnx2NumpyDtype(dtypes[0])
+
+        beta = ctx.MakeConst(oneflow._oneflow_internal.UniqueStr("beta"), np.array(math.sqrt(2 / math.pi), dtype=np_dtype))
+        kappa = ctx.MakeConst(oneflow._oneflow_internal.UniqueStr("kKappa"), np.array(0.044715, dtype=np_dtype))
+        one = ctx.MakeConst(oneflow._oneflow_internal.UniqueStr("one"), np.array(1.0, dtype=np_dtype))
+        half = ctx.MakeConst(oneflow._oneflow_internal.UniqueStr("half"), np.array(0.5, dtype=np_dtype))
+        squared = ctx.MakeNode("Mul", [x, x], op_name_scope=scope, name="mul1", dtypes=dtypes)
+        cubed = ctx.MakeNode("Mul", [x, squared.output_tensor_names[0]], op_name_scope=scope, name="cube", dtypes=dtypes)
+        scaled_cube = ctx.MakeNode("Mul", [kappa.output_tensor_names[0], cubed.output_tensor_names[0]], op_name_scope=scope, name="mul2", dtypes=dtypes)
+        poly = ctx.MakeNode("Add", [scaled_cube.output_tensor_names[0], x], op_name_scope=scope, name="add1", dtypes=dtypes)
+        tanh_arg = ctx.MakeNode("Mul", [poly.output_tensor_names[0], beta.output_tensor_names[0]], op_name_scope=scope, name="inner", dtypes=dtypes)
+        tanh_out = ctx.MakeNode("Tanh", [tanh_arg.output_tensor_names[0]], op_name_scope=scope, name="tanh", dtypes=dtypes)
+        one_plus_tanh = ctx.MakeNode("Add", [tanh_out.output_tensor_names[0], one.output_tensor_names[0]], op_name_scope=scope, name="add2", dtypes=dtypes)
+        x_gate = ctx.MakeNode("Mul", [one_plus_tanh.output_tensor_names[0], x], op_name_scope=scope, name="mul3", dtypes=dtypes)
+        gelu = ctx.MakeNode("Mul", [x_gate.output_tensor_names[0], half.output_tensor_names[0]], op_name_scope=scope, name="mul4", dtypes=dtypes)
+        product = ctx.MakeNode("Mul", [gelu.output_tensor_names[0], node.input_tensor_names[1]], op_name_scope=scope, name="mul_node", dtypes=[dtypes[0]])
+
+        ctx.RemoveNode(scope)
+        ctx.MakeNode("Identity", [product.output_tensor_names[0]], outputs=[output_name], op_name_scope=scope, dtypes=[dtypes[0]])