From 8e39f839caf86c404d50b97ea3676b657edf3767 Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Thu, 18 Jun 2026 18:38:44 +0300
Subject: [PATCH 1/7] transpose: add num_batches to batch independent
 transposes into one dispatch

GEMV and StridedCopy already take num_batches to batch B independent same-shape
operations into a single dispatch; Transpose did not, forcing callers to unroll
B per-head/per-batch transposes into B separate dispatches for identical kernel
work (a common multi-head-attention pattern).

num_batches>1 lays B contiguous (M,N) matrices back-to-back and streams them
through the same ObjectFifos (one task group per batch); the core still only
sees s*s sub-tiles, so the kernel is unchanged. num_batches=1 (default) is
byte-identical to the previous single-transpose schedule.
---
 iron/operators/transpose/design.py | 119 ++++++++++++++++++-----------
 iron/operators/transpose/op.py     |  23 +++++-
 2 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py
index bec4382b..9e08ddb7 100644
--- a/iron/operators/transpose/design.py
+++ b/iron/operators/transpose/design.py
@@ -10,7 +10,9 @@
 from aie.iron.controlflow import range_
 
 
-def shuffle_transpose(dev, M, N, num_columns, num_channels, m, n, s, func_prefix=""):
+def shuffle_transpose(
+    dev, M, N, num_columns, num_channels, m, n, s, num_batches=1, func_prefix=""
+):
     num_elements = M * N
     per_tile_elements = m * n
     dtype = bfloat16
@@ -34,8 +36,9 @@ def shuffle_transpose(dev, M, N, num_columns, num_channels, m, n, s, func_prefix
     if s == 8 and (m <= 16 or n <= 16):
         raise ValueError(f"Kernel tile {s} needs AIE tile rows > 16 and columns > 16.")
 
-    # Define tensor types
-    tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]]
+    # Define tensor types. The runtime tensor spans all batches (contiguous matrices);
+    # per-tile work on the cores is identical regardless of batch count.
+    tensor_ty = np.ndarray[(num_batches * num_elements,), np.dtype[dtype]]
     tile_ty = np.ndarray[(per_tile_elements,), np.dtype[dtype]]
 
     fifodepth = 1 if per_tile_elements > 4096 else 2
@@ -47,13 +50,25 @@ def shuffle_transpose(dev, M, N, num_columns, num_channels, m, n, s, func_prefix
     # and channels. Partially transposes the input
     # data so that the kernel only needs to
     # transpose s*s-sized sub-tiles.
+    # For num_batches>1 the L3 tensors hold that many contiguous (M,N) matrices, stacked along
+    # the row dimension: in-dims (num_batches*M, N), out-dims (num_batches*N, M). At num_batches==1
+    # these reduce to (M,N)/(N,M) — identical to the original single-transpose patterns. Each (i,j)
+    # column/channel gets one TAP per batch (offset += batch*num_elements); the per-batch internal
+    # sizes/strides are unchanged because each matrix is contiguous and row-major.
+    in_dims = (num_batches * M, N)
+    out_dims = (num_batches * N, M)
     taps_in_L3L2 = [
-        TensorAccessPattern(
-            (M, N),
-            (M // num_channels) * j * N + (N // num_columns) * i,
-            [M // num_channels // m, N // num_columns // n, m, n],
-            [m * N, n, N, 1],
-        )
+        [
+            TensorAccessPattern(
+                in_dims,
+                batch * num_elements
+                + (M // num_channels) * j * N
+                + (N // num_columns) * i,
+                [M // num_channels // m, N // num_columns // n, m, n],
+                [m * N, n, N, 1],
+            )
+            for batch in range(num_batches)
+        ]
         for i in range(num_columns)
         for j in range(num_channels)
     ]
@@ -68,12 +83,17 @@ def shuffle_transpose(dev, M, N, num_columns, num_channels, m, n, s, func_prefix
         for j in range(num_channels)
     ]
     taps_out_L1L3 = [
-        TensorAccessPattern(
-            (N, M),
-            (N // num_columns) * i * M + (M // num_channels) * j,
-            [M // num_channels // m, N // num_columns // n, n, m],
-            [m, n * M, M, 1],
-        )
+        [
+            TensorAccessPattern(
+                out_dims,
+                batch * num_elements
+                + (N // num_columns) * i * M
+                + (M // num_channels) * j,
+                [M // num_channels // m, N // num_columns // n, n, m],
+                [m, n * M, M, 1],
+            )
+            for batch in range(num_batches)
+        ]
         for i in range(num_columns)
         for j in range(num_channels)
     ]
@@ -106,14 +126,17 @@ def shuffle_transpose(dev, M, N, num_columns, num_channels, m, n, s, func_prefix
 
     # Define a task that will run on a compute tile
     def core_body(of_in1, of_out, transpose_kernel):
-        # Number of sub-matrix "tile" iterations
-        for _ in range_(N // n // num_columns):
-            for _ in range_(M // m // num_channels):
-                elem_in1 = of_in1.acquire(1)
-                elem_out = of_out.acquire(1)
-                transpose_kernel(elem_in1, elem_out)
-                of_out.release(1)
-                of_in1.release(1)
+        # Process num_batches contiguous matrices through the same FIFOs: num_batches x the per-matrix
+        # tile iterations. The kernel only ever sees s*s sub-tiles, so it is batch-agnostic.
+        for _ in range_(num_batches):
+            # Number of sub-matrix "tile" iterations
+            for _ in range_(N // n // num_columns):
+                for _ in range_(M // m // num_channels):
+                    elem_in1 = of_in1.acquire(1)
+                    elem_out = of_out.acquire(1)
+                    transpose_kernel(elem_in1, elem_out)
+                    of_out.release(1)
+                    of_in1.release(1)
 
     # Create a worker to run the task on a compute tile
     my_workers = [
@@ -134,29 +157,33 @@ def core_body(of_in1, of_out, transpose_kernel):
     with rt.sequence(tensor_ty, tensor_ty) as (A, C):
         rt.start(*my_workers)
 
-        # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
-        tg = rt.task_group()
-
-        # Fill the input objectFIFOs with data
-        for i in range(num_columns):
-            for j in range(num_channels):
-                rt.fill(
-                    of_in1s_L3L2[i * num_channels + j].prod(),
-                    A,
-                    taps_in_L3L2[i * num_channels + j],
-                    task_group=tg,
-                )
-        # Drain the output objectFIFOs with data
-        for i in range(num_columns):
-            for j in range(num_channels):
-                rt.drain(
-                    of_outs[i * num_channels + j].cons(),
-                    C,
-                    taps_out_L1L3[i * num_channels + j],
-                    wait=True,  # wait for the transfer to complete and data to be available
-                    task_group=tg,
-                )
-        rt.finish_task_group(tg)
+        # One task group per batch (each a parallel fill+drain over all columns/channels), so the
+        # num_batches contiguous matrices stream through the same FIFOs in sequence. At num_batches==1
+        # this is a single pass — identical to the original single-transpose schedule.
+        for batch in range(num_batches):
+            # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
+            tg = rt.task_group()
+
+            # Fill the input objectFIFOs with data
+            for i in range(num_columns):
+                for j in range(num_channels):
+                    rt.fill(
+                        of_in1s_L3L2[i * num_channels + j].prod(),
+                        A,
+                        taps_in_L3L2[i * num_channels + j][batch],
+                        task_group=tg,
+                    )
+            # Drain the output objectFIFOs with data
+            for i in range(num_columns):
+                for j in range(num_channels):
+                    rt.drain(
+                        of_outs[i * num_channels + j].cons(),
+                        C,
+                        taps_out_L1L3[i * num_channels + j][batch],
+                        wait=True,  # wait for the transfer to complete and data to be available
+                        task_group=tg,
+                    )
+            rt.finish_task_group(tg)
 
     # Place program components (assign them resources on the device) and generate an MLIR module
     return Program(dev, rt).resolve_program(SequentialPlacer())
diff --git a/iron/operators/transpose/op.py b/iron/operators/transpose/op.py
index d37e0101..1768103a 100644
--- a/iron/operators/transpose/op.py
+++ b/iron/operators/transpose/op.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from dataclasses import dataclass, field
+from typing import ClassVar, Dict
 
 import aie.utils as aie_utils
 from iron.common import (
@@ -16,7 +17,13 @@
 
 @dataclass
 class Transpose(MLIROperator):
-    """AIE-accelerated transpose operator"""
+    """AIE-accelerated transpose operator.
+
+    ``num_batches`` > 1 performs that many independent (M,N)->(N,M) transposes on
+    contiguous matrices laid back-to-back in memory (results concatenated), mirroring
+    GEMV's batching — the per-batch tile work rides the same ObjectFifos, so B batched
+    transposes cost ONE dispatch instead of B unrolled ones.
+    """
 
     M: int
     N: int
@@ -25,8 +32,14 @@ class Transpose(MLIROperator):
     m: int
     n: int
     s: int
+    num_batches: int = 1
     context: object = field(default=None, repr=False)
 
+    _name_aliases: ClassVar[Dict[str, str]] = {
+        **MLIROperator._name_aliases,
+        "num_batches": "batch",
+    }
+
     def __post_init__(self):
         if self.M % self.m != 0:
             raise ValueError(f"Matrix rows ({self.M}) must be a multiple of {self.m}")
@@ -66,6 +79,7 @@ def get_mlir_artifact(self):
                     self.m,
                     self.n,
                     self.s,
+                    self.num_batches,
                 ),
             ),
         )
@@ -90,7 +104,10 @@ def get_kernel_artifacts(self):
         ]
 
     def get_arg_spec(self):
+        # num_batches==1 keeps the exact flat (M*N,) spec (unchanged behavior for existing callers);
+        # >1 prepends a batch dim over contiguous matrices.
+        batch_dim = (self.num_batches,) if self.num_batches > 1 else ()
         return [
-            AIERuntimeArgSpec("in", (self.M * self.N,)),
-            AIERuntimeArgSpec("out", (self.M * self.N,)),
+            AIERuntimeArgSpec("in", batch_dim + (self.M * self.N,)),
+            AIERuntimeArgSpec("out", batch_dim + (self.N * self.M,)),
         ]

From 52db12625665d3f908fca996af56ead2611391c8 Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Thu, 18 Jun 2026 18:57:07 +0300
Subject: [PATCH 2/7] transpose: test num_batches>1 (batched transpose
 correctness)

Adds num_batches=2 (default suite) and num_batches=4 (extensive) cases to the
transpose test, with a batched golden reference. The operator's batched path was
previously untested. Verified on device (NPU2): num_batches in {1,2,4} pass.
---
 iron/operators/transpose/reference.py | 18 +++++++++++++++---
 iron/operators/transpose/test.py      | 26 +++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/iron/operators/transpose/reference.py b/iron/operators/transpose/reference.py
index 672a11fd..be354585 100644
--- a/iron/operators/transpose/reference.py
+++ b/iron/operators/transpose/reference.py
@@ -5,9 +5,21 @@
 from iron.common.test_utils import torch_dtype_map
 
 
-def generate_golden_reference(rows: int, cols: int, dtype="bf16", seed=42):
+def generate_golden_reference(
+    rows: int, cols: int, dtype="bf16", seed=42, num_batches=1
+):
     torch.manual_seed(seed)
     val_range = 4
-    input_tensor = torch.rand(rows, cols, dtype=torch_dtype_map[dtype]) * val_range
-    output_tensor = torch.transpose(input_tensor, 0, 1)
+    if num_batches == 1:
+        input_tensor = torch.rand(rows, cols, dtype=torch_dtype_map[dtype]) * val_range
+        output_tensor = torch.transpose(input_tensor, 0, 1)
+        return {"input": input_tensor, "output": output_tensor}
+    # num_batches>1: B independent (rows,cols) matrices laid back-to-back; each is
+    # transposed independently and the results concatenated in the same order.
+    input_tensor = (
+        torch.rand(num_batches, rows, cols, dtype=torch_dtype_map[dtype]) * val_range
+    )
+    output_tensor = torch.stack(
+        [torch.transpose(input_tensor[b], 0, 1) for b in range(num_batches)]
+    )
     return {"input": input_tensor, "output": output_tensor}
diff --git a/iron/operators/transpose/test.py b/iron/operators/transpose/test.py
index 19b6242f..1e404193 100755
--- a/iron/operators/transpose/test.py
+++ b/iron/operators/transpose/test.py
@@ -47,10 +47,29 @@ def get_params():
                                 m,
                                 n,
                                 s,
+                                1,
                                 marks=marks,
                             )
                         )
 
+    # num_batches>1: batch B independent same-shape transposes into one dispatch
+    # (regular shape, single column/channel). num_batches=2 runs in the default
+    # suite; the larger batch is extensive.
+    for nb in (2, 4):
+        params.append(
+            pytest.param(
+                2048,
+                64,
+                1,
+                1,
+                m,
+                n,
+                8,
+                nb,
+                marks=[] if nb == 2 else [pytest.mark.extensive],
+            )
+        )
+
     return params
 
 
@@ -58,9 +77,9 @@ def get_params():
     Latency=r"Latency \(us\): (?P<value>[\d\.]+)",
     Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",
 )
-@pytest.mark.parametrize("M,N,aie_columns,channels,m,n,s", get_params())
-def test_transpose(M, N, aie_columns, channels, m, n, s, aie_context):
-    golden_ref = generate_golden_reference(rows=M, cols=N)
+@pytest.mark.parametrize("M,N,aie_columns,channels,m,n,s,num_batches", get_params())
+def test_transpose(M, N, aie_columns, channels, m, n, s, num_batches, aie_context):
+    golden_ref = generate_golden_reference(rows=M, cols=N, num_batches=num_batches)
 
     operator = Transpose(
         M=M,
@@ -70,6 +89,7 @@ def test_transpose(M, N, aie_columns, channels, m, n, s, aie_context):
         m=m,
         n=n,
         s=s,
+        num_batches=num_batches,
         context=aie_context,
     )
 

From 93e120dcd62d339d7077ee14856536392f9c367e Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Mon, 22 Jun 2026 22:03:55 +0300
Subject: [PATCH 3/7] Update iron/operators/transpose/op.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: André Rösti <androsti@amd.com>
---
 iron/operators/transpose/op.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/iron/operators/transpose/op.py b/iron/operators/transpose/op.py
index 1768103a..699720c7 100644
--- a/iron/operators/transpose/op.py
+++ b/iron/operators/transpose/op.py
@@ -104,8 +104,6 @@ def get_kernel_artifacts(self):
         ]
 
     def get_arg_spec(self):
-        # num_batches==1 keeps the exact flat (M*N,) spec (unchanged behavior for existing callers);
-        # >1 prepends a batch dim over contiguous matrices.
         batch_dim = (self.num_batches,) if self.num_batches > 1 else ()
         return [
             AIERuntimeArgSpec("in", batch_dim + (self.M * self.N,)),

From ed7f78b3cd6707b3c5679e1b68eb837fd65b75b9 Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Mon, 22 Jun 2026 22:04:07 +0300
Subject: [PATCH 4/7] Update iron/operators/transpose/design.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: André Rösti <androsti@amd.com>
---
 iron/operators/transpose/design.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py
index 9e08ddb7..bf8403a7 100644
--- a/iron/operators/transpose/design.py
+++ b/iron/operators/transpose/design.py
@@ -158,8 +158,7 @@ def core_body(of_in1, of_out, transpose_kernel):
         rt.start(*my_workers)
 
         # One task group per batch (each a parallel fill+drain over all columns/channels), so the
-        # num_batches contiguous matrices stream through the same FIFOs in sequence. At num_batches==1
-        # this is a single pass — identical to the original single-transpose schedule.
+        # num_batches contiguous matrices stream through the same FIFOs in sequence.
         for batch in range(num_batches):
             # Initialize a group for parallel drain tasks, with fill resources free'd when drains complete.
             tg = rt.task_group()

From 12ea37f475ab32ecb2ccb419bb0950428588d0da Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Mon, 22 Jun 2026 22:04:55 +0300
Subject: [PATCH 5/7] Update iron/operators/transpose/design.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: André Rösti <androsti@amd.com>
---
 iron/operators/transpose/design.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py
index bf8403a7..15a64776 100644
--- a/iron/operators/transpose/design.py
+++ b/iron/operators/transpose/design.py
@@ -172,7 +172,7 @@ def core_body(of_in1, of_out, transpose_kernel):
                         taps_in_L3L2[i * num_channels + j][batch],
                         task_group=tg,
                     )
-            # Drain the output objectFIFOs with data
+            # Drain the output objectFIFOs of data
             for i in range(num_columns):
                 for j in range(num_channels):
                     rt.drain(

From 4aa4c9194de36bed9395f549e41e636cc36cece0 Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Mon, 22 Jun 2026 22:05:07 +0300
Subject: [PATCH 6/7] Update iron/operators/transpose/reference.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: André Rösti <androsti@amd.com>
---
 iron/operators/transpose/reference.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/iron/operators/transpose/reference.py b/iron/operators/transpose/reference.py
index be354585..a2a16843 100644
--- a/iron/operators/transpose/reference.py
+++ b/iron/operators/transpose/reference.py
@@ -10,10 +10,6 @@ def generate_golden_reference(
 ):
     torch.manual_seed(seed)
     val_range = 4
-    if num_batches == 1:
-        input_tensor = torch.rand(rows, cols, dtype=torch_dtype_map[dtype]) * val_range
-        output_tensor = torch.transpose(input_tensor, 0, 1)
-        return {"input": input_tensor, "output": output_tensor}
     # num_batches>1: B independent (rows,cols) matrices laid back-to-back; each is
     # transposed independently and the results concatenated in the same order.
     input_tensor = (
@@ -22,4 +18,7 @@ def generate_golden_reference(
     output_tensor = torch.stack(
         [torch.transpose(input_tensor[b], 0, 1) for b in range(num_batches)]
     )
+    # drop batch dimension if num_batches == 1
+    input_tensor = torch.squeeze(input_tensor, 0)
+    output_tensor = torch.squeeze(output_tensor, 0)
     return {"input": input_tensor, "output": output_tensor}

From a90732d7325dbf55a19d1cf5f5ba7350a7ca33d3 Mon Sep 17 00:00:00 2001
From: Taimuraz Kaitmazov <atassikay38@gmail.com>
Date: Mon, 22 Jun 2026 22:12:03 +0300
Subject: [PATCH 7/7] transpose: reword L3 layout comment to be self-contained

Drop the diff-relative phrasing ('original'/'unchanged') flagged in review;
the comment now describes the access-pattern layout as-is. Rationale moved to
the PR description.
---
 iron/operators/transpose/design.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py
index 15a64776..ff5c325c 100644
--- a/iron/operators/transpose/design.py
+++ b/iron/operators/transpose/design.py
@@ -50,11 +50,11 @@ def shuffle_transpose(
     # and channels. Partially transposes the input
     # data so that the kernel only needs to
     # transpose s*s-sized sub-tiles.
-    # For num_batches>1 the L3 tensors hold that many contiguous (M,N) matrices, stacked along
-    # the row dimension: in-dims (num_batches*M, N), out-dims (num_batches*N, M). At num_batches==1
-    # these reduce to (M,N)/(N,M) — identical to the original single-transpose patterns. Each (i,j)
-    # column/channel gets one TAP per batch (offset += batch*num_elements); the per-batch internal
-    # sizes/strides are unchanged because each matrix is contiguous and row-major.
+    # The L3 tensors hold num_batches contiguous (M,N) matrices stacked along the row
+    # dimension: in-dims (num_batches*M, N), out-dims (num_batches*N, M); at num_batches==1
+    # these are simply (M,N)/(N,M). Each (i,j) column/channel emits one TAP per batch, offset
+    # by batch*num_elements; the per-batch internal sizes/strides are the same for every batch
+    # because each matrix is contiguous and row-major.
     in_dims = (num_batches * M, N)
     out_dims = (num_batches * N, M)
     taps_in_L3L2 = [