From 3183fdd0bffdd88649cc30068c4d1de58a0fddda Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 14 Apr 2026 12:14:07 +0100
Subject: [PATCH 01/10] [Feature] Add LUT-based compressor tree generator

Port of compressor-python library for efficient low-bitwidth dot product
computation using LUT primitives instead of DSP blocks.

Architecture:
- Counter-based compressor trees
- Fused accumulation with constant propagation
- Target-specific primitive selection (CARRY4/CARRY8/LOOKAHEAD8)

FPGA Support:
- Versal: Fully functional
- 7-Series: Functional without fused accumulation and gate absorption (not ready for mvau integration)
- UltraScale/UltraScale+: Not yet implemented

Integration scripts for both dotp_comp and add_multi optimization modes included.

Implementation:
- Python-based compressor graph construction and optimization
- SystemVerilog template expansion for RTL generation
- mul_comp_map module for partial product broadcasting

This commit adds the generator infrastructure only. Integration with
FINN's RTL backend follows in subsequent commits.
---
 src/finn/compressor/Makefile                  |  17 +
 src/finn/compressor/README.md                 |  71 ++
 src/finn/compressor/__init__.py               |  13 +
 src/finn/compressor/hdl/dotp_comp_template.sv | 154 ++++
 src/finn/compressor/hdl/dotp_template.sv      |  66 ++
 src/finn/compressor/hdl/mul_comp_map.sv       | 239 ++++++
 src/finn/compressor/src/__init__.py           |   8 +
 src/finn/compressor/src/add_multi_finn.py     | 408 ++++++++++
 src/finn/compressor/src/benchmark.py          |  61 ++
 src/finn/compressor/src/dotp.py               |  97 +++
 src/finn/compressor/src/dotp_finn.py          | 264 +++++++
 src/finn/compressor/src/evaluation.py         | 253 ++++++
 src/finn/compressor/src/graph/__init__.py     |   8 +
 src/finn/compressor/src/graph/accumulator.py  |  96 +++
 .../compressor/src/graph/counters/__init__.py |   8 +
 .../counters/absorption_counter_candidates.py | 299 +++++++
 .../src/graph/counters/counter_candidates.py  | 737 ++++++++++++++++++
 src/finn/compressor/src/graph/final_adder.py  | 364 +++++++++
 src/finn/compressor/src/graph/nodes.py        | 393 ++++++++++
 src/finn/compressor/src/graph/primitives.py   | 113 +++
 src/finn/compressor/src/graph/visitor.py      |  45 ++
 src/finn/compressor/src/main.py               | 169 ++++
 src/finn/compressor/src/passes/__init__.py    |   8 +
 .../src/passes/compressor_constructor.py      | 183 +++++
 .../src/passes/compressor_pipeliner.py        |  33 +
 .../compressor/src/passes/cost_estimator.py   |  35 +
 src/finn/compressor/src/passes/emitter.py     | 317 ++++++++
 .../compressor/src/passes/io_annotator.py     |  54 ++
 src/finn/compressor/src/passes/lut_placer.py  |  85 ++
 .../compressor/src/passes/node_iterator.py    | 123 +++
 src/finn/compressor/src/passes/printer.py     |  54 ++
 .../compressor/src/passes/wire_inserter.py    |  40 +
 src/finn/compressor/src/target.py             | 102 +++
 src/finn/compressor/src/tests/__init__.py     |   8 +
 src/finn/compressor/src/tests/test_gen.py     | 150 ++++
 src/finn/compressor/src/tests/tester.py       |  41 +
 src/finn/compressor/src/utils/__init__.py     |   8 +
 src/finn/compressor/src/utils/mul_comp_map.py |  58 ++
 src/finn/compressor/src/utils/shape.py        |  51 ++
 39 files changed, 5233 insertions(+)
 create mode 100644 src/finn/compressor/Makefile
 create mode 100644 src/finn/compressor/README.md
 create mode 100644 src/finn/compressor/__init__.py
 create mode 100644 src/finn/compressor/hdl/dotp_comp_template.sv
 create mode 100644 src/finn/compressor/hdl/dotp_template.sv
 create mode 100644 src/finn/compressor/hdl/mul_comp_map.sv
 create mode 100644 src/finn/compressor/src/__init__.py
 create mode 100644 src/finn/compressor/src/add_multi_finn.py
 create mode 100644 src/finn/compressor/src/benchmark.py
 create mode 100644 src/finn/compressor/src/dotp.py
 create mode 100644 src/finn/compressor/src/dotp_finn.py
 create mode 100644 src/finn/compressor/src/evaluation.py
 create mode 100644 src/finn/compressor/src/graph/__init__.py
 create mode 100644 src/finn/compressor/src/graph/accumulator.py
 create mode 100644 src/finn/compressor/src/graph/counters/__init__.py
 create mode 100644 src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
 create mode 100644 src/finn/compressor/src/graph/counters/counter_candidates.py
 create mode 100644 src/finn/compressor/src/graph/final_adder.py
 create mode 100644 src/finn/compressor/src/graph/nodes.py
 create mode 100644 src/finn/compressor/src/graph/primitives.py
 create mode 100644 src/finn/compressor/src/graph/visitor.py
 create mode 100644 src/finn/compressor/src/main.py
 create mode 100644 src/finn/compressor/src/passes/__init__.py
 create mode 100644 src/finn/compressor/src/passes/compressor_constructor.py
 create mode 100644 src/finn/compressor/src/passes/compressor_pipeliner.py
 create mode 100644 src/finn/compressor/src/passes/cost_estimator.py
 create mode 100644 src/finn/compressor/src/passes/emitter.py
 create mode 100644 src/finn/compressor/src/passes/io_annotator.py
 create mode 100644 src/finn/compressor/src/passes/lut_placer.py
 create mode 100644 src/finn/compressor/src/passes/node_iterator.py
 create mode 100644 src/finn/compressor/src/passes/printer.py
 create mode 100644 src/finn/compressor/src/passes/wire_inserter.py
 create mode 100644 src/finn/compressor/src/target.py
 create mode 100644 src/finn/compressor/src/tests/__init__.py
 create mode 100644 src/finn/compressor/src/tests/test_gen.py
 create mode 100644 src/finn/compressor/src/tests/tester.py
 create mode 100644 src/finn/compressor/src/utils/__init__.py
 create mode 100644 src/finn/compressor/src/utils/mul_comp_map.py
 create mode 100644 src/finn/compressor/src/utils/shape.py

diff --git a/src/finn/compressor/Makefile b/src/finn/compressor/Makefile
new file mode 100644
index 0000000000..7df3e6963e
--- /dev/null
+++ b/src/finn/compressor/Makefile
@@ -0,0 +1,17 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Build automation for compressor testing and generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# CA is forwarded to run_tests.sh; default is empty (no constant absorption)
+CA?=
+.PHONY: default clean
+
+default:
+	./run_tests.sh $(CA)
+clean:
+	rm -rf *.log *.jou *.vivado .Xil xvlog.pb gen/*
diff --git a/src/finn/compressor/README.md b/src/finn/compressor/README.md
new file mode 100644
index 0000000000..8c6fbbd2b0
--- /dev/null
+++ b/src/finn/compressor/README.md
@@ -0,0 +1,71 @@
+<!--
+Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+SPDX-License-Identifier: BSD-3-Clause
+-->
+
+# Python Compressor Generator
+This tool can generate compressor trees for 7-Series, UltraScale(+) and Versal for arbitrary input shapes.
+
+# Getting started
+1. Clone this repository.
+2. _No_ further dependencies needed!
+
+## Usage
+Generate a compressor of shape `(12,12,12)` called `comp` and save it under `gen/comp12_12_12.sv`:
+
+```python3 src/main.py -s 12,12,12 -n comp -o gen/comp12_12_12.sv```
+
+See `python3 src/main.py -h` for details.
+
+## Features
+### Custom Input Shape
+The tool can generate compressors for any input shape. A shape is passed as a comma-separated list. Each number indicates a column's height. *LSB* is *left*, *MSB* is *right*.
+
+### Accumulation
+By passing `-a`, the tool generates an accumulator instead of just an adder. The accumulator's width can be specified by `-w`.
+### Gate Absorption
+If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same as for Vivado's LUT2 primitive:
+| Secondary Input | Primary Input | Output
+|-----------------|---------------|----------------
+|0	              |0	          |(DIGIT >> 0) & 1
+|0	              |1	          |(DIGIT >> 1) & 1
+|1	              |0	          |(DIGIT >> 2) & 1
+|1	              |1	          |(DIGIT >> 3) & 1
+
+For example, `8` maps to an AND gate and `6` maps to an XOR gate.
+
+In CLI, gates can be specified as a flat string like `-g 883ABC`. The *LSB* is *left* and *MSB* is *right*. The leftmost specified gate corresponds to the LSB input in the generated compressor input vector.
+
+### Target
+Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t {Versal,7-Series,UltraScale}`.
+
+### Automated Testing
+The tool can automatically generate a SystemVerilog testbench to fuzzy-test the generated compressors by passing `--test`. For testing, the `xvlog`, `xelab` and `xsim` commands have to be available.
+
+### Custom Pipeline Depth
+Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined. 
+
+### Constant Input
+Aside from the regular, variable compressor inputs, the tool also supports an additional constant input. It can be specified as a binary number by `-c NUMBER`.
+
+# Implementation Details - How the Code is Structured
+The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`. 
+Compressor construction is done in several passes:
+1. Create a graph with all scheduled counters and a final adder (in `src/passes/compressor_constructor.py`).
+    1. (Optional) Generate a gate absorption stage.
+    2. Generate regular compression stages until the compression goal is reached.
+    3. Insert pipeline registers between compressor stages.
+    4. Build either a final adder or an accumulator as the final stage.
+2. Annotate LUT6CY instances with placement constraints so that the LUT Cascade will be utilized (in `src/passes/lut_placer.py`).
+3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`). 
+4. Annotate input and output signals in the compressor (in `src/passes/io_annotator.py`).
+5. Emit generated SystemVerilog source (in `src/passes/emitter.py`)
+
+## Extending the Tool
+### Adding new Counters
+Counters without gate absorption are defined in `graph/counters/counter_candidates.py`. 
+Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`. 
+
+### Adding new Passes
+Before adding new passes over the compressor graph, check whether the simple iterator defined in `src/passes/node_iterator.py` can be inherited to save boilerplate code.
diff --git a/src/finn/compressor/__init__.py b/src/finn/compressor/__init__.py
new file mode 100644
index 0000000000..38b3d95ea5
--- /dev/null
+++ b/src/finn/compressor/__init__.py
@@ -0,0 +1,13 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN compressor package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""FINN compressor — LUT-based compressor tree generator for MVU."""
+
+from .src.dotp_finn import generate_dotp_comp
+from .src.add_multi_finn import generate_add_multi_comps
diff --git a/src/finn/compressor/hdl/dotp_comp_template.sv b/src/finn/compressor/hdl/dotp_comp_template.sv
new file mode 100644
index 0000000000..5cb3119a0e
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_template.sv
@@ -0,0 +1,154 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	RTL template for dot product compressor with accumulation
+ * @author	Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * LUT-based dot product with fused accumulation.
+ * Drop-in replacement for DSP-based compute cores in the MVU.
+ * Uses a generated compressor tree for the reduction.
+ *
+ *		This file is a TEMPLATE — $COMP_MODULE_NAME$ is substituted
+ *		at code generation time with the config-specific compressor
+ *		module name (e.g. comp_8xs2s2).
+ *****************************************************************************/
+
+module dotp_comp #(
+	int unsigned  PE,	// number of parallel compressor instances (genPE)
+	int unsigned  SIMD,	// number of weight/activation pairs reduced per PE
+	int unsigned  WEIGHT_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  ACCU_WIDTH,	// width of each PE result in p
+	bit  SIGNED_ACTIVATIONS = 0,
+	int unsigned  COMP_PIPELINE_DEPTH = 1	// length of the last -> vld delay line, must be >= 1
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,	// delayed into vld; also ORed into the compressor rst
+	input	logic  zero,	// ORed with rst into the compressor en_neg
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]   w,	// weights, one set per PE
+	input	logic        [SIMD-1:0][ACTIVATION_WIDTH-1:0]       a,	// activations, shared by all PEs
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	initial begin
+		if(COMP_PIPELINE_DEPTH < 1) begin
+			$error("%m: COMP_PIPELINE_DEPTH (%0d) must be >= 1.", COMP_PIPELINE_DEPTH);
+			$finish;
+		end
+	end
+
+	//-----------------------------------------------------------------------
+	// Operand Mapping
+	//
+	// The `mul_comp_map` interface handles the partial-product broadcasting.
+	// It requires NA >= NB, and weights are always signed.
+	// If activations are wider, swap operands so that ia gets the wider one.
+	localparam bit  SWAPPED = ACTIVATION_WIDTH > WEIGHT_WIDTH;
+
+	localparam int unsigned  NA = SWAPPED ? ACTIVATION_WIDTH : WEIGHT_WIDTH;
+	localparam int unsigned  NB = SWAPPED ? WEIGHT_WIDTH     : ACTIVATION_WIDTH;
+	localparam bit  SIGNED_A    = SWAPPED ? SIGNED_ACTIVATIONS : 1;  // weights always signed
+	localparam bit  SIGNED_B    = SWAPPED ? 1 : SIGNED_ACTIVATIONS;
+
+	// Input to Matrix Broadcasting; map0 is also queried for the matrix shape
+	uwire [NA-1:0]  map0_ia = SWAPPED ? NA'(a[0])    : NA'(w[0][0]);
+	uwire [NB-1:0]  map0_ib = SWAPPED ? NB'(w[0][0]) : NB'(a[0]);
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
+		map0 (.ia(map0_ia), .ib(map0_ib));
+	localparam int unsigned  NM = $bits(map0.oa);
+
+	//-----------------------------------------------------------------------
+	// Pipeline shift register for last -> vld (COMP_PIPELINE_DEPTH stages, gated by en)
+/* verilator lint_off LITENDIAN */
+	logic [1:COMP_PIPELINE_DEPTH]  L = '0;
+/* verilator lint_on LITENDIAN */
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en) begin
+			L[1] <= last;
+			for(int unsigned  i = 2; i <= COMP_PIPELINE_DEPTH; i++)
+				L[i] <= L[i-1];
+		end
+	end
+	assign	vld = L[COMP_PIPELINE_DEPTH];
+
+	//-----------------------------------------------------------------------
+	// PE-parallel compressor instances
+	//-----------------------------------------------------------------------
+	for(genvar  pe = 0; pe < PE; pe++) begin : genPE
+
+		// Partial product matrix broadcasting: one mul_comp_map per SIMD lane
+		uwire [NM-1:0]  oa[SIMD];
+		uwire [NM-1:0]  ob[SIMD];
+		for(genvar  i = 0; i < SIMD; i++) begin : genMap
+			uwire [NA-1:0]  map_ia = SWAPPED ? NA'(a[i])    : NA'(w[pe][i]);
+			uwire [NB-1:0]  map_ib = SWAPPED ? NB'(w[pe][i]) : NB'(a[i]);
+			mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
+				map_i (.ia(map_ia), .ib(map_ib));
+			assign	oa[i] = map_i.oa;
+			assign	ob[i] = map_i.ob;
+		end : genMap
+
+		// Flatten all matrices column by column: per column, the rows of lane 0, then lane 1, ...
+		logic [SIMD*NM-1:0]  comp_a;
+		logic [SIMD*NM-1:0]  comp_b;
+		always_comb begin : blkFlatten
+			automatic int unsigned  src_idx[SIMD] = '{ default: 0 };
+			automatic int unsigned  dst_idx = 0;
+			for(int unsigned  col = 0; col < map0.columns(); col++) begin
+				for(int unsigned  k = 0; k < SIMD; k++) begin
+					for(int unsigned  row = 0; row < map0.height(col); row++) begin
+						comp_a[dst_idx] = oa[k][src_idx[k]];
+						comp_b[dst_idx] = ob[k][src_idx[k]];
+						src_idx[k]++;
+						dst_idx++;
+					end
+				end
+			end
+		end : blkFlatten
+
+		// Compressor with fused accumulation
+		// $COMP_MODULE_NAME$ is replaced at code generation time with the
+		// config-specific compressor module (e.g. comp_8xs2s2).
+		uwire [ACCU_WIDTH-1:0]  comp_out;
+		$COMP_MODULE_NAME$ comp_inst (
+			.clk,
+			.in(comp_b),
+			.in_2(comp_a),
+			.rst(rst || last),	// NOTE(review): presumably restarts the accumulation after the last element - confirm
+			.en_neg(rst || zero),
+			.en(en),
+			.out(comp_out)
+		);
+
+		assign	p[pe] = $signed(comp_out);
+
+	end : genPE
+
+	//-----------------------------------------------------------------------
+	// Parameter Validation (a mismatch only raises a warning)
+	//-----------------------------------------------------------------------
+	initial begin
+		if (SIMD != $EXPECTED_SIMD$ || NA != $EXPECTED_NA$ || NB != $EXPECTED_NB$ ||
+		    SIGNED_A != $EXPECTED_SIGNED_A$ || SIGNED_B != $EXPECTED_SIGNED_B$ ||
+		    ACCU_WIDTH != $EXPECTED_ACCU_WIDTH$) begin
+			$warning("%m: CRITICAL - dotp_comp parameter mismatch! SIMD=%0d (expected %0d), NA=%0d (expected %0d), NB=%0d (expected %0d), SIGNED_A=%0d (expected %0d), SIGNED_B=%0d (expected %0d), ACCU_WIDTH=%0d (expected %0d)",
+			         SIMD, $EXPECTED_SIMD$, NA, $EXPECTED_NA$, NB, $EXPECTED_NB$,
+			         SIGNED_A, $EXPECTED_SIGNED_A$, SIGNED_B, $EXPECTED_SIGNED_B$,
+			         ACCU_WIDTH, $EXPECTED_ACCU_WIDTH$);
+		end
+	end
+
+endmodule : dotp_comp
diff --git a/src/finn/compressor/hdl/dotp_template.sv b/src/finn/compressor/hdl/dotp_template.sv
new file mode 100644
index 0000000000..944fc8fc76
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_template.sv
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	RTL template for standalone dot product compressor
+ *****************************************************************************/
+
+module dotp_{n}x{sa}{na}{sb}{nb} #(
+	localparam int unsigned  N = {n},	// number of operand pairs
+	localparam int unsigned  NA = {na},	// bit width of each a operand
+	localparam int unsigned  NB = {nb},	// bit width of each b operand
+	localparam bit  SIGNED_A = {signed_a},
+	localparam bit  SIGNED_B = {signed_b},
+	localparam int unsigned  NP = NA > 1?	// bit width of the result p
+		$clog2(N) + (!SIGNED_B && (NB == 1)? NA : NA+NB) :
+		SIGNED_A ^^ SIGNED_B? 1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/
+)(
+	input	logic  clk,
+
+	input	logic [N-1:0][NA-1:0]  a,
+	input	logic [N-1:0][NB-1:0]  b,
+	output	logic [NP-1:0]  p
+);
+
+	// Input to Matrix Broadcasting; map0 is also queried for the matrix shape
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map0 (.ia(a[0]), .ib(b[0]));
+	localparam int unsigned  NM = $bits(map0.oa);
+	uwire [NM-1:0]  oa[N];
+	uwire [NM-1:0]  ob[N];
+	assign	oa[0] = map0.oa;
+	assign	ob[0] = map0.ob;
+	for(genvar  i = 1; i < N; i++) begin
+		mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map_i (.ia(a[i]), .ib(b[i]));
+		assign	oa[i] = map_i.oa;
+		assign	ob[i] = map_i.ob;
+	end
+
+	// Flatten all matrices column by column: per column, the rows of pair 0, then pair 1, ...
+	logic [N*NM-1:0]  comp_a;
+	logic [N*NM-1:0]  comp_b;
+	always_comb begin
+		automatic int unsigned  src_idx[N] = '{ default: 0 };
+		automatic int unsigned  dst_idx = 0;
+		for(int unsigned  col = 0; col < map0.columns(); col++) begin
+			for(int unsigned  i = 0; i < N; i++) begin
+				for(int unsigned  row = 0; row < map0.height(col); row++) begin
+					comp_a[dst_idx] = oa[i][src_idx[i]];
+					comp_b[dst_idx] = ob[i][src_idx[i]];
+					src_idx[i]++;
+					dst_idx++;
+				end
+			end
+		end
+	end
+
+	uwire signed [NP-1:0]  comp_p;
+	uwire signed [NP-1:0]  abs_p = {abs_term};	// constant term, filled in when the template is expanded
+	comp_{n}x{sa}{na}{sb}{nb} comp (
+		.clk,
+		.in(comp_b), .in_2(comp_a),
+		.out(comp_p)
+	);
+	assign	p = comp_p + abs_p;	// compressor sum plus the constant term
+
+endmodule : dotp_{n}x{sa}{na}{sb}{nb}
diff --git a/src/finn/compressor/hdl/mul_comp_map.sv b/src/finn/compressor/hdl/mul_comp_map.sv
new file mode 100644
index 0000000000..7049c34ea4
--- /dev/null
+++ b/src/finn/compressor/hdl/mul_comp_map.sv
@@ -0,0 +1,239 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Multiplier-to-compressor mapping module for gate absorption
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>, Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Broadcasts multiplication inputs to feed a bit product matrix for compression.
+ *
+ * @description
+ *	This interface component broadcasts multiplication inputs to produce a bit
+ *	product matrix like the one below. The output is flattened for the
+ *	ingestion by a compressor with the indicated indices:
+ *
+ *	                                   [6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                        [10]a3.b1  [7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	             [13]a3.b2  [11]a2.b2  [8]a1.b2  [5]a0.b2
+ *	  [15]a3.b3  [14]a2.b3  [12]a1.b3  [9]a0.b3
+ *
+ *	Functions designated to informing about the produced shape are provided:
+ *	  - columns()   - the number of columns in the matrix shape.
+ *	  - height(col) - the height of the specified column.
+ *	Additionally, the bit product operator is identified for each index by:
+ *	  - gate_op(idx) - the assumed bit product operator as hex LUT code.
+ *
+ *	In the case of unsigned operands, all bit products require to be computed
+ *	as AND gates (8), i.e. m[i] = oa[i] & ob[i].
+ *
+ * The operands can be specified to be signed, which will effect these changes
+ * to produce the correct functionality:
+ *
+ * SIGNED_A
+ * --------
+ *	The sign extensions of the multiples of input a are not materialized.
+ *	Instead, this identity with s := a_{NA-1} & b_i is applied:
+ *		s ... s  s
+ *		----------
+ *		        !s
+ *		        -1
+ *	In consequence:
+ *	  - The `gate_op()` for the left matrix boundary is identified as NAND (7).
+ *	  - The `absolute_term()` function returns a value of
+ *	       (-2^NB + 1) * 2^{NA-1}
+ *	    that must be added to the matrix sum for the correct product value.
+ *
+ * SIGNED_B
+ * --------
+ *	The sign extension of input b is not materialized.
+ *	Instead, the multiple of a by the sign bit of b is weighted negatively,
+ *	which expands the produced matrix as follows:
+ *
+ *	                                             [ 6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                                  [11]a3.b1  [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	                       [14]a3.b2  [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [17]0!b3! [16]a3!b3  [15]a2!b3  [13]a1!b3  [ 9]a0!b3
+ *	        -1                                   [10]   b3
+ *	-----------------------------------------------------------------------------------
+ *	                                  [10]a0!b3  [ 6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                                  [11]a3.b1  [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	                       [14]a3.b2  [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [17]0!b3! [16]a3!b3  [15]a2!b3  [13]a1!b3  [ 9]a0.b3
+ *	        -1
+ *
+ *	using:
+ *		- a.b  :=    a & b
+ *		- a!b  :=   !a & b
+ *		- a!b! := !(!a & b)
+ *
+ *	In consequence:
+ *	  - The bit sizes of the outputs are wider and the `columns()` count is larger.
+ *	  - The `gate_op()` at the shown indices is identified as 2 or D.
+ *	Note that the height of the matrix grows to NB+1 if NA > NB.
+ *
+ * SIGNED_A & SIGNED_B
+ * -------------------
+ *	Both approaches are combined for a purely signed multiplication:
+ *
+ *	                        [10]a0!b3  [ 6]a3.b0! [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                        [11]a3.b1! [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	             [14]a3.b2! [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [16]a3!b3! [15]a2!b3  [13]a1!b3  [ 9]a0.b3
+ *	         -1         -1         -1         -1
+ *
+ *	using:
+ *		- a.b  :=    a & b
+ *		- a!b  :=   !a & b
+ *		- a.b! := !( a & b)
+ *		- a!b! := !(!a & b)
+ *	In consequence:
+ *	  - The bit sizes of the outputs are wider.
+ *	  - The `gate_op()` at the shown indices is properly identified.
+ *	  - The `absolute_term()` function returns a value of
+ *	       (-2^NB + 1) * 2^{NA-1}
+ *	    that must be added to the matrix sum for the correct product value.
+ *	Note that the height of the matrix grows to NB+1 if NA > NB.
+ */
+
+interface mul_comp_map #(
+	int unsigned  NA,	// bit width of multiplicand
+	int unsigned  NB,	// bit width of multiplier
+	bit  SIGNED_A,		// signed multiplicand
+	bit  SIGNED_B,		// signed multiplier
+
+	// Extra bits due to sign handling and total output size
+	localparam int unsigned  NX = (NA == 1) || !SIGNED_B? 0 : SIGNED_A? 1 : 2,
+	localparam int unsigned  NM = NA*NB + NX
+)(
+	// Input Operands
+	input	logic [NA-1:0]  ia,  // Multiplicand
+	input	logic [NB-1:0]  ib   // Multiplier
+);
+	// Bit Matrix Broadcasts: the bit product at flat index i combines oa[i] and ob[i]
+	logic [NM-1:0]  oa;
+	logic [NM-1:0]  ob;
+
+
+	// Operand length support is not symmetrical: NA >= NB is required.
+	initial begin
+		if(NA < NB) begin
+			$error("%m: Switch multiplication operands.");
+			$finish;
+		end
+	end
+
+	function int unsigned columns();	// number of columns in the matrix shape
+		return  NA == 1? 1 : NB + NA - (!SIGNED_B || SIGNED_A);
+	endfunction : columns
+
+	function int unsigned height(input int unsigned  col);	// height of column col
+		if(NA == 1)  return  col < 1;
+		else begin
+			automatic int unsigned  ret =
+				(col <  NB)?      col + 1 :
+				(col <  NA)?      NB :
+				(col <  NB+NA-1)? NB+NA-1 - col :
+				(col == NB+NA-1)? SIGNED_B && !SIGNED_A :
+				/* else */        0;
+			if(SIGNED_B && (col == NB))  ret++;	// extra sign-handling element in column NB
+			return  ret;
+		end
+	endfunction : height
+
+	function bit signed [NA+NB-1:0] absolute_term();	// constant to add to the matrix sum
+		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? -1 : 0;
+		else begin
+			automatic bit signed [NA+NB-1:0]  ret = '{
+				NA+NB-1: SIGNED_A || SIGNED_B,
+				NA-1:    SIGNED_A,
+				default: 0
+			};
+			return  ret;
+		end
+	endfunction : absolute_term
+
+
+	// Beyond the tip of left triangle at column of height 1
+	localparam int unsigned  HIGH = NM - (SIGNED_B && !SIGNED_A);
+
+	function bit [3:0] gate_op(input int unsigned  idx);	// hex LUT code of the bit product at idx
+		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? 7 : 8;
+		else begin
+			automatic bit [3:0]  op = 8; // AND
+
+			if(SIGNED_B) begin
+				automatic bit  inv = 0;
+				// Negative weight for sign-bit row
+				for(int unsigned  col = 0; col < NB; col++) begin
+					if(idx == HIGH-1 - col*(col+1)/2)  inv = 1;
+				end
+				if(idx == HIGH)  inv = 1;
+				if(inv)  op = { op[1:0], op[3:2] };
+				if((idx == HIGH) && !SIGNED_A)  op = ~op;
+			end
+
+			if(SIGNED_A) begin
+				automatic bit  inv = 0;
+				// NAND along left matrix boundary
+				for(int unsigned  col = 0; col < NB; col++) begin
+					if(idx == HIGH - (col+1)*(col+2)/2 + (SIGNED_B && (col < NB-1)))  inv = 1;
+				end
+				if(inv)  op = ~op;
+			end
+
+			return  op;
+		end
+	endfunction : gate_op
+
+	//-----------------------------------------------------------------------
+	// Broadcast Wiring: exactly one of genTrivial / genMatrix is elaborated
+	if(NA == 1) begin : genTrivial
+		assign	oa[0] = ia[0];
+		assign	ob[0] = ib[0];
+	end : genTrivial
+	else begin : genMatrix	// must exclude NA == 1: it would drive oa[0]/ob[0] a second time
+
+		// Feed right triangle going right to left until first full-height column
+		for(genvar  col = 0; col < NB; col++) begin
+			localparam int unsigned  TOP = col*(col+1)/2;
+			for(genvar  row = 0; row <= col; row++) begin
+				assign	oa[TOP+row] = ia[col-row];
+				assign	ob[TOP+row] = ib[row];
+			end
+		end
+
+		// Feed central full-height rectangle for NA > NB
+		for(genvar  col = 0; col < NA-NB; col++) begin
+			localparam int unsigned  TOP = NB*(NB+1)/2 + col*NB + SIGNED_B;
+			for(genvar  row = 0; row < NB; row++) begin
+				assign  oa[TOP + row] = ia[NB+col - row];
+				assign  ob[TOP + row] = ib[row];
+
+			end
+		end
+
+		// Feed left triangle going left to right up to last column with a receded height
+		for(genvar  col = 0; col < NB-1; col++) begin
+			localparam int unsigned  BOT = HIGH - col*(col+1)/2 - 1;
+			for(genvar  row = 0; row <= col; row++) begin
+				assign	oa[BOT-row] = ia[NA-1-col+row];
+				assign	ob[BOT-row] = ib[NB-1-row];
+			end
+		end
+
+		// Feed extra elements created for sign handling
+		if(SIGNED_B) begin
+			assign	oa[NB*(NB+1)/2] = ia[0];
+			assign	ob[NB*(NB+1)/2] = ib[NB-1];
+			if(!SIGNED_A) begin
+				assign	oa[HIGH] = 0;
+				assign	ob[HIGH] = ib[NB-1];
+			end
+		end
+
+	end : genMatrix
+
+endinterface : mul_comp_map
diff --git a/src/finn/compressor/src/__init__.py b/src/finn/compressor/src/__init__.py
new file mode 100644
index 0000000000..65cad800cf
--- /dev/null
+++ b/src/finn/compressor/src/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor source package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/add_multi_finn.py b/src/finn/compressor/src/add_multi_finn.py
new file mode 100644
index 0000000000..3932584db0
--- /dev/null
+++ b/src/finn/compressor/src/add_multi_finn.py
@@ -0,0 +1,408 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN wrapper for add_multi compressor generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""
+Generate a compressor core for FINN's add_multi module (COMP path).
+
+The add_multi module in mvu.sv reduces N unsigned partial sums of ARG_WIDTH
+bits into a single result (N dsp lanes outputs).  This script generates a LUT-mapped compressor tree
+for a specific (N, ARG_WIDTH) configuration, producing a module that can be
+matched by the CATCH_COMP macro in add_multi.sv.
+
+Unlike dotp_finn.py, no absorption is needed:
+  - No gates:       inputs are complete values, not partial-product factor pairs
+  - No constants:   no Baugh-Wooley sign-correction (inputs are unsigned)
+  - No accumulation: accumulation stays downstream in mvu.sv
+
+Two call modes:
+
+  Direct mode — caller supplies N and ARG_WIDTH explicitly:
+    python add_multi_finn.py --n 32 --arg_width 6 -t Versal -o gen/
+
+  MVU mode — caller supplies MVU-level parameters, and the script computes
+  the required lo_width values per DSP lane via a Python replica of
+  mvu.sv::sliceLanes(), then generates one compressor per unique (N, lo_width):
+    python add_multi_finn.py --mvu --n 8 --version 2 --ww 2 --aw 2 \
+        --accu_width 16 --narrow_weights 0 -t Versal -o gen/
+
+
+Outputs:
+  comp_<N>u<W>_d<delay>.sv  — the generated compressor core(s)
+"""
+
+import os
+import math
+import argparse
+import shutil
+
+from .main import generate_compressor
+from .target import resolve_target, resolve_target_name, Versal, SevenSeries
+from .utils.shape import Shape
+
+
+# ---------------------------------------------------------------------------
+# Python replica of mvu.sv::sliceLanes()
+#
+# This must mirror the SV implementation exactly. Any change to sliceLanes()
+# in mvu.sv requires updating this function as well. The $warning guard in
+# add_multi.sv catches divergence at simulation time.
+#
+# This outsourced computation is required as lane width is relevant to the
+# compressor input Shape and thus needs to be known at generation time.
+
+def clog2(n):
+    """Return ceil(log2(n)) like SystemVerilog $clog2 (0 for n <= 1)."""
+    # Exact integer arithmetic: math.log2() rounds for wide n and could
+    # under-report by one bit; (n - 1).bit_length() stays exact for any int.
+    return (n - 1).bit_length() if n > 1 else 0
+
+
+def slice_lanes(version, ww, aw, accu_width, narrow_weights):
+    """
+    Compute DSP lane offsets — Python replica of mvu.sv::sliceLanes().
+    Parameters
+    ----------
+    version : int
+        DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58).
+    ww : int
+        WEIGHT_WIDTH.
+    aw : int
+        ACTIVATION_WIDTH.
+    accu_width : int
+        ACCU_WIDTH.
+    narrow_weights : bool
+        NARROW_WEIGHTS flag.
+
+    Returns
+    -------
+    (num_lanes, offsets) : tuple
+        num_lanes : int 
+            number of DSP lanes.
+        offsets   : list[int] 
+            lane boundary positions (length num_lanes+1).
+    """
+    a_width = 25 + 2 * (version > 1)
+    p_width = 58 if version == 3 else 48
+    min_lane_width = ww + aw - 1
+
+    if a_width == ww:
+        num_lanes = 1
+    else:
+        num_lanes = 1 + (a_width - (0 if narrow_weights else 1) - ww) // min_lane_width
+
+    # Distribute slack bits preferring right lanes
+    bit_slack = a_width - (0 if narrow_weights else 1) - ww - (num_lanes - 1) * min_lane_width
+
+    offsets = [0] * (num_lanes + 1)
+    for i in range(1, num_lanes):
+        extra = (bit_slack + (num_lanes - 1 - i)) // (num_lanes - i)
+        offsets[i] = offsets[i - 1] + min_lane_width + extra
+        bit_slack -= extra
+
+    # Last lane bounded by min(ACCU_WIDTH, P_WIDTH)
+    offsets[num_lanes] = offsets[num_lanes - 1] + accu_width
+    if offsets[num_lanes] > p_width:
+        offsets[num_lanes] = p_width
+
+    return num_lanes, offsets
+
+
+def lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights):
+    """
+    Derive the per-lane lo_width values from the slice_lanes() boundaries.
+
+    Returns
+    -------
+    list[int]
+        Distance between consecutive lane offsets, lane 0 first.
+    """
+    _, bounds = slice_lanes(version, ww, aw, accu_width, narrow_weights)
+    return [upper - lower for lower, upper in zip(bounds, bounds[1:])]
+
+
+def comp_module_name(n, arg_width, delay):
+    """
+    Build the compressor module name, for example 'comp_32u6_d4'.
+
+    Fields encoded in the name:
+      n         — count of unsigned addends (= SIMD)
+      arg_width — bits per addend (= lo_width from mvu.sv lane slicing)
+      delay     — pipeline stages produced by the generator
+
+    'u' marks the addends as unsigned, following the mvu_bench naming
+    convention.  The trailing delay lets the CATCH_COMP macro in
+    add_multi.sv match on minimum pipeline depth (DEPTH >= d).
+    """
+    return "comp_" + str(n) + "u" + str(arg_width) + "_d" + str(delay)
+
+
+def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir,
+                            name=None):
+    """
+    Generate a multi-input adder compressor (no accumulation).
+
+    Parameters
+    ----------
+    target : Target
+        FPGA target (Versal, SevenSeries) — selects LUT primitives.
+    n : int
+        Number of unsigned addends.
+    arg_width : int
+        Bit width of each addend.
+    pipeline_every : int or None
+        Insert pipeline registers every N combinational stages.
+        None means purely combinational.
+    output_dir : str
+        Directory for the generated .sv file.
+    name : str or None
+        Module name override.  When None (default), the name is derived
+        from (n, arg_width, delay) after generation.
+
+    Returns
+    -------
+    (name, path, delay) : tuple
+        Module name, file path, and pipeline depth of the generated compressor.
+    """
+    # Shape: W columns each of height N.
+    # Each of the N operands contributes 1 bit to each of the W bit-positions,
+    # so every column has the same height N.
+    shape = Shape([n] * arg_width)
+
+    # First pass: generate with a temporary name to discover the actual delay.
+    # The delay depends on the compressor structure and pipeline_every, so we
+    # can't know it before generation.
+    tmp_name = name if name is not None else f"comp_{n}u{arg_width}"
+    tmp_path = os.path.join(output_dir, tmp_name + ".sv")
+
+    delay = generate_compressor(
+        target=target,
+        shape=shape,
+        name=tmp_name,
+        comb_depth=pipeline_every,
+        accumulate=False,          # Pure adder, no fused accumulation
+        accumulator_width=None,    # Not applicable without accumulation
+        gates=[],                  # No gate absorption, inputs are complete values
+        constants=[],              # No Baugh-Wooley correction, unsigned inputs
+        path=tmp_path,
+        test=False,
+        enable=False,              # No accumulator registers to initialize
+    )
+
+    # Derive final name with delay suffix
+    if name is not None:
+        final_name = name
+        final_path = tmp_path
+    else:
+        final_name = comp_module_name(n, arg_width, delay)
+        final_path = os.path.join(output_dir, final_name + ".sv")
+
+        if final_name != tmp_name:
+            # Rename file and replace module name inside it
+            with open(tmp_path, "r") as f:
+                content = f.read()
+            content = content.replace(tmp_name, final_name)
+            with open(final_path, "w") as f:
+                f.write(content)
+            os.remove(tmp_path)
+
+    return final_name, final_path, delay
+
+
+def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
+                             narrow_weights, output_dir):
+    """
+    Generate add_multi compressor cores and patch add_multi.sv.
+    This is the high-level entry point called by FINN's generate_hdl().
+
+    ALWAYS generates compressors and patches add_multi.sv with CATCH_COMP entries.
+
+    Parameters
+    ----------
+    fpgapart : str
+        FPGA part string.
+    version : int
+        DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58).
+    simd, ww, aw, accu_width : int
+        MVU parameters.
+    narrow_weights : int
+        NARROW_WEIGHTS flag (0 or 1).
+    output_dir : str
+        Directory for generated files (= code_gen_dir).
+
+    Returns
+    -------
+    dict with keys:
+        comp_names : list[str] — generated module names
+        comp_specs : list[tuple] — (N, ARG_WIDTH, DELAY) per generated module
+        files      : list[str] — the patched add_multi.sv, then the module files
+    """
+    rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+    patched_path = os.path.join(output_dir, "add_multi.sv")
+
+    # Always generate compressors and patch add_multi.sv
+    target = resolve_target(fpgapart)
+
+    # This is currently a parallel implementation of the lo_width computation in mvu.sv's sliceLanes() function.
+    # The resulting lo_width values determine the compressor input Shapes, so we need to compute them here in Python at generation time.
+    # Must be kept in SYNC.
+    widths = lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights)
+
+    # Generate one compressor per unique (SIMD, lo_width)
+    generated = {}  # (simd, width) -> (name, delay)
+    for w in widths:
+        key = (simd, w)
+        if key not in generated:
+            name, _path, delay = generate_add_multi_comp(
+                target, simd, w,
+                pipeline_every=1,  # Max pipelining (match dotp_comp behavior)
+                output_dir=output_dir)
+            generated[key] = (name, delay)
+
+    # Copy add_multi.sv to output_dir and inject CATCH_COMP lines
+    with open(os.path.join(rtllib_dir, "add_multi.sv"), "r") as f:
+        add_multi_src = f.read()
+
+    catch_lines = ""
+    comp_specs = []
+    for (_n, _w), (name, delay) in generated.items():
+        catch_lines += "\t`CATCH_COMP(%d,%d,%d)\n" % (_n, _w, delay)
+        comp_specs.append((_n, _w, delay))
+
+    marker = "\t// FINN_GENERATED_COMP_ENTRIES\n"
+    if marker not in add_multi_src:
+        raise RuntimeError(
+            "Cannot find FINN_GENERATED_COMP_ENTRIES marker in add_multi.sv. "
+            "Has the file been modified?")
+    add_multi_src = add_multi_src.replace(marker, catch_lines + marker)
+
+    with open(patched_path, "w") as f:
+        f.write(add_multi_src)
+
+    comp_files = [os.path.join(output_dir, name + ".sv")
+                  for (name, _delay) in generated.values()]
+
+    return {
+        "comp_names": [name for (name, _delay) in generated.values()],
+        "comp_specs": comp_specs,  # [(N, ARG_WIDTH, DELAY), ...]
+        "files": [patched_path] + comp_files,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="add_multi_finn",
+        description="Generate a compressor core for FINN's add_multi module."
+    )
+    parser.add_argument('--n', type=int, required=True,
+                        help="Number of unsigned addends (= SIMD)")
+    parser.add_argument('-t', '--target', default="Versal",
+                        choices=["Versal", "7-Series", "UltraScale"],
+                        help="Target FPGA generation")
+    parser.add_argument('-p', '--pipeline_every', type=int, default=None,
+                        help="Pipeline registers every N combinational stages")
+    parser.add_argument('-o', '--output_dir', default="../gen",
+                        help="Output directory for generated files")
+    parser.add_argument('--name', default=None,
+                        help="Module name override (default: comp_<N>u<W>_d<delay>)")
+
+    # Direct mode: explicit arg_width
+    parser.add_argument('--arg_width', type=int, default=None,
+                        help="Bit width per addend (direct mode)")
+
+    # MVU mode: derive arg_width(s) from MVU parameters
+    mvu_group = parser.add_argument_group(
+        'MVU parameters',
+        'When --mvu is given, lo_width values are computed from these '
+        'MVU-level parameters (replicating mvu.sv::sliceLanes).'
+    )
+    mvu_group.add_argument('--mvu', action='store_true',
+                           help="Enable MVU mode: derive arg_width from MVU params")
+    mvu_group.add_argument('--version', type=int, default=2,
+                           choices=[1, 2, 3],
+                           help="DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58)")
+    mvu_group.add_argument('--ww', type=int, default=None,
+                           help="WEIGHT_WIDTH")
+    mvu_group.add_argument('--aw', type=int, default=None,
+                           help="ACTIVATION_WIDTH")
+    mvu_group.add_argument('--accu_width', type=int, default=None,
+                           help="ACCU_WIDTH")
+    mvu_group.add_argument('--narrow_weights', type=int, default=0,
+                           choices=[0, 1],
+                           help="NARROW_WEIGHTS flag (0 or 1)")
+
+    args = parser.parse_args()
+
+    # Validate argument combinations
+    if not args.mvu and args.arg_width is None:
+        parser.error("Either --arg_width (direct mode) or --mvu with MVU "
+                     "parameters is required.")
+    if args.mvu and args.arg_width is not None:
+        parser.error("--arg_width and --mvu are mutually exclusive.")
+    if args.mvu:
+        for param in ('ww', 'aw', 'accu_width'):
+            if getattr(args, param) is None:
+                parser.error(f"--mvu requires --{param}")
+
+    target = resolve_target_name(args.target)
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if args.mvu:
+        # MVU mode: compute lo_width per lane, generate unique compressors
+        simd = args.n
+
+        # For SIMD < 4, the binary adder tree is already optimal.
+        # A compressor adds structural overhead with no benefit.
+        if simd < 4:
+            print(f"SIMD={simd} < 4: binary tree is optimal, no compressors generated.")
+            return
+
+        widths = lo_widths_from_mvu_params(
+            args.version, args.ww, args.aw,
+            args.accu_width, bool(args.narrow_weights)
+        )
+        depth = 3 + clog2(simd) + (1 if simd == 1 else 0) + 1
+        add_multi_depth = depth - 4
+
+        print(f"MVU config: VERSION={args.version} WW={args.ww} AW={args.aw} "
+              f"ACCU_WIDTH={args.accu_width} NARROW_WEIGHTS={args.narrow_weights}")
+        print(f"  NUM_LANES={len(widths)}  PIPELINE_DEPTH={depth}  "
+              f"ADD_MULTI_DEPTH={add_multi_depth}")
+        print(f"  LO_WIDTHs: {widths}")
+
+        # Generate one compressor per unique (N, lo_width)
+        seen = set()
+        for lane, w in enumerate(widths):
+            if (simd, w) in seen:
+                print(f"  Lane {lane}: lo_width={w} — reuses existing module")
+                continue
+            seen.add((simd, w))
+
+            comp_name, comp_path, comp_delay = generate_add_multi_comp(
+                target, simd, w,
+                args.pipeline_every, args.output_dir, name=args.name
+            )
+            print(f"  Lane {lane}: lo_width={w}")
+            print(f"    Generated: {comp_path}")
+            print(f"    Module:    {comp_name}")
+            print(f"    Delay:     {comp_delay}")
+
+    else:
+        # Direct mode: single compressor for explicit arg_width
+        comp_name, comp_path, comp_delay = generate_add_multi_comp(
+            target, args.n, args.arg_width,
+            args.pipeline_every, args.output_dir, name=args.name)
+
+        print(f"Generated compressor core: {comp_path}")
+        print(f"  Module name:     {comp_name}")
+        print(f"  Configuration:   {args.n} unsigned addends x {args.arg_width} bits")
+        print(f"  Pipeline depth:  {comp_delay}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/finn/compressor/src/benchmark.py b/src/finn/compressor/src/benchmark.py
new file mode 100644
index 0000000000..b4f7a5969b
--- /dev/null
+++ b/src/finn/compressor/src/benchmark.py
@@ -0,0 +1,61 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Benchmarking harness for compressor generation
+#############################################################################
+
+from .passes.compressor_constructor import CompressorConstructor
+from .target import Versal
+from .passes.cost_estimator import CostEstimator
+from .utils.shape import Shape
+from functools import reduce
+
+def gmean(numbers):
+    return reduce(lambda x, y: x*y, numbers)**(1.0/len(numbers))
+
+def benchmark():
+    """Build each reference shape for Versal and print LUT count, stages and efficiency."""
+    examples = {
+        "128": Shape([128]),
+        "256": Shape([256]),
+        "512": Shape([512]),
+        "128,128": Shape([128,128]),
+        "256,256": Shape([256,256]),
+        "512,512": Shape([512,512]),
+        "Int1": Shape([1,1,2,3,4,5,6,7,5,4,3,2,1]),
+        "Int2": Shape([1,1,1,3,5,7,9,11,13,10,8,6,4,2,1]),
+        "Int3": Shape([1,1,1,1,5,9,13,17,21,25,20,16,12,8,4]),
+        "Int4": Shape([1,1,1,1,1,9,17,25,33,41,49,40,32,24,16,8]),
+        "Int5": Shape([1,1,1,1,1,1,17,33,49,65,81,97,80,64,48,32,16]),
+        "LPFP1": Shape([1,1,1,1,1,1,1,1,1,1,1,1,1,1,2]),
+        "LPFP2": Shape([2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4]),
+        "LPFP3": Shape([4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,8]),
+        "LPFP4": Shape([8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16]),
+        "LPFP5": Shape([16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,32]),
+        "6-Input": Shape(32*[6]),
+        "10-Input": Shape(32*[10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
+    }
+
+    luts = []
+    for example_name, example_shape in examples.items():
+        target = Versal()
+        constructor = CompressorConstructor()
+        comp = constructor(target.counter_candidates,
+                           target.absorbing_counter_candidates,
+                           target.final_adder, example_shape,
+                           "comp", 1, True, None, tuple(), [])
+        cost = CostEstimator()
+        comp.accept(cost)
+        eff = (sum(comp.input_shape) - sum(comp.output_shape)) / cost.luts
+        luts.append(cost.luts)
+        print(f"Example {example_name:<10} uses {cost.luts:<6} LUTs "
+              f"for {cost.combinatorial_stages} stages (Efficiency: {eff: 1.2f})")
+
+    luts_gmean = gmean(luts)
+    print(f"Geomean {luts_gmean:.6} LUTs")
+
+if __name__=="__main__":
+    benchmark()
\ No newline at end of file
diff --git a/src/finn/compressor/src/dotp.py b/src/finn/compressor/src/dotp.py
new file mode 100644
index 0000000000..b3b96b826b
--- /dev/null
+++ b/src/finn/compressor/src/dotp.py
@@ -0,0 +1,97 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Dot product compressor core generation for standalone testing
+#############################################################################
+
+import sys, re, os
+from .main import generate_compressor
+from .target import Target, Versal, SevenSeries
+from .utils.shape import Shape
+from .utils.mul_comp_map import MulCompMap
+from typing import Optional, List
+
+
+if __name__ == "__main__":
+
+	# Parse and extract Parameters from Command Line
+	sig = sys.argv[1]
+	_ = re.fullmatch("(\\d+)x([us])(\\d+)([us])(\\d+)", sig).groups()
+	(n, na, nb, sa, sb) = (int(_[0]), int(_[2]), int(_[4]), _[1] == 's', _[3] == 's')
+	assert nb <= na
+
+	# Target platform: ca/accu goes in argv[2], target in argv[3] (default versal)
+	target_arg = sys.argv[3] if len(sys.argv) > 3 else "versal"
+	if target_arg == "7series":
+		target = SevenSeries()
+		fpga_part = "xc7z020clg400-1"
+	else:  # versal (default)
+		target = Versal()
+		fpga_part = "xcvc1902-vsva2197-2MP-e-S"
+
+	clog2 = lambda x: (x-1).bit_length()
+	np = clog2(n) + (na if nb == 1 and not sb else na+nb) if na > 1 else (
+			clog2(n+1) if sa == sb else 1 + clog2(n)
+		)
+
+	map = MulCompMap(na, nb, sa, sb)
+	shape = [col * n for col in map.shape()]
+	print("Shape: ", ' '.join((':'.join((f"{val:x}" for val in col)) for col in shape[::-1])))
+
+	# Absolute Term Contribution
+	constants = []
+	abs_term  = n * map.absolute_term()
+	# Move absolute term into absorbed constant if requested
+	if len(sys.argv) > 2 and sys.argv[2] == 'ca':
+		print("Constant absorption.")
+		if abs_term < 0:
+			abs_term += 2**np
+		constants = [(abs_term >> i) & 1 for i in range(np)]
+		abs_term  = 0
+
+	name = "comp_" + sig
+	# Write to gen/ relative to this script's parent directory (compressor/)
+	script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+	output_path = os.path.join(script_dir, "gen", name + ".sv")
+	generate_compressor(
+		target            = target,
+		shape             = Shape((len(col) for col in shape)),
+		name              = name,
+		comb_depth        = None,
+		accumulate        = False,
+		accumulator_width = None,
+		gates = [[f"{val:x}" for val in col] for col in shape],
+		constants = constants,
+		path = output_path,
+		test = False
+	)
+
+	# Process templates with absolute paths
+	gen_dir = os.path.join(script_dir, "gen")
+	hdl_dir = os.path.join(script_dir, "hdl")
+	for (src_rel, dst_rel) in (
+		("dotp_template.sv", "dotp_"+sig+".sv"),
+		("dotp_tb_template.sv", "dotp_"+sig+"_tb.sv"),
+		("dotp_template.tcl", "dotp_"+sig+".tcl")
+	):
+		src = os.path.join(hdl_dir, src_rel)
+		dst = os.path.join(gen_dir, dst_rel)
+		with open(src, "rt") as fsrc:
+			with open(dst, "wt") as fdst:
+				for l in fsrc:
+					fdst.write(l
+						.replace("{n}", str(n))
+						.replace("{na}", str(na))
+						.replace("{nb}", str(nb))
+						.replace("{sa}", 's' if sa else 'u')
+						.replace("{sb}", 's' if sb else 'u')
+						.replace("{signed_a}", str(int(sa)))
+						.replace("{signed_b}", str(int(sb)))
+						.replace("{abs_term}", str(abs_term))
+						.replace("{part}", fpga_part)
+						# Replace relative paths with absolute paths for TCL
+						.replace("hdl/", hdl_dir + "/")
+						.replace("gen/", gen_dir + "/")
+					)
diff --git a/src/finn/compressor/src/dotp_finn.py b/src/finn/compressor/src/dotp_finn.py
new file mode 100644
index 0000000000..db24e38a67
--- /dev/null
+++ b/src/finn/compressor/src/dotp_finn.py
@@ -0,0 +1,264 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN wrapper for dot product compressor generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""
+Generate a compressor core for FINN's dotp_comp module.
+
+The static dotp_comp template (in finn-rtllib/mvu/) instantiates a generated
+compressor core whose module name encodes the configuration signature, e.g.
+`comp_8xs2s2_a16`.  This script generates that core: a LUT-mapped reduction tree
+with fused accumulation, specific to a (SIMD, WW, AW, signedness) configuration.
+
+Usage:
+  python dotp_finn.py --simd 8 --ww 2 --aw 2 --accu_width 16 \
+                      --signed_activations --target Versal -o gen/
+
+Outputs:
+  comp_<sig>.sv  — the generated compressor core (module `comp_<sig>`)
+"""
+
+import os
+import re
+import argparse
+from .main import generate_compressor
+from .utils.mul_comp_map import MulCompMap
+from .target import resolve_target, resolve_target_name
+from .utils.shape import Shape
+
+
+def expand_template(template_path, output_path, substitutions):
+    """Expand a text template by replacing $PLACEHOLDER$ tokens.
+
+    Raises FileNotFoundError if paths invalid, ValueError if placeholders remain.
+    """
+    if not os.path.isfile(template_path):
+        raise FileNotFoundError(f"Template not found: {template_path}")
+
+    output_dir = os.path.dirname(output_path)
+    if output_dir and not os.path.isdir(output_dir):
+        raise FileNotFoundError(f"Output directory does not exist: {output_dir}")
+
+    with open(template_path, "r") as f:
+        text = f.read()
+    for key, value in substitutions.items():
+        text = text.replace(key, value)
+    remaining = re.findall(r'\$[A-Z_]+\$', text)
+    if remaining:
+        raise ValueError(
+            f"Unsubstituted placeholders in {output_path}: {remaining}")
+    with open(output_path, "w") as f:
+        f.write(text)
+
+
+def compute_params(simd, weight_width, activation_width, signed_activations):
+    """Translate FINN MVU parameters into compressor parameters with NA >= NB.
+
+    Returns (n, na, nb, sa, sb, swapped); swapped tells whether the
+    activation operand took the A slot because it is the wider one.
+    """
+    # FINN weights are always signed; activation signedness is configurable.
+    weight_operand = (weight_width, True)
+    activation_operand = (activation_width, signed_activations)
+
+    # mul_comp_map needs the wider operand first; ties keep the weights in A.
+    swapped = not weight_width >= activation_width
+    if swapped:
+        (na, sa), (nb, sb) = activation_operand, weight_operand
+    else:
+        (na, sa), (nb, sb) = weight_operand, activation_operand
+
+    return simd, na, nb, sa, sb, swapped
+
+
+def make_signature(n, sa, na, sb, nb):
+    """Compose the compressor signature '<n>x<s|u><na><s|u><nb>', e.g. '8xs2u2'."""
+    return "{}x{}{}{}{}".format(n, "s" if sa else "u", na, "s" if sb else "u", nb)
+
+
+def comp_module_name(n, sa, na, sb, nb, accu_width):
+    """Return the config-specific compressor module name, e.g. 'comp_8xs2s2_a16'."""
+    return "comp_" + make_signature(n, sa, na, sb, nb) + f"_a{accu_width}"
+
+
+
+def generate_comp_module(target, n, na, nb, sa, sb, accu_width,
+                         pipeline_every, output_dir, name=None):
+    """Generate the compressor core with fused accumulation.
+
+    When *name* is None (the default), the module is named after its
+    configuration signature, e.g. ``comp_8xs2s2_a16``.  This keeps module
+    names unique across different compressor configurations in the same
+    Vivado project.
+    """
+    if name is None:
+        name = comp_module_name(n, sa, na, sb, nb, accu_width)
+    m = MulCompMap(na, nb, sa, sb)
+    shape_cols = [col * n for col in m.shape()]
+    shape = Shape((len(col) for col in shape_cols))
+    gates = [[f"{val:x}" for val in col] for col in shape_cols]
+
+    # Absorb abs_term as a constant input to the compressor tree.
+    # This ensures the correction is applied every accumulation cycle,
+    # not just once at the output.
+    abs_term = n * m.absolute_term()
+    if abs_term != 0:
+        abs_val = abs_term % (1 << accu_width)  # two's complement
+        constants = [(abs_val >> i) & 1 for i in range(accu_width)]
+    else:
+        constants = []
+
+    comp_path = os.path.join(output_dir, name + ".sv")
+    delay = generate_compressor(
+        target=target,
+        shape=shape,
+        name=name,
+        comb_depth=pipeline_every,
+        accumulate=True,
+        accumulator_width=accu_width,
+        gates=gates,
+        constants=constants,
+        path=comp_path,
+        test=False,
+        enable=True,
+    )
+    return name, comp_path, delay
+
+
+def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_dir):
+    """
+    Generate the dotp_comp path: compressor core + expanded template.
+
+    This is the high-level entry point called by FINN's generate_hdl().
+
+    Parameters
+    ----------
+    fpgapart : str
+        FPGA part string (e.g. "xcvc1902-...").
+    simd, ww, aw, accu_width : int
+        MVU parameters.
+    signed_act : bool
+        Whether activations are signed.
+    output_dir : str
+        Directory for generated files (= code_gen_dir).
+
+    Returns
+    -------
+    dict with keys:
+        comp_name  : str   — module name (e.g. "comp_8xs2s2_a16")
+        comp_delay : int   — pipeline depth
+        files      : list  — paths of all generated files
+    """
+
+    target = resolve_target(fpgapart)
+    n, na, nb, sa, sb, _ = compute_params(simd, ww, aw, signed_act)
+
+    comp_name, comp_path, comp_delay = generate_comp_module(
+        target, n, na, nb, sa, sb, accu_width,
+        pipeline_every=1,  # Max pipelining
+        output_dir=output_dir)
+
+    # Expand dotp_comp template with the generated module name
+    src_dir = os.path.dirname(os.path.abspath(__file__))
+    compressor_root = os.path.abspath(os.path.join(src_dir, ".."))
+    dotp_comp_template = os.path.join(compressor_root, "hdl", "dotp_comp_template.sv")
+    dotp_comp_path = os.path.join(output_dir, "dotp_comp.sv")
+    expand_template(dotp_comp_template, dotp_comp_path, {
+        "$COMP_MODULE_NAME$": comp_name,
+        "$EXPECTED_SIMD$": str(simd),
+        "$EXPECTED_NA$": str(na),
+        "$EXPECTED_NB$": str(nb),
+        "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
+        "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
+        "$EXPECTED_ACCU_WIDTH$": str(accu_width),
+    })
+
+    return {
+        "comp_name": comp_name,
+        "comp_delay": comp_delay,
+        "files": [dotp_comp_path, comp_path],
+    }
+
+
+def main():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.abspath(os.path.join(script_dir, ".."))
+    default_dotp_template = os.path.join(repo_root, "hdl", "dotp_comp_template.sv")
+
+    parser = argparse.ArgumentParser(
+        prog="dotp_finn",
+        description="Generate a compressor core for FINN's dotp_comp module."
+    )
+    parser.add_argument('--simd', type=int, required=True, help="SIMD (operand pairs per cycle)")
+    parser.add_argument('--ww', type=int, required=True, help="Weight bit width")
+    parser.add_argument('--aw', type=int, required=True, help="Activation bit width")
+    parser.add_argument('--accu_width', type=int, required=True, help="Accumulator bit width")
+    parser.add_argument('--signed_activations', action='store_true',
+                        help="Activations are signed")
+    parser.add_argument('-t', '--target', default="Versal",
+                        choices=["Versal", "7-Series", "UltraScale"],
+                        help="Target FPGA generation")
+    parser.add_argument('-p', '--pipeline_every', type=int, default=None,
+                        help="Pipeline registers every N combinational stages")
+    parser.add_argument('-o', '--output_dir', default="../gen",
+                        help="Output directory for generated files")
+    parser.add_argument('-n', '--name', default=None,
+                        help="Module name override (default: comp_<sig>)")
+    parser.add_argument('--dotp-template', default=default_dotp_template,
+                        help="Path to dotp_comp template file to expand")
+    parser.add_argument('--dotp-output-name', default="dotp_comp.sv",
+                        help="Output file name for expanded dotp_comp template")
+    parser.add_argument('--skip-dotp-template', action='store_true',
+                        help="Skip expanding dotp_comp template")
+    args = parser.parse_args()
+    target = resolve_target_name(args.target)
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Compute compressor parameters
+    n, na, nb, sa, sb, swapped = compute_params(
+        args.simd, args.ww, args.aw, args.signed_activations)
+
+    # Generate the compressor core with fused accumulation
+    comp_name, comp_path, comp_delay = generate_comp_module(
+        target, n, na, nb, sa, sb, args.accu_width,
+        args.pipeline_every, args.output_dir, name=args.name)
+
+    dotp_path = None
+    if not args.skip_dotp_template:
+        template_path = os.path.abspath(args.dotp_template)
+        if not os.path.isfile(template_path):
+            raise FileNotFoundError(
+                f"dotp template not found: {template_path}. Use --dotp-template or --skip-dotp-template."
+            )
+        dotp_path = os.path.join(args.output_dir, args.dotp_output_name)
+        expand_template(
+            template_path,
+            dotp_path,
+            {
+                "$COMP_MODULE_NAME$": comp_name,
+                "$EXPECTED_SIMD$": str(args.simd),
+                "$EXPECTED_NA$": str(na),
+                "$EXPECTED_NB$": str(nb),
+                "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
+                "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
+                "$EXPECTED_ACCU_WIDTH$": str(args.accu_width),
+            },
+        )
+
+    sig = make_signature(n, sa, na, sb, nb)
+    print(f"Generated compressor core: {comp_path}")
+    if dotp_path is not None:
+        print(f"Expanded dotp template: {dotp_path}")
+    print(f"  Module name:     {comp_name}")
+    print(f"  Configuration:   {sig}")
+    print(f"  Pipeline depth:  {comp_delay}")
+    print(f"  Operands:        {'swapped' if swapped else 'not swapped'} (NA={na} >= NB={nb})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/finn/compressor/src/evaluation.py b/src/finn/compressor/src/evaluation.py
new file mode 100644
index 0000000000..99ed49a33d
--- /dev/null
+++ b/src/finn/compressor/src/evaluation.py
@@ -0,0 +1,253 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Evaluation and benchmarking utilities for compressor
+#############################################################################
+
+from .target import Versal
+from .utils.shape import Shape
+from .main import generate_compressor
+from .tests.test_gen import compressed_width
+from concurrent.futures import ThreadPoolExecutor
+import subprocess
+
+def evaluation():
+    """Generate every benchmark compressor variant and run Vivado on each.
+
+    For each example shape three designs are written to ``../gen/``: a
+    combinatorial one (``_comb``), an accumulating one (``_acc``) and an
+    accumulating one where every input bit gets the gate spec ``"8"``
+    (``_gate``).  Each file gets a ``sandwich`` wrapper appended and a Tcl
+    script emitted; the scripts are then run as parallel Vivado batch jobs.
+
+    All paths are relative, so the current directory must have a sibling
+    ``gen`` directory.
+
+    Raises:
+        subprocess.CalledProcessError: if a Vivado job exits non-zero.
+        subprocess.TimeoutExpired: if a Vivado job runs longer than an hour.
+    """
+    examples = {
+        "128": Shape([128]),
+        "256": Shape([256]),
+        "512": Shape([512]),
+        "128,128": Shape([128,128]),
+        "256,256": Shape([256,256]),
+        "512,512": Shape([512,512]),
+        "Int1": Shape([1,1,2,3,4,5,6,7,5,4,3,2,1]),
+        "Int2": Shape([1,1,1,3,5,7,9,11,13,10,8,6,4,2,1]),
+        "Int3": Shape([1,1,1,1,5,9,13,17,21,25,20,16,12,8,4]),
+        "Int4": Shape([1,1,1,1,1,9,17,25,33,41,49,40,32,24,16,8]),
+        "Int5": Shape([1,1,1,1,1,1,17,33,49,65,81,97,80,64,48,32,16]),
+        "LPFP1": Shape([1,1,1,1,1,1,1,1,1,1,1,1,1,1,2]),
+        "LPFP2": Shape([2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4]),
+        "LPFP3": Shape([4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,8]),
+        "LPFP4": Shape([8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16]),
+        "LPFP5": Shape([16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,32]),
+        "6x32": Shape(32*[6]),
+        "10x32": Shape(32*[10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
+    }
+
+    # (file suffix, comb_depth, accumulate and pipelined wrapper, absorb gates)
+    variants = [
+        ("comb", None, False, False),
+        ("acc", 1, True, False),
+        ("gate", 1, True, True),
+    ]
+
+    filenames = []
+    for example_name, example_shape in examples.items():
+        print(example_name, example_shape)
+        for suffix, comb_depth, accumulate, absorb in variants:
+            filename = "../gen/"+example_name+"_"+suffix+".sv"
+            # One "8" gate spec per input bit of every column, or no gates.
+            gates = None
+            if absorb:
+                gates = [["8" for _ in range(col)] for col in example_shape]
+            generate_compressor(
+                target=Versal(),
+                shape=example_shape,
+                name="comp",
+                comb_depth=comb_depth,
+                accumulate=accumulate,
+                accumulator_width=None,
+                gates=gates,
+                constants=[],
+                path=filename,
+                test=True
+            )
+            # generate_wrapper appends to the file written just above.
+            generate_wrapper(shape=example_shape, pipelined=accumulate,
+                             gates=absorb, accumulation=accumulate,
+                             filename=filename)
+            filenames.append(filename)
+
+    # One Tcl driver script per generated .sv file.
+    tclfiles = [emit_eval_tcl_script(el) for el in filenames]
+
+    # NOTE(review): the settings64.sh path below is site-specific and
+    # hard-coded; adjust it when running in another environment.
+    def call_vivado(filename):
+        # Run from ../gen/ so the script's bare file names resolve.
+        command = f"""cd ../gen/ &&
+            ls && 
+            source /proj/xbuilds/released/2023.1/2023.1_0508_1/installs/lin64/Vivado/2023.1/settings64.sh && 
+            vivado -mode batch -source {filename.split("/")[-1]}"""
+        return subprocess.run(command, shell=True, check=True, timeout=3600, 
+                              text=True, executable="/bin/bash")
+
+    print("Executing evaluation threads")
+    with ThreadPoolExecutor(max_workers=15) as executor:
+        # Executor.map only re-raises a worker's exception when its result
+        # is retrieved, so drain the iterator to surface failed runs.
+        list(executor.map(call_vivado, tclfiles))
+    print("Done executing evaluation threads")
+
+def generate_wrapper(shape, pipelined, gates, accumulation, filename):
+    """Append a ``sandwich`` top module that registers the I/O of ``comp``.
+
+    `gates` adds port ``in_2``, `accumulation` adds ``en_neg``/``rst``; `pipelined` is unused.
+    """
+    in_width, out_width = sum(shape), compressed_width(shape)
+    ports = ["clk", "in"]
+    ports += ["in_2"] if gates else []
+    ports += ["en_neg", "rst"] if accumulation else []
+    # Optional fragments; each collapses to "" (or ";") when its feature is off.
+    ctrl_decl = "logic en_negReg, rstReg;" if accumulation else ""
+    rst_reg = "rstReg <= rst;" if accumulation else ""
+    en_neg_reg = "en_negReg <= en_neg;" if accumulation else ""
+    gate_decl = ", in_2Reg;" if gates else ";"
+    gate_reg = "in_2Reg <= {in_2Reg, in_2};" if gates else ""
+    gate_conn = " .in_2(in_2Reg)," if gates else ""
+    ctrl_conn = " .en_neg(en_negReg), .rst(rstReg)," if accumulation else ""
+    lines = [
+        "module sandwich(",
+        "\tinput " + ", ".join(ports) + ",",
+        f"\toutput logic [{out_width-1}:0] outReg",
+        ");",
+        "",
+        "\t" + ctrl_decl,
+        f"\tlogic [{in_width-1}:0] inReg" + gate_decl,
+        f"\twire [{out_width-1}:0] out;",
+        "\t",
+        "\talways_ff @ (posedge clk) begin",
+        "\t\t" + rst_reg,
+        "\t\t" + en_neg_reg,
+        "\t\tinReg <= {inReg, in};",
+        "\t\t" + gate_reg,
+        "\t\toutReg <= out;",
+        "\tend",
+        "\t",
+        '\t(* keep_hierarchy = "yes" *)',
+        "\tcomp c(.in(inReg), .clk(clk)," + gate_conn + ctrl_conn + " .out(out));",
+        "",
+        "endmodule",
+    ]
+    with open(filename, 'a') as f:
+        f.write("\n".join(lines))
+
+def emit_eval_tcl_script(compressor_path):
+    """Write a Vivado batch Tcl script beside `compressor_path`; return its path."""
+    script = "set comps { " + str(compressor_path.split("/")[-1])  + " }" + """
+set PART xcvc1902-vsva2197-2MP-e-S ; # From VCK190 Evaluation Board
+
+foreach comp $comps {
+    read_verilog $comp
+
+    # -----------------------------------------------------------------------------
+    # Open new file for current module
+    set filename_prefix RESULT_
+    set filename_suffix ".json"
+    set filename $filename_prefix$comp$filename_suffix
+    puts $filename
+    set outfile [open $filename w]
+    puts $outfile "\\{"
+
+    set tm 0.7 ; # Minimum possible ime
+    set tt 10.0 ; # Time to Test
+    set ts 100.0 ; # Successful Time
+    set lc 100000 ; # LUT utilization
+
+    # -----------------------------------------------------------------------------
+    # Run synthesis
+    synth_design -top sandwich -part $PART
+
+    # -----------------------------------------------------------------------------
+    # while loop, updating clock 
+    while {[expr $ts - $tm] > 0.1} {
+        puts "NEW SYNTHESIS RUN WITH FREQ $tt"
+        create_clock -name CLK -period $tt [get_port clk]
+
+        # -----------------------------------------------------------------------------
+        # Place and route
+        opt_design -retarget -propconst -sweep ;
+        place_design -directive Explore
+        report_utilization -file util_$comp.twrA
+        route_design -directive Explore
+        report_drc
+        report_utilization -hierarchical
+        report_timing -setup -hold -max_paths 3 -nworst 3 -input_pins -sort_by group -file $comp.twrA
+        report_timing_summary -delay_type min_max -path_type full_clock_expanded -report_unconstrained -check_timing_verbose -max_paths 3 -nworst 3 -significant_digits 3 -input_pins -file $comp.twrA
+
+        # -----------------------------------------------------------------------------
+        # Find maximum data path delay and slack
+        set f [open $comp.twrA r]
+        set file_data [read $f]
+        close $f
+        if {[regexp { +Data Path Delay: +(\\d+\\.\\d+)} $file_data -> value]} {
+            set tr $value
+        } {
+            error "DATA PATH DELAY NOT FOUND"
+        }
+
+        # -----------------------------------------------------------------------------
+        # Find LUT and Slice utilization 
+        set f [open util_$comp.twrA r]
+        set file_data [read $f]
+        close $f
+        if {[regexp {CLB LUTs +\\| +(\\d+)} $file_data -> value]} {
+            set lc $value
+        } {
+            error "LUT UTILIZATION NOT FOUND"
+        }
+
+        if {[regexp {SLICE +\\| +(\\d+)} $file_data -> value]} {
+            set sc $value
+        } {
+            error "SLICE UTILIZATION NOT FOUND"
+        }
+
+        # -----------------------------------------------------------------------------
+        # Check if timing was met
+        if { $tt < $tr } {
+            puts "Timing $tr was NOT met!"
+            set tm $tt
+            if { $tr < $ts } {
+                set ts $tr
+            } 
+        } else {
+            set ts $tr
+        }
+        set tt [expr { ($ts + $tm)/2}]
+    }
+
+    puts -nonewline $outfile "\\"Delay\\": $ts,"
+    puts -nonewline $outfile "\\"Slice\\": $sc,"
+    puts -nonewline $outfile "\\"LUTS\\": $lc" ;
+
+    puts $outfile "\\}"
+    close $outfile
+    remove_files $comp
+}
+q
+"""
+    tclpath = compressor_path.replace(".sv", ".tcl")
+    with open(tclpath, "w") as f:
+        f.write(script)
+    return tclpath
+
+if __name__=="__main__":
+    evaluation()
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/__init__.py b/src/finn/compressor/src/graph/__init__.py
new file mode 100644
index 0000000000..9ec3df1276
--- /dev/null
+++ b/src/finn/compressor/src/graph/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor graph package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/graph/accumulator.py b/src/finn/compressor/src/graph/accumulator.py
new file mode 100644
index 0000000000..2fa585dc5d
--- /dev/null
+++ b/src/finn/compressor/src/graph/accumulator.py
@@ -0,0 +1,96 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Accumulator stage implementation for compressor
+#############################################################################
+
+from .nodes import Shape, Wire, Logic, Stage, Bitmatrix
+from collections.abc import Iterable
+
+class AccumulatorStage(Stage):  # final adder with a registered feedback loop around it
+    def __init__(self, shape: Shape, final_adder, preceeding_pipeline_stages,
+                 accumulator_width = None, enable = False):
+        super().__init__()
+        self.input_shape = shape
+        self.output_shape = Shape([1 for _ in range(
+            self.get_accumulator_width(accumulator_width))])  # one bit per accumulator column
+        self.instances = []
+        self.input_wires = Bitmatrix(shape)
+        self.output_wires = Bitmatrix(self.output_shape) # TODO: Make Logic
+        self.accumulator_width = self.get_accumulator_width(accumulator_width)
+        self.final_adder_gen = final_adder  # callable: shape -> adder (called in build_hardware)
+        self.preceeding_pipeline_stages = preceeding_pipeline_stages  # sets the rst/en_neg delay depth
+        self.enable = enable  # True adds an "en" module input passed to every register
+        self.build_hardware()  # reads the attributes assigned above
+
+    def build_hardware(self):  # create control inputs, delay chains, feedback register and adder
+        acc_input_shape = self.input_shape + self.output_shape  # presumably inputs plus one feedback bit per column - depends on Shape.__add__
+        final_adder = self.final_adder_gen(acc_input_shape)
+
+        en_neg = Wire(desired_name="en_neg")
+        en_neg.set_to_module_input()
+        rst = Wire(desired_name="rst")
+        rst.set_to_module_input()
+        self.instances.append(en_neg)
+        self.instances.append(rst)
+
+        # Optional clock enable signal (for finnlib integration)
+        en_wire = None
+        if self.enable:
+            en_wire = Wire(desired_name="en")
+            en_wire.set_to_module_input()
+            self.instances.append(en_wire)
+
+        # Create shifted enable and reset signal.
+        # init=1 on rst delay chain: when enable mode is active, en-gating
+        # prevents these registers from capturing the initial rst=1 pulse if
+        # en=0 during global reset.  Initialising to 1 ensures the accumulator
+        # feedback is properly zeroed from power-up.  In the current finn(lib)
+        # integration en is hardwired to '1 making this technically redundant,
+        # but the FPGA INIT attribute is free and keeps the design robust
+        # against future uses where en may be gated.
+        rst_del = self.delay_signal(rst, self.preceeding_pipeline_stages+1,
+                                    en=en_wire,
+                                    init=1 if self.enable else None)
+        en_neg_del = self.delay_signal(en_neg, self.preceeding_pipeline_stages,
+                                       en=en_wire)
+
+        # Connect inputs to final adder
+        loop = self.delay_signal(final_adder.output_wires, cycles=1,
+                                 rst=rst_del, en=en_wire, init=0)  # adder output, one register later, reset by delayed rst
+        in_ = self.delay_signal(self.input_wires, cycles=1, rst=en_neg_del,
+                                en=en_wire, init=0)  # stage inputs, registered once, reset by delayed en_neg
+        for col_loop, col_fa in zip(loop, final_adder.input_wires):
+            col_loop[0].connect_to(col_fa[0])  # feedback bit takes row 0 of each adder column
+
+        for col_in, col_fa in zip(in_, final_adder.input_wires):
+            for el_in, el_fa in zip(col_in, col_fa[1:]):
+                el_in.connect_to(el_fa)  # input bits take rows 1.. of each adder column
+
+        # Connect final adder output to stage output
+        for col_t, col_s in zip(self.output_wires, final_adder.output_wires):
+            for t, s in zip(col_t, col_s):
+                s.connect_to(t)
+        self.instances.append(final_adder)
+
+    def delay_signal(self, signal, /, cycles=1, rst = None, en = None, init = None):  # chain `cycles` Logic elements behind `signal`
+        if isinstance(signal, Iterable):
+            return [self.delay_signal(el, cycles, rst, en, init) for el in signal]  # map over nested wire containers
+        for i in range(cycles):
+            lgc = Logic(rst=rst, en=en, init=init)
+            signal.connect_to(lgc)
+            self.instances.append(lgc)
+            signal = lgc
+        return signal  # last Logic of the chain, or `signal` itself when cycles == 0
+
+    # Explicit width when given (truthy), else bits needed for the maximal input sum.
+    def get_accumulator_width(self, input = None):
+        if input:
+            return input
+        else:
+            return sum([(el << idx) for idx, el in 
+                        enumerate(self.input_shape)]).bit_length()  # column idx carries weight 2**idx
+
+    def accept(self, visitor): visitor.visit_accumulator_stage(self)  # visitor-pattern dispatch
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/counters/__init__.py b/src/finn/compressor/src/graph/counters/__init__.py
new file mode 100644
index 0000000000..52868b1dbd
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Counter candidates package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
new file mode 100644
index 0000000000..53a1163dc3
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
@@ -0,0 +1,299 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    7-Series and Versal gate absorption counter implementations
+# @author    Co-authored by Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+from abc import ABC, abstractmethod
+from ...utils.shape import Shape
+from ..nodes import GateAbsorptionCounter
+from typing import List
+from ..primitives import LUT6CY, LUT2, LUT6
+
+def fa_sum(a, b, c): return a ^ b ^ c  # full-adder sum bit: XOR of the three inputs
+def fa_carry(a, b, c): return a and b or a and c or b and c  # full-adder carry: truthy when at least two inputs are truthy
+
+def gate_string_to_pred(string):
+    """Build a 2-input predicate from a hex truth table (bit ``a + 2*b``)."""
+    class Gate:
+        def __init__(self, init):
+            try:
+                self._init = int(init, 16)
+            except ValueError:
+                raise ValueError(f"Gate specification {string} is invalid!")
+        def __call__(self, a, b):
+            index = a | (b << 1)
+            return ((self._init >> index) & 1) != 0
+        def __repr__(self):
+            return format(self._init, "x")
+    return Gate(string)
+
+class GateAbsorptionCounterCandidate(ABC):  # interface for factories that propose gate-absorbing counters
+    @abstractmethod
+    def extend_to_fit(self, inputs: Shape, 
+                      gates: List[List[str]]) -> GateAbsorptionCounter:  # implementations in this module return None when nothing fits
+        pass
+
+class AbsorbingFACandidate(GateAbsorptionCounterCandidate):
+    """Proposes an AbsorbingFA when column 0 holds at least three bits."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        return AbsorbingFA(gates[0][:3]) if inputs[0] >= 3 else None
+
+class AbsorbingFA(GateAbsorptionCounter):
+    """Full adder over three absorbed two-input gates, built from two LUT6s."""
+    def __init__(self, gates):
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([3]), Shape([1,1]))
+
+    def build_hardware(self):
+        """Create the sum and carry LUTs and wire the three gate operand pairs.
+
+        Gate ``k`` reads ``input_wires[0][k]`` on ``I(2k)`` and
+        ``input_wires_complementary[0][k]`` on ``I(2k+1)``, the operand layout
+        the other absorption counters in this module use.
+        """
+        g0, g1, g2 = self.gates[0], self.gates[1], self.gates[2]
+        lut1 = LUT6.fromPred(lambda I0,I1,I2,I3,I4,I5:
+                             fa_sum(g0(I0,I1), g1(I2,I3), g2(I4,I5)))
+        lut2 = LUT6.fromPred(lambda I0,I1,I2,I3,I4,I5:
+                             fa_carry(g0(I0,I1), g1(I2,I3), g2(I4,I5)))
+
+        # Iterate the LUTs themselves: zip([lut1, lut2]) would yield 1-tuples.
+        for lut in (lut1, lut2):
+            for k, pins in enumerate(((lut.I0, lut.I1), (lut.I2, lut.I3),
+                                      (lut.I4, lut.I5))):
+                self.input_wires[0][k].connect_to(pins[0])
+                self.input_wires_complementary[0][k].connect_to(pins[1])
+        lut1.O.connect_to(self.output_wires[0][0])  # sum -> output column 0
+        lut2.O.connect_to(self.output_wires[1][0])  # carry -> output column 1
+        self.instances += [lut1, lut2]
+
+class MuxCYPredAdderCandidate(GateAbsorptionCounterCandidate):
+    """Proposes a MuxCYPredAdder spanning the leading tall-enough columns."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        """Return an adder over up to four leading columns, or None.
+
+        Columns are taken from index 0 while each holds more than two bits;
+        the first two gate specs of every taken column are absorbed.
+        """
+        width = 0
+        while width < 4 and inputs[width] > 2:
+            width += 1
+        if width == 0:
+            return None
+        return MuxCYPredAdder([[gates[i][0], gates[i][1]] for i in range(width)])
+
+class MuxCYPredAdder(GateAbsorptionCounter):  # two absorbed gates per column, carry passed LUT to LUT
+    def __init__(self, gates: List[List[str]]):
+        self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]  # hex specs -> predicates
+        super().__init__(Shape(len(self.gates) * [2]),
+                         Shape((len(self.gates)+1) * [1]))  # 2 bits in per column, 1 bit out per column plus carry
+
+    def build_hardware(self):
+        """7-Series horizontal multi-column gate absorption using LUT6_2.
+
+        Similar to VersalPredAdder but built from LUT6_2 primitives (O5 = sum, O6 = carry).
+        Each column has 2 gates, each LUT computes: sum = p1 XOR p2 XOR carry_in
+        """
+        from ..primitives import LUT6_2
+        from ..nodes import Constant
+
+        luts = []
+        for i in range(len(self.gates)):
+            p1 = self.gates[i][0]
+            p2 = self.gates[i][1]
+            # LUT6_2: first predicate drives O5, second drives O6.
+            # p1/p2 are bound as lambda defaults so each LUT keeps its own gates.
+            lut = LUT6_2.fromPred(
+                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_sum(p1(A0,A1), p2(A2,A3), A4),    # predO5 → O5 (sum)
+                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_carry(p1(A0,A1), p2(A2,A3), A4), # predO6 → O6 (carry)
+            )
+
+            # Gate operands: I0/I1 feed p1, I2/I3 feed p2 (same pattern as Versal)
+            self.input_wires[i][0].connect_to(lut.I0)
+            self.input_wires[i][1].connect_to(lut.I2)
+            self.input_wires_complementary[i][0].connect_to(lut.I1)
+            self.input_wires_complementary[i][1].connect_to(lut.I3)
+
+            # Sum output for this column (O5, not O6!)
+            lut.O5.connect_to(self.output_wires[i][0])
+            luts.append(lut)
+
+        # First LUT needs carry-in = 0
+        Constant("1'b0").connect_to(luts[0].I4)
+
+        # Carry chain: previous carry → next carry-in (O6, not O5!)
+        for p, n in zip(luts, luts[1:]):
+            p.O6.connect_to(n.I4)
+
+        # Final carry-out (O6, not O5!) goes to the extra top output column
+        luts[-1].O6.connect_to(self.output_wires[len(luts)][0])
+
+        self.instances += luts
+
+class VersalPredAdderCandidate(GateAbsorptionCounterCandidate):
+    """Proposes a VersalPredAdder spanning the leading tall-enough columns."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        """Return an adder over up to four leading columns, or None.
+
+        Columns are taken from index 0 while each holds more than two bits;
+        the first two gate specs of every taken column are absorbed.
+        """
+        width = 0
+        while width < 4 and inputs[width] > 2:
+            width += 1
+        if width == 0:
+            return None
+        return VersalPredAdder([[gates[i][0], gates[i][1]] for i in range(width)])
+
+class VersalPredAdder(GateAbsorptionCounter):
+    """Versal multi-column adder absorbing two gates per column via LUT6CY."""
+    def __init__(self, gates: List[List[str]]):
+        self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
+        super().__init__(Shape(len(self.gates) * [2]),
+                         Shape((len(self.gates)+1) * [1]))
+
+    def build_hardware(self):
+        """Build one LUT6CY per column and chain O52 (carry) into the next I4."""
+        def column_preds(p1, p2):
+            # Own scope per column: loop-defined lambdas would late-bind p1/p2.
+            return (lambda A0,A1,A2,A3,A4,A5: fa_sum(p1(A0,A1), p2(A2,A3), A4),
+                    lambda A0,A1,A2,A3,A4,A5: fa_carry(p1(A0,A1), p2(A2,A3), A4))
+        # NOTE(review): luts[0].I4 (carry-in) is left undriven here, whereas
+        # MuxCYPredAdder ties it to 1'b0 - confirm that is intended.
+        luts = []
+        for i, col in enumerate(self.gates):
+            lut = LUT6CY.fromPred(*column_preds(col[0], col[1]))
+            self.input_wires[i][0].connect_to(lut.I0)
+            self.input_wires[i][1].connect_to(lut.I2)
+            self.input_wires_complementary[i][0].connect_to(lut.I1)
+            self.input_wires_complementary[i][1].connect_to(lut.I3)
+            lut.O51.connect_to(self.output_wires[i][0])  # sum bit of column i
+            luts.append(lut)
+        for p, n in zip(luts, luts[1:]):
+            p.O52.connect_to(n.I4)
+        luts[-1].O52.connect_to(self.output_wires[len(luts)][0])  # final carry
+        self.instances += luts
+
+class RippleSumPredAdderCandidate(GateAbsorptionCounterCandidate):
+    """Proposes a RippleSumPredAdder over up to four gate pairs of column 0."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        pairs = min(inputs[0] // 2, 4)
+        return RippleSumPredAdder(gates[0][:pairs*2]) if pairs else None
+
+class RippleSumPredAdder(GateAbsorptionCounter):
+    """Versal single-column adder: the running sum ripples, carries are output."""
+    def __init__(self, gates):
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates)+1)//2]))
+
+    def build_hardware(self):
+        """Build one LUT6CY per gate pair, chained through O52 -> I4.
+
+        Each LUT adds its two gate outputs to the incoming sum; O51 (first
+        predicate, carry) feeds output column 1, the last O52 (sum) column 0.
+        """
+        def pair_preds(p1, p2):
+            # Own scope per LUT: loop-defined lambdas would late-bind p1/p2.
+            return (lambda A0,A1,A2,A3,A4,A5: fa_carry(p1(A0,A1), p2(A2,A3), A4),
+                    lambda A0,A1,A2,A3,A4,A5: fa_sum(p1(A0,A1), p2(A2,A3), A4))
+        luts = []
+        for i in range((len(self.gates) + 1) // 2):
+            p1 = self.gates[2*i]
+            # An odd gate count leaves the last LUT's second operand constant 0.
+            p2 = (self.gates[2*i+1] if len(self.gates) > 2*i+1
+                  else lambda A0,A1: False)
+            luts.append(LUT6CY.fromPred(*pair_preds(p1, p2)))
+        for p, n in zip(luts, luts[1:]):
+            p.O52.connect_to(n.I4)
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0],
+                                         self.input_wires_complementary[0])):
+            first, second = ((luts[i//2].I0, luts[i//2].I1) if i % 2 == 0
+                             else (luts[i//2].I2, luts[i//2].I3))
+            w1.connect_to(first)
+            w2.connect_to(second)
+        luts[-1].O52.connect_to(self.output_wires[0][0])
+        for i, lut in enumerate(luts):
+            lut.O51.connect_to(self.output_wires[1][i])
+        self.instances += luts
+
+class MuxCYRippleSumCandidate(GateAbsorptionCounterCandidate):
+    """7-Series counterpart of RippleSumPredAdderCandidate; yields MuxCYRippleSum."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        """Absorb up to four gate pairs from column 0, or return None."""
+        pairs = min(inputs[0] // 2, 4)
+        return MuxCYRippleSum(gates[0][:pairs*2]) if pairs else None
+
+class MuxCYRippleSum(GateAbsorptionCounter):
+    """7-Series ripple-sum gate absorption built from chained LUT6_2 primitives."""
+    def __init__(self, gates):
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates)+1)//2]))
+
+    def build_hardware(self):
+        from ..primitives import LUT6_2
+        from ..nodes import Constant
+
+        luts = []
+        for i in range((len(self.gates) + 1) // 2):
+            p1 = self.gates[2*i]
+            p2 = (self.gates[2*i+1] if len(self.gates) > 2*i+1
+                  else lambda A0,A1: False)
+            # Same full-adder logic as the Versal RippleSumPredAdder.
+            # Gates use I0/I1 (p1) and I2/I3 (p2); the rippled-in sum arrives on I4.
+            # First predicate drives O5 (sum), second drives O6 (carry), as in MuxCYPredAdder.
+            lut = LUT6_2.fromPred(
+                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_sum(p1(A0,A1), p2(A2,A3), A4),    # O5 = sum 
+                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_carry(p1(A0,A1), p2(A2,A3), A4),  # O6 = carry 
+            )
+            luts.append(lut)
+
+        # Connect gate inputs to LUT inputs (same as Versal)
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0],
+                                         self.input_wires_complementary[0])):
+            if i % 2 == 0:
+                w1.connect_to(luts[i//2].I0)
+                w2.connect_to(luts[i//2].I1)
+            else:
+                w1.connect_to(luts[i//2].I2)
+                w2.connect_to(luts[i//2].I3)
+
+        # First LUT has no predecessor: tie its ripple input to 0
+        Constant("1'b0").connect_to(luts[0].I4)
+
+        # Ripple chain: previous sum (O5) → next I4 (Versal ripples its sum the same way)
+        for p, n in zip(luts, luts[1:]):
+            p.O5.connect_to(n.I4)
+
+        # Connect outputs (same as Versal): final sum + one carry per LUT
+        luts[-1].O5.connect_to(self.output_wires[0][0])  # Final sum → output column 0
+        for i, lut in enumerate(luts):
+            lut.O6.connect_to(self.output_wires[1][i])   # Carry bits → output column 1
+
+        self.instances += luts
+
+class SinglePredCandidate(GateAbsorptionCounterCandidate):
+    """Proposes a SinglePred for the first gate of a non-empty column 0."""
+    def extend_to_fit(self, inputs: Shape,
+                      gates: List[List[str]]) -> GateAbsorptionCounter:
+        return SinglePred(gates[0][0]) if inputs[0] > 0 else None
+
+class SinglePred(GateAbsorptionCounter):  # one absorbed gate realised as a single LUT2
+    def __init__(self, gate):
+        self.gate = gate_string_to_pred(gate)  # hex truth-table string -> predicate
+        super().__init__(Shape([1]), Shape([1]))  # one bit in, one bit out
+
+    def build_hardware(self):  # program a LUT2 with the gate and wire its operand pair
+        lut = LUT2.fromPred(self.gate)
+        self.input_wires[0][0].connect_to(lut.I0)  # first gate operand
+        self.input_wires_complementary[0][0].connect_to(lut.I1)  # second gate operand
+        lut.O.connect_to(self.output_wires[0][0])
+        self.instances.append(lut)
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/counters/counter_candidates.py b/src/finn/compressor/src/graph/counters/counter_candidates.py
new file mode 100644
index 0000000000..74398e2a1c
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/counter_candidates.py
@@ -0,0 +1,737 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    LUT-based counter and gate absorption atom implementations
+# @author    Co-authored by Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+from itertools import count
+from ..nodes import Counter, Constant, GateAbsorptionCounter
+from abc import ABC, abstractmethod
+from ..primitives import LUT6, LUT6_2, LUT6CY, CARRY4, LUT5
+from ...utils.shape import Shape
+
+MAX_CASCADE_LENGTH = 4
+
+def FA_sum(a, b, c): return a ^ b ^ c  # full-adder sum: parity of a, b, c
+def FA_carry(a, b, c): return a and b or a and c or b and c  # majority vote
+
+class CounterCandidate(ABC):
+    @abstractmethod
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                      compression_goal) -> Counter:
+        pass  # abstract; candidates below return a Counter or None
+
+class VersalAtom(CounterCandidate):
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                      compression_goal) -> Counter:
+        pass  # NOTE(review): stub that always returns None - still needed?
+
+class FixedShapeCounterCandidate(CounterCandidate):
+    def __init__(self, counter, counter_inputs: Shape,
+                 counter_outputs: Shape) -> None:
+        self.counter = counter  # called with no arguments in extend_to_fit
+        self.counter_inputs = counter_inputs
+        self.counter_outputs = counter_outputs
+
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                      compression_goal) -> Counter:  # None if a column fails
+        for i in range(len(self.counter_inputs)):
+            # Column i needs enough input bits, and inputs + outputs
+            # - consumed + produced must stay >= compression_goal(i) - 1:
+            if not (self.counter_inputs[i] <= inputs[i] and
+                    inputs[i] + outputs[i] - self.counter_inputs[i] +
+                    self.counter_outputs[i] - compression_goal(i) >= -1):
+                return None
+        return self.counter()
+
+class FA(Counter):
+    def __init__(self):
+        super(FA, self).__init__(
+            Shape([3]),     # three input bits, all in column 0
+            Shape([1, 1]),  # one output bit in column 0 and one in column 1
+        )
+
+    def build_hardware(self):  # one LUT6_2; predicates only use x, y, z
+        lut = LUT6_2.fromPred(
+                              lambda x, y, z, w, q, r:
+                                x and y or x and z or y and z,
+                              lambda x, y, z, w, q, r: x ^ y ^ z,
+                              "FA")
+        for i in range(3):  # only the first three LUT inputs are driven
+            self.input_wires[0][i].connect_to(lut.in_ports[i])
+        for i in range(2):  # out_ports[i] feeds output column i
+            lut.out_ports[i].connect_to(self.output_wires[i][0])
+        self.instances += (lut,)
+
+class FACandidate(FixedShapeCounterCandidate):
+    def __init__(self):  # builds two throwaway FA objects to read the shapes
+        super().__init__(FA, FA().input_shape, FA().output_shape)
+
+hlutnm_counter = count()  # unique numeric suffix for each TenSix HLUTNM tag
+class TenSix(Counter):
+    def __init__(self):
+        super(TenSix, self).__init__(Shape([10]), Shape([2, 4]))
+
+    def build_hardware(self):
+        lut1 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1"  # NOTE(review): name reused from FiveTwo - intended?
+        )
+        lut2 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_2"
+        )
+        hlutnm_attr = f"HLUTNM = \"tensix_{next(hlutnm_counter)}\""
+        lut3_A = LUT5.fromPred(
+            lambda A0,A1,A2,A3,A4: FA_carry(A0,A1,A4)
+        )
+        lut3_B = LUT5.fromPred(
+            lambda A0,A1,A2,A3,A4: FA_carry(A2,A3,A4)
+        )
+        lut3_A.annotate(hlutnm_attr)  # both LUT5s carry the same HLUTNM tag
+        lut3_B.annotate(hlutnm_attr)
+        # TODO: Take care of annotations
+        self.input_wires[0][0].connect_to(lut1.I0)
+        self.input_wires[0][1].connect_to(lut1.I1)
+        self.input_wires[0][2].connect_to(lut1.I2)
+        self.input_wires[0][3].connect_to(lut1.I3)
+        self.input_wires[0][4].connect_to(lut1.I4)
+        lut1.O5.connect_to(self.output_wires[0][0])
+        lut1.O6.connect_to(self.output_wires[1][0])
+        # lut2: the same two functions, applied to input bits 5..9
+        self.input_wires[0][5].connect_to(lut2.I0)
+        self.input_wires[0][6].connect_to(lut2.I1)
+        self.input_wires[0][7].connect_to(lut2.I2)
+        self.input_wires[0][8].connect_to(lut2.I3)
+        self.input_wires[0][9].connect_to(lut2.I4)
+        # lut3_A: majority of input bits 0, 1, 2 (bit 2 arrives on I4)
+        self.input_wires[0][0].connect_to(lut3_A.I0)
+        self.input_wires[0][1].connect_to(lut3_A.I1)
+        self.input_wires[0][2].connect_to(lut3_A.I4)
+        # lut3_B: majority of input bits 5, 6, 7 (bit 7 arrives on I4)
+        self.input_wires[0][5].connect_to(lut3_B.I2)
+        self.input_wires[0][6].connect_to(lut3_B.I3)
+        self.input_wires[0][7].connect_to(lut3_B.I4)
+
+        # Duplicate connections to make Vivado obey HLUTNM
+        self.input_wires[0][5].connect_to(lut3_A.I2)
+        self.input_wires[0][6].connect_to(lut3_A.I3)
+        self.input_wires[0][0].connect_to(lut3_B.I0)
+        self.input_wires[0][1].connect_to(lut3_B.I1)
+
+        lut2.O5.connect_to(self.output_wires[0][1])
+        lut2.O6.connect_to(self.output_wires[1][1])
+
+        lut3_A.O.connect_to(self.output_wires[1][2])
+        lut3_B.O.connect_to(self.output_wires[1][3])
+
+        self.instances += (lut1, lut2, lut3_A, lut3_B)
+
+class TenSixCandidate(FixedShapeCounterCandidate):
+    def __init__(self):  # builds two throwaway TenSix objects for the shapes
+        super().__init__(TenSix, TenSix().input_shape, TenSix().output_shape)
+
+class FiveTwo(Counter):
+    def __init__(self): super(FiveTwo, self).__init__(Shape([5, 2]),
+                                                      Shape([1, 2, 1]))
+
+    def build_hardware(self):  # two LUT6_2; input bits 0..2 feed both
+        lut1 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1"
+        )
+        lut2 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_carry(A0, A1, A2)),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
+            "FiveTwo_2"
+        )
+        self.input_wires[0][0].connect_to(lut1.I0)  # lut1: all 5 column-0 bits
+        self.input_wires[0][1].connect_to(lut1.I1)
+        self.input_wires[0][2].connect_to(lut1.I2)
+        self.input_wires[0][3].connect_to(lut1.I3)
+        self.input_wires[0][4].connect_to(lut1.I4)
+        lut1.O5.connect_to(self.output_wires[0][0])
+        lut1.O6.connect_to(self.output_wires[1][0])
+
+        self.input_wires[0][0].connect_to(lut2.I0)  # lut2: column-0 bits 0..2
+        self.input_wires[0][1].connect_to(lut2.I1)
+        self.input_wires[0][2].connect_to(lut2.I2)
+        self.input_wires[1][0].connect_to(lut2.I3)  # ... plus both column-1 bits
+        self.input_wires[1][1].connect_to(lut2.I4)
+        lut2.O5.connect_to(self.output_wires[1][1])
+        lut2.O6.connect_to(self.output_wires[2][0])
+        self.instances += (lut1, lut2)
+
+class FiveTwoCandidate(FixedShapeCounterCandidate):
+    def __init__(self):  # builds two throwaway FiveTwo objects for the shapes
+        super(FiveTwoCandidate, self).__init__(FiveTwo, FiveTwo().input_shape,
+                                               FiveTwo().output_shape)
+
+class DualRailRippleSum(Counter):
+    def __init__(self, w):
+        self._width = w  # number of stages; each stage is one top + one btm LUT
+        super(DualRailRippleSum, self).__init__(Shape([4*w+1, w+1]),
+                                                Shape([1, w+1, w]))
+
+    @property
+    def width(self): return self._width
+
+    def build_hardware(self):
+        luts_top = []
+        luts_btm = []
+        # Bit 0 of column 0 and of column 1 seed the two cascade chains.
+        cascade_top = self.input_wires[0][0]
+        cascade_btm = self.input_wires[1][0]
+
+        for i in range(0, self._width):
+            lut_top = LUT6CY.fromPred(
+                lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4,
+                                                  FA_sum(A0, A1, A2)),
+                lambda A0,A1,A2,A3,A4,_: FA_sum  (A3, A4,
+                                                  FA_sum(A0, A1, A2)),
+                "dual_rail_top"
+            )
+            lut_btm = LUT6CY.fromPred(
+                lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4,
+                                                  FA_carry(A0, A1, A2)),
+                lambda A0,A1,A2,A3,A4,_: FA_sum  (A3, A4,
+                                                  FA_carry(A0, A1, A2)),
+                "dual_rail_btm"
+            )
+            # Top rail: four fresh column-0 bits plus the running cascade bit.
+            self.input_wires[0][1+4*i].connect_to(lut_top.I0)
+            self.input_wires[0][2+4*i].connect_to(lut_top.I1)
+            self.input_wires[0][3+4*i].connect_to(lut_top.I2)
+            self.input_wires[0][4+4*i].connect_to(lut_top.I3)
+            cascade_top.connect_to(lut_top.I4)
+            lut_top.O51.connect_to(self.output_wires[1][i+1])
+            cascade_top = lut_top.O52
+            # Bottom rail: same first three bits, one column-1 bit, own cascade.
+            self.input_wires[0][1+4*i].connect_to(lut_btm.I0)
+            self.input_wires[0][2+4*i].connect_to(lut_btm.I1)
+            self.input_wires[0][3+4*i].connect_to(lut_btm.I2)
+            self.input_wires[1][1+i].connect_to(lut_btm.I3)
+            cascade_btm.connect_to(lut_btm.I4)
+            lut_btm.O51.connect_to(self.output_wires[2][i])
+            cascade_btm = lut_btm.O52
+
+            luts_top.append(lut_top)
+            luts_btm.append(lut_btm)
+
+            if i == self._width - 1:  # last stage: cascade bits become outputs
+                lut_top.O52.connect_to(self.output_wires[0][0])
+                lut_btm.O52.connect_to(self.output_wires[1][0])
+
+        self.instances += luts_top + luts_btm
+
+class DualRailRippleSumCandidate(CounterCandidate):
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                      compression_goal) -> Counter:
+        # Stage count allowed by column 0 (4 bits per stage + 1 cascade bit).
+        cap0 = 0
+        if inputs[0] >= 5:
+            slack = inputs[0] + outputs[0] - compression_goal(0) + 1
+            cap0 = min(MAX_CASCADE_LENGTH, (inputs[0] - 1) // 4, slack // 4)
+        # Stage count allowed by column 1 (1 bit per stage + 1 cascade bit).
+        cap1 = 0
+        if inputs[1] >= 2:
+            cap1 = min(MAX_CASCADE_LENGTH, inputs[1] - 1)
+        stages = min(cap0, cap1, MAX_CASCADE_LENGTH)
+        return DualRailRippleSum(stages) if stages > 0 else None
+
+class RippleSum(Counter):
+    def __init__(self, w):
+        self._width = w  # number of chained LUT6CY stages
+        super(RippleSum, self).__init__(Shape([2*w+1]), Shape([1, w]))
+
+    @property
+    def width(self): return self._width
+
+    def build_hardware(self):
+        luts = []
+        # Bit 0 of the column seeds the chain; every stage adds two more bits.
+        carry = self.input_wires[0][0]
+
+        for i in range(0, self._width):
+            lut = LUT6CY.fromPred(
+                lambda A0,A1,A2,A3,A4,_: FA_carry(A4, A1, A0),
+                lambda A0,A1,A2,A3,A4,_: FA_sum  (A4, A1, A0),
+                "ripple_sum"
+            )
+
+            self.input_wires[0][1+2*i].connect_to(lut.I0)
+            self.input_wires[0][2+2*i].connect_to(lut.I1)
+            carry.connect_to(lut.I4)  # chained bit from the previous stage
+            lut.O51.connect_to(self.output_wires[1][i])
+            carry = lut.O52
+
+            luts.append(lut)
+
+            if i == self._width - 1:  # last stage: chained bit is the output
+                lut.O52.connect_to(self.output_wires[0][0])
+
+        self.instances += luts
+
+class RippleSumCandidate(CounterCandidate):
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                      compression_goal) -> Counter:
+        if inputs[0] < 3:  # one stage needs two bits plus the chain seed bit
+            return None
+        by_inputs = (inputs[0] - 1) // 2
+        by_goal = (inputs[0] + outputs[0] + 1) // 2 - compression_goal(0) + 1
+        stages = min(MAX_CASCADE_LENGTH, by_inputs, by_goal)
+        return RippleSum(stages) if stages > 0 else None
+
+class SixThree(Counter):
+    def __init__(self):
+        super(SixThree, self).__init__(Shape([6]), Shape([1, 1, 1]))
+
+    def build_hardware(self):  # three LUT6s: bits 0, 1, 2 of the popcount
+        lut1 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
+                                bool(sum([A0,A1,A2,A3,A4,A5]) & 1),
+                                "sixthree_first")
+        lut2 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
+                                bool(sum([A0,A1,A2,A3,A4,A5]) & 2),
+                                "sixthree_second")
+        lut3 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
+                                bool(sum([A0,A1,A2,A3,A4,A5]) & 4),
+                                "sixthree_third")
+        luts = (lut1, lut2, lut3)
+
+        for lut in luts:  # every LUT sees all six input bits
+            for i in range(6):
+                self.input_wires[0][i].connect_to(lut.in_ports[i])
+
+        for i, lut in enumerate(luts):  # LUT i drives output column i
+            lut.out_ports[0].connect_to(self.output_wires[i][0])
+        self.instances += luts
+
+class SixThreeCandidate(FixedShapeCounterCandidate):
+    def __init__(self):  # builds two throwaway SixThree objects for the shapes
+        super().__init__(SixThree, SixThree().input_shape,
+                         SixThree().output_shape)
+
+class VersalAtom14:
+    def __init__(self):
+        self.shape = Shape([4,1])  # 4 bits from one column, 1 from the next
+        self.width = 2
+        self.output_width = 2      # output columns claimed in the cascade
+
+    def build_luts(self):  # returns the two LUTs; VersalAtomCascade wires them
+        lut_1 = LUT6CY.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  FA_sum(A0,A1,A2),A3,A4),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(FA_sum(A0,A1,A2),A3,A4),
+            "atom14_first"
+        )
+        lut_2 = LUT6CY.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(  FA_carry(A0,A1,A2),A3,A4),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(FA_carry(A0,A1,A2),A3,A4),
+            "atom14_second"
+        )
+        return (lut_1, lut_2)
+
+class VersalAtom2:
+    def __init__(self):
+        self.shape = Shape([2])  # 2 bits from a single column
+        self.width = 1
+        self.output_width = 1    # output columns claimed in the cascade
+
+    def build_luts(self):  # one LUT: sum and majority of A0, A1 and A4
+        lut = LUT6CY.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,A4),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,A4),
+            "atom2_second"
+        )
+        return (lut,)
+    
+class VersalAtom222:
+    def __init__(self):
+        self.shape = Shape([2,2,2])  # 2 bits from each of three columns
+        self.width = 2
+        self.output_width = 3        # output columns claimed in the cascade
+
+    def build_luts(self):  # unlike the other atoms, no LUT name is passed
+        lut_1 = LUT6CY.fromPred(
+            lambda A0,A1,A2,A3,A4,_: FA_sum(A2,A3,A4),
+            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,FA_carry(A2,A3,A4)),
+        )
+        lut_2 = LUT6CY.fromPred(  # A2^A3^A4 undoes the XOR that arrives on A4
+            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,FA_carry(A2,A3,A2^A3^A4)),
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,FA_carry(A2,A3,A2^A3^A4)),
+        )
+        return (lut_1, lut_2)
+        
+class VersalAtomCascade(Counter):
+    def __init__(self, atoms):
+        self._atoms = atoms
+        # Input shape = the atoms' shapes laid end to end, column by column.
+        in_shape = [el for atom in atoms for el in atom.shape]
+        in_shape[0] += 1  # one extra column-0 bit, used as carry-in below
+        in_shape = Shape(in_shape)
+        # One output bit per atom output column plus one for the last carry.
+        out_shape = Shape([1 for _
+                           in range(sum([atom.output_width for
+                                         atom in atoms]) + 1)])
+        super().__init__(in_shape, out_shape)
+
+    def build_hardware(self):
+        luts = []
+        for atom in self._atoms:
+            # emit the correct luts
+            luts += atom.build_luts()
+
+        if not luts:
+            return
+
+        # Connect inputs
+        lut_idx = 0  # first LUT of the atom being wired
+        io_idx = 0   # first input/output column of the atom being wired
+
+        # Carry-in
+        carry = self.input_wires[0][self._atoms[0].shape[0]]
+
+        for atom in self._atoms:
+            if isinstance(atom, VersalAtom2):
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                lut_idx += 1
+                io_idx += 1
+            elif isinstance(atom, VersalAtom222):
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I2)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I3)
+                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx+1][1].connect_to(luts[lut_idx].I1)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                # second lut
+                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I2)
+                self.input_wires[io_idx+1][1].connect_to(luts[lut_idx+1].I3)
+                self.input_wires[io_idx+2][0].connect_to(luts[lut_idx+1].I0)
+                self.input_wires[io_idx+2][1].connect_to(luts[lut_idx+1].I1)
+                carry.connect_to(luts[lut_idx+1].I4)
+                carry = luts[lut_idx+1].O52
+                # first LUT's O52 is both an output and the second LUT's I4
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                luts[lut_idx].O52.connect_to(self.output_wires[io_idx+1][0])
+                luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+2][0])
+                lut_idx += 2
+                io_idx += 3
+            elif isinstance(atom, VersalAtom14):
+                # first lut
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1)
+                self.input_wires[io_idx][2].connect_to(luts[lut_idx].I2)
+                self.input_wires[io_idx][3].connect_to(luts[lut_idx].I3)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                # second lut
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx+1].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx+1].I1)
+                self.input_wires[io_idx][2].connect_to(luts[lut_idx+1].I2)
+                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I3)
+                carry.connect_to(luts[lut_idx+1].I4)
+                carry = luts[lut_idx+1].O52
+
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+1][0])
+
+                lut_idx += 2
+                io_idx += 2
+            else:
+                raise Exception("Error in construction of Versal Atoms")
+        luts[-1].O52.connect_to(self.output_wires[-1][0])  # final carry-out
+        self.instances += luts
+
+class VersalAtomCascadeCandidate(CounterCandidate):
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                    compression_goal) -> Counter:
+        def fits_col(idx, height):  # may `height` bits be taken from column?
+            return (height <= inputs[idx] and
+                    inputs[idx] + outputs[idx] - height
+                    + 1 - compression_goal(idx) >= -1)
+        atoms = []
+        io_idx = 0    # next input column to cover
+        atom_idx = 0  # LUTs used so far; the loop stops once 4 are used
+        while (atom_idx < 4):
+            if atom_idx == 0:  # first atom only: +1 bit for the carry-in
+                if fits_col(io_idx, 5) and fits_col(io_idx+1, 1):
+                    atoms.append(VersalAtom14())
+                    atom_idx += 2
+                    io_idx += 2
+                elif (fits_col(io_idx, 3) and fits_col(io_idx+1, 2) and
+                      fits_col(io_idx+2, 2)):
+                    atoms.append(VersalAtom222())
+                    atom_idx += 2
+                    io_idx += 3
+                elif fits_col(io_idx, 3):
+                    atoms.append(VersalAtom2())
+                    atom_idx += 1
+                    io_idx += 1
+                else:
+                    break
+            elif atom_idx < 3:  # room for a two-LUT atom
+                if fits_col(io_idx, 4) and fits_col(io_idx+1, 1):
+                    atoms.append(VersalAtom14())
+                    atom_idx += 2
+                    io_idx += 2
+                elif (fits_col(io_idx, 2) and fits_col(io_idx+1, 2) and
+                      fits_col(io_idx+2, 2)):
+                    atoms.append(VersalAtom222())
+                    atom_idx += 2
+                    io_idx += 3
+                elif fits_col(io_idx, 2):
+                    atoms.append(VersalAtom2())
+                    atom_idx += 1
+                    io_idx += 1
+                else:
+                    break
+            elif fits_col(io_idx, 2):  # one LUT left: only VersalAtom2 fits
+                atoms.append(VersalAtom2())
+                atom_idx += 1
+                io_idx += 1
+            else:
+                break
+        if atoms:  # otherwise falls through and returns None
+            return VersalAtomCascade(atoms)
+    
+class ConstantOne(GateAbsorptionCounter):
+    def __init__(self):
+        super().__init__(Shape(tuple()), Shape((1,)))  # no inputs, 1 output
+
+    def build_hardware(self):  # NOTE(review): int 1 here, "1'b0" str elsewhere
+        Constant(1).connect_to(self.output_wires[0][0])
+
+class MuxCYAtom06:
+    def __init__(self):
+        self.shape = Shape([6,0])  # 6 bits from one column, 0 from the next
+        self.width = 2             # LUTs (and CARRY4 positions) it occupies
+        self.output_width = 2
+
+    def build_luts(self):
+        # Matches VHDL atom06.vhdl - the (0,6) atom for 6 inputs from column 0
+        #
+        # VHDL lo LUT: INIT => x"6996_9669_9669_6996"
+        #   Uses all 6 inputs x0[5:0]
+        #   O6 = XOR of all 6 bits; O5 = XOR of the low 5 bits (see lut_1)
+        #
+        # VHDL hi LUT: INIT => x"177E_7EE8" & x"E8E8_E8E8"
+        #   Uses x0[4:0] with I5=1
+        #   O6 = complex carry propagation
+        #   O5 = 0xE8 repeated = FA_carry(I0,I1,I2)
+        #
+        # NOTE(review): this comment used to call the atom DISABLED, but
+        # MuxCYAtomCascadeCandidate.extend_to_fit does append MuxCYAtom06
+        # when a column is tall enough. Confirm which of the two is intended.
+        #
+        # lo LUT: O5 = 5-input XOR, O6 = 6-input XOR
+        lut_1 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,A5: A0 ^ A1 ^ A2 ^ A3 ^ A4,        # O5 (5-input XOR)
+            lambda A0,A1,A2,A3,A4,A5: A0 ^ A1 ^ A2 ^ A3 ^ A4 ^ A5,   # O6 (6-input XOR)
+            "atom06_lo"
+        )
+        # hi LUT: carry chain continuation
+        # O5 = FA_carry(A0,A1,A2) for the generate term
+        # O6 = more complex carry propagation (from VHDL 0x177E7EE8)
+        lut_2 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,A5: FA_carry(A0,A1,A2),             # O5 -> DI
+            lambda A0,A1,A2,A3,A4,A5: (FA_carry(FA_sum(A0,A1,A2),A3,A4) ^
+                                       FA_carry(A0,A1,A2)),           # O6 -> S
+            "atom06_hi"
+        )
+        return (lut_1, lut_2)
+
+class MuxCYAtom14:
+    def __init__(self):
+        self.shape = Shape([4,1])  # 4 bits from one column, 1 from the next
+        self.width = 2             # LUTs (and CARRY4 positions) it occupies
+
+    def build_luts(self):
+        # Preußer FPL 2017: (1,4) atom - matches VHDL atom14.vhdl
+        #
+        # CARRY4 primitive: CO = S ? CI : DI, O = S ^ CI
+        #
+        # The key insight from the VHDL reference:
+        #   - O6 (S) computes the propagate signal: XOR of inputs
+        #   - O5 (DI) simply passes through the I3 input bit of each LUT
+        #
+        # This is NOT an AND of the sum/carry with the input!
+        # The VHDL uses INIT patterns:
+        #   lo: x"6996_6996" & x"FF00_FF00"  (O6=0x6996, O5=0xFF00)
+        #   hi: x"17E8_17E8" & x"FF00_FF00"  (O6=0x17E8, O5=0xFF00)
+        #
+        # O5 = 0xFF00 = just passes I3 (the 4th input bit)
+        #
+        # BUGFIX (2026-04-08): Previous implementation incorrectly used:
+        #   O5 = FA_sum(A0,A1,A2) & A3  (WRONG - does not produce 0xFF00)
+        # Correct implementation:
+        #   O5 = A3  (just pass through - produces 0xFF00)
+        #
+        # lut_1 (position 0): processes x0[3:0] for s0/d0
+        lut_1 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: A3,                      # O5 -> DI = x0[3]
+            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,A2) ^ A3,   # O6 -> S
+            "atom14_0"
+        )
+        # lut_2 (position 1): processes x0[2:0] and x1 for s1/d1
+        # x1 is mapped to I3 (A3)
+        lut_2 = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: A3,                        # O5 -> DI = x1
+            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,A2) ^ A3,   # O6 -> S
+            "atom14_1"
+        )
+        return (lut_1, lut_2)
+
+class MuxCYAtom2:
+    def __init__(self):
+        self.shape = Shape([2])  # 2 bits from a single column
+        self.width = 1           # LUTs (and CARRY4 positions) it occupies
+
+    def build_luts(self):
+        # Matches VHDL atom22.vhdl: INIT => x"6666_6666" & x"CCCC_CCCC"
+        #
+        # CARRY4: CO = S ? CI : DI, O = S ^ CI
+        #
+        # The VHDL uses:
+        #   O6 = 0x6666 = I0 ^ I1 (XOR / half-adder sum)
+        #   O5 = 0xCCCC = I1 (passes through the second bit of the column)
+        #
+        # BUGFIX (2026-04-08): Previous implementation used O5=A0.
+        # While this happens to produce correct results due to CARRY4
+        # logic simplification, it doesn't match the VHDL reference.
+        # Changed to O5=A1 for consistency with atom22.vhdl.
+        lut = LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,_: A1,       # O5 -> DI = second input bit
+            lambda A0,A1,A2,A3,A4,_: A0 ^ A1,  # O6 -> S (propagate)
+            "atom2"
+        )
+        return (lut,)
+
+class MuxCYAtomCascade(Counter):
+    def __init__(self, atoms):
+        self._atoms = atoms
+        # Input shape = the atoms' shapes laid end to end, column by column.
+        in_shape = [el for atom in atoms for el in atom.shape]
+        in_shape[0] += 1  # one extra column-0 bit, wired to CARRY4.CI below
+        in_shape = Shape(in_shape)
+        # One output bit per LUT position plus one for the last carry-out.
+        out_shape = Shape([1 for _
+                           in range(sum([atom.width for atom in atoms]) + 1)])
+        super().__init__(in_shape, out_shape)
+
+    def build_hardware(self):
+        luts = []
+        for atom in self._atoms:
+            luts += atom.build_luts()
+        muxcy = CARRY4()  # created even if the early return below is taken
+
+        if not luts:
+            return
+
+        # Connect inputs
+        idx = 0  # used as both the LUT index and the input column index
+        self.input_wires[0][self._atoms[0].shape[0]].connect_to(muxcy.CI)
+
+        for atom in self._atoms:
+            if isinstance(atom, MuxCYAtom2):
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                idx += 1
+            elif isinstance(atom, MuxCYAtom14):
+                # first lut
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                self.input_wires[idx][2].connect_to(luts[idx].I2)
+                self.input_wires[idx][3].connect_to(luts[idx].I3)
+
+                # second lut
+                self.input_wires[idx][0].connect_to(luts[idx+1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx+1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx+1].I2)
+                self.input_wires[idx+1][0].connect_to(luts[idx+1].I3)
+                idx += 2
+            elif isinstance(atom, MuxCYAtom06):
+                # First LUT (atom06_lo): uses all 6 inputs for XOR
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                self.input_wires[idx][2].connect_to(luts[idx].I2)
+                self.input_wires[idx][3].connect_to(luts[idx].I3)
+                self.input_wires[idx][4].connect_to(luts[idx].I4)
+                self.input_wires[idx][5].connect_to(luts[idx].I5)
+
+                # Second LUT (atom06_hi): uses inputs 0-4 for carry propagation
+                # BUGFIX: was connecting to luts[idx] instead of luts[idx+1]
+                self.input_wires[idx][0].connect_to(luts[idx+1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx+1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx+1].I2)
+                self.input_wires[idx][3].connect_to(luts[idx+1].I3)
+                self.input_wires[idx][4].connect_to(luts[idx+1].I4)
+                idx += 2
+            else:
+                raise Exception("Error in construction of MuxCYAtoms")
+
+        # Connect outputs
+        for idx, (lut, di, s, o) in enumerate(zip(luts,
+                                              muxcy.DI.elements,
+                                              muxcy.S.elements,
+                                              muxcy.O.elements)):
+            lut.O6.connect_to(s)    # O6 drives the CARRY4 select input
+            lut.O5.connect_to(di)   # O5 drives the CARRY4 data input
+            o.connect_to(self.output_wires[idx][0])
+
+        muxcy.CO.elements[-1].connect_to(self.output_wires[-1][0])
+        self.instances += luts
+        self.instances.append(muxcy)
+
+class MuxCYAtomCascadeCandidate(CounterCandidate):
+    def extend_to_fit(self, inputs: Shape, outputs: Shape,
+                    compression_goal) -> Counter:
+        def fits_col(idx, height):  # may `height` bits be taken from column?
+            return (height <= inputs[idx] and
+                    inputs[idx] + outputs[idx] - height
+                    + 1 - compression_goal(idx) >= -1)
+        atoms = []
+        i = 0  # LUT positions filled so far; also the next input column
+        while (i < 4):
+            if i == 0:
+                # MuxCYAtom06: 6:3 compressor for column 0 (needs 7 inputs: 6 + carry-in)
+                if fits_col(i, 7):
+                    atoms.append(MuxCYAtom06())
+                    i += 2
+                elif fits_col(i, 5) and fits_col(i+1, 1):
+                    atoms.append(MuxCYAtom14())
+                    i += 2
+                elif fits_col(i, 3):
+                    atoms.append(MuxCYAtom2())
+                    i += 1
+                else:
+                    break
+            elif i < 3:
+                # MuxCYAtom06: 6:3 compressor for middle columns
+                if fits_col(i, 6):
+                    atoms.append(MuxCYAtom06())
+                    i += 2
+                elif fits_col(i, 4) and fits_col(i+1, 1):
+                    atoms.append(MuxCYAtom14())
+                    i += 2
+                elif fits_col(i, 2):
+                    atoms.append(MuxCYAtom2())
+                    i += 1
+                else:
+                    break
+            elif fits_col(i, 2):  # one position left: only MuxCYAtom2 fits
+                atoms.append(MuxCYAtom2())
+                i += 1
+            else:
+                break
+        if i == 4:  # only a cascade filling all 4 positions is returned
+            return MuxCYAtomCascade(atoms)
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/final_adder.py b/src/finn/compressor/src/graph/final_adder.py
new file mode 100644
index 0000000000..d5fb6456ad
--- /dev/null
+++ b/src/finn/compressor/src/graph/final_adder.py
@@ -0,0 +1,364 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Final adder implementations for compressor output stage
+#############################################################################
+
+from abc import abstractmethod, abstractstaticmethod
+from typing import List
+from .nodes import Counter
+from ..utils.shape import Shape
+from .primitives import LUT5, LUT6CY, LOOKAHEAD8, LUT6_2, CARRY4
+
+def FA_sum(a, b, c): return (a ^ b) ^ c  # sum bit of a full adder
+def FA_carry(a, b, c): return (a and b) or (a and c) or (b and c)  # carry bit
+
+def ceildiv(a, b):  # integer division rounded towards +infinity
+    return -(-a // b)
+
+def try_connect(func):  # call func() immediately; an IndexError is dropped
+    try:
+        func()
+    except IndexError:
+        pass  # best effort: callers index wires that may not exist
+
+
+class FinalAdder(Counter):  # common base of the final adder stages
+    @staticmethod
+    @abstractmethod  # replaces abc.abstractstaticmethod (deprecated since 3.3)
+    def compression_goal(col): pass  # subclasses return a value per column index
+
+class VersalTernaryAdder(FinalAdder):  # built from LUT6CY, LUT5 and LOOKAHEAD8
+    @staticmethod
+    def compression_goal(col): return 5 if col == 0 else 3
+
+    def __init__(self, input_shape: Shape):
+        self.input_shape = input_shape
+        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
+        super().__init__(input_shape, output_shape)
+
+    def build_hardware(self):
+        l8s = [LOOKAHEAD8() for _ in range((len(self.input_shape)+8)//8)]
+        luts_chain = [LUT6CY.fromPred(
+            lambda A0,A1,A2,A3,A4,A5: FA_sum(FA_sum(A0,A1,A2), A3, A4),
+            lambda A0,A1,A2,A3,A4,A5: FA_carry(FA_sum(A0,A1,A2), A3, A4),
+            "ternary_adder_chain"
+        ) for _ in range(len(self.input_shape)+1)]  # one LUT per column, plus one
+        luts_top = []  # per column: LUT5 giving the carry of that column's rows 0..2
+        for i in range(len(self.input_shape)):
+            if i % 2 == 0:
+                luts_top.append(LUT5.fromPred(
+                    lambda A0,A1,A2,A3,A4: FA_carry(A0, A1, A4)
+                ))
+                try_connect(lambda: 
+                            self.input_wires[i][0].connect_to(luts_top[-1].I0))
+                try_connect(lambda: 
+                            self.input_wires[i][1].connect_to(luts_top[-1].I1))
+                try_connect(lambda:
+                            self.input_wires[i+1][0].connect_to(
+                                luts_top[-1].I2))
+                try_connect(lambda:
+                            self.input_wires[i+1][1].connect_to(
+                                luts_top[-1].I3))
+                try_connect(lambda: 
+                            self.input_wires[i][2].connect_to(luts_top[-1].I4))
+            else:
+                luts_top.append(LUT5.fromPred(
+                    lambda A0,A1,A2,A3,A4: FA_carry(A2, A3, A4)
+                ))
+                try_connect(lambda: 
+                            self.input_wires[i-1][0].connect_to(
+                                luts_top[-1].I0))
+                try_connect(lambda: 
+                            self.input_wires[i-1][1].connect_to(
+                                luts_top[-1].I1))
+                try_connect(lambda: self.input_wires[i][0].connect_to(
+                    luts_top[-1].I2))
+                try_connect(lambda: self.input_wires[i][1].connect_to(
+                    luts_top[-1].I3))
+                try_connect(lambda: self.input_wires[i][2].connect_to(
+                    luts_top[-1].I4))
+        # Even/odd neighbours share their I0..I3 sources; tag each pair alike.
+        for idx, (left, right) in enumerate(zip(luts_top[0::2], 
+                                                luts_top[1::2])):
+            left.annotate(f"HLUTNM = final_adder_{idx}")
+            right.annotate(f"HLUTNM = final_adder_{idx}")
+        # Rows 3 and 4 of column 0, when present, enter the first chain LUT.
+        try_connect(lambda: 
+                    self.input_wires[0][3].connect_to(luts_chain[0].I3))
+        try_connect(lambda: 
+                    self.input_wires[0][4].connect_to(luts_chain[0].I4))
+        for i, el in enumerate(luts_chain):
+            try_connect(lambda: self.input_wires[i][0].connect_to(el.I0))
+            try_connect(lambda: self.input_wires[i][1].connect_to(el.I1))
+            try_connect(lambda: self.input_wires[i][2].connect_to(el.I2))
+            el.PROP.connect_to(l8s[i//8].p_in_ports[i%8])
+            el.O51.connect_to(self.output_wires[i][0])
+            el.O52.connect_to(l8s[i//8].c_in_ports[i%8+1])
+        # The LUT5 of column k drives I3 of chain LUT k+1.
+        for lb, lt in zip(luts_chain[1:], luts_top):
+            lt.O.connect_to(lb.I3)
+
+        # connect carry-ins between lookahead modules 
+        for prev, next in zip(l8s, l8s[1:]):
+            prev.COUTH.connect_to(next.CIN)
+
+        # cascade
+        for i in range(1, len(luts_chain)):
+            if i % 2 == 0:
+                l8s[(i-1)//8].out_ports[((i-1)%8)//2].connect_to(
+                    luts_chain[i].I4)
+            else:
+                luts_chain[i-1].O52.connect_to(luts_chain[i].I4)
+        # Topmost output bit: a lookahead output or the last chain LUT's O52.
+        if len(luts_chain) % 2 == 0:
+            l8s[(len(luts_chain)-1)//8].out_ports[len(luts_chain)%8//2-1]\
+                .connect_to(self.output_wires[len(luts_chain)][0])
+        else:
+            luts_chain[-1].O52.connect_to(
+                self.output_wires[len(luts_chain)][0])
+        self.instances += luts_chain + luts_top + l8s
+
+class QuaternaryAdder(FinalAdder):  # two stacked LUT6CY/LOOKAHEAD8 chains
+    @staticmethod
+    def compression_goal(col): return 5 if col <= 1 else 4
+
+    def __init__(self, input_shape: Shape):
+        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
+        super().__init__(input_shape, output_shape)
+
+    def build_hardware(self):
+        ## Find the limit up to which the quaternary adder is needed. 
+        # We construct a two-input adder after this.
+        height_4_until = len(self.input_wires)
+        tail_length = 0
+        for idx, col in reversed(list(enumerate(self.input_wires))):
+            if len(col) > 2:
+                break
+            else:
+                height_4_until = idx
+                tail_length += 1
+        
+        # If tail_length==1, the quaternary adder must not be reduced, 
+        # as there would be no savings.
+        if (tail_length == 1):
+            height_4_until += 1
+            tail_length = 0
+
+        # Construct necessary hardware
+        luts_top: List[LUT6CY] = []
+        luts_btm: List[LUT6CY] = []
+
+        for i in range(0, height_4_until):
+            luts_top.append(
+                LUT6CY.fromPred(
+                    lambda A0,A1,A2,A3,A4,_: FA_sum(
+                        FA_sum(A0, A1, A2), A3, A4), # S
+                    lambda A0,A1,A2,A3,A4,_: FA_carry(
+                        FA_sum(A0, A1, A2), A3, A4), # ct
+                    "final_adder_top"
+                )
+            )
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0,A1,A2,A3,A4,_: FA_sum(
+                        FA_carry(A0, A1, A2), A3, A4), # out
+                    lambda A0,A1,A2,A3,A4,_: FA_carry(
+                        FA_carry(A0, A1, A2), A3, A4), #cb
+                    "final_adder_btm"
+                )
+            )
+        if (tail_length):
+            luts_top.append(
+                LUT6CY.fromPred(
+                    lambda A0,A1,A2,A3,A4,_: FA_sum(A0, A1, A4), # out
+                    lambda A0,A1,A2,A3,A4,_: FA_carry(A0, A1, A4), # c_btm
+                    "final_adder_top_end"
+                )
+            )
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0,A1,A2,A3,A4,_: FA_sum(FA_sum(A0, A1, False), 
+                                                    A3, A4), # out
+                    lambda A0,A1,A2,A3,A4,_: FA_carry(FA_sum(A0, A1, False),
+                                                      A3, A4),  # c_btm
+                    "final_adder_btm_start_two_input_chain"
+                )
+            )
+        for i in range(tail_length-1):
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0,A1,A2,A3,A4,_: 
+                        FA_sum(FA_carry(A0, A1, False), 
+                        FA_sum(A2, A3, False), A4), # out
+                    lambda A0,A1,A2,A3,A4,_: 
+                        FA_carry(FA_carry(A0, A1, False), 
+                        FA_sum(A2, A3, False), A4), # cb
+                    "final_adder_btm_two_input_chain"
+                )
+            )
+
+        # One LOOKAHEAD8 per started group of 8 LUTs, separately per chain.
+        l8s_top = []
+        l8s_btm = []
+        for _ in range(ceildiv(len(luts_top), 8)):
+            l8s_top.append(LOOKAHEAD8())
+        for _ in range(ceildiv(len(luts_btm), 8)):
+            l8s_btm.append(LOOKAHEAD8())
+
+        # Collect relevant input and output signals
+        for i in range(len(luts_top)):
+            luts_top[i].O52.connect_to(l8s_top[i//8].c_in_ports[i%8+1])
+            luts_top[i].PROP.connect_to(l8s_top[i//8].p_in_ports[i%8])
+            
+        for i in range(len(luts_btm)):
+            luts_btm[i].O52.connect_to(l8s_btm[i//8].c_in_ports[i%8+1])
+            luts_btm[i].PROP.connect_to(l8s_btm[i//8].p_in_ports[i%8])
+        # Carry leaving LUT i: its own O52 for even i, a lookahead output for odd i.
+        carries_top = []
+        carries_btm = []
+        for i in range(0, len(luts_top)):
+            if i % 2 == 0:
+                carries_top.append(luts_top[i].O52)
+            if i % 2 == 1:
+                carries_top.append(l8s_top[i//8].out_ports[i%8//2])
+        for i in range(0, len(luts_btm)):
+            if i % 2 == 0:
+                carries_btm.append(luts_btm[i].O52)
+            if i % 2 == 1:
+                carries_btm.append(l8s_btm[i//8].out_ports[i%8//2])
+        
+        for i in range(0, len(luts_top)-1):
+            carries_top[i].connect_to(luts_top[i+1].I4)
+        for i in range(0, len(luts_btm)-1):
+            carries_btm[i].connect_to(luts_btm[i+1].I4)
+
+        # connect carry-ins between lookahead modules 
+        def chain_l8(l8s):
+            for prev, next in zip(l8s, l8s[1:]):
+                prev.COUTH.connect_to(next.CIN)
+                
+        chain_l8(l8s_top)
+        chain_l8(l8s_btm)
+
+        # connect carry-in to first lut and lookahead module
+        try_connect(lambda: self.input_wires[0][4].connect_to(luts_top[0].I4))
+        try_connect(lambda: self.input_wires[0][4].connect_to(l8s_top[0].CIN))
+        
+        try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4))
+        try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN))
+
+        # downwards connection
+        for t, d in zip(luts_top[1:], luts_btm):
+            t.O51.connect_to(d.I3)
+        last_top = len(carries_top)-1
+        carries_top[last_top].connect_to(luts_btm[last_top].I3)
+        
+        for idx, (lb, lt) in enumerate(zip(luts_btm, 
+                                           luts_top[:height_4_until])):
+            for el in [lb, lt]:
+                try_connect(lambda: self.input_wires[idx][0].connect_to(el.I0))
+                try_connect(lambda: self.input_wires[idx][1].connect_to(el.I1))
+                try_connect(lambda: self.input_wires[idx][2].connect_to(el.I2))
+
+            try_connect(lambda: self.input_wires[idx][3].connect_to(lt.I3))
+
+        if tail_length:
+            lt = luts_top[height_4_until]
+            lb = luts_btm[height_4_until]
+
+            try_connect(lambda:
+                        self.input_wires[height_4_until][0].connect_to(lt.I0))
+            try_connect(lambda:
+                        self.input_wires[height_4_until][1].connect_to(lt.I1))
+
+            try_connect(lambda:
+                        self.input_wires[height_4_until+1][0].connect_to(
+                            lb.I0))
+            try_connect(lambda:
+                        self.input_wires[height_4_until+1][1].connect_to(
+                            lb.I1))
+
+        for idx, lb in enumerate(luts_btm[height_4_until+1:]):
+            try_connect(lambda: 
+                        self.input_wires[idx+height_4_until+1][0].connect_to(
+                            lb.I0))
+            try_connect(lambda: 
+                        self.input_wires[idx+height_4_until+1][1].connect_to(
+                            lb.I1))
+            try_connect(lambda: 
+                        self.input_wires[idx+height_4_until+2][0].connect_to(
+                            lb.I2))
+            try_connect(lambda: 
+                        self.input_wires[idx+height_4_until+2][1].connect_to(
+                            lb.I3))
+
+        def connect_carry_to_lut(carries, luts):
+            for carry, lut in zip(carries, luts[1:]):
+                carry.connect_to(lut.I4)
+        # NOTE(review): repeats the carry -> I4 links made further up; confirm.
+        connect_carry_to_lut(carries_top, luts_top)
+        connect_carry_to_lut(carries_btm, luts_btm)
+        luts_top[0].O51.connect_to(self.output_wires[0][0])
+
+        for idx, lb in enumerate(luts_btm):
+            lb.O51.connect_to(self.output_wires[idx+1][0])
+
+        carries_btm[len(luts_btm)-1].connect_to(
+            self.output_wires[len(luts_btm)+1][0])
+        # NOTE(review): this I3 was already driven by carries_top[last_top]; confirm.
+        luts_top[-1].O52.connect_to(luts_btm[len(luts_top)-1].I3)
+
+        self.instances += luts_top + luts_btm + l8s_btm + l8s_top
+
+class MuxCYTernaryAdder(FinalAdder):  # final adder built from LUT6_2 and CARRY4
+    @staticmethod
+    def compression_goal(col): return 5 if col == 0 else 3
+
+    def __init__(self, input_shape: Shape):
+        # Counter.__init__ stores input_shape on self before build_hardware().
+        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
+        super().__init__(input_shape, output_shape)
+
+    def build_hardware(self):
+        luts = [LUT6_2.fromPred(
+            lambda A0,A1,A2,A3,A4,A5: FA_carry(A0,A1,A2),
+            lambda A0,A1,A2,A3,A4,A5: FA_sum(A0,A1,A2) ^ A3
+        ) for _ in range(len(self.input_shape)+1)]  # one LUT per column, plus one
+        c4s = [CARRY4() for _ in range(0, len(self.input_shape)+1, 4)]
+        dis = [el for c4 in c4s for el in c4.DI.elements]  # flattened over all CARRY4s
+        ss = [el for c4 in c4s for el in c4.S.elements]
+        cis = [c4.CI for c4 in c4s]
+        os = [el for c4 in c4s for el in c4.O.elements]
+        cos = [el for c4 in c4s for el in c4.CO.elements]
+
+        ## Connect CARRY4 together
+        for c4p, c4n in zip(c4s, c4s[1:]):
+            c4p.CO.elements[-1].connect_to(c4n.CI)
+
+        ## Connect inputs
+        # Only connect up to the number of available input columns
+        for idx, lut in enumerate(luts[:len(self.input_wires)]):
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][0].connect_to(lut.I0))
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][1].connect_to(lut.I1))
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][2].connect_to(lut.I2))
+        try_connect(lambda: self.input_wires[0][3].connect_to(luts[0].I3))
+        try_connect(lambda: self.input_wires[0][3].connect_to(dis[0]))
+        try_connect(lambda: self.input_wires[0][4].connect_to(cis[0]))
+
+        ## Second carry connection
+        for p, n, n_di in zip(luts, luts[1:], dis[1:]):
+            p.O5.connect_to(n.I3)  # O5 of column k feeds column k+1 ...
+            p.O5.connect_to(n_di)  # ... and the matching CARRY4 DI bit
+
+        ## Connect outputs
+        for lut, s in zip(luts, ss):
+            lut.O6.connect_to(s)
+        
+        for idx, o in enumerate(os[:len(luts)]):
+            o.connect_to(self.output_wires[idx][0])
+
+        cos[len(luts)-1].connect_to(self.output_wires[len(luts)][0])
+        self.instances += luts + c4s
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/nodes.py b/src/finn/compressor/src/graph/nodes.py
new file mode 100644
index 0000000000..647129e70d
--- /dev/null
+++ b/src/finn/compressor/src/graph/nodes.py
@@ -0,0 +1,393 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor graph node definitions and delay estimation
+#############################################################################
+
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict
+from ..utils.shape import Shape
+
+"""
+Convention: LSB at index 0.
+"""
+
+class Node(ABC):  # root of the graph classes; accept() does nothing by default
+    def accept(self, visitor) -> None: return None
+
+class Connectable(Node):  # graph node that can drive, and be driven by, others
+    target: list[Connectable]  # every node this one drives
+    source: Connectable  # the driving node; None until something connects
+
+    def __init__(self):
+        self.target = []
+        self.source = None
+
+    def connect_to(self, target):  # a later driver replaces target.source
+        assert isinstance(target, Connectable), \
+            "Target has to be of type Connectable!"
+        self.target.append(target)
+        target.source = self
+
+    @property
+    def has_target(self): return bool(self.target)
+    
+    @property
+    def has_source(self): return self.source is not None
+
+class Constant(Connectable):  # constant driver; the value is stored as text
+    def __init__(self, value):
+        self.value = str(value)
+        super().__init__()
+
+class Wire(Connectable):  # connectable net with an optional desired name
+    def __init__(self, desired_name = None):
+        super().__init__()
+        self.desired_name, self.prefix = desired_name, ""
+
+    def accept(self, visitor) -> None: visitor.visit_wire(self)
+
+    # The direction keywords are stored including their trailing space.
+    def set_to_module_input(self): self.prefix = "input "
+    def set_to_module_output(self): self.prefix = "output "
+
+class Logic(Wire):  # Wire that additionally carries rst / en / init settings
+    def __init__(self, *, rst: Connectable = None,
+                 en: Connectable = None, init: int = None):
+        super().__init__()
+        self.rst, self.en, self.init = rst, en, init
+
+    def accept(self, visitor):
+        # The visitor's result is handed back to the caller.
+        return visitor.visit_logic(self)
+
+class BlackboxVecElement(Connectable):
+    """Connectable used as one element of a BlackboxVec."""
+
+class BlackboxVec(Node, ABC):  # named vector port holding `width` elements
+    def __init__(self, name, width):
+        super().__init__()
+        self.name = name
+        self.elements = [BlackboxVecElement() for _ in range(width)]
+
+class BlackboxInputVec(BlackboxVec):  # vector port visited as an input vector
+    def accept(self, visitor) -> None: visitor.visit_blackbox_input_vec(self)
+
+class BlackboxOutputVec(BlackboxVec):  # vector port visited as an output vector
+    def accept(self, visitor) -> None: visitor.visit_blackbox_output_vec(self)
+
+class BlackboxPort(Connectable):  # named scalar port of a Blackbox
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+
+    @property
+    @abstractmethod
+    def connected(self): ...  # whether the port is hooked up
+
+    @property
+    @abstractmethod
+    def wire(self): ...  # what the port is hooked up to
+
+class BlackboxInput(BlackboxPort):  # port that can be driven but cannot drive
+    def __init__(self, name):
+        super().__init__(name)
+
+    def accept(self, visitor) -> None: visitor.visit_blackbox_input(self)
+
+    def connect_to(self, target):
+        raise RuntimeError("Blackbox Input cannot act as output.")
+
+    @property
+    def wire(self): return self.source  # the node driving this input
+
+    @property
+    def connected(self): return self.has_source
+
+class BlackboxOutput(BlackboxPort):  # port that drives other connectables
+    def __init__(self, name):
+        super().__init__(name)
+
+    def accept(self, visitor) -> None: visitor.visit_blackbox_output(self)
+
+    @property
+    def wire(self): return self.target  # list of everything this output drives
+
+    @property
+    def connected(self): return self.has_target
+
+class Blackbox(Node):  # instance of a named module with ports and parameters
+    @abstractmethod
+    def __init__(self, module_name: str, in_ports: Tuple[BlackboxInput],
+                 out_ports: Tuple[BlackboxOutput], parameters: Dict[str, str]):
+        self.annotations = []
+        self.module_name, self.parameters = module_name, parameters
+        self.in_ports, self.out_ports = in_ports, out_ports
+
+        # Expose every port as an instance attribute named after the port.
+        attrs = vars(self)
+        for port in in_ports + out_ports:
+            attrs[port.name] = port
+
+    def accept(self, visitor):
+        visitor.visit_blackbox(self)
+
+    def annotate(self, annotation: str):
+        self.annotations.append(annotation)
+
+class Module(Node):  # node that owns a list of inner instances
+    def __init__(self):
+        super().__init__()
+        self.instances = []  # All inner instances
+
+    @property
+    @abstractmethod
+    def inputs(self): ...
+
+    @property
+    @abstractmethod
+    def outputs(self): ...
+
+class Counter(Module):  # cell taking input_shape bits and producing output_shape bits
+    def __init__(self, input_shape: Shape, output_shape: Shape):
+        super().__init__()
+        self.input_shape, self.output_shape = input_shape, output_shape
+        self.input_wires = self._build_wires(input_shape)
+        self.output_wires = self._build_wires(output_shape)
+        # Port wires enter `instances` before build_hardware() runs.
+        self.instances += self.inputs + self.outputs
+
+        self.build_hardware()
+
+    def accept(self, visitor) -> None: visitor.visit_counter(self)
+
+    @abstractmethod
+    def build_hardware(self): ...
+
+    def _build_wires(self, shape: Shape):
+        # One tuple of fresh Wire objects per column of `shape`.
+        return tuple(tuple(Wire() for _ in range(height)) for height in shape)
+
+    @property
+    def inputs(self): return [w for column in self.input_wires for w in column]
+
+    @property
+    def outputs(self): return [w for column in self.output_wires for w in column]
+
+    @property
+    def luts(self) -> List[LUT]:
+        return [node for node in self.instances if isinstance(node, LUT)]
+
+    @property
+    def efficiency(self) -> float:
+        # Removed bits per unit of LUT size; 0 when no LUTs and no bits removed.
+        luts = self.luts
+        saved = sum(self.input_shape) - sum(self.output_shape)
+        if not luts and saved == 0:
+            return 0
+        return saved / sum(lut.size for lut in luts)
+
+    @property
+    def strength(self) -> float:
+        total_in, total_out = sum(self.input_shape), sum(self.output_shape)
+        return total_in / total_out
+    
+class GateAbsorptionCounter(Counter):  # Counter with a second, complementary input set
+    def __init__(self, input_shape: Shape, output_shape: Shape):
+        # Has to exist before Counter.__init__, which reads self.inputs.
+        self.input_wires_complementary = self._build_wires(input_shape)
+        super().__init__(input_shape, output_shape)
+
+    def accept(self, visitor) -> None: visitor.visit_gate_absorption_counter(self)
+
+    @property
+    def inputs(self):
+        columns = self.input_wires + self.input_wires_complementary
+        return [wire for column in columns for wire in column]
+
+class Passthrough(Counter):  # single-bit cell whose outputs alias its inputs
+    def __init__(self):
+        super().__init__(Shape([1]), Shape([1]))
+
+    def build_hardware(self):
+        wires = self.output_wires = self.input_wires
+        self.instances = [wire for column in wires for wire in column]
+
+class Stage(Node):
+    input_shape: Shape
+    output_shape: Shape
+    input_wires: Bitmatrix[Wire]
+    output_wires: Bitmatrix[Wire]
+
+    def connect_to(self, other):
+        # Pair up output bits with the other stage's input bits by position.
+        def link(src_matrix, dst_matrix):
+            for src_col, dst_col in zip(src_matrix, dst_matrix):
+                for src, dst in zip(src_col, dst_col):
+                    src.connect_to(dst)
+
+        link(self.output_wires, other.input_wires)
+        # TODO: maybe subclass instead?
+        if "output_wires_complementary" in vars(self):
+            link(self.output_wires_complementary, other.input_wires_complementary)
+
+class InputStage(Stage):  # entry stage: output wires are the input wires themselves
+    def __init__(self, shape: Shape, gates: bool = False):
+        self.gates = gates
+        self.input_shape = self.output_shape = shape
+        self.input_wires = self.output_wires = Bitmatrix(shape)
+        if gates:
+            # Second matrix of the same shape, shared by input and output side.
+            complementary = Bitmatrix(shape)
+            self.input_wires_complementary = complementary
+            self.output_wires_complementary = complementary
+
+    def accept(self, visitor) -> None:
+        visitor.visit_input_stage(self)
+
+class PipelineStage(Stage):  # places one Logic node between each input/output bit
+    def __init__(self, shape: Shape):
+        self.input_shape = self.output_shape = shape
+        self.input_wires, self.output_wires = Bitmatrix(shape), Bitmatrix(shape)
+        self.instances = []
+        for in_col, out_col in zip(self.input_wires, self.output_wires):
+            for bit_in, bit_out in zip(in_col, out_col):
+                # bit_in -> Logic -> bit_out
+                node = Logic()
+                bit_in.connect_to(node)
+                node.connect_to(bit_out)
+                self.instances.append(node)
+
+    def accept(self, visitor) -> None:
+        visitor.visit_pipeline_stage(self)
+
+class CompressionStage(Stage):  # stage assembled from counters at column offsets
+    def __init__(self):
+        self.input_wires, self.output_wires = Bitmatrix(), Bitmatrix()
+        self.counters_with_shifts = []  # (counter, shift) pairs, insertion order
+
+    def accept(self, visitor) -> None: visitor.visit_compression_stage(self)
+
+    @property
+    def input_shape(self): return self._shape(lambda ctr: ctr.input_shape)
+
+    @property
+    def output_shape(self): return self._shape(lambda ctr: ctr.output_shape)
+
+    def _shape(self, func):
+        # Add up func(counter), shifted by the counter's offset, over all counters.
+        total = Shape(())
+        for counter, offset in self.counters_with_shifts:
+            total = total + (func(counter) << offset)
+        return total
+
+    def append_counter(self, counter: Counter, shift: int):
+        # Stage column index = column index inside the counter + shift.
+        self.counters_with_shifts.append((counter, shift))
+        for col_idx, column in enumerate(counter.input_wires):
+            for wire in column:
+                self.input_wires.add_output(wire, col_idx + shift)
+        for col_idx, column in enumerate(counter.output_wires):
+            for wire in column:
+                self.output_wires.add_input(wire, col_idx + shift)
+
+class GateAbsorbedStage(CompressionStage):  # also collects complementary inputs
+    def __init__(self):
+        super().__init__()
+        self.input_wires_complementary = Bitmatrix()
+
+    def accept(self, visitor) -> None: visitor.visit_gate_absorbed_stage(self)
+
+    def append_counter(self, counter: GateAbsorptionCounter, shift: int):
+        super().append_counter(counter, shift)
+        complementary = self.input_wires_complementary
+        for col_idx, column in enumerate(counter.input_wires_complementary):
+            for wire in column:
+                complementary.add_output(wire, col_idx + shift)
+
+class Compressor(Node):  # ordered list of stages under one module name
+    def __init__(self, name):
+        self.module_name = name
+        self.stages, self.io = [], []
+
+    def accept(self, visitor) -> None: visitor.visit_compressor(self)
+
+    @property
+    def input_shape(self): return self.stages[0].input_shape
+
+    @property
+    def output_shape(self): return self.stages[-1].output_shape
+
+    @property
+    def delay(self):
+        # Adds one per PipelineStage match and one per AccumulatorStage match.
+        total = 0
+        for stage in self.stages:
+            total += 1 if isinstance(stage, PipelineStage) else 0
+            # Function-level import, executed per inspected stage
+            # (presumably avoids a circular import with .accumulator).
+            from .accumulator import AccumulatorStage
+            total += 1 if isinstance(stage, AccumulatorStage) else 0
+        return total
+
+class BitmatrixElement(Connectable):  # one bit of a Bitmatrix at (column, row)
+    def __init__(self, vector, idx_x, idx_y):
+        super().__init__()
+        self.vector, self.idx_2d = vector, (idx_x, idx_y)
+
+    def accept(self, visitor): return None
+
+    @property
+    def lin_idx(self):
+        column, row = self.idx_2d  # flat index: sizes of earlier columns + row
+        return sum(self.vector.shape[:column]) + row
+
+class Bitmatrix(Node):  # list of columns, each a list of BitmatrixElement
+    def __init__(self, shape : Shape = Shape(), name: str = None):
+        super().__init__()
+        self._name, self.prefix = name, ""
+        self.connectables = []
+        for col_idx, height in enumerate(shape):
+            column = [BitmatrixElement(self, col_idx, row) for row in range(height)]
+            self.connectables.append(column)
+
+    def __len__(self): return len(self.connectables)
+    def __iter__(self): return iter(self.connectables)
+    def __getitem__(self, sel): return self.connectables[sel]
+    def total_size(self): return sum(len(column) for column in self.connectables)
+    def set_to_module_input(self): self.prefix = "input "
+    def set_to_module_output(self): self.prefix = "output "
+
+    @property
+    def shape(self): return Shape([len(column) for column in self.connectables])
+
+    def add_output(self, el, col_idx):
+        # New matrix element drives `el`.
+        self._append_wire(el, col_idx).connect_to(el)
+
+    def add_input(self, el, col_idx):
+        # `el` drives the new matrix element.
+        el.connect_to(self._append_wire(el, col_idx))
+
+    def _append_wire(self, el, col_idx):
+        cols = self.connectables  # padded with empty columns up to col_idx
+        cols.extend([] for _ in range(col_idx + 1 - len(cols)))
+        element = BitmatrixElement(self, col_idx, len(cols[col_idx]))
+        cols[col_idx].append(element)
+        return element
+
+    def accept(self, visitor) -> None: visitor.visit_bitmatrix(self)
+
+class LUT(Blackbox):  # Blackbox with an INIT parameter plus size / desired_name
+    @abstractmethod
+    def __init__(self, module_name, init_code: str,
+                 in_ports: Tuple[BlackboxInput],
+                 out_ports: Tuple[BlackboxOutput],
+                 *,
+                 size, desired_name = "lut"):
+        parameters = {"INIT": init_code}
+        self.size, self.desired_name = size, desired_name
+        super().__init__(module_name, in_ports, out_ports, parameters)
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/primitives.py b/src/finn/compressor/src/graph/primitives.py
new file mode 100644
index 0000000000..1cf36507a3
--- /dev/null
+++ b/src/finn/compressor/src/graph/primitives.py
@@ -0,0 +1,113 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FPGA primitive definitions for compressor (LUTs, carry chains, etc.)
+#############################################################################
+
+from ctypes import c_uint64, c_uint32
+from .nodes import BlackboxInput, BlackboxOutput, Blackbox, LUT, Constant
+from .nodes import BlackboxInputVec, BlackboxOutputVec
+
+class LUT2(LUT):  # 2-input LUT described by a 4-bit INIT string
+    @classmethod
+    def fromPred(cls, predO2, desired_name = "lut2"):
+        # INIT bit i = predO2 on the two bits of i; only 4 entries exist.
+        res = 0
+        for i in range(4):
+            inputs = [bool(i & (1 << shmt)) for shmt in range(2)]
+            res = res | (int(predO2(*inputs)) << i)
+        init_str = f"""4'h{"{:_x}".format(c_uint32(res).value)}"""
+        return LUT2(init_str, desired_name)
+    
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(2)]
+        out_ports = [BlackboxOutput("O")]
+        super().__init__("LUT2", init_code, in_ports, out_ports,
+                         desired_name=desired_name, size=0.5)
+
+class LUT5(LUT):  # 5-input LUT; INIT bit `row` = predicate on the bits of `row`
+    @classmethod
+    def fromPred(cls, predO5, desired_name = "lut5"):
+        table = 0
+        for row in range(32):
+            args = [bool((row >> bit) & 1) for bit in range(5)]
+            table |= int(predO5(*args)) << row
+        init_code = f"""32'h{"{:_x}".format(c_uint32(table).value)}"""
+        return LUT5(init_code, desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        ins = [BlackboxInput(f"I{el}") for el in range(5)]
+        outs = [BlackboxOutput("O")]
+        super().__init__("LUT5", init_code, ins, outs,
+                         size=0.5, desired_name=desired_name)
+
+class LUT6(LUT):  # 6-input LUT; INIT bit `row` = predicate on the bits of `row`
+    @classmethod
+    def fromPred(cls, predO6, desired_name = "lut6"):
+        table = 0
+        for row in range(64):
+            args = [bool((row >> bit) & 1) for bit in range(6)]
+            table |= int(predO6(*args)) << row
+        init_code = f"""64'h{"{:_x}".format(c_uint64(table).value)}"""
+        return LUT6(init_code, desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        ins = [BlackboxInput(f"I{el}") for el in range(6)]
+        outs = [BlackboxOutput("O")]
+        super().__init__("LUT6", init_code, ins, outs,
+                         size=1, desired_name=desired_name)
+
+def split_lut_from_pred(predO5, predO6):
+    """Return a 64-bit INIT: predO5 in bits 31..0, predO6 in 63..32 (input 5 high)."""
+    res = 0
+    for i in range(32, 64):
+        inputs = [bool(i & (1 << shmt)) for shmt in range(6)]
+        res = res | (int(predO5(*inputs)) << (i-32)) | (int(predO6(*inputs)) << (i))
+    return f"""64'h{"{:_x}".format(c_uint64(res).value)}"""
+
+class LUT6_2(LUT):  # dual-output LUT (O6 and O5) with I5 tied to a constant
+    @classmethod
+    def fromPred(cls, predO5, predO6, desired_name = "lut6_2"):
+        return LUT6_2(split_lut_from_pred(predO5, predO6), desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        ins = [BlackboxInput(f"I{el}") for el in range(6)]
+        outs = [BlackboxOutput("O6"), BlackboxOutput("O5")]
+        super().__init__("LUT6_2", init_code, ins, outs,
+                         size=1, desired_name=desired_name)
+        Constant("1'b1").connect_to(self.I5)  # I5 is driven by constant 1'b1
+
+class LUT6CY(LUT):  # LUT with inputs I0..I4 and outputs O51, O52, PROP
+    @classmethod
+    def fromPred(cls, predO51, predO52, desired_name = "lut6cy"):
+        return LUT6CY(split_lut_from_pred(predO51, predO52), desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        ins = [BlackboxInput(f"I{el}") for el in range(5)]
+        outs = [BlackboxOutput(f"O5{el+1}") for el in range(2)]
+        outs += [BlackboxOutput("PROP")]
+        super().__init__("LUT6CY", init_code, ins, outs,
+                         size=1, desired_name=desired_name)
+
+class LOOKAHEAD8(Blackbox):  # 9 carry inputs, 8 propagate inputs, 4 carry outputs
+    def __init__(self):
+        carry_names = ["CIN", "CYA", "CYB", "CYC", "CYD", "CYE", "CYF", "CYG", "CYH"]
+        prop_names = ["PROPA", "PROPB", "PROPC", "PROPD", "PROPE", "PROPF", "PROPG",
+                      "PROPH"]
+        out_names = ["COUTB", "COUTD", "COUTF", "COUTH"]
+        params = {"LOOKB" : "\"TRUE\"", "LOOKD" : "\"TRUE\"",
+                  "LOOKF" : "\"TRUE\"", "LOOKH" : "\"TRUE\""}
+
+        self.c_in_ports = [BlackboxInput(name) for name in carry_names]
+        self.p_in_ports = [BlackboxInput(name) for name in prop_names]
+        outs = [BlackboxOutput(name) for name in out_names]
+        super().__init__("LOOKAHEAD8", self.c_in_ports + self.p_in_ports, outs, params)
+
+class CARRY4(Blackbox):  # ports: DI[4], S[4], CI in; O[4], CO[4] out
+    def __init__(self):
+        data_in, select = BlackboxInputVec("DI", 4), BlackboxInputVec("S", 4)
+        ins = [data_in, select, BlackboxInput("CI")]
+        outs = [BlackboxOutputVec("O", 4), BlackboxOutputVec("CO", 4)]
+        super().__init__("CARRY4", ins, outs, {})
\ No newline at end of file
diff --git a/src/finn/compressor/src/graph/visitor.py b/src/finn/compressor/src/graph/visitor.py
new file mode 100644
index 0000000000..5be1ea118f
--- /dev/null
+++ b/src/finn/compressor/src/graph/visitor.py
@@ -0,0 +1,45 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Visitor pattern for compressor graph traversal
+#############################################################################
+
+from abc import ABC
+from .nodes import Counter, CompressionStage, Compressor, InputStage, PipelineStage
+from .nodes import Logic, Bitmatrix, GateAbsorbedStage, GateAbsorptionCounter
+from .nodes import Blackbox
+from .primitives import BlackboxInputVec, BlackboxOutputVec, BlackboxInput
+from .primitives import BlackboxOutput
+
+class Visitor(ABC):  # Visitor interface: each hook raises NotImplementedError until a subclass overrides it
+    def visit_compressor(self, c: Compressor): raise NotImplementedError
+
+    def visit_input_stage(self, s: InputStage): raise NotImplementedError
+    # NOTE(review): VerilogGenerator defines visit_gate_absorbed_stage, not the name below - confirm which one accept() dispatches to.
+    def visit_gate_absorption_stage(self, s: GateAbsorbedStage):
+        raise NotImplementedError
+
+    def visit_pipeline_stage(self, s: PipelineStage): raise NotImplementedError
+
+    def visit_compression_stage(self, s: CompressionStage): raise NotImplementedError
+
+    def visit_counter(self, c: Counter): raise NotImplementedError
+
+    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
+        raise NotImplementedError
+
+    def visit_blackbox(self, b: Blackbox): raise NotImplementedError
+
+    def visit_blackbox_input(self, b: BlackboxInput): raise NotImplementedError
+
+    def visit_blackbox_output(self, b: BlackboxOutput): raise NotImplementedError
+
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec): raise NotImplementedError
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec): raise NotImplementedError
+
+    def visit_logic(self, lgc: Logic): raise NotImplementedError
+
+    def visit_bitmatrix(self, b: Bitmatrix): raise NotImplementedError
\ No newline at end of file
diff --git a/src/finn/compressor/src/main.py b/src/finn/compressor/src/main.py
new file mode 100644
index 0000000000..ad3331cb14
--- /dev/null
+++ b/src/finn/compressor/src/main.py
@@ -0,0 +1,169 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Main compressor tree generation entry point
+#############################################################################
+
+import time
+import argparse
+from .target import Target, Versal, SevenSeries, UltraScale
+from .utils.shape import Shape
+from .passes.compressor_constructor import CompressorConstructor
+from .passes.cost_estimator import CostEstimator
+from .passes.printer import CompressorPrinter
+from .passes.emitter import VerilogGenerator
+from .passes.wire_inserter import WireInserter
+from .passes.io_annotator import IOAnnotator
+from .passes.lut_placer import LUTPlacer
+from .tests.test_gen import generate_test
+from .tests.tester import tester
+from typing import Optional, List
+
+def parse_cli():
+    """Parse the command line and call generate_compressor() with the result.
+
+    A malformed shape or constant prints a message and exits with status -1.
+    """
+    parser = argparse.ArgumentParser(
+        prog="Compressor Generator",
+        description="Generate a Compressor Tree for a given input."
+    )
+    parser.add_argument('-o', '--output', default="../gen/out.sv",
+                        help="Path to store the compressor at.")
+    parser.add_argument('-s', '--shape', required=True, help="Input shape.")
+    parser.add_argument('-a', '--accumulate', action='store_true',
+                        help="Enable accumulation.")
+    parser.add_argument('-w', '--accumulator_width',
+                        help="Accumulator width [default: Reduced input shape].")
+    parser.add_argument('-g', '--gates', default=None,
+                        help="Inline 2-input gates into the compressor. LSB is left. "
+                        "Example: 8,3")
+    parser.add_argument('-t', '--target', default="Versal", help="Target FPGA generation.",
+                        choices=["Versal", "7-Series", "UltraScale"])
+    parser.add_argument('--test', action="store_true",
+                        help="Test the generated compressor using Vivado XSim.")
+    parser.add_argument('-n', '--name', default="comp",
+                        help="Name of the generated Systemverilog module.")
+    parser.add_argument('-p', '--pipeline_every', default=None,
+                        help="Insert Pipeline registers every n stages. Default: "
+                        "Purely combinatorial.")
+    parser.add_argument('-c', '--constant', default=[], help="Add a constant binary "
+                        "number input. Example: 1011")
+    args = parser.parse_args()
+
+    try:
+        shape = Shape(int(el) for el in args.shape.split(','))
+    except (ValueError, TypeError):
+        print("Improperly defined shape.")
+        raise SystemExit(-1)  # exit() is only injected by the site module
+
+    gates = []
+    if args.gates:
+        assert len(args.gates) == sum(shape), \
+            "Length of shape and gate specification do not match."
+        gates_lin = list(args.gates)  # one character per input bit, consumed column by column
+        for col in shape:
+            gates.append([gates_lin.pop(0) for _ in range(col)])
+
+    constants = []
+    for char in args.constant:  # every character of the option is one binary digit
+        try:
+            constants.append(int(char, 2))
+        except ValueError:
+            print("Improperly defined constant.")
+            raise SystemExit(-1)
+    if args.target == "Versal":
+        target = Versal()
+    elif args.target == "7-Series":
+        target = SevenSeries()
+    elif args.target == "UltraScale":
+        target = UltraScale()
+    else:
+        raise ValueError("Target not currently supported.")
+
+    generate_compressor(
+        target,
+        shape,
+        args.name,
+        int(args.pipeline_every) if args.pipeline_every else None,
+        args.accumulate,
+        int(args.accumulator_width) if args.accumulator_width else None,
+        gates,
+        constants,
+        args.output,
+        args.test
+    )
+
+def generate_compressor(
+        target: Target,
+        shape: Shape,
+        name: str,
+        comb_depth: Optional[int],
+        accumulate: bool,
+        accumulator_width: Optional[int],
+        gates: List[List[str]],
+        constants: List[int], # Each element is a binary number digit.
+        path: str,
+        test: bool,
+        enable: bool = False):
+    """Build a compressor, run the graph passes and write the SystemVerilog.
+
+    The module text, preceded by a comment header, is written to ``path``; with
+    ``test`` set, a testbench is written to ../gen/test.sv and handed to tester().
+    Prints the elapsed generation time and returns ``c.delay`` of the compressor.
+    """
+    start_time = time.time()
+    constructor = CompressorConstructor()
+    c = constructor(target.counter_candidates,
+                    target.absorbing_counter_candidates,
+                    target.final_adder,
+                    shape,
+                    name,
+                    comb_depth=comb_depth,
+                    accumulate=accumulate,
+                    accumulator_width=accumulator_width,
+                    constants=constants,
+                    gates=gates,
+                    enable=enable)
+
+    c.accept(LUTPlacer())
+    c.accept(WireInserter())
+    c.accept(IOAnnotator())
+
+    cost = CostEstimator()
+    c.accept(cost)
+
+    emitter = VerilogGenerator()
+    c.accept(emitter)
+    with open(path, 'w') as f:
+        withprefix = f"""// Adder generated by the Python Compressor Generator
+// Input shape: {c.input_shape}; Output Shape: {c.output_shape}
+// Pipeline stages: {c.delay}
+// Target Generation: {target.__class__.__name__}
+// Approximate LUTs: {int(cost.luts+0.5)}
+// Accumulation: {"yes" if accumulate else "no"} {f"of width {accumulator_width}"
+                                                  if accumulator_width else ""}
+// Enable mode: {"yes (init values set on accumulator registers)" if enable else "no"}
+// Gates: {gates if gates else "None"}
+        """ + emitter.emitter.output
+        f.write(withprefix)
+
+    end_time = time.time()
+    print("--%s seconds" % (end_time - start_time))  # elapsed time, reported as a positive value
+
+    c.accept(CompressorPrinter())
+
+    if test:
+        constant = int("".join(str(c) for c in constants), 2) if constants else 0
+        test = generate_test(shape, "comp", c.delay, gates, accumulate,  # NOTE(review): literal "comp", not `name` - confirm
+                             accumulator_width, constant)
+        with open("../gen/test.sv", 'w') as f:
+            f.write(test)
+        tester("../gen/test.sv", path)
+
+    return c.delay
+
+if __name__ == "__main__":
+    parse_cli()
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/__init__.py b/src/finn/compressor/src/passes/__init__.py
new file mode 100644
index 0000000000..ff4b37ccd2
--- /dev/null
+++ b/src/finn/compressor/src/passes/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor passes package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/passes/compressor_constructor.py b/src/finn/compressor/src/passes/compressor_constructor.py
new file mode 100644
index 0000000000..c7c6285873
--- /dev/null
+++ b/src/finn/compressor/src/passes/compressor_constructor.py
@@ -0,0 +1,183 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree constructor with two-pass accumulator handling
+#############################################################################
+
+from typing import Tuple, List
+from .compressor_pipeliner import CompressorPipeliner
+from ..graph.accumulator import AccumulatorStage
+from ..graph.counters.counter_candidates import ConstantOne
+from ..graph.counters.absorption_counter_candidates import GateAbsorptionCounterCandidate
+from ..graph.nodes import Compressor, CompressionStage, InputStage, Counter, Passthrough
+from ..graph.nodes import GateAbsorbedStage
+from ..utils.shape import Shape
+
+class CompressorConstructor:
+    def adjust_compression_goal_for_constants(self, compression_goal, constants):
+        def adjusted(x):  # column goal minus that column's constant entry, never below 2
+            return max(2, compression_goal(x) - (constants[x] if x < len(constants) else 0))
+        return adjusted
+    
+    def get_compression_goal(self, final_adder, accumulate, constants):  # NOTE: `accumulate` is not read here
+        # The adder's goal lowered by the constants; the extra accumulate pass is applied later in __call__.
+        compression_goal = final_adder.compression_goal
+        return self.adjust_compression_goal_for_constants(compression_goal, constants)
+
+    def add_constants_to_stage(self, s: CompressionStage, constants):
+        """Append a ConstantOne counter at column idx for every truthy constants[idx]."""
+        for idx, el in enumerate(constants):
+            if not el:
+                continue
+            s.append_counter(ConstantOne(), idx)
+
+    def __call__(self,  # builds and returns a fully connected Compressor for input_shape
+                 counter_candidates,
+                 absorption_counter_candidates,
+                 final_adder,
+                 input_shape: Shape,
+                 name: str,
+                 comb_depth: int = None,
+                 accumulate=False,
+                 accumulator_width: int = None,
+                 constants: Tuple[bool] = tuple(),
+                 gates: Tuple[Tuple[str]] = tuple(),
+                 enable: bool = False
+                 ) -> Compressor:
+        compression_goal = self.get_compression_goal(final_adder, accumulate, constants)
+
+        c = Compressor(name)
+        c.stages.append(InputStage(input_shape, gates))
+
+        if gates:
+            s = self.construct_absorption_stage(c.stages[-1].output_shape, gates,
+                                                absorption_counter_candidates)
+            c.stages[-1].connect_to(s)
+            c.stages.append(s)
+
+        # CRITICAL: This loop can hang if compression_goal is unreachable.
+        # add_compression_stage only schedules counters on columns with >= 3 inputs,
+        # so the goal must be reachable under that constraint.
+        # adjust_compression_goal_for_constants() therefore never lets the goal drop below 2.
+        while not self.compression_goal_reached(c.stages[-1].output_shape,
+                                                compression_goal):
+            self.add_compression_stage(c, compression_goal, counter_candidates)
+
+        # Add constants to the graph.
+        if not isinstance(c.stages[-1], CompressionStage) and constants:
+            self.add_compression_stage(c, compression_goal, counter_candidates)
+        self.add_constants_to_stage(c.stages[-1], constants)
+
+        # After constants, check if we need additional compression for accumulator mode.
+        # The ternary adder receives: compressor_output + feedback (height 1).
+        # If any column exceeds final_adder capacity, we need more compression.
+        if accumulate:
+            def post_const_goal(x):
+                # Leave room for feedback (height 1) within ternary adder capacity
+                return max(2, final_adder.compression_goal(x) - 1)
+
+            while not self.compression_goal_reached(c.stages[-1].output_shape, post_const_goal):
+                self.add_compression_stage(c, post_const_goal, counter_candidates)
+
+        if comb_depth:
+            pipeliner = CompressorPipeliner()
+            pipeline_stages = pipeliner.pipeline(c, comb_depth)
+        else:
+            pipeline_stages = 0
+
+        if accumulate:
+                acc = AccumulatorStage(c.stages[-1].output_shape, final_adder,
+                                       pipeline_stages,
+                                       accumulator_width=accumulator_width,
+                                       enable=enable)
+                c.stages.append(acc)
+        elif max(c.stages[-1].output_shape) > 1:  # a final adder is only needed while some column holds > 1 bit
+                final_stage = CompressionStage()
+                final_stage.append_counter(final_adder(c.stages[-1].output_shape), 0)
+                c.stages.append(final_stage)
+
+        for s_p, s_n in zip(c.stages, c.stages[1:]):  # connect each stage to its successor
+            s_p.connect_to(s_n)
+        return c
+    
+    def add_compression_stage(self, compressor: Compressor, compression_goal,
+                              counter_candidates):
+        """Add a compression stage. Cannot compress columns with height < 3 (Full Adder = 3:2)."""
+        new_stage = CompressionStage()
+        stage_inputs = compressor.stages[-1].output_shape  # bits not yet consumed by a counter
+        stage_outputs = Shape()  # bits already produced by counters of this stage
+
+        i = 0
+        while i < max(len(stage_inputs), len(stage_outputs)):
+            def cur_output_height():
+                return (stage_inputs + stage_outputs)[i]
+
+            def cur_input_height():
+                return stage_inputs[i] if len(stage_inputs) > i else 0
+
+            while cur_input_height() >= 3 and cur_output_height() > compression_goal(i):
+                counter = self.schedule_counter(stage_inputs[i:],
+                                                stage_outputs[i:],
+                                                lambda x: compression_goal(x+i),  # goal re-indexed relative to column i
+                                                counter_candidates)
+                stage_inputs = stage_inputs - (counter.input_shape << i)
+                stage_outputs = stage_outputs + (counter.output_shape << i)
+                new_stage.append_counter(counter, i)
+            i += 1
+
+        # pass through all leftover inputs:
+        for i in range(len(stage_inputs)):
+            for j in range(stage_inputs[i]):
+                new_stage.append_counter(Passthrough(), i)
+
+        compressor.stages.append(new_stage)
+
+    def schedule_counter(self, stage_inputs, stage_outputs, compression_goal,
+                         counter_candidates) -> Counter:
+        """Return the fitted candidate with the highest (efficiency, strength).
+
+        Raises ValueError when no candidate's extend_to_fit() yields a counter.
+        """
+        fitted = [cand.extend_to_fit(stage_inputs, stage_outputs, compression_goal)
+                  for cand in counter_candidates]
+        try:
+            return max((c for c in fitted if c is not None),
+                       key=lambda x: (x.efficiency, x.strength))
+        except ValueError:
+            raise ValueError(f"Could not schedule counter for input shape "
+                             f"{stage_inputs}; output shape {stage_outputs}; "
+                             f"compression goal {compression_goal(0)}")
+
+    def compression_goal_reached(self, shape, compression_goal):
+        within_goal = [col <= compression_goal(idx) for idx, col in enumerate(shape)]
+        return all(within_goal)  # True when no column is taller than the goal for its index
+
+    
+    def get_best_inlined_counter(self, input_shape, gates, absorption_counters):
+        """Return the fitting absorption counter with the highest (efficiency, strength)."""
+        fitted = (counter.extend_to_fit(input_shape, gates) for counter in absorption_counters)
+        candidates = [cand for cand in fitted if cand]
+        if not candidates:  # same exception type max() would raise, with a usable message
+            raise ValueError(f"No gate absorption counter fits input shape {input_shape}")
+        return max(candidates, key=lambda x: (x.efficiency, x.strength))
+
+    def construct_absorption_stage(self,  # fills a GateAbsorbedStage until every input column is consumed
+                                   input_shape: Shape,
+                                   gates: List[str],
+                                   absorption_counters: GateAbsorptionCounterCandidate
+                                   ):
+        s = GateAbsorbedStage()
+        cur_shape = input_shape
+        cur_gates = gates[:]  # shallow copy: entries are replaced below, the caller's list is untouched
+        for idx in range(len(input_shape)):
+            while cur_shape[idx] > 0:
+                best = self.get_best_inlined_counter(
+                    cur_shape[idx:], cur_gates[idx:], absorption_counters)
+                cur_shape = cur_shape - (best.input_shape << idx)
+                for i in range(len(cur_shape)):
+                    new = list(reversed(list(reversed(cur_gates[i]))[:cur_shape[i]]))  # keep the last cur_shape[i] gates
+                    cur_gates[i] = new
+                s.append_counter(best, idx)
+        return s
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/compressor_pipeliner.py b/src/finn/compressor/src/passes/compressor_pipeliner.py
new file mode 100644
index 0000000000..b0a1e80163
--- /dev/null
+++ b/src/finn/compressor/src/passes/compressor_pipeliner.py
@@ -0,0 +1,33 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree pipelining pass
+#############################################################################
+
+from ..graph.nodes import Compressor, CompressionStage, PipelineStage
+
+class CompressorPipeliner:
+    def pipeline(self, c: Compressor, max_combinational_depth: int):
+        """Insert PipelineStages into c.stages; return how many were inserted.
+
+        A PipelineStage follows a CompressionStage once max_combinational_depth of
+        them have accumulated (one fewer when it is the last stage of c.stages).
+        """
+        depth = inserted = 0
+        staged = []
+        last_idx = len(c.stages) - 1
+        for idx, stage in enumerate(c.stages):
+            staged.append(stage)
+            if not isinstance(stage, CompressionStage):
+                continue
+            depth += 1
+            limit = max_combinational_depth - 1 if idx == last_idx else max_combinational_depth
+            if depth >= limit:
+                staged.append(PipelineStage(stage.output_shape))
+                depth, inserted = 0, inserted + 1
+        c.stages = staged
+        for prev, nxt in zip(c.stages, c.stages[1:]):
+            prev.connect_to(nxt)
+        return inserted
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/cost_estimator.py b/src/finn/compressor/src/passes/cost_estimator.py
new file mode 100644
index 0000000000..859504a63a
--- /dev/null
+++ b/src/finn/compressor/src/passes/cost_estimator.py
@@ -0,0 +1,35 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Cost estimation pass for compressor resources
+#############################################################################
+
+from ..graph.nodes import CompressionStage, Compressor, GateAbsorbedStage, PipelineStage
+from ..graph.nodes import Blackbox
+from ..graph.primitives import LUT6, LUT6_2, LUT6CY, LUT5, LUT2, LUT
+from .node_iterator import NodeIterator
+
+class CostEstimator(NodeIterator):  # collects stage counts and an approximate LUT count during iteration
+    def iter_compressor(self, c: Compressor):
+        self.combinatorial_stages = -1 # Start with -1 to exclude final adder
+        self.pipeline_stages = 0
+        self.luts = 0
+
+    def iter_compression_stage(self, s: CompressionStage):
+        self.combinatorial_stages += 1
+
+    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        self.combinatorial_stages += 1
+
+    def iter_pipeline_stage(self, p: PipelineStage):
+        self.pipeline_stages += 1
+
+    def iter_blackbox(self, b: Blackbox):  # non-LUT blackboxes add no cost
+        if isinstance(b, (LUT5, LUT2)):
+            self.luts += 0.5  # LUT5 and LUT2 are counted as half a LUT
+        elif isinstance(b, (LUT6, LUT6CY, LUT6_2)):
+            self.luts += 1
+        elif isinstance(b, LUT):
+            raise RuntimeError(f"No cost function implemented for this LUT type {b}")
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/emitter.py b/src/finn/compressor/src/passes/emitter.py
new file mode 100644
index 0000000000..421b0f1379
--- /dev/null
+++ b/src/finn/compressor/src/passes/emitter.py
@@ -0,0 +1,317 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Verilog emitter for compressor tree
+#############################################################################
+
+from io import StringIO
+from contextlib import contextmanager
+from collections import defaultdict
+from typing import Tuple
+from ..graph.primitives import BlackboxInput, BlackboxInputVec, BlackboxOutput
+from ..graph.primitives import BlackboxOutputVec
+from ..graph.visitor import Visitor
+from ..graph.nodes import Bitmatrix, Counter, CompressionStage, Compressor, InputStage
+from ..graph.nodes import PipelineStage, Wire, BlackboxPort, Logic, BlackboxVecElement
+from ..graph.nodes import Connectable, GateAbsorbedStage, Blackbox, BitmatrixElement
+from ..graph.nodes import Constant
+from ..graph.accumulator import AccumulatorStage
+
+class VerilogEmitter:  # indentation-aware text buffer for the generated Verilog
+    def __init__(self):
+        self._out = StringIO()
+        self._indent_level = 0
+        self._line_start = True  # True while nothing has been written on the current line
+
+    def emit(self, line = ""):  # write text without ending the line
+        if self._line_start:
+            self._out.write(self._indent_level * "\t")
+        self._line_start = False
+        self._out.write(line)
+
+    def emitln(self, line = ""):  # write text and terminate the line
+        if self._line_start:
+            self._out.write(self._indent_level * "\t")
+        self._out.write(line + "\n")
+        self._line_start = True
+
+    @property
+    @contextmanager
+    def indent(self):  # use as `with emitter.indent:`; one extra tab inside the block
+        try:
+            self._indent_level += 1
+            yield None
+        finally:
+            self._indent_level -= 1
+
+    @property
+    def output(self):
+        return self._out.getvalue()
+
+    def save_verilog(self, filename):
+        with open(filename, "w") as f:
+            f.write(self._out.getvalue())  # iterating the StringIO would start at its end and write nothing
+
+class VerilogGenerator(Visitor):
+    def set_name(self, o: object, name):  # names are stored per concrete type of `o`
+        self._names[type(o)][o] = name
+
+    def get_name(self, o: object):
+        """Return the Verilog name of `o`, creating and caching one on first use."""
+        if isinstance(o, BlackboxPort):
+            return o.name
+
+        if o in self._names[type(o)]:
+            return self._names[type(o)][o]
+
+        subdict = self._names[type(o)]
+
+        if isinstance(o, Logic):
+            subdict[o] = f"logic_{len(subdict)}"
+        elif isinstance(o, Wire):
+            if o.desired_name and o.desired_name not in subdict.values():
+                subdict[o] = o.desired_name
+            else:
+                if o.desired_name:  # taken already: report it, then fall back to a generated name
+                    print(f"Could not obey desired name: {o.desired_name}")
+                subdict[o] = f"wire_{len(subdict)}"
+        elif isinstance(o, Bitmatrix):
+            subdict[o] = f"bitmatrix_{len(subdict)}"
+        elif isinstance(o, BitmatrixElement):
+            bitmatrix = o.vector
+            return self.get_name(bitmatrix) + f"[{o.lin_idx}]"  # indexed name, not cached
+        elif isinstance(o, Constant):
+            return o.value  # constants are emitted as their literal value
+        elif isinstance(o, Blackbox):
+            subdict[o] = f"{o.module_name.lower()}_{len(subdict)}"
+        else:
+            raise NotImplementedError(f"get_name cannot handle this type {type(o)}")
+        return subdict[o]
+
+    def visit_compressor(self, c: Compressor):  # resets all emitter state, then emits the whole module
+        self.emitter = VerilogEmitter()
+        self._declared_hardware = set()
+        self._emitted_hardware = set()
+        self._names = defaultdict(lambda: {})
+
+        self.set_name(c.stages[0].input_wires, "in")
+        if hasattr(c.stages[0], "input_wires_complementary"):
+            self.set_name(c.stages[0].input_wires_complementary, "in_2")
+        self.set_name(c.stages[-1].output_wires, "out")
+
+        self.emitter.emitln(f"module {c.module_name}(")
+        with self.emitter.indent:
+            names = sorted(["input clk"] +  # stable sort: declarations containing "input" come first
+                           [el.prefix + ("logic " if isinstance(el, Logic) else
+                                         f"[{el.total_size()-1}:0] "
+                                         if isinstance(el, Bitmatrix) else
+                                         "") + self.get_name(el) for el in c.io],
+                           key=lambda x: "input" not in x)
+            [self._declared_hardware.add(el) for el in c.io]  # ports must not be declared again in the body
+
+            self.emitter.emitln(",\n\t".join(names))
+        self.emitter.emitln(");")
+
+        with self.emitter.indent:
+            for stage in c.stages:
+                stage.accept(self)
+        self.emitter.emitln("endmodule")
+
+    def visit_input_stage(self, s: InputStage):  # visits the stage's input (and, if distinct, output) wires
+        s.input_wires.accept(self)
+        if hasattr(s, "input_wires_complementary"):
+            s.input_wires_complementary.accept(self)
+        # Visit output_wires if they're separate from input_wires (trivial passthrough case)
+        if s.output_wires is not s.input_wires:
+            s.output_wires.accept(self)
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        self.emitter.emitln()
+        self.emitter.emitln("// Accumulator Stage")
+        a.input_wires.accept(self)
+        [el.accept(self) for el in  # Connectable instances are visited before all others
+         sorted(a.instances, key=lambda x: (not isinstance(x, Connectable)))]
+        a.output_wires.accept(self)
+
+
+    def visit_pipeline_stage(self, s: PipelineStage):  # inputs, then the stage's instances, then outputs
+        self.emitter.emitln()
+        self.emitter.emitln("// Pipeline Results..")
+        s.input_wires.accept(self)
+        [el.accept(self) for el in s.instances]
+        s.output_wires.accept(self)
+
+    def visit_compression_stage(self, s: CompressionStage):  # header comment, inputs, counters, outputs
+        self.emitter.emitln()
+        self.emitter.emitln(f"// Compression Stage with Input Shape: {s.input_shape} "
+                            f"and Output Shape {s.output_shape}")
+        s.input_wires.accept(self)
+        [c.accept(self) for c, _ in s.counters_with_shifts]  # the shift of each counter is not needed here
+        s.output_wires.accept(self)
+        self.emitter.emitln()
+
+    def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):  # NOTE(review): Visitor declares visit_gate_absorption_stage - confirm the dispatch name
+        self.emitter.emitln()
+        self.emitter.emitln("// Compression Stage with Gate Absorption.")
+        self.emitter.emitln(f"// Input Shape: {g.input_shape} "
+                            f"and Output Shape: {g.output_shape}")
+        g.input_wires.accept(self)
+        g.input_wires_complementary.accept(self)
+        [c.accept(self) for c, _ in g.counters_with_shifts]
+        g.output_wires.accept(self)
+        self.emitter.emitln()
+
+    def visit_counter(self, c: Counter):  # wires first, then instances with Connectables ahead of the rest
+        [el.accept(self) for col in c.input_wires for el in col]
+        [el.accept(self) for col in c.output_wires for el in col]
+        [el.accept(self) for el in
+         sorted(c.instances, key=lambda x: not isinstance(x, Connectable))]
+
+    def visit_gate_absorption_counter(self, c: GateAbsorbedStage):  # NOTE(review): Visitor annotates this argument as GateAbsorptionCounter
+        [el.accept(self) for col in c.input_wires_complementary for el in col]
+        self.visit_counter(c)
+
+    def visit_wire(self, w: Wire):  # declares the wire once and emits its assign at most once
+        if w in self._emitted_hardware:
+            return
+
+        if w not in self._declared_hardware:
+            self.emitter.emitln(f"uwire {self.get_name(w)};")
+        self._declared_hardware.add(w)
+        # NOTE(review): the test below looks up has_source itself in the set; `w.source not in ...` may have been meant.
+        if w.has_source not in self._declared_hardware and isinstance(w.source, Wire):
+            w.source.accept(self)
+
+        if (w.has_source and isinstance(w.source, Connectable) and  # blackbox ports get no assign statement
+            not isinstance(w.source, BlackboxPort) and
+            not isinstance(w.source, BlackboxVecElement)):
+            self.emitter.emitln(
+                f"assign {self.get_name(w)} = {self.get_name(w.source)};")
+        self._emitted_hardware.add(w)
+
+    def visit_logic(self, lgc: Logic):  # declares a register once and emits its always_ff block once
+        if lgc in self._emitted_hardware:
+            return
+
+        if lgc not in self._declared_hardware:
+            self.emitter.emit(lgc.prefix)
+            init_str = f" = 1'b{lgc.init}" if lgc.init is not None else ""
+            self.emitter.emitln(
+                f'(* srl_style = "register" *) logic {self.get_name(lgc)}{init_str};')
+        self._declared_hardware.add(lgc)
+        # NOTE(review): as in visit_wire, has_source itself is looked up in the set here.
+        if (lgc.has_source not in self._declared_hardware and
+            isinstance(lgc.source, Wire)):
+            lgc.source.accept(self)
+
+        def emit_inner():  # the plain register update; nothing is emitted without a source
+            if lgc.source:
+                self.emitter.emitln(
+                    f"{self.get_name(lgc)} <= {self.get_name(lgc.source)};")
+
+        def emit_with_en():  # wraps the update in an enable check when lgc.en is set
+            if lgc.en:
+                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
+                with self.emitter.indent:
+                    emit_inner()
+                self.emitter.emitln("end")
+            else:
+                emit_inner()
+
+        def emit_with_rst_and_en():  # reset clears the register to 1'b0; with en set, reset is gated by it
+            if lgc.rst and lgc.en:
+                # En-gated rst: preserve state during stalls
+                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
+                with self.emitter.indent:
+                    self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
+                    with self.emitter.indent:
+                        self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
+                    self.emitter.emitln("end else begin")
+                    with self.emitter.indent:
+                        emit_inner()
+                    self.emitter.emitln("end")
+                self.emitter.emitln("end")
+            elif lgc.rst:
+                self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
+                with self.emitter.indent:
+                    self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
+                self.emitter.emitln("end else begin")
+                with self.emitter.indent:
+                    emit_inner()
+                self.emitter.emitln("end")
+            else:
+                emit_with_en()
+
+        self.emitter.emitln("always_ff @(posedge clk) begin")
+        with self.emitter.indent:
+            emit_with_rst_and_en()
+        self.emitter.emitln("end")
+        self._emitted_hardware.add(lgc)
+
+    def visit_blackbox(self, b: Blackbox):  # emits one instantiation: annotations, parameters, then ports
+        if b.annotations:
+            self.emitter.emitln(f"(* {', '.join(b.annotations)} *)")
+        self.emitter.emitln(f"{b.module_name} #(")
+        with self.emitter.indent:
+            for idx, (key, value) in enumerate(b.parameters.items()):
+                ending = "," if idx != len(b.parameters)-1 else ""  # no comma after the last parameter
+                self.emitter.emitln(f".{key}({value}){ending}")
+        self.emitter.emitln(f") {self.get_name(b)} (")
+        with self.emitter.indent:
+            ports = b.out_ports + b.in_ports  # outputs are listed before inputs
+            for idx, port in enumerate(ports):
+                ending = "," if idx != len(ports)-1 else ""
+                port.accept(self)  # the port writes its own connection without ending the line
+                self.emitter.emitln(ending)
+        self.emitter.emitln(");")
+
+    def visit_blackbox_output(self, b: BlackboxOutput):  # an output without a target is left open: .NAME()
+        if b.has_target:
+            self.emitter.emit(f".{b.name}({self.get_name(b.target)})")
+        else:
+            self.emitter.emit(f".{b.name}()")
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):  # emits .NAME({...}) with elements in reverse order
+        self.emitter.emit(f".{b.name}(")
+        self.emitter.emit("{")
+        targets = [self.get_name(el.target) for el in b.elements[::-1] if el.target]  # NOTE(review): elements without a target are dropped, shortening the concatenation - confirm all are connected
+        self.emitter.emit(", ".join(targets))
+        self.emitter.emit("})")
+    
+    def visit_blackbox_input(self, b: BlackboxInput):  # an input without a source is tied to 1'b0
+        if b.has_source:
+            self.emitter.emit(f".{b.name}({self.get_name(b.source)})")
+        else:
+            self.emitter.emit(f".{b.name}(1'b0)")
+    
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):  # emits .NAME({...}) with elements in reverse order
+        self.emitter.emit(f".{b.name}(")
+        self.emitter.emit("{")
+        sources = [self.get_name(el.source)
+                   if el.source else "1'b0"  # elements without a source are tied to 1'b0
+                   for el in b.elements[::-1]]
+        self.emitter.emit(", ".join(sources))
+        self.emitter.emit("})")
+
+    def emit_blackbox_ports(self, p: Tuple[BlackboxPort]):  # NOTE(review): not called by any method of this class; reads port.connected / port.wire
+        for idx, port in enumerate(p):
+            seperator = "," if idx != len(p) - 1 else ""  # no comma after the last port
+            if port.connected:
+                self.emitter.emitln(f".{self.get_name(port)}({self.get_name(port.wire)}){seperator}")
+            elif isinstance(port, BlackboxInput):  # unconnected input: tie to 1'b0
+                self.emitter.emitln(f".{self.get_name(port)}(1'b0){seperator}")
+            else:  # any other unconnected port is left open
+                self.emitter.emitln(f".{self.get_name(port)}(){seperator}")
+    
+    def visit_bitmatrix(self, b: Bitmatrix):  # declares the vector once, then assigns every element that has a source
+        if b not in self._declared_hardware:
+            self.emitter.emitln(f"uwire [{b.total_size()-1}:0] {self.get_name(b)};")
+            self._declared_hardware.add(b)
+
+        if b not in self._emitted_hardware:
+            [self.emitter.emitln(
+                f"assign {self.get_name(el)} = {self.get_name(el.source)};")
+             for col in b for el in col if el.has_source]
+            self._emitted_hardware.add(b)
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/io_annotator.py b/src/finn/compressor/src/passes/io_annotator.py
new file mode 100644
index 0000000000..e41d077864
--- /dev/null
+++ b/src/finn/compressor/src/passes/io_annotator.py
@@ -0,0 +1,54 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Input/output annotation pass for compressor
+#############################################################################
+
+from ..graph.nodes import Compressor, Logic, Wire, Bitmatrix
+from .node_iterator import NodeIterator
+
+class IOAnnotator(NodeIterator):
+    """Pass that marks a compressor's boundary bit matrices as module I/O."""
+    def visit_compressor(self, c: Compressor):
+        """Flag and name the input/output matrices of ``c`` and set ``c.io``."""
+        input_wires = c.stages[0].input_wires
+        output_wires = c.stages[-1].output_wires
+        # Handle trivial passthrough case where input_wires IS output_wires (same object).
+        # This happens for N=1 compressors where only an InputStage exists.
+        # We need separate Bitmatrix objects for input and output ports.
+        if input_wires is output_wires:
+            new_output = Bitmatrix(input_wires.shape)
+            for in_col, out_col in zip(input_wires, new_output):
+                for in_wire, out_wire in zip(in_col, out_col):
+                    in_wire.connect_to(out_wire)
+            c.stages[-1].output_wires = new_output
+            output_wires = new_output
+        input_wires.set_to_module_input()
+        input_wires.name = "in"
+        if c.stages[0].gates:
+            c.stages[0].input_wires_complementary.set_to_module_input()
+            c.stages[0].input_wires_complementary.name = "in_2"
+        output_wires.set_to_module_output()
+        output_wires.name = "out"
+        c.io = self.get_all_io(c)
+
+    def get_all_io(self, c: Compressor):
+        """Return the unique I/O nodes of ``c`` in first-visit order."""
+        finder = IOFinder()
+        c.accept(finder)
+        return list(dict.fromkeys(finder.io))
+
+class IOFinder(NodeIterator):
+    """Collect the wires, logic nodes and bit matrices of a compressor."""
+    def iter_compressor(self, c: Compressor):
+        self.connectables = []  # start afresh for every compressor
+
+    def iter_wire(self, w: Wire): self.connectables += [w]
+    def iter_logic(self, lgc: Logic): self.connectables += [lgc]
+    def iter_bitmatrix(self, b: Bitmatrix): self.connectables += [b]
+
+    @property
+    def io(self):  # collected nodes whose ``prefix`` is truthy
+        return list(filter(lambda node: node.prefix, self.connectables))
diff --git a/src/finn/compressor/src/passes/lut_placer.py b/src/finn/compressor/src/passes/lut_placer.py
new file mode 100644
index 0000000000..ec8c2cabc2
--- /dev/null
+++ b/src/finn/compressor/src/passes/lut_placer.py
@@ -0,0 +1,85 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    RLOC placement annotation for compressor LUTs
+#############################################################################
+
+from .node_iterator import NodeIterator
+from ..graph.nodes import Compressor, Counter, GateAbsorptionCounter
+from ..graph.primitives import LUT6CY
+from ..graph.final_adder import FinalAdder
+
+class LUTPlacer(NodeIterator):
+    """Pass that attaches placement attributes to ripple-connected LUT6CYs."""
+    def iter_compressor(self, c: Compressor):
+        self.occupations = [] # Reset placement state for every compressor
+
+    def iter_counter(self, c: Counter):
+        # Place LUT6CY instances manually.
+        cascades = self._get_ripple_connected_luts(c)
+        self._calculate_and_annotate_placements(cascades)
+
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_counter(g)
+
+    def _get_ripple_connected_luts(self, c: Counter):
+        "Among all LUTs inside a counter, reconstruct all ripple connections."
+        if isinstance(c, FinalAdder):
+            # No manual placement needed, as final adders use the LOOKAHEAD8,
+            # which enforces correct placement itself.
+            return []
+        # Index the LUT6CYs by their I4 input and by their O52 output.
+        lut6cy_i4s =  {lut.I4:  lut for lut in c.luts if isinstance(lut, LUT6CY)}
+        lut6cy_o52s = {lut.O52: lut for lut in c.luts if isinstance(lut, LUT6CY)}
+        # Despite its name, this maps a LUT to the LUT whose O52 drives its I4.
+        lut_output_to_lut_input = {}
+        for input, input_lut in lut6cy_i4s.items():
+            if input.source in lut6cy_o52s:
+                target_lut = lut6cy_o52s[input.source]
+                lut_output_to_lut_input[input_lut] = target_lut
+        # Chain ends: LUTs that are driven via I4 but drive no other LUT's I4.
+        lut_heads = (set(lut_output_to_lut_input.keys()) - 
+                     set(lut_output_to_lut_input.values()))
+        chains = []
+        # Walk from each chain end back to the first LUT, then flip the order.
+        for lut_head in lut_heads:
+            cur = [lut_head]
+            while el := lut_output_to_lut_input.get(cur[-1]):
+                cur.append(el)
+            chains.append(cur[::-1])
+        return chains
+
+    def _calculate_and_annotate_placements(self, cascades):
+        """First-fit: use the first slice with room for a cascade, else open a new one."""
+        for cascade in cascades:
+            for idx, slice_util in enumerate(self.occupations):
+                if len(cascade) + slice_util <= 8:
+                    self._annotate_placements(cascade, idx, self.occupations[idx])
+                    self.occupations[idx] += len(cascade)
+                    break
+            else:
+                self.occupations.append(len(cascade))
+                self._annotate_placements(cascade, len(self.occupations)-1, 0)
+
+    def _annotate_placements(self, cascade, hu_set, start_idx):
+        """Annotate LUT6CY placement constraints for carry chain packing.
+
+        Places each cascade (ripple chain) into specific BEL positions within a SLICE.
+        Each hu_set represents one SLICE (8 LUTs max). Multiple hu_sets get different
+        Y coordinates to avoid placement conflicts.
+
+        Args:
+            cascade: List of LUT6CY instances forming a carry ripple chain
+            hu_set: SLICE index (0, 1, 2, ...) - maps to RLOC Y coordinate
+            start_idx: Starting BEL position within the SLICE (0-7 = A-H)
+        """
+        assert start_idx + len(cascade) <= 8
+        for i, lut in enumerate(cascade):
+            bel_str = f"{chr(ord('A')+start_idx+i)}5LUT"
+            lut.annotate(f'HU_SET = "hu_set_{hu_set}"')
+            lut.annotate(f'RLOC = "X0Y{hu_set}"')  # Increment Y per SLICE to avoid conflicts
+            lut.annotate(f'BEL = "{bel_str}"')
+            lut.annotate('DONT_TOUCH = "yes"')
+            lut.annotate('IS_BEL_FIXED = "yes"')
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/node_iterator.py b/src/finn/compressor/src/passes/node_iterator.py
new file mode 100644
index 0000000000..4b0f399e35
--- /dev/null
+++ b/src/finn/compressor/src/passes/node_iterator.py
@@ -0,0 +1,123 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Node iterator pass for compressor graph traversal
+#############################################################################
+
+from ..graph.primitives import LOOKAHEAD8
+from ..graph.visitor import Visitor
+from ..graph.nodes import Counter, CompressionStage, Compressor, InputStage, PipelineStage
+from ..graph.nodes import Blackbox, Wire, Logic, Bitmatrix, GateAbsorbedStage
+from ..graph.nodes import GateAbsorptionCounter, BlackboxInput, BlackboxOutput
+from ..graph.nodes import BlackboxInputVec, BlackboxOutputVec
+from ..graph.accumulator import AccumulatorStage
+
+class NodeIterator(Visitor):
+    """Visitor that walks a compressor graph depth-first.
+
+    Every ``visit_*`` method first calls the matching ``iter_*`` hook and then
+    descends into the children of the node. The hooks do nothing here; passes
+    subclass this iterator and override only the hooks they need.
+    """
+
+    def visit_compressor(self, c: Compressor):
+        self.iter_compressor(c)
+        [s.accept(self) for s in c.stages]
+
+    def visit_input_stage(self, s: InputStage):
+        self.iter_input_stage(s)
+        s.input_wires.accept(self)
+        if s.gates:
+            s.input_wires_complementary.accept(self)
+        s.output_wires.accept(self)
+
+    def visit_pipeline_stage(self, s: PipelineStage):
+        self.iter_pipeline_stage(s)
+        s.input_wires.accept(self)
+        s.output_wires.accept(self)
+        [el.accept(self) for el in s.instances]
+
+    def visit_compression_stage(self, s: CompressionStage):
+        self.iter_compression_stage(s)
+        s.input_wires.accept(self)
+        s.output_wires.accept(self)
+        [c.accept(self) for c, _ in s.counters_with_shifts]
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        self.iter_accumulator_stage(a)
+        a.input_wires.accept(self)
+        a.output_wires.accept(self)
+        [c.accept(self) for c in a.instances]
+
+    def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        self.iter_gate_absorbed_stage(g)
+        g.input_wires.accept(self)
+        g.input_wires_complementary.accept(self)
+        g.output_wires.accept(self)
+        [c.accept(self) for c, _ in g.counters_with_shifts]
+
+    def visit_counter(self, c: Counter):
+        self.iter_counter(c)
+        [el.accept(self) for col in c.input_wires for el in col]
+        [el.accept(self) for col in c.output_wires for el in col]
+        [el.accept(self) for el in c.instances]
+
+    def visit_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_gate_absorption_counter(g)
+        [el.accept(self) for col in g.input_wires for el in col]
+        [el.accept(self) for col in g.input_wires_complementary for el in col]
+        [el.accept(self) for col in g.output_wires for el in col]
+        [el.accept(self) for el in g.instances]
+
+    def visit_blackbox(self, b: Blackbox):
+        self.iter_blackbox(b)
+        [p.accept(self) for p in b.in_ports + b.out_ports]
+
+    def visit_blackbox_input(self, b: BlackboxInput):
+        self.iter_blackbox_input(b)
+
+    def visit_blackbox_output(self, b: BlackboxOutput):
+        self.iter_blackbox_output(b)
+
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
+        self.iter_blackbox_input_vec(b)
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
+        self.iter_blackbox_output_vec(b)
+
+    def visit_lookahead8(self, l8: LOOKAHEAD8):
+        self.iter_lookahead8(l8)
+        self.visit_blackbox(l8)
+
+    def visit_wire(self, w: Wire): self.iter_wire(w)
+
+    def visit_logic(self, lgc: Logic): self.iter_logic(lgc)
+
+    def visit_bitmatrix(self, b: Bitmatrix): self.iter_bitmatrix(b)
+
+    # Hooks for the compressor and its stages.
+    def iter_compressor(self, c: Compressor): pass
+    def iter_input_stage(self, s: InputStage): pass
+    def iter_pipeline_stage(self, s: PipelineStage): pass
+    def iter_compression_stage(self, s: CompressionStage): pass
+    def iter_accumulator_stage(self, a: AccumulatorStage): pass
+    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage): pass
+
+    # Hooks for counters.
+    def iter_counter(self, c: Counter): pass
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter): pass
+
+    # Hooks for blackboxes and their ports.
+    def iter_blackbox(self, b: Blackbox): pass
+    def iter_lookahead8(self, l8: LOOKAHEAD8): pass
+    def iter_blackbox_input(self, b: BlackboxInput): pass
+    def iter_blackbox_output(self, b: BlackboxOutput): pass
+    def iter_blackbox_input_vec(self, b: BlackboxInputVec): pass
+    def iter_blackbox_output_vec(self, b: BlackboxOutputVec): pass
+
+    # Hooks for leaf connectables.
+    def iter_wire(self, w: Wire): pass
+    def iter_logic(self, lgc: Logic): pass
+    def iter_bitmatrix(self, b: Bitmatrix): pass
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/printer.py b/src/finn/compressor/src/passes/printer.py
new file mode 100644
index 0000000000..2ebcabe23f
--- /dev/null
+++ b/src/finn/compressor/src/passes/printer.py
@@ -0,0 +1,54 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree printer for debugging
+#############################################################################
+
+from ..graph.nodes import Counter, CompressionStage, Compressor, GateAbsorbedStage
+from ..graph.nodes import GateAbsorptionCounter, InputStage, PipelineStage
+from ..graph.accumulator import AccumulatorStage
+from ..graph.visitor import Visitor
+
+class CompressorPrinter(Visitor):
+    """Debug visitor that prints an indented outline of a compressor."""
+    def visit_compressor(self, c: Compressor):
+        print(f"Compressor <Input: {c.input_shape}, Output: {c.output_shape}> [")
+        for stage in c.stages:
+            stage.accept(self)
+        print("]")
+
+    def visit_compression_stage(self, s: CompressionStage):
+        print(f"\tStage: <in: {s.input_shape}, out: {s.output_shape}> [")
+        for counter, shift in s.counters_with_shifts:
+            print(f"\t\t[xshift={shift:2}] ",end="")
+            counter.accept(self)
+        print("\t]")
+
+    def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
+        print(f"\tStage with Gate Absorption: <in {s.input_shape}, "
+              f"out: {s.output_shape}> [")
+        for counter, shift in s.counters_with_shifts:
+            print(f"\t\t[xshift={shift:2}] ",end="")
+            counter.accept(self)
+        print("\t]")
+
+    def visit_input_stage(self, i: InputStage):
+        print(f"\tInput Stage: <{i.input_shape}>")
+
+    def visit_pipeline_stage(self, p: PipelineStage):
+        print(f"\tPipeline Stage: <{p.input_shape}>")
+
+    def visit_counter(self, c: Counter):
+        print(f"{c.__class__.__name__} <in: {c.input_shape}, out: {c.output_shape}>")
+    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
+        self.visit_counter(c)
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        print(f"\tAccumulator: <in: {a.input_shape}, out: {a.output_shape}> [")
+        for i in a.instances:
+            if isinstance(i, Counter):
+                print("\t\t",end="")  # indent every counter line, not only the first
+                i.accept(self)
+        print("\t]")
\ No newline at end of file
diff --git a/src/finn/compressor/src/passes/wire_inserter.py b/src/finn/compressor/src/passes/wire_inserter.py
new file mode 100644
index 0000000000..6865b1cf3d
--- /dev/null
+++ b/src/finn/compressor/src/passes/wire_inserter.py
@@ -0,0 +1,40 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Wire insertion pass for compressor graph
+#############################################################################
+
+from .node_iterator import NodeIterator
+from ..graph.nodes import Blackbox, Counter, Wire, GateAbsorptionCounter
+
+# Blackbox outputs might be connected to other blackbox inputs.
+# To express this in Verilog, an extra intermediate wire has to
+# be created between the blackboxes. This pass adds it.
+class WireInserter(NodeIterator):
+    """Give every blackbox output of a counter a single wire as its target."""
+
+    def iter_counter(self, c: Counter):
+        # Snapshot the blackboxes first: new wires are appended to
+        # c.instances while the ports are being processed.
+        blackboxes = tuple(filter(lambda n: isinstance(n, Blackbox), c.instances))
+        for port in (p for bb in blackboxes for p in bb.out_ports):
+            self.insert_wire_at_blackbox_output(port, c)
+
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_counter(g)
+
+    def insert_wire_at_blackbox_output(self, output, counter):
+        """Route ``output`` through exactly one Wire, creating it if needed."""
+        if hasattr(output, "elements"):  # vector port: handle each element
+            for element in output.elements:
+                self.insert_wire_at_blackbox_output(element, counter)
+        elif len(output.target) == 1 and isinstance(output.target[0], Wire):
+            output.target = output.target[0]  # already drives a lone wire
+        else:
+            bridge = Wire()
+            for sink in output.target:
+                bridge.connect_to(sink)
+            output.target = bridge
+            counter.instances.append(bridge)
\ No newline at end of file
diff --git a/src/finn/compressor/src/target.py b/src/finn/compressor/src/target.py
new file mode 100644
index 0000000000..d526fdbbb0
--- /dev/null
+++ b/src/finn/compressor/src/target.py
@@ -0,0 +1,102 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FPGA target definitions and gate absorption counter selection
+#############################################################################
+
+from abc import ABC
+from .graph.counters.counter_candidates import CounterCandidate, FACandidate
+from .graph.counters.counter_candidates import MuxCYAtomCascadeCandidate
+from .graph.counters.counter_candidates import RippleSumCandidate
+from .graph.counters.counter_candidates import DualRailRippleSumCandidate
+from .graph.counters.counter_candidates import FiveTwoCandidate 
+from .graph.counters.counter_candidates import VersalAtomCascadeCandidate
+from .graph.counters.counter_candidates import SixThreeCandidate, TenSixCandidate
+from .graph.counters.absorption_counter_candidates import GateAbsorptionCounterCandidate
+from .graph.counters.absorption_counter_candidates import VersalPredAdderCandidate
+from .graph.counters.absorption_counter_candidates import RippleSumPredAdderCandidate
+from .graph.counters.absorption_counter_candidates import SinglePredCandidate
+from .graph.counters.absorption_counter_candidates import MuxCYPredAdderCandidate
+from .graph.counters.absorption_counter_candidates import MuxCYRippleSumCandidate
+from .graph.final_adder import MuxCYTernaryAdder, FinalAdder, QuaternaryAdder
+from typing import List
+
+def resolve_target(fpgapart):
+    """Pick the compressor Target matching a Vivado FPGA part string.
+
+    Versal prefixes yield Versal(), UltraScale/UltraScale+ prefixes yield
+    UltraScale(); every other part falls back to SevenSeries().
+    """
+    head4, head5 = fpgapart[0:4], fpgapart[0:5]
+    if head4 in ("xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"):
+        return Versal()
+    if head5 in ("xqrvc", "xcv80"):
+        return Versal()
+    # Kintex US (xcku), Virtex US (xcvu), Zynq US (xczu), defense (xqzu)
+    if head4 in ("xcku", "xcvu", "xczu", "xqzu"):
+        return UltraScale()
+    return SevenSeries()
+
+
+def resolve_target_name(name):
+    """Build the Target named on the CLI ('Versal', '7-Series', 'UltraScale')."""
+    known = (("Versal", Versal),
+             ("7-Series", SevenSeries),
+             ("UltraScale", UltraScale))
+    # A target class is instantiated only once its label matches.
+    for label, target_cls in known:
+        if name == label:
+            return target_cls()
+    raise ValueError(f"Unsupported target: {name!r}. Choose from: ['Versal', '7-Series', 'UltraScale']")
+
+
+class Target(ABC):  # attribute set that every concrete FPGA target fills in
+    counter_candidates: List[CounterCandidate]
+    final_adder: FinalAdder  # NOTE(review): subclasses assign the adder class, not an instance
+    absorbing_counter_candidates: List[GateAbsorptionCounterCandidate]
+
+class Versal(Target):
+    """Target using the Versal counter set and the quaternary final adder."""
+
+    def __init__(self):
+        # The adder class itself is stored; it is not instantiated here.
+        self.final_adder = QuaternaryAdder
+        self.counter_candidates = [
+            TenSixCandidate(), FACandidate(),
+            RippleSumCandidate(), DualRailRippleSumCandidate(),
+            FiveTwoCandidate(), SixThreeCandidate(),
+            VersalAtomCascadeCandidate(),
+        ]
+        self.absorbing_counter_candidates = [
+            VersalPredAdderCandidate(),
+            RippleSumPredAdderCandidate(),
+            SinglePredCandidate(),
+        ]
+
+class SevenSeries(Target):
+    """Target using the MuxCY-based counters and the MuxCY ternary adder."""
+    def __init__(self):
+        self.final_adder = MuxCYTernaryAdder
+        self.counter_candidates = [
+            FACandidate(), FiveTwoCandidate(),
+            SixThreeCandidate(), MuxCYAtomCascadeCandidate()]
+        self.absorbing_counter_candidates = [
+            MuxCYPredAdderCandidate(), MuxCYRippleSumCandidate(),
+            SinglePredCandidate()]
+
+class UltraScale(Target):
+    """UltraScale/UltraScale+ - reuses 7-Series primitives.
+
+    Vivado maps CARRY4 to CARRY8 transparently.
+    """
+    def __init__(self):
+        # Same candidate lists and final adder as the 7-Series target.
+        self.final_adder = MuxCYTernaryAdder
+        self.counter_candidates = [
+            FACandidate(), FiveTwoCandidate(),
+            SixThreeCandidate(), MuxCYAtomCascadeCandidate()]
+        self.absorbing_counter_candidates = [
+            MuxCYPredAdderCandidate(), MuxCYRippleSumCandidate(),
+            SinglePredCandidate()]
\ No newline at end of file
diff --git a/src/finn/compressor/src/tests/__init__.py b/src/finn/compressor/src/tests/__init__.py
new file mode 100644
index 0000000000..b6d457fd32
--- /dev/null
+++ b/src/finn/compressor/src/tests/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tests package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/tests/test_gen.py b/src/finn/compressor/src/tests/test_gen.py
new file mode 100644
index 0000000000..a0116526aa
--- /dev/null
+++ b/src/finn/compressor/src/tests/test_gen.py
@@ -0,0 +1,150 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test vector generation for compressor verification
+#############################################################################
+
+from ..utils.shape import Shape
+from itertools import accumulate
+from typing import List
+
+def compressed_width(shape):
+    """Bits needed for the largest sum a bit heap of this shape can hold."""
+    return sum(n * (1 << w) for w, n in enumerate(shape)).bit_length()
+
+def flatten_gates(gates: List[List[str]]) -> List[str]:
+    return [gate for column in gates for gate in column]  # columns in order
+
+def generate_test(shape: Shape, module_name: str, pipeline_stages: int,
+                  gates: List[List[str]], accumulation: bool, accumulator_width: int,
+                  constant: int):
+    """Return the SystemVerilog source of a self-checking testbench ``tb``.
+
+    The bench feeds ``module_name`` fixed and random stimulus and compares
+    ``out`` with a reference sum delayed by ``pipeline_stages`` cycles."""
+    assert(type(pipeline_stages) == int)
+    if gates:
+        gates = flatten_gates(gates)
+    # The stimulus loop waits on ``posedge clk`` unconditionally, so the bench
+    # clock is always generated, even for a purely combinational DUT.
+    has_clk = True
+    accumulated_signature = list(accumulate(shape))
+    addends = []
+    for j, col in enumerate(accumulated_signature):
+        for i in range(shape[j]):
+            addends.append(f"\t\tin_reduced += arr_in[{col+i-shape[j]}] << {j};")
+    addends = "\n".join(addends)
+    if gates:
+        preds = "".join([f"\tlocalparam pred_{idx} = 4'h{gate};\n" 
+                         for idx, gate in enumerate(gates)])
+        selects = "".join([f"\tlogic [3:0] sel_{idx};\n" 
+                           for idx, _ in enumerate(gates)])
+        arr_ins = "".join([
+            f"\t\tsel_{i} = (arr_in_b[{i}]<<1) | arr_in_a[{i}];\n" + 
+            f"\t\tarr_in[{i}] = pred_{i}[sel_{i}];\n"
+            for i, _ in enumerate(gates)])
+        gates_decl = (f"\tlogic [{sum(shape)-1}:0] arr_in_a;" + 
+                      f"\tlogic [{sum(shape)-1}:0] arr_in_b;")
+    accumulator_width = (accumulator_width if accumulator_width 
+                         else compressed_width(shape))
+    acc_decl = f"\tlogic [{accumulator_width-1}:0] acc_base;"
+    acc_rst_block = """\t\t\tif (reset == 0) begin 
+\t\t\t\tacc_base = 0;
+\t\t\tend else begin 
+\t\t\t\tacc_base = reference[0];
+\t\t\tend"""
+    return (
+f"""module tb;
+{gates_decl if gates else ""}
+\tlogic [{sum(shape)-1}:0] arr_in;
+\tlogic [{compressed_width(shape)-1}:0] in_reduced;
+\tlogic [{accumulator_width-1}:0] out;
+\tlogic [{accumulator_width-1}:0] reference [{pipeline_stages}:0]; 
+{acc_decl if accumulation else ""}
+\t{"logic [4:0] reset;" if accumulation else ""}
+\t{"logic rst;" if accumulation else ""}
+\t{"logic clk = 0;" if has_clk else ""}
+\t{"logic en = 1;" if accumulation else ""}
+
+{preds if gates else ""}
+{selects if gates else ""}
+\talways_comb begin;
+{arr_ins if gates else ""}
+\tend
+
+\t{"always #10ns clk = !clk;" if has_clk else ""}
+
+\talways_comb begin 
+\t\t{"reference[0] = acc_base + in_reduced;" 
+     if accumulation else "reference[0] = in_reduced;"}
+\tend
+
+\talways_comb begin 
+\t\tin_reduced = 0;
+\t\t{"if (en) begin" if accumulation else ""}
+in_reduced += {constant};
+{addends}
+\t\t{"end" if accumulation else ""}
+\tend
+           
+\tinitial begin
+\t\t{"acc_base = 0;" if accumulation else ""}
+\t\t{"arr_in_a = 0;" if gates else "arr_in = 0;"}
+\t\t{"arr_in_b = 0;" if gates else ""}
+      
+\t\t{"assign rst = reset == 0;" if accumulation else ""}
+\t\t{"reset = 0; #40ns;" if accumulation else ""}
+        
+\t\tfor (int i = 0; i < 16000; i += 1) begin
+\t\t\t{"automatic type(reset) xx;" if accumulation else ""}
+\t\t\t{"automatic type(en) zz;" if accumulation else ""}
+
+\t\t\t{"automatic type(arr_in_a) yy;" if gates else "automatic type(arr_in) yy;"}
+\t\t\t{"automatic type(arr_in_b) yz;" if gates else ""}
+
+\t\t\t{"void'(std::randomize(xx));" if accumulation else ""}
+\t\t\t{"reset = xx; " if accumulation else ""}
+\t\t\t{"void'(std::randomize(zz));" if accumulation else ""}
+\t\t\t{"en = zz;" if accumulation else ""}
+
+\t\t\tif (i < 5) yy = 0;
+\t\t\telse if (i < 10) yy = '1;
+\t\t\telse void'(std::randomize(yy));
+\t\t\t{"arr_in_a = yy;" if gates else "arr_in = yy;"}
+
+\t\t\t{"if (i < 5) yz = 0;" if gates else ""}
+\t\t\t{"else if (i < 10) yz = '1;" if gates else ""}
+\t\t\t{"else void'(std::randomize(yz));" if gates else ""}
+\t\t\t{"arr_in_b = yz;" if gates else ""}
+
+\t\t\t@(posedge clk);
+\t\t\tfor (int i = 1; i <= {pipeline_stages}; ++i) begin
+\t\t\t\treference[i] <= reference[i-1];
+\t\t\tend
+
+{acc_rst_block if accumulation else ""}
+\t\t\t#1ns;
+\t\t\tif(^reference[{pipeline_stages}] !== 1'bX) begin
+\t\t\t\tassert(reference[{pipeline_stages}] === out) else begin
+\t\t\t\t\t$error("Mismatch: Ref[%0b] != Out[%0b]", reference[{pipeline_stages}], out);
+\t\t\t\t\t#2ns;
+\t\t\t\t\t$stop;
+\t\t\t\tend 
+\t\t\tend
+\t\t#0.01ns;
+        
+\t\tend
+\t\t$display("TEST PASSED");
+\t\t$finish();
+\tend
+
+\t{module_name} dut(
+    {".clk(clk)," if pipeline_stages or accumulation else ""}
+    {".rst(rst)," if accumulation else ""}
+    {".in(arr_in_a), .in_2(arr_in_b)," if gates else ".in(arr_in),"}
+    {".en_neg(!en)," if accumulation else ""}
+    .out(out));
+endmodule
+""").replace("\n\n", "\n")
\ No newline at end of file
diff --git a/src/finn/compressor/src/tests/tester.py b/src/finn/compressor/src/tests/tester.py
new file mode 100644
index 0000000000..3537b97f7a
--- /dev/null
+++ b/src/finn/compressor/src/tests/tester.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Vivado XSim wrapper for testing generated compressors."""
+
+import subprocess
+import re
+
+
+def tester(test_loc, comp_loc):
+    """Run Vivado XSim simulation to test a compressor.
+
+    Args:
+        test_loc: Path to testbench SystemVerilog file
+        comp_loc: Path to compressor SystemVerilog file
+    Raises SystemExit(-2) if the simulation log lacks the $finish message.
+    """
+    args = (  # rm -rf: a missing xsim.dir/ (first run) must not break the chain
+        f"""rm -rf xsim.dir/ &&
+        xvlog -work work -sv ../res/glbl.v {test_loc} {comp_loc} -L unisims_ver --nolog &&
+        xelab -L work -L unisims_ver -relax --nolog glbl tb &&
+        xsim --nolog work.glbl#work.tb -R""").replace("\n", " ")
+    print(args)
+    try:
+        ret = subprocess.run(args, capture_output=True, text=True, timeout=300,
+                             shell=True, check=True)
+    except subprocess.CalledProcessError as e:
+        if e.returncode == 127:
+            raise RuntimeError(
+                "Could not call Vivado simulation tools. Did you source Vivado?") from e
+        raise RuntimeError("Something failed during simulation.") from e
+    if "$finish called at time" in ret.stdout:
+        print("Simulation SUCCESS!")
+        return
+    print("ERROR in Compressor Simulation!")
+    for record in re.findall("Error:.*\n.*\n", ret.stdout)[:1]:  # may be absent
+        print(">> " + record[:-1].replace("\n", "\n>> "))
+    raise SystemExit(-2)
diff --git a/src/finn/compressor/src/utils/__init__.py b/src/finn/compressor/src/utils/__init__.py
new file mode 100644
index 0000000000..a5d76914d9
--- /dev/null
+++ b/src/finn/compressor/src/utils/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor utilities package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/utils/mul_comp_map.py b/src/finn/compressor/src/utils/mul_comp_map.py
new file mode 100644
index 0000000000..951b732be1
--- /dev/null
+++ b/src/finn/compressor/src/utils/mul_comp_map.py
@@ -0,0 +1,58 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Multiplier-to-compressor input mapping utilities
+#############################################################################
+
+class MulCompMap:
+    """Column layout for an na x nb bit multiply; sa/sb presumably flag signed operands (confirm)."""
+    def __init__(self, na: int, nb: int, sa: bool, sb: bool):
+        self.na = na
+        self.nb = nb
+        self.sa = sa
+        self.sb = sb
+    # Column count; the boolean expression acts as 0/1 in the subtraction.
+    def columns(self):
+        return 1 if self.na == 1 and self.nb == 1 else self.nb + self.na - (not self.sb or self.sa)
+
+    def shape(self):
+        """Return one list per column; every entry is a 4-bit code (8 unless patched below)."""
+        (na, nb, sa, sb) = (self.na, self.nb, self.sa, self.sb)
+        res = []
+        if na == 1 and nb == 1:
+            res.append([7 if sa ^ sb else 8])
+        else:
+            col = 0
+            # Ascending triangle: column heights grow from 1 to nb
+            while col < nb:
+                col += 1
+                res.append([8] * col)
+            # Central rectangle: height nb (only entered when na > nb)
+            while col < na:
+                col += 1
+                res.append([8] * nb)
+            # Descending triangle: heights nb + na - col, ending at 1
+            while col < nb + na - 1:
+                col += 1
+                res.append([8] * (nb + na - col))
+
+            # Patch in sign handling
+            if sa:
+                for col in range(na - 1, na + nb - 1):
+                    res[col][0] = ~res[col][0] & 15  # complement the 4-bit code
+            if sb:
+                res[nb].insert(0, 2)
+                for col in range(nb, nb + na - 1):
+                    op = res[col][-1]
+                    res[col][-1] = ((op & 3) << 2) | ((op >> 2) & 3)  # swap the 2-bit halves
+                if not sa:
+                    res.append([13])
+
+        return res
+
+    def absolute_term(self):
+        """Return the sign-dependent constant term (always <= 0)."""
+        (na, nb, sa, sb) = (self.na, self.nb, self.sa, self.sb)
+        return (-1 if sa ^ sb else 0) if na == 1 and nb == 1 else ((-(sa | sb) << nb) | sa) << (na - 1)
diff --git a/src/finn/compressor/src/utils/shape.py b/src/finn/compressor/src/utils/shape.py
new file mode 100644
index 0000000000..5cfdb9ea3d
--- /dev/null
+++ b/src/finn/compressor/src/utils/shape.py
@@ -0,0 +1,51 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Shape representation for compressor bit matrices
+#############################################################################
+
+from typing import Tuple
+from itertools import zip_longest
+
+class Shape:
+    def __init__(self, t: Tuple[int] = ()): self.t = tuple(t)
+
+    def __len__(self): return len(self.t)
+
+    def __iter__(self): return self.t.__iter__()
+    
+    def __getitem__(self, val):
+        if type(val) == int and val >= len(self.t):
+            return 0
+        r = self.t.__getitem__(val)
+        if type(r) == int:
+            return r
+        else:
+            return Shape(r)
+    
+    def __lshift__(self, val):
+        return Shape([0 for el in range(val)] + list(self.t))
+
+    def __add__(self, val):
+        return self.__binary_arithmetic_operation(val, lambda x,y: x+y)
+
+    def __sub__(self, val):
+        return self.__binary_arithmetic_operation(val, lambda x,y: x-y)
+
+    def __binary_arithmetic_operation(self, val, op):
+        if type(val) == int:
+            return Shape([op(el, val) for el in self.t])
+        elif type(val) == Shape:
+            zipped = zip_longest(self.t, val.t, fillvalue=0)
+            return Shape([op(a, b) for a, b in zipped])
+        else:
+            raise RuntimeError("Unsupported type.")
+        
+    def __repr__(self): return f"Shape {self.t[::-1]}"
+    
+    def __eq__(self, other):
+        for col1, col2 in zip_longest(self, other, fillvalue=0):
+            if col1 != col2: return False
+        return True
\ No newline at end of file

From 1d205bca7b2abd0d63801c9db48ec3968ab80b9f Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 14 Apr 2026 12:29:37 +0100
Subject: [PATCH 02/10] [Feature] Integrate compressor into MVAU RTL backend

Wire the compressor generator into FINN's RTL MVAU datapath, enabling
LUT-based dot product computation as an alternative to DSP blocks.

RTL Datapath Changes (finn-rtllib/mvu/):
- mvu_vvu_axi.sv: Add USE_COMPRESSOR parameter and conditional instantiation
- add_multi.sv: Add CATCH_COMP macro for generated compressor module instantiation
- mvu_vvu_axi_wrapper.v: Propagate COMP_PIPELINE_DEPTH parameter

FINN Backend Integration (matrixvectoractivation_rtl.py):
- Add compressor eligibility checks (_is_dotp_comp_eligible)
- Conditionally generate dotp_comp and add_multi compressor modules
- Include generated RTL files in build
- Propagate USE_COMPRESSOR and COMP_PIPELINE_DEPTH template variables

Versal MVAU can use compressor-based compute instead of DSP blocks.
7-Series and UltraScale+ not yet supported.
---
 finn-rtllib/mvu/add_multi.sv                  |  62 ++++++-
 finn-rtllib/mvu/mvu_vvu_axi.sv                |  35 +++-
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v         |   5 +-
 .../rtl/matrixvectoractivation_rtl.py         | 155 ++++++++++++++----
 4 files changed, 218 insertions(+), 39 deletions(-)

diff --git a/finn-rtllib/mvu/add_multi.sv b/finn-rtllib/mvu/add_multi.sv
index 6b45d42e5a..d154ae318d 100644
--- a/finn-rtllib/mvu/add_multi.sv
+++ b/finn-rtllib/mvu/add_multi.sv
@@ -50,13 +50,65 @@ module add_multi import mvu_pkg::*; #(
 	output	logic [SUM_WIDTH-1:0]  sum
 );
 
-	localparam int unsigned  L = $clog2(N);  // Number of levels with reductions
+//---------------------------------------------------------------------------
+// Compressor Path
+//
+// CATCH_COMP entries instantiate a generated compressor module for a
+// specific (N, ARG_WIDTH, delay) triple.  The macro transposes arg[i][j]
+// to the column-major bit-vector expected by the compressor and pads any
+// remaining DEPTH with a shift-register delay.
+//
+// Generated compressors have no en port — when en=0, upstream holds
+// inputs stable and the downstream accumulator does not latch, so
+// correctness is preserved.
 
-	uwire [SUM_WIDTH-1:0]  sum0;
-	if(L < 1) begin : genTrivial
+`define CATCH_COMP(n,w,d) \
+else if(!RESET_ZERO && (N == n) && (ARG_WIDTH == w) && (DEPTH >= d) && (0 <= ARG_LO)) begin : genComp``n``u``w``_d``d \
+	initial $display("[ADD_MULTI_PATH] COMP N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH); \
+\
+	uwire [N*ARG_WIDTH-1:0]  in; \
+	uwire [SUM_WIDTH  -1:0]  out; \
+	for(genvar  i = 0; i < N; i++) begin : genIn \
+		for(genvar  j = 0; j < ARG_WIDTH; j++) begin : genBit \
+				assign	in[j*N+i] = arg[i][j]; \
+		end : genBit \
+	end : genIn \
+	comp_``n``u``w``_d``d comp_inst ( \
+		.clk, \
+		.in, .out \
+	); \
+	initial assert($bits(out) >= $bits(comp_inst.out)) else $warning("CATCH_COMP(%0d,%0d,%0d): compressor output width %0d > SUM_WIDTH %0d", n, w, d, $bits(comp_inst.out), SUM_WIDTH); \
+\
+	localparam int unsigned  COMP_DELAY = d; \
+	localparam int unsigned  SUM_DELAY = DEPTH - COMP_DELAY; \
+	if(SUM_DELAY == 0)  assign  sum = out; \
+	else begin : genDelay \
+			logic [SUM_WIDTH-1:0]  SumZ[SUM_DELAY] = '{ default: 'x }; \
+		always_ff @(posedge clk) begin \
+			if(rst)  SumZ <= '{ default: 'x }; \
+			else begin \
+				for(int unsigned  i = 0; i < SUM_DELAY-1; i++)  SumZ[i] <= SumZ[i+1]; \
+				SumZ[SUM_DELAY-1] <= out; \
+			end \
+		end \
+		assign	sum = SumZ[0]; \
+	end : genDelay \
+end : genComp``n``u``w``_d``d
+
+	if(0) begin end
+	// FINN_GENERATED_COMP_ENTRIES
+
+//- Generic Behavioral Addition ---------
+	else begin : genGeneric
+
+	localparam int unsigned  L = $clog2(N);  // Tree levels
+
+	logic [SUM_WIDTH-1:0]  sum0;
+	if(L < 1) begin : genPassThrough
 		assign	sum0 = arg[0];
-	end : genTrivial
+	end : genPassThrough
 	else begin : genTree
+		initial $display("[ADD_MULTI_PATH] TREE N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH);
 		localparam int unsigned  D = L < DEPTH? L : DEPTH;  // Pipeline stages absorbed by tree
 
 		// Compute the count of decendents for all nodes in the reduction trees.
@@ -129,4 +181,6 @@ module add_multi import mvu_pkg::*; #(
 		assign	sum = SumZ[0];
 	end : genDelay
 
+	end : genGeneric
+
 endmodule : add_multi
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index a890ac9aa3..dca3e9332c 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,6 +64,13 @@ module mvu_vvu_axi #(
 	bit FORCE_BEHAVIORAL = 0,
 	bit M_REG_LUT = 1,
 
+	// LUT-based compressor tree pipeline depth. The default is set for maximum pipelining (in between every stage).
+	int unsigned  COMP_PIPELINE_DEPTH = 1,
+
+	// Set at generation time: indicates whether compressors were generated, i.e. deemed worthwhile.
+	// Decides whether to use LUT-based compressors instead of DSPs.
+	bit USE_COMPRESSOR = 0,
+
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH    = PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
@@ -310,7 +317,19 @@ module mvu_vvu_axi #(
 		localparam int unsigned  A_WIDTH = 25 + 2*(VERSION > 1);     // Width of A datapath
 		localparam int unsigned  NUM_LANES = A_WIDTH == WEIGHT_WIDTH? 1 : 1 + (A_WIDTH - !NARROW_WEIGHTS - WEIGHT_WIDTH) / MIN_LANE_WIDTH;
 
-		if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
+		if(USE_COMPRESSOR) begin : genCompressor
+			dotp_comp #(
+				.PE(PE), .SIMD(DSP_SIMD),
+				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+				.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH)
+			) core (
+				.clk(ap_clk), .rst, .en('1),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		end : genCompressor
+		else if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
 			initial $info("Sidestepping to INT8 mode of DSP58 for %0dx%0d.", WEIGHT_WIDTH, ACTIVATION_WIDTH);
 			mvu_vvu_8sx9_dsp58 #(
 				.IS_MVU(IS_MVU),
@@ -343,11 +362,15 @@ module mvu_vvu_axi #(
 
 	if(1) begin : blkOutput
 		localparam int unsigned  CORE_PIPELINE_DEPTH =
-			VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
-			/* else */    3 + $clog2(SIMD+1) + (SIMD == 1);
-
-		// This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD.
-		localparam int unsigned  MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH;
+			USE_COMPRESSOR? COMP_PIPELINE_DEPTH :
+			VERSION == 3?   3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
+			/* else */      3 + $clog2(SIMD+1) + (SIMD == 1);
+
+		// Floor at the DSP-equivalent depth so the compressor path (shallow pipeline)
+		// still has enough output queue slots to absorb backpressure transients.
+		localparam int unsigned  DSP_PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
+		localparam int unsigned  MAX_IN_FLIGHT =
+			CORE_PIPELINE_DEPTH > DSP_PIPELINE_DEPTH? CORE_PIPELINE_DEPTH : DSP_PIPELINE_DEPTH;
 		typedef logic [PE-1:0][ACCU_WIDTH-1:0]  output_t;
 
 		logic signed [$clog2(MAX_IN_FLIGHT+1):0]  OPtr = '1;	// -1 | 0, 1, ..., MAX_IN_FLIGHT
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 9815d67629..47ffa96ac5 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -45,6 +45,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	NARROW_WEIGHTS = $NARROW_WEIGHTS$,
 	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter	SEGMENTLEN = $SEGMENTLEN$,
+	parameter	COMP_PIPELINE_DEPTH = $COMP_PIPELINE_DEPTH$,
+	parameter	USE_COMPRESSOR = $USE_COMPRESSOR$,
 
 	// Safely deducible parameters
 	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -81,7 +83,8 @@ mvu_vvu_axi #(
 `endif
 	.IS_MVU(IS_MVU), .VERSION(VERSION), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
 	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+	.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH), .USE_COMPRESSOR(USE_COMPRESSOR)
 	) inst (
 	.ap_clk(ap_clk),
 	.ap_clk2x(ap_clk2x),
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index 9cd6fc2a9d..d536e105f8 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -28,12 +28,15 @@
 
 import numpy as np
 import os
+import shutil
 
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
 from finn.util.basic import get_dsp_block, is_versal
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
+from finn.compressor import generate_dotp_comp, generate_add_multi_comps
+
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation_rtl:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
 # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
@@ -51,6 +54,16 @@ def get_nodeattr_types(self):
         my_attrs = {
             # Double-pumped DSPs enabled
             "pumpedCompute": ("i", False, 0, {0, 1}),
+            # Compressor module name (set by generate_hdl when compressor is used)
+            "comp_module_name": ("s", False, ""),
+            # add_multi compressor module names, semicolon-separated
+            "add_multi_comp_names": ("s", False, ""),
+            # add_multi compressor specs for synthesis aggregation
+            # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
+            "add_multi_comp_specs": ("s", False, ""),
+            # Force disable LUT-based compressors (for benchmarking/comparison)
+            # 0 = auto (use compressor when eligible), 1 = force disable
+            "noCompressor": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(MVAU.get_nodeattr_types(self))
         my_attrs.update(RTLBackend.get_nodeattr_types(self))
@@ -160,25 +173,59 @@ def dsp_estimation(self, fpgapart):
             mult_dsp = np.ceil(P / 4) * Q
         return int(mult_dsp)
 
-    def instantiate_ip(self, cmd):
-        # instantiate the RTL IP
-        node_name = self.onnx_node.name
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        sourcefiles = [
+    def _get_rtl_source_files(self, abspath=True):
+        """
+        Build the list of RTL source files for this node, including any
+        generated compressor files. Used by both instantiate_ip() and
+        get_rtl_file_list() to avoid duplication.
+        """
+        if abspath:
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+        else:
+            code_gen_dir = ""
+            rtllib_dir = ""
+
+        base_files = [
             "mvu_pkg.sv",
             "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
-            "add_multi.sv",
         ]
         sourcefiles = [
             os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in sourcefiles]
+        ] + [rtllib_dir + f for f in base_files]
+
+        # Add compressor files if dotp_comp was generated
+        comp_name = self.get_nodeattr("comp_module_name")
+        if comp_name:
+            comp_hdl_dir = os.path.join(
+                os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
+            sourcefiles.append(os.path.join(code_gen_dir, "dotp_comp.sv"))
+            sourcefiles.append(os.path.join(comp_hdl_dir, "mul_comp_map.sv"))
+            sourcefiles.append(os.path.join(code_gen_dir, comp_name + ".sv"))
+            # dotp_comp path doesn't need add_multi.sv
+        else:
+            # DSP path: add_multi.sv always exists in code_gen_dir
+            # (either patched with comps or copy of template)
+            sourcefiles.append(os.path.join(code_gen_dir, "add_multi.sv"))
+            add_multi_names_str = self.get_nodeattr("add_multi_comp_names")
+            if add_multi_names_str:
+                # Add compressor modules if present
+                for name in add_multi_names_str.split(";"):
+                    sourcefiles.append(os.path.join(code_gen_dir, name + ".sv"))
+
+        return sourcefiles
+
+    def instantiate_ip(self, cmd):
+        # instantiate the RTL IP
+        node_name = self.onnx_node.name
+        sourcefiles = self._get_rtl_source_files(abspath=True)
 
         for f in sourcefiles:
             cmd.append("add_files -norecurse %s" % (f))
+
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "internal_decoupled" or self.get_nodeattr("mlo_max_iter"):
             cmd.append(
@@ -268,6 +315,35 @@ def _resolve_dsp_version(self, dsp_block):
             case _:
                 return 1
 
+    def _is_dotp_comp_eligible(self, fpgapart, ww, aw, pumped_compute):
+        """
+        Check if LUT-based compressor should replace the DSP compute path.
+        Returns True when: non-pumped, small operands (WW <= 4 and AW <= 4),
+        and target is Versal or 7-Series (not UltraScale+).
+        """
+        # Check if compressors are force-disabled (for benchmarking)
+        if self.get_nodeattr("noCompressor"):
+            return False
+        if pumped_compute or ww > 4 or aw > 4:
+            return False
+        dsp_block = get_dsp_block(fpgapart)
+        # DSP48E2 (UltraScale+) excluded: no compressor target exists for its
+        # CARRY8 primitives — generator only supports Versal and 7-Series.
+        return dsp_block in ("DSP58", "DSP48E1")
+        
+
+    def _is_add_multi_comp_eligible(self, version, simd):
+        """
+        Check if add_multi lane reductions should use LUT compressors.
+        Returns True when: not UltraScale+ (version != 2) and SIMD >= 4
+        (below 4 inputs, compressors offer no benefit over binary adder tree).
+        """
+        # Check if compressors are force-disabled (for benchmarking)
+        if self.get_nodeattr("noCompressor"):
+            return False
+        # version 2 = DSP48E2 (UltraScale+) blocked for same reason as above.
+        return version != 2 and simd >= 4
+
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -286,6 +362,46 @@ def generate_hdl(self, model, fpgapart, clk):
             else 1
         )
         code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights)
+
+        # Extract params from code_gen_dict for compressor generation.
+        simd = int(code_gen_dict["$SIMD$"][0])
+        ww = int(code_gen_dict["$WEIGHT_WIDTH$"][0])
+        aw = int(code_gen_dict["$ACTIVATION_WIDTH$"][0])
+        accu_width = int(code_gen_dict["$ACCU_WIDTH$"][0])
+        signed_act = int(code_gen_dict["$SIGNED_ACTIVATIONS$"][0]) != 0
+        pumped_compute = int(code_gen_dict["$PUMPED_COMPUTE$"][0])
+        version = int(code_gen_dict["$VERSION$"][0])
+
+        # Compressor generation if applicable.
+        if self._is_dotp_comp_eligible(fpgapart, ww, aw, pumped_compute):
+            result = generate_dotp_comp(
+                fpgapart, simd, ww, aw, accu_width, signed_act, code_gen_dir)
+            code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(result["comp_delay"])]
+            code_gen_dict["$USE_COMPRESSOR$"] = [str(1)]
+            self.set_nodeattr("comp_module_name", result["comp_name"])
+        else:
+            # Generate add_multi.sv (either patched with comps or template copy)
+            # Check if add_multi should use compressors (respects noCompressor attribute)
+            if self._is_add_multi_comp_eligible(version, simd):
+                result = generate_add_multi_comps(
+                    fpgapart, version, simd, ww, aw, accu_width,
+                    narrow_weights, code_gen_dir)
+                if result["comp_names"]:
+                    self.set_nodeattr("add_multi_comp_names",
+                                      ";".join(result["comp_names"]))
+                    # Store compressor specs for synthesis aggregation
+                    # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
+                    specs_str = ";".join(
+                        f"{n},{w},{d}" for n, w, d in result.get("comp_specs", [])
+                    )
+                    self.set_nodeattr("add_multi_comp_specs", specs_str)
+            else:
+                # Compressors disabled: copy template add_multi.sv (binary adder tree)
+                rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+                dest = os.path.join(code_gen_dir, "add_multi.sv")
+                shutil.copy(os.path.join(rtllib_dir, "add_multi.sv"), dest)
+                result = {"comp_names": [], "files": [dest]}
+
         # add general parameters to dictionary
         code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
@@ -351,30 +467,13 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(1)]
+        code_gen_dict["$USE_COMPRESSOR$"] = [str(0)]
 
         return template_path, code_gen_dict
 
     def get_rtl_file_list(self, abspath=False):
-        if abspath:
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
-            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        else:
-            code_gen_dir = ""
-            rtllib_dir = ""
-
-        verilog_files = [
-            "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
-            "replay_buffer.sv",
-            "mvu.sv",
-            "mvu_vvu_8sx9_dsp58.sv",
-            "add_multi.sv",
-        ]
-        verilog_files = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in verilog_files]
-
-        return verilog_files
+        return self._get_rtl_source_files(abspath=abspath)
 
     def get_verilog_paths(self):
         verilog_paths = super().get_verilog_paths()

From 4be4fd3d268927e7b60021cbd4d1bc2d0f6a997e Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 14 Apr 2026 12:35:27 +0100
Subject: [PATCH 03/10] [Tests] Add compressor test suite

Test infrastructure:
- XSim testbench templates (dotp_comp_tb, add_multi_comp_tb, mul_comp_map_tb)
- Vivado TCL simulation scripts (dotp_comp, add_multi_comp, dotp)
- Test runner scripts: run_tests.sh (21 core configs), run_dotp_comp_tests.sh (8 configs), run_add_multi_comp_tests.sh (8 configs)
- Common test utilities (test_common.sh)
---
 .../hdl/add_multi_comp_tb_template.sv         | 142 ++++++++
 .../hdl/add_multi_comp_template.tcl           |  35 ++
 .../compressor/hdl/dotp_comp_tb_template.sv   | 303 ++++++++++++++++++
 .../compressor/hdl/dotp_comp_template.tcl     |  32 ++
 src/finn/compressor/hdl/dotp_tb_template.sv   |  59 ++++
 src/finn/compressor/hdl/dotp_template.tcl     |  26 ++
 src/finn/compressor/hdl/mul_comp_map_tb.sv    |  41 +++
 src/finn/compressor/lib/test_common.sh        | 101 ++++++
 .../compressor/run_add_multi_comp_tests.sh    | 135 ++++++++
 src/finn/compressor/run_dotp_comp_tests.sh    | 143 +++++++++
 src/finn/compressor/run_tests.sh              | 105 ++++++
 11 files changed, 1122 insertions(+)
 create mode 100644 src/finn/compressor/hdl/add_multi_comp_tb_template.sv
 create mode 100644 src/finn/compressor/hdl/add_multi_comp_template.tcl
 create mode 100644 src/finn/compressor/hdl/dotp_comp_tb_template.sv
 create mode 100644 src/finn/compressor/hdl/dotp_comp_template.tcl
 create mode 100644 src/finn/compressor/hdl/dotp_tb_template.sv
 create mode 100644 src/finn/compressor/hdl/dotp_template.tcl
 create mode 100644 src/finn/compressor/hdl/mul_comp_map_tb.sv
 create mode 100644 src/finn/compressor/lib/test_common.sh
 create mode 100755 src/finn/compressor/run_add_multi_comp_tests.sh
 create mode 100755 src/finn/compressor/run_dotp_comp_tests.sh
 create mode 100755 src/finn/compressor/run_tests.sh

diff --git a/src/finn/compressor/hdl/add_multi_comp_tb_template.sv b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv
new file mode 100644
index 0000000000..b5327262ce
--- /dev/null
+++ b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv
@@ -0,0 +1,142 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench template for add_multi compressor cores
+ * @author	Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Standalone testbench for add_multi compressor (comp_NuW_dD).
+ * Tests the compressor directly without requiring add_multi.sv.
+ *
+ * Template placeholders expanded by run_add_multi_comp_tests.sh:
+ *   {n}           - Number of addends
+ *   {arg_width}   - Bit width of each addend
+ *   {depth}       - Pipeline depth of compressor
+ *   {label}       - Configuration label (e.g. n8_w4_p2)
+ *   {comp_module} - Generated compressor module name (e.g. comp_8u4_d0)
+ *****************************************************************************/
+
+module add_multi_comp_{label}_tb;
+
+	localparam int unsigned  N         = {n};
+	localparam int unsigned  ARG_WIDTH = {arg_width};
+	localparam int unsigned  DEPTH     = {depth};
+	localparam int unsigned  IN_WIDTH  = N * ARG_WIDTH;
+	// Use same formula as mvu_pkg::sumwidth() for consistency
+	localparam int unsigned  SUM_WIDTH = $clog2(N) + ARG_WIDTH;
+	localparam int unsigned  ROUNDS    = 257;
+
+	//-----------------------------------------------------------------------
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+
+	logic  rst = 1;
+	initial begin
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	bit  done = 0;
+	always_comb begin
+		if(done)  $finish;
+	end
+
+	//-----------------------------------------------------------------------
+	// DUT: direct compressor instantiation
+	logic [IN_WIDTH-1:0]   in;
+	logic [SUM_WIDTH-1:0]  out;
+
+	{comp_module} dut (
+		.clk,
+		.in,
+		.out
+	);
+
+	//-----------------------------------------------------------------------
+	// Transpose function: convert row-major to column-major format.
+	//
+	// The compressor expects inputs in column-major (bit-slice) order:
+	//   in[0..N-1]       = bit 0 of all N addends
+	//   in[N..2N-1]      = bit 1 of all N addends
+	//   ...
+	//   in[(W-1)*N..W*N-1] = bit W-1 of all N addends
+	//
+	// This matches the transpose in add_multi.sv CATCH_COMP macro:
+	//   assign in[j*N+i] = arg[i][j];
+	//
+	// Without this transpose, addend bits would be misaligned and produce
+	// incorrect sums.
+	//-----------------------------------------------------------------------
+	function automatic logic [IN_WIDTH-1:0] transpose(
+		input logic [IN_WIDTH-1:0] row_major
+	);
+		logic [IN_WIDTH-1:0] col_major;
+		for(int i = 0; i < N; i++) begin
+			for(int j = 0; j < ARG_WIDTH; j++) begin
+				col_major[j*N + i] = row_major[i*ARG_WIDTH + j];
+			end
+		end
+		return col_major;
+	endfunction
+
+	//-----------------------------------------------------------------------
+	// Input Feed
+	int  Q[$];
+	initial begin
+		in = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(ROUNDS) begin
+			automatic logic [IN_WIDTH-1:0]  aa;
+			automatic int  exp = 0;
+			void'(std::randomize(aa));
+
+			// Compute expected sum from row-major input
+			for(int unsigned i = 0; i < N; i++) begin
+				exp += aa[i*ARG_WIDTH +: ARG_WIDTH];
+			end
+
+			// Transpose to column-major before feeding compressor
+			in <= transpose(aa);
+			Q.push_back(exp);
+			@(posedge clk);
+		end
+
+		in <= 'x;
+		repeat(DEPTH + 10) @(posedge clk);
+
+		assert(Q.size == 0) else begin
+			$error("Missing %0d outputs.", Q.size);
+		end
+		done = 1;
+	end
+
+	//-----------------------------------------------------------------------
+	// Output Checker
+	int unsigned  Checks = 0;
+	int unsigned  Errors = 0;
+	initial begin
+		@(posedge clk iff !rst);
+		repeat(DEPTH) @(posedge clk);
+		repeat(ROUNDS) @(posedge clk) begin
+			automatic int  exp = Q.pop_front();
+			automatic int  hav = out;
+			assert(hav == exp) else begin
+				$error("Output mismatch %0d instead of %0d.", hav, exp);
+				$stop;
+				Errors <= Errors + 1;
+			end
+			Checks <= Checks + 1;
+		end
+	end
+
+	final begin
+		$display("Performed %0d checks with %0d errors.", Checks, Errors);
+		assert(Checks == ROUNDS) else  $error("Unexpected number of checks: %0d instead of %0d.", Checks, ROUNDS);
+	end
+
+endmodule : add_multi_comp_{label}_tb
diff --git a/src/finn/compressor/hdl/add_multi_comp_template.tcl b/src/finn/compressor/hdl/add_multi_comp_template.tcl
new file mode 100644
index 0000000000..c32279e518
--- /dev/null
+++ b/src/finn/compressor/hdl/add_multi_comp_template.tcl
@@ -0,0 +1,35 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for add_multi compressor testbench
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Vivado batch flow for standalone add_multi compressor test.
+# Behavioral simulation only — verifies the generated compressor produces correct sums.
+#
+# Template placeholders expanded by run_add_multi_comp_tests.sh:
+#   {label}   - Configuration label (e.g. n8_w4_p2)
+#   {tb}      - Testbench module name
+#   {gen_dir} - Absolute path to gen/<label>/
+
+set label {label}
+set tb {tb}
+set part {part}
+create_project -force add_multi_comp_$label add_multi_comp_$label.vivado -part $part
+
+# Design sources: only the generated compressor
+read_verilog -sv {*}[glob {gen_dir}/comp_*.sv]
+
+# Testbench
+set simset [current_fileset -simset]
+add_files -fileset $simset {gen_dir}/{tb}.sv
+set_property top $tb $simset
+set_property xsim.simulate.runtime all $simset
+
+launch_simulation
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/dotp_comp_tb_template.sv b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
new file mode 100644
index 0000000000..d6d7841e07
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
@@ -0,0 +1,303 @@
+/******************************************************************************
+ * Testbench for LUT-based dotp_comp module.
+ * Exercises the finnlib protocol: clk, rst, en, last, zero, w, a -> vld, p
+ *
+ * Generated from template for config: PE={pe}, SIMD={simd}, WW={ww}, AW={aw}
+ ******************************************************************************/
+module dotp_comp_{full_sig}_tb;
+
+	localparam int unsigned  ROUNDS = 217;
+
+	localparam int unsigned  PE   = {pe};
+	localparam int unsigned  SIMD = {simd};
+	localparam int unsigned  WEIGHT_WIDTH     = {ww};
+	localparam int unsigned  ACTIVATION_WIDTH = {aw};
+	localparam int unsigned  ACCU_WIDTH       = {accu_width};
+	localparam bit  SIGNED_ACTIVATIONS = {signed_act};
+
+	typedef logic signed [WEIGHT_WIDTH    -1:0]  weight_t;
+	typedef logic        [ACTIVATION_WIDTH-1:0]  activation_t;
+	typedef logic signed [ACCU_WIDTH      -1:0]  accu_t;
+
+	//-----------------------------------------------------------------------
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst = 1;
+	initial begin
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	//-----------------------------------------------------------------------
+	// DUT
+	logic  en;
+	logic  last;
+	logic  zero;
+	weight_t     [PE-1:0][SIMD-1:0]  w;
+	activation_t         [SIMD-1:0]  a;
+	uwire  vld;
+	accu_t [PE-1:0]  p;
+
+	dotp_comp #(
+		.PE(PE), .SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.COMP_PIPELINE_DEPTH({comp_depth})
+	) dut (
+		.clk, .rst, .en,
+		.last, .zero, .w, .a,
+		.vld, .p
+	);
+
+	//-----------------------------------------------------------------------
+	// Input Feed & Reference Model
+	accu_t [PE-1:0]  Q[$];
+	int unsigned  RoundsPushed = 0;
+
+	// Drive one dot-product round with given weights/activations on a single
+	// enabled cycle (en=1, last=1, zero=0).  Computes the expected accumulator
+	// value and pushes it into the checker queue.
+	// Results drain naturally when en resumes in subsequent tests or the
+	// final flush — no en=1 drain cycles needed.
+	task automatic feed_single(
+		input weight_t     [PE-1:0][SIMD-1:0]  ww,
+		input activation_t         [SIMD-1:0]  aa
+	);
+		automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+		for(int unsigned  pe = 0; pe < PE; pe++) begin
+			for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+				pp[pe] += $signed(ww[pe][simd])
+					* $signed({SIGNED_ACTIVATIONS && aa[simd][ACTIVATION_WIDTH-1], aa[simd]});
+			end
+		end
+		en   <= 1;
+		last <= 1;
+		zero <= 0;
+		w    <= ww;
+		a    <= aa;
+		@(posedge clk);
+		en   <= 0;
+		last <= 'x;
+		zero <= 'x;
+		w    <= 'x;
+		a    <= 'x;
+		Q.push_back(pp);
+		RoundsPushed++;
+	endtask : feed_single
+
+	task automatic feed_zero_round();  // drive one enabled cycle with zero=1 and queue an all-zero expectation
+		automatic accu_t [PE-1:0]  pp = '{ default: '0 };  // expected result for every PE
+		en   <= 1;
+		last <= 1;  // single-cycle round, same framing as feed_single
+		zero <= 1;
+		w    <= '0;
+		a    <= '0;
+		@(posedge clk);
+		en   <= 0;  // back to idle; payload signals are driven to 'x while en is low
+		last <= 'x;
+		zero <= 'x;
+		w    <= 'x;
+		a    <= 'x;
+		Q.push_back(pp);  // consumed by the output checker
+		RoundsPushed++;  // compared against Checks in the final block
+	endtask : feed_zero_round
+
+	initial begin
+		en = 0;
+		last = 'x;
+		zero = 'x;
+		w = 'x;
+		a = 'x;
+		@(posedge clk iff !rst);
+
+		//---------------------------------------------------------------
+		// Directed edge-case tests
+		//---------------------------------------------------------------
+
+		// All zeros
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
+			automatic activation_t         [SIMD-1:0]  aa = '0;
+			feed_single(ww, aa);
+		end
+
+		// Zero round via zero flag
+		feed_zero_round();
+
+		// All ones
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '1;
+			automatic activation_t         [SIMD-1:0]  aa = '1;
+			feed_single(ww, aa);
+		end
+
+		// Max positive weights, all-ones activations
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+			automatic activation_t         [SIMD-1:0]  aa = '1;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				for(int unsigned  s = 0; s < SIMD; s++)
+					ww[pe][s] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			feed_single(ww, aa);
+		end
+
+		// Single SIMD lane active (first)
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
+			automatic activation_t         [SIMD-1:0]  aa = '0;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				ww[pe][0] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			aa[0] = '1;
+			feed_single(ww, aa);
+		end
+
+		// Single SIMD lane active (last)
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
+			automatic activation_t         [SIMD-1:0]  aa = '0;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				ww[pe][SIMD-1] = '1;
+			aa[SIMD-1] = '1;
+			feed_single(ww, aa);
+		end
+
+		// Alternating weights: +max, -max, +max, ...
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+			automatic activation_t         [SIMD-1:0]  aa = '1;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				for(int unsigned  s = 0; s < SIMD; s++)
+					ww[pe][s] = s[0] ? '1 : {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			feed_single(ww, aa);
+		end
+
+		// Multi-cycle accumulation: 3 cycles then last
+		begin
+			automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+			for(int unsigned  cyc = 0; cyc < 3; cyc++) begin
+				automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+				automatic activation_t         [SIMD-1:0]  aa;
+				for(int unsigned  pe = 0; pe < PE; pe++)
+					for(int unsigned  s = 0; s < SIMD; s++)
+						ww[pe][s] = weight_t'(cyc + 1);
+				for(int unsigned  s = 0; s < SIMD; s++)
+					aa[s] = activation_t'(s + 1);
+
+				for(int unsigned  pe = 0; pe < PE; pe++)
+					for(int unsigned  s = 0; s < SIMD; s++)
+						pp[pe] += $signed(ww[pe][s])
+							* $signed({SIGNED_ACTIVATIONS && aa[s][ACTIVATION_WIDTH-1], aa[s]});
+
+				en   <= 1;
+				last <= (cyc == 2) ? 1 : 0;
+				zero <= 0;
+				w    <= ww;
+				a    <= aa;
+				@(posedge clk);
+			end
+			en   <= 0;
+			last <= 'x;
+			zero <= 'x;
+			w    <= 'x;
+			a    <= 'x;
+			Q.push_back(pp);
+			RoundsPushed++;
+		end
+
+		//---------------------------------------------------------------
+		// Randomized tests
+		//---------------------------------------------------------------
+		repeat(ROUNDS) begin
+			automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+			do begin
+				en <= 0;
+				last <= 'x;
+				zero <= 'x;
+				w <= 'x;
+				a <= 'x;
+				if($urandom()%31 != 0) begin
+					en <= 1;
+					if($urandom()%19 == 0)  zero <= 1;
+					else begin
+						automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+						automatic activation_t         [SIMD-1:0]  aa;
+						void'(std::randomize(ww, aa));
+
+						for(int unsigned  pe = 0; pe < PE; pe++) begin
+							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+								automatic accu_t  m0 = $signed(ww[pe][simd])
+									* $signed({SIGNED_ACTIVATIONS && aa[simd][ACTIVATION_WIDTH-1], aa[simd]});
+								automatic accu_t  p0 = $signed(pp[pe]) + m0;
+								// Avoid overflow by zeroing offending weight
+								if(((m0 < 0) == ($signed(pp[pe]) < 0)) && ((m0 < 0) != (p0 < 0)))
+									ww[pe][simd] = 0;
+								else
+									pp[pe] = p0;
+							end
+						end
+
+						zero <= 0;
+						w <= ww;
+						a <= aa;
+					end
+					last <= $urandom() % 137 == 0;
+				end
+				@(posedge clk);
+			end
+			while(!en || !last);
+			Q.push_back(pp);
+			RoundsPushed++;
+		end
+
+		// Flush: keep en=1 with zero=1 for pipeline to drain
+		en <= 1;
+		last <= 0;
+		zero <= 1;
+		w <= '0;
+		a <= '0;
+		repeat(20) @(posedge clk);
+
+		assert(Q.size == 0) else begin
+			$error("Missing %0d outputs.", Q.size);
+			$stop;
+		end
+
+		$display("Test completed successfully.");
+		$finish;
+	end
+
+	//-----------------------------------------------------------------------
+	// Output Checker
+	int unsigned  Checks = 0;
+	always_ff @(posedge clk iff !rst) begin
+		if(en && vld) begin
+			automatic accu_t [PE-1:0]  exp;
+
+			assert(Q.size > 0) else begin
+				$error("Spurious output: %0p.", p);
+				$stop;
+			end
+
+			exp = Q.pop_front();
+			assert(p === exp) else begin
+				$error("Output mismatch: got %0p, expected %0p.", p, exp);
+				$stop;
+			end
+
+			Checks <= Checks + 1;
+		end
+	end
+
+	final begin
+		assert(Checks == RoundsPushed)
+			$display("Successfully performed %0d checks (%0d directed + %0d random).",
+				Checks, RoundsPushed - ROUNDS, ROUNDS);
+		else
+			$error("Unexpected number of checks: %0d instead of %0d.",
+				Checks, RoundsPushed);
+	end
+
+endmodule : dotp_comp_{full_sig}_tb
diff --git a/src/finn/compressor/hdl/dotp_comp_template.tcl b/src/finn/compressor/hdl/dotp_comp_template.tcl
new file mode 100644
index 0000000000..1e8b2fc482
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_template.tcl
@@ -0,0 +1,32 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for dot product compressor with accumulation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Create Fresh Project
+set label {label}
+set src_dir {src_dir}
+set tb dotp_comp_{label}_tb
+set part {part}
+create_project -force dotp_comp_$label dotp_comp_$label.vivado -part $part
+
+# Import Design and Simulation Sources
+# Static: mul_comp_map interface
+# Expanded: dotp_comp.sv (template with $COMP_MODULE_NAME$ filled in)
+# Generated: comp_<sig>.sv (config-specific compressor core)
+read_verilog -sv $src_dir/hdl/mul_comp_map.sv $src_dir/gen/$label/dotp_comp.sv {*}[glob $src_dir/gen/$label/comp_*.sv]
+set simset [current_fileset -simset]
+add_files -fileset $simset $src_dir/gen/$label/$tb.sv
+set_property file_type SystemVerilog [get_files -of_objects $simset $src_dir/gen/$label/$tb.sv]
+set_property top $tb $simset
+set_property xsim.simulate.runtime all $simset
+
+# Run Simulation
+launch_simulation
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/dotp_tb_template.sv b/src/finn/compressor/hdl/dotp_tb_template.sv
new file mode 100644
index 0000000000..a5d7dbafd6
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_tb_template.sv
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench template for standalone dot product compressor
+ *****************************************************************************/
+
+module dotp_{n}x{sa}{na}{sb}{nb}_tb #(
+	localparam int unsigned  N = {n},
+	localparam int unsigned  NA = {na},
+	localparam int unsigned  NB = {nb},
+	localparam bit  SIGNED_A = {signed_a},
+	localparam bit  SIGNED_B = {signed_b},
+
+	localparam int unsigned  NP = NA > 1?
+		$clog2(N) + (!SIGNED_B && (NB == 1)? NA : NA+NB) :
+		SIGNED_A ^^ SIGNED_B? 1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/,
+	localparam bit  SIGNED_P = NA == 1? SIGNED_A ^^ SIGNED_B : SIGNED_A || SIGNED_B
+)();
+	uwire  clk = 'z;
+
+	logic [N-1:0][NA-1:0]  a;
+	logic [N-1:0][NB-1:0]  b;
+	uwire [NP-1:0]  p;
+
+	dotp_{n}x{sa}{na}{sb}{nb} dut (
+		.clk,
+		.a, .b, .p
+	);
+
+	initial begin
+		repeat(137) begin
+			automatic type(a)  aa;
+			automatic type(b)  bb;
+			automatic int  pp = 0;
+			automatic int  px;
+			void'(std::randomize(aa, bb));
+			for(int unsigned  i = 0; i < N; i++) begin
+				automatic logic  sa = SIGNED_A && aa[i][NA-1];
+				automatic logic  sb = SIGNED_B && bb[i][NB-1];
+				pp += $signed({sa, aa[i]}) * $signed({sb, bb[i]});
+			end
+
+			a <= aa;
+			b <= bb;
+			#10ns;
+			px = $signed({ SIGNED_P && p[NP-1], p });
+			assert((^p !== 1'bx) && (px == pp)) else begin
+				$error("Received %0d [0x%0x] instead of %0d.", px, p, pp);
+				$stop;
+			end
+		end
+
+		$display("Test completed.");
+		$finish;
+	end
+
+endmodule : dotp_{n}x{sa}{na}{sb}{nb}_tb
diff --git a/src/finn/compressor/hdl/dotp_template.tcl b/src/finn/compressor/hdl/dotp_template.tcl
new file mode 100644
index 0000000000..d9a9110f97
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_template.tcl
@@ -0,0 +1,26 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for standalone dot product compressor
+#############################################################################
+
+# Create Fresh Project
+set sig {n}x{sa}{na}{sb}{nb}
+set top dotp_$sig
+set part {part}
+create_project -force $top $top.vivado -part $part
+
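+# Import Design and Simulation Sources (paths are relative to Vivado's working directory; NOTE(review): run_tests.sh starts Vivado from WORK_DIR - confirm hdl/ and gen/ resolve there)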
+read_verilog -sv hdl/mul_comp_map.sv gen/comp_$sig.sv gen/$top.sv
+set simset [current_fileset -simset]
+add_files -fileset $simset gen/${top}_tb.sv
+set_property top ${top}_tb $simset
+set_property xsim.simulate.runtime all $simset
+
+# Run Simulation
+launch_simulation
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/mul_comp_map_tb.sv b/src/finn/compressor/hdl/mul_comp_map_tb.sv
new file mode 100644
index 0000000000..c7432f1d9b
--- /dev/null
+++ b/src/finn/compressor/hdl/mul_comp_map_tb.sv
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench for multiplier-to-compressor mapping verification
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>, Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Quick visualizer for compressor input broadcasting.
+ */
+
+module mul_comp_map_tb;
+	localparam int unsigned  NA = 5;
+	localparam int unsigned  NB = 4;
+	localparam bit  SIGNED_A = 1;
+	localparam bit  SIGNED_B = 1;
+	logic [NA-1:0]  a;
+	logic [NB-1:0]  b;
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map (.ia(a), .ib(b));
+
+	initial begin
+		automatic int unsigned  col = 0;
+		automatic int unsigned  row = 0;
+		a = '0;
+		b = '1;
+
+		#5ns;
+		for(int unsigned  i = 0; i < $bits(map.oa); i++) begin
+			$write("\t%0b.%0d.%0b", map.oa[i], map.gate_op(i), map.ob[i]);
+			if(++row == map.height(col)) begin
+				$display();
+				col++;
+				row = 0;
+			end
+		end
+		$display("\t%0b", map.absolute_term());
+	end
+
+endmodule : mul_comp_map_tb
diff --git a/src/finn/compressor/lib/test_common.sh b/src/finn/compressor/lib/test_common.sh
new file mode 100644
index 0000000000..0f2e03b177
--- /dev/null
+++ b/src/finn/compressor/lib/test_common.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Common shell utilities for compressor testing
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Common test utilities for compressor integration tests.
+# Source this file from test scripts.
+
+# Worker pool state (must be declared by sourcing script if not already)
+declare -A workers 2>/dev/null || true
+declare -A errcodes 2>/dev/null || true
+
+# Reap finished background workers; keep polling until at most $1 are still running.
+function collect_workers {
+	local limit="$1" wpid name rc
+	until false; do
+		for wpid in "${!workers[@]}"; do
+			# kill -0 only probes for existence; a live worker is left alone.
+			kill -0 "$wpid" 2>/dev/null && continue
+			name=${workers["$wpid"]}
+			wait "$wpid"; rc=$?
+			errcodes["$name"]="$rc"
+			unset "workers[$wpid]"
+			echo "- $name -> $rc"
+		done
+		# Done as soon as few enough workers remain registered.
+		[ "${#workers[@]}" -le "$limit" ] && return
+		sleep 5
+	done
+}
+
+# Launch function $2 with label $1 in the background and register its PID in workers.
+function start_worker {
+	local name="$1" fn="$2"
+	printf '+ %s ...\n' "$name"
+	"$fn" "$name" &
+	workers[$!]="$name"
+}
+
+# Check Vivado output file for errors. Returns the error count, saturated at 255.
+# Usage: check_vivado_errors <output_file> <label>
+function check_vivado_errors {
+	local out="$1" label="$2"
+	local err_count tcl_err_count success_count
+
+	# Check if output file exists
+	if [ ! -f "$out" ]; then
+		echo "ERROR: Vivado output file not found for $label: $out" >&2
+		return 1
+	fi
+
+	# Count error lines; "|| true" because grep -c exits non-zero when it counts 0 matches
+	err_count=$(grep -ic '^Error: ' "$out" || true)
+	tcl_err_count=$(grep -Eic "can't read \"|invalid command name|no such variable|^ERROR: \[Common" "$out" || true)
+
+	# Check for positive completion indicators
+	success_count=$(grep -ic "Successfully performed\|Test completed successfully\|Test completed\.\|Performed.*checks" "$out" || true)
+
+	# TCL errors are fatal
+	if [ "$tcl_err_count" -gt 0 ]; then
+		echo "ERROR: Vivado/Tcl failed for $label (tcl_errors=$tcl_err_count)." >&2
+		return 1
+	fi
+
+	# If no Vivado errors but also no success message, simulation may have crashed
+	if [ "$err_count" -eq 0 ] && [ "$success_count" -eq 0 ]; then
+		# Check if simulation even started
+		if ! grep -q "launch_simulation\|xsim.*-runall\|run all" "$out"; then
+			echo "ERROR: Simulation did not run for $label (no launch detected)." >&2
+			return 1
+		fi
+		echo "WARNING: No success message found for $label (may have incomplete simulation)." >&2
+		# Don't fail here, just warn - some tests might not have explicit success messages
+	fi
+	# Exit statuses wrap modulo 256: saturate so that e.g. 256 errors cannot read as PASS.
+	return "$(( err_count > 255 ? 255 : err_count ))"
+}
+
+# Report PASS/FAIL for each entry of the global LABELS array using errcodes; status 1 if any failed.
+function print_summary {
+	local name rc verdict failed=0
+	echo -e "Summary:\n"
+	for name in "${LABELS[@]}"; do
+		rc="${errcodes[$name]}"
+		# Start from the failure text; only a zero code flips it to PASS.
+		verdict=$'\e[91;1mFAIL\e[0m'" (errors: $rc)"
+		if [ "$rc" -eq 0 ]; then
+			verdict=$'\e[92;1mPASS\e[0m'
+		else
+			failed=1
+		fi
+		printf '  %-40s %s\n' "$name" "$verdict"
+	done
+	echo
+	return "$failed"
+}
diff --git a/src/finn/compressor/run_add_multi_comp_tests.sh b/src/finn/compressor/run_add_multi_comp_tests.sh
new file mode 100755
index 0000000000..ccb463920c
--- /dev/null
+++ b/src/finn/compressor/run_add_multi_comp_tests.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test runner for add_multi compressor verification
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Run standalone add_multi compressor tests.
+# For each (N, ARG_WIDTH) configuration:
+#   1. Generate comp_NuW_dD.sv via add_multi_finn.py
+#   2. Expand TB and TCL templates
+#   3. Run XSim via Vivado
+#
+# Usage: ./run_add_multi_comp_tests.sh [versal|7series]
+# Prerequisites: Vivado on PATH
+
+((${KEEP_LOG:=0}))
+((${MAX_WORKERS:=12}))
+TARGET="${1:-versal}"  # Default to versal
+
+if ! command -v vivado >/dev/null 2>&1; then
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS"
+echo "Target: $TARGET"
+
+# Paths
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+HDL_DIR="$SCRIPT_DIR/hdl"
+GEN_BASE="$SCRIPT_DIR/gen"
+FINN_SRC="$(cd "$SCRIPT_DIR/../.." && pwd)"
+export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
+: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"
+
+source "$SCRIPT_DIR/lib/test_common.sh"
+
+# Test configs: --n N --arg_width W [-p pipeline_every]
+TESTS=(
+	"--n 8  --arg_width 4"
+	"--n 8  --arg_width 4  -p 2"
+	"--n 16 --arg_width 3"
+	"--n 16 --arg_width 6  -p 2"
+	"--n 32 --arg_width 6  -p 2"
+	"--n 32 --arg_width 16 -p 2"
+	"--n 47 --arg_width 5  -p 2"
+	"--n 56 --arg_width 8  -p 2"
+)
+
+function parse_config {  # parse one TESTS entry into the CFG_N/CFG_W/CFG_P_FLAG/CFG_LABEL/CFG_PART globals
+	local n="" w="" p=""
+	CFG_P_FLAG=""  # reset per call (as run_dotp_comp_tests.sh does) so a stale "-p" cannot leak between configs
+	while [[ $# -gt 0 ]]; do
+		case "$1" in
+			--n)         n="$2"; shift 2;;
+			--arg_width) w="$2"; shift 2;;
+			-p)          p="$2"; CFG_P_FLAG="-p $2"; shift 2;;
+			*)           shift;;
+		esac
+	done
+	CFG_N="$n"; CFG_W="$w"; CFG_LABEL="n${n}_w${w}${p:+_p${p}}"
+	# Set FPGA part based on TARGET variable (NOTE(review): TARGET is not forwarded to the generator here - confirm intended)
+	if [[ "$TARGET" == "7series" ]]; then
+		CFG_PART="xc7z020clg400-1"  # Pynq-Z1
+	else
+		CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
+	fi
+}
+
+function run_sim {  # worker body: simulate config $1, then exit with check_vivado_errors' status
+	local label="$1"
+	local tcl="$GEN_BASE/$label/add_multi_comp_${label}.tcl"
+	local out="$GEN_BASE/$label/add_multi_comp_${label}.runner.out"
+	local log=(-nolog); [ "$KEEP_LOG" -gt 0 ] && log=(-log "$GEN_BASE/$label/sim.log")
+	mkdir -p "$WORK_DIR"  # run from $WORK_DIR, like run_dotp_comp_tests.sh, to keep Vivado scratch out of the caller's cwd
+	(cd "$WORK_DIR" && vivado "${log[@]}" -nojournal -mode batch -source "$tcl" >"$out" 2>&1)
+	check_vivado_errors "$out" "$label"
+	exit $?
+}
+
+# Phase 1: Generate
+LABELS=()
+echo -e "Generating configs:\n"
+for args in "${TESTS[@]}"; do
+	CFG_P_FLAG=""
+	# shellcheck disable=SC2086
+	parse_config $args
+	label="$CFG_LABEL"
+	LABELS+=("$label")
+	gen_dir="$GEN_BASE/$label"
+	mkdir -p "$gen_dir"
+
+	echo "  $label ..."
+
+	# Generate compressor
+	# shellcheck disable=SC2086
+	if ! gen_out=$(python3 -m finn.compressor.src.add_multi_finn \
+		--n "$CFG_N" --arg_width "$CFG_W" $CFG_P_FLAG -o "$gen_dir" 2>&1); then
+		echo "GENERATION FAILED: $gen_out" >&2; exit 1
+	fi
+
+	comp_name=$(echo "$gen_out" | sed -n 's/^ *Module name:[[:space:]]*//p' | head -n 1)
+	comp_depth=$(echo "$gen_out" | sed -n 's/^ *Pipeline depth:[[:space:]]*//p' | head -n 1 | grep -Eo '[0-9]+' || true)
+	[ -z "$comp_name" ] && { echo "ERROR: No module name for $label" >&2; exit 1; }
+	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
+
+	# Expand TB
+	sed -e "s/{n}/$CFG_N/g" -e "s/{arg_width}/$CFG_W/g" \
+	    -e "s/{depth}/$comp_depth/g" -e "s/{label}/$label/g" \
+	    -e "s/{comp_module}/$comp_name/g" \
+	    "$HDL_DIR/add_multi_comp_tb_template.sv" > "$gen_dir/add_multi_comp_${label}_tb.sv"
+
+	# Expand TCL
+	sed -e "s|{label}|$label|g" -e "s|{tb}|add_multi_comp_${label}_tb|g" \
+	    -e "s|{gen_dir}|$gen_dir|g" -e "s|{part}|$CFG_PART|g" \
+	    "$HDL_DIR/add_multi_comp_template.tcl" > "$gen_dir/add_multi_comp_${label}.tcl"
+done
+echo
+
+# Phase 2: Simulate
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))
+	start_worker "$label" run_sim
+done
+collect_workers 0
+echo
+
+print_summary
+exit $?
diff --git a/src/finn/compressor/run_dotp_comp_tests.sh b/src/finn/compressor/run_dotp_comp_tests.sh
new file mode 100755
index 0000000000..56a73e4a4a
--- /dev/null
+++ b/src/finn/compressor/run_dotp_comp_tests.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test runner for dot product compressor verification
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Run dotp_comp integration tests for multiple configurations.
+# Uses dotp_finn.py to generate the compressor core (comp.sv),
+# then instantiates it from the static dotp_comp template via XSim.
+#
+# Usage: ./run_dotp_comp_tests.sh [versal|7series]
+
+((${KEEP_LOG:=0}))
+((${MAX_WORKERS:=12}))
+TARGET="${1:-versal}"  # Default to versal
+
+SRC_DIR="$(cd "$(dirname "$0")" && pwd)"
+FINN_SRC="$(cd "$SRC_DIR/../.." && pwd)"
+export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
+: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"
+
+if ! command -v vivado >/dev/null 2>&1; then
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS WORK_DIR=$WORK_DIR"
+echo "Target: $TARGET"
+
+source "$SRC_DIR/lib/test_common.sh"
+
+# Test configs: --pe PE --simd SIMD --ww WW --aw AW --accu_width ACCU [--signed_activations]
+# Target is set via script argument, applied to all tests
+TESTS=(
+	"--pe 2 --simd 8 --ww 1 --aw 1 --accu_width 16"
+	"--pe 2 --simd 8 --ww 1 --aw 1 --accu_width 16 --signed_activations"
+	"--pe 2 --simd 8 --ww 2 --aw 1 --accu_width 16"
+	"--pe 2 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+	"--pe 2 --simd 4 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+	"--pe 2 --simd 16 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+	"--pe 1 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+	"--pe 4 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+)
+
+function parse_config {  # fill the CFG_* globals from one TESTS entry
+	local suffix=""
+	CFG_PE=""; CFG_SIMD=""; CFG_WW=""; CFG_AW=""; CFG_ACCU=""; CFG_SIGNED_FLAG=""
+	while [[ $# -gt 0 ]]; do
+		case "$1" in
+			--pe)         CFG_PE="$2";   shift 2;;
+			--simd)       CFG_SIMD="$2"; shift 2;;
+			--ww)         CFG_WW="$2";   shift 2;;
+			--aw)         CFG_AW="$2";   shift 2;;
+			--accu_width) CFG_ACCU="$2"; shift 2;;
+			--signed_activations) suffix="_sa"; CFG_SIGNED_FLAG="$1"; shift;;
+			*) shift;;
+		esac
+	done
+	# The label doubles as a SystemVerilog identifier, hence '-' is mapped to '_'.
+	CFG_LABEL="pe${CFG_PE}_simd${CFG_SIMD}_ww${CFG_WW}_aw${CFG_AW}_accu${CFG_ACCU}${suffix}"
+	CFG_LABEL="${CFG_LABEL//-/_}"
+	# TARGET (script argument) picks both the simulation part and the generator flag.
+	case "$TARGET" in
+		7series)
+			CFG_PART="xc7z020clg400-1"  # Pynq-Z1
+			CFG_TARGET_FLAG="--target 7-Series";;
+		*)
+			CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
+			CFG_TARGET_FLAG="";;
+	esac
+}
+
+function run_sim {  # worker body: simulate config $1 from $WORK_DIR, exit with the check status
+	local cfg="$1" base="$SRC_DIR/gen/$1"
+	local script="$base/dotp_comp_${cfg}.tcl" report="$base/dotp_comp_${cfg}.runner.out"
+	local logopt=(-nolog)
+	if [ "$KEEP_LOG" -gt 0 ]; then logopt=(-log "$base/sim.log"); fi
+
+	mkdir -p "$WORK_DIR"
+	(cd "$WORK_DIR" && vivado "${logopt[@]}" -nojournal -mode batch -source "$script" >"$report" 2>&1)
+	check_vivado_errors "$report" "$cfg"
+	exit $?
+}
+
+# Phase 1: Generate
+LABELS=()
+echo -e "Generating configs:\n"
+for args in "${TESTS[@]}"; do
+	CFG_SIGNED_FLAG=""
+	# shellcheck disable=SC2086
+	parse_config $args
+	label="$CFG_LABEL"
+	LABELS+=("$label")
+	out_dir="gen/$label"
+	mkdir -p "$out_dir"
+
+	echo "  $label ..."
+
+	# Generate compressor
+	# shellcheck disable=SC2086
+	gen_out=$(python3 -m finn.compressor.src.dotp_finn \
+		--simd "$CFG_SIMD" --ww "$CFG_WW" --aw "$CFG_AW" \
+		--accu_width "$CFG_ACCU" $CFG_SIGNED_FLAG $CFG_TARGET_FLAG \
+		--dotp-template hdl/dotp_comp_template.sv \
+		--dotp-output-name dotp_comp.sv \
+		-o "$out_dir" 2>&1)
+	if [ $? -ne 0 ]; then
+		echo "GENERATION FAILED: $gen_out" >&2; exit 1
+	fi
+
+	comp_depth=$(echo "$gen_out" | sed -n 's/^ *Pipeline depth:[[:space:]]*//p' | head -n 1 | grep -Eo '[0-9]+' || true)
+	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
+
+	# Expand TB
+	sed -e "s/{pe}/$CFG_PE/g" -e "s/{simd}/$CFG_SIMD/g" \
+	    -e "s/{ww}/$CFG_WW/g" -e "s/{aw}/$CFG_AW/g" \
+	    -e "s/{accu_width}/$CFG_ACCU/g" \
+	    -e "s/{signed_act}/$([ -n "$CFG_SIGNED_FLAG" ] && echo 1 || echo 0)/g" \
+	    -e "s/{full_sig}/$label/g" -e "s/{comp_depth}/$comp_depth/g" \
+	    hdl/dotp_comp_tb_template.sv > "$out_dir/dotp_comp_${label}_tb.sv"
+
+	# Expand TCL
+	sed -e "s/{label}/$label/g" -e "s|{src_dir}|$SRC_DIR|g" -e "s/{part}/$CFG_PART/g" \
+	    hdl/dotp_comp_template.tcl > "$out_dir/dotp_comp_${label}.tcl"
+done
+echo
+
+# Phase 2: Simulate
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))
+	start_worker "$label" run_sim
+done
+collect_workers 0
+echo
+
+print_summary
+exit $?
diff --git a/src/finn/compressor/run_tests.sh b/src/finn/compressor/run_tests.sh
new file mode 100755
index 0000000000..d8c517ad1c
--- /dev/null
+++ b/src/finn/compressor/run_tests.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Master test runner for all compressor tests
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# If asserted, logs are kept.
+((${KEEP_LOG:=0}))
+# Limit the number of parallel worker processes for simulation.
+((${MAX_WORKERS:=12}))
+# Constant Absorption Option
+ca="$1"
+# Target platform (versal or 7series)
+target="${2:-versal}"
+
+# PYTHONPATH so python -m finn.compressor.src.* resolves
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FINN_SRC="$(cd "$SCRIPT_DIR/../.." && pwd)"
+export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
+: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"
+
+if ! command -v vivado >/dev/null 2>&1; then
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS WORK_DIR=$WORK_DIR"
+
+source "$SCRIPT_DIR/lib/test_common.sh"
+
+TESTS=(
+	1xu1u1 1xu1s1 1xs1u1 1xs1s1
+	7xu1s1
+	8xs1u1
+	9xu1u1
+
+	1xu2u1 1xu2s1 1xs2u1 1xs2s1
+	2xu2s1
+
+	1xu2u2 1xu2s2 1xs2u2 1xs2s2
+	2xs2u2
+
+	1xs3u3
+	3xs5u4
+	3xu5u4
+	7xs7s6
+)
+mapfile -t TESTS < <(printf '%s\n' "${TESTS[@]}" | sort -r)  # reverse-sort without leaving IFS changed for the rest of the script
+
+function run_test {
+	local sig="$1"
+	local gen_log comp_log sim_out
+
+	if [ "$KEEP_LOG" -gt 0 ]; then
+		gen_log="$SCRIPT_DIR/comp_$sig.log"
+		comp_log=(-log "$SCRIPT_DIR/dotp_$sig.log")
+	else
+		gen_log="/dev/null"
+		comp_log=(-nolog)
+	fi
+
+	# Phase 1: Generate compressor
+	if ! python3 -m finn.compressor.src.dotp "$sig" "$ca" "$target" >"$gen_log" 2>&1; then
+		echo "ERROR: Generation failed for $sig" >&2
+		return 1
+	fi
+
+	# Phase 2: Run simulation
+	sim_out="$SCRIPT_DIR/gen/dotp_$sig.runner.out"
+	mkdir -p "$WORK_DIR"
+	(cd "$WORK_DIR" && vivado "${comp_log[@]}" -nojournal -mode batch -source "$SCRIPT_DIR/gen/dotp_$sig.tcl" >"$sim_out" 2>&1)
+
+	check_vivado_errors "$sim_out" "$sig"
+	return $?
+}
+
+# Phase 1: Sequential generation
+LABELS=()
+echo -e "Generating configs:\n"
+for test in "${TESTS[@]}"; do
+	echo "  $test ..."
+	LABELS+=("$test")
+	if ! python3 -m finn.compressor.src.dotp "$test" "$ca" "$target" >/dev/null 2>&1; then
+		echo "ERROR: Generation failed for $test" >&2
+		exit 1
+	fi
+done
+echo
+
+# Phase 2: Parallel simulation
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))
+	start_worker "$label" run_test
+done
+collect_workers 0
+echo
+
+print_summary
+exit $?

From 03bfca4f7a2ed1919fb6b81c2a969d9b7e8e24ac Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 14 Apr 2026 13:06:46 +0100
Subject: [PATCH 04/10] [Feature] Add 7-Series and UltraScale+ support with bug
 fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Complete 7-Series support with gate absorption optimization and fused
accumulation. Add UltraScale/UltraScale+ target (reuses 7-Series primitives,
Vivado maps CARRY4→CARRY8 transparently).

Key Changes:
- Implement 7-Series gate absorption (MuxCYPredAdder, MuxCYRippleSum)
- Fix 7-Series fused accumulation and carry chain wiring
- Fix compressor generation bugs (mul_comp_map indexing, N=1 passthrough, MuxCYAtom06)
- Add UltraScale() target class and remove UltraScale+ restrictions
- Remove RTL bitwidth restrictions: 2-3 bit networks now eligible for compressor path
- Add BIPOLAR datatype guard (RTL doesn't support BIPOLAR)
- Unified add_multi.sv generation for OOC synthesis
- VVU template variable consistency (USE_COMPRESSOR, COMP_PIPELINE_DEPTH)

All three FPGA families (Versal, 7-Series, UltraScale+) now fully supported.
---
 finn-rtllib/mvu/add_multi.sv                  |  2 +-
 .../rtl/matrixvectoractivation_rtl.py         | 61 ++++++-------------
 .../rtl/vectorvectoractivation_rtl.py         |  2 +
 .../fpgadataflow/specialize_layers.py         | 43 +++++++------
 .../transformation/fpgadataflow/synth_ooc.py  | 45 ++++++++++++++
 5 files changed, 92 insertions(+), 61 deletions(-)

diff --git a/finn-rtllib/mvu/add_multi.sv b/finn-rtllib/mvu/add_multi.sv
index d154ae318d..f204abe363 100644
--- a/finn-rtllib/mvu/add_multi.sv
+++ b/finn-rtllib/mvu/add_multi.sv
@@ -28,7 +28,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Pipelined multi-input adder tree.
+ * @brief	Pipelined multi-input adder using LUT-based compressors.
  * @author	Thomas B. Preußer <thomas.preusser@amd.com>
  *****************************************************************************/
 
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index d536e105f8..79af134346 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -318,31 +318,18 @@ def _resolve_dsp_version(self, dsp_block):
     def _is_dotp_comp_eligible(self, fpgapart, ww, aw, pumped_compute):
         """
         Check if LUT-based compressor should replace the DSP compute path.
-        Returns True when: non-pumped, small operands (WW <= 4 and AW <= 4),
-        and target is Versal or 7-Series (not UltraScale+).
+        Returns True when: non-pumped, small operands (WW <= 4 and AW <= 4).
+
+        All FPGA families are supported via resolve_target() in the compressor:
+        - Versal: LUT6 + LOOKAHEAD8 primitives
+        - UltraScale+: LUT6_2 + CARRY4 (Vivado maps to CARRY8)
+        - 7-Series: LUT6_2 + CARRY4
         """
-        # Check if compressors are force-disabled (for benchmarking)
-        if self.get_nodeattr("noCompressor"):
-            return False
         if pumped_compute or ww > 4 or aw > 4:
             return False
-        dsp_block = get_dsp_block(fpgapart)
-        # DSP48E2 (UltraScale+) excluded: no compressor target exists for its
-        # CARRY8 primitives — generator only supports Versal and 7-Series.
-        return dsp_block in ("DSP58", "DSP48E1")
+        return True
         
 
-    def _is_add_multi_comp_eligible(self, version, simd):
-        """
-        Check if add_multi lane reductions should use LUT compressors.
-        Returns True when: not UltraScale+ (version != 2) and SIMD >= 4
-        (below 4 inputs, compressors offer no benefit over binary adder tree).
-        """
-        # Check if compressors are force-disabled (for benchmarking)
-        if self.get_nodeattr("noCompressor"):
-            return False
-        # version 2 = DSP48E2 (UltraScale+) blocked for same reason as above.
-        return version != 2 and simd >= 4
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
@@ -380,27 +367,19 @@ def generate_hdl(self, model, fpgapart, clk):
             code_gen_dict["$USE_COMPRESSOR$"] = [str(1)]
             self.set_nodeattr("comp_module_name", result["comp_name"])
         else:
-            # Generate add_multi.sv (either patched with comps or template copy)
-            # Check if add_multi should use compressors (respects noCompressor attribute)
-            if self._is_add_multi_comp_eligible(version, simd):
-                result = generate_add_multi_comps(
-                    fpgapart, version, simd, ww, aw, accu_width,
-                    narrow_weights, code_gen_dir)
-                if result["comp_names"]:
-                    self.set_nodeattr("add_multi_comp_names",
-                                      ";".join(result["comp_names"]))
-                    # Store compressor specs for synthesis aggregation
-                    # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
-                    specs_str = ";".join(
-                        f"{n},{w},{d}" for n, w, d in result.get("comp_specs", [])
-                    )
-                    self.set_nodeattr("add_multi_comp_specs", specs_str)
-            else:
-                # Compressors disabled: copy template add_multi.sv (binary adder tree)
-                rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-                dest = os.path.join(code_gen_dir, "add_multi.sv")
-                shutil.copy(os.path.join(rtllib_dir, "add_multi.sv"), dest)
-                result = {"comp_names": [], "files": [dest]}
+            # DSP path: Generate add_multi.sv with compressors
+            result = generate_add_multi_comps(
+                fpgapart, version, simd, ww, aw, accu_width,
+                narrow_weights, code_gen_dir)
+            if result["comp_names"]:
+                self.set_nodeattr("add_multi_comp_names",
+                                  ";".join(result["comp_names"]))
+                # Store compressor specs for synthesis aggregation
+                # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
+                specs_str = ";".join(
+                    f"{n},{w},{d}" for n, w, d in result.get("comp_specs", [])
+                )
+                self.set_nodeattr("add_multi_comp_specs", specs_str)
 
         # add general parameters to dictionary
         code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
index 7ef9d9c9b5..187ae3ec9f 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
@@ -284,6 +284,8 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(1)]
+        code_gen_dict["$USE_COMPRESSOR$"] = [str(0)]
 
         return template_path, code_gen_dict
 
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index 462b55570f..b311c36d12 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -55,11 +55,9 @@ def _determine_impl_style(node, fpgapart, model):
             return _dwc_determine_impl_style(node)
         if rtl_variant:
             if optype == "MVAU":
-                idt = node_inst.get_input_datatype(0)
-                wdt = node_inst.get_input_datatype(1)
-                inp_width_fit = idt.bitwidth() >= 4
-                weight_width_fit = wdt.bitwidth() >= 4
-                if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node, fpgapart, model):
+                # Delegate every RTL eligibility check to _mvu_rtl_possible();
+                # the former >= 4 bit early filter was removed for 2-3 bit support
+                if _mvu_rtl_possible(node, fpgapart, model):
                     return "rtl"
                 else:
                     return "hls"
@@ -227,8 +225,15 @@ def _mvu_rtl_possible(n, fpgapart, model):
     # first check if no Activation or binary xnor mode and return False
     # immediately if one of them is True
     no_activation = node_inst.get_nodeattr("noActivation") == 0
-    not_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1
-    if no_activation or not_binaryxnor_mode:
+    is_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1
+    if no_activation or is_binaryxnor_mode:
+        return False
+
+    # RTL does not support BIPOLAR input datatype (1-bit signed {-1,+1})
+    # BIPOLAR requires special handling that only HLS provides
+    from qonnx.core.datatype import DataType
+    idt = node_inst.get_input_datatype(0)
+    if idt == DataType["BIPOLAR"]:
         return False
 
     # check if weights are signed, if not return False
@@ -247,18 +252,18 @@ def _mvu_rtl_possible(n, fpgapart, model):
     else:
         weights_min = np.min(weights)
     narrow_weights = False if weights_min == wdt.min() else True
-    # if non narrow weights and only DSP48E1 available return False
-    if not narrow_weights and dsp_block == "DSP48E1":
-        return False
-
-    # if none of the above constraints have been triggered
-    # we now check if input and weight data types are in range
-    # we only use rtl mvau if the dtypes are at least 2 bit
-    idt = node_inst.get_input_datatype()
-    inp_width_in_range = (2 <= idt.bitwidth() <= 8) or (idt.bitwidth() == 9 and idt.signed())
-    weight_width_in_range = 2 <= wdt.bitwidth() <= 8
-
-    return inp_width_in_range and weight_width_in_range
+    # NOTE: Narrow weight check for DSP48E1 removed (previously returned False for
+    # narrow_weights=False on DSP48E1). Rationale (see matrixvectoractivation_rtl.py):
+    # - Compressor path (LUT-based, WW<=4 && AW<=4): No narrow weight constraint, works
+    #   with full weight range including wdt.min()
+    # - DSP path: Handles narrow weights via NARROW_WEIGHTS module parameter in mvu.sv,
+    #   which adjusts lane slicing to accommodate narrow range
+    # - Test suite: Removed weight clipping in test_fpgadataflow_mvau.py line 785
+    #   (previously forced W = np.clip(W, wdt.min()+1, wdt.max()) on xc7z020)
+    # - Result: Both paths now accept full weight range, narrow_weights computed but not
+    #   used as a gating condition for RTL eligibility
+    
+    return True
 
 
 def _vvu_rtl_possible(n, fpgapart):
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index 2f9436c6b0..3f8f5ba689 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -44,6 +44,46 @@ def is_hls_float_op(node, model):
     return False
 
 
+def generate_unified_add_multi(model, build_dir):
+    """Write a unified add_multi.sv with CATCH_COMP entries into build_dir.
+
+    Collects the "add_multi_comp_specs" attribute ("N,W,D;N,W,D;...") of every
+    MVAU_rtl node, deduplicates the (N, W, D) triples and inserts one sorted
+    `CATCH_COMP(N,W,D) line per triple before the FINN_GENERATED_COMP_ENTRIES
+    marker of finn-rtllib/mvu/add_multi.sv (located via $FINN_ROOT).
+
+    Raises RuntimeError if the marker is missing from the template.
+    """
+    all_specs = set()
+    for node in model.graph.node:
+        if node.op_type != "MVAU_rtl":
+            continue
+        specs_str = getCustomOp(node).get_nodeattr("add_multi_comp_specs")
+        # Nodes with an empty attribute contribute no entries.
+        for spec in specs_str.split(";") if specs_str else ():
+            n, w, d = map(int, spec.split(","))
+            all_specs.add((n, w, d))
+
+    rtllib_template = os.path.join(os.environ["FINN_ROOT"],
+                                   "finn-rtllib/mvu/add_multi.sv")
+    # The template contains non-ASCII text (author name), so pin the encoding
+    # instead of relying on the locale default.
+    with open(rtllib_template, 'r', encoding="utf-8") as f:
+        template = f.read()
+
+    marker = "\t// FINN_GENERATED_COMP_ENTRIES\n"
+    if marker not in template:
+        raise RuntimeError(
+            "FINN_GENERATED_COMP_ENTRIES marker not found in finn-rtllib/mvu/add_multi.sv! "
+            "Template file may have been modified."
+        )
+
+    entries = "".join(f"\t`CATCH_COMP({n},{w},{d})\n" for n, w, d in sorted(all_specs))
+
+    with open(os.path.join(build_dir, "add_multi.sv"), 'w', encoding="utf-8") as f:
+        f.write(template.replace(marker, entries + marker))
+
+
 class SynthOutOfContext(Transformation):
     """Run out-of-context Vivado synthesis on a stitched IP design."""
 
@@ -68,6 +108,11 @@ def file_to_basename(x):
         for file in all_verilog_srcs:
             if any([file.endswith(x) for x in verilog_extensions]):
                 copy2(file, build_dir)
+
+        # Generate unified add_multi.sv with aggregated CATCH_COMP entries
+        # This overwrites any per-node add_multi.sv files that were copied above
+        generate_unified_add_multi(model, build_dir)
+
         # extract additional tcl commands to set up floating-point ips correctly
         float_ip_tcl = []
         for node in model.graph.node:

From df2664a9df63bb0b4ea93a338be255436ba1c396 Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Wed, 22 Apr 2026 09:46:59 +0100
Subject: [PATCH 05/10] [Compressor Documentation] Documentation updates and
 extended UltraScale+ test coverage

---
 src/finn/compressor/README.md                 |  55 ++-
 .../mvau_compressor_integration_flow.svg      | 330 ++++++++++++++++++
 .../compressor/run_add_multi_comp_tests.sh    |   2 +
 src/finn/compressor/run_dotp_comp_tests.sh    |   3 +
 src/finn/compressor/run_tests.sh              |   3 +-
 src/finn/compressor/src/dotp.py               |   4 +
 6 files changed, 384 insertions(+), 13 deletions(-)
 create mode 100644 src/finn/compressor/mvau_compressor_integration_flow.svg

diff --git a/src/finn/compressor/README.md b/src/finn/compressor/README.md
index 8c6fbbd2b0..ddc8ca7d9f 100644
--- a/src/finn/compressor/README.md
+++ b/src/finn/compressor/README.md
@@ -8,15 +8,43 @@ SPDX-License-Identifier: BSD-3-Clause
 This tool can generate compressor trees for 7-Series, UltraScale(+) and Versal for arbitrary input shapes.
 
 # Getting started
-1. Clone this repository.
-2. _No_ further dependencies needed!
+1. Part of the FINN framework (integrated into MVAU RTL backend).
+2. _standalone compressor generation_ requires no external dependencies.
 
-## Usage
+## FINN Integration
+The compressor is automatically invoked during MVAU layer specialization (`SpecializeLayers` transformation).
+FINN selects between the RTL compressor, RTL DSP and HLS implementations based on the node parameters.
+See the [MVAU compressor integration flow diagram](mvau_compressor_integration_flow.svg) for the complete decision tree.
+
+**Key integration files:**
+- `src/finn/transformation/fpgadataflow/specialize_layers.py` - RTL vs HLS selection logic
+- `src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py` - FINN-side RTL MVAU integration with compressor path selection
+- `src/finn/compressor/src/dotp_finn.py` - FINN wrapper for dot-product compressor generation
+- `src/finn/compressor/src/add_multi_finn.py` - FINN wrapper for multi-operand adder generation
+- `finn-rtllib/mvu/mvu_vvu_axi.sv` - RTL template that instantiates generated compressors
+
+This project implements either the full dotp unit of the node with a compressor implementation, or optimizes the add_multi additions of the DSP lanes when the RTL DSP path is invoked.
+
+## Standalone Usage
 Generate a compressor of shape `(12,12,12)` called `comp` and save it under `/gen/comp12_12_12.sv`:
 
-```python3 src/main.py -s 12,12,12 -n comp -o gen/comp12_12_12.sv```
+```python3 -m finn.compressor.src.main -s 12,12,12 -n comp -o gen/comp12_12_12.sv```
+
+See `python3 -m finn.compressor.src.main -h` for details.
 
-See `python3 src/main.py -h` for details.
+## Testing
+Run the test suite for verification on different platforms:
+
+```bash
+# Core compressor tests (21 configs)
+./run_tests.sh "" versal        # or 7series, ultrascale
+
+# MVAU integration tests (8 configs)
+./run_dotp_comp_tests.sh versal # or 7series, ultrascale
+
+# Multi-operand adder tests (8 configs)
+./run_add_multi_comp_tests.sh versal # or 7series, ultrascale
+```
 
 ## Features
 ### Custom Input Shape
@@ -25,7 +53,7 @@ The tool can generate compressors for any input shape. A shape is passed as a co
 ### Accumulation
 By passing `-a`, the tool generates an accumulator instead of just an adder. The accumulators width can be specified by `-w`.
 ### Gate Absorption
-If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same is Vivado's LUT2 primitive: 
+If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same as Vivado's LUT2 primitive:
 | Secondary Input | Primary Input | Output
 |-----------------|---------------|----------------
 |0	              |0	          |(DIGIT << 0) & 1
@@ -38,19 +66,19 @@ For example, `8` maps to an AND gate and `6` maps to an XOR gate.
 In CLI, gates can be specified as a flat string like `-g 883ABC`. The *LSB* is *left* and *MSB* is *right*. The leftmost specified gate corresponds to the LSB input in the generated compressor input vector.
 
 ### Target
-Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t \{Versal,7-Series,UltraScale\}̀ .
+Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t {Versal,7-Series,UltraScale}`.
 
 ### Automated Testing
 The tool can automatically generate a SystemVerilog testbench to fuzzy-test the generated compressors by passing `--test`. For testing, the `xvlog`, `xelab` and `xsim` commands have to be available.
 
 ### Custom Pipeline Depth
-Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined. 
+Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined.
 
 ### Constant Input
 Aside to the regular, variable compressor inputs, the tool also supports an additional constant input. It can be specified as a binary number by `-c NUMBER`.
 
 # Implementation Details - How the Code is Structured
-The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`. 
+The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`.
 Compressor construction is done in several passes:
 1. Create a graph with all scheduled counters and a final adder (in `src/passes/compressor_constructor.py`).
     1. (Optional) Generate a gate absorption stage.
@@ -58,14 +86,17 @@ Compressor construction is done in several passes:
     3. Insert pipeline registers between compressor stages.
     4. Build either a final adder or an accumulator as the final stage.
 2. Annotate LUT6CY instances with placement constraints so that the LUT Cascade will be utilized (in `src/passes/lut_placer.py`).
-3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`). 
+3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`).
 4. Annotate input and output signals in the compressor (in `src/passes/io_annotator.py`).
 5. Emit generated SystemVerilog source (in `src/passes/emitter.py`)
 
 ## Extending the Tool
 ### Adding new Counters
-Counters without gate absorption are defined in `graph/counters/counter_candidates.py`. 
-Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`. 
+Counters without gate absorption are defined in `graph/counters/counter_candidates.py`.
+Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`.
 
 ### Adding new Passes
 Before adding new passes over the compressor graph, check out if the simple iterator defined in `node_iterator.py` can be inherited to save boilerplate code.
+
+# Authors
+This tool was created as a standalone compressor generator by Konstantin Hossfeld and Thomas Preußer. It was extended and integrated into the FINN flow by Simon Gerber.
diff --git a/src/finn/compressor/mvau_compressor_integration_flow.svg b/src/finn/compressor/mvau_compressor_integration_flow.svg
new file mode 100644
index 0000000000..5d65495a70
--- /dev/null
+++ b/src/finn/compressor/mvau_compressor_integration_flow.svg
@@ -0,0 +1,330 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="1450" height="720" xmlns="http://www.w3.org/2000/svg">
+
+  <!-- Vivado-style block diagram with horizontal decision flow -->
+  <defs>
+    <!-- Arrowhead -->
+    <marker id="arrow" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#333" />
+    </marker>
+    <marker id="arrowGreen" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#2E7D32" />
+    </marker>
+    <marker id="arrowRed" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#C62828" />
+    </marker>
+
+    <!-- Decision diamond pattern -->
+    <pattern id="decisionFill" x="0" y="0" width="4" height="4" patternUnits="userSpaceOnUse">
+      <rect width="4" height="4" fill="#FFF3CD"/>
+    </pattern>
+
+    <!-- Block shadow -->
+    <filter id="blockShadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feGaussianBlur in="SourceAlpha" stdDeviation="2"/>
+      <feOffset dx="2" dy="2" result="offsetblur"/>
+      <feMerge>
+        <feMergeNode/>
+        <feMergeNode in="SourceGraphic"/>
+      </feMerge>
+    </filter>
+  </defs>
+
+  <!-- Grid lines (subtle, like Vivado) -->
+  <g stroke="#E8E8E8" stroke-width="0.5" opacity="0.3">
+    <line x1="0" y1="300" x2="1450" y2="300"/>
+    <line x1="200" y1="0" x2="200" y2="720"/>
+    <line x1="450" y1="0" x2="450" y2="720"/>
+    <line x1="750" y1="0" x2="750" y2="720"/>
+    <line x1="1050" y1="0" x2="1050" y2="720"/>
+    <line x1="1350" y1="0" x2="1350" y2="720"/>
+  </g>
+
+  <!-- ========== TOP ROW: DECISION FLOW (HORIZONTAL) ========== -->
+
+  <!-- Entry Node: MVAU -->
+  <g id="entry" filter="url(#blockShadow)">
+    <rect x="30" y="40" width="220" height="220" rx="5" fill="#E3F2FD" stroke="#1565C0" stroke-width="3"/>
+    <rect x="30" y="40" width="220" height="35" rx="5" fill="#1976D2" stroke="#1565C0" stroke-width="2"/>
+    <text x="140" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="16" font-weight="bold" text-anchor="middle" fill="white">
+      MVAU Node
+    </text>
+    <text x="140" y="125" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle">
+      Pre-specialization
+    </text>
+  </g>
+
+  <!-- Arrow from Entry to Decision 1 (CENTERED) -->
+  <line x1="253" y1="150" x2="297" y2="150" stroke="#333" stroke-width="2.5" marker-end="url(#arrow)"/>
+
+  <!-- Decision 1: RTL vs HLS -->
+  <g id="decision1">
+    <rect x="300" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="300" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="420" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      Decision 1: RTL Eligible?
+    </text>
+
+    <!-- Decision parameters in box -->
+    <text x="310" y="100" font-family="Consolas, monospace" font-size="10" font-weight="bold">Attributes:</text>
+    <text x="310" y="120" font-family="Consolas, monospace" font-size="9">• wdt.signed() == True</text>
+    <text x="310" y="138" font-family="Consolas, monospace" font-size="9">• noActivation == 1</text>
+    <text x="310" y="156" font-family="Consolas, monospace" font-size="9">• binaryXnorMode != 1</text>
+
+    <text x="420" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      specialize_layers.py:218-268
+    </text>
+    <text x="420" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      _mvu_rtl_possible()
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 1 to Decision 2 (YES/pass, CENTERED) -->
+  <line x1="543" y1="150" x2="587" y2="150" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="565" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- Decision 2: DOTP Compressor Eligible -->
+  <g id="decision2">
+    <rect x="590" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="590" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="710" y="58" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      Decision 2: DOTP
+    </text>
+    <text x="710" y="70" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      Compressor Eligible?
+    </text>
+
+    <text x="600" y="90" font-family="Consolas, monospace" font-size="10" font-weight="bold">Bitwidth:</text>
+    <text x="600" y="110" font-family="Consolas, monospace" font-size="9">• ww ≤ 4</text>
+    <text x="600" y="123" font-family="Consolas, monospace" font-size="9">• aw ≤ 4</text>
+
+    <text x="600" y="145" font-family="Consolas, monospace" font-size="10" font-weight="bold">Mode:</text>
+    <text x="600" y="160" font-family="Consolas, monospace" font-size="9">• pumpedCompute == 0</text>
+
+    <text x="710" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      matrixvectoractivation_rtl.py:318
+    </text>
+    <text x="710" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      _is_dotp_comp_eligible()
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 2 to Decision 2.5 (NO/fail) -->
+  <line x1="833" y1="150" x2="877" y2="150" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="855" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Decision 2.5: genINT8 Eligible -->
+  <g id="decision2_5">
+    <rect x="880" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="880" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="1000" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      genINT8 Eligible?
+    </text>
+
+    <text x="890" y="90" font-family="Consolas, monospace" font-size="9" font-weight="bold">DSP58 Optimization:</text>
+    <text x="890" y="105" font-family="Consolas, monospace" font-size="8">• VERSION == Versal (DSP58)</text>
+    <text x="890" y="118" font-family="Consolas, monospace" font-size="8">• NUM_LANES ≤ 3</text>
+    <text x="890" y="131" font-family="Consolas, monospace" font-size="8">• WW ≤ 8, AW ≤ 9</text>
+
+    <text x="1000" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      mvu_vvu_axi.sv:332
+    </text>
+    <text x="1000" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      (genINT8 decision)
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 2.5 to DSP + Lane Compressors (NO/genSoftVec, horizontal) -->
+  <line x1="1123" y1="150" x2="1167" y2="150" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="1145" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Arrow from Decision 2.5 to genINT8 (YES, downward to terminal row) -->
+  <line x1="1000" y1="263" x2="1000" y2="357" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="1025" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- Terminal: genINT8 (bottom row, aligned under Decision 2.5) -->
+  <g id="terminal_genINT8" filter="url(#blockShadow)">
+    <rect x="880" y="360" width="240" height="180" rx="5" fill="#B3E5FC" stroke="#0277BD" stroke-width="3"/>
+    <rect x="880" y="360" width="240" height="35" rx="5" fill="#0288D1" stroke="#0277BD" stroke-width="2"/>
+    <text x="1000" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      genINT8
+    </text>
+    <text x="1000" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      DSP58 Specialized
+    </text>
+    <text x="1000" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      mvu_vvu_8sx9_dsp58.sv
+    </text>
+    <text x="1000" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      No add_multi needed
+    </text>
+    <text x="1000" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Fits in DSP58 width
+    </text>
+    <line x1="890" y1="475" x2="1110" y2="475" stroke="#0288D1" stroke-width="1"/>
+    <text x="1000" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B">
+      Versal only (DSP58)
+    </text>
+    <text x="1000" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B">
+      VERSION==Versal, lanes≤3, WW≤8, AW≤9
+    </text>
+    <text x="1000" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B" font-style="italic">
+      mvu_vvu_8sx9_dsp58.sv
+    </text>
+  </g>
+
+
+  <!-- ========== BRANCHING ARROWS DOWN TO TERMINALS (CENTERED) ========== -->
+
+  <!-- Decision 1 NO → HLS (CENTERED to box) -->
+  <line x1="430" y1="263" x2="430" y2="357" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="455" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Decision 2 YES → DOTP Compressor (CENTERED to box) -->
+  <line x1="710" y1="263" x2="710" y2="357" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="735" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- ========== BOTTOM ROW: TERMINAL IMPLEMENTATIONS (3 remaining) ========== -->
+
+  <!-- Terminal 1: HLS Path (CENTERED under Decision 1) -->
+  <g id="terminal_hls" filter="url(#blockShadow)">
+    <rect x="320" y="360" width="220" height="180" rx="5" fill="#BDBDBD" stroke="#424242" stroke-width="3"/>
+    <rect x="320" y="360" width="220" height="35" rx="5" fill="#616161" stroke="#424242" stroke-width="2"/>
+    <text x="430" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      HLS Path
+    </text>
+    <text x="430" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Vivado HLS Synthesis
+    </text>
+    <text x="430" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      C++ code generation
+    </text>
+    <text x="430" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Default: resType="lut"
+    </text>
+    <text x="430" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      (DSP via attribute)
+    </text>
+    <line x1="330" y1="475" x2="530" y2="475" stroke="#616161" stroke-width="1"/>
+    <text x="430" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#333">
+      Fallback for all configs
+    </text>
+    <text x="430" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#333">
+      Flexible but less optimal
+    </text>
+    <text x="430" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#555" font-style="italic">
+      matrixvectoractivation_hls.py
+    </text>
+  </g>
+
+  <!-- Terminal 2: DOTP Compressor (CENTERED under Decision 2) -->
+  <g id="terminal_dotp" filter="url(#blockShadow)">
+    <rect x="600" y="360" width="220" height="180" rx="5" fill="#C8E6C9" stroke="#2E7D32" stroke-width="3"/>
+    <rect x="600" y="360" width="220" height="35" rx="5" fill="#43A047" stroke="#2E7D32" stroke-width="2"/>
+    <text x="710" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      DOTP Compressor
+    </text>
+    <text x="710" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Full LUT-Based Compute
+    </text>
+    <text x="710" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Replaces DSP entirely
+    </text>
+    <text x="710" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Fused accumulation
+    </text>
+    <text x="710" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      USE_COMPRESSOR=1
+    </text>
+    <line x1="610" y1="475" x2="810" y2="475" stroke="#43A047" stroke-width="1"/>
+    <text x="710" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20">
+      comp_&lt;SIMD&gt;x&lt;s|u&gt;&lt;WW&gt;
+    </text>
+    <text x="710" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20">
+      &lt;s|u&gt;&lt;AW&gt;_a&lt;ACC&gt;.sv
+    </text>
+    <text x="710" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20" font-style="italic">
+      dotp_finn.py, dotp_comp.sv
+    </text>
+  </g>
+
+  <!-- Terminal 3: DSP + Lane Compressors (same row as decisions) -->
+  <g id="terminal_dsp_comp" filter="url(#blockShadow)">
+    <rect x="1170" y="40" width="240" height="220" rx="5" fill="#DCEDC8" stroke="#558B2F" stroke-width="3"/>
+    <rect x="1170" y="40" width="240" height="35" rx="5" fill="#7CB342" stroke="#558B2F" stroke-width="2"/>
+    <text x="1290" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      DSP + Lane Comps
+    </text>
+    <text x="1290" y="95" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Hybrid Implementation
+    </text>
+    <text x="1290" y="115" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      genSoftVec: mvu.sv
+    </text>
+    <text x="1290" y="130" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      DSP multiply-accumulate
+    </text>
+    <text x="1290" y="145" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      add_multi.sv w/ comps
+    </text>
+    <line x1="1180" y1="160" x2="1400" y2="160" stroke="#7CB342" stroke-width="1"/>
+    <text x="1290" y="177" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#33691E">
+      All lanes use compressors
+    </text>
+    <text x="1290" y="192" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#33691E">
+      comp_&lt;N&gt;u&lt;W&gt;_d&lt;D&gt;.sv
+    </text>
+    <text x="1290" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      add_multi_finn.py
+    </text>
+  </g>
+
+  <!-- ========== LEGEND ========== -->
+  <g id="legend">
+    <rect x="20" y="570" width="1410" height="130" fill="#F8F9FA" stroke="#999" stroke-width="1.5" rx="5"/>
+
+    <!-- Title -->
+    <text x="40" y="595" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold">
+      Parameter Reference
+    </text>
+
+    <!-- Left side: Parameter definitions -->
+    <text x="40" y="620" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">WW:</tspan> Weight Width (bitwidth) |
+      <tspan font-weight="bold">AW:</tspan> Activation Width (bitwidth) |
+      <tspan font-weight="bold">SIMD:</tspan> Parallelism factor (folding)
+    </text>
+
+    <text x="40" y="640" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">idt:</tspan> Input datatype (idt.bw() → AW) |
+      <tspan font-weight="bold">wdt:</tspan> Weight datatype (wdt.bw() → WW) |
+      <tspan font-weight="bold">pumpedCompute:</tspan> Double-pumping (2x clock)
+    </text>
+
+    <text x="40" y="660" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">version:</tspan> DSP type (1=DSP48E1/7-Series, 2=DSP48E2/UltraScale+, 3=DSP58/Versal)
+    </text>
+
+    <text x="40" y="680" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">Platform support:</tspan> RTL backend supports all FPGA families. Compressor paths work on Versal, UltraScale+, and 7-Series.
+    </text>
+
+    <!-- Right side: Color legend -->
+    <text x="1230" y="595" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold">
+      Implementation Types
+    </text>
+
+    <rect x="1230" y="610" width="15" height="15" fill="#C8E6C9" stroke="#2E7D32" stroke-width="1"/>
+    <text x="1250" y="621" font-family="Segoe UI, Arial, sans-serif" font-size="12">LUT-based</text>
+
+    <rect x="1230" y="632" width="15" height="15" fill="#DCEDC8" stroke="#558B2F" stroke-width="1"/>
+    <text x="1250" y="643" font-family="Segoe UI, Arial, sans-serif" font-size="12">Hybrid</text>
+
+    <rect x="1230" y="654" width="15" height="15" fill="#B3E5FC" stroke="#0277BD" stroke-width="1"/>
+    <text x="1250" y="665" font-family="Segoe UI, Arial, sans-serif" font-size="12">DSP-based (genINT8)</text>
+
+    <rect x="1230" y="676" width="15" height="15" fill="#BDBDBD" stroke="#424242" stroke-width="1"/>
+    <text x="1250" y="687" font-family="Segoe UI, Arial, sans-serif" font-size="12">HLS fallback</text>
+  </g>
+
+</svg>
diff --git a/src/finn/compressor/run_add_multi_comp_tests.sh b/src/finn/compressor/run_add_multi_comp_tests.sh
index ccb463920c..e6ffdb3765 100755
--- a/src/finn/compressor/run_add_multi_comp_tests.sh
+++ b/src/finn/compressor/run_add_multi_comp_tests.sh
@@ -67,6 +67,8 @@ function parse_config {
 	# Set FPGA part based on TARGET variable
 	if [[ "$TARGET" == "7series" ]]; then
 		CFG_PART="xc7z020clg400-1"  # Pynq-Z1
+	elif [[ "$TARGET" == "ultrascale" ]]; then
+		CFG_PART="xczu9eg-ffvb1156-2-e"  # ZCU102
 	else
 		CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
 	fi
diff --git a/src/finn/compressor/run_dotp_comp_tests.sh b/src/finn/compressor/run_dotp_comp_tests.sh
index 56a73e4a4a..d6084c73d6 100755
--- a/src/finn/compressor/run_dotp_comp_tests.sh
+++ b/src/finn/compressor/run_dotp_comp_tests.sh
@@ -69,6 +69,9 @@ function parse_config {
 	if [[ "$TARGET" == "7series" ]]; then
 		CFG_PART="xc7z020clg400-1"  # Pynq-Z1
 		CFG_TARGET_FLAG="--target 7-Series"
+	elif [[ "$TARGET" == "ultrascale" ]]; then
+		CFG_PART="xczu9eg-ffvb1156-2-e"  # ZCU102
+		CFG_TARGET_FLAG="--target UltraScale"
 	else
 		CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
 		CFG_TARGET_FLAG=""
diff --git a/src/finn/compressor/run_tests.sh b/src/finn/compressor/run_tests.sh
index d8c517ad1c..5d7530fdcd 100755
--- a/src/finn/compressor/run_tests.sh
+++ b/src/finn/compressor/run_tests.sh
@@ -14,7 +14,7 @@
 ((${MAX_WORKERS:=12}))
 # Constant Absorption Option
 ca="$1"
-# Target platform (versal or 7series)
+# Target platform (versal, 7series, or ultrascale)
 target="${2:-versal}"
 
 # PYTHONPATH so python -m finn.compressor.src.* resolves
@@ -81,6 +81,7 @@ function run_test {
 
 # Phase 1: Sequential generation
 LABELS=()
+mkdir -p "$SCRIPT_DIR/gen"
 echo -e "Generating configs:\n"
 for test in "${TESTS[@]}"; do
 	echo "  $test ..."
diff --git a/src/finn/compressor/src/dotp.py b/src/finn/compressor/src/dotp.py
index b3b96b826b..2a5f738148 100644
--- a/src/finn/compressor/src/dotp.py
+++ b/src/finn/compressor/src/dotp.py
@@ -27,6 +27,10 @@
 	if target_arg == "7series":
 		target = SevenSeries()
 		fpga_part = "xc7z020clg400-1"
+	elif target_arg == "ultrascale":
+		from .target import UltraScale
+		target = UltraScale()
+		fpga_part = "xczu9eg-ffvb1156-2-e"
 	else:  # versal (default)
 		target = Versal()
 		fpga_part = "xcvc1902-vsva2197-2MP-e-S"

From fb2b3e8e05dd732acc23004d8ee1a6f032e81f6e Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Wed, 22 Apr 2026 14:22:21 +0100
Subject: [PATCH 06/10] [Style] Fix flake8 linting issues.

---
 src/finn/compressor/__init__.py               |   4 +-
 src/finn/compressor/src/add_multi_finn.py     | 157 +++---
 src/finn/compressor/src/benchmark.py          |  72 +--
 src/finn/compressor/src/dotp.py               | 169 +++----
 src/finn/compressor/src/dotp_finn.py          | 130 +++--
 src/finn/compressor/src/evaluation.py         | 138 +++---
 src/finn/compressor/src/graph/accumulator.py  |  49 +-
 .../counters/absorption_counter_candidates.py | 169 ++++---
 .../src/graph/counters/counter_candidates.py  | 449 ++++++++++--------
 src/finn/compressor/src/graph/final_adder.py  | 289 +++++------
 src/finn/compressor/src/graph/nodes.py        | 263 ++++++----
 src/finn/compressor/src/graph/primitives.py   |  84 ++--
 src/finn/compressor/src/graph/visitor.py      |  67 ++-
 src/finn/compressor/src/main.py               | 150 +++---
 .../src/passes/compressor_constructor.py      | 147 +++---
 .../src/passes/compressor_pipeliner.py        |  12 +-
 .../compressor/src/passes/cost_estimator.py   |  16 +-
 src/finn/compressor/src/passes/emitter.py     | 154 +++---
 .../compressor/src/passes/io_annotator.py     |  22 +-
 src/finn/compressor/src/passes/lut_placer.py  |  18 +-
 .../compressor/src/passes/node_iterator.py    |  89 ++--
 src/finn/compressor/src/passes/printer.py     |  23 +-
 .../compressor/src/passes/wire_inserter.py    |  13 +-
 src/finn/compressor/src/target.py             |  67 ++-
 src/finn/compressor/src/tests/test_gen.py     |  74 +--
 src/finn/compressor/src/tests/tester.py       |  13 +-
 src/finn/compressor/src/utils/mul_comp_map.py |   5 +-
 src/finn/compressor/src/utils/shape.py        |  32 +-
 .../rtl/matrixvectoractivation_rtl.py         |  23 +-
 .../fpgadataflow/specialize_layers.py         |  17 +-
 .../transformation/fpgadataflow/synth_ooc.py  |  10 +-
 31 files changed, 1673 insertions(+), 1252 deletions(-)

diff --git a/src/finn/compressor/__init__.py b/src/finn/compressor/__init__.py
index 38b3d95ea5..5adb8a0127 100644
--- a/src/finn/compressor/__init__.py
+++ b/src/finn/compressor/__init__.py
@@ -9,5 +9,7 @@
 
 """FINN compressor — LUT-based compressor tree generator for MVU."""
 
-from .src.dotp_finn import generate_dotp_comp
 from .src.add_multi_finn import generate_add_multi_comps
+from .src.dotp_finn import generate_dotp_comp
+
+__all__ = ["generate_add_multi_comps", "generate_dotp_comp"]
diff --git a/src/finn/compressor/src/add_multi_finn.py b/src/finn/compressor/src/add_multi_finn.py
index 3932584db0..27c5aec986 100644
--- a/src/finn/compressor/src/add_multi_finn.py
+++ b/src/finn/compressor/src/add_multi_finn.py
@@ -36,16 +36,14 @@
   comp_<N>u<W>_d<delay>.sv  — the generated compressor core(s)
 """
 
-import os
-import math
 import argparse
-import shutil
+import math
+import os
 
 from .main import generate_compressor
-from .target import resolve_target, resolve_target_name, Versal, SevenSeries
+from .target import resolve_target, resolve_target_name
 from .utils.shape import Shape
 
-
 # ---------------------------------------------------------------------------
 # Python replica of mvu.sv::sliceLanes()
 #
@@ -56,6 +54,7 @@
 # This outsourced computation is required as lane width is relevant to the
 # compressor input Shape and thus needs to be known at generation time.
 
+
 def clog2(n):
     """Ceiling of log2, matching SystemVerilog $clog2 semantics."""
     if n <= 1:
@@ -82,9 +81,9 @@ def slice_lanes(version, ww, aw, accu_width, narrow_weights):
     Returns
     -------
     (num_lanes, offsets) : tuple
-        num_lanes : int 
+        num_lanes : int
             number of DSP lanes.
-        offsets   : list[int] 
+        offsets   : list[int]
             lane boundary positions (length num_lanes+1).
     """
     a_width = 25 + 2 * (version > 1)
@@ -119,7 +118,7 @@ def lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights):
 
     Returns
     -------
-    list[int] 
+    list[int]
         lo_width for lane 0 .. num_lanes-1.
     """
     num_lanes, offsets = slice_lanes(version, ww, aw, accu_width, narrow_weights)
@@ -142,8 +141,7 @@ def comp_module_name(n, arg_width, delay):
     return f"comp_{n}u{arg_width}_d{delay}"
 
 
-def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir,
-                            name=None):
+def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir, name=None):
     """
     Generate a multi-input adder compressor (no accumulation).
 
@@ -185,13 +183,13 @@ def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir,
         shape=shape,
         name=tmp_name,
         comb_depth=pipeline_every,
-        accumulate=False,          # Pure adder, no fused accumulation
-        accumulator_width=None,    # Not applicable without accumulation
-        gates=[],                  # No gate absorption, inputs are complete values
-        constants=[],              # No Baugh-Wooley correction, unsigned inputs
+        accumulate=False,  # Pure adder, no fused accumulation
+        accumulator_width=None,  # Not applicable without accumulation
+        gates=[],  # No gate absorption, inputs are complete values
+        constants=[],  # No Baugh-Wooley correction, unsigned inputs
         path=tmp_path,
         test=False,
-        enable=False,              # No accumulator registers to initialize
+        enable=False,  # No accumulator registers to initialize
     )
 
     # Derive final name with delay suffix
@@ -214,8 +212,9 @@ def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir,
     return final_name, final_path, delay
 
 
-def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
-                             narrow_weights, output_dir):
+def generate_add_multi_comps(
+    fpgapart, version, simd, ww, aw, accu_width, narrow_weights, output_dir
+):
     """
     Generate add_multi compressor cores and patch add_multi.sv.
     This is the high-level entry point called by FINN's generate_hdl().
@@ -248,8 +247,10 @@ def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
     # Always generate compressors and patch add_multi.sv
     target = resolve_target(fpgapart)
 
-    # This is currently a parallel implementation of the lo_width computation in mvu.sv's sliceLanes() function.
-    # The resulting lo_width values determine the compressor input Shapes, so we need to compute them here in Python at generation time.
+    # This is currently a parallel implementation of the lo_width
+    # computation in mvu.sv's sliceLanes() function. The resulting
+    # lo_width values determine the compressor input Shapes, so we need
+    # to compute them here in Python at generation time.
     # Must be kept in SYNC.
     widths = lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights)
 
@@ -259,9 +260,12 @@ def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
         key = (simd, w)
         if key not in generated:
             name, _path, delay = generate_add_multi_comp(
-                target, simd, w,
+                target,
+                simd,
+                w,
                 pipeline_every=1,  # Max pipelining (match dotp_comp behavior)
-                output_dir=output_dir)
+                output_dir=output_dir,
+            )
             generated[key] = (name, delay)
 
     # Copy add_multi.sv to output_dir and inject CATCH_COMP lines
@@ -278,14 +282,14 @@ def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
     if marker not in add_multi_src:
         raise RuntimeError(
             "Cannot find FINN_GENERATED_COMP_ENTRIES marker in add_multi.sv. "
-            "Has the file been modified?")
+            "Has the file been modified?"
+        )
     add_multi_src = add_multi_src.replace(marker, catch_lines + marker)
 
     with open(patched_path, "w") as f:
         f.write(add_multi_src)
 
-    comp_files = [os.path.join(output_dir, name + ".sv")
-                  for (name, _delay) in generated.values()]
+    comp_files = [os.path.join(output_dir, name + ".sv") for (name, _delay) in generated.values()]
 
     return {
         "comp_names": [name for (name, _delay) in generated.values()],
@@ -296,56 +300,69 @@ def generate_add_multi_comps(fpgapart, version, simd, ww, aw, accu_width,
 
 def main():
     parser = argparse.ArgumentParser(
-        prog="add_multi_finn",
-        description="Generate a compressor core for FINN's add_multi module."
+        prog="add_multi_finn", description="Generate a compressor core for FINN's add_multi module."
+    )
+    parser.add_argument("--n", type=int, required=True, help="Number of unsigned addends (= SIMD)")
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        choices=["Versal", "7-Series", "UltraScale"],
+        help="Target FPGA generation",
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        type=int,
+        default=None,
+        help="Pipeline registers every N combinational stages",
+    )
+    parser.add_argument(
+        "-o", "--output_dir", default="../gen", help="Output directory for generated files"
+    )
+    parser.add_argument(
+        "--name", default=None, help="Module name override (default: comp_<N>u<W>_d<delay>)"
     )
-    parser.add_argument('--n', type=int, required=True,
-                        help="Number of unsigned addends (= SIMD)")
-    parser.add_argument('-t', '--target', default="Versal",
-                        choices=["Versal", "7-Series", "UltraScale"],
-                        help="Target FPGA generation")
-    parser.add_argument('-p', '--pipeline_every', type=int, default=None,
-                        help="Pipeline registers every N combinational stages")
-    parser.add_argument('-o', '--output_dir', default="../gen",
-                        help="Output directory for generated files")
-    parser.add_argument('--name', default=None,
-                        help="Module name override (default: comp_<N>u<W>_d<delay>)")
 
     # Direct mode: explicit arg_width
-    parser.add_argument('--arg_width', type=int, default=None,
-                        help="Bit width per addend (direct mode)")
+    parser.add_argument(
+        "--arg_width", type=int, default=None, help="Bit width per addend (direct mode)"
+    )
 
     # MVU mode: derive arg_width(s) from MVU parameters
     mvu_group = parser.add_argument_group(
-        'MVU parameters',
-        'When --mvu is given, lo_width values are computed from these '
-        'MVU-level parameters (replicating mvu.sv::sliceLanes).'
+        "MVU parameters",
+        "When --mvu is given, lo_width values are computed from these "
+        "MVU-level parameters (replicating mvu.sv::sliceLanes).",
+    )
+    mvu_group.add_argument(
+        "--mvu", action="store_true", help="Enable MVU mode: derive arg_width from MVU params"
+    )
+    mvu_group.add_argument(
+        "--version",
+        type=int,
+        default=2,
+        choices=[1, 2, 3],
+        help="DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58)",
+    )
+    mvu_group.add_argument("--ww", type=int, default=None, help="WEIGHT_WIDTH")
+    mvu_group.add_argument("--aw", type=int, default=None, help="ACTIVATION_WIDTH")
+    mvu_group.add_argument("--accu_width", type=int, default=None, help="ACCU_WIDTH")
+    mvu_group.add_argument(
+        "--narrow_weights", type=int, default=0, choices=[0, 1], help="NARROW_WEIGHTS flag (0 or 1)"
     )
-    mvu_group.add_argument('--mvu', action='store_true',
-                           help="Enable MVU mode: derive arg_width from MVU params")
-    mvu_group.add_argument('--version', type=int, default=2,
-                           choices=[1, 2, 3],
-                           help="DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58)")
-    mvu_group.add_argument('--ww', type=int, default=None,
-                           help="WEIGHT_WIDTH")
-    mvu_group.add_argument('--aw', type=int, default=None,
-                           help="ACTIVATION_WIDTH")
-    mvu_group.add_argument('--accu_width', type=int, default=None,
-                           help="ACCU_WIDTH")
-    mvu_group.add_argument('--narrow_weights', type=int, default=0,
-                           choices=[0, 1],
-                           help="NARROW_WEIGHTS flag (0 or 1)")
 
     args = parser.parse_args()
 
     # Validate argument combinations
     if not args.mvu and args.arg_width is None:
-        parser.error("Either --arg_width (direct mode) or --mvu with MVU "
-                     "parameters is required.")
+        parser.error(
+            "Either --arg_width (direct mode) or --mvu with MVU " "parameters is required."
+        )
     if args.mvu and args.arg_width is not None:
         parser.error("--arg_width and --mvu are mutually exclusive.")
     if args.mvu:
-        for param in ('ww', 'aw', 'accu_width'):
+        for param in ("ww", "aw", "accu_width"):
             if getattr(args, param) is None:
                 parser.error(f"--mvu requires --{param}")
 
@@ -363,16 +380,19 @@ def main():
             return
 
         widths = lo_widths_from_mvu_params(
-            args.version, args.ww, args.aw,
-            args.accu_width, bool(args.narrow_weights)
+            args.version, args.ww, args.aw, args.accu_width, bool(args.narrow_weights)
         )
         depth = 3 + clog2(simd) + (1 if simd == 1 else 0) + 1
         add_multi_depth = depth - 4
 
-        print(f"MVU config: VERSION={args.version} WW={args.ww} AW={args.aw} "
-              f"ACCU_WIDTH={args.accu_width} NARROW_WEIGHTS={args.narrow_weights}")
-        print(f"  NUM_LANES={len(widths)}  PIPELINE_DEPTH={depth}  "
-              f"ADD_MULTI_DEPTH={add_multi_depth}")
+        print(
+            f"MVU config: VERSION={args.version} WW={args.ww} AW={args.aw} "
+            f"ACCU_WIDTH={args.accu_width} NARROW_WEIGHTS={args.narrow_weights}"
+        )
+        print(
+            f"  NUM_LANES={len(widths)}  PIPELINE_DEPTH={depth}  "
+            f"ADD_MULTI_DEPTH={add_multi_depth}"
+        )
         print(f"  LO_WIDTHs: {widths}")
 
         # Generate one compressor per unique (N, lo_width)
@@ -384,8 +404,7 @@ def main():
             seen.add((simd, w))
 
             comp_name, comp_path, comp_delay = generate_add_multi_comp(
-                target, simd, w,
-                args.pipeline_every, args.output_dir, name=args.name
+                target, simd, w, args.pipeline_every, args.output_dir, name=args.name
             )
             print(f"  Lane {lane}: lo_width={w}")
             print(f"    Generated: {comp_path}")
@@ -395,8 +414,8 @@ def main():
     else:
         # Direct mode: single compressor for explicit arg_width
         comp_name, comp_path, comp_delay = generate_add_multi_comp(
-            target, args.n, args.arg_width,
-            args.pipeline_every, args.output_dir, name=args.name)
+            target, args.n, args.arg_width, args.pipeline_every, args.output_dir, name=args.name
+        )
 
         print(f"Generated compressor core: {comp_path}")
         print(f"  Module name:     {comp_name}")
diff --git a/src/finn/compressor/src/benchmark.py b/src/finn/compressor/src/benchmark.py
index b4f7a5969b..68b2e87cb3 100644
--- a/src/finn/compressor/src/benchmark.py
+++ b/src/finn/compressor/src/benchmark.py
@@ -6,56 +6,72 @@
 # @brief    Benchmarking harness for compressor generation
 #############################################################################
 
+from functools import reduce
+
 from .passes.compressor_constructor import CompressorConstructor
-from .target import Versal
 from .passes.cost_estimator import CostEstimator
+from .target import Versal
 from .utils.shape import Shape
-from functools import reduce
+
 
 def gmean(numbers):
-    return reduce(lambda x, y: x*y, numbers)**(1.0/len(numbers))
+    return reduce(lambda x, y: x * y, numbers) ** (1.0 / len(numbers))
+
 
 def benchmark():
     examples = {
         "128": Shape([128]),
         "256": Shape([256]),
         "512": Shape([512]),
-        "128,128": Shape([128,128]),
-        "256,256": Shape([256,256]),
-        "512,512": Shape([512,512]),
-        "Int1": Shape([1,1,2,3,4,5,6,7,5,4,3,2,1]),
-        "Int2": Shape([1,1,1,3,5,7,9,11,13,10,8,6,4,2,1]),
-        "Int3": Shape([1,1,1,1,5,9,13,17,21,25,20,16,12,8,4]),
-        "Int4": Shape([1,1,1,1,1,9,17,25,33,41,49,40,32,24,16,8]),
-        "Int5": Shape([1,1,1,1,1,1,17,33,49,65,81,97,80,64,48,32,16]),
-        "LPFP1": Shape([1,1,1,1,1,1,1,1,1,1,1,1,1,1,2]),
-        "LPFP2": Shape([2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4]),
-        "LPFP3": Shape([4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,8]),
-        "LPFP4": Shape([8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16]),
-        "LPFP5": Shape([16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,32]),
-        "6-Input": Shape(32*[6]),
-        "10-Input": Shape(32*[10]),
-        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
+        "128,128": Shape([128, 128]),
+        "256,256": Shape([256, 256]),
+        "512,512": Shape([512, 512]),
+        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
+        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
+        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
+        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
+        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
+        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
+        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
+        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
+        "LPFP5": Shape(
+            [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32]
+        ),
+        "6-Input": Shape(32 * [6]),
+        "10-Input": Shape(32 * [10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16)))),
     }
 
     luts = []
     for example_name, example_shape in examples.items():
         target = Versal()
         constructor = CompressorConstructor()
-        comp = constructor(target.counter_candidates, 
-                           target.absorbing_counter_candidates,
-                           target.final_adder, example_shape, 
-                           "comp", 1, True, None, tuple(), [])
-        
+        comp = constructor(
+            target.counter_candidates,
+            target.absorbing_counter_candidates,
+            target.final_adder,
+            example_shape,
+            "comp",
+            1,
+            True,
+            None,
+            tuple(),
+            [],
+        )
+
         cost = CostEstimator()
         comp.accept(cost)
         eff = (sum(comp.input_shape) - sum(comp.output_shape)) / cost.luts
         luts.append(cost.luts)
-        print(f"Example {example_name:<10} uses {cost.luts:<6} LUTs"
-              f"for {cost.combinatorial_stages} stages (Efficiency: {eff: 1.2f})")
+        print(
+            f"Example {example_name:<10} uses {cost.luts:<6} LUTs"
+            f"for {cost.combinatorial_stages} stages (Efficiency: {eff: 1.2f})"
+        )
 
     luts_gmean = gmean(luts)
     print(f"Geomean {luts_gmean:.6} LUTs")
 
-if __name__=="__main__":
-    benchmark()
\ No newline at end of file
+
+if __name__ == "__main__":
+    benchmark()
diff --git a/src/finn/compressor/src/dotp.py b/src/finn/compressor/src/dotp.py
index 2a5f738148..f6c049b6de 100644
--- a/src/finn/compressor/src/dotp.py
+++ b/src/finn/compressor/src/dotp.py
@@ -6,96 +6,101 @@
 # @brief    Dot product compressor core generation for standalone testing
 #############################################################################
 
-import sys, re, os
+import os
+import re
+import sys
+
 from .main import generate_compressor
-from .target import Target, Versal, SevenSeries
-from .utils.shape import Shape
+from .target import SevenSeries, Versal
 from .utils.mul_comp_map import MulCompMap
-from typing import Optional, List
-
+from .utils.shape import Shape
 
 if __name__ == "__main__":
+    # Parse and extract Parameters from Command Line
+    sig = sys.argv[1]
+    _ = re.fullmatch("(\\d+)x([us])(\\d+)([us])(\\d+)", sig).groups()
+    (n, na, nb, sa, sb) = (int(_[0]), int(_[2]), int(_[4]), _[1] == "s", _[3] == "s")
+    assert nb <= na
+
+    # Target platform: ca/accu goes in argv[2], target in argv[3] (default versal)
+    target_arg = sys.argv[3] if len(sys.argv) > 3 else "versal"
+    if target_arg == "7series":
+        target = SevenSeries()
+        fpga_part = "xc7z020clg400-1"
+    elif target_arg == "ultrascale":
+        from .target import UltraScale
 
-	# Parse and extract Parameters from Command Line
-	sig = sys.argv[1]
-	_ = re.fullmatch("(\\d+)x([us])(\\d+)([us])(\\d+)", sig).groups()
-	(n, na, nb, sa, sb) = (int(_[0]), int(_[2]), int(_[4]), _[1] == 's', _[3] == 's')
-	assert nb <= na
+        target = UltraScale()
+        fpga_part = "xczu9eg-ffvb1156-2-e"
+    else:  # versal (default)
+        target = Versal()
+        fpga_part = "xcvc1902-vsva2197-2MP-e-S"
 
-	# Target platform: ca/accu goes in argv[2], target in argv[3] (default versal)
-	target_arg = sys.argv[3] if len(sys.argv) > 3 else "versal"
-	if target_arg == "7series":
-		target = SevenSeries()
-		fpga_part = "xc7z020clg400-1"
-	elif target_arg == "ultrascale":
-		from .target import UltraScale
-		target = UltraScale()
-		fpga_part = "xczu9eg-ffvb1156-2-e"
-	else:  # versal (default)
-		target = Versal()
-		fpga_part = "xcvc1902-vsva2197-2MP-e-S"
+    def clog2(x):
+        return (x - 1).bit_length()
 
-	clog2 = lambda x: (x-1).bit_length()
-	np = clog2(n) + (na if nb == 1 and not sb else na+nb) if na > 1 else (
-			clog2(n+1) if sa == sb else 1 + clog2(n)
-		)
+    np = (
+        clog2(n) + (na if nb == 1 and not sb else na + nb)
+        if na > 1
+        else (clog2(n + 1) if sa == sb else 1 + clog2(n))
+    )
 
-	map = MulCompMap(na, nb, sa, sb)
-	shape = [col * n for col in map.shape()]
-	print("Shape: ", ' '.join((':'.join((f"{val:x}" for val in col)) for col in shape[::-1])))
+    map = MulCompMap(na, nb, sa, sb)
+    shape = [col * n for col in map.shape()]
+    print("Shape: ", " ".join((":".join((f"{val:x}" for val in col)) for col in shape[::-1])))
 
-	# Absolute Term Contribution
-	constants = []
-	abs_term  = n * map.absolute_term()
-	# Move absolute term into absorbed constant if requested
-	if len(sys.argv) > 2 and sys.argv[2] == 'ca':
-		print("Constant absorption.")
-		if abs_term < 0:
-			abs_term += 2**np
-		constants = [(abs_term >> i) & 1 for i in range(np)]
-		abs_term  = 0
+    # Absolute Term Contribution
+    constants = []
+    abs_term = n * map.absolute_term()
+    # Move absolute term into absorbed constant if requested
+    if len(sys.argv) > 2 and sys.argv[2] == "ca":
+        print("Constant absorption.")
+        if abs_term < 0:
+            abs_term += 2**np
+        constants = [(abs_term >> i) & 1 for i in range(np)]
+        abs_term = 0
 
-	name = "comp_" + sig
-	# Write to gen/ relative to this script's parent directory (compressor/)
-	script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-	output_path = os.path.join(script_dir, "gen", name + ".sv")
-	generate_compressor(
-		target            = target,
-		shape             = Shape((len(col) for col in shape)),
-		name              = name,
-		comb_depth        = None,
-		accumulate        = False,
-		accumulator_width = None,
-		gates = [[f"{val:x}" for val in col] for col in shape],
-		constants = constants,
-		path = output_path,
-		test = False
-	)
+    name = "comp_" + sig
+    # Write to gen/ relative to this script's parent directory (compressor/)
+    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    output_path = os.path.join(script_dir, "gen", name + ".sv")
+    generate_compressor(
+        target=target,
+        shape=Shape((len(col) for col in shape)),
+        name=name,
+        comb_depth=None,
+        accumulate=False,
+        accumulator_width=None,
+        gates=[[f"{val:x}" for val in col] for col in shape],
+        constants=constants,
+        path=output_path,
+        test=False,
+    )
 
-	# Process templates with absolute paths
-	gen_dir = os.path.join(script_dir, "gen")
-	hdl_dir = os.path.join(script_dir, "hdl")
-	for (src_rel, dst_rel) in (
-		("dotp_template.sv", "dotp_"+sig+".sv"),
-		("dotp_tb_template.sv", "dotp_"+sig+"_tb.sv"),
-		("dotp_template.tcl", "dotp_"+sig+".tcl")
-	):
-		src = os.path.join(hdl_dir, src_rel)
-		dst = os.path.join(gen_dir, dst_rel)
-		with open(src, "rt") as fsrc:
-			with open(dst, "wt") as fdst:
-				for l in fsrc:
-					fdst.write(l
-						.replace("{n}", str(n))
-						.replace("{na}", str(na))
-						.replace("{nb}", str(nb))
-						.replace("{sa}", 's' if sa else 'u')
-						.replace("{sb}", 's' if sb else 'u')
-						.replace("{signed_a}", str(int(sa)))
-						.replace("{signed_b}", str(int(sb)))
-						.replace("{abs_term}", str(abs_term))
-						.replace("{part}", fpga_part)
-						# Replace relative paths with absolute paths for TCL
-						.replace("hdl/", hdl_dir + "/")
-						.replace("gen/", gen_dir + "/")
-					)
+    # Process templates with absolute paths
+    gen_dir = os.path.join(script_dir, "gen")
+    hdl_dir = os.path.join(script_dir, "hdl")
+    for src_rel, dst_rel in (
+        ("dotp_template.sv", "dotp_" + sig + ".sv"),
+        ("dotp_tb_template.sv", "dotp_" + sig + "_tb.sv"),
+        ("dotp_template.tcl", "dotp_" + sig + ".tcl"),
+    ):
+        src = os.path.join(hdl_dir, src_rel)
+        dst = os.path.join(gen_dir, dst_rel)
+        with open(src, "rt") as fsrc:
+            with open(dst, "wt") as fdst:
+                for line in fsrc:
+                    fdst.write(
+                        line.replace("{n}", str(n))
+                        .replace("{na}", str(na))
+                        .replace("{nb}", str(nb))
+                        .replace("{sa}", "s" if sa else "u")
+                        .replace("{sb}", "s" if sb else "u")
+                        .replace("{signed_a}", str(int(sa)))
+                        .replace("{signed_b}", str(int(sb)))
+                        .replace("{abs_term}", str(abs_term))
+                        .replace("{part}", fpga_part)
+                        # Replace relative paths with absolute paths for TCL
+                        .replace("hdl/", hdl_dir + "/")
+                        .replace("gen/", gen_dir + "/")
+                    )
diff --git a/src/finn/compressor/src/dotp_finn.py b/src/finn/compressor/src/dotp_finn.py
index db24e38a67..3b52dd4179 100644
--- a/src/finn/compressor/src/dotp_finn.py
+++ b/src/finn/compressor/src/dotp_finn.py
@@ -23,12 +23,13 @@
   comp_<sig>.sv  — the generated compressor core (module `comp_<sig>`)
 """
 
+import argparse
 import os
 import re
-import argparse
+
 from .main import generate_compressor
-from .utils.mul_comp_map import MulCompMap
 from .target import resolve_target, resolve_target_name
+from .utils.mul_comp_map import MulCompMap
 from .utils.shape import Shape
 
 
@@ -48,10 +49,9 @@ def expand_template(template_path, output_path, substitutions):
         text = f.read()
     for key, value in substitutions.items():
         text = text.replace(key, value)
-    remaining = re.findall(r'\$[A-Z_]+\$', text)
+    remaining = re.findall(r"\$[A-Z_]+\$", text)
     if remaining:
-        raise ValueError(
-            f"Unsubstituted placeholders in {output_path}: {remaining}")
+        raise ValueError(f"Unsubstituted placeholders in {output_path}: {remaining}")
     with open(output_path, "w") as f:
         f.write(text)
 
@@ -86,9 +86,9 @@ def comp_module_name(n, sa, na, sb, nb, accu_width):
     return "comp_" + make_signature(n, sa, na, sb, nb) + f"_a{accu_width}"
 
 
-
-def generate_comp_module(target, n, na, nb, sa, sb, accu_width,
-                         pipeline_every, output_dir, name=None):
+def generate_comp_module(
+    target, n, na, nb, sa, sb, accu_width, pipeline_every, output_dir, name=None
+):
     """Generate the compressor core with fused accumulation.
 
     When *name* is None (the default), the module is named after its
@@ -159,24 +159,35 @@ def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_di
     n, na, nb, sa, sb, _ = compute_params(simd, ww, aw, signed_act)
 
     comp_name, comp_path, comp_delay = generate_comp_module(
-        target, n, na, nb, sa, sb, accu_width,
+        target,
+        n,
+        na,
+        nb,
+        sa,
+        sb,
+        accu_width,
         pipeline_every=1,  # Max pipelining
-        output_dir=output_dir)
+        output_dir=output_dir,
+    )
 
     # Expand dotp_comp template with the generated module name
     src_dir = os.path.dirname(os.path.abspath(__file__))
     compressor_root = os.path.abspath(os.path.join(src_dir, ".."))
     dotp_comp_template = os.path.join(compressor_root, "hdl", "dotp_comp_template.sv")
     dotp_comp_path = os.path.join(output_dir, "dotp_comp.sv")
-    expand_template(dotp_comp_template, dotp_comp_path, {
-        "$COMP_MODULE_NAME$": comp_name,
-        "$EXPECTED_SIMD$": str(simd),
-        "$EXPECTED_NA$": str(na),
-        "$EXPECTED_NB$": str(nb),
-        "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
-        "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
-        "$EXPECTED_ACCU_WIDTH$": str(accu_width),
-    })
+    expand_template(
+        dotp_comp_template,
+        dotp_comp_path,
+        {
+            "$COMP_MODULE_NAME$": comp_name,
+            "$EXPECTED_SIMD$": str(simd),
+            "$EXPECTED_NA$": str(na),
+            "$EXPECTED_NB$": str(nb),
+            "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
+            "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
+            "$EXPECTED_ACCU_WIDTH$": str(accu_width),
+        },
+    )
 
     return {
         "comp_name": comp_name,
@@ -191,49 +202,76 @@ def main():
     default_dotp_template = os.path.join(repo_root, "hdl", "dotp_comp_template.sv")
 
     parser = argparse.ArgumentParser(
-        prog="dotp_finn",
-        description="Generate a compressor core for FINN's dotp_comp module."
+        prog="dotp_finn", description="Generate a compressor core for FINN's dotp_comp module."
+    )
+    parser.add_argument("--simd", type=int, required=True, help="SIMD (operand pairs per cycle)")
+    parser.add_argument("--ww", type=int, required=True, help="Weight bit width")
+    parser.add_argument("--aw", type=int, required=True, help="Activation bit width")
+    parser.add_argument("--accu_width", type=int, required=True, help="Accumulator bit width")
+    parser.add_argument("--signed_activations", action="store_true", help="Activations are signed")
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        choices=["Versal", "7-Series", "UltraScale"],
+        help="Target FPGA generation",
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        type=int,
+        default=None,
+        help="Pipeline registers every N combinational stages",
+    )
+    parser.add_argument(
+        "-o", "--output_dir", default="../gen", help="Output directory for generated files"
+    )
+    parser.add_argument(
+        "-n", "--name", default=None, help="Module name override (default: comp_<sig>)"
+    )
+    parser.add_argument(
+        "--dotp-template",
+        default=default_dotp_template,
+        help="Path to dotp_comp template file to expand",
+    )
+    parser.add_argument(
+        "--dotp-output-name",
+        default="dotp_comp.sv",
+        help="Output file name for expanded dotp_comp template",
+    )
+    parser.add_argument(
+        "--skip-dotp-template", action="store_true", help="Skip expanding dotp_comp template"
     )
-    parser.add_argument('--simd', type=int, required=True, help="SIMD (operand pairs per cycle)")
-    parser.add_argument('--ww', type=int, required=True, help="Weight bit width")
-    parser.add_argument('--aw', type=int, required=True, help="Activation bit width")
-    parser.add_argument('--accu_width', type=int, required=True, help="Accumulator bit width")
-    parser.add_argument('--signed_activations', action='store_true',
-                        help="Activations are signed")
-    parser.add_argument('-t', '--target', default="Versal",
-                        choices=["Versal", "7-Series", "UltraScale"],
-                        help="Target FPGA generation")
-    parser.add_argument('-p', '--pipeline_every', type=int, default=None,
-                        help="Pipeline registers every N combinational stages")
-    parser.add_argument('-o', '--output_dir', default="../gen",
-                        help="Output directory for generated files")
-    parser.add_argument('-n', '--name', default=None,
-                        help="Module name override (default: comp_<sig>)")
-    parser.add_argument('--dotp-template', default=default_dotp_template,
-                        help="Path to dotp_comp template file to expand")
-    parser.add_argument('--dotp-output-name', default="dotp_comp.sv",
-                        help="Output file name for expanded dotp_comp template")
-    parser.add_argument('--skip-dotp-template', action='store_true',
-                        help="Skip expanding dotp_comp template")
     args = parser.parse_args()
     target = resolve_target_name(args.target)
     os.makedirs(args.output_dir, exist_ok=True)
 
     # Compute compressor parameters
     n, na, nb, sa, sb, swapped = compute_params(
-        args.simd, args.ww, args.aw, args.signed_activations)
+        args.simd, args.ww, args.aw, args.signed_activations
+    )
 
     # Generate the compressor core with fused accumulation
     comp_name, comp_path, comp_delay = generate_comp_module(
-        target, n, na, nb, sa, sb, args.accu_width,
-        args.pipeline_every, args.output_dir, name=args.name)
+        target,
+        n,
+        na,
+        nb,
+        sa,
+        sb,
+        args.accu_width,
+        args.pipeline_every,
+        args.output_dir,
+        name=args.name,
+    )
 
     dotp_path = None
     if not args.skip_dotp_template:
         template_path = os.path.abspath(args.dotp_template)
         if not os.path.isfile(template_path):
             raise FileNotFoundError(
-                f"dotp template not found: {template_path}. Use --dotp-template or --skip-dotp-template."
+                f"dotp template not found: {template_path}. "
+                f"Use --dotp-template or --skip-dotp-template."
             )
         dotp_path = os.path.join(args.output_dir, args.dotp_output_name)
         expand_template(
diff --git a/src/finn/compressor/src/evaluation.py b/src/finn/compressor/src/evaluation.py
index 99ed49a33d..61dc1f82ea 100644
--- a/src/finn/compressor/src/evaluation.py
+++ b/src/finn/compressor/src/evaluation.py
@@ -6,41 +6,45 @@
 # @brief    Evaluation and benchmarking utilities for compressor
 #############################################################################
 
-from .target import Versal
-from .utils.shape import Shape
+import subprocess
+from concurrent.futures import ThreadPoolExecutor
+
 from .main import generate_compressor
+from .target import Versal
 from .tests.test_gen import compressed_width
-from concurrent.futures import ThreadPoolExecutor
-import subprocess
+from .utils.shape import Shape
+
 
 def evaluation():
     examples = {
         "128": Shape([128]),
         "256": Shape([256]),
         "512": Shape([512]),
-        "128,128": Shape([128,128]),
-        "256,256": Shape([256,256]),
-        "512,512": Shape([512,512]),
-        "Int1": Shape([1,1,2,3,4,5,6,7,5,4,3,2,1]),
-        "Int2": Shape([1,1,1,3,5,7,9,11,13,10,8,6,4,2,1]),
-        "Int3": Shape([1,1,1,1,5,9,13,17,21,25,20,16,12,8,4]),
-        "Int4": Shape([1,1,1,1,1,9,17,25,33,41,49,40,32,24,16,8]),
-        "Int5": Shape([1,1,1,1,1,1,17,33,49,65,81,97,80,64,48,32,16]),
-        "LPFP1": Shape([1,1,1,1,1,1,1,1,1,1,1,1,1,1,2]),
-        "LPFP2": Shape([2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,4]),
-        "LPFP3": Shape([4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,8]),
-        "LPFP4": Shape([8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,16]),
-        "LPFP5": Shape([16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,32]),
-        "6x32": Shape(32*[6]),
-        "10x32": Shape(32*[10]),
-        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
+        "128,128": Shape([128, 128]),
+        "256,256": Shape([256, 256]),
+        "512,512": Shape([512, 512]),
+        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
+        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
+        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
+        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
+        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
+        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
+        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
+        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
+        "LPFP5": Shape(
+            [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32]
+        ),
+        "6x32": Shape(32 * [6]),
+        "10x32": Shape(32 * [10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16)))),
     }
 
     filenames = []
     for example_name, example_shape in examples.items():
         print(example_name, example_shape)
         # combinatorial design
-        filename = "../gen/"+example_name+"_comb.sv"
+        filename = "../gen/" + example_name + "_comb.sv"
         generate_compressor(
             target=Versal(),
             shape=example_shape,
@@ -51,13 +55,14 @@ def evaluation():
             gates=None,
             constants=[],
             path=filename,
-            test=True
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=False, gates=False, accumulation=False, filename=filename
         )
-        generate_wrapper(shape=example_shape, pipelined=False, gates=False,
-                         accumulation=False, filename=filename)
         filenames.append(filename)
         # accumulating design
-        filename = "../gen/"+example_name+"_acc.sv"
+        filename = "../gen/" + example_name + "_acc.sv"
         generate_compressor(
             target=Versal(),
             shape=example_shape,
@@ -68,13 +73,14 @@ def evaluation():
             gates=None,
             constants=[],
             path=filename,
-            test=True
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=True, gates=False, accumulation=True, filename=filename
         )
-        generate_wrapper(shape=example_shape, pipelined=True, gates=False,
-                         accumulation=True, filename=filename)
         filenames.append(filename)
         # gate inlined design with accumulation
-        filename = "../gen/"+example_name+"_gate.sv"
+        filename = "../gen/" + example_name + "_gate.sv"
         generate_compressor(
             target=Versal(),
             shape=example_shape,
@@ -85,27 +91,34 @@ def evaluation():
             gates=[["8" for el in range(col)] for col in example_shape],
             constants=[],
             path=filename,
-            test=True
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=True, gates=True, accumulation=True, filename=filename
         )
-        generate_wrapper(shape=example_shape, pipelined=True, gates=True,
-                         accumulation=True, filename=filename)
         filenames.append(filename)
 
     tclfiles = [emit_eval_tcl_script(el) for el in filenames]
 
     def call_vivado(filename):
+        vivado_path = (
+            "/proj/xbuilds/released/2023.1/2023.1_0508_1/"
+            "installs/lin64/Vivado/2023.1/settings64.sh"
+        )
         command = f"""cd ../gen/ &&
-            ls && 
-            source /proj/xbuilds/released/2023.1/2023.1_0508_1/installs/lin64/Vivado/2023.1/settings64.sh && 
+            ls &&
+            source {vivado_path} &&
             vivado -mode batch -source {filename.split("/")[-1]}"""
-        return subprocess.run(command, shell=True, check=True, timeout=3600, 
-                              text=True, executable="/bin/bash")
+        return subprocess.run(
+            command, shell=True, check=True, timeout=3600, text=True, executable="/bin/bash"
+        )
 
     print("Executing evaluation threads")
     with ThreadPoolExecutor(max_workers=15) as executor:
         executor.map(call_vivado, tclfiles)
     print("Done executing evaluation threads")
 
+
 def generate_wrapper(shape, pipelined, gates, accumulation, filename):
     iw = sum(shape)
     ow = compressed_width(shape)
@@ -121,12 +134,12 @@ def generate_wrapper(shape, pipelined, gates, accumulation, filename):
     input_str = "\tinput " + ", ".join(inputs) + ",\n"
     output_str = f"\toutput logic [{ow-1}:0] outReg"
 
-    wrapper_str =  (
-    "module sandwich(\n" +
-    input_str + 
-    output_str +
-    '\n);\n' + 
-    f"""
+    wrapper_str = (
+        "module sandwich(\n"
+        + input_str
+        + output_str
+        + "\n);\n"
+        + f"""
 \t{"logic en_negReg, rstReg;" if accumulation else ""}
 \tlogic [{iw-1}:0] inReg{", in_2Reg;" if gates else ";"}
 \twire [{ow-1}:0] out;
@@ -141,17 +154,20 @@ def generate_wrapper(shape, pipelined, gates, accumulation, filename):
 \t
 \t(* keep_hierarchy = "yes" *)
 \tcomp c(.in(inReg), .clk(clk),{" .in_2(in_2Reg)," if gates else ""
-                                }{" .en_neg(en_negReg), .rst(rstReg)," 
+                                }{" .en_neg(en_negReg), .rst(rstReg),"
                                   if accumulation else ""} .out(out));
 
 endmodule"""
     )
-    with open(filename, 'a') as f:
+    with open(filename, "a") as f:
         f.writelines(wrapper_str)
 
+
 def emit_eval_tcl_script(compressor_path):
-    comps = "set comps { " + str(compressor_path.split("/")[-1])  + " }"
-    script = comps + """
+    comps = "set comps { " + str(compressor_path.split("/")[-1]) + " }"
+    script = (
+        comps
+        + """
 set PART xcvc1902-vsva2197-2MP-e-S ; # From VCK190 Evaluation Board
 
 foreach comp $comps {
@@ -164,7 +180,7 @@ def emit_eval_tcl_script(compressor_path):
     set filename $filename_prefix$comp$filename_suffix
     puts $filename
     set outfile [open $filename w]
-    puts $outfile "\{"
+    puts $outfile "\\{"
 
     set tm 0.7 ; # Minimum possible ime
     set tt 10.0 ; # Time to Test
@@ -176,7 +192,7 @@ def emit_eval_tcl_script(compressor_path):
     synth_design -top sandwich -part $PART
 
     # -----------------------------------------------------------------------------
-    # while loop, updating clock 
+    # while loop, updating clock
     while {[expr $ts - $tm] > 0.1} {
         puts "NEW SYNTHESIS RUN WITH FREQ $tt"
         create_clock -name CLK -period $tt [get_port clk]
@@ -189,32 +205,36 @@ def emit_eval_tcl_script(compressor_path):
         route_design -directive Explore
         report_drc
         report_utilization -hierarchical
-        report_timing -setup -hold -max_paths 3 -nworst 3 -input_pins -sort_by group -file $comp.twrA
-        report_timing_summary -delay_type min_max -path_type full_clock_expanded -report_unconstrained -check_timing_verbose -max_paths 3 -nworst 3 -significant_digits 3 -input_pins -file $comp.twrA
+        report_timing -setup -hold -max_paths 3 -nworst 3 -input_pins \\
+            -sort_by group -file $comp.twrA
+        report_timing_summary -delay_type min_max \\
+            -path_type full_clock_expanded -report_unconstrained \\
+            -check_timing_verbose -max_paths 3 -nworst 3 \\
+            -significant_digits 3 -input_pins -file $comp.twrA
 
         # -----------------------------------------------------------------------------
         # Find maximum data path delay and slack
         set f [open $comp.twrA r]
         set file_data [read $f]
         close $f
-        if {[regexp { +Data Path Delay: +(\d+\.\d+)} $file_data -> value]} {
+        if {[regexp { +Data Path Delay: +(\\d+\\.\\d+)} $file_data -> value]} {
             set tr $value
         } {
             error "DATA PATH DELAY NOT FOUND"
         }
 
         # -----------------------------------------------------------------------------
-        # Find LUT and Slice utilization 
+        # Find LUT and Slice utilization
         set f [open util_$comp.twrA r]
         set file_data [read $f]
         close $f
-        if {[regexp {CLB LUTs +\| +(\d+)} $file_data -> value]} {
+        if {[regexp {CLB LUTs +\\| +(\\d+)} $file_data -> value]} {
             set lc $value
         } {
             error "LUT UTILIZATION NOT FOUND"
         }
 
-        if {[regexp {SLICE +\| +(\d+)} $file_data -> value]} {
+        if {[regexp {SLICE +\\| +(\\d+)} $file_data -> value]} {
             set sc $value
         } {
             error "SLICE UTILIZATION NOT FOUND"
@@ -227,7 +247,7 @@ def emit_eval_tcl_script(compressor_path):
             set tm $tt
             if { $tr < $ts } {
                 set ts $tr
-            } 
+            }
         } else {
             set ts $tr
         }
@@ -238,16 +258,18 @@ def emit_eval_tcl_script(compressor_path):
     puts -nonewline $outfile "\\"Slice\\": $sc,"
     puts -nonewline $outfile "\\"LUTS\\": $lc" ;
 
-    puts $outfile "\}"
+    puts $outfile "\\}"
     close $outfile
     remove_files {$comp}
 }
 q
 """
+    )
     tclpath = compressor_path.replace(".sv", ".tcl")
     with open(tclpath, "w") as f:
         f.writelines(script)
     return tclpath
 
-if __name__=="__main__":
-    evaluation()
\ No newline at end of file
+
+if __name__ == "__main__":
+    evaluation()
diff --git a/src/finn/compressor/src/graph/accumulator.py b/src/finn/compressor/src/graph/accumulator.py
index 2fa585dc5d..b85dd78061 100644
--- a/src/finn/compressor/src/graph/accumulator.py
+++ b/src/finn/compressor/src/graph/accumulator.py
@@ -6,19 +6,26 @@
 # @brief    Accumulator stage implementation for compressor
 #############################################################################
 
-from .nodes import Shape, Wire, Logic, Stage, Bitmatrix
 from collections.abc import Iterable
 
+from .nodes import Bitmatrix, Logic, Shape, Stage, Wire
+
+
 class AccumulatorStage(Stage):
-    def __init__(self, shape: Shape, final_adder, preceeding_pipeline_stages,
-                 accumulator_width = None, enable = False):
+    def __init__(
+        self,
+        shape: Shape,
+        final_adder,
+        preceeding_pipeline_stages,
+        accumulator_width=None,
+        enable=False,
+    ):
         super().__init__()
         self.input_shape = shape
-        self.output_shape = Shape([1 for _ in range(
-            self.get_accumulator_width(accumulator_width))])
+        self.output_shape = Shape([1 for _ in range(self.get_accumulator_width(accumulator_width))])
         self.instances = []
         self.input_wires = Bitmatrix(shape)
-        self.output_wires = Bitmatrix(self.output_shape) # TODO: Make Logic
+        self.output_wires = Bitmatrix(self.output_shape)  # TODO: Make Logic
         self.accumulator_width = self.get_accumulator_width(accumulator_width)
         self.final_adder_gen = final_adder
         self.preceeding_pipeline_stages = preceeding_pipeline_stages
@@ -51,17 +58,16 @@ def build_hardware(self):
         # integration en is hardwired to '1 making this technically redundant,
         # but the FPGA INIT attribute is free and keeps the design robust
         # against future uses where en may be gated.
-        rst_del = self.delay_signal(rst, self.preceeding_pipeline_stages+1,
-                                    en=en_wire,
-                                    init=1 if self.enable else None)
-        en_neg_del = self.delay_signal(en_neg, self.preceeding_pipeline_stages,
-                                       en=en_wire)
+        rst_del = self.delay_signal(
+            rst, self.preceeding_pipeline_stages + 1, en=en_wire, init=1 if self.enable else None
+        )
+        en_neg_del = self.delay_signal(en_neg, self.preceeding_pipeline_stages, en=en_wire)
 
         # Connect inputs to final adder
-        loop = self.delay_signal(final_adder.output_wires, cycles=1,
-                                 rst=rst_del, en=en_wire, init=0)
-        in_ = self.delay_signal(self.input_wires, cycles=1, rst=en_neg_del,
-                                en=en_wire, init=0)
+        loop = self.delay_signal(
+            final_adder.output_wires, cycles=1, rst=rst_del, en=en_wire, init=0
+        )
+        in_ = self.delay_signal(self.input_wires, cycles=1, rst=en_neg_del, en=en_wire, init=0)
         for col_loop, col_fa in zip(loop, final_adder.input_wires):
             col_loop[0].connect_to(col_fa[0])
 
@@ -75,7 +81,7 @@ def build_hardware(self):
                 s.connect_to(t)
         self.instances.append(final_adder)
 
-    def delay_signal(self, signal, /, cycles=1, rst = None, en = None, init = None):
+    def delay_signal(self, signal, /, cycles=1, rst=None, en=None, init=None):
         if isinstance(signal, Iterable):
             return [self.delay_signal(el, cycles, rst, en, init) for el in signal]
         for i in range(cycles):
@@ -84,13 +90,12 @@ def delay_signal(self, signal, /, cycles=1, rst = None, en = None, init = None):
             self.instances.append(lgc)
             signal = lgc
         return signal
-       
 
-    def get_accumulator_width(self, input = None):
+    def get_accumulator_width(self, input=None):
         if input:
             return input
         else:
-            return sum([(el << idx) for idx, el in 
-                        enumerate(self.input_shape)]).bit_length()
-    
-    def accept(self, visitor): visitor.visit_accumulator_stage(self)
\ No newline at end of file
+            return sum([(el << idx) for idx, el in enumerate(self.input_shape)]).bit_length()
+
+    def accept(self, visitor):
+        visitor.visit_accumulator_stage(self)
diff --git a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
index 53a1163dc3..9ba56eae93 100644
--- a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
+++ b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
@@ -8,13 +8,20 @@
 #############################################################################
 
 from abc import ABC, abstractmethod
+from typing import List
+
 from ...utils.shape import Shape
 from ..nodes import GateAbsorptionCounter
-from typing import List
-from ..primitives import LUT6CY, LUT2, LUT6
+from ..primitives import LUT2, LUT6, LUT6CY
+
+
+def fa_sum(a, b, c):
+    return a ^ b ^ c
+
+
+def fa_carry(a, b, c):
+    return a and b or a and c or b and c
 
-def fa_sum(a, b, c): return a ^ b ^ c
-def fa_carry(a, b, c): return a and b or a and c or b and c
 
 def gate_string_to_pred(string):
     class Gate:
@@ -22,44 +29,46 @@ def __init__(self, init):
             try:
                 self._init = int(init, 16)
             except ValueError:
-                raise  ValueError(f"Gate specification {string} is invalid!")
+                raise ValueError(f"Gate specification {string} is invalid!")
 
         def __call__(self, a, b):
-            return  bool((self._init >> (1*a | 2*b)) & 1)
+            return bool((self._init >> (1 * a | 2 * b)) & 1)
 
         def __repr__(self):
-            return  f"{self._init:x}"
-    return  Gate(string)
+            return f"{self._init:x}"
+
+    return Gate(string)
+
 
 class GateAbsorptionCounterCandidate(ABC):
     @abstractmethod
-    def extend_to_fit(self, inputs: Shape, 
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         pass
 
+
 class AbsorbingFACandidate(GateAbsorptionCounterCandidate):
-    def extend_to_fit(self, inputs: Shape,
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         if inputs[0] >= 3:
             return AbsorbingFA(gates[0][:3])
 
+
 class AbsorbingFA(GateAbsorptionCounter):
     def __init__(self, gates):
         self.gates = [gate_string_to_pred(gate) for gate in gates]
-        super().__init__(Shape([3]), Shape([1,1]))
+        super().__init__(Shape([3]), Shape([1, 1]))
 
     def build_hardware(self):
         lut1 = LUT6.fromPred(
-            lambda I0,I1,I2,I3,I4,I5: fa_sum(
-                self.gates[0](I0,I1), 
-                self.gates[1](I2,I3),
-                self.gates[2](I4,I5)))
-        
+            lambda I0, I1, I2, I3, I4, I5: fa_sum(
+                self.gates[0](I0, I1), self.gates[1](I2, I3), self.gates[2](I4, I5)
+            )
+        )
+
         lut2 = LUT6.fromPred(
-            lambda I0,I1,I2,I3,I4,I5: fa_carry(
-                self.gates[0](I0,I1), 
-                self.gates[1](I2,I3),
-                self.gates[2](I4,I5)))
+            lambda I0, I1, I2, I3, I4, I5: fa_carry(
+                self.gates[0](I0, I1), self.gates[1](I2, I3), self.gates[2](I4, I5)
+            )
+        )
 
         for lut in zip([lut1, lut2]):
             self.input_wires[0][0].connect_to(lut.I0)
@@ -72,9 +81,9 @@ def build_hardware(self):
         self.output_wires[1][0].connect_to(lut2.O)
         self.instances += [lut1, lut2]
 
+
 class MuxCYPredAdderCandidate(GateAbsorptionCounterCandidate):
-    def extend_to_fit(self, inputs: Shape,
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         width = 0
         for i in range(4):
             if inputs[i] > 2:
@@ -88,11 +97,11 @@ def extend_to_fit(self, inputs: Shape,
         if selected_gates:
             return MuxCYPredAdder(selected_gates)
 
+
 class MuxCYPredAdder(GateAbsorptionCounter):
     def __init__(self, gates: List[List[str]]):
         self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
-        super().__init__(Shape(len(self.gates) * [2]),
-                         Shape((len(self.gates)+1) * [1]))
+        super().__init__(Shape(len(self.gates) * [2]), Shape((len(self.gates) + 1) * [1]))
 
     def build_hardware(self):
         """7-Series horizontal multi-column gate absorption using LUT6_2.
@@ -100,18 +109,22 @@ def build_hardware(self):
         Similar to VersalPredAdder but uses LUT6_2 with swapped predicate order.
         Each column has 2 gates, each LUT computes: sum = p1 XOR p2 XOR carry_in
         """
-        from ..primitives import LUT6_2
         from ..nodes import Constant
+        from ..primitives import LUT6_2
 
         luts = []
         for i in range(len(self.gates)):
             p1 = self.gates[i][0]
             p2 = self.gates[i][1]
-            # LUT6_2: predO5→O5, predO6→O6 
+            # LUT6_2: predO5→O5, predO6→O6
             # Match VersalPredAdder pattern: sum first, carry second
             lut = LUT6_2.fromPred(
-                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_sum(p1(A0,A1), p2(A2,A3), A4),    # predO5 → O5 (sum)
-                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_carry(p1(A0,A1), p2(A2,A3), A4), # predO6 → O6 (carry)
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_sum(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # predO5 → O5 (sum)
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_carry(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # predO6 → O6 (carry)
             )
 
             # Connect inputs (same pattern as Versal)
@@ -136,9 +149,9 @@ def build_hardware(self):
 
         self.instances += luts
 
+
 class VersalPredAdderCandidate(GateAbsorptionCounterCandidate):
-    def extend_to_fit(self, inputs: Shape, 
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         width = 0
         for i in range(4):
             if inputs[i] > 2:
@@ -149,14 +162,14 @@ def extend_to_fit(self, inputs: Shape,
         for i in range(width):
             gates_col = [gates[i][0], gates[i][1]]
             selected_gates.append(gates_col)
-        if selected_gates:        
+        if selected_gates:
             return VersalPredAdder(selected_gates)
 
+
 class VersalPredAdder(GateAbsorptionCounter):
     def __init__(self, gates: List[List[str]]):
         self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
-        super().__init__(Shape(len(self.gates) * [2]), 
-                         Shape((len(self.gates)+1) * [1]))
+        super().__init__(Shape(len(self.gates) * [2]), Shape((len(self.gates) + 1) * [1]))
 
     def build_hardware(self):
         luts = []
@@ -164,9 +177,8 @@ def build_hardware(self):
             p1 = self.gates[i][0]
             p2 = self.gates[i][1]
             lut = LUT6CY.fromPred(
-                lambda A0,A1,A2,A3,A4,A5: fa_sum(p1(A0,A1),p2(A2,A3),A4), # s
-                lambda A0,A1,A2,A3,A4,A5: fa_carry(p1(A0,A1), 
-                                                   p2(A2,A3), A4), # c
+                lambda A0, A1, A2, A3, A4, A5: fa_sum(p1(A0, A1), p2(A2, A3), A4),  # s
+                lambda A0, A1, A2, A3, A4, A5: fa_carry(p1(A0, A1), p2(A2, A3), A4),  # c
             )
             self.input_wires[i][0].connect_to(lut.I0)
             self.input_wires[i][1].connect_to(lut.I2)
@@ -181,90 +193,92 @@ def build_hardware(self):
         luts[-1].O52.connect_to(self.output_wires[len(luts)][0])
         self.instances += luts
 
+
 class RippleSumPredAdderCandidate(GateAbsorptionCounterCandidate):
-    def extend_to_fit(self, inputs: Shape,
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         max_height = min(inputs[0] // 2, 4)
         if max_height:
-            return RippleSumPredAdder(gates[0][:max_height*2])
+            return RippleSumPredAdder(gates[0][: max_height * 2])
+
 
 class RippleSumPredAdder(GateAbsorptionCounter):
     def __init__(self, gates):
         self.gates = [gate_string_to_pred(gate) for gate in gates]
-        super().__init__(Shape([len(gates)]), Shape([1, (len(gates)+1)//2]))
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates) + 1) // 2]))
 
     def build_hardware(self):
         luts = []
         for i in range((len(self.gates) + 1) // 2):
-            p1 = self.gates[2*i]
-            p2 = (self.gates[2*i+1] if len(self.gates) > 2*i+1
-                  else lambda A0,A1: False)
+            p1 = self.gates[2 * i]
+            p2 = self.gates[2 * i + 1] if len(self.gates) > 2 * i + 1 else lambda A0, A1: False
             lut = LUT6CY.fromPred(
-                lambda A0,A1,A2,A3,A4,A5: 
-                    fa_carry(p1(A0,A1), p2(A2,A3), A4), # c
-                lambda A0,A1,A2,A3,A4,A5: 
-                    fa_sum(p1(A0,A1),p2(A2,A3),A4) # s
+                lambda A0, A1, A2, A3, A4, A5: fa_carry(p1(A0, A1), p2(A2, A3), A4),  # c
+                lambda A0, A1, A2, A3, A4, A5: fa_sum(p1(A0, A1), p2(A2, A3), A4),  # s
             )
             luts.append(lut)
-        
+
         for p, n in zip(luts, luts[1:]):
             p.O52.connect_to(n.I4)
 
-        for i, (w1, w2) in enumerate(zip(self.input_wires[0], 
-                                         self.input_wires_complementary[0])):
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0], self.input_wires_complementary[0])):
             if i % 2 == 0:
-                w1.connect_to(luts[i//2].I0)
-                w2.connect_to(luts[i//2].I1)
+                w1.connect_to(luts[i // 2].I0)
+                w2.connect_to(luts[i // 2].I1)
             else:
-                w1.connect_to(luts[i//2].I2)
-                w2.connect_to(luts[i//2].I3)
-        
+                w1.connect_to(luts[i // 2].I2)
+                w2.connect_to(luts[i // 2].I3)
+
         luts[-1].O52.connect_to(self.output_wires[0][0])
         for i, lut in enumerate(luts):
             lut.O51.connect_to(self.output_wires[1][i])
         self.instances += luts
 
+
 class MuxCYRippleSumCandidate(GateAbsorptionCounterCandidate):
     """7-Series version of RippleSumPredAdder using CARRY4 instead of LUT6CY."""
-    def extend_to_fit(self, inputs: Shape,
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         max_height = min(inputs[0] // 2, 4)
         if max_height:
-            return MuxCYRippleSum(gates[0][:max_height*2])
+            return MuxCYRippleSum(gates[0][: max_height * 2])
+
 
 class MuxCYRippleSum(GateAbsorptionCounter):
     """7-Series ripple-carry gate absorption using LUT6_2 + CARRY4."""
+
     def __init__(self, gates):
         self.gates = [gate_string_to_pred(gate) for gate in gates]
-        super().__init__(Shape([len(gates)]), Shape([1, (len(gates)+1)//2]))
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates) + 1) // 2]))
 
     def build_hardware(self):
-        from ..primitives import LUT6_2
         from ..nodes import Constant
+        from ..primitives import LUT6_2
 
         luts = []
         for i in range((len(self.gates) + 1) // 2):
-            p1 = self.gates[2*i]
-            p2 = (self.gates[2*i+1] if len(self.gates) > 2*i+1
-                  else lambda A0,A1: False)
+            p1 = self.gates[2 * i]
+            p2 = self.gates[2 * i + 1] if len(self.gates) > 2 * i + 1 else lambda A0, A1: False
             # Match Versal RippleSumPredAdder pattern with full-adder logic
             # Gates use I0/I1 (p1) and I2/I3 (p2), carry-in on I4
             # Try swapping: O5 = sum, O6 = carry (opposite of naming)
             lut = LUT6_2.fromPred(
-                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_sum(p1(A0,A1), p2(A2,A3), A4),    # O5 = sum 
-                lambda A0,A1,A2,A3,A4,A5,p1=p1,p2=p2: fa_carry(p1(A0,A1), p2(A2,A3), A4),  # O6 = carry 
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_sum(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # O5 = sum
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_carry(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # O6 = carry
             )
             luts.append(lut)
 
         # Connect gate inputs to LUT inputs (same as Versal)
-        for i, (w1, w2) in enumerate(zip(self.input_wires[0],
-                                         self.input_wires_complementary[0])):
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0], self.input_wires_complementary[0])):
             if i % 2 == 0:
-                w1.connect_to(luts[i//2].I0)
-                w2.connect_to(luts[i//2].I1)
+                w1.connect_to(luts[i // 2].I0)
+                w2.connect_to(luts[i // 2].I1)
             else:
-                w1.connect_to(luts[i//2].I2)
-                w2.connect_to(luts[i//2].I3)
+                w1.connect_to(luts[i // 2].I2)
+                w2.connect_to(luts[i // 2].I3)
 
         # First LUT needs carry-in = 0
         Constant("1'b0").connect_to(luts[0].I4)
@@ -276,16 +290,17 @@ def build_hardware(self):
         # Connect outputs (same as Versal): final carry + sum bits
         luts[-1].O5.connect_to(self.output_wires[0][0])  # Final carry-out
         for i, lut in enumerate(luts):
-            lut.O6.connect_to(self.output_wires[1][i])   # Sum bits
+            lut.O6.connect_to(self.output_wires[1][i])  # Sum bits
 
         self.instances += luts
 
+
 class SinglePredCandidate(GateAbsorptionCounterCandidate):
-    def extend_to_fit(self, inputs: Shape,
-                      gates: List[List[str]]) -> GateAbsorptionCounter:
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
         if inputs[0] > 0:
             return SinglePred(gates[0][0])
 
+
 class SinglePred(GateAbsorptionCounter):
     def __init__(self, gate):
         self.gate = gate_string_to_pred(gate)
@@ -296,4 +311,4 @@ def build_hardware(self):
         self.input_wires[0][0].connect_to(lut.I0)
         self.input_wires_complementary[0][0].connect_to(lut.I1)
         lut.O.connect_to(self.output_wires[0][0])
-        self.instances.append(lut)
\ No newline at end of file
+        self.instances.append(lut)
diff --git a/src/finn/compressor/src/graph/counters/counter_candidates.py b/src/finn/compressor/src/graph/counters/counter_candidates.py
index 74398e2a1c..1462a837f9 100644
--- a/src/finn/compressor/src/graph/counters/counter_candidates.py
+++ b/src/finn/compressor/src/graph/counters/counter_candidates.py
@@ -7,90 +7,102 @@
 # @author    Co-authored by Simon Gerber <simon.gerber@amd.com>
 #############################################################################
 
-from itertools import count
-from ..nodes import Counter, Constant, GateAbsorptionCounter
 from abc import ABC, abstractmethod
-from ..primitives import LUT6, LUT6_2, LUT6CY, CARRY4, LUT5
+from itertools import count
+
 from ...utils.shape import Shape
+from ..nodes import Constant, Counter, GateAbsorptionCounter
+from ..primitives import CARRY4, LUT5, LUT6, LUT6_2, LUT6CY
 
 MAX_CASCADE_LENGTH = 4
 
-def FA_sum(a, b, c): return a ^ b ^ c
-def FA_carry(a, b, c): return a and b or a and c or b and c
+
+def FA_sum(a, b, c):
+    return a ^ b ^ c
+
+
+def FA_carry(a, b, c):
+    return a and b or a and c or b and c
+
 
 class CounterCandidate(ABC):
     @abstractmethod
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                      compression_goal) -> Counter:
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
         pass
 
+
 class VersalAtom(CounterCandidate):
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                      compression_goal) -> Counter:
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
         pass
 
+
 class FixedShapeCounterCandidate(CounterCandidate):
-    def __init__(self, counter, counter_inputs: Shape, 
-                 counter_outputs: Shape) -> Counter:
+    def __init__(self, counter, counter_inputs: Shape, counter_outputs: Shape) -> Counter:
         self.counter = counter
         self.counter_inputs = counter_inputs
         self.counter_outputs = counter_outputs
 
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                      compression_goal) -> Counter:
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
         for i in range(len(self.counter_inputs)):
-            if not (self.counter_inputs[i] <= inputs[i] and
-                    inputs[i] + outputs[i] - self.counter_inputs[i] + 
-                    self.counter_outputs[i] - compression_goal(i) >= -1):
+            if not (
+                self.counter_inputs[i] <= inputs[i]
+                and inputs[i]
+                + outputs[i]
+                - self.counter_inputs[i]
+                + self.counter_outputs[i]
+                - compression_goal(i)
+                >= -1
+            ):
                 return None
         return self.counter()
 
+
 class FA(Counter):
-    def __init__(self): 
+    def __init__(self):
         super(FA, self).__init__(
-            Shape([3]), 
-            Shape([1, 1]), 
+            Shape([3]),
+            Shape([1, 1]),
         )
 
     def build_hardware(self):
         lut = LUT6_2.fromPred(
-                              lambda x, y, z, w, q, r:
-                                x and y or x and z or y and z,
-                              lambda x, y, z, w, q, r: x ^ y ^ z, 
-                              "FA")
+            lambda x, y, z, w, q, r: x and y or x and z or y and z,
+            lambda x, y, z, w, q, r: x ^ y ^ z,
+            "FA",
+        )
         for i in range(3):
             self.input_wires[0][i].connect_to(lut.in_ports[i])
         for i in range(2):
             lut.out_ports[i].connect_to(self.output_wires[i][0])
         self.instances += (lut,)
 
+
 class FACandidate(FixedShapeCounterCandidate):
     def __init__(self):
         super().__init__(FA, FA().input_shape, FA().output_shape)
 
+
 hlutnm_counter = count()
+
+
 class TenSix(Counter):
-    def __init__(self): 
+    def __init__(self):
         super(TenSix, self).__init__(Shape([10]), Shape([2, 4]))
 
     def build_hardware(self):
         lut1 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
-            "FiveTwo_1"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1",
         )
         lut2 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)), 
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
-            "FiveTwo_2"
-        )
-        hlutnm_attr = f"HLUTNM = \"tensix_{next(hlutnm_counter)}\""
-        lut3_A = LUT5.fromPred(
-            lambda A0,A1,A2,A3,A4: FA_carry(A0,A1,A4)
-        )
-        lut3_B = LUT5.fromPred(
-            lambda A0,A1,A2,A3,A4: FA_carry(A2,A3,A4)
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_2",
         )
+        hlutnm_attr = f'HLUTNM = "tensix_{next(hlutnm_counter)}"'
+        lut3_A = LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A0, A1, A4))
+        lut3_B = LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A2, A3, A4))
         lut3_A.annotate(hlutnm_attr)
         lut3_B.annotate(hlutnm_attr)
         # TODO: Take care of annotations
@@ -107,7 +119,7 @@ def build_hardware(self):
         self.input_wires[0][7].connect_to(lut2.I2)
         self.input_wires[0][8].connect_to(lut2.I3)
         self.input_wires[0][9].connect_to(lut2.I4)
-        
+
         self.input_wires[0][0].connect_to(lut3_A.I0)
         self.input_wires[0][1].connect_to(lut3_A.I1)
         self.input_wires[0][2].connect_to(lut3_A.I4)
@@ -130,24 +142,26 @@ def build_hardware(self):
 
         self.instances += (lut1, lut2, lut3_A, lut3_B)
 
+
 class TenSixCandidate(FixedShapeCounterCandidate):
     def __init__(self):
         super().__init__(TenSix, TenSix().input_shape, TenSix().output_shape)
 
+
 class FiveTwo(Counter):
-    def __init__(self): super(FiveTwo, self).__init__(Shape([5, 2]),
-                                                      Shape([1, 2, 1]))
+    def __init__(self):
+        super(FiveTwo, self).__init__(Shape([5, 2]), Shape([1, 2, 1]))
 
     def build_hardware(self):
         lut1 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_sum(A0, A1, A2)),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
-            "FiveTwo_1"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1",
         )
         lut2 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, FA_carry(A0, A1, A2)), 
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
-            "FiveTwo_2"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_carry(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
+            "FiveTwo_2",
         )
         self.input_wires[0][0].connect_to(lut1.I0)
         self.input_wires[0][1].connect_to(lut1.I1)
@@ -166,19 +180,22 @@ def build_hardware(self):
         lut2.O6.connect_to(self.output_wires[2][0])
         self.instances += (lut1, lut2)
 
+
 class FiveTwoCandidate(FixedShapeCounterCandidate):
     def __init__(self):
-        super(FiveTwoCandidate, self).__init__(FiveTwo, FiveTwo().input_shape,
-                                               FiveTwo().output_shape)
+        super(FiveTwoCandidate, self).__init__(
+            FiveTwo, FiveTwo().input_shape, FiveTwo().output_shape
+        )
+
 
 class DualRailRippleSum(Counter):
     def __init__(self, w):
         self._width = w
-        super(DualRailRippleSum, self).__init__(Shape([4*w+1, w+1]), 
-                                                Shape([1, w+1, w]))
+        super(DualRailRippleSum, self).__init__(Shape([4 * w + 1, w + 1]), Shape([1, w + 1, w]))
 
     @property
-    def width(self): return self._width
+    def width(self):
+        return self._width
 
     def build_hardware(self):
         luts_top = []
@@ -186,35 +203,31 @@ def build_hardware(self):
 
         cascade_top = self.input_wires[0][0]
         cascade_btm = self.input_wires[1][0]
-        
+
         for i in range(0, self._width):
             lut_top = LUT6CY.fromPred(
-                lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, 
-                                                  FA_sum(A0, A1, A2)),
-                lambda A0,A1,A2,A3,A4,_: FA_sum  (A3, A4, 
-                                                  FA_sum(A0, A1, A2)),
-                "dual_rail_top"
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+                "dual_rail_top",
             )
             lut_btm = LUT6CY.fromPred(
-                lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, 
-                                                  FA_carry(A0, A1, A2)),
-                lambda A0,A1,A2,A3,A4,_: FA_sum  (A3, A4, 
-                                                  FA_carry(A0, A1, A2)),
-                "dual_rail_btm"
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_carry(A0, A1, A2)),
+                "dual_rail_btm",
             )
 
-            self.input_wires[0][1+4*i].connect_to(lut_top.I0)
-            self.input_wires[0][2+4*i].connect_to(lut_top.I1)
-            self.input_wires[0][3+4*i].connect_to(lut_top.I2)
-            self.input_wires[0][4+4*i].connect_to(lut_top.I3)
+            self.input_wires[0][1 + 4 * i].connect_to(lut_top.I0)
+            self.input_wires[0][2 + 4 * i].connect_to(lut_top.I1)
+            self.input_wires[0][3 + 4 * i].connect_to(lut_top.I2)
+            self.input_wires[0][4 + 4 * i].connect_to(lut_top.I3)
             cascade_top.connect_to(lut_top.I4)
-            lut_top.O51.connect_to(self.output_wires[1][i+1])
+            lut_top.O51.connect_to(self.output_wires[1][i + 1])
             cascade_top = lut_top.O52
 
-            self.input_wires[0][1+4*i].connect_to(lut_btm.I0)
-            self.input_wires[0][2+4*i].connect_to(lut_btm.I1)
-            self.input_wires[0][3+4*i].connect_to(lut_btm.I2)
-            self.input_wires[1][1+i].connect_to(lut_btm.I3)
+            self.input_wires[0][1 + 4 * i].connect_to(lut_btm.I0)
+            self.input_wires[0][2 + 4 * i].connect_to(lut_btm.I1)
+            self.input_wires[0][3 + 4 * i].connect_to(lut_btm.I2)
+            self.input_wires[1][1 + i].connect_to(lut_btm.I3)
             cascade_btm.connect_to(lut_btm.I4)
             lut_btm.O51.connect_to(self.output_wires[2][i])
             cascade_btm = lut_btm.O52
@@ -225,31 +238,36 @@ def build_hardware(self):
             if i == self._width - 1:
                 lut_top.O52.connect_to(self.output_wires[0][0])
                 lut_btm.O52.connect_to(self.output_wires[1][0])
-            
+
         self.instances += luts_top + luts_btm
 
+
 class DualRailRippleSumCandidate(CounterCandidate):
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                      compression_goal) -> Counter:
-        max_height_0 = min(MAX_CASCADE_LENGTH, 
-                         (inputs[0]-1)//4, 
-                         (inputs[0]+outputs[0]-compression_goal(0)+1)//4
-                         ) if inputs[0] >= 5 else 0
-        
-        max_height_1 = min(MAX_CASCADE_LENGTH, 
-                         inputs[1]-1
-                         ) if inputs[1] >= 2 else 0
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        max_height_0 = (
+            min(
+                MAX_CASCADE_LENGTH,
+                (inputs[0] - 1) // 4,
+                (inputs[0] + outputs[0] - compression_goal(0) + 1) // 4,
+            )
+            if inputs[0] >= 5
+            else 0
+        )
+
+        max_height_1 = min(MAX_CASCADE_LENGTH, inputs[1] - 1) if inputs[1] >= 2 else 0
         max_height = min(max_height_0, max_height_1, MAX_CASCADE_LENGTH)
-        if max_height > 0: 
+        if max_height > 0:
             return DualRailRippleSum(max_height)
 
+
 class RippleSum(Counter):
     def __init__(self, w):
         self._width = w
-        super(RippleSum, self).__init__(Shape([2*w+1]), Shape([1, w]))
+        super(RippleSum, self).__init__(Shape([2 * w + 1]), Shape([1, w]))
 
     @property
-    def width(self): return self._width
+    def width(self):
+        return self._width
 
     def build_hardware(self):
         luts = []
@@ -258,13 +276,13 @@ def build_hardware(self):
 
         for i in range(0, self._width):
             lut = LUT6CY.fromPred(
-                lambda A0,A1,A2,A3,A4,_: FA_carry(A4, A1, A0),
-                lambda A0,A1,A2,A3,A4,_: FA_sum  (A4, A1, A0),
-                "ripple_sum"
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A4, A1, A0),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A4, A1, A0),
+                "ripple_sum",
             )
 
-            self.input_wires[0][1+2*i].connect_to(lut.I0)
-            self.input_wires[0][2+2*i].connect_to(lut.I1)
+            self.input_wires[0][1 + 2 * i].connect_to(lut.I0)
+            self.input_wires[0][2 + 2 * i].connect_to(lut.I1)
             carry.connect_to(lut.I4)
             lut.O51.connect_to(self.output_wires[1][i])
             carry = lut.O52
@@ -273,67 +291,76 @@ def build_hardware(self):
 
             if i == self._width - 1:
                 lut.O52.connect_to(self.output_wires[0][0])
-            
+
         self.instances += luts
 
+
 class RippleSumCandidate(CounterCandidate):
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                      compression_goal) -> Counter:
-        max_height = min(MAX_CASCADE_LENGTH, 
-                         (inputs[0]-1)//2, 
-                         (inputs[0]+outputs[0]+1)//2-compression_goal(0)+1
-                         ) if inputs[0] >= 3 else 0
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        max_height = (
+            min(
+                MAX_CASCADE_LENGTH,
+                (inputs[0] - 1) // 2,
+                (inputs[0] + outputs[0] + 1) // 2 - compression_goal(0) + 1,
+            )
+            if inputs[0] >= 3
+            else 0
+        )
         if max_height > 0:
             return RippleSum(max_height)
 
+
 class SixThree(Counter):
     def __init__(self):
         super(SixThree, self).__init__(Shape([6]), Shape([1, 1, 1]))
 
     def build_hardware(self):
-        lut1 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
-                                bool(sum([A0,A1,A2,A3,A4,A5]) & 1),
-                                "sixthree_first")
-        lut2 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
-                                bool(sum([A0,A1,A2,A3,A4,A5]) & 2),
-                                "sixthree_second")
-        lut3 = LUT6.fromPred(lambda A0,A1,A2,A3,A4,A5:
-                                bool(sum([A0,A1,A2,A3,A4,A5]) & 4),
-                                "sixthree_third")
+        lut1 = LUT6.fromPred(
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 1), "sixthree_first"
+        )
+        lut2 = LUT6.fromPred(
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 2),
+            "sixthree_second",
+        )
+        lut3 = LUT6.fromPred(
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 4), "sixthree_third"
+        )
         luts = (lut1, lut2, lut3)
-   
+
         for lut in luts:
             for i in range(6):
                 self.input_wires[0][i].connect_to(lut.in_ports[i])
-        
+
         for i, lut in enumerate(luts):
             lut.out_ports[0].connect_to(self.output_wires[i][0])
         self.instances += luts
 
+
 class SixThreeCandidate(FixedShapeCounterCandidate):
     def __init__(self):
-        super().__init__(SixThree, SixThree().input_shape, 
-                         SixThree().output_shape)
+        super().__init__(SixThree, SixThree().input_shape, SixThree().output_shape)
+
 
 class VersalAtom14:
     def __init__(self):
-        self.shape = Shape([4,1])
+        self.shape = Shape([4, 1])
         self.width = 2
         self.output_width = 2
 
     def build_luts(self):
         lut_1 = LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  FA_sum(A0,A1,A2),A3,A4),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(FA_sum(A0,A1,A2),A3,A4),
-            "atom14_first"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, A2), A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, A2), A3, A4),
+            "atom14_first",
         )
         lut_2 = LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(  FA_carry(A0,A1,A2),A3,A4),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(FA_carry(A0,A1,A2),A3,A4),
-            "atom14_second"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(FA_carry(A0, A1, A2), A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(FA_carry(A0, A1, A2), A3, A4),
+            "atom14_second",
         )
         return (lut_1, lut_2)
 
+
 class VersalAtom2:
     def __init__(self):
         self.shape = Shape([2])
@@ -342,29 +369,31 @@ def __init__(self):
 
     def build_luts(self):
         lut = LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,A4),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,A4),
-            "atom2_second"
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A4),
+            "atom2_second",
         )
         return (lut,)
-    
+
+
 class VersalAtom222:
     def __init__(self):
-        self.shape = Shape([2,2,2])
+        self.shape = Shape([2, 2, 2])
         self.width = 2
         self.output_width = 3
 
     def build_luts(self):
         lut_1 = LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(A2,A3,A4),
-            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,FA_carry(A2,A3,A4)),
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A2, A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, FA_carry(A2, A3, A4)),
         )
         lut_2 = LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,FA_carry(A2,A3,A2^A3^A4)),
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,FA_carry(A2,A3,A2^A3^A4)),
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, FA_carry(A2, A3, A2 ^ A3 ^ A4)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, FA_carry(A2, A3, A2 ^ A3 ^ A4)),
         )
         return (lut_1, lut_2)
-        
+
+
 class VersalAtomCascade(Counter):
     def __init__(self, atoms):
         self._atoms = atoms
@@ -373,9 +402,7 @@ def __init__(self, atoms):
         in_shape[0] += 1
         in_shape = Shape(in_shape)
 
-        out_shape = Shape([1 for _ 
-                           in range(sum([atom.output_width for 
-                                         atom in atoms]) + 1)])
+        out_shape = Shape([1 for _ in range(sum([atom.output_width for atom in atoms]) + 1)])
         super().__init__(in_shape, out_shape)
 
     def build_hardware(self):
@@ -407,22 +434,22 @@ def build_hardware(self):
             elif isinstance(atom, VersalAtom222):
                 self.input_wires[io_idx][0].connect_to(luts[lut_idx].I2)
                 self.input_wires[io_idx][1].connect_to(luts[lut_idx].I3)
-                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx].I0)
-                self.input_wires[io_idx+1][1].connect_to(luts[lut_idx].I1)
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx + 1][1].connect_to(luts[lut_idx].I1)
                 carry.connect_to(luts[lut_idx].I4)
                 carry = luts[lut_idx].O52
 
                 # second lut
-                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I2)
-                self.input_wires[io_idx+1][1].connect_to(luts[lut_idx+1].I3)
-                self.input_wires[io_idx+2][0].connect_to(luts[lut_idx+1].I0)
-                self.input_wires[io_idx+2][1].connect_to(luts[lut_idx+1].I1)
-                carry.connect_to(luts[lut_idx+1].I4)
-                carry = luts[lut_idx+1].O52
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx + 1].I2)
+                self.input_wires[io_idx + 1][1].connect_to(luts[lut_idx + 1].I3)
+                self.input_wires[io_idx + 2][0].connect_to(luts[lut_idx + 1].I0)
+                self.input_wires[io_idx + 2][1].connect_to(luts[lut_idx + 1].I1)
+                carry.connect_to(luts[lut_idx + 1].I4)
+                carry = luts[lut_idx + 1].O52
 
                 luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
-                luts[lut_idx].O52.connect_to(self.output_wires[io_idx+1][0])
-                luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+2][0])
+                luts[lut_idx].O52.connect_to(self.output_wires[io_idx + 1][0])
+                luts[lut_idx + 1].O51.connect_to(self.output_wires[io_idx + 2][0])
                 lut_idx += 2
                 io_idx += 3
             elif isinstance(atom, VersalAtom14):
@@ -435,16 +462,16 @@ def build_hardware(self):
                 carry = luts[lut_idx].O52
 
                 # second lut
-                self.input_wires[io_idx][0].connect_to(luts[lut_idx+1].I0)
-                self.input_wires[io_idx][1].connect_to(luts[lut_idx+1].I1)
-                self.input_wires[io_idx][2].connect_to(luts[lut_idx+1].I2)
-                self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I3)
-                carry.connect_to(luts[lut_idx+1].I4)
-                carry = luts[lut_idx+1].O52
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx + 1].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx + 1].I1)
+                self.input_wires[io_idx][2].connect_to(luts[lut_idx + 1].I2)
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx + 1].I3)
+                carry.connect_to(luts[lut_idx + 1].I4)
+                carry = luts[lut_idx + 1].O52
 
                 luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
-                luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+1][0])
-                
+                luts[lut_idx + 1].O51.connect_to(self.output_wires[io_idx + 1][0])
+
                 lut_idx += 2
                 io_idx += 2
             else:
@@ -452,24 +479,25 @@ def build_hardware(self):
         luts[-1].O52.connect_to(self.output_wires[-1][0])
         self.instances += luts
 
+
 class VersalAtomCascadeCandidate(CounterCandidate):
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                    compression_goal) -> Counter:
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
         def fits_col(idx, height):
-            return (height <= inputs[idx] and 
-                    inputs[idx] + outputs[idx] - height
-                    + 1 - compression_goal(idx) >= -1)
+            return (
+                height <= inputs[idx]
+                and inputs[idx] + outputs[idx] - height + 1 - compression_goal(idx) >= -1
+            )
+
         atoms = []
         io_idx = 0
         atom_idx = 0
-        while (atom_idx < 4):
+        while atom_idx < 4:
             if atom_idx == 0:
-                if fits_col(io_idx, 5) and fits_col(io_idx+1, 1):
+                if fits_col(io_idx, 5) and fits_col(io_idx + 1, 1):
                     atoms.append(VersalAtom14())
                     atom_idx += 2
                     io_idx += 2
-                if (fits_col(io_idx, 3) and fits_col(io_idx+1, 2) and 
-                    fits_col(io_idx+2, 2)):
+                if fits_col(io_idx, 3) and fits_col(io_idx + 1, 2) and fits_col(io_idx + 2, 2):
                     atoms.append(VersalAtom222())
                     atom_idx += 2
                     io_idx += 3
@@ -480,12 +508,11 @@ def fits_col(idx, height):
                 else:
                     break
             elif atom_idx < 3:
-                if fits_col(io_idx, 4) and fits_col(io_idx+1, 1):
+                if fits_col(io_idx, 4) and fits_col(io_idx + 1, 1):
                     atoms.append(VersalAtom14())
                     atom_idx += 2
                     io_idx += 2
-                elif (fits_col(io_idx, 2) and fits_col(io_idx+1, 2) and
-                      fits_col(io_idx+2, 2)):
+                elif fits_col(io_idx, 2) and fits_col(io_idx + 1, 2) and fits_col(io_idx + 2, 2):
                     atoms.append(VersalAtom222())
                     atom_idx += 2
                     io_idx += 3
@@ -503,17 +530,19 @@ def fits_col(idx, height):
                 break
         if atoms:
             return VersalAtomCascade(atoms)
-    
+
+
 class ConstantOne(GateAbsorptionCounter):
     def __init__(self):
         super().__init__(Shape(tuple()), Shape((1,)))
-        
+
     def build_hardware(self):
         Constant(1).connect_to(self.output_wires[0][0])
 
+
 class MuxCYAtom06:
     def __init__(self):
-        self.shape = Shape([6,0])
+        self.shape = Shape([6, 0])
         self.width = 2
         self.output_width = 2
 
@@ -535,24 +564,26 @@ def build_luts(self):
         #
         # lo LUT: XOR of all 6 bits
         lut_1 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,A5: A0 ^ A1 ^ A2 ^ A3 ^ A4,        # O5 (5-input XOR)
-            lambda A0,A1,A2,A3,A4,A5: A0 ^ A1 ^ A2 ^ A3 ^ A4 ^ A5,   # O6 (6-input XOR)
-            "atom06_lo"
+            lambda A0, A1, A2, A3, A4, A5: A0 ^ A1 ^ A2 ^ A3 ^ A4,  # O5 (5-input XOR)
+            lambda A0, A1, A2, A3, A4, A5: A0 ^ A1 ^ A2 ^ A3 ^ A4 ^ A5,  # O6 (6-input XOR)
+            "atom06_lo",
         )
         # hi LUT: carry chain continuation
         # O5 = FA_carry(A0,A1,A2) for the generate term
         # O6 = more complex carry propagation (from VHDL 0x177E7EE8)
         lut_2 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,A5: FA_carry(A0,A1,A2),             # O5 -> DI
-            lambda A0,A1,A2,A3,A4,A5: (FA_carry(FA_sum(A0,A1,A2),A3,A4) ^
-                                       FA_carry(A0,A1,A2)),           # O6 -> S
-            "atom06_hi"
+            lambda A0, A1, A2, A3, A4, A5: FA_carry(A0, A1, A2),  # O5 -> DI
+            lambda A0, A1, A2, A3, A4, A5: (
+                FA_carry(FA_sum(A0, A1, A2), A3, A4) ^ FA_carry(A0, A1, A2)
+            ),  # O6 -> S
+            "atom06_hi",
         )
         return (lut_1, lut_2)
 
+
 class MuxCYAtom14:
     def __init__(self):
-        self.shape = Shape([4,1])
+        self.shape = Shape([4, 1])
         self.width = 2
 
     def build_luts(self):
@@ -578,19 +609,20 @@ def build_luts(self):
         #
         # lut_1 (position 0): processes x0[3:0] for s0/d0
         lut_1 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: A3,                      # O5 -> DI = x0[3]
-            lambda A0,A1,A2,A3,A4,_: FA_sum(A0,A1,A2) ^ A3,   # O6 -> S
-            "atom14_0"
+            lambda A0, A1, A2, A3, A4, _: A3,  # O5 -> DI = x0[3]
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A2) ^ A3,  # O6 -> S
+            "atom14_0",
         )
         # lut_2 (position 1): processes x0[2:0] and x1 for s1/d1
         # x1 is mapped to I3 (A3)
         lut_2 = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: A3,                        # O5 -> DI = x1
-            lambda A0,A1,A2,A3,A4,_: FA_carry(A0,A1,A2) ^ A3,   # O6 -> S
-            "atom14_1"
+            lambda A0, A1, A2, A3, A4, _: A3,  # O5 -> DI = x1
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A2) ^ A3,  # O6 -> S
+            "atom14_1",
         )
         return (lut_1, lut_2)
 
+
 class MuxCYAtom2:
     def __init__(self):
         self.shape = Shape([2])
@@ -610,22 +642,22 @@ def build_luts(self):
         # logic simplification, it doesn't match the VHDL reference.
         # Changed to O5=A1 for consistency with atom22.vhdl.
         lut = LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,_: A1,       # O5 -> DI = higher-weight bit
-            lambda A0,A1,A2,A3,A4,_: A0 ^ A1,  # O6 -> S (propagate)
-            "atom2"
+            lambda A0, A1, A2, A3, A4, _: A1,  # O5 -> DI = higher-weight bit
+            lambda A0, A1, A2, A3, A4, _: A0 ^ A1,  # O6 -> S (propagate)
+            "atom2",
         )
         return (lut,)
 
+
 class MuxCYAtomCascade(Counter):
     def __init__(self, atoms):
         self._atoms = atoms
-        
+
         in_shape = [el for atom in atoms for el in atom.shape]
         in_shape[0] += 1
         in_shape = Shape(in_shape)
-    
-        out_shape = Shape([1 for _ 
-                           in range(sum([atom.width for atom in atoms]) + 1)])
+
+        out_shape = Shape([1 for _ in range(sum([atom.width for atom in atoms]) + 1)])
         super().__init__(in_shape, out_shape)
 
     def build_hardware(self):
@@ -645,7 +677,7 @@ def build_hardware(self):
             if isinstance(atom, MuxCYAtom2):
                 self.input_wires[idx][0].connect_to(luts[idx].I0)
                 self.input_wires[idx][1].connect_to(luts[idx].I1)
-                idx += 1                
+                idx += 1
             elif isinstance(atom, MuxCYAtom14):
                 # first lut
                 self.input_wires[idx][0].connect_to(luts[idx].I0)
@@ -654,10 +686,10 @@ def build_hardware(self):
                 self.input_wires[idx][3].connect_to(luts[idx].I3)
 
                 # second lut
-                self.input_wires[idx][0].connect_to(luts[idx+1].I0)
-                self.input_wires[idx][1].connect_to(luts[idx+1].I1)
-                self.input_wires[idx][2].connect_to(luts[idx+1].I2)
-                self.input_wires[idx+1][0].connect_to(luts[idx+1].I3)
+                self.input_wires[idx][0].connect_to(luts[idx + 1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx + 1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx + 1].I2)
+                self.input_wires[idx + 1][0].connect_to(luts[idx + 1].I3)
                 idx += 2
             elif isinstance(atom, MuxCYAtom06):
                 # First LUT (atom06_lo): uses all 6 inputs for XOR
@@ -670,20 +702,19 @@ def build_hardware(self):
 
                 # Second LUT (atom06_hi): uses inputs 0-4 for carry propagation
                 # BUGFIX: was connecting to luts[idx] instead of luts[idx+1]
-                self.input_wires[idx][0].connect_to(luts[idx+1].I0)
-                self.input_wires[idx][1].connect_to(luts[idx+1].I1)
-                self.input_wires[idx][2].connect_to(luts[idx+1].I2)
-                self.input_wires[idx][3].connect_to(luts[idx+1].I3)
-                self.input_wires[idx][4].connect_to(luts[idx+1].I4)
+                self.input_wires[idx][0].connect_to(luts[idx + 1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx + 1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx + 1].I2)
+                self.input_wires[idx][3].connect_to(luts[idx + 1].I3)
+                self.input_wires[idx][4].connect_to(luts[idx + 1].I4)
                 idx += 2
             else:
                 raise Exception("Error in construction of MuxCYAtoms")
-                
+
         # Connect outputs
-        for idx, (lut, di, s, o) in enumerate(zip(luts, 
-                                              muxcy.DI.elements,
-                                              muxcy.S.elements, 
-                                              muxcy.O.elements)):
+        for idx, (lut, di, s, o) in enumerate(
+            zip(luts, muxcy.DI.elements, muxcy.S.elements, muxcy.O.elements)
+        ):
             lut.O6.connect_to(s)
             lut.O5.connect_to(di)
             o.connect_to(self.output_wires[idx][0])
@@ -692,22 +723,24 @@ def build_hardware(self):
         self.instances += luts
         self.instances.append(muxcy)
 
+
 class MuxCYAtomCascadeCandidate(CounterCandidate):
-    def extend_to_fit(self, inputs: Shape, outputs: Shape, 
-                    compression_goal) -> Counter:
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
         def fits_col(idx, height):
-            return (height <= inputs[idx] and 
-                    inputs[idx] + outputs[idx] - height
-                    + 1 - compression_goal(idx) >= -1)
+            return (
+                height <= inputs[idx]
+                and inputs[idx] + outputs[idx] - height + 1 - compression_goal(idx) >= -1
+            )
+
         atoms = []
         i = 0
-        while (i < 4):
+        while i < 4:
             if i == 0:
                 # MuxCYAtom06: 6:3 compressor for column 0 (needs 7 inputs: 6 + carry-in)
                 if fits_col(i, 7):
                     atoms.append(MuxCYAtom06())
                     i += 2
-                elif fits_col(i, 5) and fits_col(i+1, 1):
+                elif fits_col(i, 5) and fits_col(i + 1, 1):
                     atoms.append(MuxCYAtom14())
                     i += 2
                 elif fits_col(i, 3):
@@ -720,7 +753,7 @@ def fits_col(idx, height):
                 if fits_col(i, 6):
                     atoms.append(MuxCYAtom06())
                     i += 2
-                elif fits_col(i, 4) and fits_col(i+1, 1):
+                elif fits_col(i, 4) and fits_col(i + 1, 1):
                     atoms.append(MuxCYAtom14())
                     i += 2
                 elif fits_col(i, 2):
@@ -734,4 +767,4 @@ def fits_col(idx, height):
             else:
                 break
         if i == 4:
-            return MuxCYAtomCascade(atoms)
\ No newline at end of file
+            return MuxCYAtomCascade(atoms)
diff --git a/src/finn/compressor/src/graph/final_adder.py b/src/finn/compressor/src/graph/final_adder.py
index d5fb6456ad..febcceb925 100644
--- a/src/finn/compressor/src/graph/final_adder.py
+++ b/src/finn/compressor/src/graph/final_adder.py
@@ -8,16 +8,24 @@
 
 from abc import abstractstaticmethod
 from typing import List
-from .nodes import Counter
+
 from ..utils.shape import Shape
-from .primitives import LUT5, LUT6CY, LOOKAHEAD8, LUT6_2, CARRY4
+from .nodes import Counter
+from .primitives import CARRY4, LOOKAHEAD8, LUT5, LUT6_2, LUT6CY
+
+
+def FA_sum(a, b, c):
+    return a ^ b ^ c
+
+
+def FA_carry(a, b, c):
+    return a and b or a and c or b and c
 
-def FA_sum(a, b, c): return a ^ b ^ c
-def FA_carry(a, b, c): return a and b or a and c or b and c
 
 def ceildiv(a, b):
     return -(a // -b)
 
+
 def try_connect(func):
     try:
         func()
@@ -27,12 +35,14 @@ def try_connect(func):
 
 class FinalAdder(Counter):
     @abstractstaticmethod
-    def compression_goal(col): pass
+    def compression_goal(col):
+        pass
 
 
 class VersalTernaryAdder(FinalAdder):
     @staticmethod
-    def compression_goal(col): return 5 if col == 0 else 3
+    def compression_goal(col):
+        return 5 if col == 0 else 3
 
     def __init__(self, input_shape: Shape):
         self.input_shape = input_shape
@@ -40,97 +50,80 @@ def __init__(self, input_shape: Shape):
         super().__init__(input_shape, output_shape)
 
     def build_hardware(self):
-        l8s = [LOOKAHEAD8() for _ in range((len(self.input_shape)+8)//8)]
-        luts_chain = [LUT6CY.fromPred(
-            lambda A0,A1,A2,A3,A4,A5: FA_sum(FA_sum(A0,A1,A2), A3, A4),
-            lambda A0,A1,A2,A3,A4,A5: FA_carry(FA_sum(A0,A1,A2), A3, A4),
-            "ternary_adder_chain"
-        ) for _ in range(len(self.input_shape)+1)]
+        l8s = [LOOKAHEAD8() for _ in range((len(self.input_shape) + 8) // 8)]
+        luts_chain = [
+            LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, A5: FA_sum(FA_sum(A0, A1, A2), A3, A4),
+                lambda A0, A1, A2, A3, A4, A5: FA_carry(FA_sum(A0, A1, A2), A3, A4),
+                "ternary_adder_chain",
+            )
+            for _ in range(len(self.input_shape) + 1)
+        ]
         luts_top = []
         for i in range(len(self.input_shape)):
             if i % 2 == 0:
-                luts_top.append(LUT5.fromPred(
-                    lambda A0,A1,A2,A3,A4: FA_carry(A0, A1, A4)
-                ))
-                try_connect(lambda: 
-                            self.input_wires[i][0].connect_to(luts_top[-1].I0))
-                try_connect(lambda: 
-                            self.input_wires[i][1].connect_to(luts_top[-1].I1))
-                try_connect(lambda:
-                            self.input_wires[i+1][0].connect_to(
-                                luts_top[-1].I2))
-                try_connect(lambda:
-                            self.input_wires[i+1][1].connect_to(
-                                luts_top[-1].I3))
-                try_connect(lambda: 
-                            self.input_wires[i][2].connect_to(luts_top[-1].I4))
+                luts_top.append(LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A0, A1, A4)))
+                try_connect(lambda: self.input_wires[i][0].connect_to(luts_top[-1].I0))
+                try_connect(lambda: self.input_wires[i][1].connect_to(luts_top[-1].I1))
+                try_connect(lambda: self.input_wires[i + 1][0].connect_to(luts_top[-1].I2))
+                try_connect(lambda: self.input_wires[i + 1][1].connect_to(luts_top[-1].I3))
+                try_connect(lambda: self.input_wires[i][2].connect_to(luts_top[-1].I4))
             else:
-                luts_top.append(LUT5.fromPred(
-                    lambda A0,A1,A2,A3,A4: FA_carry(A2, A3, A4)
-                ))
-                try_connect(lambda: 
-                            self.input_wires[i-1][0].connect_to(
-                                luts_top[-1].I0))
-                try_connect(lambda: 
-                            self.input_wires[i-1][1].connect_to(
-                                luts_top[-1].I1))
-                try_connect(lambda: self.input_wires[i][0].connect_to(
-                    luts_top[-1].I2))
-                try_connect(lambda: self.input_wires[i][1].connect_to(
-                    luts_top[-1].I3))
-                try_connect(lambda: self.input_wires[i][2].connect_to(
-                    luts_top[-1].I4))
-
-        for idx, (left, right) in enumerate(zip(luts_top[0::2], 
-                                                luts_top[1::2])):
+                luts_top.append(LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A2, A3, A4)))
+                try_connect(lambda: self.input_wires[i - 1][0].connect_to(luts_top[-1].I0))
+                try_connect(lambda: self.input_wires[i - 1][1].connect_to(luts_top[-1].I1))
+                try_connect(lambda: self.input_wires[i][0].connect_to(luts_top[-1].I2))
+                try_connect(lambda: self.input_wires[i][1].connect_to(luts_top[-1].I3))
+                try_connect(lambda: self.input_wires[i][2].connect_to(luts_top[-1].I4))
+
+        for idx, (left, right) in enumerate(zip(luts_top[0::2], luts_top[1::2])):
             left.annotate(f"HLUTNM = final_adder_{idx}")
             right.annotate(f"HLUTNM = final_adder_{idx}")
 
-        try_connect(lambda: 
-                    self.input_wires[0][3].connect_to(luts_chain[0].I3))
-        try_connect(lambda: 
-                    self.input_wires[0][4].connect_to(luts_chain[0].I4))
+        try_connect(lambda: self.input_wires[0][3].connect_to(luts_chain[0].I3))
+        try_connect(lambda: self.input_wires[0][4].connect_to(luts_chain[0].I4))
         for i, el in enumerate(luts_chain):
             try_connect(lambda: self.input_wires[i][0].connect_to(el.I0))
             try_connect(lambda: self.input_wires[i][1].connect_to(el.I1))
             try_connect(lambda: self.input_wires[i][2].connect_to(el.I2))
-            el.PROP.connect_to(l8s[i//8].p_in_ports[i%8])
+            el.PROP.connect_to(l8s[i // 8].p_in_ports[i % 8])
             el.O51.connect_to(self.output_wires[i][0])
-            el.O52.connect_to(l8s[i//8].c_in_ports[i%8+1])
+            el.O52.connect_to(l8s[i // 8].c_in_ports[i % 8 + 1])
 
         for lb, lt in zip(luts_chain[1:], luts_top):
             lt.O.connect_to(lb.I3)
 
-        # connect carry-ins between lookahead modules 
+        # connect carry-ins between lookahead modules
         for prev, next in zip(l8s, l8s[1:]):
             prev.COUTH.connect_to(next.CIN)
 
         # cascade
         for i in range(1, len(luts_chain)):
             if i % 2 == 0:
-                l8s[(i-1)//8].out_ports[((i-1)%8)//2].connect_to(
-                    luts_chain[i].I4)
+                l8s[(i - 1) // 8].out_ports[((i - 1) % 8) // 2].connect_to(luts_chain[i].I4)
             else:
-                luts_chain[i-1].O52.connect_to(luts_chain[i].I4)
+                luts_chain[i - 1].O52.connect_to(luts_chain[i].I4)
 
         if len(luts_chain) % 2 == 0:
-            l8s[(len(luts_chain)-1)//8].out_ports[len(luts_chain)%8//2-1]\
-                .connect_to(self.output_wires[len(luts_chain)][0])
+            l8s[(len(luts_chain) - 1) // 8].out_ports[len(luts_chain) % 8 // 2 - 1].connect_to(
+                self.output_wires[len(luts_chain)][0]
+            )
         else:
-            luts_chain[-1].O52.connect_to(
-                self.output_wires[len(luts_chain)][0])
+            luts_chain[-1].O52.connect_to(self.output_wires[len(luts_chain)][0])
         self.instances += luts_chain + luts_top + l8s
 
+
 class QuaternaryAdder(FinalAdder):
     @staticmethod
-    def compression_goal(col): return 5 if col <= 1 else 4
+    def compression_goal(col):
+        return 5 if col <= 1 else 4
 
     def __init__(self, input_shape: Shape):
         output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
         super().__init__(input_shape, output_shape)
 
     def build_hardware(self):
-        ## Find the limit up to which the quaternary adder is needed. 
+        # Find the limit up to which the quaternary adder is needed.
         # We construct a two-input adder after this.
         height_4_until = len(self.input_wires)
         tail_length = 0
@@ -140,10 +133,10 @@ def build_hardware(self):
             else:
                 height_4_until = idx
                 tail_length += 1
-        
-        # If tail_length==1, the quaternary adder must not be reduced, 
+
+        # If tail_length==1, the quaternary adder must not be reduced,
         # as there would be no savings.
-        if (tail_length == 1):
+        if tail_length == 1:
             height_4_until += 1
             tail_length = 0
 
@@ -154,53 +147,46 @@ def build_hardware(self):
         for i in range(0, height_4_until):
             luts_top.append(
                 LUT6CY.fromPred(
-                    lambda A0,A1,A2,A3,A4,_: FA_sum(
-                        FA_sum(A0, A1, A2), A3, A4), # S
-                    lambda A0,A1,A2,A3,A4,_: FA_carry(
-                        FA_sum(A0, A1, A2), A3, A4), # ct
-                    "final_adder_top"
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, A2), A3, A4),  # S
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, A2), A3, A4),  # ct
+                    "final_adder_top",
                 )
             )
             luts_btm.append(
                 LUT6CY.fromPred(
-                    lambda A0,A1,A2,A3,A4,_: FA_sum(
-                        FA_carry(A0, A1, A2), A3, A4), # out
-                    lambda A0,A1,A2,A3,A4,_: FA_carry(
-                        FA_carry(A0, A1, A2), A3, A4), #cb
-                    "final_adder_btm"
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_carry(A0, A1, A2), A3, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_carry(A0, A1, A2), A3, A4),  # cb
+                    "final_adder_btm",
                 )
             )
-        if (tail_length):
+        if tail_length:
             luts_top.append(
                 LUT6CY.fromPred(
-                    lambda A0,A1,A2,A3,A4,_: FA_sum(A0, A1, A4), # out
-                    lambda A0,A1,A2,A3,A4,_: FA_carry(A0, A1, A4), # c_btm
-                    "final_adder_top_end"
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A4),  # c_btm
+                    "final_adder_top_end",
                 )
             )
             luts_btm.append(
                 LUT6CY.fromPred(
-                    lambda A0,A1,A2,A3,A4,_: FA_sum(FA_sum(A0, A1, False), 
-                                                    A3, A4), # out
-                    lambda A0,A1,A2,A3,A4,_: FA_carry(FA_sum(A0, A1, False),
-                                                      A3, A4),  # c_btm
-                    "final_adder_btm_start_two_input_chain"
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, False), A3, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, False), A3, A4),  # c_btm
+                    "final_adder_btm_start_two_input_chain",
                 )
             )
-        for i in range(tail_length-1):
+        for i in range(tail_length - 1):
             luts_btm.append(
                 LUT6CY.fromPred(
-                    lambda A0,A1,A2,A3,A4,_: 
-                        FA_sum(FA_carry(A0, A1, False), 
-                        FA_sum(A2, A3, False), A4), # out
-                    lambda A0,A1,A2,A3,A4,_: 
-                        FA_carry(FA_carry(A0, A1, False), 
-                        FA_sum(A2, A3, False), A4), # cb
-                    "final_adder_btm_two_input_chain"
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(
+                        FA_carry(A0, A1, False), FA_sum(A2, A3, False), A4
+                    ),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(
+                        FA_carry(A0, A1, False), FA_sum(A2, A3, False), A4
+                    ),  # cb
+                    "final_adder_btm_two_input_chain",
                 )
             )
 
-
         l8s_top = []
         l8s_btm = []
         for _ in range(ceildiv(len(luts_top), 8)):
@@ -210,54 +196,53 @@ def build_hardware(self):
 
         # Collect relevant input and output signals
         for i in range(len(luts_top)):
-            luts_top[i].O52.connect_to(l8s_top[i//8].c_in_ports[i%8+1])
-            luts_top[i].PROP.connect_to(l8s_top[i//8].p_in_ports[i%8])
-            
+            luts_top[i].O52.connect_to(l8s_top[i // 8].c_in_ports[i % 8 + 1])
+            luts_top[i].PROP.connect_to(l8s_top[i // 8].p_in_ports[i % 8])
+
         for i in range(len(luts_btm)):
-            luts_btm[i].O52.connect_to(l8s_btm[i//8].c_in_ports[i%8+1])
-            luts_btm[i].PROP.connect_to(l8s_btm[i//8].p_in_ports[i%8])
-        
+            luts_btm[i].O52.connect_to(l8s_btm[i // 8].c_in_ports[i % 8 + 1])
+            luts_btm[i].PROP.connect_to(l8s_btm[i // 8].p_in_ports[i % 8])
+
         carries_top = []
         carries_btm = []
         for i in range(0, len(luts_top)):
             if i % 2 == 0:
                 carries_top.append(luts_top[i].O52)
             if i % 2 == 1:
-                carries_top.append(l8s_top[i//8].out_ports[i%8//2])
+                carries_top.append(l8s_top[i // 8].out_ports[i % 8 // 2])
         for i in range(0, len(luts_btm)):
             if i % 2 == 0:
                 carries_btm.append(luts_btm[i].O52)
             if i % 2 == 1:
-                carries_btm.append(l8s_btm[i//8].out_ports[i%8//2])
-        
-        for i in range(0, len(luts_top)-1):
-            carries_top[i].connect_to(luts_top[i+1].I4)
-        for i in range(0, len(luts_btm)-1):
-            carries_btm[i].connect_to(luts_btm[i+1].I4)
-
-        # connect carry-ins between lookahead modules 
+                carries_btm.append(l8s_btm[i // 8].out_ports[i % 8 // 2])
+
+        for i in range(0, len(luts_top) - 1):
+            carries_top[i].connect_to(luts_top[i + 1].I4)
+        for i in range(0, len(luts_btm) - 1):
+            carries_btm[i].connect_to(luts_btm[i + 1].I4)
+
+        # connect carry-ins between lookahead modules
         def chain_l8(l8s):
             for prev, next in zip(l8s, l8s[1:]):
                 prev.COUTH.connect_to(next.CIN)
-                
+
         chain_l8(l8s_top)
         chain_l8(l8s_btm)
 
         # connect carry-in to first lut and lookahead module
         try_connect(lambda: self.input_wires[0][4].connect_to(luts_top[0].I4))
         try_connect(lambda: self.input_wires[0][4].connect_to(l8s_top[0].CIN))
-        
+
         try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4))
         try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN))
 
         # downwards connection
         for t, d in zip(luts_top[1:], luts_btm):
             t.O51.connect_to(d.I3)
-        last_top = len(carries_top)-1
+        last_top = len(carries_top) - 1
         carries_top[last_top].connect_to(luts_btm[last_top].I3)
-        
-        for idx, (lb, lt) in enumerate(zip(luts_btm, 
-                                           luts_top[:height_4_until])):
+
+        for idx, (lb, lt) in enumerate(zip(luts_btm, luts_top[:height_4_until])):
             for el in [lb, lt]:
                 try_connect(lambda: self.input_wires[idx][0].connect_to(el.I0))
                 try_connect(lambda: self.input_wires[idx][1].connect_to(el.I1))
@@ -269,31 +254,17 @@ def chain_l8(l8s):
             lt = luts_top[height_4_until]
             lb = luts_btm[height_4_until]
 
-            try_connect(lambda:
-                        self.input_wires[height_4_until][0].connect_to(lt.I0))
-            try_connect(lambda:
-                        self.input_wires[height_4_until][1].connect_to(lt.I1))
-
-            try_connect(lambda:
-                        self.input_wires[height_4_until+1][0].connect_to(
-                            lb.I0))
-            try_connect(lambda:
-                        self.input_wires[height_4_until+1][1].connect_to(
-                            lb.I1))
-
-        for idx, lb in enumerate(luts_btm[height_4_until+1:]):
-            try_connect(lambda: 
-                        self.input_wires[idx+height_4_until+1][0].connect_to(
-                            lb.I0))
-            try_connect(lambda: 
-                        self.input_wires[idx+height_4_until+1][1].connect_to(
-                            lb.I1))
-            try_connect(lambda: 
-                        self.input_wires[idx+height_4_until+2][0].connect_to(
-                            lb.I2))
-            try_connect(lambda: 
-                        self.input_wires[idx+height_4_until+2][1].connect_to(
-                            lb.I3))
+            try_connect(lambda: self.input_wires[height_4_until][0].connect_to(lt.I0))
+            try_connect(lambda: self.input_wires[height_4_until][1].connect_to(lt.I1))
+
+            try_connect(lambda: self.input_wires[height_4_until + 1][0].connect_to(lb.I0))
+            try_connect(lambda: self.input_wires[height_4_until + 1][1].connect_to(lb.I1))
+
+        for idx, lb in enumerate(luts_btm[height_4_until + 1 :]):
+            try_connect(lambda: self.input_wires[idx + height_4_until + 1][0].connect_to(lb.I0))
+            try_connect(lambda: self.input_wires[idx + height_4_until + 1][1].connect_to(lb.I1))
+            try_connect(lambda: self.input_wires[idx + height_4_until + 2][0].connect_to(lb.I2))
+            try_connect(lambda: self.input_wires[idx + height_4_until + 2][1].connect_to(lb.I3))
 
         def connect_carry_to_lut(carries, luts):
             for carry, lut in zip(carries, luts[1:]):
@@ -304,18 +275,19 @@ def connect_carry_to_lut(carries, luts):
         luts_top[0].O51.connect_to(self.output_wires[0][0])
 
         for idx, lb in enumerate(luts_btm):
-            lb.O51.connect_to(self.output_wires[idx+1][0])
+            lb.O51.connect_to(self.output_wires[idx + 1][0])
 
-        carries_btm[len(luts_btm)-1].connect_to(
-            self.output_wires[len(luts_btm)+1][0])
+        carries_btm[len(luts_btm) - 1].connect_to(self.output_wires[len(luts_btm) + 1][0])
 
-        luts_top[-1].O52.connect_to(luts_btm[len(luts_top)-1].I3)
+        luts_top[-1].O52.connect_to(luts_btm[len(luts_top) - 1].I3)
 
         self.instances += luts_top + luts_btm + l8s_btm + l8s_top
 
+
 class MuxCYTernaryAdder(FinalAdder):
     @staticmethod
-    def compression_goal(col): return 5 if col == 0 else 3
+    def compression_goal(col):
+        return 5 if col == 0 else 3
 
     def __init__(self, input_shape: Shape):
         input_shape = input_shape
@@ -323,24 +295,27 @@ def __init__(self, input_shape: Shape):
         super().__init__(input_shape, output_shape)
 
     def build_hardware(self):
-        luts = [LUT6_2.fromPred(
-            lambda A0,A1,A2,A3,A4,A5: FA_carry(A0,A1,A2),
-            lambda A0,A1,A2,A3,A4,A5: FA_sum(A0,A1,A2) ^ A3
-        ) for _ in range(len(self.input_shape)+1)]
-        c4s = [CARRY4() for _ in range(0, len(self.input_shape)+1, 4)]
+        luts = [
+            LUT6_2.fromPred(
+                lambda A0, A1, A2, A3, A4, A5: FA_carry(A0, A1, A2),
+                lambda A0, A1, A2, A3, A4, A5: FA_sum(A0, A1, A2) ^ A3,
+            )
+            for _ in range(len(self.input_shape) + 1)
+        ]
+        c4s = [CARRY4() for _ in range(0, len(self.input_shape) + 1, 4)]
         dis = [el for c4 in c4s for el in c4.DI.elements]
         ss = [el for c4 in c4s for el in c4.S.elements]
         cis = [c4.CI for c4 in c4s]
         os = [el for c4 in c4s for el in c4.O.elements]
         cos = [el for c4 in c4s for el in c4.CO.elements]
 
-        ## Connect CARRY4 together
+        # Connect CARRY4 together
         for c4p, c4n in zip(c4s, c4s[1:]):
             c4p.CO.elements[-1].connect_to(c4n.CI)
 
-        ## Connect inputs
+        # Connect inputs
         # Only connect up to the number of available input columns
-        for idx, lut in enumerate(luts[:len(self.input_wires)]):
+        for idx, lut in enumerate(luts[: len(self.input_wires)]):
             try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][0].connect_to(lut.I0))
             try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][1].connect_to(lut.I1))
             try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][2].connect_to(lut.I2))
@@ -348,17 +323,17 @@ def build_hardware(self):
         try_connect(lambda: self.input_wires[0][3].connect_to(dis[0]))
         try_connect(lambda: self.input_wires[0][4].connect_to(cis[0]))
 
-        ## Second carry connection
+        # Second carry connection
         for p, n, n_di in zip(luts, luts[1:], dis[1:]):
             p.O5.connect_to(n.I3)
             p.O5.connect_to(n_di)
 
-        ## Connect outputs
+        # Connect outputs
         for lut, s in zip(luts, ss):
             lut.O6.connect_to(s)
-        
-        for idx, o in enumerate(os[:len(luts)]):
+
+        for idx, o in enumerate(os[: len(luts)]):
             o.connect_to(self.output_wires[idx][0])
 
-        cos[len(luts)-1].connect_to(self.output_wires[len(luts)][0])
-        self.instances += luts + c4s
\ No newline at end of file
+        cos[len(luts) - 1].connect_to(self.output_wires[len(luts)][0])
+        self.instances += luts + c4s
diff --git a/src/finn/compressor/src/graph/nodes.py b/src/finn/compressor/src/graph/nodes.py
index 647129e70d..8fd299528e 100644
--- a/src/finn/compressor/src/graph/nodes.py
+++ b/src/finn/compressor/src/graph/nodes.py
@@ -7,16 +7,21 @@
 #############################################################################
 
 from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import List, Tuple, Dict
+from typing import Dict, List, Tuple
+
 from ..utils.shape import Shape
 
 """
 Convention: LSB at index 0.
 """
 
-class Node(ABC): 
-    def accept(self, visitor) -> None: pass
+
+class Node(ABC):
+    def accept(self, visitor) -> None:
+        pass
+
 
 class Connectable(Node):
     target: list[Connectable]
@@ -27,57 +32,72 @@ def __init__(self):
         self.source = None
 
     def connect_to(self, target):
-        assert isinstance(target, Connectable), \
-            "Target has to be of type Connectible!"
+        assert isinstance(target, Connectable), "Target has to be of type Connectable!"
         self.target.append(target)
         target.source = self
 
     @property
-    def has_target(self): return bool(self.target)
-    
+    def has_target(self):
+        return bool(self.target)
+
     @property
-    def has_source(self): return self.source is not None
+    def has_source(self):
+        return self.source is not None
+
 
 class Constant(Connectable):
     def __init__(self, value):
         super().__init__()
         self.value = str(value)
 
+
 class Wire(Connectable):
-    def __init__(self, desired_name = None):
+    def __init__(self, desired_name=None):
         super().__init__()
         self.prefix = ""
         self.desired_name = desired_name
 
-    def set_to_module_input(self): self.prefix = "input "
-    def set_to_module_output(self): self.prefix = "output "
+    def set_to_module_input(self):
+        self.prefix = "input "
+
+    def set_to_module_output(self):
+        self.prefix = "output "
+
+    def accept(self, visitor) -> None:
+        visitor.visit_wire(self)
 
-    def accept(self, visitor) -> None: visitor.visit_wire(self)
 
 class Logic(Wire):
-    def __init__(self, *, rst: Connectable = None, 
-                 en: Connectable = None, init: int = None):
+    def __init__(self, *, rst: Connectable = None, en: Connectable = None, init: int = None):
         self.rst = rst
         self.en = en
         self.init = init
         super().__init__()
-            
-    def accept(self, visitor): return visitor.visit_logic(self)
+
+    def accept(self, visitor):
+        return visitor.visit_logic(self)
+
 
 class BlackboxVecElement(Connectable):
     pass
 
+
 class BlackboxVec(Node, ABC):
     def __init__(self, name, width):
         self.name = name
         self.elements = [BlackboxVecElement() for el in range(width)]
         super().__init__()
 
+
 class BlackboxInputVec(BlackboxVec):
-    def accept(self, visitor) -> None: visitor.visit_blackbox_input_vec(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_input_vec(self)
+
 
 class BlackboxOutputVec(BlackboxVec):
-    def accept(self, visitor) -> None: visitor.visit_blackbox_output_vec(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_output_vec(self)
+
 
 class BlackboxPort(Connectable):
     def __init__(self, name):
@@ -86,43 +106,59 @@ def __init__(self, name):
 
     @property
     @abstractmethod
-    def connected(self): pass
+    def connected(self):
+        pass
 
     @property
     @abstractmethod
-    def wire(self): pass
+    def wire(self):
+        pass
+
 
 class BlackboxInput(BlackboxPort):
     def __init__(self, name):
         super().__init__(name)
 
     @property
-    def connected(self): return self.has_source
+    def connected(self):
+        return self.has_source
 
-    def connect_to(self, target): 
+    def connect_to(self, target):
         raise RuntimeError("Blackbox Input cannot act as output.")
 
     @property
-    def wire(self): return self.source
+    def wire(self):
+        return self.source
+
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_input(self)
 
-    def accept(self, visitor) -> None: visitor.visit_blackbox_input(self)
 
 class BlackboxOutput(BlackboxPort):
     def __init__(self, name):
         super().__init__(name)
 
     @property
-    def connected(self): return self.has_target
+    def connected(self):
+        return self.has_target
 
     @property
-    def wire(self): return self.target
+    def wire(self):
+        return self.target
+
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_output(self)
 
-    def accept(self, visitor) -> None: visitor.visit_blackbox_output(self)
 
 class Blackbox(Node):
     @abstractmethod
-    def __init__(self, module_name: str, in_ports: Tuple[BlackboxInput], 
-                 out_ports: Tuple[BlackboxOutput], parameters: Dict[str, str]):
+    def __init__(
+        self,
+        module_name: str,
+        in_ports: Tuple[BlackboxInput],
+        out_ports: Tuple[BlackboxOutput],
+        parameters: Dict[str, str],
+    ):
         self.module_name = module_name
         self.in_ports = in_ports
         self.out_ports = out_ports
@@ -132,24 +168,28 @@ def __init__(self, module_name: str, in_ports: Tuple[BlackboxInput],
         for port in self.in_ports + self.out_ports:
             self.__dict__[port.name] = port
 
-    def annotate(self, annotation: str): 
+    def annotate(self, annotation: str):
         self.annotations.append(annotation)
 
     def accept(self, visitor):
         visitor.visit_blackbox(self)
 
+
 class Module(Node):
     def __init__(self):
-        self.instances = [] # All inner instances
+        self.instances = []  # All inner instances
         super().__init__()
 
     @property
     @abstractmethod
-    def inputs(self): pass
+    def inputs(self):
+        pass
 
     @property
     @abstractmethod
-    def outputs(self): pass
+    def outputs(self):
+        pass
+
 
 class Counter(Module):
     def __init__(self, input_shape: Shape, output_shape: Shape):
@@ -166,35 +206,37 @@ def accept(self, visitor) -> None:
         visitor.visit_counter(self)
 
     @abstractmethod
-    def build_hardware(self): pass
+    def build_hardware(self):
+        pass
 
     def _build_wires(self, shape: Shape):
-        return tuple([tuple([Wire() for _ in range(col_height)])
-                                   for col_height in shape])
+        return tuple([tuple([Wire() for _ in range(col_height)]) for col_height in shape])
 
     @property
-    def inputs(self): return [el for col in self.input_wires for el in col]
+    def inputs(self):
+        return [el for col in self.input_wires for el in col]
 
     @property
-    def outputs(self): return [el for col in self.output_wires for el in col]
+    def outputs(self):
+        return [el for col in self.output_wires for el in col]
 
     @property
     def luts(self) -> List[LUT]:
         return [inst for inst in self.instances if isinstance(inst, LUT)]
 
     @property
-    def efficiency(self) -> float: 
-        if (len(self.luts) == 0 and 
-            sum(self.input_shape) - sum(self.output_shape) == 0):
+    def efficiency(self) -> float:
+        if len(self.luts) == 0 and sum(self.input_shape) - sum(self.output_shape) == 0:
             return 0
-        diff = (sum(self.input_shape) - sum(self.output_shape))
+        diff = sum(self.input_shape) - sum(self.output_shape)
         denom = sum(LUT.size for LUT in self.luts)
         return diff / denom
 
     @property
-    def strength(self) -> float: 
+    def strength(self) -> float:
         return sum(self.input_shape) / sum(self.output_shape)
-    
+
+
 class GateAbsorptionCounter(Counter):
     def __init__(self, input_shape: Shape, output_shape: Shape):
         self.input_wires_complementary = self._build_wires(input_shape)
@@ -204,9 +246,9 @@ def accept(self, visitor) -> None:
         visitor.visit_gate_absorption_counter(self)
 
     @property
-    def inputs(self): return [el for col in 
-                              self.input_wires + self.input_wires_complementary
-                              for el in col]
+    def inputs(self):
+        return [el for col in self.input_wires + self.input_wires_complementary for el in col]
+
 
 class Passthrough(Counter):
     def __init__(self):
@@ -216,6 +258,7 @@ def build_hardware(self):
         self.output_wires = self.input_wires
         self.instances = [el for col in self.input_wires for el in col]
 
+
 class Stage(Node):
     input_shape: Shape
     output_shape: Shape
@@ -226,14 +269,16 @@ def connect_to(self, other):
         for col_s, col_t in zip(self.output_wires, other.input_wires):
             for el_s, el_t in zip(col_s, col_t):
                 el_s.connect_to(el_t)
-        
-        # TODO: maybe subclass instead? 
+
+        # TODO: maybe subclass instead?
         if "output_wires_complementary" in self.__dict__:
-            for col_s, col_t in zip(self.output_wires_complementary, 
-                                    other.input_wires_complementary):
+            for col_s, col_t in zip(
+                self.output_wires_complementary, other.input_wires_complementary
+            ):
                 for el_s, el_t in zip(col_s, col_t):
                     el_s.connect_to(el_t)
 
+
 class InputStage(Stage):
     def __init__(self, shape: Shape, gates: bool = False):
         self.input_shape = shape
@@ -246,7 +291,9 @@ def __init__(self, shape: Shape, gates: bool = False):
 
         self.output_wires = self.input_wires
 
-    def accept(self, visitor) -> None: visitor.visit_input_stage(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_input_stage(self)
+
 
 class PipelineStage(Stage):
     def __init__(self, shape: Shape):
@@ -256,25 +303,29 @@ def __init__(self, shape: Shape):
         self.output_wires = Bitmatrix(shape)
         self.instances = []
         for i_c, o_c in zip(self.input_wires, self.output_wires):
-            for i, o in zip(i_c, o_c): 
+            for i, o in zip(i_c, o_c):
                 lgc = Logic()
                 i.connect_to(lgc)
                 lgc.connect_to(o)
                 self.instances.append(lgc)
 
-    def accept(self, visitor) -> None: visitor.visit_pipeline_stage(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_pipeline_stage(self)
+
 
 class CompressionStage(Stage):
-    def __init__(self): 
+    def __init__(self):
         self.counters_with_shifts = []
         self.input_wires = Bitmatrix()
         self.output_wires = Bitmatrix()
 
     @property
-    def input_shape(self): return self._shape(lambda x: x.input_shape)
+    def input_shape(self):
+        return self._shape(lambda x: x.input_shape)
 
     @property
-    def output_shape(self): return self._shape(lambda x: x.output_shape)
+    def output_shape(self):
+        return self._shape(lambda x: x.output_shape)
 
     def _shape(self, func):
         shape = Shape(())
@@ -292,46 +343,54 @@ def append_counter(self, counter: Counter, shift: int):
             for wire in col:
                 self.output_wires.add_input(wire, source_idx + shift)
 
-    def accept(self, visitor) -> None: visitor.visit_compression_stage(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_compression_stage(self)
+
 
 class GateAbsorbedStage(CompressionStage):
     def __init__(self):
         super().__init__()
         self.input_wires_complementary = Bitmatrix()
-    
+
     def append_counter(self, counter: GateAbsorptionCounter, shift: int):
         super().append_counter(counter, shift)
         for source_idx, col in enumerate(counter.input_wires_complementary):
             for wire in col:
-                self.input_wires_complementary.add_output(wire, 
-                                                          source_idx + shift)
+                self.input_wires_complementary.add_output(wire, source_idx + shift)
+
+    def accept(self, visitor) -> None:
+        visitor.visit_gate_absorbed_stage(self)
 
-    def accept(self, visitor) -> None: visitor.visit_gate_absorbed_stage(self)
 
 class Compressor(Node):
-    def __init__(self, name): 
+    def __init__(self, name):
         self.stages = []
         self.module_name = name
         self.io = []
 
     @property
-    def input_shape(self): return self.stages[0].input_shape
-    
+    def input_shape(self):
+        return self.stages[0].input_shape
+
     @property
-    def output_shape(self): return self.stages[-1].output_shape
+    def output_shape(self):
+        return self.stages[-1].output_shape
 
     @property
     def delay(self):
         delay_ = 0
         for s in self.stages:
-            if isinstance(s, PipelineStage): 
+            if isinstance(s, PipelineStage):
                 delay_ += 1
             from .accumulator import AccumulatorStage
-            if isinstance(s, AccumulatorStage): 
+
+            if isinstance(s, AccumulatorStage):
                 delay_ += 1
         return delay_
-    
-    def accept(self, visitor) -> None: visitor.visit_compressor(self)
+
+    def accept(self, visitor) -> None:
+        visitor.visit_compressor(self)
+
 
 class BitmatrixElement(Connectable):
     def __init__(self, vector, idx_x, idx_y):
@@ -341,28 +400,43 @@ def __init__(self, vector, idx_x, idx_y):
 
     @property
     def lin_idx(self):
-        return sum(self.vector.shape[:self.idx_2d[0]]) + self.idx_2d[1]
+        return sum(self.vector.shape[: self.idx_2d[0]]) + self.idx_2d[1]
+
+    def accept(self, visitor):
+        pass
 
-    def accept(self, visitor): pass
 
 class Bitmatrix(Node):
-    def __init__(self, shape : Shape = Shape(), name: str = None):
+    def __init__(self, shape: Shape = Shape(), name: str = None):
         self._name = name
         self.prefix = ""
-        self.connectables = [[BitmatrixElement(self, idx, row)
-                              for row in range(col)]
-                              for idx, col in enumerate(shape)]
+        self.connectables = [
+            [BitmatrixElement(self, idx, row) for row in range(col)]
+            for idx, col in enumerate(shape)
+        ]
         super().__init__()
 
-    def set_to_module_input(self): self.prefix = "input "
-    def set_to_module_output(self): self.prefix = "output "
-    def __len__(self): return len(self.connectables)
-    def __getitem__(self, sel): return self.connectables[sel]
-    def __iter__(self): return self.connectables.__iter__()
-    def total_size(self): return sum([len(col) for col in self.connectables])
-    
+    def set_to_module_input(self):
+        self.prefix = "input "
+
+    def set_to_module_output(self):
+        self.prefix = "output "
+
+    def __len__(self):
+        return len(self.connectables)
+
+    def __getitem__(self, sel):
+        return self.connectables[sel]
+
+    def __iter__(self):
+        return self.connectables.__iter__()
+
+    def total_size(self):
+        return sum([len(col) for col in self.connectables])
+
     @property
-    def shape(self): return Shape([len(col) for col in self.connectables])
+    def shape(self):
+        return Shape([len(col) for col in self.connectables])
 
     def add_output(self, el, col_idx):
         be = self._append_wire(el, col_idx)
@@ -379,15 +453,22 @@ def _append_wire(self, el, col_idx):
         self.connectables[col_idx].append(be)
         return be
 
-    def accept(self, visitor) -> None: visitor.visit_bitmatrix(self)
+    def accept(self, visitor) -> None:
+        visitor.visit_bitmatrix(self)
+
 
 class LUT(Blackbox):
     @abstractmethod
-    def __init__(self, module_name, init_code: str, 
-                 in_ports: Tuple[BlackboxInput], 
-                 out_ports: Tuple[BlackboxOutput], 
-                 *, 
-                 size, desired_name = "lut"):
+    def __init__(
+        self,
+        module_name,
+        init_code: str,
+        in_ports: Tuple[BlackboxInput],
+        out_ports: Tuple[BlackboxOutput],
+        *,
+        size,
+        desired_name="lut",
+    ):
         self.desired_name = desired_name
         self.size = size
-        super().__init__(module_name, in_ports, out_ports, {"INIT": init_code})
\ No newline at end of file
+        super().__init__(module_name, in_ports, out_ports, {"INIT": init_code})
diff --git a/src/finn/compressor/src/graph/primitives.py b/src/finn/compressor/src/graph/primitives.py
index 1cf36507a3..bb77faee5c 100644
--- a/src/finn/compressor/src/graph/primitives.py
+++ b/src/finn/compressor/src/graph/primitives.py
@@ -6,13 +6,22 @@
 # @brief    FPGA primitive definitions for compressor (LUTs, carry chains, etc.)
 #############################################################################
 
-from ctypes import c_uint64, c_uint32
-from .nodes import BlackboxInput, BlackboxOutput, Blackbox, LUT, Constant
-from .nodes import BlackboxInputVec, BlackboxOutputVec
+from ctypes import c_uint32, c_uint64
+
+from .nodes import (
+    LUT,
+    Blackbox,
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+    Constant,
+)
+
 
 class LUT2(LUT):
     @classmethod
-    def fromPred(self, predO2, desired_name = "lut2"):
+    def fromPred(self, predO2, desired_name="lut2"):
         res = 0
         for i in range(32):
             inputs = [bool(i & (1 << shmt)) for shmt in range(2)]
@@ -20,94 +29,105 @@ def fromPred(self, predO2, desired_name = "lut2"):
             res = res & 0xF
         init_str = f"""4'h{"{:_x}".format(c_uint32(res).value)}"""
         return LUT2(init_str, desired_name)
-    
+
     def __init__(self, init_code: str, desired_name):
         in_ports = [BlackboxInput(f"I{el}") for el in range(2)]
         out_ports = [BlackboxOutput("O")]
-        super().__init__("LUT2", init_code, in_ports, out_ports,
-                         desired_name=desired_name, size=0.5)
+        super().__init__(
+            "LUT2", init_code, in_ports, out_ports, desired_name=desired_name, size=0.5
+        )
+
 
 class LUT5(LUT):
     @classmethod
-    def fromPred(self, predO5, desired_name = "lut5"):
+    def fromPred(self, predO5, desired_name="lut5"):
         res = 0
         for i in range(32):
             inputs = [bool(i & (1 << shmt)) for shmt in range(5)]
             res = res | (int(predO5(*inputs)) << i)
         init_str = f"""32'h{"{:_x}".format(c_uint32(res).value)}"""
         return LUT5(init_str, desired_name)
-    
+
     def __init__(self, init_code: str, desired_name):
         in_ports = [BlackboxInput(f"I{el}") for el in range(5)]
         out_ports = [BlackboxOutput("O")]
-        super().__init__("LUT5", init_code, in_ports, out_ports,
-                         desired_name=desired_name, size=0.5)
+        super().__init__(
+            "LUT5", init_code, in_ports, out_ports, desired_name=desired_name, size=0.5
+        )
+
 
 class LUT6(LUT):
     @classmethod
-    def fromPred(self, predO6, desired_name = "lut6"):
+    def fromPred(self, predO6, desired_name="lut6"):
         res = 0
         for i in range(64):
             inputs = [bool(i & (1 << shmt)) for shmt in range(6)]
             res = res | (int(predO6(*inputs)) << i)
         init_str = f"""64'h{"{:_x}".format(c_uint64(res).value)}"""
         return LUT6(init_str, desired_name)
-    
+
     def __init__(self, init_code: str, desired_name):
         in_ports = [BlackboxInput(f"I{el}") for el in range(6)]
         out_ports = [BlackboxOutput("O")]
-        super().__init__("LUT6", init_code, in_ports, out_ports,
-                         desired_name=desired_name, size=1)
+        super().__init__("LUT6", init_code, in_ports, out_ports, desired_name=desired_name, size=1)
+
 
 def split_lut_from_pred(predO5, predO6):
     res = 0
     for i in range(32, 64):
         inputs = [bool(i & (1 << shmt)) for shmt in range(6)]
-        res = res | (int(predO5(*inputs)) << (i-32)) | (int(predO6(*inputs)) << (i))
+        res = res | (int(predO5(*inputs)) << (i - 32)) | (int(predO6(*inputs)) << (i))
         init_str = f"""64'h{"{:_x}".format(c_uint64(res).value)}"""
     return init_str
 
+
 class LUT6_2(LUT):
     @classmethod
-    def fromPred(self, predO5, predO6, desired_name = "lut6_2"):
+    def fromPred(self, predO5, predO6, desired_name="lut6_2"):
         return LUT6_2(split_lut_from_pred(predO5, predO6), desired_name)
 
     def __init__(self, init_code: str, desired_name):
-        in_ports = [BlackboxInput(f"I{el}") for el in range(6)] 
+        in_ports = [BlackboxInput(f"I{el}") for el in range(6)]
         out_ports = [BlackboxOutput("O6"), BlackboxOutput("O5")]
-        super().__init__("LUT6_2", init_code, in_ports, out_ports,
-                         desired_name=desired_name, size=1)
+        super().__init__(
+            "LUT6_2", init_code, in_ports, out_ports, desired_name=desired_name, size=1
+        )
         Constant("1'b1").connect_to(self.I5)
 
+
 class LUT6CY(LUT):
     @classmethod
-    def fromPred(self, predO51, predO52, desired_name = "lut6cy"):
+    def fromPred(self, predO51, predO52, desired_name="lut6cy"):
         return LUT6CY(split_lut_from_pred(predO51, predO52), desired_name)
 
     def __init__(self, init_code: str, desired_name):
         in_ports = [BlackboxInput(f"I{el}") for el in range(5)]
         out_ports = [BlackboxOutput(f"O5{el+1}") for el in range(2)]
         out_ports.append(BlackboxOutput("PROP"))
-        super().__init__("LUT6CY", init_code, in_ports, out_ports,
-                         desired_name=desired_name, size=1)
+        super().__init__(
+            "LUT6CY", init_code, in_ports, out_ports, desired_name=desired_name, size=1
+        )
+
 
 class LOOKAHEAD8(Blackbox):
     def __init__(self):
         c_in_ports_str = ["CIN", "CYA", "CYB", "CYC", "CYD", "CYE", "CYF", "CYG", "CYH"]
-        p_in_ports_str = ["PROPA", "PROPB", "PROPC", "PROPD", "PROPE", "PROPF", "PROPG",
-                          "PROPH"]
+        p_in_ports_str = ["PROPA", "PROPB", "PROPC", "PROPD", "PROPE", "PROPF", "PROPG", "PROPH"]
         out_ports_str = ["COUTB", "COUTD", "COUTF", "COUTH"]
-        
+
         self.c_in_ports = [BlackboxInput(el) for el in c_in_ports_str]
         self.p_in_ports = [BlackboxInput(el) for el in p_in_ports_str]
         out_ports = [BlackboxOutput(el) for el in out_ports_str]
-        super().__init__("LOOKAHEAD8", self.c_in_ports + self.p_in_ports, out_ports,
-                         {"LOOKB" : "\"TRUE\"", "LOOKD" : "\"TRUE\"",
-                          "LOOKF" : "\"TRUE\"", "LOOKH" : "\"TRUE\""})
+        super().__init__(
+            "LOOKAHEAD8",
+            self.c_in_ports + self.p_in_ports,
+            out_ports,
+            {"LOOKB": '"TRUE"', "LOOKD": '"TRUE"', "LOOKF": '"TRUE"', "LOOKH": '"TRUE"'},
+        )
+
 
 class CARRY4(Blackbox):
     def __init__(self):
-        in_ports = [BlackboxInputVec("DI", 4), BlackboxInputVec("S", 4),
-                    BlackboxInput("CI")]
+        in_ports = [BlackboxInputVec("DI", 4), BlackboxInputVec("S", 4), BlackboxInput("CI")]
         out_ports = [BlackboxOutputVec("O", 4), BlackboxOutputVec("CO", 4)]
-        super().__init__("CARRY4", in_ports, out_ports, {})
\ No newline at end of file
+        super().__init__("CARRY4", in_ports, out_ports, {})
diff --git a/src/finn/compressor/src/graph/visitor.py b/src/finn/compressor/src/graph/visitor.py
index 5be1ea118f..2f8e6950ed 100644
--- a/src/finn/compressor/src/graph/visitor.py
+++ b/src/finn/compressor/src/graph/visitor.py
@@ -7,39 +7,71 @@
 #############################################################################
 
 from abc import ABC
-from .nodes import Counter, CompressionStage, Compressor, InputStage, PipelineStage
-from .nodes import Logic, Bitmatrix, GateAbsorbedStage, GateAbsorptionCounter
-from .nodes import Blackbox
-from .primitives import BlackboxInputVec, BlackboxOutputVec, BlackboxInput
-from .primitives import BlackboxOutput
+
+from .nodes import (
+    Bitmatrix,
+    Blackbox,
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    Logic,
+    PipelineStage,
+)
+from .primitives import (
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+)
+
 
 class Visitor(ABC):
-    def visit_compressor(self, c: Compressor): raise NotImplementedError
+    def visit_compressor(self, c: Compressor):
+        raise NotImplementedError
 
-    def visit_input_stage(self, s: InputStage): raise NotImplementedError
-    
-    def visit_gate_absorption_stage(self, s: GateAbsorbedStage): 
+    def visit_input_stage(self, s: InputStage):
         raise NotImplementedError
 
-    def visit_pipeline_stage(self, s: PipelineStage): raise NotImplementedError
+    def visit_gate_absorption_stage(self, s: GateAbsorbedStage):
+        raise NotImplementedError
+
+    # Hook name that GateAbsorbedStage.accept() in nodes.py dispatches to; the
+    # "absorption" spelling above is kept so existing overrides stay valid.
+    def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
+        raise NotImplementedError
 
-    def visit_compression_stage(self, s: CompressionStage): raise NotImplementedError
+    def visit_pipeline_stage(self, s: PipelineStage):
+        raise NotImplementedError
 
-    def visit_counter(self, c: Counter): raise NotImplementedError
+    def visit_compression_stage(self, s: CompressionStage):
+        raise NotImplementedError
 
-    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter): 
+    def visit_counter(self, c: Counter):
         raise NotImplementedError
 
-    def visit_blackbox(self, b: Blackbox): raise NotImplementedError
+    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
+        raise NotImplementedError
 
-    def visit_blackbox_input(self, b: BlackboxInput): raise NotImplementedError
+    def visit_blackbox(self, b: Blackbox):
+        raise NotImplementedError
 
-    def visit_blackbox_output(self, b: BlackboxOutput): raise NotImplementedError
+    def visit_blackbox_input(self, b: BlackboxInput):
+        raise NotImplementedError
 
-    def visit_blackbox_input_vec(self, b: BlackboxInputVec): raise NotImplementedError
+    def visit_blackbox_output(self, b: BlackboxOutput):
+        raise NotImplementedError
 
-    def visit_blackbox_output_vec(self, b: BlackboxOutputVec): raise NotImplementedError
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
+        raise NotImplementedError
 
-    def visit_logic(self, lgc: Logic): raise NotImplementedError
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
+        raise NotImplementedError
 
-    def visit_bitmatrix(self, b: Bitmatrix): raise NotImplementedError
\ No newline at end of file
+    def visit_logic(self, lgc: Logic):
+        raise NotImplementedError
+
+    def visit_bitmatrix(self, b: Bitmatrix):
+        raise NotImplementedError
diff --git a/src/finn/compressor/src/main.py b/src/finn/compressor/src/main.py
index ad3331cb14..bac3f98a9d 100644
--- a/src/finn/compressor/src/main.py
+++ b/src/finn/compressor/src/main.py
@@ -6,60 +6,74 @@
 # @brief    Main compressor tree generation entry point
 #############################################################################
 
-import time
 import argparse
-from .target import Target, Versal, SevenSeries, UltraScale
-from .utils.shape import Shape
+import time
+from typing import List, Optional
+
 from .passes.compressor_constructor import CompressorConstructor
 from .passes.cost_estimator import CostEstimator
-from .passes.printer import CompressorPrinter
 from .passes.emitter import VerilogGenerator
-from .passes.wire_inserter import WireInserter
 from .passes.io_annotator import IOAnnotator
 from .passes.lut_placer import LUTPlacer
+from .passes.printer import CompressorPrinter
+from .passes.wire_inserter import WireInserter
+from .target import SevenSeries, Target, UltraScale, Versal
 from .tests.test_gen import generate_test
 from .tests.tester import tester
-from typing import Optional, List
+from .utils.shape import Shape
+
 
 def parse_cli():
     parser = argparse.ArgumentParser(
-        prog="Compressor Generator",
-        description="Generate a Compressor Tree for a given input."
+        prog="Compressor Generator", description="Generate a Compressor Tree for a given input."
+    )
+    parser.add_argument(
+        "-o", "--output", default="../gen/out.sv", help="Path to store the compressor at."
+    )
+    parser.add_argument("-s", "--shape", required=True, help="Input shape.")
+    parser.add_argument("-a", "--accumulate", action="store_true", help="Enable accumulation.")
+    parser.add_argument(
+        "-w", "--accumulator_width", help="Accumulator width [default: Reduced input shape]."
+    )
+    parser.add_argument(
+        "-g",
+        "--gates",
+        default=None,
+        help="Inline 2-input gates into the compressor. LSB is left. Example: 8,3",
+    )
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        help="Target FPGA generation.",
+        choices=["Versal", "7-Series", "UltraScale"],
+    )
+    parser.add_argument(
+        "--test", action="store_true", help="Test the generated compressor using Vivado XSim."
+    )
+    parser.add_argument(
+        "-n", "--name", default="comp", help="Name of the generated Systemverilog module."
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        default=None,
+        help="Insert Pipeline registers every n stages. Default: " "Purely combinatorial.",
+    )
+    parser.add_argument(
+        "-c", "--constant", default=[], help="Add a constant binary " "number input. Example: 1011"
     )
-    parser.add_argument('-o', '--output', default="../gen/out.sv", 
-                        help="Path to store the compressor at.")
-    parser.add_argument('-s', '--shape', required=True, help="Input shape.")
-    parser.add_argument('-a', '--accumulate', action='store_true',
-                        help="Enable accumulation.")
-    parser.add_argument('-w', '--accumulator_width',
-                        help="Accumulator width [default: Reduced input shape].")
-    parser.add_argument('-g', '--gates', default=None,
-                        help="Inline 2-input gates into the compressor. LSB is left."
-                        "Example: 8,3")
-    parser.add_argument('-t', '--target', default="Versal",
-                        help="Target FPGA generation.", choices=["Versal", "7-Series", 
-                                                                 "UltraScale"])
-    parser.add_argument('--test', action="store_true",
-                        help="Test the generated compressor using Vivado XSim.")
-    parser.add_argument('-n', '--name', default="comp", 
-                        help="Name of the generated Systemverilog module.")
-    parser.add_argument('-p', '--pipeline_every', default=None, 
-                        help="Insert Pipeline registers every n stages. Default: "
-                        "Purely combinatorial.")
-    parser.add_argument('-c', '--constant', default=[], help="Add a constant binary "
-                        "number input. Example: 1011")
     args = parser.parse_args()
 
     try:
-        shape = Shape(int(el) for el in args.shape.split(','))
+        shape = Shape(int(el) for el in args.shape.split(","))
     except (ValueError, TypeError):
         print("Improperly defined shape.")
         exit(-1)
 
     gates = []
     if args.gates:
-        assert len(args.gates) == sum(shape), \
-            "Length of shape and gate specification do not match."
+        assert len(args.gates) == sum(shape), "Length of shape and gate specification do not match."
         gates_lin = list(args.gates)
         for col in shape:
             gates_col = []
@@ -93,35 +107,38 @@ def parse_cli():
         gates,
         constants,
         args.output,
-        args.test
+        args.test,
     )
 
-def generate_compressor(
-        target: Target,
-        shape: Shape,
-        name: str, 
-        comb_depth: Optional[int],
-        accumulate: bool,
-        accumulator_width: int,
-        gates: List[List[str]],
-        constants: List[int], # Each element is a binary numer digit.
-        path: str,
-        test: bool,
-        enable: bool = False):
 
+def generate_compressor(
+    target: Target,
+    shape: Shape,
+    name: str,
+    comb_depth: Optional[int],
+    accumulate: bool,
+    accumulator_width: int,
+    gates: List[List[str]],
+    constants: List[int],  # Each element is a binary number digit.
+    path: str,
+    test: bool,
+    enable: bool = False,
+):
     start_time = time.time()
     constructor = CompressorConstructor()
-    c = constructor(target.counter_candidates, 
-                    target.absorbing_counter_candidates,
-                    target.final_adder,
-                    shape,
-                    name,
-                    comb_depth=comb_depth,
-                    accumulate=accumulate,
-                    accumulator_width=accumulator_width,
-                    constants=constants,
-                    gates=gates,
-                    enable=enable)
+    c = constructor(
+        target.counter_candidates,
+        target.absorbing_counter_candidates,
+        target.final_adder,
+        shape,
+        name,
+        comb_depth=comb_depth,
+        accumulate=accumulate,
+        accumulator_width=accumulator_width,
+        constants=constants,
+        gates=gates,
+        enable=enable,
+    )
 
     placer = LUTPlacer()
     c.accept(placer)
@@ -137,17 +154,20 @@ def generate_compressor(
 
     emitter = VerilogGenerator()
     c.accept(emitter)
-    with open(path, 'w') as f:
-        withprefix = f"""// Adder generated by the Python Compressor Generator
+    with open(path, "w") as f:
+        withprefix = (
+            f"""// Adder generated by the Python Compressor Generator
 // Input shape: {c.input_shape}; Output Shape: {c.output_shape}
 // Pipeline stages: {c.delay}
 // Target Generation: {target.__class__.__name__}
 // Approximate LUTs: {int(cost.luts+0.5)}
-// Accumulation: {"yes" if accumulate else "no"} {f"of width {accumulator_width}" 
+// Accumulation: {"yes" if accumulate else "no"} {f"of width {accumulator_width}"
                                                   if accumulator_width else ""}
 // Enable mode: {"yes (init values set on accumulator registers)" if enable else "no"}
 // Gates: {gates if gates else "None"}
-        """ + emitter.emitter.output
+        """
+            + emitter.emitter.output
+        )
         f.write(withprefix)
 
     end_time = time.time()
@@ -157,13 +177,13 @@ def generate_compressor(
 
     if test:
         constant = int("".join(str(c) for c in constants), 2) if constants else 0
-        test = generate_test(shape, "comp", c.delay, gates, accumulate, 
-                             accumulator_width, constant)
-        with open("../gen/test.sv", 'w') as f:
+        test = generate_test(shape, "comp", c.delay, gates, accumulate, accumulator_width, constant)
+        with open("../gen/test.sv", "w") as f:
             f.write(test)
         tester("../gen/test.sv", path)
 
     return c.delay
 
+
 if __name__ == "__main__":
-    parse_cli()
\ No newline at end of file
+    parse_cli()
diff --git a/src/finn/compressor/src/passes/compressor_constructor.py b/src/finn/compressor/src/passes/compressor_constructor.py
index c7c6285873..10036923ab 100644
--- a/src/finn/compressor/src/passes/compressor_constructor.py
+++ b/src/finn/compressor/src/passes/compressor_constructor.py
@@ -6,25 +6,34 @@
 # @brief    Compressor tree constructor with two-pass accumulator handling
 #############################################################################
 
-from typing import Tuple, List
-from .compressor_pipeliner import CompressorPipeliner
+from typing import List, Tuple
+
 from ..graph.accumulator import AccumulatorStage
+from ..graph.counters.absorption_counter_candidates import (
+    GateAbsorptionCounterCandidate,
+)
 from ..graph.counters.counter_candidates import ConstantOne
-from ..graph.counters.absorption_counter_candidates import GateAbsorptionCounterCandidate
-from ..graph.nodes import Compressor, CompressionStage, InputStage, Counter, Passthrough
-from ..graph.nodes import GateAbsorbedStage
+from ..graph.nodes import (
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    InputStage,
+    Passthrough,
+)
 from ..utils.shape import Shape
+from .compressor_pipeliner import CompressorPipeliner
+
 
 class CompressorConstructor:
     def adjust_compression_goal_for_constants(self, compression_goal, constants):
         # Subtract constants, but never go below 2 (minimum achievable by compressor)
-        return lambda x: max(2, compression_goal(x) -
-                                (constants[x] if x < len(constants) else 0))
-    
+        return lambda x: max(2, compression_goal(x) - (constants[x] if x < len(constants) else 0))
+
     def get_compression_goal(self, final_adder, accumulate, constants):
         # Two-pass strategy for accumulate: compress to goal, add constants, then post-check
         compression_goal = final_adder.compression_goal
-        return self.adjust_compression_goal_for_constants(compression_goal, constants)        
+        return self.adjust_compression_goal_for_constants(compression_goal, constants)
 
     def add_constants_to_stage(self, s: CompressionStage, constants):
         """Add constant bits to the compression stage."""
@@ -33,27 +42,29 @@ def add_constants_to_stage(self, s: CompressionStage, constants):
                 c = ConstantOne()
                 s.append_counter(c, idx)
 
-    def __call__(self, 
-                 counter_candidates,
-                 absorption_counter_candidates,
-                 final_adder,
-                 input_shape: Shape,
-                 name: str,
-                 comb_depth: int = None,
-                 accumulate=False,
-                 accumulator_width: int = None,
-                 constants: Tuple[bool] = tuple(),
-                 gates: Tuple[Tuple[str]] = tuple(),
-                 enable: bool = False
-                 ) -> Compressor:
+    def __call__(
+        self,
+        counter_candidates,
+        absorption_counter_candidates,
+        final_adder,
+        input_shape: Shape,
+        name: str,
+        comb_depth: int = None,
+        accumulate=False,
+        accumulator_width: int = None,
+        constants: Tuple[bool] = tuple(),
+        gates: Tuple[Tuple[str]] = tuple(),
+        enable: bool = False,
+    ) -> Compressor:
         compression_goal = self.get_compression_goal(final_adder, accumulate, constants)
-        
+
         c = Compressor(name)
         c.stages.append(InputStage(input_shape, gates))
 
         if gates:
-            s = self.construct_absorption_stage(c.stages[-1].output_shape, gates,
-                                                absorption_counter_candidates)
+            s = self.construct_absorption_stage(
+                c.stages[-1].output_shape, gates, absorption_counter_candidates
+            )
             c.stages[-1].connect_to(s)
             c.stages.append(s)
 
@@ -61,8 +72,7 @@ def __call__(self,
         # add_compression_stage cannot compress height-1 or height-2 columns (requires >= 3)
         # Therefore compression_goal must be achievable given this constraint
         # See get_compression_goal() for how this is ensured in accumulate configurations
-        while not self.compression_goal_reached(c.stages[-1].output_shape,
-                                                compression_goal):
+        while not self.compression_goal_reached(c.stages[-1].output_shape, compression_goal):
             self.add_compression_stage(c, compression_goal, counter_candidates)
 
         # Add constants to the graph.
@@ -74,6 +84,7 @@ def __call__(self,
         # The ternary adder receives: compressor_output + feedback (height 1).
         # If any column exceeds final_adder capacity, we need more compression.
         if accumulate:
+
             def post_const_goal(x):
                 # Leave room for feedback (height 1) within ternary adder capacity
                 return max(2, final_adder.compression_goal(x) - 1)
@@ -88,22 +99,24 @@ def post_const_goal(x):
             pipeline_stages = 0
 
         if accumulate:
-                acc = AccumulatorStage(c.stages[-1].output_shape, final_adder, 
-                                       pipeline_stages, 
-                                       accumulator_width=accumulator_width,
-                                       enable=enable)
-                c.stages.append(acc)
+            acc = AccumulatorStage(
+                c.stages[-1].output_shape,
+                final_adder,
+                pipeline_stages,
+                accumulator_width=accumulator_width,
+                enable=enable,
+            )
+            c.stages.append(acc)
         elif max(c.stages[-1].output_shape) > 1:
-                final_stage = CompressionStage()
-                final_stage.append_counter(final_adder(c.stages[-1].output_shape), 0)
-                c.stages.append(final_stage)
+            final_stage = CompressionStage()
+            final_stage.append_counter(final_adder(c.stages[-1].output_shape), 0)
+            c.stages.append(final_stage)
 
         for s_p, s_n in zip(c.stages, c.stages[1:]):
             s_p.connect_to(s_n)
         return c
-    
-    def add_compression_stage(self, compressor: Compressor, compression_goal,
-                              counter_candidates):
+
+    def add_compression_stage(self, compressor: Compressor, compression_goal, counter_candidates):
         """Add a compression stage. Cannot compress columns with height < 3 (Full Adder = 3:2)."""
         new_stage = CompressionStage()
         stage_inputs = compressor.stages[-1].output_shape
@@ -111,6 +124,7 @@ def add_compression_stage(self, compressor: Compressor, compression_goal,
 
         i = 0
         while i < max(len(stage_inputs), len(stage_outputs)):
+
             def cur_output_height():
                 return (stage_inputs + stage_outputs)[i]
 
@@ -118,10 +132,12 @@ def cur_input_height():
                 return stage_inputs[i] if len(stage_inputs) > i else 0
 
             while cur_input_height() >= 3 and cur_output_height() > compression_goal(i):
-                counter = self.schedule_counter(stage_inputs[i:], 
-                                                stage_outputs[i:], 
-                                                lambda x: compression_goal(x+i),
-                                                counter_candidates)
+                counter = self.schedule_counter(
+                    stage_inputs[i:],
+                    stage_outputs[i:],
+                    lambda x: compression_goal(x + i),
+                    counter_candidates,
+                )
                 stage_inputs = stage_inputs - (counter.input_shape << i)
                 stage_outputs = stage_outputs + (counter.output_shape << i)
                 new_stage.append_counter(counter, i)
@@ -134,27 +150,28 @@ def cur_input_height():
 
         compressor.stages.append(new_stage)
 
-    def schedule_counter(self, stage_inputs, stage_outputs, compression_goal,
-                         counter_candidates) -> Counter:
-        counters = [] 
+    def schedule_counter(
+        self, stage_inputs, stage_outputs, compression_goal, counter_candidates
+    ) -> Counter:
+        counters = []
         for counter_candid in counter_candidates:
-            counter = counter_candid.extend_to_fit(stage_inputs, stage_outputs,
-                                                   compression_goal)
+            counter = counter_candid.extend_to_fit(stage_inputs, stage_outputs, compression_goal)
             counters.append(counter)
-        
+
         try:
-            return max((c for c in counters
-                    if c is not None), key = lambda x: (x.efficiency, x.strength))
+            return max(
+                (c for c in counters if c is not None), key=lambda x: (x.efficiency, x.strength)
+            )
         except ValueError:
-            raise ValueError(f"Could not schedule counter for input shape"
-                             f"{stage_inputs}; output shape {stage_outputs}; "
-                             "compression goal {compression_goal(0)}")
+            raise ValueError(
+                f"Could not schedule counter for input shape "
+                f"{stage_inputs}; output shape {stage_outputs}; "
+                f"compression goal {compression_goal(0)}"
+            )
 
     def compression_goal_reached(self, shape, compression_goal):
-        return all([col <= compression_goal(idx)
-                    for idx, col in enumerate(shape)])
+        return all(col <= compression_goal(idx) for idx, col in enumerate(shape))
 
-    
     def get_best_inlined_counter(self, input_shape, gates, absorption_counters):
         candidates = []
         for counter in absorption_counters:
@@ -163,21 +180,23 @@ def get_best_inlined_counter(self, input_shape, gates, absorption_counters):
                 candidates.append(candidate)
         return max(candidates, key=lambda x: (x.efficiency, x.strength))
 
-    def construct_absorption_stage(self,
-                                   input_shape: Shape,
-                                   gates: List[str],
-                                   absorption_counters: GateAbsorptionCounterCandidate
-                                   ):
+    def construct_absorption_stage(
+        self,
+        input_shape: Shape,
+        gates: List[str],
+        absorption_counters: GateAbsorptionCounterCandidate,
+    ):
         s = GateAbsorbedStage()
         cur_shape = input_shape
         cur_gates = gates[:]
         for idx in range(len(input_shape)):
             while cur_shape[idx] > 0:
                 best = self.get_best_inlined_counter(
-                    cur_shape[idx:], cur_gates[idx:], absorption_counters)
+                    cur_shape[idx:], cur_gates[idx:], absorption_counters
+                )
                 cur_shape = cur_shape - (best.input_shape << idx)
                 for i in range(len(cur_shape)):
-                    new = list(reversed(list(reversed(cur_gates[i]))[:cur_shape[i]]))
+                    new = list(reversed(list(reversed(cur_gates[i]))[: cur_shape[i]]))
                     cur_gates[i] = new
                 s.append_counter(best, idx)
-        return s
\ No newline at end of file
+        return s
diff --git a/src/finn/compressor/src/passes/compressor_pipeliner.py b/src/finn/compressor/src/passes/compressor_pipeliner.py
index b0a1e80163..3a291d972f 100644
--- a/src/finn/compressor/src/passes/compressor_pipeliner.py
+++ b/src/finn/compressor/src/passes/compressor_pipeliner.py
@@ -6,7 +6,8 @@
 # @brief    Compressor tree pipelining pass
 #############################################################################
 
-from ..graph.nodes import Compressor, CompressionStage, PipelineStage
+from ..graph.nodes import CompressionStage, Compressor, PipelineStage
+
 
 class CompressorPipeliner:
     def pipeline(self, c: Compressor, max_combinational_depth: int):
@@ -18,8 +19,11 @@ def pipeline(self, c: Compressor, max_combinational_depth: int):
             if isinstance(stage, CompressionStage):
                 new_stages.append(stage)
                 cur_depth += 1
-                if (cur_depth >= max_combinational_depth or 
-                    cur_depth >= max_combinational_depth-1 and idx == len(c.stages)-1):
+                if (
+                    cur_depth >= max_combinational_depth
+                    or cur_depth >= max_combinational_depth - 1
+                    and idx == len(c.stages) - 1
+                ):
                     new_stages.append(PipelineStage(stage.output_shape))
                     cur_depth = 0
                     pipeline_stages += 1
@@ -30,4 +34,4 @@ def pipeline(self, c: Compressor, max_combinational_depth: int):
         for p, n in zip(c.stages, c.stages[1:]):
             p.connect_to(n)
 
-        return pipeline_stages
\ No newline at end of file
+        return pipeline_stages
diff --git a/src/finn/compressor/src/passes/cost_estimator.py b/src/finn/compressor/src/passes/cost_estimator.py
index 859504a63a..5d0f6f0514 100644
--- a/src/finn/compressor/src/passes/cost_estimator.py
+++ b/src/finn/compressor/src/passes/cost_estimator.py
@@ -6,14 +6,20 @@
 # @brief    Cost estimation pass for compressor resources
 #############################################################################
 
-from ..graph.nodes import CompressionStage, Compressor, GateAbsorbedStage, PipelineStage
-from ..graph.nodes import Blackbox
-from ..graph.primitives import LUT6, LUT6_2, LUT6CY, LUT5, LUT2, LUT
+from ..graph.nodes import (
+    Blackbox,
+    CompressionStage,
+    Compressor,
+    GateAbsorbedStage,
+    PipelineStage,
+)
+from ..graph.primitives import LUT, LUT2, LUT5, LUT6, LUT6_2, LUT6CY
 from .node_iterator import NodeIterator
 
+
 class CostEstimator(NodeIterator):
     def iter_compressor(self, c: Compressor):
-        self.combinatorial_stages = -1 # Start with -1 to exclude final adder
+        self.combinatorial_stages = -1  # Start with -1 to exclude final adder
         self.pipeline_stages = 0
         self.luts = 0
 
@@ -32,4 +38,4 @@ def iter_blackbox(self, b: Blackbox):
         elif isinstance(b, LUT6) or isinstance(b, LUT6CY) or isinstance(b, LUT6_2):
             self.luts += 1
         elif isinstance(b, LUT):
-            raise RuntimeError("No cost function implemented for this LUT type {b}")
\ No newline at end of file
+            raise RuntimeError(f"No cost function implemented for this LUT type {b}")
diff --git a/src/finn/compressor/src/passes/emitter.py b/src/finn/compressor/src/passes/emitter.py
index 421b0f1379..5b38a2c9a3 100644
--- a/src/finn/compressor/src/passes/emitter.py
+++ b/src/finn/compressor/src/passes/emitter.py
@@ -6,18 +6,37 @@
 # @brief    Verilog emitter for compressor tree
 #############################################################################
 
-from io import StringIO
-from contextlib import contextmanager
 from collections import defaultdict
+from contextlib import contextmanager
+from io import StringIO
 from typing import Tuple
-from ..graph.primitives import BlackboxInput, BlackboxInputVec, BlackboxOutput
-from ..graph.primitives import BlackboxOutputVec
-from ..graph.visitor import Visitor
-from ..graph.nodes import Bitmatrix, Counter, CompressionStage, Compressor, InputStage
-from ..graph.nodes import PipelineStage, Wire, BlackboxPort, Logic, BlackboxVecElement
-from ..graph.nodes import Connectable, GateAbsorbedStage, Blackbox, BitmatrixElement
-from ..graph.nodes import Constant
+
 from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    Bitmatrix,
+    BitmatrixElement,
+    Blackbox,
+    BlackboxPort,
+    BlackboxVecElement,
+    CompressionStage,
+    Compressor,
+    Connectable,
+    Constant,
+    Counter,
+    GateAbsorbedStage,
+    InputStage,
+    Logic,
+    PipelineStage,
+    Wire,
+)
+from ..graph.primitives import (
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+)
+from ..graph.visitor import Visitor
+
 
 class VerilogEmitter:
     def __init__(self):
@@ -25,13 +44,13 @@ def __init__(self):
         self._indent_level = 0
         self._line_start = True
 
-    def emit(self, line = ""):
+    def emit(self, line=""):
         if self._line_start:
             self._out.write(self._indent_level * "\t")
         self._line_start = False
         self._out.write(line)
 
-    def emitln(self, line = ""):
+    def emitln(self, line=""):
         if self._line_start:
             self._out.write(self._indent_level * "\t")
         self._out.write(line + "\n")
@@ -54,6 +73,7 @@ def save_verilog(self, filename):
         with open(filename, "w") as f:
             f.writelines(self._out)
 
+
 class VerilogGenerator(Visitor):
     def set_name(self, o: object, name):
         self._names[type(o)][o] = name
@@ -103,14 +123,24 @@ def visit_compressor(self, c: Compressor):
 
         self.emitter.emitln(f"module {c.module_name}(")
         with self.emitter.indent:
-            names = sorted(["input clk"] + 
-                           [el.prefix + ("logic " if isinstance(el, Logic) else 
-                                         f"[{el.total_size()-1}:0] "
-                                         if isinstance(el, Bitmatrix) else
-                                         "") + self.get_name(el) for el in c.io],
-                           key=lambda x: "input" not in x)
+            names = sorted(
+                ["input clk"]
+                + [
+                    el.prefix
+                    + (
+                        "logic "
+                        if isinstance(el, Logic)
+                        else f"[{el.total_size()-1}:0] "
+                        if isinstance(el, Bitmatrix)
+                        else ""
+                    )
+                    + self.get_name(el)
+                    for el in c.io
+                ],
+                key=lambda x: "input" not in x,
+            )
             [self._declared_hardware.add(el) for el in c.io]
-            
+
             self.emitter.emitln(",\n\t".join(names))
         self.emitter.emitln(");")
 
@@ -131,11 +161,12 @@ def visit_accumulator_stage(self, a: AccumulatorStage):
         self.emitter.emitln()
         self.emitter.emitln("// Accumulator Stage")
         a.input_wires.accept(self)
-        [el.accept(self) for el in
-         sorted(a.instances, key=lambda x: (not isinstance(x, Connectable)))]
+        [
+            el.accept(self)
+            for el in sorted(a.instances, key=lambda x: (not isinstance(x, Connectable)))
+        ]
         a.output_wires.accept(self)
 
-
     def visit_pipeline_stage(self, s: PipelineStage):
         self.emitter.emitln()
         self.emitter.emitln("// Pipeline Results..")
@@ -145,8 +176,10 @@ def visit_pipeline_stage(self, s: PipelineStage):
 
     def visit_compression_stage(self, s: CompressionStage):
         self.emitter.emitln()
-        self.emitter.emitln(f"// Compression Stage with Input Shape: {s.input_shape} "
-                            f"and Output Shape {s.output_shape}")
+        self.emitter.emitln(
+            f"// Compression Stage with Input Shape: {s.input_shape} "
+            f"and Output Shape {s.output_shape}"
+        )
         s.input_wires.accept(self)
         [c.accept(self) for c, _ in s.counters_with_shifts]
         s.output_wires.accept(self)
@@ -155,8 +188,9 @@ def visit_compression_stage(self, s: CompressionStage):
     def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
         self.emitter.emitln()
         self.emitter.emitln("// Compression Stage with Gate Absorption.")
-        self.emitter.emitln(f"// Input Shape: {g.input_shape} "
-                            f"and Output Shape: {g.output_shape}")
+        self.emitter.emitln(
+            f"// Input Shape: {g.input_shape} " f"and Output Shape: {g.output_shape}"
+        )
         g.input_wires.accept(self)
         g.input_wires_complementary.accept(self)
         [c.accept(self) for c, _ in g.counters_with_shifts]
@@ -166,8 +200,10 @@ def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
     def visit_counter(self, c: Counter):
         [el.accept(self) for col in c.input_wires for el in col]
         [el.accept(self) for col in c.output_wires for el in col]
-        [el.accept(self) for el in 
-         sorted(c.instances, key=lambda x: not isinstance(x, Connectable))]
+        [
+            el.accept(self)
+            for el in sorted(c.instances, key=lambda x: not isinstance(x, Connectable))
+        ]
 
     def visit_gate_absorption_counter(self, c: GateAbsorbedStage):
         [el.accept(self) for col in c.input_wires_complementary for el in col]
@@ -184,32 +220,33 @@ def visit_wire(self, w: Wire):
         if w.has_source not in self._declared_hardware and isinstance(w.source, Wire):
             w.source.accept(self)
 
-        if (w.has_source and isinstance(w.source, Connectable) and
-            not isinstance(w.source, BlackboxPort) and
-            not isinstance(w.source, BlackboxVecElement)):
-            self.emitter.emitln(
-                f"assign {self.get_name(w)} = {self.get_name(w.source)};")
+        if (
+            w.has_source
+            and isinstance(w.source, Connectable)
+            and not isinstance(w.source, BlackboxPort)
+            and not isinstance(w.source, BlackboxVecElement)
+        ):
+            self.emitter.emitln(f"assign {self.get_name(w)} = {self.get_name(w.source)};")
         self._emitted_hardware.add(w)
 
     def visit_logic(self, lgc: Logic):
-        if lgc in self._emitted_hardware: 
+        if lgc in self._emitted_hardware:
             return
-        
+
         if lgc not in self._declared_hardware:
             self.emitter.emit(lgc.prefix)
             init_str = f" = 1'b{lgc.init}" if lgc.init is not None else ""
             self.emitter.emitln(
-                f'(* srl_style = "register" *) logic {self.get_name(lgc)}{init_str};')
+                f'(* srl_style = "register" *) logic {self.get_name(lgc)}{init_str};'
+            )
         self._declared_hardware.add(lgc)
 
-        if (lgc.has_source not in self._declared_hardware and 
-            isinstance(lgc.source, Wire)):
+        if lgc.has_source not in self._declared_hardware and isinstance(lgc.source, Wire):
             lgc.source.accept(self)
 
-        def emit_inner(): 
+        def emit_inner():
             if lgc.source:
-                self.emitter.emitln(
-                    f"{self.get_name(lgc)} <= {self.get_name(lgc.source)};")
+                self.emitter.emitln(f"{self.get_name(lgc)} <= {self.get_name(lgc.source)};")
 
         def emit_with_en():
             if lgc.en:
@@ -217,7 +254,7 @@ def emit_with_en():
                 with self.emitter.indent:
                     emit_inner()
                 self.emitter.emitln("end")
-            else: 
+            else:
                 emit_inner()
 
         def emit_with_rst_and_en():
@@ -241,7 +278,7 @@ def emit_with_rst_and_en():
                 with self.emitter.indent:
                     emit_inner()
                 self.emitter.emitln("end")
-            else: 
+            else:
                 emit_with_en()
 
         self.emitter.emitln("always_ff @(posedge clk) begin")
@@ -256,13 +293,13 @@ def visit_blackbox(self, b: Blackbox):
         self.emitter.emitln(f"{b.module_name} #(")
         with self.emitter.indent:
             for idx, (key, value) in enumerate(b.parameters.items()):
-                ending = "," if idx != len(b.parameters)-1 else ""
+                ending = "," if idx != len(b.parameters) - 1 else ""
                 self.emitter.emitln(f".{key}({value}){ending}")
         self.emitter.emitln(f") {self.get_name(b)} (")
         with self.emitter.indent:
             ports = b.out_ports + b.in_ports
             for idx, port in enumerate(ports):
-                ending = "," if idx != len(ports)-1 else ""
+                ending = "," if idx != len(ports) - 1 else ""
                 port.accept(self)
                 self.emitter.emitln(ending)
         self.emitter.emitln(");")
@@ -279,19 +316,17 @@ def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
         targets = [self.get_name(el.target) for el in b.elements[::-1] if el.target]
         self.emitter.emit(", ".join(targets))
         self.emitter.emit("})")
-    
+
     def visit_blackbox_input(self, b: BlackboxInput):
         if b.has_source:
             self.emitter.emit(f".{b.name}({self.get_name(b.source)})")
         else:
             self.emitter.emit(f".{b.name}(1'b0)")
-    
+
     def visit_blackbox_input_vec(self, b: BlackboxInputVec):
         self.emitter.emit(f".{b.name}(")
         self.emitter.emit("{")
-        sources = [self.get_name(el.source) 
-                   if el.source else "1'b0" 
-                   for el in b.elements[::-1]]
+        sources = [self.get_name(el.source) if el.source else "1'b0" for el in b.elements[::-1]]
         self.emitter.emit(", ".join(sources))
         self.emitter.emit("})")
 
@@ -299,19 +334,24 @@ def emit_blackbox_ports(self, p: Tuple[BlackboxPort]):
         for idx, port in enumerate(p):
             seperator = "," if idx != len(p) - 1 else ""
             if port.connected:
-                self.emitter.emitln(f".{self.get_name(port)}({self.get_name(port.wire)}){seperator}")
+                self.emitter.emitln(
+                    f".{self.get_name(port)}({self.get_name(port.wire)}){seperator}"
+                )
             elif isinstance(port, BlackboxInput):
                 self.emitter.emitln(f".{self.get_name(port)}(1'b0){seperator}")
             else:
                 self.emitter.emitln(f".{self.get_name(port)}(){seperator}")
-    
+
     def visit_bitmatrix(self, b: Bitmatrix):
         if b not in self._declared_hardware:
             self.emitter.emitln(f"uwire [{b.total_size()-1}:0] {self.get_name(b)};")
             self._declared_hardware.add(b)
-        
-        if b not in self._emitted_hardware:    
-            [self.emitter.emitln(
-                f"assign {self.get_name(el)} = {self.get_name(el.source)};")
-             for col in b for el in col if el.has_source]
-            self._emitted_hardware.add(b)
\ No newline at end of file
+
+        if b not in self._emitted_hardware:
+            [
+                self.emitter.emitln(f"assign {self.get_name(el)} = {self.get_name(el.source)};")
+                for col in b
+                for el in col
+                if el.has_source
+            ]
+            self._emitted_hardware.add(b)
diff --git a/src/finn/compressor/src/passes/io_annotator.py b/src/finn/compressor/src/passes/io_annotator.py
index e41d077864..c5e0b66b04 100644
--- a/src/finn/compressor/src/passes/io_annotator.py
+++ b/src/finn/compressor/src/passes/io_annotator.py
@@ -6,9 +6,10 @@
 # @brief    Input/output annotation pass for compressor
 #############################################################################
 
-from ..graph.nodes import Compressor, Logic, Wire, Bitmatrix
+from ..graph.nodes import Bitmatrix, Compressor, Logic, Wire
 from .node_iterator import NodeIterator
 
+
 class IOAnnotator(NodeIterator):
     def visit_compressor(self, c: Compressor):
         input_wires = c.stages[0].input_wires
@@ -34,21 +35,26 @@ def visit_compressor(self, c: Compressor):
         output_wires.name = "out"
 
         c.io = self.get_all_io(c)
-        
+
     def get_all_io(self, c: Compressor):
         finder = IOFinder()
         c.accept(finder)
-        return list(set(finder.io))        
+        return list(set(finder.io))
+
 
 class IOFinder(NodeIterator):
     def iter_compressor(self, c: Compressor):
         self.connectables = []
 
     @property
-    def io(self): return [el for el in self.connectables if el.prefix]
+    def io(self):
+        return [el for el in self.connectables if el.prefix]
+
+    def iter_wire(self, w: Wire):
+        self.connectables.append(w)
 
-    def iter_wire(self, w: Wire): self.connectables.append(w)
-    
-    def iter_logic(self, lgc: Logic): self.connectables.append(lgc)
+    def iter_logic(self, lgc: Logic):
+        self.connectables.append(lgc)
 
-    def iter_bitmatrix(self, b: Bitmatrix): self.connectables.append(b)
+    def iter_bitmatrix(self, b: Bitmatrix):
+        self.connectables.append(b)
diff --git a/src/finn/compressor/src/passes/lut_placer.py b/src/finn/compressor/src/passes/lut_placer.py
index ec8c2cabc2..1501fa5053 100644
--- a/src/finn/compressor/src/passes/lut_placer.py
+++ b/src/finn/compressor/src/passes/lut_placer.py
@@ -6,14 +6,15 @@
 # @brief    RLOC placement annotation for compressor LUTs
 #############################################################################
 
-from .node_iterator import NodeIterator
+from ..graph.final_adder import FinalAdder
 from ..graph.nodes import Compressor, Counter, GateAbsorptionCounter
 from ..graph.primitives import LUT6CY
-from ..graph.final_adder import FinalAdder
+from .node_iterator import NodeIterator
+
 
 class LUTPlacer(NodeIterator):
     def iter_compressor(self, c: Compressor):
-        self.occupations = [] # Reset placement state for every compressor
+        self.occupations = []  # Reset placement state for every compressor
 
     def iter_counter(self, c: Counter):
         # Place LUT6CY instances manually.
@@ -30,7 +31,7 @@ def _get_ripple_connected_luts(self, c: Counter):
             # which restricts enforces correct placement itself.
             return []
 
-        lut6cy_i4s =  {lut.I4:  lut for lut in c.luts if isinstance(lut, LUT6CY)}
+        lut6cy_i4s = {lut.I4: lut for lut in c.luts if isinstance(lut, LUT6CY)}
         lut6cy_o52s = {lut.O52: lut for lut in c.luts if isinstance(lut, LUT6CY)}
 
         lut_output_to_lut_input = {}
@@ -40,8 +41,7 @@ def _get_ripple_connected_luts(self, c: Counter):
                 target_lut = lut6cy_o52s[input.source]
                 lut_output_to_lut_input[input_lut] = target_lut
 
-        lut_heads = (set(lut_output_to_lut_input.keys()) - 
-                     set(lut_output_to_lut_input.values()))
+        lut_heads = set(lut_output_to_lut_input.keys()) - set(lut_output_to_lut_input.values())
         chains = []
 
         for lut_head in lut_heads:
@@ -51,7 +51,7 @@ def _get_ripple_connected_luts(self, c: Counter):
             chains.append(cur[::-1])
 
         return chains
-    
+
     def _calculate_and_annotate_placements(self, cascades):
         for cascade in cascades:
             for idx, slice_util in enumerate(self.occupations):
@@ -61,7 +61,7 @@ def _calculate_and_annotate_placements(self, cascades):
                     break
             else:
                 self.occupations.append(len(cascade))
-                self._annotate_placements(cascade, len(self.occupations)-1, 0)
+                self._annotate_placements(cascade, len(self.occupations) - 1, 0)
 
     def _annotate_placements(self, cascade, hu_set, start_idx):
         """Annotate LUT6CY placement constraints for carry chain packing.
@@ -82,4 +82,4 @@ def _annotate_placements(self, cascade, hu_set, start_idx):
             lut.annotate(f'RLOC = "X0Y{hu_set}"')  # Increment Y per SLICE to avoid conflicts
             lut.annotate(f'BEL = "{bel_str}"')
             lut.annotate('DONT_TOUCH = "yes"')
-            lut.annotate('IS_BEL_FIXED = "yes"')
\ No newline at end of file
+            lut.annotate('IS_BEL_FIXED = "yes"')
diff --git a/src/finn/compressor/src/passes/node_iterator.py b/src/finn/compressor/src/passes/node_iterator.py
index 4b0f399e35..919565afe2 100644
--- a/src/finn/compressor/src/passes/node_iterator.py
+++ b/src/finn/compressor/src/passes/node_iterator.py
@@ -6,19 +6,33 @@
 # @brief    Node iterator pass for compressor graph traversal
 #############################################################################
 
+from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    Bitmatrix,
+    Blackbox,
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    Logic,
+    PipelineStage,
+    Wire,
+)
 from ..graph.primitives import LOOKAHEAD8
 from ..graph.visitor import Visitor
-from ..graph.nodes import Counter, CompressionStage, Compressor, InputStage, PipelineStage
-from ..graph.nodes import Blackbox, Wire, Logic, Bitmatrix, GateAbsorbedStage
-from ..graph.nodes import GateAbsorptionCounter, BlackboxInput, BlackboxOutput
-from ..graph.nodes import BlackboxInputVec, BlackboxOutputVec
-from ..graph.accumulator import AccumulatorStage
+
 
 class NodeIterator(Visitor):
-    def visit_compressor(self, c: Compressor): 
+    def visit_compressor(self, c: Compressor):
         self.iter_compressor(c)
         [s.accept(self) for s in c.stages]
-    
+
     def visit_input_stage(self, s: InputStage):
         self.iter_input_stage(s)
         s.input_wires.accept(self)
@@ -63,7 +77,7 @@ def visit_gate_absorption_counter(self, g: GateAbsorptionCounter):
         [el.accept(self) for col in g.input_wires_complementary for el in col]
         [el.accept(self) for col in g.output_wires for el in col]
         [el.accept(self) for el in g.instances]
-    
+
     def visit_blackbox(self, b: Blackbox):
         self.iter_blackbox(b)
         [p.accept(self) for p in b.in_ports + b.out_ports]
@@ -84,40 +98,59 @@ def visit_lookahead8(self, l8: LOOKAHEAD8):
         self.iter_lookahead8(l8)
         self.visit_blackbox(l8)
 
-    def visit_wire(self, w: Wire): self.iter_wire(w)
+    def visit_wire(self, w: Wire):
+        self.iter_wire(w)
+
+    def visit_logic(self, lgc: Logic):
+        self.iter_logic(lgc)
 
-    def visit_logic(self, lgc: Logic): self.iter_logic(lgc)
+    def visit_bitmatrix(self, b: Bitmatrix):
+        self.iter_bitmatrix(b)
 
-    def visit_bitmatrix(self, b: Bitmatrix): self.iter_bitmatrix(b)
+    def iter_compressor(self, c: Compressor):
+        pass
 
-    def iter_compressor(self, c: Compressor): pass
-    
-    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage): pass
+    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        pass
 
-    def iter_input_stage(self, s: InputStage): pass
+    def iter_input_stage(self, s: InputStage):
+        pass
 
-    def iter_accumulator_stage(self, a: AccumulatorStage): pass
+    def iter_accumulator_stage(self, a: AccumulatorStage):
+        pass
 
-    def iter_pipeline_stage(self, s: PipelineStage): pass
+    def iter_pipeline_stage(self, s: PipelineStage):
+        pass
 
-    def iter_compression_stage(self, s: CompressionStage): pass
+    def iter_compression_stage(self, s: CompressionStage):
+        pass
 
-    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter): pass
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        pass
 
-    def iter_counter(self, c: Counter): pass
+    def iter_counter(self, c: Counter):
+        pass
 
-    def iter_blackbox(self, b: Blackbox): pass
+    def iter_blackbox(self, b: Blackbox):
+        pass
 
-    def iter_wire(self, w: Wire): pass
+    def iter_wire(self, w: Wire):
+        pass
 
-    def iter_logic(self, lgc: Logic): pass
+    def iter_logic(self, lgc: Logic):
+        pass
 
-    def iter_bitmatrix(self, b: Bitmatrix): pass
+    def iter_bitmatrix(self, b: Bitmatrix):
+        pass
 
-    def iter_blackbox_input(self, b: BlackboxInput): pass
+    def iter_blackbox_input(self, b: BlackboxInput):
+        pass
 
-    def iter_blackbox_output(self, b: BlackboxOutput): pass
+    def iter_blackbox_output(self, b: BlackboxOutput):
+        pass
 
-    def iter_blackbox_input_vec(self, b: BlackboxInputVec): pass
+    def iter_blackbox_input_vec(self, b: BlackboxInputVec):
+        pass
 
-    def iter_blackbox_output_vec(self, b: BlackboxOutputVec): pass
\ No newline at end of file
+    def iter_blackbox_output_vec(self, b: BlackboxOutputVec):
+        pass
diff --git a/src/finn/compressor/src/passes/printer.py b/src/finn/compressor/src/passes/printer.py
index 2ebcabe23f..a2386fbb72 100644
--- a/src/finn/compressor/src/passes/printer.py
+++ b/src/finn/compressor/src/passes/printer.py
@@ -6,11 +6,19 @@
 # @brief    Compressor tree printer for debugging
 #############################################################################
 
-from ..graph.nodes import Counter, CompressionStage, Compressor, GateAbsorbedStage
-from ..graph.nodes import GateAbsorptionCounter, InputStage, PipelineStage
 from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    PipelineStage,
+)
 from ..graph.visitor import Visitor
 
+
 class CompressorPrinter(Visitor):
     def visit_compressor(self, c: Compressor):
         print(f"Compressor <Input: {c.input_shape}, Output: {c.output_shape}> [")
@@ -21,15 +29,14 @@ def visit_compressor(self, c: Compressor):
     def visit_compression_stage(self, s: CompressionStage):
         print(f"\tStage: <in: {s.input_shape}, out: {s.output_shape}> [")
         for counter, shift in s.counters_with_shifts:
-            print(f"\t\t[xshift={shift:2}] ",end="")
+            print(f"\t\t[xshift={shift:2}] ", end="")
             counter.accept(self)
         print("\t]")
 
     def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
-        print(f"\tStage with Gate Absorption: <in {s.input_shape}, "
-              f"out: {s.output_shape}> [")
+        print(f"\tStage with Gate Absorption: <in {s.input_shape}, " f"out: {s.output_shape}> [")
         for counter, shift in s.counters_with_shifts:
-            print(f"\t\t[xshift={shift:2}] ",end="")
+            print(f"\t\t[xshift={shift:2}] ", end="")
             counter.accept(self)
         print("\t]")
 
@@ -47,8 +54,8 @@ def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
 
     def visit_accumulator_stage(self, a: AccumulatorStage):
         print(f"\tAccumulator: <in: {a.input_shape}, out: {a.output_shape}> [")
-        print("\t\t",end="")
+        print("\t\t", end="")
         for i in a.instances:
             if isinstance(i, Counter):
                 i.accept(self)
-        print("\t]")
\ No newline at end of file
+        print("\t]")
diff --git a/src/finn/compressor/src/passes/wire_inserter.py b/src/finn/compressor/src/passes/wire_inserter.py
index 6865b1cf3d..b80192f8c5 100644
--- a/src/finn/compressor/src/passes/wire_inserter.py
+++ b/src/finn/compressor/src/passes/wire_inserter.py
@@ -6,10 +6,11 @@
 # @brief    Wire insertion pass for compressor graph
 #############################################################################
 
+from ..graph.nodes import Blackbox, Counter, GateAbsorptionCounter, Wire
 from .node_iterator import NodeIterator
-from ..graph.nodes import Blackbox, Counter, Wire, GateAbsorptionCounter
 
-# Blackbox outputs might be connected to other blackbox inputs. 
+
+# Blackbox outputs might be connected to other blackbox inputs.
 # To express this in verilog, an extra intermediate wire has to
 # be created between the blackboxes. This path adds it.
 class WireInserter(NodeIterator):
@@ -19,7 +20,7 @@ def iter_counter(self, c: Counter):
             for output in bbox.out_ports:
                 self.insert_wire_at_blackbox_output(output, c)
 
-    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter): 
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
         self.iter_counter(g)
 
     def insert_wire_at_blackbox_output(self, output, counter):
@@ -27,14 +28,14 @@ def insert_wire_at_blackbox_output(self, output, counter):
             for el in output.elements:
                 self.insert_wire_at_blackbox_output(el, counter)
             return
-            
+
         if len(output.target) == 1 and isinstance(output.target[0], Wire):
             output.target = output.target[0]
             return
-        
+
         out_wire = Wire()
         for input in output.target:
             out_wire.connect_to(input)
 
         output.target = out_wire
-        counter.instances.append(out_wire)
\ No newline at end of file
+        counter.instances.append(out_wire)
diff --git a/src/finn/compressor/src/target.py b/src/finn/compressor/src/target.py
index d526fdbbb0..32f1d45818 100644
--- a/src/finn/compressor/src/target.py
+++ b/src/finn/compressor/src/target.py
@@ -7,22 +7,30 @@
 #############################################################################
 
 from abc import ABC
-from .graph.counters.counter_candidates import CounterCandidate, FACandidate
-from .graph.counters.counter_candidates import MuxCYAtomCascadeCandidate
-from .graph.counters.counter_candidates import RippleSumCandidate
-from .graph.counters.counter_candidates import DualRailRippleSumCandidate
-from .graph.counters.counter_candidates import FiveTwoCandidate 
-from .graph.counters.counter_candidates import VersalAtomCascadeCandidate
-from .graph.counters.counter_candidates import SixThreeCandidate, TenSixCandidate
-from .graph.counters.absorption_counter_candidates import GateAbsorptionCounterCandidate
-from .graph.counters.absorption_counter_candidates import VersalPredAdderCandidate
-from .graph.counters.absorption_counter_candidates import RippleSumPredAdderCandidate
-from .graph.counters.absorption_counter_candidates import SinglePredCandidate
-from .graph.counters.absorption_counter_candidates import MuxCYPredAdderCandidate
-from .graph.counters.absorption_counter_candidates import MuxCYRippleSumCandidate
-from .graph.final_adder import MuxCYTernaryAdder, FinalAdder, QuaternaryAdder
 from typing import List
 
+from .graph.counters.absorption_counter_candidates import (
+    GateAbsorptionCounterCandidate,
+    MuxCYPredAdderCandidate,
+    MuxCYRippleSumCandidate,
+    RippleSumPredAdderCandidate,
+    SinglePredCandidate,
+    VersalPredAdderCandidate,
+)
+from .graph.counters.counter_candidates import (
+    CounterCandidate,
+    DualRailRippleSumCandidate,
+    FACandidate,
+    FiveTwoCandidate,
+    MuxCYAtomCascadeCandidate,
+    RippleSumCandidate,
+    SixThreeCandidate,
+    TenSixCandidate,
+    VersalAtomCascadeCandidate,
+)
+from .graph.final_adder import FinalAdder, MuxCYTernaryAdder, QuaternaryAdder
+
+
 def resolve_target(fpgapart):
     """Map a Vivado FPGA part string to a compressor Target object.
 
@@ -33,7 +41,8 @@ def resolve_target(fpgapart):
     versal_prefixes_5 = ("xqrvc", "xcv80")
     if fpgapart[0:4] in versal_prefixes_4 or fpgapart[0:5] in versal_prefixes_5:
         return Versal()
-    # UltraScale/UltraScale+ prefixes: Kintex US (xcku), Virtex US (xcvu), Zynq US (xczu), defense (xqzu)
+    # UltraScale/UltraScale+ prefixes:
+    # Kintex US (xcku), Virtex US (xcvu), Zynq US (xczu), defense (xqzu)
     ultrascale_prefixes = ("xcku", "xcvu", "xczu", "xqzu")
     if fpgapart[0:4] in ultrascale_prefixes:
         return UltraScale()
@@ -49,7 +58,9 @@ def resolve_target_name(name):
     elif name == "UltraScale":
         return UltraScale()
     else:
-        raise ValueError(f"Unsupported target: {name!r}. Choose from: ['Versal', '7-Series', 'UltraScale']")
+        raise ValueError(
+            f"Unsupported target: {name!r}. Choose from: ['Versal', '7-Series', 'UltraScale']"
+        )
 
 
 class Target(ABC):
@@ -57,6 +68,7 @@ class Target(ABC):
     final_adder: FinalAdder
     absorbing_counter_candidates: List[GateAbsorptionCounterCandidate]
 
+
 class Versal(Target):
     def __init__(self):
         self.counter_candidates = [
@@ -66,7 +78,7 @@ def __init__(self):
             DualRailRippleSumCandidate(),
             FiveTwoCandidate(),
             SixThreeCandidate(),
-            VersalAtomCascadeCandidate()
+            VersalAtomCascadeCandidate(),
         ]
         self.absorbing_counter_candidates = [
             VersalPredAdderCandidate(),
@@ -75,10 +87,15 @@ def __init__(self):
         ]
         self.final_adder = QuaternaryAdder
 
+
 class SevenSeries(Target):
     def __init__(self):
-        self.counter_candidates = [FACandidate(), FiveTwoCandidate(),
-                                   SixThreeCandidate(), MuxCYAtomCascadeCandidate()]
+        self.counter_candidates = [
+            FACandidate(),
+            FiveTwoCandidate(),
+            SixThreeCandidate(),
+            MuxCYAtomCascadeCandidate(),
+        ]
         self.final_adder = MuxCYTernaryAdder
         self.absorbing_counter_candidates = [
             MuxCYPredAdderCandidate(),
@@ -86,17 +103,23 @@ def __init__(self):
             SinglePredCandidate(),
         ]
 
+
 class UltraScale(Target):
     """UltraScale/UltraScale+ - reuses 7-Series primitives.
 
     Vivado maps CARRY4 to CARRY8 transparently.
     """
+
     def __init__(self):
-        self.counter_candidates = [FACandidate(), FiveTwoCandidate(),
-                                   SixThreeCandidate(), MuxCYAtomCascadeCandidate()]
+        self.counter_candidates = [
+            FACandidate(),
+            FiveTwoCandidate(),
+            SixThreeCandidate(),
+            MuxCYAtomCascadeCandidate(),
+        ]
         self.final_adder = MuxCYTernaryAdder
         self.absorbing_counter_candidates = [
             MuxCYPredAdderCandidate(),
             MuxCYRippleSumCandidate(),
             SinglePredCandidate(),
-        ]
\ No newline at end of file
+        ]
diff --git a/src/finn/compressor/src/tests/test_gen.py b/src/finn/compressor/src/tests/test_gen.py
index a0116526aa..cfb95ceafd 100644
--- a/src/finn/compressor/src/tests/test_gen.py
+++ b/src/finn/compressor/src/tests/test_gen.py
@@ -6,21 +6,31 @@
 # @brief    Test vector generation for compressor verification
 #############################################################################
 
-from ..utils.shape import Shape
 from itertools import accumulate
 from typing import List
 
+from ..utils.shape import Shape
+
+
 def compressed_width(shape):
     max = sum([col * (1 << idx) for idx, col in enumerate(shape)])
     return max.bit_length()
 
+
 def flatten_gates(gates: List[List[str]]) -> List[str]:
     return [el for col in gates for el in col]
 
-def generate_test(shape: Shape, module_name: str, pipeline_stages: int, 
-                  gates: List[List[str]], accumulation: bool, accumulator_width: int,
-                  constant: int):
-    assert(type(pipeline_stages) == int)
+
+def generate_test(
+    shape: Shape,
+    module_name: str,
+    pipeline_stages: int,
+    gates: List[List[str]],
+    accumulation: bool,
+    accumulator_width: int,
+    constant: int,
+):
+    assert type(pipeline_stages) == int
 
     if gates:
         gates = flatten_gates(gates)
@@ -35,33 +45,36 @@ def generate_test(shape: Shape, module_name: str, pipeline_stages: int,
     addends = "\n".join(addends)
 
     if gates:
-        preds = "".join([f"\tlocalparam pred_{idx} = 4'h{gate};\n" 
-                         for idx, gate in enumerate(gates)])
-        selects = "".join([f"\tlogic [3:0] sel_{idx};\n" 
-                           for idx, _ in enumerate(gates)])
-        arr_ins = "".join([
-            f"\t\tsel_{i} = (arr_in_b[{i}]<<1) | arr_in_a[{i}];\n" + 
-            f"\t\tarr_in[{i}] = pred_{i}[sel_{i}];\n"
-            for i, _ in enumerate(gates)])
-        gates_decl = (f"\tlogic [{sum(shape)-1}:0] arr_in_a;" + 
-                      f"\tlogic [{sum(shape)-1}:0] arr_in_b;")
-    accumulator_width = (accumulator_width if accumulator_width 
-                         else compressed_width(shape))
+        preds = "".join(
+            [f"\tlocalparam pred_{idx} = 4'h{gate};\n" for idx, gate in enumerate(gates)]
+        )
+        selects = "".join([f"\tlogic [3:0] sel_{idx};\n" for idx, _ in enumerate(gates)])
+        arr_ins = "".join(
+            [
+                f"\t\tsel_{i} = (arr_in_b[{i}]<<1) | arr_in_a[{i}];\n"
+                + f"\t\tarr_in[{i}] = pred_{i}[sel_{i}];\n"
+                for i, _ in enumerate(gates)
+            ]
+        )
+        gates_decl = (
+            f"\tlogic [{sum(shape)-1}:0] arr_in_a;" + f"\tlogic [{sum(shape)-1}:0] arr_in_b;"
+        )
+    accumulator_width = accumulator_width if accumulator_width else compressed_width(shape)
     acc_decl = f"\tlogic [{accumulator_width-1}:0] acc_base;"
 
-    acc_rst_block = """\t\t\tif (reset == 0) begin 
+    acc_rst_block = """\t\t\tif (reset == 0) begin
 \t\t\t\tacc_base = 0;
-\t\t\tend else begin 
+\t\t\tend else begin
 \t\t\t\tacc_base = reference[0];
 \t\t\tend"""
 
     return (
-f"""module tb;
+        f"""module tb;
 {gates_decl if gates else ""}
 \tlogic [{sum(shape)-1}:0] arr_in;
 \tlogic [{compressed_width(shape)-1}:0] in_reduced;
 \tlogic [{accumulator_width-1}:0] out;
-\tlogic [{accumulator_width-1}:0] reference [{pipeline_stages}:0]; 
+\tlogic [{accumulator_width-1}:0] reference [{pipeline_stages}:0];
 {acc_decl if accumulation else ""}
 \t{"logic [4:0] reset;" if accumulation else ""}
 \t{"logic rst;" if accumulation else ""}
@@ -76,27 +89,27 @@ def generate_test(shape: Shape, module_name: str, pipeline_stages: int,
 
 \t{"always #10ns clk = !clk;" if has_clk else ""}
 
-\talways_comb begin 
-\t\t{"reference[0] = acc_base + in_reduced;" 
+\talways_comb begin
+\t\t{"reference[0] = acc_base + in_reduced;"
      if accumulation else "reference[0] = in_reduced;"}
 \tend
 
-\talways_comb begin 
+\talways_comb begin
 \t\tin_reduced = 0;
 \t\t{"if (en) begin" if accumulation else ""}
 in_reduced += {constant};
 {addends}
 \t\t{"end" if accumulation else ""}
 \tend
-           
+
 \tinitial begin
 \t\t{"acc_base = 0;" if accumulation else ""}
 \t\t{"arr_in_a = 0;" if gates else "arr_in = 0;"}
 \t\t{"arr_in_b = 0;" if gates else ""}
-      
+
 \t\t{"assign rst = reset == 0;" if accumulation else ""}
 \t\t{"reset = 0; #40ns;" if accumulation else ""}
-        
+
 \t\tfor (int i = 0; i < 16000; i += 1) begin
 \t\t\t{"automatic type(reset) xx;" if accumulation else ""}
 \t\t\t{"automatic type(en) zz;" if accumulation else ""}
@@ -131,10 +144,10 @@ def generate_test(shape: Shape, module_name: str, pipeline_stages: int,
 \t\t\t\t\t$error("Mismatch: Ref[%0b] != Out[%0b]", reference[{pipeline_stages}], out);
 \t\t\t\t\t#2ns;
 \t\t\t\t\t$stop;
-\t\t\t\tend 
+\t\t\t\tend
 \t\t\tend
 \t\t#0.01ns;
-        
+
 \t\tend
 \t\t$display("TEST PASSED");
 \t\t$finish();
@@ -147,4 +160,5 @@ def generate_test(shape: Shape, module_name: str, pipeline_stages: int,
     {".en_neg(!en)," if accumulation else ""}
     .out(out));
 endmodule
-""").replace("\n\n", "\n")
\ No newline at end of file
+"""
+    ).replace("\n\n", "\n")
diff --git a/src/finn/compressor/src/tests/tester.py b/src/finn/compressor/src/tests/tester.py
index 3537b97f7a..f2cf3d3f77 100644
--- a/src/finn/compressor/src/tests/tester.py
+++ b/src/finn/compressor/src/tests/tester.py
@@ -6,8 +6,8 @@
 
 """Vivado XSim wrapper for testing generated compressors."""
 
-import subprocess
 import re
+import subprocess
 
 
 def tester(test_loc, comp_loc):
@@ -21,15 +21,16 @@ def tester(test_loc, comp_loc):
         f"""rm -r xsim.dir/ &&
         xvlog -work work -sv ../res/glbl.v {test_loc} {comp_loc} -L unisims_ver --nolog &&
         xelab -L work -L unisims_ver -relax --nolog glbl tb &&
-        xsim --nolog work.glbl#work.tb -R""").replace("\n", " ")
+        xsim --nolog work.glbl#work.tb -R"""
+    ).replace("\n", " ")
     print(args)
     try:
-        ret = subprocess.run(args, capture_output=True, text=True, timeout=300,
-                             shell=True, check=True)
+        ret = subprocess.run(
+            args, capture_output=True, text=True, timeout=300, shell=True, check=True
+        )
     except subprocess.CalledProcessError as e:
         if e.returncode == 127:
-            raise RuntimeError(
-                "Could not call Vivado simulation tools. Did you source Vivado?")
+            raise RuntimeError("Could not call Vivado simulation tools. Did you source Vivado?")
         else:
             raise RuntimeError("Something failed during simulation.")
     if "$finish called at time" in ret.stdout:
diff --git a/src/finn/compressor/src/utils/mul_comp_map.py b/src/finn/compressor/src/utils/mul_comp_map.py
index 951b732be1..1cd044ea29 100644
--- a/src/finn/compressor/src/utils/mul_comp_map.py
+++ b/src/finn/compressor/src/utils/mul_comp_map.py
@@ -6,6 +6,7 @@
 # @brief    Multiplier-to-compressor input mapping utilities
 #############################################################################
 
+
 class MulCompMap:
     def __init__(self, na: int, nb: int, sa: bool, sb: bool):
         self.na = na
@@ -55,4 +56,6 @@ def shape(self):
     def absolute_term(self):
         (na, nb, sa, sb) = (self.na, self.nb, self.sa, self.sb)
 
-        return (-1 if sa ^ sb else 0) if na == 1 and nb == 1 else ((-(sa | sb) << nb) | sa) << (na - 1)
+        return (
+            (-1 if sa ^ sb else 0) if na == 1 and nb == 1 else ((-(sa | sb) << nb) | sa) << (na - 1)
+        )
diff --git a/src/finn/compressor/src/utils/shape.py b/src/finn/compressor/src/utils/shape.py
index 5cfdb9ea3d..2e111ad4b9 100644
--- a/src/finn/compressor/src/utils/shape.py
+++ b/src/finn/compressor/src/utils/shape.py
@@ -6,16 +6,20 @@
 # @brief    Shape representation for compressor bit matrices
 #############################################################################
 
-from typing import Tuple
 from itertools import zip_longest
+from typing import Tuple
+
 
 class Shape:
-    def __init__(self, t: Tuple[int] = ()): self.t = tuple(t)
+    def __init__(self, t: Tuple[int] = ()):
+        self.t = tuple(t)
+
+    def __len__(self):
+        return len(self.t)
 
-    def __len__(self): return len(self.t)
+    def __iter__(self):
+        return self.t.__iter__()
 
-    def __iter__(self): return self.t.__iter__()
-    
     def __getitem__(self, val):
         if type(val) == int and val >= len(self.t):
             return 0
@@ -24,15 +28,15 @@ def __getitem__(self, val):
             return r
         else:
             return Shape(r)
-    
+
     def __lshift__(self, val):
         return Shape([0 for el in range(val)] + list(self.t))
 
     def __add__(self, val):
-        return self.__binary_arithmetic_operation(val, lambda x,y: x+y)
+        return self.__binary_arithmetic_operation(val, lambda x, y: x + y)
 
     def __sub__(self, val):
-        return self.__binary_arithmetic_operation(val, lambda x,y: x-y)
+        return self.__binary_arithmetic_operation(val, lambda x, y: x - y)
 
     def __binary_arithmetic_operation(self, val, op):
         if type(val) == int:
@@ -42,10 +46,12 @@ def __binary_arithmetic_operation(self, val, op):
             return Shape([op(a, b) for a, b in zipped])
         else:
             raise RuntimeError("Unsupported type.")
-        
-    def __repr__(self): return f"Shape {self.t[::-1]}"
-    
+
+    def __repr__(self):
+        return f"Shape {self.t[::-1]}"
+
     def __eq__(self, other):
         for col1, col2 in zip_longest(self, other, fillvalue=0):
-            if col1 != col2: return False
-        return True
\ No newline at end of file
+            if col1 != col2:
+                return False
+        return True
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index 79af134346..48d031d940 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -28,15 +28,13 @@
 
 import numpy as np
 import os
-import shutil
 
+from finn.compressor import generate_add_multi_comps, generate_dotp_comp
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
 from finn.util.basic import get_dsp_block, is_versal
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
-from finn.compressor import generate_dotp_comp, generate_add_multi_comps
-
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation_rtl:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
 # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
@@ -200,8 +198,7 @@ def _get_rtl_source_files(self, abspath=True):
         # Add compressor files if dotp_comp was generated
         comp_name = self.get_nodeattr("comp_module_name")
         if comp_name:
-            comp_hdl_dir = os.path.join(
-                os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
+            comp_hdl_dir = os.path.join(os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
             sourcefiles.append(os.path.join(code_gen_dir, "dotp_comp.sv"))
             sourcefiles.append(os.path.join(comp_hdl_dir, "mul_comp_map.sv"))
             sourcefiles.append(os.path.join(code_gen_dir, comp_name + ".sv"))
@@ -328,8 +325,6 @@ def _is_dotp_comp_eligible(self, fpgapart, ww, aw, pumped_compute):
         if pumped_compute or ww > 4 or aw > 4:
             return False
         return True
-        
-
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
@@ -362,23 +357,21 @@ def generate_hdl(self, model, fpgapart, clk):
         # Compressor generation if applicable.
         if self._is_dotp_comp_eligible(fpgapart, ww, aw, pumped_compute):
             result = generate_dotp_comp(
-                fpgapart, simd, ww, aw, accu_width, signed_act, code_gen_dir)
+                fpgapart, simd, ww, aw, accu_width, signed_act, code_gen_dir
+            )
             code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(result["comp_delay"])]
             code_gen_dict["$USE_COMPRESSOR$"] = [str(1)]
             self.set_nodeattr("comp_module_name", result["comp_name"])
         else:
             # DSP path: Generate add_multi.sv with compressors
             result = generate_add_multi_comps(
-                fpgapart, version, simd, ww, aw, accu_width,
-                narrow_weights, code_gen_dir)
+                fpgapart, version, simd, ww, aw, accu_width, narrow_weights, code_gen_dir
+            )
             if result["comp_names"]:
-                self.set_nodeattr("add_multi_comp_names",
-                                  ";".join(result["comp_names"]))
+                self.set_nodeattr("add_multi_comp_names", ";".join(result["comp_names"]))
                 # Store compressor specs for synthesis aggregation
                 # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
-                specs_str = ";".join(
-                    f"{n},{w},{d}" for n, w, d in result.get("comp_specs", [])
-                )
+                specs_str = ";".join(f"{n},{w},{d}" for n, w, d in result.get("comp_specs", []))
                 self.set_nodeattr("add_multi_comp_specs", specs_str)
 
         # add general parameters to dictionary
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index b311c36d12..3f00d7feb9 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import numpy as np
 import warnings
 from onnx import helper
 from qonnx.custom_op.registry import getCustomOp
@@ -34,7 +33,7 @@
 
 from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants
 from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants
-from finn.util.basic import get_dsp_block, is_versal
+from finn.util.basic import is_versal
 
 
 def _determine_impl_style(node, fpgapart, model):
@@ -232,6 +231,7 @@ def _mvu_rtl_possible(n, fpgapart, model):
     # RTL does not support BIPOLAR input datatype (1-bit signed {-1,+1})
     # BIPOLAR requires special handling that only HLS provides
     from qonnx.core.datatype import DataType
+
     idt = node_inst.get_input_datatype(0)
     if idt == DataType["BIPOLAR"]:
         return False
@@ -241,17 +241,6 @@ def _mvu_rtl_possible(n, fpgapart, model):
     if not wdt.signed():
         return False
 
-    # check which dsp block is available on fpga
-    dsp_block = get_dsp_block(fpgapart)
-    # check if weights are narrow
-    weights = model.get_initializer(n.input[1])
-    # if dynamic input, set minimum of weights to wdt.min()
-    # otherwise set it to the minimum value in the weight matrix
-    if weights is None:
-        weights_min = wdt.min()
-    else:
-        weights_min = np.min(weights)
-    narrow_weights = False if weights_min == wdt.min() else True
     # NOTE: Narrow weight check for DSP48E1 removed (previously returned False for
     # narrow_weights=False on DSP48E1). Rationale (see matrixvectoractivation_rtl.py):
     # - Compressor path (LUT-based, WW<=4 && AW<=4): No narrow weight constraint, works
@@ -262,7 +251,7 @@ def _mvu_rtl_possible(n, fpgapart, model):
     #   (previously forced W = np.clip(W, wdt.min()+1, wdt.max()) on xc7z020)
     # - Result: Both paths now accept full weight range, narrow_weights computed but not
     #   used as a gating condition for RTL eligibility
-    
+
     return True
 
 
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index 3f8f5ba689..23d6fa72bf 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -59,14 +59,12 @@ def generate_unified_add_multi(model, build_dir):
                     n, w, d = map(int, spec.split(","))
                     all_specs.add((n, w, d))
 
-    rtllib_template = os.path.join(os.environ["FINN_ROOT"],
-                                   "finn-rtllib/mvu/add_multi.sv")
-    with open(rtllib_template, 'r') as f:
+    rtllib_template = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/add_multi.sv")
+    with open(rtllib_template, "r") as f:
         template = f.read()
 
     if all_specs:
-        catch_comp_lines = [f"\t`CATCH_COMP({n},{w},{d})"
-                           for n, w, d in sorted(all_specs)]
+        catch_comp_lines = [f"\t`CATCH_COMP({n},{w},{d})" for n, w, d in sorted(all_specs)]
         entries = "\n".join(catch_comp_lines) + "\n"
     else:
         entries = ""
@@ -80,7 +78,7 @@ def generate_unified_add_multi(model, build_dir):
 
     unified = template.replace(marker, entries + marker)
 
-    with open(os.path.join(build_dir, "add_multi.sv"), 'w') as f:
+    with open(os.path.join(build_dir, "add_multi.sv"), "w") as f:
         f.write(unified)
 
 

From 0281051c7045207d5f341d67b547a4030b0846dc Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Mon, 27 Apr 2026 10:19:13 +0100
Subject: [PATCH 07/10] [RTL MVU] Enforce minimum 2-bit bitwidth constraint

- Fix bitwidth >= 2 check for both activations and weights
- Remove 7-Series narrow weight clipping from tests
- Update comments to reflect bitwidth ranges
---
 .../fpgadataflow/specialize_layers.py         | 44 ++++++-------------
 tests/fpgadataflow/test_fpgadataflow_mvau.py  |  9 ++--
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index 3f00d7feb9..5951bc3f38 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -28,6 +28,7 @@
 
 import warnings
 from onnx import helper
+from qonnx.core.datatype import DataType
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 
@@ -54,8 +55,6 @@ def _determine_impl_style(node, fpgapart, model):
             return _dwc_determine_impl_style(node)
         if rtl_variant:
             if optype == "MVAU":
-                # Delegate to _mvu_rtl_possible() which allows 2-8 bit bitwidths
-                # Removed >= 4 early filter to enable RTL/compressors for 2-3 bit
                 if _mvu_rtl_possible(node, fpgapart, model):
                     return "rtl"
                 else:
@@ -131,8 +130,8 @@ def _determine_impl_style(node, fpgapart, model):
                 return "rtl"
             else:
                 warn_str = """There is no RTL variant for %s. The node will automatically be
-                        set to HLS variant. Please check the bit-widths to be <= 8 and ensure the
-                        thresholds are implemented as standalone layer""" % (
+                        set to HLS variant. Ensure thresholds are implemented as standalone layer,
+                        weights are signed, and bitwidths are >= 2""" % (
                     node.name,
                 )
                 warnings.warn(warn_str)
@@ -214,12 +213,9 @@ def _dwc_determine_impl_style(node):
 
 def _mvu_rtl_possible(n, fpgapart, model):
     # Checks whether RTL-based MVU is supported
-    # Currently, for DSP48 we only support computations up to
-    # 8sx8u (8-bit signed weights x 8-bit (un)signed activations)
-    # and for DSP58 we support up to 8sx9s.
-    # Please note, DSP48E1 does only support narrow range for weights
-    # Next to that, embedded thresholding functionality is not supported
-    # and neither binaryxnormode computation.
+    # RTL MVU uses either DSP blocks (for larger bitwidths) or LUT-based compressor (2<=WW<=4 && 2<=AW<=4)
+    # Weights must be signed, activations can be unsigned or signed
+    # Embedded thresholding and binaryXnorMode are not supported
     node_inst = getCustomOp(n)
     # first check if no Activation or binary xnor mode and return False
     # immediately if one of them is True
@@ -228,31 +224,19 @@ def _mvu_rtl_possible(n, fpgapart, model):
     if no_activation or is_binaryxnor_mode:
         return False
 
-    # RTL does not support BIPOLAR input datatype (1-bit signed {-1,+1})
-    # BIPOLAR requires special handling that only HLS provides
-    from qonnx.core.datatype import DataType
-
     idt = node_inst.get_input_datatype(0)
-    if idt == DataType["BIPOLAR"]:
-        return False
-
-    # check if weights are signed, if not return False
     wdt = node_inst.get_input_datatype(1)
+
     if not wdt.signed():
         return False
 
-    # NOTE: Narrow weight check for DSP48E1 removed (previously returned False for
-    # narrow_weights=False on DSP48E1). Rationale (see matrixvectoractivation_rtl.py):
-    # - Compressor path (LUT-based, WW<=4 && AW<=4): No narrow weight constraint, works
-    #   with full weight range including wdt.min()
-    # - DSP path: Handles narrow weights via NARROW_WEIGHTS module parameter in mvu.sv,
-    #   which adjusts lane slicing to accommodate narrow range
-    # - Test suite: Removed weight clipping in test_fpgadataflow_mvau.py line 785
-    #   (previously forced W = np.clip(W, wdt.min()+1, wdt.max()) on xc7z020)
-    # - Result: Both paths now accept full weight range, narrow_weights computed but not
-    #   used as a gating condition for RTL eligibility
-
-    return True
+    # if none of the above constraints have been triggered
+    # we now check if input and weight data types are in range
+    # we only use rtl mvau if the dtypes are at least 2 bit
+    inp_width_in_range = idt.bitwidth() >= 2
+    weight_width_in_range = wdt.bitwidth() >= 2
+
+    return inp_width_in_range and weight_width_in_range
 
 
 def _vvu_rtl_possible(n, fpgapart):
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index 780efd170f..08a75ef80d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -748,7 +748,10 @@ def test_mvau_fifocharacterize_rtlsim(
 @pytest.mark.parametrize("pe", [1, 9, 18])
 @pytest.mark.parametrize("simd", [1, 16, 32])
 @pytest.mark.parametrize(
-    "idt_wdt", [[DataType["UINT4"], DataType["INT4"]], [DataType["UINT8"], DataType["INT8"]]]
+    "idt_wdt", [
+        [DataType["UINT4"], DataType["INT4"]],
+        [DataType["UINT8"], DataType["INT8"]],
+    ]
 )
 @pytest.mark.parametrize(
     "part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e", "xc7z020clg400-1"]
@@ -782,8 +785,8 @@ def test_fpgadataflow_rtl_mvau(
     ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
     # if 7 series, force weights to narrow range
-    if part == "xc7z020clg400-1":
-        W = np.clip(W, wdt.min() + 1, wdt.max())
+    # if part == "xc7z020clg400-1":
+    #     W = np.clip(W, wdt.min() + 1, wdt.max())
     model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())

From c747ae94bf2b14ed7169a7af23adbd78fdd66312 Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Fri, 15 May 2026 10:50:25 +0100
Subject: [PATCH 08/10] Improved pipelining capabilities for non-accumulation
 compressor usage. Pre-commit changes. Code cleanup of several testing
 scripts.

---
 finn-rtllib/mvu/add_multi.sv                  |  18 +-
 finn-rtllib/mvu/mvu_vvu_axi.sv                |   1 -
 src/finn/compressor/README.md                 |   1 +
 src/finn/compressor/gen_dotp_netlist.sh       |  62 ++++++
 .../hdl/add_multi_comp_template.tcl           |   7 +-
 .../compressor/hdl/dotp_comp_tb_template.sv   |  42 +---
 .../compressor/hdl/dotp_comp_template.tcl     |   4 +-
 src/finn/compressor/hdl/dotp_tb_template.sv   |   5 +-
 src/finn/compressor/hdl/dotp_template.tcl     |   4 +-
 .../compressor/run_add_multi_comp_tests.sh    | 111 +++++-----
 src/finn/compressor/run_dotp_comp_tests.sh    | 140 ++++++------
 src/finn/compressor/src/dotp.py               |   3 +-
 src/finn/compressor/src/graph/accumulator.py  |   6 +-
 .../counters/absorption_counter_candidates.py |   9 +-
 src/finn/compressor/src/graph/final_adder.py  | 208 ++++++++++--------
 src/finn/compressor/src/graph/nodes.py        |   9 +-
 .../src/passes/compressor_constructor.py      |  15 +-
 src/finn/compressor/src/passes/emitter.py     |   9 +-
 src/finn/compressor/src/passes/lut_placer.py  |  12 -
 src/finn/compressor/src/tests/test_gen.py     |   2 +-
 src/finn/compressor/src/tests/tester.py       |   3 +-
 src/finn/compressor/src/utils/shape.py        |   8 +-
 .../rtl/matrixvectoractivation_rtl.py         |   6 +-
 .../fpgadataflow/specialize_layers.py         |   4 +-
 tests/fpgadataflow/test_fpgadataflow_mvau.py  |   8 +-
 25 files changed, 373 insertions(+), 324 deletions(-)
 create mode 100755 src/finn/compressor/gen_dotp_netlist.sh

diff --git a/finn-rtllib/mvu/add_multi.sv b/finn-rtllib/mvu/add_multi.sv
index f204abe363..25f5b9a411 100644
--- a/finn-rtllib/mvu/add_multi.sv
+++ b/finn-rtllib/mvu/add_multi.sv
@@ -83,10 +83,10 @@ else if(!RESET_ZERO && (N == n) && (ARG_WIDTH == w) && (DEPTH >= d) && (0 <= ARG
 	localparam int unsigned  SUM_DELAY = DEPTH - COMP_DELAY; \
 	if(SUM_DELAY == 0)  assign  sum = out; \
 	else begin : genDelay \
-			logic [SUM_WIDTH-1:0]  SumZ[SUM_DELAY] = '{ default: 'x }; \
+		logic [SUM_WIDTH-1:0]  SumZ[SUM_DELAY] = '{ default: '0 }; \
 		always_ff @(posedge clk) begin \
-			if(rst)  SumZ <= '{ default: 'x }; \
-			else begin \
+			if(rst)  SumZ <= '{ default: '0 }; \
+			else if(en) begin \
 				for(int unsigned  i = 0; i < SUM_DELAY-1; i++)  SumZ[i] <= SumZ[i+1]; \
 				SumZ[SUM_DELAY-1] <= out; \
 			end \
@@ -169,13 +169,13 @@ end : genComp``n``u``w``_d``d
 	// Delay Output if requested DEPTH exceeds Tree Height
 	if(DEPTH <= L)  assign  sum = sum0;
 	else begin : genDelay
-		localparam logic [SUM_WIDTH-1:0]  SUM_RESET = {(SUM_WIDTH){RESET_ZERO? 1'b0 : 1'bx}};
-		logic [SUM_WIDTH-1:0]  SumZ[DEPTH - L] = '{ default: SUM_RESET };
+		localparam int unsigned  DELAY = DEPTH - L;
+		logic [SUM_WIDTH-1:0]  SumZ[DELAY] = '{ default: '0 };
 		always_ff @(posedge clk) begin
-			if(rst)  SumZ <= '{ default: SUM_RESET };
-			else begin
-				for(int unsigned  i = 0; i < DEPTH-L-1; i++)  SumZ[i] <= SumZ[i+1];
-				SumZ[DEPTH-L-1] <= sum0;
+			if(rst)  SumZ <= '{ default: '0 };
+			else if(en) begin
+				for(int unsigned  i = 0; i < DELAY-1; i++)  SumZ[i] <= SumZ[i+1];
+				SumZ[DELAY-1] <= sum0;
 			end
 		end
 		assign	sum = SumZ[0];
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index dca3e9332c..0df301c767 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -367,7 +367,6 @@ module mvu_vvu_axi #(
 			/* else */      3 + $clog2(SIMD+1) + (SIMD == 1);
 
 		// Floor at the DSP-equivalent depth so the compressor path (shallow pipeline)
-		// still has enough output queue slots to absorb backpressure transients.
 		localparam int unsigned  DSP_PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
 		localparam int unsigned  MAX_IN_FLIGHT =
 			CORE_PIPELINE_DEPTH > DSP_PIPELINE_DEPTH? CORE_PIPELINE_DEPTH : DSP_PIPELINE_DEPTH;
diff --git a/src/finn/compressor/README.md b/src/finn/compressor/README.md
index ddc8ca7d9f..b80dbe6a43 100644
--- a/src/finn/compressor/README.md
+++ b/src/finn/compressor/README.md
@@ -73,6 +73,7 @@ The tool can automatically generate a SystemVerilog testbench to fuzzy-test the
 
 ### Custom Pipeline Depth
 Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined.
+The exception is the `Quaternary Adder`, which can be split into two stages when it is not used in accumulation. This pipelined version is the default unless `-a` is passed.
 
 ### Constant Input
 Aside to the regular, variable compressor inputs, the tool also supports an additional constant input. It can be specified as a binary number by `-c NUMBER`.
diff --git a/src/finn/compressor/gen_dotp_netlist.sh b/src/finn/compressor/gen_dotp_netlist.sh
new file mode 100755
index 0000000000..ea35a66b04
--- /dev/null
+++ b/src/finn/compressor/gen_dotp_netlist.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Generate standalone dotp compressor netlist for inspection or integration.
+# Output is a self-contained RTL directory that can be simulated or synthesized.
+#
+# Usage: Edit parameters below, then run: ./gen_dotp_netlist.sh
+#############################################################################
+
+# === Configuration ===
+SIMD=256
+WW=4
+AW=4
+ACCU_WIDTH=16
+SIGNED_WEIGHTS=0      # 0=unsigned, 1=signed
+SIGNED_ACT=0          # 0=unsigned, 1=signed
+TARGET="Versal"       # Versal, 7-Series, UltraScale
+# =====================
+
+set -e
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+export PYTHONPATH="$(cd "$SCRIPT_DIR/../.." && pwd)${PYTHONPATH:+:$PYTHONPATH}"  # <repo>/src is the import root of the finn package
+
+# Build output directory name from config
+LABEL="simd${SIMD}_w${WW}_a${AW}"
+[ "$SIGNED_WEIGHTS" -eq 0 ] && LABEL="${LABEL}_uw"
+[ "$SIGNED_ACT" -eq 1 ] && LABEL="${LABEL}_sa"
+LABEL="${LABEL}_$(echo "$TARGET" | tr '[:upper:]' '[:lower:]' | tr -d '-')"
+OUT_DIR="$SCRIPT_DIR/gen/$LABEL"
+mkdir -p "$OUT_DIR"
+
+echo "Generating dotp compressor netlist"
+echo "  Config: SIMD=$SIMD, WW=$WW, AW=$AW, ACCU=$ACCU_WIDTH"
+echo "  Target: $TARGET"
+echo "  Output: $OUT_DIR"
+echo ""
+
+# Build flags
+FLAGS=""
+[ "$SIGNED_WEIGHTS" -eq 0 ] && FLAGS="--unsigned_weights"
+[ "$SIGNED_ACT" -eq 1 ] && FLAGS="$FLAGS --signed_activations"
+
+# Generate compressor core and dotp wrapper
+python3 -m finn.compressor.src.dotp_finn \
+    --simd "$SIMD" --ww "$WW" --aw "$AW" \
+    --accu_width "$ACCU_WIDTH" $FLAGS \
+    --target "$TARGET" \
+    --dotp-template "$SCRIPT_DIR/hdl/dotp_comp_template.sv" \
+    --dotp-output-name dotp_comp.sv \
+    -o "$OUT_DIR"
+
+# Include mul_comp_map for complete netlist
+cp "$SCRIPT_DIR/hdl/mul_comp_map.sv" "$OUT_DIR/"
+
+echo ""
+echo "Generated files:"
+ls -1 "$OUT_DIR"/*.sv
+echo ""
+echo "Done. Netlist ready in: $OUT_DIR"
diff --git a/src/finn/compressor/hdl/add_multi_comp_template.tcl b/src/finn/compressor/hdl/add_multi_comp_template.tcl
index c32279e518..7402bf2df6 100644
--- a/src/finn/compressor/hdl/add_multi_comp_template.tcl
+++ b/src/finn/compressor/hdl/add_multi_comp_template.tcl
@@ -7,9 +7,6 @@
 # @author    Simon Gerber <simon.gerber@amd.com>
 #############################################################################
 
-# Vivado batch flow for standalone add_multi compressor test.
-# Behavioral simulation only — verifies the generated compressor produces correct sums.
-#
 # Template placeholders expanded by run_add_multi_comp_tests.sh:
 #   {label}   - Configuration label (e.g. n8_w4_p2)
 #   {tb}      - Testbench module name
@@ -29,7 +26,9 @@ add_files -fileset $simset {gen_dir}/{tb}.sv
 set_property top $tb $simset
 set_property xsim.simulate.runtime all $simset
 
-launch_simulation
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
 close_sim
 
 quit
diff --git a/src/finn/compressor/hdl/dotp_comp_tb_template.sv b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
index d6d7841e07..8d15d92759 100644
--- a/src/finn/compressor/hdl/dotp_comp_tb_template.sv
+++ b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
@@ -144,33 +144,13 @@ module dotp_comp_{full_sig}_tb;
 			feed_single(ww, aa);
 		end
 
-		// Single SIMD lane active (first)
-		begin
-			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
-			automatic activation_t         [SIMD-1:0]  aa = '0;
-			for(int unsigned  pe = 0; pe < PE; pe++)
-				ww[pe][0] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
-			aa[0] = '1;
-			feed_single(ww, aa);
-		end
-
-		// Single SIMD lane active (last)
-		begin
+		// Single SIMD lane active (first and last lanes)
+		for(int unsigned  lane = 0; lane < SIMD; lane += (SIMD > 1 ? SIMD-1 : 1)) begin
 			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
 			automatic activation_t         [SIMD-1:0]  aa = '0;
 			for(int unsigned  pe = 0; pe < PE; pe++)
-				ww[pe][SIMD-1] = '1;
-			aa[SIMD-1] = '1;
-			feed_single(ww, aa);
-		end
-
-		// Alternating weights: +max, -max, +max, ...
-		begin
-			automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
-			automatic activation_t         [SIMD-1:0]  aa = '1;
-			for(int unsigned  pe = 0; pe < PE; pe++)
-				for(int unsigned  s = 0; s < SIMD; s++)
-					ww[pe][s] = s[0] ? '1 : {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+				ww[pe][lane] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			aa[lane] = '1;
 			feed_single(ww, aa);
 		end
 
@@ -226,18 +206,10 @@ module dotp_comp_{full_sig}_tb;
 						automatic activation_t         [SIMD-1:0]  aa;
 						void'(std::randomize(ww, aa));
 
-						for(int unsigned  pe = 0; pe < PE; pe++) begin
-							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
-								automatic accu_t  m0 = $signed(ww[pe][simd])
+						for(int unsigned  pe = 0; pe < PE; pe++)
+							for(int unsigned  simd = 0; simd < SIMD; simd++)
+								pp[pe] += $signed(ww[pe][simd])
 									* $signed({SIGNED_ACTIVATIONS && aa[simd][ACTIVATION_WIDTH-1], aa[simd]});
-								automatic accu_t  p0 = $signed(pp[pe]) + m0;
-								// Avoid overflow by zeroing offending weight
-								if(((m0 < 0) == ($signed(pp[pe]) < 0)) && ((m0 < 0) != (p0 < 0)))
-									ww[pe][simd] = 0;
-								else
-									pp[pe] = p0;
-							end
-						end
 
 						zero <= 0;
 						w <= ww;
diff --git a/src/finn/compressor/hdl/dotp_comp_template.tcl b/src/finn/compressor/hdl/dotp_comp_template.tcl
index 1e8b2fc482..41eed8bdbf 100644
--- a/src/finn/compressor/hdl/dotp_comp_template.tcl
+++ b/src/finn/compressor/hdl/dotp_comp_template.tcl
@@ -26,7 +26,9 @@ set_property top $tb $simset
 set_property xsim.simulate.runtime all $simset
 
 # Run Simulation
-launch_simulation
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
 close_sim
 
 quit
diff --git a/src/finn/compressor/hdl/dotp_tb_template.sv b/src/finn/compressor/hdl/dotp_tb_template.sv
index a5d7dbafd6..aa2165ac67 100644
--- a/src/finn/compressor/hdl/dotp_tb_template.sv
+++ b/src/finn/compressor/hdl/dotp_tb_template.sv
@@ -18,7 +18,8 @@ module dotp_{n}x{sa}{na}{sb}{nb}_tb #(
 		SIGNED_A ^^ SIGNED_B? 1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/,
 	localparam bit  SIGNED_P = NA == 1? SIGNED_A ^^ SIGNED_B : SIGNED_A || SIGNED_B
 )();
-	uwire  clk = 'z;
+	logic  clk = 0;
+	always #5ns clk = ~clk;  // 10ns period = 100MHz
 
 	logic [N-1:0][NA-1:0]  a;
 	logic [N-1:0][NB-1:0]  b;
@@ -44,7 +45,7 @@ module dotp_{n}x{sa}{na}{sb}{nb}_tb #(
 
 			a <= aa;
 			b <= bb;
-			#10ns;
+			repeat({depth} + 1) @(posedge clk);
 			px = $signed({ SIGNED_P && p[NP-1], p });
 			assert((^p !== 1'bx) && (px == pp)) else begin
 				$error("Received %0d [0x%0x] instead of %0d.", px, p, pp);
diff --git a/src/finn/compressor/hdl/dotp_template.tcl b/src/finn/compressor/hdl/dotp_template.tcl
index d9a9110f97..1916676012 100644
--- a/src/finn/compressor/hdl/dotp_template.tcl
+++ b/src/finn/compressor/hdl/dotp_template.tcl
@@ -20,7 +20,9 @@ set_property top ${top}_tb $simset
 set_property xsim.simulate.runtime all $simset
 
 # Run Simulation
-launch_simulation
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
 close_sim
 
 quit
diff --git a/src/finn/compressor/run_add_multi_comp_tests.sh b/src/finn/compressor/run_add_multi_comp_tests.sh
index e6ffdb3765..0828543e18 100755
--- a/src/finn/compressor/run_add_multi_comp_tests.sh
+++ b/src/finn/compressor/run_add_multi_comp_tests.sh
@@ -8,18 +8,14 @@
 # @author    Simon Gerber <simon.gerber@amd.com>
 #############################################################################
 
-# Run standalone add_multi compressor tests.
-# For each (N, ARG_WIDTH) configuration:
-#   1. Generate comp_NuW_dD.sv via add_multi_finn.py
-#   2. Expand TB and TCL templates
-#   3. Run XSim via Vivado
-#
-# Usage: ./run_add_multi_comp_tests.sh [versal|7series]
-# Prerequisites: Vivado on PATH
+# Usage: ./run_add_multi_comp_tests.sh [target]
+#   target: versal, 7series, ultrascale (default: versal)
 
 ((${KEEP_LOG:=0}))
 ((${MAX_WORKERS:=12}))
-TARGET="${1:-versal}"  # Default to versal
+
+# Parse target argument
+TARGET="${1:-versal}"
 
 if ! command -v vivado >/dev/null 2>&1; then
 	echo "ERROR: vivado not found in PATH." >&2
@@ -30,79 +26,90 @@ echo "Vivado: $(command -v vivado)"
 echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS"
 echo "Target: $TARGET"
 
-# Paths
+# Paths (all absolute for portability)
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 HDL_DIR="$SCRIPT_DIR/hdl"
-GEN_BASE="$SCRIPT_DIR/gen"
-FINN_SRC="$(cd "$SCRIPT_DIR/../.." && pwd)"
-export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
-: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"
+GEN_DIR="$SCRIPT_DIR/gen"
+FINN_SRC="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+export PYTHONPATH="$FINN_SRC/src${PYTHONPATH:+:$PYTHONPATH}"
+
+# Vivado working directory (isolated temp, unique per invocation)
+WORK_DIR="/tmp/finn_compressor_tests_$$"
 
 source "$SCRIPT_DIR/lib/test_common.sh"
 
-# Test configs: --n N --arg_width W [-p pipeline_every]
+# Test configs: N ARG_WIDTH PIPELINE_EVERY
+# Format: "N W P" where P is pipeline_every (0 = no pipelining)
 TESTS=(
-	"--n 8  --arg_width 4"
-	"--n 8  --arg_width 4  -p 2"
-	"--n 16 --arg_width 3"
-	"--n 16 --arg_width 6  -p 2"
-	"--n 32 --arg_width 6  -p 2"
-	"--n 32 --arg_width 16 -p 2"
-	"--n 47 --arg_width 5  -p 2"
-	"--n 56 --arg_width 8  -p 2"
+	"8  4  0"
+	"8  4  2"
+	"16 3  0"
+	"16 6  2"
+	"32 6  2"
+	"32 16 2"
+	"47 5  2"
+	"56 8  2"
 )
 
-function parse_config {
-	local n="" w="" p=""
-	while [[ $# -gt 0 ]]; do
-		case "$1" in
-			--n)         n="$2"; shift 2;;
-			--arg_width) w="$2"; shift 2;;
-			-p)          p="$2"; CFG_P_FLAG="-p $2"; shift 2;;
-			*)           shift;;
-		esac
-	done
-	CFG_N="$n"; CFG_W="$w"
-	CFG_LABEL="n${n}_w${w}"; [ -n "$p" ] && CFG_LABEL="${CFG_LABEL}_p${p}"
-	# Set FPGA part based on TARGET variable
+# Set FPGA part based on TARGET variable
+function get_fpga_part {
 	if [[ "$TARGET" == "7series" ]]; then
-		CFG_PART="xc7z020clg400-1"  # Pynq-Z1
+		echo "xc7z020clg400-1"  # Pynq-Z1
 	elif [[ "$TARGET" == "ultrascale" ]]; then
-		CFG_PART="xczu9eg-ffvb1156-2-e"  # ZCU102
+		echo "xczu9eg-ffvb1156-2-e"  # ZCU102
 	else
-		CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
+		echo "xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
 	fi
 }
 
+# Build label from config
+function make_label {
+	local n=$1 w=$2 p=$3
+	local label="n${n}_w${w}"
+	[ "$p" -ne 0 ] && label="${label}_p${p}"
+	echo "$label"
+}
+
 function run_sim {
 	local label="$1"
-	local tcl="$GEN_BASE/$label/add_multi_comp_${label}.tcl"
-	local out="$GEN_BASE/$label/add_multi_comp_${label}.runner.out"
-	local log=(-nolog); [ "$KEEP_LOG" -gt 0 ] && log=(-log "$GEN_BASE/$label/sim.log")
+	local work="$WORK_DIR/$label"
+	local tcl="$GEN_DIR/$label/add_multi_comp_${label}.tcl"
+	local out="$GEN_DIR/$label/add_multi_comp_${label}.runner.out"
+	local log=(-nolog); [ "$KEEP_LOG" -gt 0 ] && log=(-log "$GEN_DIR/$label/sim.log")
 
-	vivado "${log[@]}" -nojournal -mode batch -source "$tcl" >"$out" 2>&1
+	mkdir -p "$work"
+	(cd "$work" && vivado "${log[@]}" -nojournal -mode batch -source "$tcl" >"$out" 2>&1)
 	check_vivado_errors "$out" "$label"
 	exit $?
 }
 
 # Phase 1: Generate
 LABELS=()
+FPGA_PART=$(get_fpga_part)
 echo -e "Generating configs:\n"
-for args in "${TESTS[@]}"; do
-	CFG_P_FLAG=""
-	# shellcheck disable=SC2086
-	parse_config $args
-	label="$CFG_LABEL"
+for test in "${TESTS[@]}"; do
+	read -r n w p <<< "$test"
+	label=$(make_label "$n" "$w" "$p")
 	LABELS+=("$label")
-	gen_dir="$GEN_BASE/$label"
+	gen_dir="$GEN_DIR/$label"
 	mkdir -p "$gen_dir"
 
 	echo "  $label ..."
 
+	# Build target flag (Versal is default, no flag needed)
+	target_flag=""
+	[[ "$TARGET" == "7series" ]] && target_flag="--target 7-Series"
+	[[ "$TARGET" == "ultrascale" ]] && target_flag="--target UltraScale"
+
+	# Build pipeline flag
+	pipeline_flag=""
+	[ "$p" -ne 0 ] && pipeline_flag="-p $p"
+
 	# Generate compressor
 	# shellcheck disable=SC2086
 	if ! gen_out=$(python3 -m finn.compressor.src.add_multi_finn \
-		--n "$CFG_N" --arg_width "$CFG_W" $CFG_P_FLAG -o "$gen_dir" 2>&1); then
+		--n "$n" --arg_width "$w" $pipeline_flag $target_flag \
+		-o "$gen_dir" 2>&1); then
 		echo "GENERATION FAILED: $gen_out" >&2; exit 1
 	fi
 
@@ -112,14 +119,14 @@ for args in "${TESTS[@]}"; do
 	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
 
 	# Expand TB
-	sed -e "s/{n}/$CFG_N/g" -e "s/{arg_width}/$CFG_W/g" \
+	sed -e "s/{n}/$n/g" -e "s/{arg_width}/$w/g" \
 	    -e "s/{depth}/$comp_depth/g" -e "s/{label}/$label/g" \
 	    -e "s/{comp_module}/$comp_name/g" \
 	    "$HDL_DIR/add_multi_comp_tb_template.sv" > "$gen_dir/add_multi_comp_${label}_tb.sv"
 
 	# Expand TCL
 	sed -e "s|{label}|$label|g" -e "s|{tb}|add_multi_comp_${label}_tb|g" \
-	    -e "s|{gen_dir}|$gen_dir|g" -e "s|{part}|$CFG_PART|g" \
+	    -e "s|{gen_dir}|$gen_dir|g" -e "s|{part}|$FPGA_PART|g" \
 	    "$HDL_DIR/add_multi_comp_template.tcl" > "$gen_dir/add_multi_comp_${label}.tcl"
 done
 echo
diff --git a/src/finn/compressor/run_dotp_comp_tests.sh b/src/finn/compressor/run_dotp_comp_tests.sh
index d6084c73d6..8843b7b728 100755
--- a/src/finn/compressor/run_dotp_comp_tests.sh
+++ b/src/finn/compressor/run_dotp_comp_tests.sh
@@ -8,20 +8,25 @@
 # @author    Simon Gerber <simon.gerber@amd.com>
 #############################################################################
 
-# Run dotp_comp integration tests for multiple configurations.
-# Uses dotp_finn.py to generate the compressor core (comp.sv),
-# then instantiates it from the static dotp_comp template via XSim.
-#
-# Usage: ./run_dotp_comp_tests.sh [versal|7series]
+# Usage: ./run_dotp_comp_tests.sh [target]
+#   target: versal, 7series, ultrascale (default: versal)
 
 ((${KEEP_LOG:=0}))
 ((${MAX_WORKERS:=12}))
-TARGET="${1:-versal}"  # Default to versal
 
-SRC_DIR="$(cd "$(dirname "$0")" && pwd)"
-FINN_SRC="$(cd "$SRC_DIR/../.." && pwd)"
-export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
-: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"
+# Parse target argument
+TARGET="${1:-versal}"
+
+# Paths (all absolute for portability)
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+HDL_DIR="$SCRIPT_DIR/hdl"
+GEN_DIR="$SCRIPT_DIR/gen"
+FINN_SRC="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+# PYTHONPATH needs to point to where finn.compressor can be imported from (src/)
+export PYTHONPATH="$FINN_SRC/src${PYTHONPATH:+:$PYTHONPATH}"
+
+# Vivado working directory (isolated temp, unique per invocation)
+WORK_DIR="/tmp/finn_compressor_tests_$$"
 
 if ! command -v vivado >/dev/null 2>&1; then
 	echo "ERROR: vivado not found in PATH." >&2
@@ -29,86 +34,84 @@ if ! command -v vivado >/dev/null 2>&1; then
 fi
 
 echo "Vivado: $(command -v vivado)"
-echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS WORK_DIR=$WORK_DIR"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS"
 echo "Target: $TARGET"
 
-source "$SRC_DIR/lib/test_common.sh"
+source "$SCRIPT_DIR/lib/test_common.sh"
 
-# Test configs: --pe PE --simd SIMD --ww WW --aw AW --accu_width ACCU [--signed_activations]
+# Test configs: PE SIMD WW AW ACCU SIGNED_ACT
+# Format: "PE SIMD WW AW ACCU SIGNED" where SIGNED=1 for signed activations, 0 otherwise
 # Target is set via script argument, applied to all tests
 TESTS=(
-	"--pe 2 --simd 8 --ww 1 --aw 1 --accu_width 16"
-	"--pe 2 --simd 8 --ww 1 --aw 1 --accu_width 16 --signed_activations"
-	"--pe 2 --simd 8 --ww 2 --aw 1 --accu_width 16"
-	"--pe 2 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
-	"--pe 2 --simd 4 --ww 2 --aw 2 --accu_width 16 --signed_activations"
-	"--pe 2 --simd 16 --ww 2 --aw 2 --accu_width 16 --signed_activations"
-	"--pe 1 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
-	"--pe 4 --simd 8 --ww 2 --aw 2 --accu_width 16 --signed_activations"
+	"2 8 1 1 16 0"
+	"2 8 1 1 16 1"
+	"2 8 2 1 16 0"
+	"2 8 2 2 16 1"
+	"2 4 2 2 16 1"
+	"2 16 2 2 16 1"
+	"1 8 2 2 16 1"
+	"4 8 2 2 16 1"
 )
 
-function parse_config {
-	local pe="" simd="" ww="" aw="" accu="" signed_act=""
-	CFG_SIGNED_FLAG=""
-	while [[ $# -gt 0 ]]; do
-		case "$1" in
-			--pe)    pe="$2"; shift 2;;
-			--simd)  simd="$2"; shift 2;;
-			--ww)    ww="$2"; shift 2;;
-			--aw)    aw="$2"; shift 2;;
-			--accu_width) accu="$2"; shift 2;;
-			--signed_activations) signed_act="_sa"; CFG_SIGNED_FLAG="--signed_activations"; shift;;
-			*) shift;;
-		esac
-	done
-	CFG_PE="$pe"; CFG_SIMD="$simd"; CFG_WW="$ww"; CFG_AW="$aw"; CFG_ACCU="$accu"
-	CFG_LABEL="pe${pe}_simd${simd}_ww${ww}_aw${aw}_accu${accu}${signed_act}"
-	# Sanitize label for SystemVerilog identifiers
-	CFG_LABEL="${CFG_LABEL//-/_}"
-	# Set FPGA part and target flag based on TARGET variable
+# Set FPGA part based on TARGET variable
+function get_fpga_part {
 	if [[ "$TARGET" == "7series" ]]; then
-		CFG_PART="xc7z020clg400-1"  # Pynq-Z1
-		CFG_TARGET_FLAG="--target 7-Series"
+		echo "xc7z020clg400-1"  # Pynq-Z1
 	elif [[ "$TARGET" == "ultrascale" ]]; then
-		CFG_PART="xczu9eg-ffvb1156-2-e"  # ZCU102
-		CFG_TARGET_FLAG="--target UltraScale"
+		echo "xczu9eg-ffvb1156-2-e"  # ZCU102
 	else
-		CFG_PART="xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
-		CFG_TARGET_FLAG=""
+		echo "xcvc1902-vsva2197-2MP-e-S"  # Versal VCK190
 	fi
 }
 
+# Build label from config
+function make_label {
+	local pe=$1 simd=$2 ww=$3 aw=$4 accu=$5 signed=$6
+	local label="pe${pe}_simd${simd}_ww${ww}_aw${aw}_accu${accu}"
+	[ "$signed" -eq 1 ] && label="${label}_sa"
+	echo "${label//-/_}"  # Sanitize for SystemVerilog
+}
+
 function run_sim {
 	local label="$1"
-	local tcl="$SRC_DIR/gen/$label/dotp_comp_${label}.tcl"
-	local out="$SRC_DIR/gen/$label/dotp_comp_${label}.runner.out"
-	local log=(-nolog); [ "$KEEP_LOG" -gt 0 ] && log=(-log "$SRC_DIR/gen/$label/sim.log")
+	local work="$WORK_DIR/$label"
+	local tcl="$GEN_DIR/$label/dotp_comp_${label}.tcl"
+	local out="$GEN_DIR/$label/dotp_comp_${label}.runner.out"
+	local log=(-nolog); [ "$KEEP_LOG" -gt 0 ] && log=(-log "$GEN_DIR/$label/sim.log")
 
-	mkdir -p "$WORK_DIR"
-	(cd "$WORK_DIR" && vivado "${log[@]}" -nojournal -mode batch -source "$tcl" >"$out" 2>&1)
+	mkdir -p "$work"
+	(cd "$work" && vivado "${log[@]}" -nojournal -mode batch -source "$tcl" >"$out" 2>&1)
 	check_vivado_errors "$out" "$label"
 	exit $?
 }
 
 # Phase 1: Generate
 LABELS=()
+FPGA_PART=$(get_fpga_part)
 echo -e "Generating configs:\n"
-for args in "${TESTS[@]}"; do
-	CFG_SIGNED_FLAG=""
-	# shellcheck disable=SC2086
-	parse_config $args
-	label="$CFG_LABEL"
+for test in "${TESTS[@]}"; do
+	read -r pe simd ww aw accu signed <<< "$test"
+	label=$(make_label "$pe" "$simd" "$ww" "$aw" "$accu" "$signed")
 	LABELS+=("$label")
-	out_dir="gen/$label"
+	out_dir="$GEN_DIR/$label"
 	mkdir -p "$out_dir"
 
 	echo "  $label ..."
 
+	# Build target flag (Versal is default, no flag needed)
+	target_flag=""
+	[[ "$TARGET" == "7series" ]] && target_flag="--target 7-Series"
+	[[ "$TARGET" == "ultrascale" ]] && target_flag="--target UltraScale"
+
+	# Build signed activations flag
+	signed_flag=""
+	[ "$signed" -eq 1 ] && signed_flag="--signed_activations"
+
 	# Generate compressor
 	# shellcheck disable=SC2086
 	gen_out=$(python3 -m finn.compressor.src.dotp_finn \
-		--simd "$CFG_SIMD" --ww "$CFG_WW" --aw "$CFG_AW" \
-		--accu_width "$CFG_ACCU" $CFG_SIGNED_FLAG $CFG_TARGET_FLAG \
+		--simd "$simd" --ww "$ww" --aw "$aw" --accu_width "$accu" \
+		$signed_flag $target_flag \
 		--dotp-template hdl/dotp_comp_template.sv \
 		--dotp-output-name dotp_comp.sv \
 		-o "$out_dir" 2>&1)
@@ -119,17 +122,22 @@ for args in "${TESTS[@]}"; do
 	comp_depth=$(echo "$gen_out" | sed -n 's/^ *Pipeline depth:[[:space:]]*//p' | head -n 1 | grep -Eo '[0-9]+' || true)
 	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
 
+	# Extract dotp module name from generated file
+	dotp_module=$(grep "^module" "$out_dir/dotp_comp.sv" | sed 's/module \([^ #]*\).*/\1/')
+	[ -z "$dotp_module" ] && { echo "ERROR: No dotp module name for $label" >&2; exit 1; }
+
 	# Expand TB
-	sed -e "s/{pe}/$CFG_PE/g" -e "s/{simd}/$CFG_SIMD/g" \
-	    -e "s/{ww}/$CFG_WW/g" -e "s/{aw}/$CFG_AW/g" \
-	    -e "s/{accu_width}/$CFG_ACCU/g" \
-	    -e "s/{signed_act}/$([ -n "$CFG_SIGNED_FLAG" ] && echo 1 || echo 0)/g" \
+	sed -e "s/{pe}/$pe/g" -e "s/{simd}/$simd/g" \
+	    -e "s/{ww}/$ww/g" -e "s/{aw}/$aw/g" \
+	    -e "s/{accu_width}/$accu/g" \
+	    -e "s/{signed_act}/$signed/g" \
 	    -e "s/{full_sig}/$label/g" -e "s/{comp_depth}/$comp_depth/g" \
-	    hdl/dotp_comp_tb_template.sv > "$out_dir/dotp_comp_${label}_tb.sv"
+	    -e "s/{dotp_module}/$dotp_module/g" \
+	    "$HDL_DIR/dotp_comp_tb_template.sv" > "$out_dir/dotp_comp_${label}_tb.sv"
 
 	# Expand TCL
-	sed -e "s/{label}/$label/g" -e "s|{src_dir}|$SRC_DIR|g" -e "s/{part}/$CFG_PART/g" \
-	    hdl/dotp_comp_template.tcl > "$out_dir/dotp_comp_${label}.tcl"
+	sed -e "s/{label}/$label/g" -e "s|{src_dir}|$SCRIPT_DIR|g" -e "s/{part}/$FPGA_PART/g" \
+	    "$HDL_DIR/dotp_comp_template.tcl" > "$out_dir/dotp_comp_${label}.tcl"
 done
 echo
 
diff --git a/src/finn/compressor/src/dotp.py b/src/finn/compressor/src/dotp.py
index f6c049b6de..9c00dfb46f 100644
--- a/src/finn/compressor/src/dotp.py
+++ b/src/finn/compressor/src/dotp.py
@@ -64,7 +64,7 @@ def clog2(x):
     # Write to gen/ relative to this script's parent directory (compressor/)
     script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     output_path = os.path.join(script_dir, "gen", name + ".sv")
-    generate_compressor(
+    comp_depth = generate_compressor(
         target=target,
         shape=Shape((len(col) for col in shape)),
         name=name,
@@ -99,6 +99,7 @@ def clog2(x):
                         .replace("{signed_a}", str(int(sa)))
                         .replace("{signed_b}", str(int(sb)))
                         .replace("{abs_term}", str(abs_term))
+                        .replace("{depth}", str(comp_depth))
                         .replace("{part}", fpga_part)
                         # Replace relative paths with absolute paths for TCL
                         .replace("hdl/", hdl_dir + "/")
diff --git a/src/finn/compressor/src/graph/accumulator.py b/src/finn/compressor/src/graph/accumulator.py
index b85dd78061..54488cc57b 100644
--- a/src/finn/compressor/src/graph/accumulator.py
+++ b/src/finn/compressor/src/graph/accumulator.py
@@ -43,7 +43,6 @@ def build_hardware(self):
         self.instances.append(en_neg)
         self.instances.append(rst)
 
-        # Optional clock enable signal (for finnlib integration)
         en_wire = None
         if self.enable:
             en_wire = Wire(desired_name="en")
@@ -54,10 +53,7 @@ def build_hardware(self):
         # init=1 on rst delay chain: when enable mode is active, en-gating
         # prevents these registers from capturing the initial rst=1 pulse if
         # en=0 during global reset.  Initialising to 1 ensures the accumulator
-        # feedback is properly zeroed from power-up.  In the current finn(lib)
-        # integration en is hardwired to '1 making this technically redundant,
-        # but the FPGA INIT attribute is free and keeps the design robust
-        # against future uses where en may be gated.
+        # feedback is properly zeroed from power-up.
         rst_del = self.delay_signal(
             rst, self.preceeding_pipeline_stages + 1, en=en_wire, init=1 if self.enable else None
         )
diff --git a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
index 9ba56eae93..726172fb54 100644
--- a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
+++ b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
@@ -11,8 +11,8 @@
 from typing import List
 
 from ...utils.shape import Shape
-from ..nodes import GateAbsorptionCounter
-from ..primitives import LUT2, LUT6, LUT6CY
+from ..nodes import Constant, GateAbsorptionCounter
+from ..primitives import LUT2, LUT6, LUT6_2, LUT6CY
 
 
 def fa_sum(a, b, c):
@@ -109,8 +109,6 @@ def build_hardware(self):
         Similar to VersalPredAdder but uses LUT6_2 with swapped predicate order.
         Each column has 2 gates, each LUT computes: sum = p1 XOR p2 XOR carry_in
         """
-        from ..nodes import Constant
-        from ..primitives import LUT6_2
 
         luts = []
         for i in range(len(self.gates)):
@@ -251,9 +249,6 @@ def __init__(self, gates):
         super().__init__(Shape([len(gates)]), Shape([1, (len(gates) + 1) // 2]))
 
     def build_hardware(self):
-        from ..nodes import Constant
-        from ..primitives import LUT6_2
-
         luts = []
         for i in range((len(self.gates) + 1) // 2):
             p1 = self.gates[2 * i]
diff --git a/src/finn/compressor/src/graph/final_adder.py b/src/finn/compressor/src/graph/final_adder.py
index febcceb925..9b07890ef1 100644
--- a/src/finn/compressor/src/graph/final_adder.py
+++ b/src/finn/compressor/src/graph/final_adder.py
@@ -10,8 +10,8 @@
 from typing import List
 
 from ..utils.shape import Shape
-from .nodes import Counter
-from .primitives import CARRY4, LOOKAHEAD8, LUT5, LUT6_2, LUT6CY
+from .nodes import BlackboxOutput, Counter, Logic, Wire
+from .primitives import CARRY4, LOOKAHEAD8, LUT6_2, LUT6CY
 
 
 def FA_sum(a, b, c):
@@ -38,79 +38,9 @@ class FinalAdder(Counter):
     def compression_goal(col):
         pass
 
-
-class VersalTernaryAdder(FinalAdder):
-    @staticmethod
-    def compression_goal(col):
-        return 5 if col == 0 else 3
-
-    def __init__(self, input_shape: Shape):
-        self.input_shape = input_shape
-        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
-        super().__init__(input_shape, output_shape)
-
-    def build_hardware(self):
-        l8s = [LOOKAHEAD8() for _ in range((len(self.input_shape) + 8) // 8)]
-        luts_chain = [
-            LUT6CY.fromPred(
-                lambda A0, A1, A2, A3, A4, A5: FA_sum(FA_sum(A0, A1, A2), A3, A4),
-                lambda A0, A1, A2, A3, A4, A5: FA_carry(FA_sum(A0, A1, A2), A3, A4),
-                "ternary_adder_chain",
-            )
-            for _ in range(len(self.input_shape) + 1)
-        ]
-        luts_top = []
-        for i in range(len(self.input_shape)):
-            if i % 2 == 0:
-                luts_top.append(LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A0, A1, A4)))
-                try_connect(lambda: self.input_wires[i][0].connect_to(luts_top[-1].I0))
-                try_connect(lambda: self.input_wires[i][1].connect_to(luts_top[-1].I1))
-                try_connect(lambda: self.input_wires[i + 1][0].connect_to(luts_top[-1].I2))
-                try_connect(lambda: self.input_wires[i + 1][1].connect_to(luts_top[-1].I3))
-                try_connect(lambda: self.input_wires[i][2].connect_to(luts_top[-1].I4))
-            else:
-                luts_top.append(LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A2, A3, A4)))
-                try_connect(lambda: self.input_wires[i - 1][0].connect_to(luts_top[-1].I0))
-                try_connect(lambda: self.input_wires[i - 1][1].connect_to(luts_top[-1].I1))
-                try_connect(lambda: self.input_wires[i][0].connect_to(luts_top[-1].I2))
-                try_connect(lambda: self.input_wires[i][1].connect_to(luts_top[-1].I3))
-                try_connect(lambda: self.input_wires[i][2].connect_to(luts_top[-1].I4))
-
-        for idx, (left, right) in enumerate(zip(luts_top[0::2], luts_top[1::2])):
-            left.annotate(f"HLUTNM = final_adder_{idx}")
-            right.annotate(f"HLUTNM = final_adder_{idx}")
-
-        try_connect(lambda: self.input_wires[0][3].connect_to(luts_chain[0].I3))
-        try_connect(lambda: self.input_wires[0][4].connect_to(luts_chain[0].I4))
-        for i, el in enumerate(luts_chain):
-            try_connect(lambda: self.input_wires[i][0].connect_to(el.I0))
-            try_connect(lambda: self.input_wires[i][1].connect_to(el.I1))
-            try_connect(lambda: self.input_wires[i][2].connect_to(el.I2))
-            el.PROP.connect_to(l8s[i // 8].p_in_ports[i % 8])
-            el.O51.connect_to(self.output_wires[i][0])
-            el.O52.connect_to(l8s[i // 8].c_in_ports[i % 8 + 1])
-
-        for lb, lt in zip(luts_chain[1:], luts_top):
-            lt.O.connect_to(lb.I3)
-
-        # connect carry-ins between lookahead modules
-        for prev, next in zip(l8s, l8s[1:]):
-            prev.COUTH.connect_to(next.CIN)
-
-        # cascade
-        for i in range(1, len(luts_chain)):
-            if i % 2 == 0:
-                l8s[(i - 1) // 8].out_ports[((i - 1) % 8) // 2].connect_to(luts_chain[i].I4)
-            else:
-                luts_chain[i - 1].O52.connect_to(luts_chain[i].I4)
-
-        if len(luts_chain) % 2 == 0:
-            l8s[(len(luts_chain) - 1) // 8].out_ports[len(luts_chain) % 8 // 2 - 1].connect_to(
-                self.output_wires[len(luts_chain)][0]
-            )
-        else:
-            luts_chain[-1].O52.connect_to(self.output_wires[len(luts_chain)][0])
-        self.instances += luts_chain + luts_top + l8s
+    @property
+    def delay(self):
+        return 0
 
 
 class QuaternaryAdder(FinalAdder):
@@ -118,10 +48,35 @@ class QuaternaryAdder(FinalAdder):
     def compression_goal(col):
         return 5 if col <= 1 else 4
 
-    def __init__(self, input_shape: Shape):
+    def __init__(self, input_shape: Shape, pipelined: bool = False):
+        self.pipelined = pipelined
         output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
         super().__init__(input_shape, output_shape)
 
+    @property
+    def delay(self):
+        return 1 if self.pipelined else 0
+
+    def _add_register(self, signal):
+        """
+        Helper that inserts a register (Logic node) after signal and returns the register.
+
+        If signal is a BlackboxOutput, we need an intermediate Wire because
+        the Verilog emitter can't handle Logic with BlackboxOutput as a source.
+        """
+
+        reg = Logic()
+        if isinstance(signal, BlackboxOutput):
+            # Insert a wire between BlackboxOutput and Logic
+            wire = Wire()
+            signal.connect_to(wire)
+            wire.connect_to(reg)
+            self.instances.append(wire)
+        else:
+            signal.connect_to(reg)
+        self.instances.append(reg)
+        return reg
+
     def build_hardware(self):
         # Find the limit up to which the quaternary adder is needed.
         # We construct a two-input adder after this.
@@ -233,23 +188,53 @@ def chain_l8(l8s):
         try_connect(lambda: self.input_wires[0][4].connect_to(luts_top[0].I4))
         try_connect(lambda: self.input_wires[0][4].connect_to(l8s_top[0].CIN))
 
-        try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4))
-        try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN))
+        # Bottom row carry-in (optionally through register for pipelining)
+        # try_connect silently fails and goes on
+        # if the input wire is not present. _add_register, however,
+        # will throw if we try to connect a register to a non-existent input wire,
+        # so we check for existence first whenever we want to add a register.
+        if self.pipelined and len(self.input_wires) > 1 and len(self.input_wires[1]) > 4:
+            btm_cin_reg = self._add_register(self.input_wires[1][4])
+            try_connect(lambda: btm_cin_reg.connect_to(luts_btm[0].I4))
+            try_connect(lambda: btm_cin_reg.connect_to(l8s_btm[0].CIN))
+        else:
+            try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4))
+            try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN))
 
-        # downwards connection
+        # downwards connection (optionally through registers for pipelining)
         for t, d in zip(luts_top[1:], luts_btm):
-            t.O51.connect_to(d.I3)
+            if self.pipelined:
+                reg = self._add_register(t.O51)
+                reg.connect_to(d.I3)
+            else:
+                t.O51.connect_to(d.I3)
         last_top = len(carries_top) - 1
-        carries_top[last_top].connect_to(luts_btm[last_top].I3)
+        if self.pipelined:
+            reg = self._add_register(carries_top[last_top])
+            reg.connect_to(luts_btm[last_top].I3)
+        else:
+            carries_top[last_top].connect_to(luts_btm[last_top].I3)
 
+        # Connect inputs to top and bottom rows
+        # For pipelining: top row connects directly, bottom row through registers
         for idx, (lb, lt) in enumerate(zip(luts_btm, luts_top[:height_4_until])):
-            for el in [lb, lt]:
-                try_connect(lambda: self.input_wires[idx][0].connect_to(el.I0))
-                try_connect(lambda: self.input_wires[idx][1].connect_to(el.I1))
-                try_connect(lambda: self.input_wires[idx][2].connect_to(el.I2))
-
+            # Top row: always direct connection
+            try_connect(lambda: self.input_wires[idx][0].connect_to(lt.I0))
+            try_connect(lambda: self.input_wires[idx][1].connect_to(lt.I1))
+            try_connect(lambda: self.input_wires[idx][2].connect_to(lt.I2))
             try_connect(lambda: self.input_wires[idx][3].connect_to(lt.I3))
 
+            # Bottom row: through registers if pipelined
+            if self.pipelined:
+                for i, port in enumerate([lb.I0, lb.I1, lb.I2]):
+                    if len(self.input_wires[idx]) > i:
+                        reg = self._add_register(self.input_wires[idx][i])
+                        reg.connect_to(port)
+            else:
+                try_connect(lambda: self.input_wires[idx][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[idx][1].connect_to(lb.I1))
+                try_connect(lambda: self.input_wires[idx][2].connect_to(lb.I2))
+
         if tail_length:
             lt = luts_top[height_4_until]
             lb = luts_btm[height_4_until]
@@ -257,14 +242,39 @@ def chain_l8(l8s):
             try_connect(lambda: self.input_wires[height_4_until][0].connect_to(lt.I0))
             try_connect(lambda: self.input_wires[height_4_until][1].connect_to(lt.I1))
 
-            try_connect(lambda: self.input_wires[height_4_until + 1][0].connect_to(lb.I0))
-            try_connect(lambda: self.input_wires[height_4_until + 1][1].connect_to(lb.I1))
+            if self.pipelined:
+                if len(self.input_wires) > height_4_until + 1:
+                    if len(self.input_wires[height_4_until + 1]) > 0:
+                        reg0 = self._add_register(self.input_wires[height_4_until + 1][0])
+                        reg0.connect_to(lb.I0)
+                    if len(self.input_wires[height_4_until + 1]) > 1:
+                        reg1 = self._add_register(self.input_wires[height_4_until + 1][1])
+                        reg1.connect_to(lb.I1)
+            else:
+                try_connect(lambda: self.input_wires[height_4_until + 1][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[height_4_until + 1][1].connect_to(lb.I1))
 
         for idx, lb in enumerate(luts_btm[height_4_until + 1 :]):
-            try_connect(lambda: self.input_wires[idx + height_4_until + 1][0].connect_to(lb.I0))
-            try_connect(lambda: self.input_wires[idx + height_4_until + 1][1].connect_to(lb.I1))
-            try_connect(lambda: self.input_wires[idx + height_4_until + 2][0].connect_to(lb.I2))
-            try_connect(lambda: self.input_wires[idx + height_4_until + 2][1].connect_to(lb.I3))
+            if self.pipelined:
+                col1 = idx + height_4_until + 1
+                col2 = idx + height_4_until + 2
+                if len(self.input_wires) > col1 and len(self.input_wires[col1]) > 0:
+                    reg0 = self._add_register(self.input_wires[col1][0])
+                    reg0.connect_to(lb.I0)
+                if len(self.input_wires) > col1 and len(self.input_wires[col1]) > 1:
+                    reg1 = self._add_register(self.input_wires[col1][1])
+                    reg1.connect_to(lb.I1)
+                if len(self.input_wires) > col2 and len(self.input_wires[col2]) > 0:
+                    reg2 = self._add_register(self.input_wires[col2][0])
+                    reg2.connect_to(lb.I2)
+                if len(self.input_wires) > col2 and len(self.input_wires[col2]) > 1:
+                    reg3 = self._add_register(self.input_wires[col2][1])
+                    reg3.connect_to(lb.I3)
+            else:
+                try_connect(lambda: self.input_wires[idx + height_4_until + 1][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 1][1].connect_to(lb.I1))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 2][0].connect_to(lb.I2))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 2][1].connect_to(lb.I3))
 
         def connect_carry_to_lut(carries, luts):
             for carry, lut in zip(carries, luts[1:]):
@@ -272,14 +282,24 @@ def connect_carry_to_lut(carries, luts):
 
         connect_carry_to_lut(carries_top, luts_top)
         connect_carry_to_lut(carries_btm, luts_btm)
-        luts_top[0].O51.connect_to(self.output_wires[0][0])
+
+        # First output bit comes from top row - must be registered when pipelined
+        if self.pipelined:
+            reg = self._add_register(luts_top[0].O51)
+            reg.connect_to(self.output_wires[0][0])
+        else:
+            luts_top[0].O51.connect_to(self.output_wires[0][0])
 
         for idx, lb in enumerate(luts_btm):
             lb.O51.connect_to(self.output_wires[idx + 1][0])
 
         carries_btm[len(luts_btm) - 1].connect_to(self.output_wires[len(luts_btm) + 1][0])
 
-        luts_top[-1].O52.connect_to(luts_btm[len(luts_top) - 1].I3)
+        if self.pipelined:
+            reg = self._add_register(luts_top[-1].O52)
+            reg.connect_to(luts_btm[len(luts_top) - 1].I3)
+        else:
+            luts_top[-1].O52.connect_to(luts_btm[len(luts_top) - 1].I3)
 
         self.instances += luts_top + luts_btm + l8s_btm + l8s_top
 
diff --git a/src/finn/compressor/src/graph/nodes.py b/src/finn/compressor/src/graph/nodes.py
index 8fd299528e..6a2499af23 100644
--- a/src/finn/compressor/src/graph/nodes.py
+++ b/src/finn/compressor/src/graph/nodes.py
@@ -378,14 +378,19 @@ def output_shape(self):
 
     @property
     def delay(self):
+        from .accumulator import AccumulatorStage  # noqa: PLC0415
+
         delay_ = 0
         for s in self.stages:
             if isinstance(s, PipelineStage):
                 delay_ += 1
-            from .accumulator import AccumulatorStage
-
             if isinstance(s, AccumulatorStage):
                 delay_ += 1
+            # Check for pipelined final adders in CompressionStages
+            if isinstance(s, CompressionStage):
+                for counter, _ in s.counters_with_shifts:
+                    if hasattr(counter, "delay"):
+                        delay_ += counter.delay
         return delay_
 
     def accept(self, visitor) -> None:
diff --git a/src/finn/compressor/src/passes/compressor_constructor.py b/src/finn/compressor/src/passes/compressor_constructor.py
index 10036923ab..1cb3bd093d 100644
--- a/src/finn/compressor/src/passes/compressor_constructor.py
+++ b/src/finn/compressor/src/passes/compressor_constructor.py
@@ -68,10 +68,6 @@ def __call__(
             c.stages[-1].connect_to(s)
             c.stages.append(s)
 
-        # CRITICAL: This loop can hang if compression_goal is unreachable
-        # add_compression_stage cannot compress height-1 or height-2 columns (requires >= 3)
-        # Therefore compression_goal must be achievable given this constraint
-        # See get_compression_goal() for how this is ensured in accumulate configurations
         while not self.compression_goal_reached(c.stages[-1].output_shape, compression_goal):
             self.add_compression_stage(c, compression_goal, counter_candidates)
 
@@ -80,9 +76,6 @@ def __call__(
             self.add_compression_stage(c, compression_goal, counter_candidates)
         self.add_constants_to_stage(c.stages[-1], constants)
 
-        # After constants, check if we need additional compression for accumulator mode.
-        # The ternary adder receives: compressor_output + feedback (height 1).
-        # If any column exceeds final_adder capacity, we need more compression.
         if accumulate:
 
             def post_const_goal(x):
@@ -107,9 +100,15 @@ def post_const_goal(x):
                 enable=enable,
             )
             c.stages.append(acc)
+        # if we dont accumulate, we can choose between a pipelined 
+        # or non-pipelined quaternary final adder when using Versal.
         elif max(c.stages[-1].output_shape) > 1:
             final_stage = CompressionStage()
-            final_stage.append_counter(final_adder(c.stages[-1].output_shape), 0)
+            try:
+                fa = final_adder(c.stages[-1].output_shape, pipelined=True)
+            except TypeError:
+                fa = final_adder(c.stages[-1].output_shape)
+            final_stage.append_counter(fa, 0)
             c.stages.append(final_stage)
 
         for s_p, s_n in zip(c.stages, c.stages[1:]):
diff --git a/src/finn/compressor/src/passes/emitter.py b/src/finn/compressor/src/passes/emitter.py
index 5b38a2c9a3..3e69ead969 100644
--- a/src/finn/compressor/src/passes/emitter.py
+++ b/src/finn/compressor/src/passes/emitter.py
@@ -71,7 +71,7 @@ def output(self):
 
     def save_verilog(self, filename):
         with open(filename, "w") as f:
-            f.writelines(self._out)
+            f.write(self._out.getvalue())
 
 
 class VerilogGenerator(Visitor):
@@ -96,7 +96,8 @@ def get_name(self, o: object):
                 else:
                     print(f"Could not obey desired name: {o.desired_name}")
             else:
-                subdict[o] = f"wire_{len(subdict)}"
+                new_name = f"wire_{len(subdict)}"
+                subdict[o] = new_name
         elif isinstance(o, Bitmatrix):
             subdict[o] = f"bitmatrix_{len(subdict)}"
         elif isinstance(o, BitmatrixElement):
@@ -217,7 +218,7 @@ def visit_wire(self, w: Wire):
             self.emitter.emitln(f"uwire {self.get_name(w)};")
         self._declared_hardware.add(w)
 
-        if w.has_source not in self._declared_hardware and isinstance(w.source, Wire):
+        if w.source not in self._declared_hardware and isinstance(w.source, Wire):
             w.source.accept(self)
 
         if (
@@ -241,7 +242,7 @@ def visit_logic(self, lgc: Logic):
             )
         self._declared_hardware.add(lgc)
 
-        if lgc.has_source not in self._declared_hardware and isinstance(lgc.source, Wire):
+        if lgc.source not in self._declared_hardware and isinstance(lgc.source, Wire):
             lgc.source.accept(self)
 
         def emit_inner():
diff --git a/src/finn/compressor/src/passes/lut_placer.py b/src/finn/compressor/src/passes/lut_placer.py
index 1501fa5053..68c13bd4b7 100644
--- a/src/finn/compressor/src/passes/lut_placer.py
+++ b/src/finn/compressor/src/passes/lut_placer.py
@@ -17,7 +17,6 @@ def iter_compressor(self, c: Compressor):
         self.occupations = []  # Reset placement state for every compressor
 
     def iter_counter(self, c: Counter):
-        # Place LUT6CY instances manually.
         cascades = self._get_ripple_connected_luts(c)
         self._calculate_and_annotate_placements(cascades)
 
@@ -64,17 +63,6 @@ def _calculate_and_annotate_placements(self, cascades):
                 self._annotate_placements(cascade, len(self.occupations) - 1, 0)
 
     def _annotate_placements(self, cascade, hu_set, start_idx):
-        """Annotate LUT6CY placement constraints for carry chain packing.
-
-        Places each cascade (ripple chain) into specific BEL positions within a SLICE.
-        Each hu_set represents one SLICE (8 LUTs max). Multiple hu_sets get different
-        Y coordinates to avoid placement conflicts.
-
-        Args:
-            cascade: List of LUT6CY instances forming a carry ripple chain
-            hu_set: SLICE index (0, 1, 2, ...) - maps to RLOC Y coordinate
-            start_idx: Starting BEL position within the SLICE (0-7 = A-H)
-        """
         assert start_idx + len(cascade) <= 8
         for i, lut in enumerate(cascade):
             bel_str = f"{chr(ord('A')+start_idx+i)}5LUT"
diff --git a/src/finn/compressor/src/tests/test_gen.py b/src/finn/compressor/src/tests/test_gen.py
index cfb95ceafd..a07e19e7b1 100644
--- a/src/finn/compressor/src/tests/test_gen.py
+++ b/src/finn/compressor/src/tests/test_gen.py
@@ -30,7 +30,7 @@ def generate_test(
     accumulator_width: int,
     constant: int,
 ):
-    assert type(pipeline_stages) == int
+    assert isinstance(pipeline_stages, int)
 
     if gates:
         gates = flatten_gates(gates)
diff --git a/src/finn/compressor/src/tests/tester.py b/src/finn/compressor/src/tests/tester.py
index f2cf3d3f77..5b74cdd789 100644
--- a/src/finn/compressor/src/tests/tester.py
+++ b/src/finn/compressor/src/tests/tester.py
@@ -1,8 +1,9 @@
-#!/usr/bin/env python
+#############################################################################
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # SPDX-License-Identifier: BSD-3-Clause
+#############################################################################
 
 """Vivado XSim wrapper for testing generated compressors."""
 
diff --git a/src/finn/compressor/src/utils/shape.py b/src/finn/compressor/src/utils/shape.py
index 2e111ad4b9..266c6260d1 100644
--- a/src/finn/compressor/src/utils/shape.py
+++ b/src/finn/compressor/src/utils/shape.py
@@ -21,10 +21,10 @@ def __iter__(self):
         return self.t.__iter__()
 
     def __getitem__(self, val):
-        if type(val) == int and val >= len(self.t):
+        if isinstance(val, int) and val >= len(self.t):
             return 0
         r = self.t.__getitem__(val)
-        if type(r) == int:
+        if isinstance(r, int):
             return r
         else:
             return Shape(r)
@@ -39,9 +39,9 @@ def __sub__(self, val):
         return self.__binary_arithmetic_operation(val, lambda x, y: x - y)
 
     def __binary_arithmetic_operation(self, val, op):
-        if type(val) == int:
+        if isinstance(val, int):
             return Shape([op(el, val) for el in self.t])
-        elif type(val) == Shape:
+        elif isinstance(val, Shape):
             zipped = zip_longest(self.t, val.t, fillvalue=0)
             return Shape([op(a, b) for a, b in zipped])
         else:
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index 48d031d940..91effcf7b5 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -59,9 +59,6 @@ def get_nodeattr_types(self):
             # add_multi compressor specs for synthesis aggregation
             # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
             "add_multi_comp_specs": ("s", False, ""),
-            # Force disable LUT-based compressors (for benchmarking/comparison)
-            # 0 = auto (use compressor when eligible), 1 = force disable
-            "noCompressor": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(MVAU.get_nodeattr_types(self))
         my_attrs.update(RTLBackend.get_nodeattr_types(self))
@@ -193,7 +190,7 @@ def _get_rtl_source_files(self, abspath=True):
         ]
         sourcefiles = [
             os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + f for f in base_files]
+        ] + [rtllib_dir + _ for _ in base_files]
 
         # Add compressor files if dotp_comp was generated
         comp_name = self.get_nodeattr("comp_module_name")
@@ -222,7 +219,6 @@ def instantiate_ip(self, cmd):
 
         for f in sourcefiles:
             cmd.append("add_files -norecurse %s" % (f))
-
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "internal_decoupled" or self.get_nodeattr("mlo_max_iter"):
             cmd.append(
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index 739deb22b7..9a072b1f30 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -28,7 +28,6 @@
 
 import warnings
 from onnx import helper
-from qonnx.core.datatype import DataType
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 
@@ -229,7 +228,8 @@ def _dwc_determine_impl_style(node):
 
 def _mvu_rtl_possible(n, fpgapart, model):
     # Checks whether RTL-based MVU is supported
-    # RTL MVU uses either DSP blocks (for larger bitwidths) or LUT-based compressor (2<=WW<=4 && 2<=AW<=4)
+    # RTL MVU uses either DSP blocks (for larger bitwidths)
+    # or LUT-based compressor (2<=WW<=4 && 2<=AW<=4)
     # Weights must be signed, activations can be unsigned or signed
     # Embedded thresholding and binaryXnorMode are not supported
     node_inst = getCustomOp(n)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index 84aeeaae43..3c2cbfed28 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -749,10 +749,7 @@ def test_mvau_fifocharacterize_rtlsim(
 @pytest.mark.parametrize("pe", [1, 9, 18])
 @pytest.mark.parametrize("simd", [1, 16, 32])
 @pytest.mark.parametrize(
-    "idt_wdt", [
-        [DataType["UINT4"], DataType["INT4"]],
-        [DataType["UINT8"], DataType["INT8"]],
-    ]
+    "idt_wdt", [[DataType["UINT4"], DataType["INT4"]], [DataType["UINT8"], DataType["INT8"]]]
 )
 @pytest.mark.parametrize(
     "part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e", "xc7z020clg400-1"]
@@ -785,9 +782,6 @@ def test_fpgadataflow_rtl_mvau(
     ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
     ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    # if 7 series, force weights to narrow range
-    # if part == "xc7z020clg400-1":
-    #     W = np.clip(W, wdt.min() + 1, wdt.max())
     model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())

From 362751157b06786bc76ccd5f4ba49e0c3ee8ae06 Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 19 May 2026 10:00:03 +0100
Subject: [PATCH 09/10] pre-commit

---
 src/finn/compressor/src/passes/compressor_constructor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/compressor/src/passes/compressor_constructor.py b/src/finn/compressor/src/passes/compressor_constructor.py
index 1cb3bd093d..e7b7ad7192 100644
--- a/src/finn/compressor/src/passes/compressor_constructor.py
+++ b/src/finn/compressor/src/passes/compressor_constructor.py
@@ -100,7 +100,7 @@ def post_const_goal(x):
                 enable=enable,
             )
             c.stages.append(acc)
-        # if we dont accumulate, we can choose between a pipelined 
+        # if we don't accumulate, we can choose between a pipelined
         # or non-pipelined quaternary final adder when using Versal.
         elif max(c.stages[-1].output_shape) > 1:
             final_stage = CompressionStage()

From 732684f95c3c7917aad6e22a6c78fd05245987fd Mon Sep 17 00:00:00 2001
From: Simon Gerber <simon.gerber@amd.com>
Date: Tue, 2 Jun 2026 08:29:17 +0100
Subject: [PATCH 10/10] Use config-specific dotp module names to avoid
 potential multi-MVAU collisions

---
 finn-rtllib/mvu/mvu_vvu_axi.sv                |  2 +-
 .../compressor/hdl/dotp_comp_tb_template.sv   |  2 +-
 src/finn/compressor/hdl/dotp_comp_template.sv | 10 +++---
 src/finn/compressor/src/dotp_finn.py          |  8 ++++-
 .../rtl/matrixvectoractivation_rtl.py         | 34 ++++++++++++++++---
 .../rtl/vectorvectoractivation_rtl.py         | 27 ++++++++++-----
 6 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0df301c767..86eccf7ca1 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -318,7 +318,7 @@ module mvu_vvu_axi #(
 		localparam int unsigned  NUM_LANES = A_WIDTH == WEIGHT_WIDTH? 1 : 1 + (A_WIDTH - !NARROW_WEIGHTS - WEIGHT_WIDTH) / MIN_LANE_WIDTH;
 
 		if(USE_COMPRESSOR) begin : genCompressor
-			dotp_comp #(
+			$DOTP_MODULE_NAME$ #(
 				.PE(PE), .SIMD(DSP_SIMD),
 				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
diff --git a/src/finn/compressor/hdl/dotp_comp_tb_template.sv b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
index 8d15d92759..4d214651f9 100644
--- a/src/finn/compressor/hdl/dotp_comp_tb_template.sv
+++ b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
@@ -39,7 +39,7 @@ module dotp_comp_{full_sig}_tb;
 	uwire  vld;
 	accu_t [PE-1:0]  p;
 
-	dotp_comp #(
+	{dotp_module} #(
 		.PE(PE), .SIMD(SIMD),
 		.WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
diff --git a/src/finn/compressor/hdl/dotp_comp_template.sv b/src/finn/compressor/hdl/dotp_comp_template.sv
index 5cb3119a0e..14d7f2dd78 100644
--- a/src/finn/compressor/hdl/dotp_comp_template.sv
+++ b/src/finn/compressor/hdl/dotp_comp_template.sv
@@ -12,12 +12,12 @@
  * Drop-in replacement for DSP-based compute cores in the MVU.
  * Uses a generated compressor tree for the reduction.
  *
- *		This file is a TEMPLATE — $COMP_MODULE_NAME$ is substituted
- *		at code generation time with the config-specific compressor
- *		module name (e.g. comp_8xs2s2).
+ *		This file is a TEMPLATE with the following substitutions:
+ *		- $DOTP_MODULE_NAME$ → config-specific wrapper name (e.g. dotp_comp_8xs2s2_a16)
+ *		- $COMP_MODULE_NAME$ → compressor module name (e.g. comp_8xs2s2_a16)
  *****************************************************************************/
 
-module dotp_comp #(
+module $DOTP_MODULE_NAME$ #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  WEIGHT_WIDTH,
@@ -151,4 +151,4 @@ module dotp_comp #(
 		end
 	end
 
-endmodule : dotp_comp
+endmodule : $DOTP_MODULE_NAME$
diff --git a/src/finn/compressor/src/dotp_finn.py b/src/finn/compressor/src/dotp_finn.py
index 3b52dd4179..77dfac7459 100644
--- a/src/finn/compressor/src/dotp_finn.py
+++ b/src/finn/compressor/src/dotp_finn.py
@@ -171,14 +171,17 @@ def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_di
     )
 
     # Expand dotp_comp template with the generated module name
+    # Use config-specific module name to avoid collisions in multi-MVAU builds
     src_dir = os.path.dirname(os.path.abspath(__file__))
     compressor_root = os.path.abspath(os.path.join(src_dir, ".."))
     dotp_comp_template = os.path.join(compressor_root, "hdl", "dotp_comp_template.sv")
-    dotp_comp_path = os.path.join(output_dir, "dotp_comp.sv")
+    dotp_module_name = f"dotp_{comp_name}"
+    dotp_comp_path = os.path.join(output_dir, f"{dotp_module_name}.sv")
     expand_template(
         dotp_comp_template,
         dotp_comp_path,
         {
+            "$DOTP_MODULE_NAME$": dotp_module_name,
             "$COMP_MODULE_NAME$": comp_name,
             "$EXPECTED_SIMD$": str(simd),
             "$EXPECTED_NA$": str(na),
@@ -191,6 +194,7 @@ def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_di
 
     return {
         "comp_name": comp_name,
+        "dotp_module_name": dotp_module_name,
         "comp_delay": comp_delay,
         "files": [dotp_comp_path, comp_path],
     }
@@ -273,11 +277,13 @@ def main():
                 f"dotp template not found: {template_path}. "
                 f"Use --dotp-template or --skip-dotp-template."
             )
+        dotp_module_name = f"dotp_{comp_name}"
         dotp_path = os.path.join(args.output_dir, args.dotp_output_name)
         expand_template(
             template_path,
             dotp_path,
             {
+                "$DOTP_MODULE_NAME$": dotp_module_name,
                 "$COMP_MODULE_NAME$": comp_name,
                 "$EXPECTED_SIMD$": str(args.simd),
                 "$EXPECTED_NA$": str(na),
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index 91effcf7b5..3b16688e79 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -54,6 +54,8 @@ def get_nodeattr_types(self):
             "pumpedCompute": ("i", False, 0, {0, 1}),
             # Compressor module name (set by generate_hdl when compressor is used)
             "comp_module_name": ("s", False, ""),
+            # dotp_comp wrapper module name (set by generate_hdl when dotp compressor is used)
+            "dotp_module_name": ("s", False, ""),
             # add_multi compressor module names, semicolon-separated
             "add_multi_comp_names": ("s", False, ""),
             # add_multi compressor specs for synthesis aggregation
@@ -183,25 +185,29 @@ def _get_rtl_source_files(self, abspath=True):
 
         base_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
         ]
         sourcefiles = [
             os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in base_files]
+        ] + [rtllib_dir + f for f in base_files]
 
         # Add compressor files if dotp_comp was generated
         comp_name = self.get_nodeattr("comp_module_name")
         if comp_name:
             comp_hdl_dir = os.path.join(os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
-            sourcefiles.append(os.path.join(code_gen_dir, "dotp_comp.sv"))
+            dotp_module_name = self.get_nodeattr("dotp_module_name")
+            sourcefiles.append(os.path.join(code_gen_dir, f"{dotp_module_name}.sv"))
             sourcefiles.append(os.path.join(comp_hdl_dir, "mul_comp_map.sv"))
             sourcefiles.append(os.path.join(code_gen_dir, comp_name + ".sv"))
+            # Use local mvu_vvu_axi.sv with substituted $DOTP_MODULE_NAME$
+            sourcefiles.append(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"))
             # dotp_comp path doesn't need add_multi.sv
         else:
-            # DSP path: add_multi.sv always exists in code_gen_dir
+            # DSP path: use local mvu_vvu_axi.sv (placeholder replaced with a dummy name)
+            sourcefiles.append(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"))
+            # add_multi.sv always exists in code_gen_dir
             # (either patched with comps or copy of template)
             sourcefiles.append(os.path.join(code_gen_dir, "add_multi.sv"))
             add_multi_names_str = self.get_nodeattr("add_multi_comp_names")
@@ -358,6 +364,16 @@ def generate_hdl(self, model, fpgapart, clk):
             code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(result["comp_delay"])]
             code_gen_dict["$USE_COMPRESSOR$"] = [str(1)]
             self.set_nodeattr("comp_module_name", result["comp_name"])
+            self.set_nodeattr("dotp_module_name", result["dotp_module_name"])
+            # Copy mvu_vvu_axi.sv and substitute $DOTP_MODULE_NAME$
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            with open(os.path.join(rtllib_dir, "mvu_vvu_axi.sv"), "r") as f:
+                mvu_vvu_axi_content = f.read()
+            mvu_vvu_axi_content = mvu_vvu_axi_content.replace(
+                "$DOTP_MODULE_NAME$", result["dotp_module_name"]
+            )
+            with open(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"), "w") as f:
+                f.write(mvu_vvu_axi_content)
         else:
             # DSP path: Generate add_multi.sv with compressors
             result = generate_add_multi_comps(
@@ -369,6 +385,16 @@ def generate_hdl(self, model, fpgapart, clk):
                 # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
                 specs_str = ";".join(f"{n},{w},{d}" for n, w, d in result.get("comp_specs", []))
                 self.set_nodeattr("add_multi_comp_specs", specs_str)
+            # Copy mvu_vvu_axi.sv and substitute placeholder with dummy name
+            # (not used since USE_COMPRESSOR=0, but Vivado parses entire file)
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            with open(os.path.join(rtllib_dir, "mvu_vvu_axi.sv"), "r") as f:
+                mvu_vvu_axi_content = f.read()
+            mvu_vvu_axi_content = mvu_vvu_axi_content.replace(
+                "$DOTP_MODULE_NAME$", "dotp_comp"  # Dummy name, won't be instantiated
+            )
+            with open(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"), "w") as f:
+                f.write(mvu_vvu_axi_content)
 
         # add general parameters to dictionary
         code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
index 187ae3ec9f..cfcbbad8bd 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
@@ -149,17 +149,17 @@ def instantiate_ip(self, cmd):
         node_name = self.onnx_node.name
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        sourcefiles = [
+        rtllib_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
             "add_multi.sv",
         ]
         sourcefiles = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in sourcefiles]
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+            os.path.join(code_gen_dir, "mvu_vvu_axi.sv"),  # Local copy with substituted placeholder
+        ] + [rtllib_dir + _ for _ in rtllib_files]
 
         for f in sourcefiles:
             cmd.append("add_files -norecurse %s" % (f))
@@ -219,6 +219,17 @@ def generate_hdl(self, model, fpgapart, clk):
         ) as f:
             f.write(template_wrapper)
 
+        # Copy mvu_vvu_axi.sv and substitute placeholder with dummy name
+        # (not used since USE_COMPRESSOR=0, but Vivado parses entire file)
+        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+        with open(os.path.join(rtllib_dir, "mvu_vvu_axi.sv"), "r") as f:
+            mvu_vvu_axi_content = f.read()
+        mvu_vvu_axi_content = mvu_vvu_axi_content.replace(
+            "$DOTP_MODULE_NAME$", "dotp_comp"  # Dummy name, won't be instantiated
+        )
+        with open(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"), "w") as f:
+            f.write(mvu_vvu_axi_content)
+
         if self.get_nodeattr("mem_mode") == "internal_decoupled":
             if self.get_nodeattr("ram_style") == "ultra" and not is_versal(fpgapart):
                 runtime_writeable = self.get_nodeattr("runtime_writeable_weights")
@@ -297,17 +308,17 @@ def get_rtl_file_list(self, abspath=False):
             code_gen_dir = ""
             rtllib_dir = ""
 
-        verilog_files = [
+        rtllib_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
             "add_multi.sv",
         ]
         verilog_files = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in verilog_files]
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+            os.path.join(code_gen_dir, "mvu_vvu_axi.sv"),  # Local copy with substituted placeholder
+        ] + [rtllib_dir + _ for _ in rtllib_files]
 
         return verilog_files