diff --git a/finn-rtllib/mvu/add_multi.sv b/finn-rtllib/mvu/add_multi.sv
index 6b45d42e5a..25f5b9a411 100644
--- a/finn-rtllib/mvu/add_multi.sv
+++ b/finn-rtllib/mvu/add_multi.sv
@@ -28,7 +28,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Pipelined multi-input adder tree.
+ * @brief	Pipelined multi-input adder using LUT-based compressors.
  * @author	Thomas B. Preußer <thomas.preusser@amd.com>
  *****************************************************************************/
 
@@ -50,13 +50,65 @@ module add_multi import mvu_pkg::*; #(
 	output	logic [SUM_WIDTH-1:0]  sum
 );
 
-	localparam int unsigned  L = $clog2(N);  // Number of levels with reductions
+//---------------------------------------------------------------------------
+// Compressor Path
+//
+// CATCH_COMP entries instantiate a generated compressor module for a
+// specific (N, ARG_WIDTH, delay) triple.  The macro transposes arg[i][j]
+// to the column-major bit-vector expected by the compressor and pads any
+// remaining DEPTH with a shift-register delay.
+//
+// Generated compressors have no en port — when en=0, upstream holds
+// inputs stable and the downstream accumulator does not latch, so
+// correctness is preserved.
 
-	uwire [SUM_WIDTH-1:0]  sum0;
-	if(L < 1) begin : genTrivial
+`define CATCH_COMP(n,w,d) \
+else if(!RESET_ZERO && (N == n) && (ARG_WIDTH == w) && (DEPTH >= d) && (0 <= ARG_LO)) begin : genComp``n``u``w``_d``d \
+	initial $display("[ADD_MULTI_PATH] COMP N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH); \
+\
+	uwire [N*ARG_WIDTH-1:0]  in; \
+	uwire [SUM_WIDTH  -1:0]  out; \
+	for(genvar  i = 0; i < N; i++) begin : genIn \
+		for(genvar  j = 0; j < ARG_WIDTH; j++) begin : genBit \
+				assign	in[j*N+i] = arg[i][j]; \
+		end : genBit \
+	end : genIn \
+	comp_``n``u``w``_d``d comp_inst ( \
+		.clk, \
+		.in, .out \
+	); \
+	initial assert($bits(out) >= $bits(comp_inst.out)) else $warning("CATCH_COMP(%0d,%0d,%0d): compressor output width %0d > SUM_WIDTH %0d", n, w, d, $bits(comp_inst.out), SUM_WIDTH); \
+\
+	localparam int unsigned  COMP_DELAY = d; \
+	localparam int unsigned  SUM_DELAY = DEPTH - COMP_DELAY; \
+	if(SUM_DELAY == 0)  assign  sum = out; \
+	else begin : genDelay \
+		logic [SUM_WIDTH-1:0]  SumZ[SUM_DELAY] = '{ default: '0 }; \
+		always_ff @(posedge clk) begin \
+			if(rst)  SumZ <= '{ default: '0 }; \
+			else if(en) begin \
+				for(int unsigned  i = 0; i < SUM_DELAY-1; i++)  SumZ[i] <= SumZ[i+1]; \
+				SumZ[SUM_DELAY-1] <= out; \
+			end \
+		end \
+		assign	sum = SumZ[0]; \
+	end : genDelay \
+end : genComp``n``u``w``_d``d
+
+	if(0) begin end
+	// FINN_GENERATED_COMP_ENTRIES
+
+//- Generic Behavioral Addition ---------
+	else begin : genGeneric
+
+	localparam int unsigned  L = $clog2(N);  // Tree levels
+
+	logic [SUM_WIDTH-1:0]  sum0;
+	if(L < 1) begin : genPassThrough
 		assign	sum0 = arg[0];
-	end : genTrivial
+	end : genPassThrough
 	else begin : genTree
+		initial $display("[ADD_MULTI_PATH] TREE N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH);
 		localparam int unsigned  D = L < DEPTH? L : DEPTH;  // Pipeline stages absorbed by tree
 
 		// Compute the count of decendents for all nodes in the reduction trees.
@@ -117,16 +169,18 @@ module add_multi import mvu_pkg::*; #(
 	// Delay Output if requested DEPTH exceeds Tree Height
 	if(DEPTH <= L)  assign  sum = sum0;
 	else begin : genDelay
-		localparam logic [SUM_WIDTH-1:0]  SUM_RESET = {(SUM_WIDTH){RESET_ZERO? 1'b0 : 1'bx}};
-		logic [SUM_WIDTH-1:0]  SumZ[DEPTH - L] = '{ default: SUM_RESET };
+		localparam int unsigned  DELAY = DEPTH - L;
+		logic [SUM_WIDTH-1:0]  SumZ[DELAY] = '{ default: '0 };
 		always_ff @(posedge clk) begin
-			if(rst)  SumZ <= '{ default: SUM_RESET };
-			else begin
-				for(int unsigned  i = 0; i < DEPTH-L-1; i++)  SumZ[i] <= SumZ[i+1];
-				SumZ[DEPTH-L-1] <= sum0;
+			if(rst)  SumZ <= '{ default: '0 };
+			else if(en) begin
+				for(int unsigned  i = 0; i < DELAY-1; i++)  SumZ[i] <= SumZ[i+1];
+				SumZ[DELAY-1] <= sum0;
 			end
 		end
 		assign	sum = SumZ[0];
 	end : genDelay
 
+	end : genGeneric
+
 endmodule : add_multi
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index a890ac9aa3..86eccf7ca1 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,6 +64,13 @@ module mvu_vvu_axi #(
 	bit FORCE_BEHAVIORAL = 0,
 	bit M_REG_LUT = 1,
 
+	// LUT-based compressor tree pipeline depth. This is set by default for maximum pipelining (in between every stage).
+	int unsigned  COMP_PIPELINE_DEPTH = 1,
+
+	// Set at generation time: indicates whether compressors were generated (i.e. were deemed worth it).
+	// Decides whether to use LUT-based compressors instead of DSPs.
+	bit USE_COMPRESSOR = 0,
+
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH    = PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
@@ -310,7 +317,19 @@ module mvu_vvu_axi #(
 		localparam int unsigned  A_WIDTH = 25 + 2*(VERSION > 1);     // Width of A datapath
 		localparam int unsigned  NUM_LANES = A_WIDTH == WEIGHT_WIDTH? 1 : 1 + (A_WIDTH - !NARROW_WEIGHTS - WEIGHT_WIDTH) / MIN_LANE_WIDTH;
 
-		if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
+		if(USE_COMPRESSOR) begin : genCompressor
+			$DOTP_MODULE_NAME$ #(
+				.PE(PE), .SIMD(DSP_SIMD),
+				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+				.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH)
+			) core (
+				.clk(ap_clk), .rst, .en('1),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		end : genCompressor
+		else if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8
 			initial $info("Sidestepping to INT8 mode of DSP58 for %0dx%0d.", WEIGHT_WIDTH, ACTIVATION_WIDTH);
 			mvu_vvu_8sx9_dsp58 #(
 				.IS_MVU(IS_MVU),
@@ -343,11 +362,14 @@ module mvu_vvu_axi #(
 
 	if(1) begin : blkOutput
 		localparam int unsigned  CORE_PIPELINE_DEPTH =
-			VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
-			/* else */    3 + $clog2(SIMD+1) + (SIMD == 1);
-
-		// This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD.
-		localparam int unsigned  MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH;
+			USE_COMPRESSOR? COMP_PIPELINE_DEPTH :
+			VERSION == 3?   3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :
+			/* else */      3 + $clog2(SIMD+1) + (SIMD == 1);
+
+		// Floor MAX_IN_FLIGHT at the DSP-equivalent depth so that the compressor path (shallow pipeline) does not shrink it below the DSP path's value.
+		localparam int unsigned  DSP_PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
+		localparam int unsigned  MAX_IN_FLIGHT =
+			CORE_PIPELINE_DEPTH > DSP_PIPELINE_DEPTH? CORE_PIPELINE_DEPTH : DSP_PIPELINE_DEPTH;
 		typedef logic [PE-1:0][ACCU_WIDTH-1:0]  output_t;
 
 		logic signed [$clog2(MAX_IN_FLIGHT+1):0]  OPtr = '1;	// -1 | 0, 1, ..., MAX_IN_FLIGHT
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 9815d67629..47ffa96ac5 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -45,6 +45,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	NARROW_WEIGHTS = $NARROW_WEIGHTS$,
 	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter	SEGMENTLEN = $SEGMENTLEN$,
+	parameter	COMP_PIPELINE_DEPTH = $COMP_PIPELINE_DEPTH$,
+	parameter	USE_COMPRESSOR = $USE_COMPRESSOR$,
 
 	// Safely deducible parameters
 	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -81,7 +83,8 @@ mvu_vvu_axi #(
 `endif
 	.IS_MVU(IS_MVU), .VERSION(VERSION), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
 	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+	.COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH), .USE_COMPRESSOR(USE_COMPRESSOR)
 	) inst (
 	.ap_clk(ap_clk),
 	.ap_clk2x(ap_clk2x),
diff --git a/src/finn/compressor/Makefile b/src/finn/compressor/Makefile
new file mode 100644
index 0000000000..7df3e6963e
--- /dev/null
+++ b/src/finn/compressor/Makefile
@@ -0,0 +1,17 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Build automation for compressor testing and generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Default: no constant absorption
+CA?=
+.PHONY: default clean
+
+default:
+	./run_tests.sh $(CA)
+clean:
+	rm -rf *.log *.jou *.vivado .Xil xvlog.pb gen/*
diff --git a/src/finn/compressor/README.md b/src/finn/compressor/README.md
new file mode 100644
index 0000000000..b80dbe6a43
--- /dev/null
+++ b/src/finn/compressor/README.md
@@ -0,0 +1,103 @@
+<!--
+Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+SPDX-License-Identifier: BSD-3-Clause
+-->
+
+# Python Compressor Generator
+This tool can generate compressor trees for 7-Series, UltraScale(+) and Versal for arbitrary input shapes.
+
+# Getting started
+1. Part of the FINN framework (integrated into MVAU RTL backend).
+2. _standalone compressor generation_ requires no external dependencies.
+
+## FINN Integration
+The compressor is automatically invoked during MVAU layer specialization (`SpecializeLayers` transformation).
+FINN selects between the RTL compressor, RTL DSP and HLS implementations based on the node parameters.
+See the [MVAU compressor integration flow diagram](mvau_compressor_inegration_flow.svg) for the complete decision tree.
+
+**Key integration files:**
+- `src/finn/transformation/fpgadataflow/specialize_layers.py` - RTL vs HLS selection logic
+- `src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py` - FINN-side RTL MVAU integration with compressor path selection
+- `src/finn/compressor/src/dotp_finn.py` - FINN wrapper for dot-product compressor generation
+- `src/finn/compressor/src/add_multi_finn.py` - FINN wrapper for multi-operand adder generation
+- `finn-rtllib/mvu/mvu_vvu_axi.sv` - RTL template that instantiates generated compressors
+
+This project implements either the full dotp unit of the node with a compressor implementation, or optimizes the add_multi additions of the DSP lanes when the RTL DSP path is invoked.
+
+## Standalone Usage
+Generate a compressor of shape `(12,12,12)` called `comp` and save it under `gen/comp12_12_12.sv`:
+
+```python3 -m finn.compressor.src.main -s 12,12,12 -n comp -o gen/comp12_12_12.sv```
+
+See `python3 -m finn.compressor.src.main -h` for details.
+
+## Testing
+Run the test suite for verification on different platforms:
+
+```bash
+# Core compressor tests (21 configs)
+./run_tests.sh "" versal        # or 7series, ultrascale
+
+# MVAU integration tests (8 configs)
+./run_dotp_comp_tests.sh versal # or 7series, ultrascale
+
+# Multi-operand adder tests (8 configs)
+./run_add_multi_comp_tests.sh versal # or 7series, ultrascale
+```
+
+## Features
+### Custom Input Shape
+The tool can generate compressors for any input shape. A shape is passed as a comma-separated list. Each number indicates a column's height. *LSB* is *left*, *MSB* is *right*.
+
+### Accumulation
+By passing `-a`, the tool generates an accumulator instead of just an adder. The accumulator's width can be specified by `-w`.
+### Gate Absorption
+If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same as Vivado's LUT2 primitive:
+| Secondary Input | Primary Input | Output
+|-----------------|---------------|----------------
+|0	              |0	          |(DIGIT >> 0) & 1
+|0	              |1	          |(DIGIT >> 1) & 1
+|1	              |0	          |(DIGIT >> 2) & 1
+|1	              |1	          |(DIGIT >> 3) & 1
+
+For example, `8` maps to an AND gate and `6` maps to an XOR gate.
+
+In CLI, gates can be specified as a flat string like `-g 883ABC`. The *LSB* is *left* and *MSB* is *right*. The leftmost specified gate corresponds to the LSB input in the generated compressor input vector.
+
+### Target
+Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t {Versal,7-Series,UltraScale}`.
+
+### Automated Testing
+The tool can automatically generate a SystemVerilog testbench to fuzzy-test the generated compressors by passing `--test`. For testing, the `xvlog`, `xelab` and `xsim` commands have to be available.
+
+### Custom Pipeline Depth
+Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined.
+This excludes the `Quaternary Adder`, which can be split into two stages when not used in accumulation. The pipelined version is the default if `-a` is not passed.
+
+### Constant Input
+Aside from the regular, variable compressor inputs, the tool also supports an additional constant input. It can be specified as a binary number by `-c NUMBER`.
+
+# Implementation Details - How the Code is Structured
+The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`.
+Compressor construction is done in several passes:
+1. Create a graph with all scheduled counters and a final adder (in `src/passes/compressor_constructor.py`).
+    1. (Optional) Generate a gate absorption stage.
+    2. Generate regular compression stages until the compression goal is reached.
+    3. Insert pipeline registers between compressor stages.
+    4. Build either a final adder or an accumulator as the final stage.
+2. Annotate LUT6CY instances with placement constraints so that the LUT Cascade will be utilized (in `src/passes/lut_placer.py`).
+3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`).
+4. Annotate input and output signals in the compressor (in `src/passes/io_annotator.py`).
+5. Emit generated SystemVerilog source (in `src/passes/emitter.py`)
+
+## Extending the Tool
+### Adding new Counters
+Counters without gate absorption are defined in `graph/counters/counter_candidates.py`.
+Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`.
+
+### Adding new Passes
+Before adding new passes over the compressor graph, check whether the simple iterator defined in `node_iterator.py` can be inherited from to save boilerplate code.
+
+# Authors
+This tool was created as a standalone compressor generator by Konstantin Hossfeld and Thomas Preußer. It was extended and integrated into the FINN flow by Simon Gerber.
diff --git a/src/finn/compressor/__init__.py b/src/finn/compressor/__init__.py
new file mode 100644
index 0000000000..5adb8a0127
--- /dev/null
+++ b/src/finn/compressor/__init__.py
@@ -0,0 +1,15 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN compressor package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""FINN compressor — LUT-based compressor tree generator for MVU."""
+
+from .src.add_multi_finn import generate_add_multi_comps
+from .src.dotp_finn import generate_dotp_comp
+
+__all__ = ["generate_add_multi_comps", "generate_dotp_comp"]
diff --git a/src/finn/compressor/gen_dotp_netlist.sh b/src/finn/compressor/gen_dotp_netlist.sh
new file mode 100755
index 0000000000..ea35a66b04
--- /dev/null
+++ b/src/finn/compressor/gen_dotp_netlist.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Generate standalone dotp compressor netlist for inspection or integration.
+# Output is a self-contained RTL directory that can be simulated or synthesized.
+#
+# Usage: Edit parameters below, then run: ./gen_dotp_netlist.sh
+#############################################################################
+
+# === Configuration ===
+SIMD=256
+WW=4
+AW=4
+ACCU_WIDTH=16
+SIGNED_WEIGHTS=0      # 0=unsigned, 1=signed
+SIGNED_ACT=0          # 0=unsigned, 1=signed
+TARGET="Versal"       # Versal, 7-Series, UltraScale
+# =====================
+
+set -e
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+export PYTHONPATH="$(cd "$SCRIPT_DIR/../../.." && pwd):${PYTHONPATH:-}"
+
+# Build output directory name from config
+LABEL="simd${SIMD}_w${WW}_a${AW}"
+[ "$SIGNED_WEIGHTS" -eq 0 ] && LABEL="${LABEL}_uw"
+[ "$SIGNED_ACT" -eq 1 ] && LABEL="${LABEL}_sa"
+LABEL="${LABEL}_$(echo "$TARGET" | tr '[:upper:]' '[:lower:]' | tr -d '-')"
+OUT_DIR="$SCRIPT_DIR/gen/$LABEL"
+mkdir -p "$OUT_DIR"
+
+echo "Generating dotp compressor netlist"
+echo "  Config: SIMD=$SIMD, WW=$WW, AW=$AW, ACCU=$ACCU_WIDTH"
+echo "  Target: $TARGET"
+echo "  Output: $OUT_DIR"
+echo ""
+
+# Build flags
+FLAGS=""
+[ "$SIGNED_WEIGHTS" -eq 0 ] && FLAGS="--unsigned_weights"
+[ "$SIGNED_ACT" -eq 1 ] && FLAGS="$FLAGS --signed_activations"
+
+# Generate compressor core and dotp wrapper
+python3 -m finn.compressor.src.dotp_finn \
+    --simd "$SIMD" --ww "$WW" --aw "$AW" \
+    --accu_width "$ACCU_WIDTH" $FLAGS \
+    --target "$TARGET" \
+    --dotp-template "$SCRIPT_DIR/hdl/dotp_comp_template.sv" \
+    --dotp-output-name dotp_comp.sv \
+    -o "$OUT_DIR"
+
+# Include mul_comp_map for complete netlist
+cp "$SCRIPT_DIR/hdl/mul_comp_map.sv" "$OUT_DIR/"
+
+echo ""
+echo "Generated files:"
+ls -1 "$OUT_DIR"/*.sv
+echo ""
+echo "Done. Netlist ready in: $OUT_DIR"
diff --git a/src/finn/compressor/hdl/add_multi_comp_tb_template.sv b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv
new file mode 100644
index 0000000000..b5327262ce
--- /dev/null
+++ b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv
@@ -0,0 +1,142 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench template for add_multi compressor cores
+ * @author	Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Standalone testbench for add_multi compressor (comp_NuW_dD).
+ * Tests the compressor directly without requiring add_multi.sv.
+ *
+ * Template placeholders expanded by run_add_multi_comp_tests.sh:
+ *   {n}           - Number of addends
+ *   {arg_width}   - Bit width of each addend
+ *   {depth}       - Pipeline depth of compressor
+ *   {label}       - Configuration label (e.g. n8_w4_p2)
+ *   {comp_module} - Generated compressor module name (e.g. comp_8u4_d0)
+ *****************************************************************************/
+
+module add_multi_comp_{label}_tb;
+
+	localparam int unsigned  N         = {n};
+	localparam int unsigned  ARG_WIDTH = {arg_width};
+	localparam int unsigned  DEPTH     = {depth};
+	localparam int unsigned  IN_WIDTH  = N * ARG_WIDTH;
+	// Use same formula as mvu_pkg::sumwidth() for consistency
+	localparam int unsigned  SUM_WIDTH = $clog2(N) + ARG_WIDTH;
+	localparam int unsigned  ROUNDS    = 257;
+
+	//-----------------------------------------------------------------------
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+
+	logic  rst = 1;
+	initial begin
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	bit  done = 0;
+	always_comb begin
+		if(done)  $finish;
+	end
+
+	//-----------------------------------------------------------------------
+	// DUT: direct compressor instantiation
+	logic [IN_WIDTH-1:0]   in;
+	logic [SUM_WIDTH-1:0]  out;
+
+	{comp_module} dut (
+		.clk,
+		.in,
+		.out
+	);
+
+	//-----------------------------------------------------------------------
+	// Transpose function: convert row-major to column-major format.
+	//
+	// The compressor expects inputs in column-major (bit-slice) order:
+	//   in[0..N-1]       = bit 0 of all N addends
+	//   in[N..2N-1]      = bit 1 of all N addends
+	//   ...
+	//   in[(W-1)*N..W*N-1] = bit W-1 of all N addends
+	//
+	// This matches the transpose in add_multi.sv CATCH_COMP macro:
+	//   assign in[j*N+i] = arg[i][j];
+	//
+	// Without this transpose, addend bits would be misaligned and produce
+	// incorrect sums.
+	//-----------------------------------------------------------------------
+	function automatic logic [IN_WIDTH-1:0] transpose(
+		input logic [IN_WIDTH-1:0] row_major
+	);
+		logic [IN_WIDTH-1:0] col_major;
+		for(int i = 0; i < N; i++) begin
+			for(int j = 0; j < ARG_WIDTH; j++) begin
+				col_major[j*N + i] = row_major[i*ARG_WIDTH + j];
+			end
+		end
+		return col_major;
+	endfunction
+
+	//-----------------------------------------------------------------------
+	// Input Feed
+	int  Q[$];
+	initial begin
+		in = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(ROUNDS) begin
+			automatic logic [IN_WIDTH-1:0]  aa;
+			automatic int  exp = 0;
+			void'(std::randomize(aa));
+
+			// Compute expected sum from row-major input
+			for(int unsigned i = 0; i < N; i++) begin
+				exp += aa[i*ARG_WIDTH +: ARG_WIDTH];
+			end
+
+			// Transpose to column-major before feeding compressor
+			in <= transpose(aa);
+			Q.push_back(exp);
+			@(posedge clk);
+		end
+
+		in <= 'x;
+		repeat(DEPTH + 10) @(posedge clk);
+
+		assert(Q.size == 0) else begin
+			$error("Missing %0d outputs.", Q.size);
+		end
+		done = 1;
+	end
+
+	//-----------------------------------------------------------------------
+	// Output Checker
+	int unsigned  Checks = 0;
+	int unsigned  Errors = 0;
+	initial begin
+		@(posedge clk iff !rst);
+		repeat(DEPTH) @(posedge clk);
+		repeat(ROUNDS) @(posedge clk) begin
+			automatic int  exp = Q.pop_front();
+			automatic int  hav = out;
+			assert(hav == exp) else begin
+				$error("Output mismatch %0d instead of %0d.", hav, exp);
+				$stop;
+				Errors <= Errors + 1;
+			end
+			Checks <= Checks + 1;
+		end
+	end
+
+	final begin
+		$display("Performed %0d checks with %0d errors.", Checks, Errors);
+		assert(Checks == ROUNDS) else  $error("Unexpected number of checks: %0d instead of %0d.", Checks, ROUNDS);
+	end
+
+endmodule : add_multi_comp_{label}_tb
diff --git a/src/finn/compressor/hdl/add_multi_comp_template.tcl b/src/finn/compressor/hdl/add_multi_comp_template.tcl
new file mode 100644
index 0000000000..7402bf2df6
--- /dev/null
+++ b/src/finn/compressor/hdl/add_multi_comp_template.tcl
@@ -0,0 +1,34 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for add_multi compressor testbench
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Template placeholders expanded by run_add_multi_comp_tests.sh ({part}, used below for create_project -part, is a placeholder too):
+#   {label}   - Configuration label (e.g. n8_w4_p2)
+#   {tb}      - Testbench module name
+#   {gen_dir} - Absolute path to gen/<label>/
+
+set label {label}
+set tb {tb}
+set part {part}
+create_project -force add_multi_comp_$label add_multi_comp_$label.vivado -part $part
+
+# Design sources: only the generated compressor
+read_verilog -sv {*}[glob {gen_dir}/comp_*.sv]
+
+# Testbench
+set simset [current_fileset -simset]
+add_files -fileset $simset {gen_dir}/{tb}.sv
+set_property top $tb $simset
+set_property xsim.simulate.runtime all $simset
+
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/dotp_comp_tb_template.sv b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
new file mode 100644
index 0000000000..4d214651f9
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_tb_template.sv
@@ -0,0 +1,275 @@
+/******************************************************************************
+ * Testbench for LUT-based dotp_comp module.
+ * Exercises the finnlib protocol: clk, rst, en, last, zero, w, a -> vld, p
+ *
+ * Generated from template for config: PE={pe}, SIMD={simd}, WW={ww}, AW={aw}
+ ******************************************************************************/
+module dotp_comp_{full_sig}_tb;
+
+	localparam int unsigned  ROUNDS = 217;
+
+	localparam int unsigned  PE   = {pe};
+	localparam int unsigned  SIMD = {simd};
+	localparam int unsigned  WEIGHT_WIDTH     = {ww};
+	localparam int unsigned  ACTIVATION_WIDTH = {aw};
+	localparam int unsigned  ACCU_WIDTH       = {accu_width};
+	localparam bit  SIGNED_ACTIVATIONS = {signed_act};
+
+	typedef logic signed [WEIGHT_WIDTH    -1:0]  weight_t;
+	typedef logic        [ACTIVATION_WIDTH-1:0]  activation_t;
+	typedef logic signed [ACCU_WIDTH      -1:0]  accu_t;
+
+	//-----------------------------------------------------------------------
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst = 1;
+	initial begin
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	//-----------------------------------------------------------------------
+	// DUT
+	logic  en;
+	logic  last;
+	logic  zero;
+	weight_t     [PE-1:0][SIMD-1:0]  w;
+	activation_t         [SIMD-1:0]  a;
+	uwire  vld;
+	accu_t [PE-1:0]  p;
+
+	{dotp_module} #(
+		.PE(PE), .SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.COMP_PIPELINE_DEPTH({comp_depth})
+	) dut (
+		.clk, .rst, .en,
+		.last, .zero, .w, .a,
+		.vld, .p
+	);
+
+	//-----------------------------------------------------------------------
+	// Input Feed & Reference Model
+	accu_t [PE-1:0]  Q[$];
+	int unsigned  RoundsPushed = 0;
+
+	// Drive one dot-product round with given weights/activations on a single
+	// enabled cycle (en=1, last=1, zero=0).  Computes the expected accumulator
+	// value and pushes it into the checker queue.
+	// Results drain naturally when en resumes in subsequent tests or the
+	// final flush — no en=1 drain cycles needed.
+	task automatic feed_single(
+		input weight_t     [PE-1:0][SIMD-1:0]  ww,
+		input activation_t         [SIMD-1:0]  aa
+	);
+		automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+		for(int unsigned  pe = 0; pe < PE; pe++) begin
+			for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+				pp[pe] += $signed(ww[pe][simd])
+					* $signed({SIGNED_ACTIVATIONS && aa[simd][ACTIVATION_WIDTH-1], aa[simd]});
+			end
+		end
+		en   <= 1;
+		last <= 1;
+		zero <= 0;
+		w    <= ww;
+		a    <= aa;
+		@(posedge clk);
+		en   <= 0;
+		last <= 'x;
+		zero <= 'x;
+		w    <= 'x;
+		a    <= 'x;
+		Q.push_back(pp);
+		RoundsPushed++;
+	endtask : feed_single
+
+	task automatic feed_zero_round();
+		automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+		en   <= 1;
+		last <= 1;
+		zero <= 1;
+		w    <= '0;
+		a    <= '0;
+		@(posedge clk);
+		en   <= 0;
+		last <= 'x;
+		zero <= 'x;
+		w    <= 'x;
+		a    <= 'x;
+		Q.push_back(pp);
+		RoundsPushed++;
+	endtask : feed_zero_round
+
+	initial begin
+		en = 0;
+		last = 'x;
+		zero = 'x;
+		w = 'x;
+		a = 'x;
+		@(posedge clk iff !rst);
+
+		//---------------------------------------------------------------
+		// Directed edge-case tests
+		//---------------------------------------------------------------
+
+		// All zeros
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
+			automatic activation_t         [SIMD-1:0]  aa = '0;
+			feed_single(ww, aa);
+		end
+
+		// Zero round via zero flag
+		feed_zero_round();
+
+		// All ones
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '1;
+			automatic activation_t         [SIMD-1:0]  aa = '1;
+			feed_single(ww, aa);
+		end
+
+		// Max positive weights, all-ones activations
+		begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+			automatic activation_t         [SIMD-1:0]  aa = '1;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				for(int unsigned  s = 0; s < SIMD; s++)
+					ww[pe][s] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			feed_single(ww, aa);
+		end
+
+		// Single SIMD lane active (first and last lanes)
+		for(int unsigned  lane = 0; lane < SIMD; lane += (SIMD > 1 ? SIMD-1 : 1)) begin
+			automatic weight_t     [PE-1:0][SIMD-1:0]  ww = '0;
+			automatic activation_t         [SIMD-1:0]  aa = '0;
+			for(int unsigned  pe = 0; pe < PE; pe++)
+				ww[pe][lane] = {1'b0, {(WEIGHT_WIDTH-1){1'b1}}};
+			aa[lane] = '1;
+			feed_single(ww, aa);
+		end
+
+		// Multi-cycle accumulation: 3 cycles then last
+		begin
+			automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+			for(int unsigned  cyc = 0; cyc < 3; cyc++) begin
+				automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+				automatic activation_t         [SIMD-1:0]  aa;
+				for(int unsigned  pe = 0; pe < PE; pe++)
+					for(int unsigned  s = 0; s < SIMD; s++)
+						ww[pe][s] = weight_t'(cyc + 1);
+				for(int unsigned  s = 0; s < SIMD; s++)
+					aa[s] = activation_t'(s + 1);
+
+				for(int unsigned  pe = 0; pe < PE; pe++)
+					for(int unsigned  s = 0; s < SIMD; s++)
+						pp[pe] += $signed(ww[pe][s])
+							* $signed({SIGNED_ACTIVATIONS && aa[s][ACTIVATION_WIDTH-1], aa[s]});
+
+				en   <= 1;
+				last <= (cyc == 2) ? 1 : 0;
+				zero <= 0;
+				w    <= ww;
+				a    <= aa;
+				@(posedge clk);
+			end
+			en   <= 0;
+			last <= 'x;
+			zero <= 'x;
+			w    <= 'x;
+			a    <= 'x;
+			Q.push_back(pp);
+			RoundsPushed++;
+		end
+
+		//---------------------------------------------------------------
+		// Randomized tests
+		//---------------------------------------------------------------
+		repeat(ROUNDS) begin
+			automatic accu_t [PE-1:0]  pp = '{ default: '0 };
+			do begin
+				en <= 0;
+				last <= 'x;
+				zero <= 'x;
+				w <= 'x;
+				a <= 'x;
+				if($urandom()%31 != 0) begin
+					en <= 1;
+					if($urandom()%19 == 0)  zero <= 1;
+					else begin
+						automatic weight_t     [PE-1:0][SIMD-1:0]  ww;
+						automatic activation_t         [SIMD-1:0]  aa;
+						void'(std::randomize(ww, aa));
+
+						for(int unsigned  pe = 0; pe < PE; pe++)
+							for(int unsigned  simd = 0; simd < SIMD; simd++)
+								pp[pe] += $signed(ww[pe][simd])
+									* $signed({SIGNED_ACTIVATIONS && aa[simd][ACTIVATION_WIDTH-1], aa[simd]});
+
+						zero <= 0;
+						w <= ww;
+						a <= aa;
+					end
+					last <= $urandom() % 137 == 0;
+				end
+				@(posedge clk);
+			end
+			while(!en || !last);
+			Q.push_back(pp);
+			RoundsPushed++;
+		end
+
+		// Flush: keep en=1 with zero=1 for pipeline to drain
+		en <= 1;
+		last <= 0;
+		zero <= 1;
+		w <= '0;
+		a <= '0;
+		repeat(20) @(posedge clk);
+
+		assert(Q.size == 0) else begin
+			$error("Missing %0d outputs.", Q.size);
+			$stop;
+		end
+
+		$display("Test completed successfully.");
+		$finish;
+	end
+
+	//-----------------------------------------------------------------------
+	// Output Checker
+	int unsigned  Checks = 0;
+	always_ff @(posedge clk iff !rst) begin
+		if(en && vld) begin
+			automatic accu_t [PE-1:0]  exp;
+
+			assert(Q.size > 0) else begin
+				$error("Spurious output: %0p.", p);
+				$stop;
+			end
+
+			exp = Q.pop_front();
+			assert(p === exp) else begin
+				$error("Output mismatch: got %0p, expected %0p.", p, exp);
+				$stop;
+			end
+
+			Checks <= Checks + 1;
+		end
+	end
+
+	final begin
+		assert(Checks == RoundsPushed)
+			$display("Successfully performed %0d checks (%0d directed + %0d random).",
+				Checks, RoundsPushed - ROUNDS, ROUNDS);
+		else
+			$error("Unexpected number of checks: %0d instead of %0d.",
+				Checks, RoundsPushed);
+	end
+
+endmodule : dotp_comp_{full_sig}_tb
diff --git a/src/finn/compressor/hdl/dotp_comp_template.sv b/src/finn/compressor/hdl/dotp_comp_template.sv
new file mode 100644
index 0000000000..14d7f2dd78
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_template.sv
@@ -0,0 +1,154 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	RTL template for dot product compressor with accumulation
+ * @author	Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * LUT-based dot product with fused accumulation.
+ * Drop-in replacement for DSP-based compute cores in the MVU.
+ * Uses a generated compressor tree for the reduction.
+ *
+ *		This file is a TEMPLATE with the following substitutions:
+ *		- $DOTP_MODULE_NAME$ → config-specific wrapper name (e.g. dotp_comp_8xs2s2_a16)
+ *		- $COMP_MODULE_NAME$ → compressor module name (e.g. comp_8xs2s2_a16)
+ *****************************************************************************/
+
+module $DOTP_MODULE_NAME$ #(
+	int unsigned  PE,  // number of compressor instances generated in genPE
+	int unsigned  SIMD,  // number of weight/activation pairs reduced per instance
+	int unsigned  WEIGHT_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  ACCU_WIDTH,  // width of each compressor output and of p[pe]
+	bit  SIGNED_ACTIVATIONS = 0,
+	int unsigned  COMP_PIPELINE_DEPTH = 1  // length of the last -> vld shift register
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]   w,
+	input	logic        [SIMD-1:0][ACTIVATION_WIDTH-1:0]       a,
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+	// The vld shift register below is indexed [1:COMP_PIPELINE_DEPTH], hence depth >= 1.
+	initial begin
+		if(COMP_PIPELINE_DEPTH < 1) begin
+			$error("%m: COMP_PIPELINE_DEPTH (%0d) must be >= 1.", COMP_PIPELINE_DEPTH);
+			$finish;
+		end
+	end
+
+	//-----------------------------------------------------------------------
+	// Operand Mapping
+	//
+	// The `mul_comp_map` interface handles partial-product broadcasting
+	// mul_comp_map requires NA >= NB.  Weights are always signed.
+	// If activations are wider, swap operands so that ia gets the wider one.
+	localparam bit  SWAPPED = ACTIVATION_WIDTH > WEIGHT_WIDTH;
+
+	localparam int unsigned  NA = SWAPPED ? ACTIVATION_WIDTH : WEIGHT_WIDTH;
+	localparam int unsigned  NB = SWAPPED ? WEIGHT_WIDTH     : ACTIVATION_WIDTH;
+	localparam bit  SIGNED_A    = SWAPPED ? SIGNED_ACTIVATIONS : 1;  // weights always signed
+	localparam bit  SIGNED_B    = SWAPPED ? 1 : SIGNED_ACTIVATIONS;
+
+	// Input to Matrix Broadcasting: map0 supplies NM and the columns()/height() shape queries
+	uwire [NA-1:0]  map0_ia = SWAPPED ? NA'(a[0])    : NA'(w[0][0]);
+	uwire [NB-1:0]  map0_ib = SWAPPED ? NB'(w[0][0]) : NB'(a[0]);
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
+		map0 (.ia(map0_ia), .ib(map0_ib));
+	localparam int unsigned  NM = $bits(map0.oa);
+
+	//-----------------------------------------------------------------------
+	// Pipeline shift register for last -> vld (advances only while en is high)
+/* verilator lint_off LITENDIAN */
+	logic [1:COMP_PIPELINE_DEPTH]  L = '0;
+/* verilator lint_on LITENDIAN */
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en) begin
+			L[1] <= last;
+			for(int unsigned  i = 2; i <= COMP_PIPELINE_DEPTH; i++)
+				L[i] <= L[i-1];
+		end
+	end
+	assign	vld = L[COMP_PIPELINE_DEPTH];
+
+	//-----------------------------------------------------------------------
+	// PE-parallel compressor instances
+	//-----------------------------------------------------------------------
+	for(genvar  pe = 0; pe < PE; pe++) begin : genPE
+
+		// Partial product matrix broadcasting: one mul_comp_map per SIMD lane
+		uwire [NM-1:0]  oa[SIMD];
+		uwire [NM-1:0]  ob[SIMD];
+		for(genvar  i = 0; i < SIMD; i++) begin : genMap
+			uwire [NA-1:0]  map_ia = SWAPPED ? NA'(a[i])    : NA'(w[pe][i]);
+			uwire [NB-1:0]  map_ib = SWAPPED ? NB'(w[pe][i]) : NB'(a[i]);
+			mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
+				map_i (.ia(map_ia), .ib(map_ib));
+			assign	oa[i] = map_i.oa;
+			assign	ob[i] = map_i.ob;
+		end : genMap
+
+		// Flatten all matrices column by column: within a column, lane k's bits precede lane k+1's
+		logic [SIMD*NM-1:0]  comp_a;
+		logic [SIMD*NM-1:0]  comp_b;
+		always_comb begin : blkFlatten
+			automatic int unsigned  src_idx[SIMD] = '{ default: 0 };  // next unread bit of each lane's matrix
+			automatic int unsigned  dst_idx = 0;  // next bit to write in comp_a / comp_b
+			for(int unsigned  col = 0; col < map0.columns(); col++) begin
+				for(int unsigned  k = 0; k < SIMD; k++) begin
+					for(int unsigned  row = 0; row < map0.height(col); row++) begin
+						comp_a[dst_idx] = oa[k][src_idx[k]];
+						comp_b[dst_idx] = ob[k][src_idx[k]];
+						src_idx[k]++;
+						dst_idx++;
+					end
+				end
+			end
+		end : blkFlatten
+
+		// Compressor with fused accumulation
+		// $COMP_MODULE_NAME$ is replaced at code generation time with the
+		// config-specific compressor module (e.g. comp_8xs2s2).
+		uwire [ACCU_WIDTH-1:0]  comp_out;
+		$COMP_MODULE_NAME$ comp_inst (
+			.clk,
+			.in(comp_b),
+			.in_2(comp_a),
+			.rst(rst || last),  // NOTE(review): presumably restarts the accumulation after the final element - confirm against the generated core
+			.en_neg(rst || zero),  // NOTE(review): semantics of en_neg are defined by the generated core - confirm
+			.en(en),
+			.out(comp_out)
+		);
+
+		assign	p[pe] = $signed(comp_out);
+
+	end : genPE
+
+	//-----------------------------------------------------------------------
+	// Parameter Validation ($EXPECTED_...$ tokens are presumably substituted by the generator like the module names)
+	//-----------------------------------------------------------------------
+	initial begin
+		if (SIMD != $EXPECTED_SIMD$ || NA != $EXPECTED_NA$ || NB != $EXPECTED_NB$ ||
+		    SIGNED_A != $EXPECTED_SIGNED_A$ || SIGNED_B != $EXPECTED_SIGNED_B$ ||
+		    ACCU_WIDTH != $EXPECTED_ACCU_WIDTH$) begin
+			$warning("%m: CRITICAL - dotp_comp parameter mismatch! SIMD=%0d (expected %0d), NA=%0d (expected %0d), NB=%0d (expected %0d), SIGNED_A=%0d (expected %0d), SIGNED_B=%0d (expected %0d), ACCU_WIDTH=%0d (expected %0d)",
+			         SIMD, $EXPECTED_SIMD$, NA, $EXPECTED_NA$, NB, $EXPECTED_NB$,
+			         SIGNED_A, $EXPECTED_SIGNED_A$, SIGNED_B, $EXPECTED_SIGNED_B$,
+			         ACCU_WIDTH, $EXPECTED_ACCU_WIDTH$);
+		end
+	end
+
+endmodule : $DOTP_MODULE_NAME$
diff --git a/src/finn/compressor/hdl/dotp_comp_template.tcl b/src/finn/compressor/hdl/dotp_comp_template.tcl
new file mode 100644
index 0000000000..41eed8bdbf
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_comp_template.tcl
@@ -0,0 +1,34 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for dot product compressor with accumulation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Create Fresh Project ({label}, {src_dir} and {part} appear to be generator placeholders)
+set label {label}
+set src_dir {src_dir}
+set tb dotp_comp_{label}_tb
+set part {part}
+create_project -force dotp_comp_$label dotp_comp_$label.vivado -part $part
+
+# Import Design and Simulation Sources
+# Static: mul_comp_map interface
+# Expanded: dotp_comp.sv (template with $COMP_MODULE_NAME$ filled in)
+# Generated: comp_<sig>.sv (config-specific compressor core)
+read_verilog -sv $src_dir/hdl/mul_comp_map.sv $src_dir/gen/$label/dotp_comp.sv {*}[glob $src_dir/gen/$label/comp_*.sv]
+set simset [current_fileset -simset]
+add_files -fileset $simset $src_dir/gen/$label/$tb.sv
+set_property file_type SystemVerilog [get_files -of_objects $simset $src_dir/gen/$label/$tb.sv]
+set_property top $tb $simset
+set_property xsim.simulate.runtime all $simset
+
+# Run Simulation; a launch failure is only reported via puts, then the script closes and quits
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/dotp_tb_template.sv b/src/finn/compressor/hdl/dotp_tb_template.sv
new file mode 100644
index 0000000000..aa2165ac67
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_tb_template.sv
@@ -0,0 +1,60 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench template for standalone dot product compressor
+ *****************************************************************************/
+
+module dotp_{n}x{sa}{na}{sb}{nb}_tb #(
+	localparam int unsigned  N = {n},
+	localparam int unsigned  NA = {na},
+	localparam int unsigned  NB = {nb},
+	localparam bit  SIGNED_A = {signed_a},
+	localparam bit  SIGNED_B = {signed_b},
+	// NP: result width, same expression as in the dotp module template
+	localparam int unsigned  NP = NA > 1?
+		$clog2(N) + (!SIGNED_B && (NB == 1)? NA : NA+NB) :
+		SIGNED_A ^^ SIGNED_B? 1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/,
+	localparam bit  SIGNED_P = NA == 1? SIGNED_A ^^ SIGNED_B : SIGNED_A || SIGNED_B
+)();
+	logic  clk = 0;
+	always #5ns clk = ~clk;  // 10ns period = 100MHz
+
+	logic [N-1:0][NA-1:0]  a;
+	logic [N-1:0][NB-1:0]  b;
+	uwire [NP-1:0]  p;
+
+	dotp_{n}x{sa}{na}{sb}{nb} dut (
+		.clk,
+		.a, .b, .p
+	);
+	// Stimulus and check: 137 random operand sets, one at a time, each verified before the next is applied
+	initial begin
+		repeat(137) begin
+			automatic type(a)  aa;
+			automatic type(b)  bb;
+			automatic int  pp = 0;  // reference dot product
+			automatic int  px;      // DUT result widened to int
+			void'(std::randomize(aa, bb));
+			for(int unsigned  i = 0; i < N; i++) begin
+				automatic logic  sa = SIGNED_A && aa[i][NA-1];  // extension bit: sign bit if signed, else 0
+				automatic logic  sb = SIGNED_B && bb[i][NB-1];
+				pp += $signed({sa, aa[i]}) * $signed({sb, bb[i]});
+			end
+			// Apply the operands and wait {depth}+1 clock edges before sampling p
+			a <= aa;
+			b <= bb;
+			repeat({depth} + 1) @(posedge clk);
+			px = $signed({ SIGNED_P && p[NP-1], p });
+			assert((^p !== 1'bx) && (px == pp)) else begin  // fails on any X/Z bit in p or on a value mismatch
+				$error("Received %0d [0x%0x] instead of %0d.", px, p, pp);
+				$stop;
+			end
+		end
+
+		$display("Test completed.");
+		$finish;
+	end
+
+endmodule : dotp_{n}x{sa}{na}{sb}{nb}_tb
diff --git a/src/finn/compressor/hdl/dotp_template.sv b/src/finn/compressor/hdl/dotp_template.sv
new file mode 100644
index 0000000000..944fc8fc76
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_template.sv
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	RTL template for standalone dot product compressor
+ *****************************************************************************/
+
+module dotp_{n}x{sa}{na}{sb}{nb} #(
+	localparam int unsigned  N = {n},
+	localparam int unsigned  NA = {na},
+	localparam int unsigned  NB = {nb},
+	localparam bit  SIGNED_A = {signed_a},
+	localparam bit  SIGNED_B = {signed_b},
+	localparam int unsigned  NP = NA > 1?
+		$clog2(N) + (!SIGNED_B && (NB == 1)? NA : NA+NB) :
+		SIGNED_A ^^ SIGNED_B? 1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/
+)(
+	input	logic  clk,
+
+	input	logic [N-1:0][NA-1:0]  a,
+	input	logic [N-1:0][NB-1:0]  b,
+	output	logic [NP-1:0]  p
+);
+
+	// Input to Matrix Broadcasting: map0 serves element 0 and also provides NM and the shape queries
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map0 (.ia(a[0]), .ib(b[0]));
+	localparam int unsigned  NM = $bits(map0.oa);
+	uwire [NM-1:0]  oa[N];
+	uwire [NM-1:0]  ob[N];
+	assign	oa[0] = map0.oa;
+	assign	ob[0] = map0.ob;
+	for(genvar  i = 1; i < N; i++) begin
+		mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map_i (.ia(a[i]), .ib(b[i]));
+		assign	oa[i] = map_i.oa;
+		assign	ob[i] = map_i.ob;
+	end
+
+	// Flatten all Matrices Column by Column: within a column, element i's bits precede element i+1's
+	logic [N*NM-1:0]  comp_a;
+	logic [N*NM-1:0]  comp_b;
+	always_comb begin
+		automatic int unsigned  src_idx[N] = '{ default: 0 };  // next unread bit of each element's matrix
+		automatic int unsigned  dst_idx = 0;  // next bit to write in comp_a / comp_b
+		for(int unsigned  col = 0; col < map0.columns(); col++) begin
+			for(int unsigned  i = 0; i < N; i++) begin
+				for(int unsigned  row = 0; row < map0.height(col); row++) begin
+					comp_a[dst_idx] = oa[i][src_idx[i]];
+					comp_b[dst_idx] = ob[i][src_idx[i]];
+					src_idx[i]++;
+					dst_idx++;
+				end
+			end
+		end
+	end
+	// Compressor sum plus a generator-supplied constant ({abs_term}; presumably the accumulated absolute term - confirm)
+	uwire signed [NP-1:0]  comp_p;
+	uwire signed [NP-1:0]  abs_p = {abs_term};
+	comp_{n}x{sa}{na}{sb}{nb} comp (
+		.clk,
+		.in(comp_b), .in_2(comp_a),
+		.out(comp_p)
+	);
+	assign	p = comp_p + abs_p;
+
+endmodule : dotp_{n}x{sa}{na}{sb}{nb}
diff --git a/src/finn/compressor/hdl/dotp_template.tcl b/src/finn/compressor/hdl/dotp_template.tcl
new file mode 100644
index 0000000000..1916676012
--- /dev/null
+++ b/src/finn/compressor/hdl/dotp_template.tcl
@@ -0,0 +1,28 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Vivado simulation script for standalone dot product compressor
+#############################################################################
+
+# Create Fresh Project ({n}, {sa}, {na}, {sb}, {nb} and {part} appear to be generator placeholders)
+set sig {n}x{sa}{na}{sb}{nb}
+set top dotp_$sig
+set part {part}
+create_project -force $top $top.vivado -part $part
+
+# Import Design and Simulation Sources (paths are relative to the directory Vivado is started in)
+read_verilog -sv hdl/mul_comp_map.sv gen/comp_$sig.sv gen/$top.sv
+set simset [current_fileset -simset]
+add_files -fileset $simset gen/${top}_tb.sv
+set_property top ${top}_tb $simset
+set_property xsim.simulate.runtime all $simset
+
+# Run Simulation; a launch failure is only reported via puts, then the script closes and quits
+if {[catch {launch_simulation} err]} {
+    puts "ERROR: Simulation failed: $err"
+}
+close_sim
+
+quit
diff --git a/src/finn/compressor/hdl/mul_comp_map.sv b/src/finn/compressor/hdl/mul_comp_map.sv
new file mode 100644
index 0000000000..7049c34ea4
--- /dev/null
+++ b/src/finn/compressor/hdl/mul_comp_map.sv
@@ -0,0 +1,239 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Multiplier-to-compressor mapping module for gate absorption
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>, Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Broadcasts multiplication inputs to feed a bit product matrix for compression.
+ *
+ * @description
+ *	This interface component broadcasts multiplication inputs to produce a bit
+ *	product matrix like the one below. The output is flattened for the
+ *	ingestion by a compressor with the indicated indices:
+ *
+ *	                                   [6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                        [10]a3.b1  [7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	             [13]a3.b2  [11]a2.b2  [8]a1.b2  [5]a0.b2
+ *	  [15]a3.b3  [14]a2.b3  [12]a1.b3  [9]a0.b3
+ *
+ *	Functions designated to informing about the produced shape are provided:
+ *	  - columns()   - the number of columns in the matrix shape.
+ *	  - height(col) - the height of the specified column.
+ *	Additionally, the bit product operator is identified for each index by:
+ *	  - gate_op(idx) - the assumed bit product operator as hex LUT code.
+ *
+ *	In the case of unsigned operands, all bit products require to be computed
+ *	as AND gates (8), i.e. m[i] = oa[i] & ob[i].
+ *
+ * The operands can be specified to be signed, which will effect these changes
+ * to produce the correct functionality:
+ *
+ * SIGNED_A
+ * --------
+ *	The sign extensions of the multiples of input a are not materialized.
+ *	Instead, this identity with s := a_{NA-1} & b_i is applied:
+ *		s ... s  s
+ *		----------
+ *		        !s
+ *		        -1
+ *	In consequence:
+ *	  - The `gate_op()` for the left matrix boundary is identified as NAND (7).
+ *	  - The `absolute_term()` function returns a value of
+ *	       (-2^NB + 1) * 2^{NA-1}
+ *	    that must be added to the matrix sum for the correct product value.
+ *
+ * SIGNED_B
+ * --------
+ *	The sign extension of input b is not materialized.
+ *	Instead, the multiple of a by the sign bit of b is weighted negatively,
+ *	which expands the produced matrix as follows:
+ *
+ *	                                             [ 6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                                  [11]a3.b1  [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	                       [14]a3.b2  [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [17]0!b3! [16]a3!b3  [15]a2!b3  [13]a1!b3  [ 9]a0!b3
+ *	        -1                                   [10]   b3
+ *	-----------------------------------------------------------------------------------
+ *	                                  [10]a0!b3  [ 6]a3.b0  [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                                  [11]a3.b1  [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	                       [14]a3.b2  [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [17]0!b3! [16]a3!b3  [15]a2!b3  [13]a1!b3  [ 9]a0.b3
+ *	        -1
+ *
+ *	using:
+ *		- a.b  :=    a & b
+ *		- a!b  :=   !a & b
+ *		- a!b! := !(!a & b)
+ *
+ *	In consequence:
+ *	  - The bit sizes of the outputs are wider and the `columns()` count is larger.
+ *	  - The `gate_op()` at the shown indices is identified as 2 or D.
+ *	Note that the height of the matrix grows to NB+1 if NA > NB.
+ *
+ * SIGNED_A & SIGNED_B
+ * -------------------
+ *	Both approaches are combined for a purely signed multiplication:
+ *
+ *	                        [10]a0!b3  [ 6]a3.b0! [3]a2.b0  [1]a1.b0  [0]a0.b0
+ *	                        [11]a3.b1! [ 7]a2.b1  [4]a1.b1  [2]a0.b1
+ *	             [14]a3.b2! [12]a2.b2  [ 8]a1.b2  [5]a0.b2
+ *	  [16]a3!b3! [15]a2!b3  [13]a1!b3  [ 9]a0.b3
+ *	         -1         -1         -1         -1
+ *
+ *	using:
+ *		- a.b  :=    a & b
+ *		- a!b  :=   !a & b
+ *		- a.b! := !( a & b)
+ *		- a!b! := !(!a & b)
+ *	In consequence:
+ *	  - The bit sizes of the outputs are wider.
+ *	  - The `gate_op()` at the shown indices is properly identified.
+ *	  - The `absolute_term()` function returns a value of
+ *	       (-2^NB + 1) * 2^{NA-1}
+ *	    that must be added to the matrix sum for the correct product value.
+ *	Note that the height of the matrix grows to NB+1 if NA > NB.
+ */
+
+interface mul_comp_map #(
+	int unsigned  NA,	// bit width of multiplicand
+	int unsigned  NB,	// bit width of multiplier
+	bit  SIGNED_A,		// signed multiplicand
+	bit  SIGNED_B,		// signed multiplier
+
+	// Extra bits due to sign handling and total output size
+	localparam int unsigned  NX = (NA == 1) || !SIGNED_B? 0 : SIGNED_A? 1 : 2,
+	localparam int unsigned  NM = NA*NB + NX
+)(
+	// Input Operands
+	input	logic [NA-1:0]  ia,  // Multiplicand
+	input	logic [NB-1:0]  ib   // Multiplier
+);
+	// Bit Matrix Broadcasts
+	logic [NM-1:0]  oa;
+	logic [NM-1:0]  ob;
+
+
+	// Operand length support is not symmetrical.
+	initial begin
+		if(NA < NB) begin
+			$error("%m: Switch multiplication operands.");
+			$finish;
+		end
+	end
+	// Number of columns of the bit product matrix.
+	function int unsigned columns();
+		return  NA == 1? 1 : NB + NA - (!SIGNED_B || SIGNED_A);
+	endfunction : columns
+	// Number of matrix entries in column `col`; 0 beyond the last column.
+	function int unsigned height(input int unsigned  col);
+		if(NA == 1)  return  col < 1;
+		else begin
+			automatic int unsigned  ret =
+				(col <  NB)?      col + 1 :
+				(col <  NA)?      NB :
+				(col <  NB+NA-1)? NB+NA-1 - col :
+				(col == NB+NA-1)? SIGNED_B && !SIGNED_A :
+				/* else */        0;
+			if(SIGNED_B && (col == NB))  ret++;  // extra sign-handling entry wired at index NB*(NB+1)/2
+			return  ret;
+		end
+	endfunction : height
+	// Constant that must be added to the matrix sum to obtain the product value.
+	function bit signed [NA+NB-1:0] absolute_term();
+		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? -1 : 0;
+		else begin
+			automatic bit signed [NA+NB-1:0]  ret = '{
+				NA+NB-1: SIGNED_A || SIGNED_B,
+				NA-1:    SIGNED_A,
+				default: 0
+			};
+			return  ret;
+		end
+	endfunction : absolute_term
+
+
+	// Beyond the tip of left triangle at column of height 1
+	localparam int unsigned  HIGH = NM - (SIGNED_B && !SIGNED_A);
+	// Bit product operator for matrix index `idx` as a hex LUT code (8 = AND, 7 = NAND).
+	function bit [3:0] gate_op(input int unsigned  idx);
+		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? 7 : 8;
+		else begin
+			automatic bit [3:0]  op = 8; // AND
+
+			if(SIGNED_B) begin
+				automatic bit  inv = 0;
+				// Negative weight for sign-bit row
+				for(int unsigned  col = 0; col < NB; col++) begin
+					if(idx == HIGH-1 - col*(col+1)/2)  inv = 1;
+				end
+				if(idx == HIGH)  inv = 1;
+				if(inv)  op = { op[1:0], op[3:2] };  // 8 -> 2: swapping the LUT halves inverts the oa input
+				if((idx == HIGH) && !SIGNED_A)  op = ~op;
+			end
+			// NOTE(review): for SIGNED_A && SIGNED_B the indices below look shifted vs. genMatrix (NA=NB=2: 5,2 here; ia[1] is wired to oa[4],oa[1]) - confirm.
+			if(SIGNED_A) begin
+				automatic bit  inv = 0;
+				// NAND along left matrix boundary
+				for(int unsigned  col = 0; col < NB; col++) begin
+					if(idx == HIGH - (col+1)*(col+2)/2 + (SIGNED_B && (col < NB-1)))  inv = 1;
+				end
+				if(inv)  op = ~op;
+			end
+
+			return  op;
+		end
+	endfunction : gate_op
+
+	//-----------------------------------------------------------------------
+	// Broadcast Wiring: exactly one of genTrivial / genMatrix is elaborated
+	if(NA == 1) begin : genTrivial
+		assign	oa[0] = ia[0];
+		assign	ob[0] = ib[0];
+	end : genTrivial
+	else begin : genMatrix  // must be exclusive: for NA == 1 it would drive oa[0]/ob[0] again and index past NM
+
+		// Feed right triangle going right to left until first full-height column
+		for(genvar  col = 0; col < NB; col++) begin
+			localparam int unsigned  TOP = col*(col+1)/2;
+			for(genvar  row = 0; row <= col; row++) begin
+				assign	oa[TOP+row] = ia[col-row];
+				assign	ob[TOP+row] = ib[row];
+			end
+		end
+
+		// Feed central full-height rectangle for NA > NB
+		for(genvar  col = 0; col < NA-NB; col++) begin
+			localparam int unsigned  TOP = NB*(NB+1)/2 + col*NB + SIGNED_B;
+			for(genvar  row = 0; row < NB; row++) begin
+				assign  oa[TOP + row] = ia[NB+col - row];
+				assign  ob[TOP + row] = ib[row];
+
+			end
+		end
+
+		// Feed left triangle going left to right up to last column with a receded height
+		for(genvar  col = 0; col < NB-1; col++) begin
+			localparam int unsigned  BOT = HIGH - col*(col+1)/2 - 1;
+			for(genvar  row = 0; row <= col; row++) begin
+				assign	oa[BOT-row] = ia[NA-1-col+row];
+				assign	ob[BOT-row] = ib[NB-1-row];
+			end
+		end
+
+		// Feed extra elements created for sign handling
+		if(SIGNED_B) begin
+			assign	oa[NB*(NB+1)/2] = ia[0];
+			assign	ob[NB*(NB+1)/2] = ib[NB-1];
+			if(!SIGNED_A) begin
+				assign	oa[HIGH] = 0;
+				assign	ob[HIGH] = ib[NB-1];
+			end
+		end
+
+	end : genMatrix
+
+endinterface : mul_comp_map
diff --git a/src/finn/compressor/hdl/mul_comp_map_tb.sv b/src/finn/compressor/hdl/mul_comp_map_tb.sv
new file mode 100644
index 0000000000..c7432f1d9b
--- /dev/null
+++ b/src/finn/compressor/hdl/mul_comp_map_tb.sv
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench for multiplier-to-compressor mapping verification
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>, Simon Gerber <simon.gerber@amd.com>
+ *****************************************************************************/
+
+/**
+ * Quick visualizer for compressor input broadcasting.
+ */
+
+module mul_comp_map_tb;
+	localparam int unsigned  NA = 5;
+	localparam int unsigned  NB = 4;
+	localparam bit  SIGNED_A = 1;
+	localparam bit  SIGNED_B = 1;
+	logic [NA-1:0]  a;
+	logic [NB-1:0]  b;
+	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map (.ia(a), .ib(b));
+	// Prints one line per matrix column, each entry as <oa bit>.<gate_op>.<ob bit>, then the absolute term.
+	initial begin
+		automatic int unsigned  col = 0;  // column currently being printed
+		automatic int unsigned  row = 0;  // entries already printed in that column
+		a = '0;
+		b = '1;
+
+		#5ns;
+		for(int unsigned  i = 0; i < $bits(map.oa); i++) begin
+			$write("\t%0b.%0d.%0b", map.oa[i], map.gate_op(i), map.ob[i]);
+			if(++row == map.height(col)) begin  // column complete: end the line and move on
+				$display();
+				col++;
+				row = 0;
+			end
+		end
+		$display("\t%0b", map.absolute_term());
+	end
+
+endmodule : mul_comp_map_tb
diff --git a/src/finn/compressor/lib/test_common.sh b/src/finn/compressor/lib/test_common.sh
new file mode 100644
index 0000000000..0f2e03b177
--- /dev/null
+++ b/src/finn/compressor/lib/test_common.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Common shell utilities for compressor testing
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Common test utilities for compressor integration tests.
+# Source this file from test scripts.
+
+# Worker pool state: workers maps PID -> label, errcodes maps label -> exit code (declare failures are ignored)
+declare -A workers 2>/dev/null || true
+declare -A errcodes 2>/dev/null || true
+
+# Reap finished workers, polling every 5 seconds until at most $1 are still running.
+function collect_workers {
+	local pid label code
+	while true; do
+		for pid in "${!workers[@]}"; do
+			# Signal 0 only probes for existence; workers that are still alive are skipped.
+			kill -0 "$pid" 2>/dev/null && continue
+			label="${workers[$pid]}"
+			wait "$pid"
+			code=$?
+			errcodes[$label]=$code
+			unset "workers[$pid]"
+			echo "- $label -> $code"
+		done
+		[ "${#workers[@]}" -le "$1" ] && return 0
+		sleep 5
+	done
+}
+
+# Run function $2 on label $1 in the background and register its PID in the worker pool.
+function start_worker {
+	local label="$1" pid
+	echo "+ $label ..."
+	"$2" "$label" & pid=$!
+	workers[$pid]="$label"
+}
+
+# Check Vivado output file for errors. Returns error count.
+# Usage: check_vivado_errors <output_file> <label>
+function check_vivado_errors {
+	local out="$1" label="$2"
+	local err_count tcl_err_count success_count
+
+	# Check if output file exists
+	if [ ! -f "$out" ]; then
+		echo "ERROR: Vivado output file not found for $label: $out" >&2
+		return 1
+	fi
+
+	# Check for Vivado errors
+	err_count=$(grep -ic '^Error: ' "$out" || true)
+	tcl_err_count=$(grep -Eic "can't read \"|invalid command name|no such variable|^ERROR: \[Common" "$out" || true)
+
+	# Check for positive completion indicators
+	success_count=$(grep -ic "Successfully performed\|Test completed successfully\|Test completed\.\|Performed.*checks" "$out" || true)
+
+	# TCL errors are fatal
+	if [ "$tcl_err_count" -gt 0 ]; then
+		echo "ERROR: Vivado/Tcl failed for $label (tcl_errors=$tcl_err_count)." >&2
+		return 1
+	fi
+
+	# If no Vivado errors but also no success message, simulation may have crashed
+	if [ "$err_count" -eq 0 ] && [ "$success_count" -eq 0 ]; then
+		# Check if simulation even started
+		if ! grep -q "launch_simulation\|xsim.*-runall\|run all" "$out"; then
+			echo "ERROR: Simulation did not run for $label (no launch detected)." >&2
+			return 1
+		fi
+		echo "WARNING: No success message found for $label (may have incomplete simulation)." >&2
+		# Don't fail here, just warn - some tests might not have explicit success messages
+	fi
+
+	return "$err_count"
+}
+
+# Print a colored PASS/FAIL line per entry of the global LABELS array, based on errcodes.
+function print_summary {
+	local label code msg overall=0
+
+	printf 'Summary:\n\n'
+	for label in "${LABELS[@]}"; do
+		code="${errcodes[$label]}"
+		# Anything other than a numeric zero is reported as a failure.
+		if [ "$code" -eq 0 ]; then
+			printf '  %-40s %s\n' "$label" $'\e[92;1mPASS\e[0m'
+		else
+			overall=1
+			printf '  %-40s %s\n' "$label" $'\e[91;1mFAIL\e[0m'" (errors: $code)"
+		fi
+	done
+	echo
+	return "$overall"
+}
diff --git a/src/finn/compressor/mvau_compressor_integration_flow.svg b/src/finn/compressor/mvau_compressor_integration_flow.svg
new file mode 100644
index 0000000000..5d65495a70
--- /dev/null
+++ b/src/finn/compressor/mvau_compressor_integration_flow.svg
@@ -0,0 +1,330 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="1450" height="720" xmlns="http://www.w3.org/2000/svg">
+
+  <!-- Vivado-style block diagram with horizontal decision flow -->
+  <defs>
+    <!-- Arrowhead -->
+    <marker id="arrow" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#333" />
+    </marker>
+    <marker id="arrowGreen" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#2E7D32" />
+    </marker>
+    <marker id="arrowRed" markerWidth="8" markerHeight="8" refX="7" refY="3" orient="auto">
+      <polygon points="0 0, 8 3, 0 6" fill="#C62828" />
+    </marker>
+
+    <!-- Decision diamond pattern -->
+    <pattern id="decisionFill" x="0" y="0" width="4" height="4" patternUnits="userSpaceOnUse">
+      <rect width="4" height="4" fill="#FFF3CD"/>
+    </pattern>
+
+    <!-- Block shadow -->
+    <filter id="blockShadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feGaussianBlur in="SourceAlpha" stdDeviation="2"/>
+      <feOffset dx="2" dy="2" result="offsetblur"/>
+      <feMerge>
+        <feMergeNode/>
+        <feMergeNode in="SourceGraphic"/>
+      </feMerge>
+    </filter>
+  </defs>
+
+  <!-- Grid lines (subtle, like Vivado) -->
+  <g stroke="#E8E8E8" stroke-width="0.5" opacity="0.3">
+    <line x1="0" y1="300" x2="1450" y2="300"/>
+    <line x1="200" y1="0" x2="200" y2="720"/>
+    <line x1="450" y1="0" x2="450" y2="720"/>
+    <line x1="750" y1="0" x2="750" y2="720"/>
+    <line x1="1050" y1="0" x2="1050" y2="720"/>
+    <line x1="1350" y1="0" x2="1350" y2="720"/>
+  </g>
+
+  <!-- ========== TOP ROW: DECISION FLOW (HORIZONTAL) ========== -->
+
+  <!-- Entry Node: MVAU -->
+  <g id="entry" filter="url(#blockShadow)">
+    <rect x="30" y="40" width="220" height="220" rx="5" fill="#E3F2FD" stroke="#1565C0" stroke-width="3"/>
+    <rect x="30" y="40" width="220" height="35" rx="5" fill="#1976D2" stroke="#1565C0" stroke-width="2"/>
+    <text x="140" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="16" font-weight="bold" text-anchor="middle" fill="white">
+      MVAU Node
+    </text>
+    <text x="140" y="125" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle">
+      Pre-specialization
+    </text>
+  </g>
+
+  <!-- Arrow from Entry to Decision 1 (CENTERED) -->
+  <line x1="253" y1="150" x2="297" y2="150" stroke="#333" stroke-width="2.5" marker-end="url(#arrow)"/>
+
+  <!-- Decision 1: RTL vs HLS -->
+  <g id="decision1">
+    <rect x="300" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="300" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="420" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      Decision 1: RTL Eligible?
+    </text>
+
+    <!-- Decision parameters in box -->
+    <text x="310" y="100" font-family="Consolas, monospace" font-size="10" font-weight="bold">Attributes:</text>
+    <text x="310" y="120" font-family="Consolas, monospace" font-size="9">• wdt.signed() == True</text>
+    <text x="310" y="138" font-family="Consolas, monospace" font-size="9">• noActivation == 0</text>
+    <text x="310" y="156" font-family="Consolas, monospace" font-size="9">• binaryXnorMode != 1</text>
+
+    <text x="420" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      specialize_layers.py:218-268
+    </text>
+    <text x="420" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      _mvu_rtl_possible()
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 1 to Decision 2 (YES/pass, CENTERED) -->
+  <line x1="543" y1="150" x2="587" y2="150" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="565" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- Decision 2: DOTP Compressor Eligible -->
+  <g id="decision2">
+    <rect x="590" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="590" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="710" y="58" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      Decision 2: DOTP
+    </text>
+    <text x="710" y="70" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      Compressor Eligible?
+    </text>
+
+    <text x="600" y="90" font-family="Consolas, monospace" font-size="10" font-weight="bold">Bitwidth:</text>
+    <text x="600" y="110" font-family="Consolas, monospace" font-size="9">• ww ≤ 4</text>
+    <text x="600" y="123" font-family="Consolas, monospace" font-size="9">• aw ≤ 4</text>
+
+    <text x="600" y="145" font-family="Consolas, monospace" font-size="10" font-weight="bold">Mode:</text>
+    <text x="600" y="160" font-family="Consolas, monospace" font-size="9">• pumpedCompute == 0</text>
+
+    <text x="710" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      matrixvectoractivation_rtl.py:318
+    </text>
+    <text x="710" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      _is_dotp_comp_eligible()
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 2 to Decision 2.5 (NO/fail) -->
+  <line x1="833" y1="150" x2="877" y2="150" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="855" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Decision 2.5: genINT8 Eligible -->
+  <g id="decision2_5">
+    <rect x="880" y="40" width="240" height="220" rx="5" fill="url(#decisionFill)" stroke="#FFA726" stroke-width="3" filter="url(#blockShadow)"/>
+    <rect x="880" y="40" width="240" height="35" rx="5" fill="#FF9800" stroke="#F57C00" stroke-width="2"/>
+    <text x="1000" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="white">
+      genINT8 Eligible?
+    </text>
+
+    <text x="890" y="90" font-family="Consolas, monospace" font-size="9" font-weight="bold">DSP58 Optimization:</text>
+    <text x="890" y="105" font-family="Consolas, monospace" font-size="8">• VERSION == Versal (DSP58)</text>
+    <text x="890" y="118" font-family="Consolas, monospace" font-size="8">• NUM_LANES ≤ 3</text>
+    <text x="890" y="131" font-family="Consolas, monospace" font-size="8">• WW ≤ 8, AW ≤ 9</text>
+
+    <text x="1000" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      mvu_vvu_axi.sv:332
+    </text>
+    <text x="1000" y="251" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      (genINT8 decision)
+    </text>
+  </g>
+
+  <!-- Arrow from Decision 2.5 to DSP + Lane Compressors (NO/genSoftVec, horizontal) -->
+  <line x1="1123" y1="150" x2="1167" y2="150" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="1145" y="135" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Arrow from Decision 2.5 to genINT8 (YES, downward to terminal row) -->
+  <line x1="1000" y1="263" x2="1000" y2="357" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="1025" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- Terminal: genINT8 (bottom row, aligned under Decision 2.5) -->
+  <g id="terminal_genINT8" filter="url(#blockShadow)">
+    <rect x="880" y="360" width="240" height="180" rx="5" fill="#B3E5FC" stroke="#0277BD" stroke-width="3"/>
+    <rect x="880" y="360" width="240" height="35" rx="5" fill="#0288D1" stroke="#0277BD" stroke-width="2"/>
+    <text x="1000" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      genINT8
+    </text>
+    <text x="1000" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      DSP58 Specialized
+    </text>
+    <text x="1000" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      mvu_vvu_8sx9_dsp58.sv
+    </text>
+    <text x="1000" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      No add_multi needed
+    </text>
+    <text x="1000" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Fits in DSP58 width
+    </text>
+    <line x1="890" y1="475" x2="1110" y2="475" stroke="#0288D1" stroke-width="1"/>
+    <text x="1000" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B">
+      Versal only (DSP58)
+    </text>
+    <text x="1000" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B">
+      VERSION==Versal, lanes≤3, WW≤8, AW≤9
+    </text>
+    <text x="1000" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#01579B" font-style="italic">
+      mvu_vvu_8sx9_dsp58.sv
+    </text>
+  </g>
+
+
+  <!-- ========== BRANCHING ARROWS DOWN TO TERMINALS (CENTERED) ========== -->
+
+  <!-- Decision 1 NO → HLS (CENTERED to box) -->
+  <line x1="430" y1="263" x2="430" y2="357" stroke="#C62828" stroke-width="3" marker-end="url(#arrowRed)"/>
+  <text x="455" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#C62828">NO</text>
+
+  <!-- Decision 2 YES → DOTP Compressor (CENTERED to box) -->
+  <line x1="710" y1="263" x2="710" y2="357" stroke="#2E7D32" stroke-width="3" marker-end="url(#arrowGreen)"/>
+  <text x="735" y="310" font-family="Segoe UI, Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#2E7D32">YES</text>
+
+  <!-- ========== TERMINAL IMPLEMENTATIONS (3 remaining: HLS and DOTP in the bottom row, DSP + Lane Comps in the top row) ========== -->
+
+  <!-- Terminal 1: HLS Path (CENTERED under Decision 1) -->
+  <g id="terminal_hls" filter="url(#blockShadow)">
+    <rect x="320" y="360" width="220" height="180" rx="5" fill="#BDBDBD" stroke="#424242" stroke-width="3"/>
+    <rect x="320" y="360" width="220" height="35" rx="5" fill="#616161" stroke="#424242" stroke-width="2"/>
+    <text x="430" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      HLS Path
+    </text>
+    <text x="430" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Vivado HLS Synthesis
+    </text>
+    <text x="430" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      C++ code generation
+    </text>
+    <text x="430" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Default: resType="lut"
+    </text>
+    <text x="430" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      (DSP via attribute)
+    </text>
+    <line x1="330" y1="475" x2="530" y2="475" stroke="#616161" stroke-width="1"/>
+    <text x="430" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#333">
+      Fallback for all configs
+    </text>
+    <text x="430" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#333">
+      Flexible but less optimal
+    </text>
+    <text x="430" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#555" font-style="italic">
+      matrixvectoractivation_hls.py
+    </text>
+  </g>
+
+  <!-- Terminal 2: DOTP Compressor (CENTERED under Decision 2) -->
+  <g id="terminal_dotp" filter="url(#blockShadow)">
+    <rect x="600" y="360" width="220" height="180" rx="5" fill="#C8E6C9" stroke="#2E7D32" stroke-width="3"/>
+    <rect x="600" y="360" width="220" height="35" rx="5" fill="#43A047" stroke="#2E7D32" stroke-width="2"/>
+    <text x="710" y="383" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      DOTP Compressor
+    </text>
+    <text x="710" y="410" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Full LUT-Based Compute
+    </text>
+    <text x="710" y="430" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Replaces DSP entirely
+    </text>
+    <text x="710" y="445" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      Fused accumulation
+    </text>
+    <text x="710" y="460" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      USE_COMPRESSOR=1
+    </text>
+    <line x1="610" y1="475" x2="810" y2="475" stroke="#43A047" stroke-width="1"/>
+    <text x="710" y="492" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20">
+      comp_&lt;SIMD&gt;x&lt;s|u&gt;&lt;WW&gt;
+    </text>
+    <text x="710" y="507" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20">
+      &lt;s|u&gt;&lt;AW&gt;_a&lt;ACC&gt;.sv
+    </text>
+    <text x="710" y="526" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#1B5E20" font-style="italic">
+      dotp_finn.py, dotp_comp.sv
+    </text>
+  </g>
+
+  <!-- Terminal 3: DSP + Lane Compressors (same row as decisions) -->
+  <g id="terminal_dsp_comp" filter="url(#blockShadow)">
+    <rect x="1170" y="40" width="240" height="220" rx="5" fill="#DCEDC8" stroke="#558B2F" stroke-width="3"/>
+    <rect x="1170" y="40" width="240" height="35" rx="5" fill="#7CB342" stroke="#558B2F" stroke-width="2"/>
+    <text x="1290" y="63" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold" text-anchor="middle" fill="white">
+      DSP + Lane Comps
+    </text>
+    <text x="1290" y="95" font-family="Segoe UI, Arial, sans-serif" font-size="11" text-anchor="middle" font-weight="bold">
+      Hybrid Implementation
+    </text>
+    <text x="1290" y="115" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      genSoftVec: mvu.sv
+    </text>
+    <text x="1290" y="130" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      DSP multiply-accumulate
+    </text>
+    <text x="1290" y="145" font-family="Consolas, monospace" font-size="9" text-anchor="middle">
+      add_multi.sv w/ comps
+    </text>
+    <line x1="1180" y1="160" x2="1400" y2="160" stroke="#7CB342" stroke-width="1"/>
+    <text x="1290" y="177" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#33691E">
+      All lanes use compressors
+    </text>
+    <text x="1290" y="192" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#33691E">
+      comp_&lt;N&gt;u&lt;W&gt;_d&lt;D&gt;.sv
+    </text>
+    <text x="1290" y="240" font-family="Consolas, monospace" font-size="9" text-anchor="middle" fill="#666" font-style="italic">
+      add_multi_finn.py
+    </text>
+  </g>
+
+  <!-- ========== LEGEND ========== -->
+  <g id="legend">
+    <rect x="20" y="570" width="1410" height="130" fill="#F8F9FA" stroke="#999" stroke-width="1.5" rx="5"/>
+
+    <!-- Title -->
+    <text x="40" y="595" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold">
+      Parameter Reference
+    </text>
+
+    <!-- Left side: Parameter definitions -->
+    <text x="40" y="620" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">WW:</tspan> Weight Width (bitwidth) |
+      <tspan font-weight="bold">AW:</tspan> Activation Width (bitwidth) |
+      <tspan font-weight="bold">SIMD:</tspan> Parallelism factor (folding)
+    </text>
+
+    <text x="40" y="640" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">idt:</tspan> Input datatype (idt.bw() → AW) |
+      <tspan font-weight="bold">wdt:</tspan> Weight datatype (wdt.bw() → WW) |
+      <tspan font-weight="bold">pumpedCompute:</tspan> Double-pumping (2x clock)
+    </text>
+
+    <text x="40" y="660" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">version:</tspan> DSP type (1=DSP48E1/7-Series, 2=DSP48E2/UltraScale+, 3=DSP58/Versal)
+    </text>
+
+    <text x="40" y="680" font-family="Consolas, monospace" font-size="12">
+      <tspan font-weight="bold">Platform support:</tspan> RTL backend supports all FPGA families. Compressor paths work on Versal, UltraScale+, 7-Series.
+    </text>
+
+    <!-- Right side: Color legend -->
+    <text x="1230" y="595" font-family="Segoe UI, Arial, sans-serif" font-size="15" font-weight="bold">
+      Implementation Types
+    </text>
+
+    <rect x="1230" y="610" width="15" height="15" fill="#C8E6C9" stroke="#2E7D32" stroke-width="1"/>
+    <text x="1250" y="621" font-family="Segoe UI, Arial, sans-serif" font-size="12">LUT-based</text>
+
+    <rect x="1230" y="632" width="15" height="15" fill="#DCEDC8" stroke="#558B2F" stroke-width="1"/>
+    <text x="1250" y="643" font-family="Segoe UI, Arial, sans-serif" font-size="12">Hybrid</text>
+
+    <rect x="1230" y="654" width="15" height="15" fill="#B3E5FC" stroke="#0277BD" stroke-width="1"/>
+    <text x="1250" y="665" font-family="Segoe UI, Arial, sans-serif" font-size="12">DSP-based (genINT8)</text>
+
+    <rect x="1230" y="676" width="15" height="15" fill="#BDBDBD" stroke="#424242" stroke-width="1"/>
+    <text x="1250" y="687" font-family="Segoe UI, Arial, sans-serif" font-size="12">HLS fallback</text>
+  </g>
+
+</svg>
diff --git a/src/finn/compressor/run_add_multi_comp_tests.sh b/src/finn/compressor/run_add_multi_comp_tests.sh
new file mode 100755
index 0000000000..0828543e18
--- /dev/null
+++ b/src/finn/compressor/run_add_multi_comp_tests.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test runner for add_multi compressor verification
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Usage: ./run_add_multi_comp_tests.sh [target]
+#   target: versal, 7series, ultrascale (default: versal)
+
+((${KEEP_LOG:=0}))      # defaults to 0 when unset or empty; values > 0 make run_sim keep a Vivado log
+((${MAX_WORKERS:=12}))  # defaults to 12 when unset or empty; worker count announced in the simulation phase
+
+# Parse target argument (first positional parameter; "versal" when omitted)
+TARGET="${1:-versal}"
+
+if ! command -v vivado >/dev/null 2>&1; then  # stop before generating anything if Vivado is not on PATH
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS"
+echo "Target: $TARGET"
+
+# Paths (all absolute for portability), derived from the location of this script
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+HDL_DIR="$SCRIPT_DIR/hdl"  # holds the testbench and Tcl templates expanded below
+GEN_DIR="$SCRIPT_DIR/gen"  # receives one sub-directory per test label
+FINN_SRC="$(cd "$SCRIPT_DIR/../../.." && pwd)"  # three directory levels above this script
+export PYTHONPATH="$FINN_SRC/src${PYTHONPATH:+:$PYTHONPATH}"  # prepended so python3 -m finn.compressor.src.* resolves
+
+# Vivado working directory (isolated temp, unique per invocation via the shell PID)
+WORK_DIR="/tmp/finn_compressor_tests_$$"
+
+source "$SCRIPT_DIR/lib/test_common.sh"  # presumably provides collect_workers, start_worker, check_vivado_errors, print_summary
+
+# Test configs: N ARG_WIDTH PIPELINE_EVERY
+# Format: "N W P" where P is pipeline_every (0 = no pipelining)
+TESTS=(
+	"8  4  0"
+	"8  4  2"
+	"16 3  0"
+	"16 6  2"
+	"32 6  2"
+	"32 16 2"
+	"47 5  2"
+	"56 8  2"
+)
+
+# Translate the global TARGET into a concrete FPGA part number.
+# The part is written to stdout so that callers can capture it with $(...).
+# Any TARGET other than "7series" or "ultrascale" selects the Versal part.
+function get_fpga_part {
+	case "$TARGET" in
+		7series)    echo "xc7z020clg400-1" ;;            # Pynq-Z1
+		ultrascale) echo "xczu9eg-ffvb1156-2-e" ;;       # ZCU102
+		*)          echo "xcvc1902-vsva2197-2MP-e-S" ;;  # Versal VCK190
+	esac
+}
+
+# Derive the config label "n<N>_w<W>[_p<P>]"; the pipeline suffix is left out when P is 0.
+function make_label {
+	local count=$1 width=$2 every=$3
+	local suffix=""
+	if [ "$every" -ne 0 ]; then suffix="_p${every}"; fi
+	echo "n${count}_w${width}${suffix}"
+}
+
+function run_sim {  # worker body: simulate one generated config, then exit with check_vivado_errors' status
+	local label="$1"
+	local stem="$GEN_DIR/$label/add_multi_comp_${label}"  # common prefix of the .tcl input and .runner.out capture
+	local sandbox="$WORK_DIR/$label"  # private cwd for this Vivado run
+	local log_args=(-nolog)
+	if [ "$KEEP_LOG" -gt 0 ]; then log_args=(-log "$GEN_DIR/$label/sim.log"); fi
+	# Launch Vivado in batch mode from the sandbox; stdout and stderr both go to the capture file.
+	mkdir -p "$sandbox"
+	(cd "$sandbox" && vivado "${log_args[@]}" -nojournal -mode batch -source "$stem.tcl" >"$stem.runner.out" 2>&1)
+	check_vivado_errors "$stem.runner.out" "$label"
+	exit $?  # ends the current shell; NOTE(review): presumably start_worker runs this in a subshell - confirm
+}
+
+# Phase 1: Generate the compressor, testbench and Tcl script for every TESTS entry
+LABELS=()
+FPGA_PART=$(get_fpga_part)
+echo -e "Generating configs:\n"
+for test in "${TESTS[@]}"; do
+	read -r n w p <<< "$test"  # split the "N W P" triple on whitespace
+	label=$(make_label "$n" "$w" "$p")
+	LABELS+=("$label")  # remembered for the simulation phase below
+	gen_dir="$GEN_DIR/$label"
+	mkdir -p "$gen_dir"
+
+	echo "  $label ..."
+
+	# Build target flag (Versal is default, no flag needed)
+	target_flag=""
+	[[ "$TARGET" == "7series" ]] && target_flag="--target 7-Series"
+	[[ "$TARGET" == "ultrascale" ]] && target_flag="--target UltraScale"
+
+	# Build pipeline flag (left empty when P is 0)
+	pipeline_flag=""
+	[ "$p" -ne 0 ] && pipeline_flag="-p $p"
+
+	# Generate compressor; the flag variables stay unquoted so each expands to zero or two words
+	# shellcheck disable=SC2086
+	if ! gen_out=$(python3 -m finn.compressor.src.add_multi_finn \
+		--n "$n" --arg_width "$w" $pipeline_flag $target_flag \
+		-o "$gen_dir" 2>&1); then
+		echo "GENERATION FAILED: $gen_out" >&2; exit 1
+	fi
+
+	comp_name=$(echo "$gen_out" | sed -n 's/^ *Module name:[[:space:]]*//p' | head -n 1)  # text after the first "Module name:" line
+	comp_depth=$(echo "$gen_out" | sed -n 's/^ *Pipeline depth:[[:space:]]*//p' | head -n 1 | grep -Eo '[0-9]+' || true)  # digits on the first "Pipeline depth:" line
+	[ -z "$comp_name" ] && { echo "ERROR: No module name for $label" >&2; exit 1; }
+	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
+
+	# Expand TB: fill the {n}, {arg_width}, {depth}, {label} and {comp_module} placeholders
+	sed -e "s/{n}/$n/g" -e "s/{arg_width}/$w/g" \
+	    -e "s/{depth}/$comp_depth/g" -e "s/{label}/$label/g" \
+	    -e "s/{comp_module}/$comp_name/g" \
+	    "$HDL_DIR/add_multi_comp_tb_template.sv" > "$gen_dir/add_multi_comp_${label}_tb.sv"
+
+	# Expand TCL: '|' is the sed delimiter here because {gen_dir} is replaced by a path containing '/'
+	sed -e "s|{label}|$label|g" -e "s|{tb}|add_multi_comp_${label}_tb|g" \
+	    -e "s|{gen_dir}|$gen_dir|g" -e "s|{part}|$FPGA_PART|g" \
+	    "$HDL_DIR/add_multi_comp_template.tcl" > "$gen_dir/add_multi_comp_${label}.tcl"
+done
+echo
+
+# Phase 2: Simulate every generated label through the worker helpers, then report
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))  # NOTE(review): presumably waits until a worker slot is free - helper not visible here
+	start_worker "$label" run_sim  # NOTE(review): presumably runs run_sim "$label" in the background - confirm
+done
+collect_workers 0  # NOTE(review): presumably drains all remaining workers - confirm
+echo
+
+print_summary
+exit $?  # the script's exit status is print_summary's return value
diff --git a/src/finn/compressor/run_dotp_comp_tests.sh b/src/finn/compressor/run_dotp_comp_tests.sh
new file mode 100755
index 0000000000..8843b7b728
--- /dev/null
+++ b/src/finn/compressor/run_dotp_comp_tests.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test runner for dot product compressor verification
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# Usage: ./run_dotp_comp_tests.sh [target]
+#   target: versal, 7series, ultrascale (default: versal)
+
+((${KEEP_LOG:=0}))      # defaults to 0 when unset or empty; values > 0 make run_sim keep a Vivado log
+((${MAX_WORKERS:=12}))  # defaults to 12 when unset or empty; worker count announced in the simulation phase
+
+# Parse target argument (first positional parameter; "versal" when omitted)
+TARGET="${1:-versal}"
+
+# Paths (all absolute for portability), derived from the location of this script
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+HDL_DIR="$SCRIPT_DIR/hdl"  # holds the testbench and Tcl templates expanded below
+GEN_DIR="$SCRIPT_DIR/gen"  # receives one sub-directory per test label
+FINN_SRC="$(cd "$SCRIPT_DIR/../../.." && pwd)"  # three directory levels above this script
+# PYTHONPATH needs to point to where finn.compressor can be imported from (src/)
+export PYTHONPATH="$FINN_SRC/src${PYTHONPATH:+:$PYTHONPATH}"
+
+# Vivado working directory (isolated temp, unique per invocation via the shell PID)
+WORK_DIR="/tmp/finn_compressor_tests_$$"
+
+if ! command -v vivado >/dev/null 2>&1; then  # stop before generating anything if Vivado is not on PATH
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS"
+echo "Target: $TARGET"
+
+source "$SCRIPT_DIR/lib/test_common.sh"  # presumably provides collect_workers, start_worker, check_vivado_errors, print_summary
+
+# Test configs: PE SIMD WW AW ACCU SIGNED_ACT
+# Format: "PE SIMD WW AW ACCU SIGNED" where SIGNED=1 for signed activations, 0 otherwise
+# Target is set via script argument, applied to all tests
+TESTS=(
+	"2 8 1 1 16 0"
+	"2 8 1 1 16 1"
+	"2 8 2 1 16 0"
+	"2 8 2 2 16 1"
+	"2 4 2 2 16 1"
+	"2 16 2 2 16 1"
+	"1 8 2 2 16 1"
+	"4 8 2 2 16 1"
+)
+
+# Translate the global TARGET into a concrete FPGA part number.
+# The part is written to stdout so that callers can capture it with $(...).
+# Any TARGET other than "7series" or "ultrascale" selects the Versal part.
+function get_fpga_part {
+	case "$TARGET" in
+		7series)    echo "xc7z020clg400-1" ;;            # Pynq-Z1
+		ultrascale) echo "xczu9eg-ffvb1156-2-e" ;;       # ZCU102
+		*)          echo "xcvc1902-vsva2197-2MP-e-S" ;;  # Versal VCK190
+	esac
+}
+
+# Compose the config label "pe<PE>_simd<SIMD>_ww<WW>_aw<AW>_accu<ACCU>[_sa]"; "_sa" marks signed activations.
+function make_label {
+	local pe=$1 simd=$2 ww=$3 aw=$4 accu=$5 signed=$6
+	local raw="pe${pe}_simd${simd}_ww${ww}_aw${aw}_accu${accu}"
+	if [ "$signed" -eq 1 ]; then raw+="_sa"; fi
+	echo "${raw//-/_}"  # every '-' becomes '_' (sanitized for SystemVerilog)
+}
+
+function run_sim {  # worker body: simulate one generated config, then exit with check_vivado_errors' status
+	local label="$1"
+	local stem="$GEN_DIR/$label/dotp_comp_${label}"  # common prefix of the .tcl input and .runner.out capture
+	local sandbox="$WORK_DIR/$label"  # private cwd for this Vivado run
+	local log_args=(-nolog)
+	if [ "$KEEP_LOG" -gt 0 ]; then log_args=(-log "$GEN_DIR/$label/sim.log"); fi
+	# Launch Vivado in batch mode from the sandbox; stdout and stderr both go to the capture file.
+	mkdir -p "$sandbox"
+	(cd "$sandbox" && vivado "${log_args[@]}" -nojournal -mode batch -source "$stem.tcl" >"$stem.runner.out" 2>&1)
+	check_vivado_errors "$stem.runner.out" "$label"
+	exit $?  # ends the current shell; NOTE(review): presumably start_worker runs this in a subshell - confirm
+}
+
+# Phase 1: Generate the compressor sources, testbench and Tcl script for every TESTS entry
+LABELS=()
+FPGA_PART=$(get_fpga_part)
+echo -e "Generating configs:\n"
+for test in "${TESTS[@]}"; do
+	read -r pe simd ww aw accu signed <<< "$test"  # split the six-field config on whitespace
+	label=$(make_label "$pe" "$simd" "$ww" "$aw" "$accu" "$signed")
+	LABELS+=("$label")  # remembered for the simulation phase below
+	out_dir="$GEN_DIR/$label"
+	mkdir -p "$out_dir"
+
+	echo "  $label ..."
+
+	# Build target flag (Versal is default, no flag needed)
+	target_flag=""
+	[[ "$TARGET" == "7series" ]] && target_flag="--target 7-Series"
+	[[ "$TARGET" == "ultrascale" ]] && target_flag="--target UltraScale"
+
+	# Build signed activations flag (left empty for unsigned activations)
+	signed_flag=""
+	[ "$signed" -eq 1 ] && signed_flag="--signed_activations"
+
+	# Generate compressor; the template path goes through HDL_DIR so the script works from any cwd
+	# shellcheck disable=SC2086
+	gen_out=$(python3 -m finn.compressor.src.dotp_finn \
+		--simd "$simd" --ww "$ww" --aw "$aw" --accu_width "$accu" \
+		$signed_flag $target_flag \
+		--dotp-template "$HDL_DIR/dotp_comp_template.sv" \
+		--dotp-output-name dotp_comp.sv \
+		-o "$out_dir" 2>&1)
+	if [ $? -ne 0 ]; then
+		echo "GENERATION FAILED: $gen_out" >&2; exit 1
+	fi
+
+	comp_depth=$(echo "$gen_out" | sed -n 's/^ *Pipeline depth:[[:space:]]*//p' | head -n 1 | grep -Eo '[0-9]+' || true)  # digits on the first "Pipeline depth:" line
+	[ -z "$comp_depth" ] && { echo "ERROR: No depth for $label" >&2; exit 1; }
+
+	# Extract dotp module name from generated file (first "module" line only, so the name stays a single word)
+	dotp_module=$(grep "^module" "$out_dir/dotp_comp.sv" | sed 's/module \([^ #]*\).*/\1/' | head -n 1)
+	[ -z "$dotp_module" ] && { echo "ERROR: No dotp module name for $label" >&2; exit 1; }
+
+	# Expand TB: fill the configuration, depth and module-name placeholders of the template
+	sed -e "s/{pe}/$pe/g" -e "s/{simd}/$simd/g" \
+	    -e "s/{ww}/$ww/g" -e "s/{aw}/$aw/g" \
+	    -e "s/{accu_width}/$accu/g" \
+	    -e "s/{signed_act}/$signed/g" \
+	    -e "s/{full_sig}/$label/g" -e "s/{comp_depth}/$comp_depth/g" \
+	    -e "s/{dotp_module}/$dotp_module/g" \
+	    "$HDL_DIR/dotp_comp_tb_template.sv" > "$out_dir/dotp_comp_${label}_tb.sv"
+
+	# Expand TCL: '|' is the sed delimiter for {src_dir} because it is replaced by a path containing '/'
+	sed -e "s/{label}/$label/g" -e "s|{src_dir}|$SCRIPT_DIR|g" -e "s/{part}/$FPGA_PART/g" \
+	    "$HDL_DIR/dotp_comp_template.tcl" > "$out_dir/dotp_comp_${label}.tcl"
+done
+echo
+
+# Phase 2: Simulate every generated label through the worker helpers, then report
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))  # NOTE(review): presumably waits until a worker slot is free - helper not visible here
+	start_worker "$label" run_sim  # NOTE(review): presumably runs run_sim "$label" in the background - confirm
+done
+collect_workers 0  # NOTE(review): presumably drains all remaining workers - confirm
+echo
+
+print_summary
+exit $?  # the script's exit status is print_summary's return value
diff --git a/src/finn/compressor/run_tests.sh b/src/finn/compressor/run_tests.sh
new file mode 100755
index 0000000000..5d7530fdcd
--- /dev/null
+++ b/src/finn/compressor/run_tests.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Master test runner for all compressor tests
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+# If asserted, logs are kept. Defaults to 0 when unset or empty.
+((${KEEP_LOG:=0}))
+# Limit the number of parallel worker processes for simulation. Defaults to 12 when unset or empty.
+((${MAX_WORKERS:=12}))
+# Constant Absorption Option (first positional parameter, forwarded verbatim to the generator)
+ca="$1"
+# Target platform (versal, 7series, or ultrascale); second positional parameter, "versal" when omitted
+target="${2:-versal}"
+
+# PYTHONPATH so python -m finn.compressor.src.* resolves
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FINN_SRC="$(cd "$SCRIPT_DIR/../.." && pwd)"  # two directory levels above this script
+export PYTHONPATH="$FINN_SRC${PYTHONPATH:+:$PYTHONPATH}"
+: "${WORK_DIR:=${FINN_HOST_BUILD_DIR:-/tmp/finn_compressor_tests}}"  # keep a preset WORK_DIR, else FINN_HOST_BUILD_DIR, else the /tmp default
+
+if ! command -v vivado >/dev/null 2>&1; then  # stop before generating anything if Vivado is not on PATH
+	echo "ERROR: vivado not found in PATH." >&2
+	exit 1
+fi
+
+echo "Vivado: $(command -v vivado)"
+echo "Settings: KEEP_LOG=$KEEP_LOG MAX_WORKERS=$MAX_WORKERS WORK_DIR=$WORK_DIR"
+
+source "$SCRIPT_DIR/lib/test_common.sh"  # presumably provides collect_workers, start_worker, check_vivado_errors, print_summary
+
+TESTS=(  # NOTE(review): entries look like <count>x<s|u><width><s|u><width> - confirm against finn.compressor.src.dotp
+	1xu1u1 1xu1s1 1xs1u1 1xs1s1
+	7xu1s1
+	8xs1u1
+	9xu1u1
+
+	1xu2u1 1xu2s1 1xs2u1 1xs2s1
+	2xu2s1
+
+	1xu2u2 1xu2s2 1xs2u2 1xs2s2
+	2xs2u2
+
+	1xs3u3
+	3xs5u4
+	3xu5u4
+	7xs7s6
+)
+mapfile -t TESTS < <(printf '%s\n' "${TESTS[@]}" | sort -r)  # reverse-sort in place; IFS stays untouched and no globbing occurs
+
+function run_test {  # worker body: regenerate one signature, simulate it, return check_vivado_errors' status
+	local sig="$1"
+	local gen_log comp_log sim_out work="$WORK_DIR/$sig"
+
+	if [ "$KEEP_LOG" -gt 0 ]; then
+		gen_log="$SCRIPT_DIR/comp_$sig.log"
+		comp_log=(-log "$SCRIPT_DIR/dotp_$sig.log")
+	else
+		gen_log="/dev/null"
+		comp_log=(-nolog)
+	fi
+
+	# Phase 1: Generate compressor (generator output is kept only when KEEP_LOG > 0)
+	if ! python3 -m finn.compressor.src.dotp "$sig" "$ca" "$target" >"$gen_log" 2>&1; then
+		echo "ERROR: Generation failed for $sig" >&2
+		return 1
+	fi
+
+	# Phase 2: Run simulation from a per-signature directory, so that parallel
+	# Vivado runs do not drop their scratch files into one shared cwd.
+	sim_out="$SCRIPT_DIR/gen/dotp_$sig.runner.out"
+	mkdir -p "$work"
+	(cd "$work" && vivado "${comp_log[@]}" -nojournal -mode batch -source "$SCRIPT_DIR/gen/dotp_$sig.tcl" >"$sim_out" 2>&1)
+	check_vivado_errors "$sim_out" "$sig"
+	return $?
+}
+
+# Phase 1: Sequential generation (the script aborts on the first generator failure)
+LABELS=()
+mkdir -p "$SCRIPT_DIR/gen"
+echo -e "Generating configs:\n"
+for test in "${TESTS[@]}"; do
+	echo "  $test ..."
+	LABELS+=("$test")  # the signature doubles as the label used for simulation
+	if ! python3 -m finn.compressor.src.dotp "$test" "$ca" "$target" >/dev/null 2>&1; then  # generator output is discarded; NOTE(review): run_test invokes the same generator again
+		echo "ERROR: Generation failed for $test" >&2
+		exit 1
+	fi
+done
+echo
+
+# Phase 2: Parallel simulation of every label through the worker helpers, then report
+echo -e "Running simulations with $MAX_WORKERS parallel workers:\n"
+for label in "${LABELS[@]}"; do
+	collect_workers $((MAX_WORKERS - 1))  # NOTE(review): presumably waits until a worker slot is free - helper not visible here
+	start_worker "$label" run_test  # NOTE(review): presumably runs run_test "$label" in the background - confirm
+done
+collect_workers 0  # NOTE(review): presumably drains all remaining workers - confirm
+echo
+
+print_summary
+exit $?  # the script's exit status is print_summary's return value
diff --git a/src/finn/compressor/src/__init__.py b/src/finn/compressor/src/__init__.py
new file mode 100644
index 0000000000..65cad800cf
--- /dev/null
+++ b/src/finn/compressor/src/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor source package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/add_multi_finn.py b/src/finn/compressor/src/add_multi_finn.py
new file mode 100644
index 0000000000..27c5aec986
--- /dev/null
+++ b/src/finn/compressor/src/add_multi_finn.py
@@ -0,0 +1,427 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN wrapper for add_multi compressor generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""
+Generate a compressor core for FINN's add_multi module (COMP path).
+
+The add_multi module in mvu.sv reduces N unsigned partial sums of ARG_WIDTH
+bits each (N DSP lane outputs) into a single result.  This script generates
+a LUT-mapped compressor tree for a specific (N, ARG_WIDTH) configuration,
+producing a module that the CATCH_COMP macro in add_multi.sv can match.
+
+Unlike dotp_finn.py, no absorption is needed:
+  - No gates:       inputs are complete values, not partial-product factor pairs
+  - No constants:   no Baugh-Wooley sign-correction (inputs are unsigned)
+  - No accumulation: accumulation stays downstream in mvu.sv
+
+Two call modes:
+
+  Direct mode — caller supplies N and ARG_WIDTH explicitly:
+    python add_multi_finn.py --n 32 --arg_width 6 -t Versal -o gen/
+
+  MVU mode — caller supplies MVU-level parameters, and the script computes
+  the required lo_width values per DSP lane via a Python replica of
+  mvu.sv::sliceLanes(), then generates one compressor per unique (N, lo_width):
+    python add_multi_finn.py --mvu --n 8 --version 2 --ww 2 --aw 2 \
+        --accu_width 16 --narrow_weights 0 -t Versal -o gen/
+
+
+Outputs:
+  comp_<N>u<W>_d<delay>.sv  — the generated compressor core(s)
+"""
+
+import argparse
+import math
+import os
+
+from .main import generate_compressor
+from .target import resolve_target, resolve_target_name
+from .utils.shape import Shape
+
+# ---------------------------------------------------------------------------
+# Python replica of mvu.sv::sliceLanes()
+#
+# This must mirror the SV implementation exactly. Any change to sliceLanes()
+# in mvu.sv requires updating this function as well. The $warning guard in
+# add_multi.sv catches divergence at simulation time.
+#
+# This outsourced computation is required as lane width is relevant to the
+# compressor input Shape and thus needs to be known at generation time.
+
+
+def clog2(n):
+    """Return ceil(log2(n)) in exact integer arithmetic, like SV $clog2 (0 for n <= 1)."""
+    if n <= 1:
+        return 0
+    return (n - 1).bit_length()
+
+
+def slice_lanes(version, ww, aw, accu_width, narrow_weights):
+    """
+    Derive the DSP lane offsets — Python replica of mvu.sv::sliceLanes().
+
+    Parameters
+    ----------
+    version : int
+        DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58).
+    ww : int
+        WEIGHT_WIDTH.
+    aw : int
+        ACTIVATION_WIDTH.
+    accu_width : int
+        ACCU_WIDTH.
+    narrow_weights : bool
+        NARROW_WEIGHTS flag.
+
+    Returns
+    -------
+    (num_lanes, offsets) : tuple
+        num_lanes : int
+            number of DSP lanes.
+        offsets   : list[int]
+            lane boundary positions (length num_lanes+1).
+    """
+    a_width = 27 if version > 1 else 25
+    p_width = 58 if version == 3 else 48
+    min_lane_width = ww + aw - 1
+    # Bits of a_width left after the weight and, unless narrow_weights, one more bit
+    spare = a_width - (0 if narrow_weights else 1) - ww
+
+    # A weight exactly as wide as a_width yields a single lane
+    num_lanes = 1 if a_width == ww else 1 + spare // min_lane_width
+
+    # Distribute slack bits preferring right lanes
+    bit_slack = spare - (num_lanes - 1) * min_lane_width
+    offsets = [0] * (num_lanes + 1)
+    for i in range(1, num_lanes):
+        lanes_left = num_lanes - i
+        # ceil(bit_slack / lanes_left) in integer arithmetic
+        extra = -(-bit_slack // lanes_left)
+        offsets[i] = offsets[i - 1] + min_lane_width + extra
+        bit_slack -= extra
+
+    # Last lane bounded by min(ACCU_WIDTH, P_WIDTH)
+    offsets[num_lanes] = min(offsets[num_lanes - 1] + accu_width, p_width)
+
+    return num_lanes, offsets
+
+
+def lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights):
+    """
+    Return the bit width (lo_width) of every DSP lane.
+
+    Returns
+    -------
+    list[int]
+        Distance between consecutive lane offsets, for lane 0 .. num_lanes-1.
+    """
+    num_lanes, bounds = slice_lanes(version, ww, aw, accu_width, narrow_weights)
+    return [hi - lo for lo, hi in zip(bounds[:num_lanes], bounds[1:])]
+
+
+def comp_module_name(n, arg_width, delay):
+    """
+    Build the name of a generated compressor module, e.g. 'comp_32u6_d4'.
+
+    Name fields:
+      n         — number of unsigned addends (= SIMD)
+      arg_width — bits per addend (= lo_width from mvu.sv lane slicing)
+      delay     — pipeline stages produced by the generator
+
+    The 'u' marks the addends as unsigned (mvu_bench naming convention).
+    The trailing delay lets the CATCH_COMP macro in add_multi.sv select an
+    entry by minimum pipeline depth (DEPTH >= d).
+    """
+    return "comp_{}u{}_d{}".format(n, arg_width, delay)
+
+
+def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir, name=None):
+    """
+    Emit a compressor that sums n unsigned addends, without accumulation.
+
+    Parameters
+    ----------
+    target : Target
+        FPGA target (Versal, SevenSeries) — selects LUT primitives.
+    n : int
+        Number of unsigned addends.
+    arg_width : int
+        Bit width of each addend.
+    pipeline_every : int or None
+        Insert pipeline registers every N combinational stages.
+        None means purely combinational.
+    output_dir : str
+        Directory for the generated .sv file.
+    name : str or None
+        Module name override.  When None (default), the name is derived
+        from (n, arg_width, delay) after generation.
+
+    Returns
+    -------
+    (name, path, delay) : tuple
+        Module name, file path, and pipeline depth of the generated compressor.
+    """
+    # The delay is only known once generation has run (it depends on the
+    # compressor structure and pipeline_every), so a name carrying the delay
+    # suffix can only be assigned afterwards: generate provisionally first.
+    provisional = f"comp_{n}u{arg_width}" if name is None else name
+    provisional_path = os.path.join(output_dir, provisional + ".sv")
+
+    delay = generate_compressor(
+        target=target,
+        # arg_width columns, each of height n: every addend contributes
+        # one bit to each bit position
+        shape=Shape([n] * arg_width),
+        name=provisional,
+        comb_depth=pipeline_every,
+        accumulate=False,  # Pure adder, no fused accumulation
+        accumulator_width=None,  # Not applicable without accumulation
+        gates=[],  # No gate absorption, inputs are complete values
+        constants=[],  # No Baugh-Wooley correction, unsigned inputs
+        path=provisional_path,
+        test=False,
+        enable=False,  # No accumulator registers to initialize
+    )
+
+    # An explicit name is final exactly as generated: there is nothing to
+    # rename and the file stays where it was written
+    if name is not None:
+        return name, provisional_path, delay
+
+    # Otherwise the final name carries the discovered delay as a suffix
+    final_name = comp_module_name(n, arg_width, delay)
+    final_path = os.path.join(output_dir, final_name + ".sv")
+    if final_name == provisional:
+        return final_name, final_path, delay
+
+    # Re-emit the file under its final name, replacing every occurrence of
+    # the provisional name in its text, then drop the provisional file
+    with open(provisional_path, "r") as src:
+        text = src.read()
+    text = text.replace(provisional, final_name)
+    with open(final_path, "w") as dst:
+        dst.write(text)
+    os.remove(provisional_path)
+
+    return final_name, final_path, delay
+
+
+def generate_add_multi_comps(
+    fpgapart, version, simd, ww, aw, accu_width, narrow_weights, output_dir
+):
+    """
+    Generate add_multi compressor cores and write a patched add_multi.sv.
+    This is the high-level entry point called by FINN's generate_hdl().
+
+    Compressors are always generated; no eligibility check is applied here.
+
+    Parameters
+    ----------
+    fpgapart : str
+        FPGA part string.
+    version : int
+        DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58).
+    simd, ww, aw, accu_width : int
+        MVU parameters.
+    narrow_weights : int
+        NARROW_WEIGHTS flag (0 or 1).
+    output_dir : str
+        Directory for generated files (= code_gen_dir).
+
+    Returns
+    -------
+    dict with keys:
+        comp_names : list[str] — generated module names
+        comp_specs : list[tuple] — (N, ARG_WIDTH, delay) of each CATCH_COMP entry
+        files      : list[str] — patched add_multi.sv, then the compressor files
+
+    Raises
+    ------
+    RuntimeError
+        If add_multi.sv does not contain the FINN_GENERATED_COMP_ENTRIES marker.
+    """
+
+    rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+    patched_path = os.path.join(output_dir, "add_multi.sv")
+
+    # Read add_multi.sv and check for the marker first, so that a modified
+    # file fails fast, before any compressor has been generated
+    marker = "\t// FINN_GENERATED_COMP_ENTRIES\n"
+    with open(os.path.join(rtllib_dir, "add_multi.sv"), "r") as f:
+        add_multi_src = f.read()
+    if marker not in add_multi_src:
+        raise RuntimeError(
+            "Cannot find FINN_GENERATED_COMP_ENTRIES marker in add_multi.sv. "
+            "Has the file been modified?"
+        )
+
+    target = resolve_target(fpgapart)
+
+    # The lo_width values determine the compressor input Shapes, so they are
+    # computed here at generation time by a Python replica of mvu.sv's
+    # sliceLanes() function. Must be kept in SYNC.
+    widths = lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights)
+
+    # Generate one compressor per unique (SIMD, lo_width)
+    generated = {}  # (simd, width) -> (name, delay)
+    for w in widths:
+        key = (simd, w)
+        if key not in generated:
+            name, _path, delay = generate_add_multi_comp(
+                target,
+                simd,
+                w,
+                pipeline_every=1,  # Max pipelining (match dotp_comp behavior)
+                output_dir=output_dir,
+            )
+            generated[key] = (name, delay)
+
+    # One CATCH_COMP line per generated compressor, injected ahead of the marker
+    comp_specs = [(_n, _w, delay) for (_n, _w), (_name, delay) in generated.items()]
+    catch_lines = "".join("\t`CATCH_COMP(%d,%d,%d)\n" % spec for spec in comp_specs)
+    with open(patched_path, "w") as f:
+        f.write(add_multi_src.replace(marker, catch_lines + marker))
+
+    comp_names = [name for (name, _delay) in generated.values()]
+    comp_files = [os.path.join(output_dir, name + ".sv") for name in comp_names]
+
+    return {
+        "comp_names": comp_names,
+        "comp_specs": comp_specs,  # [(N, ARG_WIDTH, DELAY), ...]
+        "files": [patched_path] + comp_files,
+    }
+
+
+def main():
+    """CLI entry point: generate compressor(s) in direct mode or in MVU mode."""
+    parser = argparse.ArgumentParser(
+        prog="add_multi_finn", description="Generate a compressor core for FINN's add_multi module."
+    )
+    parser.add_argument("--n", type=int, required=True, help="Number of unsigned addends (= SIMD)")
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        choices=["Versal", "7-Series", "UltraScale"],
+        help="Target FPGA generation",
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        type=int,
+        default=None,
+        help="Pipeline registers every N combinational stages",
+    )
+    parser.add_argument(
+        "-o", "--output_dir", default="../gen", help="Output directory for generated files"
+    )
+    parser.add_argument(
+        "--name", default=None, help="Module name override (default: comp_<N>u<W>_d<delay>)"
+    )
+
+    # Direct mode: explicit arg_width
+    parser.add_argument(
+        "--arg_width", type=int, default=None, help="Bit width per addend (direct mode)"
+    )
+
+    # MVU mode: derive arg_width(s) from MVU parameters
+    mvu_group = parser.add_argument_group(
+        "MVU parameters",
+        "When --mvu is given, lo_width values are computed from these "
+        "MVU-level parameters (replicating mvu.sv::sliceLanes).",
+    )
+    mvu_group.add_argument(
+        "--mvu", action="store_true", help="Enable MVU mode: derive arg_width from MVU params"
+    )
+    mvu_group.add_argument(
+        "--version",
+        type=int,
+        default=2,
+        choices=[1, 2, 3],
+        help="DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58)",
+    )
+    mvu_group.add_argument("--ww", type=int, default=None, help="WEIGHT_WIDTH")
+    mvu_group.add_argument("--aw", type=int, default=None, help="ACTIVATION_WIDTH")
+    mvu_group.add_argument("--accu_width", type=int, default=None, help="ACCU_WIDTH")
+    mvu_group.add_argument(
+        "--narrow_weights", type=int, default=0, choices=[0, 1], help="NARROW_WEIGHTS flag (0 or 1)"
+    )
+
+    args = parser.parse_args()
+
+    # Validate argument combinations: exactly one of --arg_width / --mvu must be used
+    if not args.mvu and args.arg_width is None:
+        parser.error(
+            "Either --arg_width (direct mode) or --mvu with MVU " "parameters is required."
+        )
+    if args.mvu and args.arg_width is not None:
+        parser.error("--arg_width and --mvu are mutually exclusive.")
+    if args.mvu:
+        for param in ("ww", "aw", "accu_width"):
+            if getattr(args, param) is None:
+                parser.error(f"--mvu requires --{param}")
+
+    target = resolve_target_name(args.target)
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if args.mvu:
+        # MVU mode: compute lo_width per lane, generate unique compressors
+        simd = args.n
+
+        # For SIMD < 4 the binary adder tree is already optimal: a compressor only adds overhead.
+        if simd < 4:
+            print(f"SIMD={simd} < 4: binary tree is optimal, no compressors generated.")
+            return
+
+        widths = lo_widths_from_mvu_params(
+            args.version, args.ww, args.aw, args.accu_width, bool(args.narrow_weights)
+        )
+        depth = 3 + clog2(simd) + (1 if simd == 1 else 0) + 1
+        add_multi_depth = depth - 4  # both depth values are only printed below
+
+        print(
+            f"MVU config: VERSION={args.version} WW={args.ww} AW={args.aw} "
+            f"ACCU_WIDTH={args.accu_width} NARROW_WEIGHTS={args.narrow_weights}"
+        )
+        print(
+            f"  NUM_LANES={len(widths)}  PIPELINE_DEPTH={depth}  "
+            f"ADD_MULTI_DEPTH={add_multi_depth}"
+        )
+        print(f"  LO_WIDTHs: {widths}")
+
+        # One compressor per unique (N, lo_width); NOTE(review): --name is reused for each width
+        seen = set()
+        for lane, w in enumerate(widths):
+            if (simd, w) in seen:
+                print(f"  Lane {lane}: lo_width={w} — reuses existing module")
+                continue
+            seen.add((simd, w))
+
+            comp_name, comp_path, comp_delay = generate_add_multi_comp(
+                target, simd, w, args.pipeline_every, args.output_dir, name=args.name
+            )
+            print(f"  Lane {lane}: lo_width={w}")
+            print(f"    Generated: {comp_path}")
+            print(f"    Module:    {comp_name}")
+            print(f"    Delay:     {comp_delay}")
+
+    else:
+        # Direct mode: single compressor for explicit arg_width
+        comp_name, comp_path, comp_delay = generate_add_multi_comp(
+            target, args.n, args.arg_width, args.pipeline_every, args.output_dir, name=args.name
+        )
+
+        print(f"Generated compressor core: {comp_path}")
+        print(f"  Module name:     {comp_name}")
+        print(f"  Configuration:   {args.n} unsigned addends x {args.arg_width} bits")
+        print(f"  Pipeline depth:  {comp_delay}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/finn/compressor/src/benchmark.py b/src/finn/compressor/src/benchmark.py
new file mode 100644
index 0000000000..68b2e87cb3
--- /dev/null
+++ b/src/finn/compressor/src/benchmark.py
@@ -0,0 +1,77 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Benchmarking harness for compressor generation
+#############################################################################
+
+from functools import reduce
+
+from .passes.compressor_constructor import CompressorConstructor
+from .passes.cost_estimator import CostEstimator
+from .target import Versal
+from .utils.shape import Shape
+
+
+def gmean(numbers):  # geometric mean: the n-th root of the product of the n values
+    return reduce(lambda acc, val: acc * val, numbers) ** (1 / len(numbers))
+
+
+def benchmark():
+    """Construct a compressor per example shape (Versal) and print LUT cost and efficiency."""
+    examples = {
+        "128": Shape([128]),
+        "256": Shape([256]),
+        "512": Shape([512]),
+        "128,128": Shape([128, 128]),
+        "256,256": Shape([256, 256]),
+        "512,512": Shape([512, 512]),
+        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
+        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
+        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
+        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
+        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
+        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
+        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
+        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
+        "LPFP5": Shape(
+            [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32]
+        ),
+        "6-Input": Shape(32 * [6]),
+        "10-Input": Shape(32 * [10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16)))),
+    }
+
+    luts = []
+    for example_name, example_shape in examples.items():
+        target = Versal()
+        constructor = CompressorConstructor()
+        comp = constructor(
+            target.counter_candidates,
+            target.absorbing_counter_candidates,
+            target.final_adder,
+            example_shape,
+            "comp",
+            1,
+            True,
+            None,
+            tuple(),
+            [],
+        )
+        cost = CostEstimator()
+        comp.accept(cost)
+        eff = (sum(comp.input_shape) - sum(comp.output_shape)) / cost.luts  # bits removed per LUT
+        luts.append(cost.luts)
+        print(
+            f"Example {example_name:<10} uses {cost.luts:<6} LUTs "
+            f"for {cost.combinatorial_stages} stages (Efficiency: {eff: 1.2f})"
+        )
+
+    luts_gmean = gmean(luts)
+    print(f"Geomean {luts_gmean:.6} LUTs")
+
+
+if __name__ == "__main__":
+    benchmark()
diff --git a/src/finn/compressor/src/dotp.py b/src/finn/compressor/src/dotp.py
new file mode 100644
index 0000000000..9c00dfb46f
--- /dev/null
+++ b/src/finn/compressor/src/dotp.py
@@ -0,0 +1,107 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Dot product compressor core generation for standalone testing
+#############################################################################
+
+import os
+import re
+import sys
+
+from .main import generate_compressor
+from .target import SevenSeries, Versal
+from .utils.mul_comp_map import MulCompMap
+from .utils.shape import Shape
+
+if __name__ == "__main__":
+    # Parse and extract Parameters from Command Line
+    sig = sys.argv[1]
+    m = re.fullmatch("(\\d+)x([us])(\\d+)([us])(\\d+)", sig)
+    if m is None:
+        sys.exit(f"Invalid signature '{sig}': expected <n>x<u|s><na><u|s><nb>, e.g. 8xs4u2")
+    (n, na, nb, sa, sb) = (int(m[1]), int(m[3]), int(m[5]), m[2] == "s", m[4] == "s")
+    assert nb <= na
+
+    # Target platform: ca/accu goes in argv[2], target in argv[3] (default versal)
+    target_arg = sys.argv[3] if len(sys.argv) > 3 else "versal"
+    if target_arg == "7series":
+        target = SevenSeries()
+        fpga_part = "xc7z020clg400-1"
+    elif target_arg == "ultrascale":
+        from .target import UltraScale
+
+        target = UltraScale()
+        fpga_part = "xczu9eg-ffvb1156-2-e"
+    else:  # versal (default)
+        target = Versal()
+        fpga_part = "xcvc1902-vsva2197-2MP-e-S"
+
+    def clog2(x):
+        return (x - 1).bit_length()
+
+    np = (  # bit width used below for the two's-complement absorbed constant
+        clog2(n) + (na if nb == 1 and not sb else na + nb)
+        if na > 1
+        else (clog2(n + 1) if sa == sb else 1 + clog2(n))
+    )
+
+    mul_map = MulCompMap(na, nb, sa, sb)
+    shape = [col * n for col in mul_map.shape()]
+    print("Shape: ", " ".join((":".join((f"{val:x}" for val in col)) for col in shape[::-1])))
+
+    constants = []
+    abs_term = n * mul_map.absolute_term()  # Absolute Term Contribution
+    # Move absolute term into absorbed constant if requested
+    if len(sys.argv) > 2 and sys.argv[2] == "ca":
+        print("Constant absorption.")
+        if abs_term < 0:
+            abs_term += 2**np
+        constants = [(abs_term >> i) & 1 for i in range(np)]
+        abs_term = 0
+
+    name = "comp_" + sig
+    # Write to gen/ relative to this script's parent directory (compressor/)
+    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    output_path = os.path.join(script_dir, "gen", name + ".sv")
+    comp_depth = generate_compressor(
+        target=target,
+        shape=Shape((len(col) for col in shape)),
+        name=name,
+        comb_depth=None,
+        accumulate=False,
+        accumulator_width=None,
+        gates=[[f"{val:x}" for val in col] for col in shape],
+        constants=constants,
+        path=output_path,
+        test=False,
+    )
+
+    # Process templates with absolute paths
+    gen_dir = os.path.join(script_dir, "gen")
+    hdl_dir = os.path.join(script_dir, "hdl")
+    abs_dirs = {"hdl/": hdl_dir + "/", "gen/": gen_dir + "/"}
+    for src_rel, dst_rel in (
+        ("dotp_template.sv", "dotp_" + sig + ".sv"),
+        ("dotp_tb_template.sv", "dotp_" + sig + "_tb.sv"),
+        ("dotp_template.tcl", "dotp_" + sig + ".tcl"),
+    ):
+        src = os.path.join(hdl_dir, src_rel)
+        dst = os.path.join(gen_dir, dst_rel)
+        with open(src, "rt") as fsrc, open(dst, "wt") as fdst:
+            for line in fsrc:
+                line = (
+                    line.replace("{n}", str(n))
+                    .replace("{na}", str(na))
+                    .replace("{nb}", str(nb))
+                    .replace("{sa}", "s" if sa else "u")
+                    .replace("{sb}", "s" if sb else "u")
+                    .replace("{signed_a}", str(int(sa)))
+                    .replace("{signed_b}", str(int(sb)))
+                    .replace("{abs_term}", str(abs_term))
+                    .replace("{depth}", str(comp_depth))
+                    .replace("{part}", fpga_part)
+                )
+                # Absolute paths for TCL, in one pass so an inserted directory is never rescanned
+                fdst.write(re.sub("hdl/|gen/", lambda hit: abs_dirs[hit.group(0)], line))
diff --git a/src/finn/compressor/src/dotp_finn.py b/src/finn/compressor/src/dotp_finn.py
new file mode 100644
index 0000000000..77dfac7459
--- /dev/null
+++ b/src/finn/compressor/src/dotp_finn.py
@@ -0,0 +1,308 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FINN wrapper for dot product compressor generation
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+"""
+Generate a compressor core for FINN's dotp_comp module.
+
+The static dotp_comp template (in finn-rtllib/mvu/) instantiates a generated
+compressor core whose module name encodes the configuration signature, e.g.
+`comp_8xs2s2_a16`.  This script generates that core: a LUT-mapped reduction tree
+with fused accumulation, specific to a (SIMD, WW, AW, signedness) configuration.
+
+Usage:
+  python dotp_finn.py --simd 8 --ww 2 --aw 2 --accu_width 16 \
+                      --signed_activations --target Versal -o gen/
+
+Outputs:
+  comp_<sig>.sv  — the generated compressor core (module `comp_<sig>`)
+"""
+
+import argparse
+import os
+import re
+
+from .main import generate_compressor
+from .target import resolve_target, resolve_target_name
+from .utils.mul_comp_map import MulCompMap
+from .utils.shape import Shape
+
+
+def expand_template(template_path, output_path, substitutions):
+    """Fill in the $PLACEHOLDER$ tokens of a text template and write the result.
+
+    Raises FileNotFoundError if paths invalid, ValueError if placeholders remain.
+    """
+    if not os.path.isfile(template_path):
+        raise FileNotFoundError(f"Template not found: {template_path}")
+
+    parent = os.path.dirname(output_path)
+    if parent and not os.path.isdir(parent):
+        raise FileNotFoundError(f"Output directory does not exist: {parent}")
+
+    with open(template_path, "r") as src:
+        expanded = src.read()
+    for token in substitutions:
+        expanded = expanded.replace(token, substitutions[token])
+    leftovers = re.findall(r"\$[A-Z_]+\$", expanded)
+    if len(leftovers) > 0:
+        raise ValueError(f"Unsubstituted placeholders in {output_path}: {leftovers}")
+    with open(output_path, "w") as dst:
+        dst.write(expanded)
+
+
+def compute_params(simd, weight_width, activation_width, signed_activations):
+    """Translate FINN MVU parameters into compressor parameters with NA >= NB.
+
+    Returns (n, na, nb, sa, sb, swapped); swapped is True when the activation
+    became operand A because it is wider than the weight.
+    """
+    # (width, signed) per operand; weights are always signed in finn
+    weight_op = (weight_width, True)
+    act_op = (activation_width, signed_activations)
+
+    # mul_comp_map requires NA >= NB: the wider operand goes first
+    swapped = not (weight_width >= activation_width)
+    if swapped:
+        (na, sa), (nb, sb) = act_op, weight_op
+    else:
+        (na, sa), (nb, sb) = weight_op, act_op
+
+    return simd, na, nb, sa, sb, swapped
+
+
+def make_signature(n, sa, na, sb, nb):
+    """Compose the compressor signature '<n>x<s|u><na><s|u><nb>', e.g. '8xs2u2'."""
+    return "%sx%s%s%s%s" % (n, "s" if sa else "u", na, "s" if sb else "u", nb)
+
+
+def comp_module_name(n, sa, na, sb, nb, accu_width):
+    """Name of the config-specific compressor module, e.g. 'comp_8xs2s2_a16'."""
+    return f"comp_{make_signature(n, sa, na, sb, nb)}_a{accu_width}"
+
+
+def generate_comp_module(
+    target, n, na, nb, sa, sb, accu_width, pipeline_every, output_dir, name=None
+):
+    """Emit the compressor core with fused accumulation.
+
+    Returns (name, path, delay).  Unless *name* is given, the module is
+    named by its configuration signature, e.g. ``comp_8xs2s2_a16``, so that
+    differently configured compressors get distinct module names within
+    one Vivado project.
+    """
+    module_name = comp_module_name(n, sa, na, sb, nb, accu_width) if name is None else name
+    mul_map = MulCompMap(na, nb, sa, sb)
+
+    # Every column of the map's shape is repeated n times
+    columns = [col * n for col in mul_map.shape()]
+    heights = Shape((len(col) for col in columns))
+    gate_codes = [[f"{val:x}" for val in col] for col in columns]
+
+    # A non-zero absolute term is fed into the compressor tree as constant
+    # bits, so the correction is applied on every accumulation cycle rather
+    # than only once at the output.
+    abs_term = n * mul_map.absolute_term()
+    constants = []
+    if abs_term != 0:
+        wrapped = abs_term % (1 << accu_width)  # two's complement
+        constants = [(wrapped >> bit) & 1 for bit in range(accu_width)]
+
+    comp_path = os.path.join(output_dir, module_name + ".sv")
+    delay = generate_compressor(
+        target=target,
+        shape=heights,
+        name=module_name,
+        comb_depth=pipeline_every,
+        accumulate=True,
+        accumulator_width=accu_width,
+        gates=gate_codes,
+        constants=constants,
+        path=comp_path,
+        test=False,
+        enable=True,
+    )
+    return module_name, comp_path, delay
+
+
+def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_dir):
+    """
+    Produce the dotp_comp files: the compressor core and the expanded template.
+
+    This is the high-level entry point called by FINN's generate_hdl().
+
+    Parameters
+    ----------
+    fpgapart : str
+        FPGA part string (e.g. "xcvc1902-...").
+    simd, ww, aw, accu_width : int
+        MVU parameters.
+    signed_act : bool
+        Whether activations are signed.
+    output_dir : str
+        Directory for generated files (= code_gen_dir).
+
+    Returns
+    -------
+    dict with keys:
+        comp_name        : str   — compressor module name (e.g. "comp_8xs2s2_a16")
+        dotp_module_name : str   — "dotp_<comp_name>", used for $DOTP_MODULE_NAME$
+        comp_delay       : int   — pipeline depth
+        files            : list  — paths of all generated files
+    """
+
+    target = resolve_target(fpgapart)
+    n, na, nb, sa, sb, _ = compute_params(simd, ww, aw, signed_act)
+
+    comp_name, comp_path, comp_delay = generate_comp_module(
+        target,
+        n,
+        na,
+        nb,
+        sa,
+        sb,
+        accu_width,
+        pipeline_every=1,  # Max pipelining
+        output_dir=output_dir,
+    )
+
+    # A config-specific module name avoids collisions in multi-MVAU builds
+    dotp_module_name = f"dotp_{comp_name}"
+    dotp_comp_path = os.path.join(output_dir, f"{dotp_module_name}.sv")
+    substitutions = {
+        "$DOTP_MODULE_NAME$": dotp_module_name,
+        "$COMP_MODULE_NAME$": comp_name,
+        "$EXPECTED_SIMD$": str(simd),
+        "$EXPECTED_NA$": str(na),
+        "$EXPECTED_NB$": str(nb),
+        "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
+        "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
+        "$EXPECTED_ACCU_WIDTH$": str(accu_width),
+    }
+
+    # The template sits in hdl/ under the parent of this file's directory
+    here = os.path.dirname(os.path.abspath(__file__))
+    src_root = os.path.abspath(os.path.join(here, ".."))
+    expand_template(
+        os.path.join(src_root, "hdl", "dotp_comp_template.sv"), dotp_comp_path, substitutions
+    )
+
+    return {
+        "comp_name": comp_name,
+        "dotp_module_name": dotp_module_name,
+        "comp_delay": comp_delay,
+        "files": [dotp_comp_path, comp_path],
+    }
+
+
+def main():
+    """CLI entry point: generate a compressor core and, unless skipped, expand the template."""
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.abspath(os.path.join(script_dir, ".."))  # directory above this file's
+    default_dotp_template = os.path.join(repo_root, "hdl", "dotp_comp_template.sv")
+
+    parser = argparse.ArgumentParser(
+        prog="dotp_finn", description="Generate a compressor core for FINN's dotp_comp module."
+    )
+    parser.add_argument("--simd", type=int, required=True, help="SIMD (operand pairs per cycle)")
+    parser.add_argument("--ww", type=int, required=True, help="Weight bit width")
+    parser.add_argument("--aw", type=int, required=True, help="Activation bit width")
+    parser.add_argument("--accu_width", type=int, required=True, help="Accumulator bit width")
+    parser.add_argument("--signed_activations", action="store_true", help="Activations are signed")
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        choices=["Versal", "7-Series", "UltraScale"],
+        help="Target FPGA generation",
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        type=int,
+        default=None,
+        help="Pipeline registers every N combinational stages",
+    )
+    parser.add_argument(
+        "-o", "--output_dir", default="../gen", help="Output directory for generated files"
+    )
+    parser.add_argument(
+        "-n", "--name", default=None, help="Module name override (default: comp_<sig>)"
+    )
+    parser.add_argument(
+        "--dotp-template",
+        default=default_dotp_template,
+        help="Path to dotp_comp template file to expand",
+    )
+    parser.add_argument(
+        "--dotp-output-name",
+        default="dotp_comp.sv",
+        help="Output file name for expanded dotp_comp template",
+    )
+    parser.add_argument(
+        "--skip-dotp-template", action="store_true", help="Skip expanding dotp_comp template"
+    )
+    args = parser.parse_args()
+    target = resolve_target_name(args.target)
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Map FINN parameters to compressor operands (swapped if needed so that NA >= NB)
+    n, na, nb, sa, sb, swapped = compute_params(
+        args.simd, args.ww, args.aw, args.signed_activations
+    )
+
+    # Generate the compressor core with fused accumulation
+    comp_name, comp_path, comp_delay = generate_comp_module(
+        target,
+        n,
+        na,
+        nb,
+        sa,
+        sb,
+        args.accu_width,
+        args.pipeline_every,
+        args.output_dir,
+        name=args.name,
+    )
+
+    dotp_path = None
+    if not args.skip_dotp_template:
+        template_path = os.path.abspath(args.dotp_template)
+        if not os.path.isfile(template_path):
+            raise FileNotFoundError(
+                f"dotp template not found: {template_path}. "
+                f"Use --dotp-template or --skip-dotp-template."
+            )
+        dotp_module_name = f"dotp_{comp_name}"
+        dotp_path = os.path.join(args.output_dir, args.dotp_output_name)
+        expand_template(
+            template_path,
+            dotp_path,
+            {
+                "$DOTP_MODULE_NAME$": dotp_module_name,
+                "$COMP_MODULE_NAME$": comp_name,
+                "$EXPECTED_SIMD$": str(args.simd),
+                "$EXPECTED_NA$": str(na),
+                "$EXPECTED_NB$": str(nb),
+                "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
+                "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
+                "$EXPECTED_ACCU_WIDTH$": str(args.accu_width),
+            },
+        )
+
+    sig = make_signature(n, sa, na, sb, nb)
+    print(f"Generated compressor core: {comp_path}")
+    if dotp_path is not None:
+        print(f"Expanded dotp template: {dotp_path}")
+    print(f"  Module name:     {comp_name}")
+    print(f"  Configuration:   {sig}")
+    print(f"  Pipeline depth:  {comp_delay}")
+    print(f"  Operands:        {'swapped' if swapped else 'not swapped'} (NA={na} >= NB={nb})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/finn/compressor/src/evaluation.py b/src/finn/compressor/src/evaluation.py
new file mode 100644
index 0000000000..61dc1f82ea
--- /dev/null
+++ b/src/finn/compressor/src/evaluation.py
@@ -0,0 +1,275 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Evaluation and benchmarking utilities for compressor
+#############################################################################
+
+import subprocess
+from concurrent.futures import ThreadPoolExecutor
+
+from .main import generate_compressor
+from .target import Versal
+from .tests.test_gen import compressed_width
+from .utils.shape import Shape
+
+
+def evaluation():
+    examples = {
+        "128": Shape([128]),
+        "256": Shape([256]),
+        "512": Shape([512]),
+        "128,128": Shape([128, 128]),
+        "256,256": Shape([256, 256]),
+        "512,512": Shape([512, 512]),
+        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
+        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
+        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
+        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
+        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
+        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
+        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
+        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
+        "LPFP5": Shape(
+            [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32]
+        ),
+        "6x32": Shape(32 * [6]),
+        "10x32": Shape(32 * [10]),
+        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16)))),
+    }
+
+    filenames = []
+    for example_name, example_shape in examples.items():
+        print(example_name, example_shape)
+        # combinatorial design
+        filename = "../gen/" + example_name + "_comb.sv"
+        generate_compressor(
+            target=Versal(),
+            shape=example_shape,
+            name="comp",
+            comb_depth=None,
+            accumulate=False,
+            accumulator_width=None,
+            gates=None,
+            constants=[],
+            path=filename,
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=False, gates=False, accumulation=False, filename=filename
+        )
+        filenames.append(filename)
+        # accumulating design
+        filename = "../gen/" + example_name + "_acc.sv"
+        generate_compressor(
+            target=Versal(),
+            shape=example_shape,
+            name="comp",
+            comb_depth=1,
+            accumulate=True,
+            accumulator_width=None,
+            gates=None,
+            constants=[],
+            path=filename,
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=True, gates=False, accumulation=True, filename=filename
+        )
+        filenames.append(filename)
+        # gate inlined design with accumulation
+        filename = "../gen/" + example_name + "_gate.sv"
+        generate_compressor(
+            target=Versal(),
+            shape=example_shape,
+            name="comp",
+            comb_depth=1,
+            accumulate=True,
+            accumulator_width=None,
+            gates=[["8" for el in range(col)] for col in example_shape],
+            constants=[],
+            path=filename,
+            test=True,
+        )
+        generate_wrapper(
+            shape=example_shape, pipelined=True, gates=True, accumulation=True, filename=filename
+        )
+        filenames.append(filename)
+
+    tclfiles = [emit_eval_tcl_script(el) for el in filenames]
+
+    def call_vivado(filename):
+        vivado_path = (
+            "/proj/xbuilds/released/2023.1/2023.1_0508_1/"
+            "installs/lin64/Vivado/2023.1/settings64.sh"
+        )
+        command = f"""cd ../gen/ &&
+            ls &&
+            source {vivado_path} &&
+            vivado -mode batch -source {filename.split("/")[-1]}"""
+        return subprocess.run(
+            command, shell=True, check=True, timeout=3600, text=True, executable="/bin/bash"
+        )
+
+    print("Executing evaluation threads")
+    with ThreadPoolExecutor(max_workers=15) as executor:
+        executor.map(call_vivado, tclfiles)
+    print("Done executing evaluation threads")
+
+
+def generate_wrapper(shape, pipelined, gates, accumulation, filename):
+    iw = sum(shape)
+    ow = compressed_width(shape)
+
+    inputs = ["clk", "in"]
+    if gates:
+        inputs.append("in_2")
+
+    if accumulation:
+        inputs.append("en_neg")
+        inputs.append("rst")
+
+    input_str = "\tinput " + ", ".join(inputs) + ",\n"
+    output_str = f"\toutput logic [{ow-1}:0] outReg"
+
+    wrapper_str = (
+        "module sandwich(\n"
+        + input_str
+        + output_str
+        + "\n);\n"
+        + f"""
+\t{"logic en_negReg, rstReg;" if accumulation else ""}
+\tlogic [{iw-1}:0] inReg{", in_2Reg;" if gates else ";"}
+\twire [{ow-1}:0] out;
+\t
+\talways_ff @ (posedge clk) begin
+\t\t{"rstReg <= rst;" if accumulation else ""}
+\t\t{"en_negReg <= en_neg;" if accumulation else ""}
+\t\tinReg <= {{inReg, in}};
+\t\t{"in_2Reg <= {in_2Reg, in_2};" if gates else ""}
+\t\toutReg <= out;
+\tend
+\t
+\t(* keep_hierarchy = "yes" *)
+\tcomp c(.in(inReg), .clk(clk),{" .in_2(in_2Reg)," if gates else ""
+                                }{" .en_neg(en_negReg), .rst(rstReg),"
+                                  if accumulation else ""} .out(out));
+
+endmodule"""
+    )
+    with open(filename, "a") as f:
+        f.writelines(wrapper_str)
+
+
+def emit_eval_tcl_script(compressor_path):
+    comps = "set comps { " + str(compressor_path.split("/")[-1]) + " }"
+    script = (
+        comps
+        + """
+set PART xcvc1902-vsva2197-2MP-e-S ; # From VCK190 Evaluation Board
+
+foreach comp $comps {
+    read_verilog $comp
+
+    # -----------------------------------------------------------------------------
+    # Open new file for current module
+    set filename_prefix RESULT_
+    set filename_suffix ".json"
+    set filename $filename_prefix$comp$filename_suffix
+    puts $filename
+    set outfile [open $filename w]
+    puts $outfile "\\{"
+
+    set tm 0.7 ; # Minimum possible ime
+    set tt 10.0 ; # Time to Test
+    set ts 100.0 ; # Successful Time
+    set lc 100000 ; # LUT utilization
+
+    # -----------------------------------------------------------------------------
+    # Run synthesis
+    synth_design -top sandwich -part $PART
+
+    # -----------------------------------------------------------------------------
+    # while loop, updating clock
+    while {[expr $ts - $tm] > 0.1} {
+        puts "NEW SYNTHESIS RUN WITH FREQ $tt"
+        create_clock -name CLK -period $tt [get_port clk]
+
+        # -----------------------------------------------------------------------------
+        # Place and route
+        opt_design -retarget -propconst -sweep ;
+        place_design -directive Explore
+        report_utilization -file util_$comp.twrA
+        route_design -directive Explore
+        report_drc
+        report_utilization -hierarchical
+        report_timing -setup -hold -max_paths 3 -nworst 3 -input_pins \\
+            -sort_by group -file $comp.twrA
+        report_timing_summary -delay_type min_max \\
+            -path_type full_clock_expanded -report_unconstrained \\
+            -check_timing_verbose -max_paths 3 -nworst 3 \\
+            -significant_digits 3 -input_pins -file $comp.twrA
+
+        # -----------------------------------------------------------------------------
+        # Find maximum data path delay and slack
+        set f [open $comp.twrA r]
+        set file_data [read $f]
+        close $f
+        if {[regexp { +Data Path Delay: +(\\d+\\.\\d+)} $file_data -> value]} {
+            set tr $value
+        } {
+            error "DATA PATH DELAY NOT FOUND"
+        }
+
+        # -----------------------------------------------------------------------------
+        # Find LUT and Slice utilization
+        set f [open util_$comp.twrA r]
+        set file_data [read $f]
+        close $f
+        if {[regexp {CLB LUTs +\\| +(\\d+)} $file_data -> value]} {
+            set lc $value
+        } {
+            error "LUT UTILIZATION NOT FOUND"
+        }
+
+        if {[regexp {SLICE +\\| +(\\d+)} $file_data -> value]} {
+            set sc $value
+        } {
+            error "SLICE UTILIZATION NOT FOUND"
+        }
+
+        # -----------------------------------------------------------------------------
+        # Check if timing was met
+        if { $tt < $tr } {
+            puts {Timing $tr was NOT met!}
+            set tm $tt
+            if { $tr < $ts } {
+                set ts $tr
+            }
+        } else {
+            set ts $tr
+        }
+        set tt [expr { ($ts + $tm)/2}]
+    }
+
+    puts -nonewline $outfile "\\"Delay\\": $ts,"
+    puts -nonewline $outfile "\\"Slice\\": $sc,"
+    puts -nonewline $outfile "\\"LUTS\\": $lc" ;
+
+    puts $outfile "\\}"
+    close $outfile
+    remove_files {$comp}
+}
+q
+"""
+    )
+    tclpath = compressor_path.replace(".sv", ".tcl")
+    with open(tclpath, "w") as f:
+        f.writelines(script)
+    return tclpath
+
+
+# Run the full benchmark flow only when this module is executed directly.
+if __name__ == "__main__":
+    evaluation()
diff --git a/src/finn/compressor/src/graph/__init__.py b/src/finn/compressor/src/graph/__init__.py
new file mode 100644
index 0000000000..9ec3df1276
--- /dev/null
+++ b/src/finn/compressor/src/graph/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor graph package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/graph/accumulator.py b/src/finn/compressor/src/graph/accumulator.py
new file mode 100644
index 0000000000..54488cc57b
--- /dev/null
+++ b/src/finn/compressor/src/graph/accumulator.py
@@ -0,0 +1,97 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Accumulator stage implementation for compressor
+#############################################################################
+
+from collections.abc import Iterable
+
+from .nodes import Bitmatrix, Logic, Shape, Stage, Wire
+
+
+class AccumulatorStage(Stage):
+    """Stage that feeds a final adder's registered output back into itself.
+
+    The stage registers its input bits, adds them to the registered previous
+    adder output and exposes the adder output as stage output.  ``en_neg`` and
+    ``rst`` are always created as module inputs; ``en`` only when ``enable``
+    is set.
+    """
+
+    def __init__(
+        self,
+        shape: Shape,
+        final_adder,
+        preceeding_pipeline_stages,
+        accumulator_width=None,
+        enable=False,
+    ):
+        """Store the configuration and immediately build the hardware.
+
+        Args:
+            shape: Input shape of the stage.
+            final_adder: Callable invoked with the combined input shape to
+                create the final adder.
+            preceeding_pipeline_stages: Number of register stages by which
+                ``en_neg`` is delayed; ``rst`` is delayed by one more.
+            accumulator_width: Output width; derived from ``shape`` when falsy.
+            enable: When true, an ``en`` module input gates all registers.
+        """
+        super().__init__()
+        self.input_shape = shape
+        # One single-bit column per accumulator bit.
+        self.output_shape = Shape([1 for _ in range(self.get_accumulator_width(accumulator_width))])
+        self.instances = []
+        self.input_wires = Bitmatrix(shape)
+        self.output_wires = Bitmatrix(self.output_shape)  # TODO: Make Logic
+        self.accumulator_width = self.get_accumulator_width(accumulator_width)
+        self.final_adder_gen = final_adder
+        self.preceeding_pipeline_stages = preceeding_pipeline_stages
+        self.enable = enable
+        self.build_hardware()
+
+    def build_hardware(self):
+        """Create control wires, registers and the final adder, and wire them up."""
+        # The adder sees the stage inputs plus one feedback bit per output column.
+        acc_input_shape = self.input_shape + self.output_shape
+        final_adder = self.final_adder_gen(acc_input_shape)
+
+        en_neg = Wire(desired_name="en_neg")
+        en_neg.set_to_module_input()
+        rst = Wire(desired_name="rst")
+        rst.set_to_module_input()
+        self.instances.append(en_neg)
+        self.instances.append(rst)
+
+        en_wire = None
+        if self.enable:
+            en_wire = Wire(desired_name="en")
+            en_wire.set_to_module_input()
+            self.instances.append(en_wire)
+
+        # Create shifted enable and reset signal.
+        # init=1 on rst delay chain: when enable mode is active, en-gating
+        # prevents these registers from capturing the initial rst=1 pulse if
+        # en=0 during global reset.  Initialising to 1 ensures the accumulator
+        # feedback is properly zeroed from power-up.
+        rst_del = self.delay_signal(
+            rst, self.preceeding_pipeline_stages + 1, en=en_wire, init=1 if self.enable else None
+        )
+        en_neg_del = self.delay_signal(en_neg, self.preceeding_pipeline_stages, en=en_wire)
+
+        # Connect inputs to final adder
+        # Feedback path: adder output registered once, reset by the delayed rst.
+        loop = self.delay_signal(
+            final_adder.output_wires, cycles=1, rst=rst_del, en=en_wire, init=0
+        )
+        # Input path: stage inputs registered once, reset by the delayed en_neg.
+        in_ = self.delay_signal(self.input_wires, cycles=1, rst=en_neg_del, en=en_wire, init=0)
+        # Row 0 of every adder input column takes the feedback bit ...
+        for col_loop, col_fa in zip(loop, final_adder.input_wires):
+            col_loop[0].connect_to(col_fa[0])
+
+        # ... and the remaining rows take the registered stage inputs.
+        for col_in, col_fa in zip(in_, final_adder.input_wires):
+            for el_in, el_fa in zip(col_in, col_fa[1:]):
+                el_in.connect_to(el_fa)
+
+        # Connect final adder output to stage output
+        for col_t, col_s in zip(self.output_wires, final_adder.output_wires):
+            for t, s in zip(col_t, col_s):
+                s.connect_to(t)
+        self.instances.append(final_adder)
+
+    def delay_signal(self, signal, /, cycles=1, rst=None, en=None, init=None):
+        """Chain ``cycles`` ``Logic`` registers behind ``signal``.
+
+        Iterables are processed element-wise and returned as (nested) lists.
+        Every created register is appended to ``self.instances``.  Returns the
+        last register of the chain, or ``signal`` itself when ``cycles`` is 0.
+        """
+        if isinstance(signal, Iterable):
+            return [self.delay_signal(el, cycles, rst, en, init) for el in signal]
+        for i in range(cycles):
+            lgc = Logic(rst=rst, en=en, init=init)
+            signal.connect_to(lgc)
+            self.instances.append(lgc)
+            signal = lgc
+        return signal
+
+    def get_accumulator_width(self, input=None):
+        """Return ``input`` when truthy, else the width derived from the shape.
+
+        The derived width is the bit length of the sum of every column height
+        shifted left by its column index.
+        """
+        if input:
+            return input
+        else:
+            return sum([(el << idx) for idx, el in enumerate(self.input_shape)]).bit_length()
+
+    def accept(self, visitor):
+        """Dispatch to ``visitor.visit_accumulator_stage``."""
+        visitor.visit_accumulator_stage(self)
diff --git a/src/finn/compressor/src/graph/counters/__init__.py b/src/finn/compressor/src/graph/counters/__init__.py
new file mode 100644
index 0000000000..52868b1dbd
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Counter candidates package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
new file mode 100644
index 0000000000..726172fb54
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/absorption_counter_candidates.py
@@ -0,0 +1,309 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    7-Series and Versal gate absorption counter implementations
+# @author    Co-authored by Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+from abc import ABC, abstractmethod
+from typing import List
+
+from ...utils.shape import Shape
+from ..nodes import Constant, GateAbsorptionCounter
+from ..primitives import LUT2, LUT6, LUT6_2, LUT6CY
+
+
+def fa_sum(a, b, c):
+    return a ^ b ^ c
+
+
+def fa_carry(a, b, c):
+    return a and b or a and c or b and c
+
+
+def gate_string_to_pred(string):
+    class Gate:
+        def __init__(self, init):
+            try:
+                self._init = int(init, 16)
+            except ValueError:
+                raise ValueError(f"Gate specification {string} is invalid!")
+
+        def __call__(self, a, b):
+            return bool((self._init >> (1 * a | 2 * b)) & 1)
+
+        def __repr__(self):
+            return f"{self._init:x}"
+
+    return Gate(string)
+
+
+class GateAbsorptionCounterCandidate(ABC):
+    @abstractmethod
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        pass
+
+
+class AbsorbingFACandidate(GateAbsorptionCounterCandidate):
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        if inputs[0] >= 3:
+            return AbsorbingFA(gates[0][:3])
+
+
+class AbsorbingFA(GateAbsorptionCounter):
+    def __init__(self, gates):
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([3]), Shape([1, 1]))
+
+    def build_hardware(self):
+        lut1 = LUT6.fromPred(
+            lambda I0, I1, I2, I3, I4, I5: fa_sum(
+                self.gates[0](I0, I1), self.gates[1](I2, I3), self.gates[2](I4, I5)
+            )
+        )
+
+        lut2 = LUT6.fromPred(
+            lambda I0, I1, I2, I3, I4, I5: fa_carry(
+                self.gates[0](I0, I1), self.gates[1](I2, I3), self.gates[2](I4, I5)
+            )
+        )
+
+        for lut in zip([lut1, lut2]):
+            self.input_wires[0][0].connect_to(lut.I0)
+            self.input_wires[0][2].connect_to(lut.I2)
+            self.input_wires[0][4].connect_to(lut.I4)
+            self.input_wires_complementary[0][1].connect_to(lut.I1)
+            self.input_wires_complementary[0][3].connect_to(lut.I3)
+            self.input_wires_complementary[0][5].connect_to(lut.I5)
+        self.output_wires[0][0].connect_to(lut1.O)
+        self.output_wires[1][0].connect_to(lut2.O)
+        self.instances += [lut1, lut2]
+
+
+class MuxCYPredAdderCandidate(GateAbsorptionCounterCandidate):
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        width = 0
+        for i in range(4):
+            if inputs[i] > 2:
+                width += 1
+            else:
+                break
+        selected_gates = []
+        for i in range(width):
+            gates_col = [gates[i][0], gates[i][1]]
+            selected_gates.append(gates_col)
+        if selected_gates:
+            return MuxCYPredAdder(selected_gates)
+
+
+class MuxCYPredAdder(GateAbsorptionCounter):
+    """7-Series multi-column gate-absorbing adder: one LUT6_2 per column."""
+
+    def __init__(self, gates: List[List[str]]):
+        """Parse two gate strings per column; N columns of 2 inputs give N + 1 output bits."""
+        self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
+        super().__init__(Shape(len(self.gates) * [2]), Shape((len(self.gates) + 1) * [1]))
+
+    def build_hardware(self):
+        """7-Series horizontal multi-column gate absorption using LUT6_2.
+
+        Same structure and predicate order (sum first, carry second) as
+        VersalPredAdder, but built from LUT6_2 and with an explicit constant
+        zero on the first carry-in.
+        Each column has 2 gates, each LUT computes: sum = p1 XOR p2 XOR carry_in
+        """
+
+        luts = []
+        for i in range(len(self.gates)):
+            p1 = self.gates[i][0]
+            p2 = self.gates[i][1]
+            # LUT6_2: predO5→O5, predO6→O6
+            # Match VersalPredAdder pattern: sum first, carry second
+            # p1/p2 are bound as lambda defaults so each LUT keeps its own gates.
+            lut = LUT6_2.fromPred(
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_sum(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # predO5 → O5 (sum)
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_carry(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # predO6 → O6 (carry)
+            )
+
+            # Connect inputs (same pattern as Versal)
+            self.input_wires[i][0].connect_to(lut.I0)
+            self.input_wires[i][1].connect_to(lut.I2)
+            self.input_wires_complementary[i][0].connect_to(lut.I1)
+            self.input_wires_complementary[i][1].connect_to(lut.I3)
+
+            # Sum output for this column (O5, not O6!)
+            lut.O5.connect_to(self.output_wires[i][0])
+            luts.append(lut)
+
+        # First LUT needs carry-in = 0
+        Constant("1'b0").connect_to(luts[0].I4)
+
+        # Carry chain: previous carry → next carry-in (O6, not O5!)
+        for p, n in zip(luts, luts[1:]):
+            p.O6.connect_to(n.I4)
+
+        # Final carry-out (O6, not O5!)
+        luts[-1].O6.connect_to(self.output_wires[len(luts)][0])
+
+        self.instances += luts
+
+
+class VersalPredAdderCandidate(GateAbsorptionCounterCandidate):
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        width = 0
+        for i in range(4):
+            if inputs[i] > 2:
+                width += 1
+            else:
+                break
+        selected_gates = []
+        for i in range(width):
+            gates_col = [gates[i][0], gates[i][1]]
+            selected_gates.append(gates_col)
+        if selected_gates:
+            return VersalPredAdder(selected_gates)
+
+
+class VersalPredAdder(GateAbsorptionCounter):
+    def __init__(self, gates: List[List[str]]):
+        self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
+        super().__init__(Shape(len(self.gates) * [2]), Shape((len(self.gates) + 1) * [1]))
+
+    def build_hardware(self):
+        luts = []
+        for i in range(len(self.gates)):
+            p1 = self.gates[i][0]
+            p2 = self.gates[i][1]
+            lut = LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, A5: fa_sum(p1(A0, A1), p2(A2, A3), A4),  # s
+                lambda A0, A1, A2, A3, A4, A5: fa_carry(p1(A0, A1), p2(A2, A3), A4),  # c
+            )
+            self.input_wires[i][0].connect_to(lut.I0)
+            self.input_wires[i][1].connect_to(lut.I2)
+            self.input_wires_complementary[i][0].connect_to(lut.I1)
+            self.input_wires_complementary[i][1].connect_to(lut.I3)
+
+            lut.O51.connect_to(self.output_wires[i][0])
+            luts.append(lut)
+
+        for p, n in zip(luts, luts[1:]):
+            p.O52.connect_to(n.I4)
+        luts[-1].O52.connect_to(self.output_wires[len(luts)][0])
+        self.instances += luts
+
+
+class RippleSumPredAdderCandidate(GateAbsorptionCounterCandidate):
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        max_height = min(inputs[0] // 2, 4)
+        if max_height:
+            return RippleSumPredAdder(gates[0][: max_height * 2])
+
+
+class RippleSumPredAdder(GateAbsorptionCounter):
+    def __init__(self, gates):
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates) + 1) // 2]))
+
+    def build_hardware(self):
+        luts = []
+        for i in range((len(self.gates) + 1) // 2):
+            p1 = self.gates[2 * i]
+            p2 = self.gates[2 * i + 1] if len(self.gates) > 2 * i + 1 else lambda A0, A1: False
+            lut = LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, A5: fa_carry(p1(A0, A1), p2(A2, A3), A4),  # c
+                lambda A0, A1, A2, A3, A4, A5: fa_sum(p1(A0, A1), p2(A2, A3), A4),  # s
+            )
+            luts.append(lut)
+
+        for p, n in zip(luts, luts[1:]):
+            p.O52.connect_to(n.I4)
+
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0], self.input_wires_complementary[0])):
+            if i % 2 == 0:
+                w1.connect_to(luts[i // 2].I0)
+                w2.connect_to(luts[i // 2].I1)
+            else:
+                w1.connect_to(luts[i // 2].I2)
+                w2.connect_to(luts[i // 2].I3)
+
+        luts[-1].O52.connect_to(self.output_wires[0][0])
+        for i, lut in enumerate(luts):
+            lut.O51.connect_to(self.output_wires[1][i])
+        self.instances += luts
+
+
+class MuxCYRippleSumCandidate(GateAbsorptionCounterCandidate):
+    """7-Series version of RippleSumPredAdder using CARRY4 instead of LUT6CY."""
+
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        max_height = min(inputs[0] // 2, 4)
+        if max_height:
+            return MuxCYRippleSum(gates[0][: max_height * 2])
+
+
+class MuxCYRippleSum(GateAbsorptionCounter):
+    """7-Series ripple-sum gate absorption built from a chain of LUT6_2.
+
+    NOTE(review): an earlier description mentioned CARRY4, but this class
+    only instantiates LUT6_2 primitives and one constant.
+    """
+
+    def __init__(self, gates):
+        """Parse the gate strings; N gates yield one sum bit and ceil(N / 2) carry bits."""
+        self.gates = [gate_string_to_pred(gate) for gate in gates]
+        super().__init__(Shape([len(gates)]), Shape([1, (len(gates) + 1) // 2]))
+
+    def build_hardware(self):
+        """Instantiate one LUT per gate pair, ripple the sum via O5 and wire the ports."""
+        luts = []
+        for i in range((len(self.gates) + 1) // 2):
+            p1 = self.gates[2 * i]
+            # An odd trailing gate is paired with a constant-false predicate.
+            p2 = self.gates[2 * i + 1] if len(self.gates) > 2 * i + 1 else lambda A0, A1: False
+            # Same full-adder logic as the Versal RippleSumPredAdder.
+            # Gates use I0/I1 (p1) and I2/I3 (p2), running sum enters on I4.
+            # Predicate order here: O5 = sum, O6 = carry.
+            lut = LUT6_2.fromPred(
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_sum(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # O5 = sum
+                lambda A0, A1, A2, A3, A4, A5, p1=p1, p2=p2: fa_carry(
+                    p1(A0, A1), p2(A2, A3), A4
+                ),  # O6 = carry
+            )
+            luts.append(lut)
+
+        # Connect gate inputs to LUT inputs (same as Versal)
+        for i, (w1, w2) in enumerate(zip(self.input_wires[0], self.input_wires_complementary[0])):
+            if i % 2 == 0:
+                w1.connect_to(luts[i // 2].I0)
+                w2.connect_to(luts[i // 2].I1)
+            else:
+                w1.connect_to(luts[i // 2].I2)
+                w2.connect_to(luts[i // 2].I3)
+
+        # First LUT starts from a running sum of 0
+        Constant("1'b0").connect_to(luts[0].I4)
+
+        # Ripple chain: previous running sum (O5) → next LUT's I4
+        for p, n in zip(luts, luts[1:]):
+            p.O5.connect_to(n.I4)
+
+        # Connect outputs: final running sum to column 0, per-LUT carries to column 1
+        luts[-1].O5.connect_to(self.output_wires[0][0])  # Final sum bit
+        for i, lut in enumerate(luts):
+            lut.O6.connect_to(self.output_wires[1][i])  # Carry bits
+
+        self.instances += luts
+
+
+class SinglePredCandidate(GateAbsorptionCounterCandidate):
+    def extend_to_fit(self, inputs: Shape, gates: List[List[str]]) -> GateAbsorptionCounter:
+        if inputs[0] > 0:
+            return SinglePred(gates[0][0])
+
+
+class SinglePred(GateAbsorptionCounter):
+    def __init__(self, gate):
+        self.gate = gate_string_to_pred(gate)
+        super().__init__(Shape([1]), Shape([1]))
+
+    def build_hardware(self):
+        lut = LUT2.fromPred(self.gate)
+        self.input_wires[0][0].connect_to(lut.I0)
+        self.input_wires_complementary[0][0].connect_to(lut.I1)
+        lut.O.connect_to(self.output_wires[0][0])
+        self.instances.append(lut)
diff --git a/src/finn/compressor/src/graph/counters/counter_candidates.py b/src/finn/compressor/src/graph/counters/counter_candidates.py
new file mode 100644
index 0000000000..1462a837f9
--- /dev/null
+++ b/src/finn/compressor/src/graph/counters/counter_candidates.py
@@ -0,0 +1,770 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    LUT-based counter and gate absorption atom implementations
+# @author    Co-authored by Simon Gerber <simon.gerber@amd.com>
+#############################################################################
+
+from abc import ABC, abstractmethod
+from itertools import count
+
+from ...utils.shape import Shape
+from ..nodes import Constant, Counter, GateAbsorptionCounter
+from ..primitives import CARRY4, LUT5, LUT6, LUT6_2, LUT6CY
+
+MAX_CASCADE_LENGTH = 4  # cap on the width chosen by the two *RippleSumCandidate classes
+
+
+def FA_sum(a, b, c):  # full-adder sum bit: XOR of the three operands
+    return a ^ b ^ c
+
+
+def FA_carry(a, b, c):  # full-adder carry: truthy when at least two operands are truthy
+    return a and b or a and c or b and c  # yields one of the operands, not necessarily a bool
+
+
+class CounterCandidate(ABC):  # interface: propose a Counter for the given column heights
+    @abstractmethod
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        pass  # implementations in this file return a Counter, or None when nothing fits
+
+
+class VersalAtom(CounterCandidate):  # stub candidate: never proposes a counter
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        pass  # placeholder body; always returns None
+
+
+class FixedShapeCounterCandidate(CounterCandidate):  # candidate for a counter of fixed shape
+    def __init__(self, counter, counter_inputs: Shape, counter_outputs: Shape) -> Counter:
+        self.counter = counter  # zero-arg factory; NOTE(review): '-> Counter' above should be None
+        self.counter_inputs = counter_inputs  # input shape of the counter
+        self.counter_outputs = counter_outputs  # output shape of the counter
+
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        for i in range(len(self.counter_inputs)):  # only columns the counter reads are checked
+            if not (
+                self.counter_inputs[i] <= inputs[i]  # column i must hold enough bits
+                and inputs[i]
+                + outputs[i]
+                - self.counter_inputs[i]
+                + self.counter_outputs[i]
+                - compression_goal(i)
+                >= -1  # i.e. inputs + outputs - consumed + produced >= goal - 1
+            ):
+                return None
+        return self.counter()  # fresh counter instance on every successful call
+
+
+class FA(Counter):  # full adder: input shape [3] -> output shape [1, 1], one LUT6_2
+    def __init__(self):
+        super(FA, self).__init__(
+            Shape([3]),
+            Shape([1, 1]),
+        )
+
+    def build_hardware(self):
+        lut = LUT6_2.fromPred(
+            lambda x, y, z, w, q, r: x and y or x and z or y and z,  # majority of x, y, z
+            lambda x, y, z, w, q, r: x ^ y ^ z,  # parity of x, y, z; w, q, r are unused
+            "FA",
+        )
+        for i in range(3):  # the three column-0 bits drive the first three LUT inputs
+            self.input_wires[0][i].connect_to(lut.in_ports[i])
+        for i in range(2):  # out_ports[i] becomes the single bit of output column i
+            lut.out_ports[i].connect_to(self.output_wires[i][0])
+        self.instances += (lut,)
+
+
+class FACandidate(FixedShapeCounterCandidate):  # fixed-shape candidate that creates FA
+    def __init__(self):
+        super().__init__(FA, FA().input_shape, FA().output_shape)  # shapes from temp instances
+
+
+hlutnm_counter = count()  # shared sequence; TenSix draws from it to name its HLUTNM group
+
+
+class TenSix(Counter):  # input shape [10] -> output shape [2, 4]: two LUT6_2 plus two LUT5
+    def __init__(self):
+        super(TenSix, self).__init__(Shape([10]), Shape([2, 4]))
+
+    def build_hardware(self):
+        lut1 = LUT6_2.fromPred(  # five-input stage, wired to inputs 0..4 below
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1",
+        )
+        lut2 = LUT6_2.fromPred(  # same predicates as lut1, wired to inputs 5..9 below
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_2",
+        )
+        hlutnm_attr = f'HLUTNM = "tensix_{next(hlutnm_counter)}"'  # new group name per call
+        lut3_A = LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A0, A1, A4))
+        lut3_B = LUT5.fromPred(lambda A0, A1, A2, A3, A4: FA_carry(A2, A3, A4))
+        lut3_A.annotate(hlutnm_attr)
+        lut3_B.annotate(hlutnm_attr)  # both LUT5s receive the identical HLUTNM string
+        # TODO: Take care of annotations
+        self.input_wires[0][0].connect_to(lut1.I0)
+        self.input_wires[0][1].connect_to(lut1.I1)
+        self.input_wires[0][2].connect_to(lut1.I2)
+        self.input_wires[0][3].connect_to(lut1.I3)
+        self.input_wires[0][4].connect_to(lut1.I4)
+        lut1.O5.connect_to(self.output_wires[0][0])
+        lut1.O6.connect_to(self.output_wires[1][0])
+        # lut2 reads inputs 5..9
+        self.input_wires[0][5].connect_to(lut2.I0)
+        self.input_wires[0][6].connect_to(lut2.I1)
+        self.input_wires[0][7].connect_to(lut2.I2)
+        self.input_wires[0][8].connect_to(lut2.I3)
+        self.input_wires[0][9].connect_to(lut2.I4)
+        # lut3_A: carry of inputs 0, 1, 2 (its predicate reads I0, I1, I4)
+        self.input_wires[0][0].connect_to(lut3_A.I0)
+        self.input_wires[0][1].connect_to(lut3_A.I1)
+        self.input_wires[0][2].connect_to(lut3_A.I4)
+        # lut3_B: carry of inputs 5, 6, 7 (its predicate reads I2, I3, I4)
+        self.input_wires[0][5].connect_to(lut3_B.I2)
+        self.input_wires[0][6].connect_to(lut3_B.I3)
+        self.input_wires[0][7].connect_to(lut3_B.I4)
+
+        # Duplicate connections to make Vivado obey HLUTNM
+        self.input_wires[0][5].connect_to(lut3_A.I2)  # not read by lut3_A's predicate
+        self.input_wires[0][6].connect_to(lut3_A.I3)  # not read by lut3_A's predicate
+        self.input_wires[0][0].connect_to(lut3_B.I0)  # not read by lut3_B's predicate
+        self.input_wires[0][1].connect_to(lut3_B.I1)  # not read by lut3_B's predicate
+        # lut2 outputs take the second slot of output columns 0 and 1
+        lut2.O5.connect_to(self.output_wires[0][1])
+        lut2.O6.connect_to(self.output_wires[1][1])
+        # the two LUT5 carries fill the remaining slots of output column 1
+        lut3_A.O.connect_to(self.output_wires[1][2])
+        lut3_B.O.connect_to(self.output_wires[1][3])
+
+        self.instances += (lut1, lut2, lut3_A, lut3_B)
+
+
+class TenSixCandidate(FixedShapeCounterCandidate):  # fixed-shape candidate that creates TenSix
+    def __init__(self):
+        super().__init__(TenSix, TenSix().input_shape, TenSix().output_shape)  # temp instances
+
+
+class FiveTwo(Counter):  # input shape [5, 2] -> output shape [1, 2, 1], built from two LUT6_2
+    def __init__(self):
+        super(FiveTwo, self).__init__(Shape([5, 2]), Shape([1, 2, 1]))
+
+    def build_hardware(self):
+        lut1 = LUT6_2.fromPred(  # sum/carry stage over the five column-0 bits
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+            "FiveTwo_1",
+        )
+        lut2 = LUT6_2.fromPred(  # combines carry(A0, A1, A2) with the bits on I3 and I4
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_carry(A0, A1, A2)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
+            "FiveTwo_2",
+        )
+        self.input_wires[0][0].connect_to(lut1.I0)
+        self.input_wires[0][1].connect_to(lut1.I1)
+        self.input_wires[0][2].connect_to(lut1.I2)
+        self.input_wires[0][3].connect_to(lut1.I3)
+        self.input_wires[0][4].connect_to(lut1.I4)
+        lut1.O5.connect_to(self.output_wires[0][0])
+        lut1.O6.connect_to(self.output_wires[1][0])
+        # lut2 shares column-0 bits 0..2 with lut1; I3/I4 take the two column-1 bits
+        self.input_wires[0][0].connect_to(lut2.I0)
+        self.input_wires[0][1].connect_to(lut2.I1)
+        self.input_wires[0][2].connect_to(lut2.I2)
+        self.input_wires[1][0].connect_to(lut2.I3)
+        self.input_wires[1][1].connect_to(lut2.I4)
+        lut2.O5.connect_to(self.output_wires[1][1])
+        lut2.O6.connect_to(self.output_wires[2][0])
+        self.instances += (lut1, lut2)
+
+
+class FiveTwoCandidate(FixedShapeCounterCandidate):  # fixed-shape candidate that creates FiveTwo
+    def __init__(self):
+        super(FiveTwoCandidate, self).__init__(
+            FiveTwo, FiveTwo().input_shape, FiveTwo().output_shape  # shapes from temp instances
+        )
+
+
+class DualRailRippleSum(Counter):  # two parallel LUT6CY chains ("top" and "btm") of w stages
+    def __init__(self, w):
+        self._width = w  # number of stages; each stage adds one top and one btm LUT
+        super(DualRailRippleSum, self).__init__(Shape([4 * w + 1, w + 1]), Shape([1, w + 1, w]))
+
+    @property
+    def width(self):
+        return self._width
+
+    def build_hardware(self):
+        luts_top = []
+        luts_btm = []
+
+        cascade_top = self.input_wires[0][0]  # seed of the top chain: bit 0 of column 0
+        cascade_btm = self.input_wires[1][0]  # seed of the btm chain: bit 0 of column 1
+
+        for i in range(0, self._width):
+            lut_top = LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_sum(A0, A1, A2)),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_sum(A0, A1, A2)),
+                "dual_rail_top",
+            )
+            lut_btm = LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A3, A4, FA_carry(A0, A1, A2)),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A3, A4, FA_carry(A0, A1, A2)),
+                "dual_rail_btm",
+            )
+            # top rail: four fresh column-0 bits on I0..I3, running top cascade on I4
+            self.input_wires[0][1 + 4 * i].connect_to(lut_top.I0)
+            self.input_wires[0][2 + 4 * i].connect_to(lut_top.I1)
+            self.input_wires[0][3 + 4 * i].connect_to(lut_top.I2)
+            self.input_wires[0][4 + 4 * i].connect_to(lut_top.I3)
+            cascade_top.connect_to(lut_top.I4)
+            lut_top.O51.connect_to(self.output_wires[1][i + 1])
+            cascade_top = lut_top.O52  # O52 feeds the next top stage
+            # btm rail: same first three column-0 bits, one column-1 bit on I3, btm cascade on I4
+            self.input_wires[0][1 + 4 * i].connect_to(lut_btm.I0)
+            self.input_wires[0][2 + 4 * i].connect_to(lut_btm.I1)
+            self.input_wires[0][3 + 4 * i].connect_to(lut_btm.I2)
+            self.input_wires[1][1 + i].connect_to(lut_btm.I3)
+            cascade_btm.connect_to(lut_btm.I4)
+            lut_btm.O51.connect_to(self.output_wires[2][i])
+            cascade_btm = lut_btm.O52  # O52 feeds the next btm stage
+
+            luts_top.append(lut_top)
+            luts_btm.append(lut_btm)
+            # last stage: both cascades leave through slot 0 of output columns 0 and 1
+            if i == self._width - 1:
+                lut_top.O52.connect_to(self.output_wires[0][0])
+                lut_btm.O52.connect_to(self.output_wires[1][0])
+
+        self.instances += luts_top + luts_btm
+
+
+class DualRailRippleSumCandidate(CounterCandidate):  # picks the widest DualRailRippleSum that fits
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        max_height_0 = (  # width limit imposed by column 0
+            min(
+                MAX_CASCADE_LENGTH,
+                (inputs[0] - 1) // 4,  # 4 bits per stage plus the one seed bit
+                (inputs[0] + outputs[0] - compression_goal(0) + 1) // 4,
+            )
+            if inputs[0] >= 5
+            else 0
+        )
+
+        max_height_1 = min(MAX_CASCADE_LENGTH, inputs[1] - 1) if inputs[1] >= 2 else 0  # column 1
+        max_height = min(max_height_0, max_height_1, MAX_CASCADE_LENGTH)
+        if max_height > 0:  # otherwise falls through and returns None
+            return DualRailRippleSum(max_height)
+
+
+class RippleSum(Counter):  # chain of w LUT6CYs: input shape [2w + 1] -> output shape [1, w]
+    def __init__(self, w):
+        self._width = w  # number of LUT6CY stages
+        super(RippleSum, self).__init__(Shape([2 * w + 1]), Shape([1, w]))
+
+    @property
+    def width(self):
+        return self._width
+
+    def build_hardware(self):
+        luts = []
+
+        carry = self.input_wires[0][0]  # chain seed: bit 0 of column 0
+
+        for i in range(0, self._width):
+            lut = LUT6CY.fromPred(
+                lambda A0, A1, A2, A3, A4, _: FA_carry(A4, A1, A0),
+                lambda A0, A1, A2, A3, A4, _: FA_sum(A4, A1, A0),
+                "ripple_sum",
+            )
+            # each stage takes two fresh column-0 bits plus the running chain value on I4
+            self.input_wires[0][1 + 2 * i].connect_to(lut.I0)
+            self.input_wires[0][2 + 2 * i].connect_to(lut.I1)
+            carry.connect_to(lut.I4)
+            lut.O51.connect_to(self.output_wires[1][i])
+            carry = lut.O52  # O52 feeds the next stage
+
+            luts.append(lut)
+            # the last stage's O52 becomes the single bit of output column 0
+            if i == self._width - 1:
+                lut.O52.connect_to(self.output_wires[0][0])
+
+        self.instances += luts
+
+
+class RippleSumCandidate(CounterCandidate):  # picks the widest RippleSum that fits column 0
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        max_height = (  # NOTE(review): goal is subtracted after // 2; DualRail divides last
+            min(
+                MAX_CASCADE_LENGTH,
+                (inputs[0] - 1) // 2,  # 2 bits per stage plus the one seed bit
+                (inputs[0] + outputs[0] + 1) // 2 - compression_goal(0) + 1,
+            )
+            if inputs[0] >= 3
+            else 0
+        )
+        if max_height > 0:  # otherwise falls through and returns None
+            return RippleSum(max_height)
+
+
+class SixThree(Counter):  # input shape [6] -> output shape [1, 1, 1]: binary count of six bits
+    def __init__(self):
+        super(SixThree, self).__init__(Shape([6]), Shape([1, 1, 1]))
+
+    def build_hardware(self):
+        lut1 = LUT6.fromPred(  # bit 0 of the number of set inputs
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 1), "sixthree_first"
+        )
+        lut2 = LUT6.fromPred(  # bit 1 of the number of set inputs
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 2),
+            "sixthree_second",
+        )
+        lut3 = LUT6.fromPred(  # bit 2 of the number of set inputs
+            lambda A0, A1, A2, A3, A4, A5: bool(sum([A0, A1, A2, A3, A4, A5]) & 4), "sixthree_third"
+        )
+        luts = (lut1, lut2, lut3)
+        # every LUT sees all six inputs on the same ports
+        for lut in luts:
+            for i in range(6):
+                self.input_wires[0][i].connect_to(lut.in_ports[i])
+        # LUT k drives the single bit of output column k
+        for i, lut in enumerate(luts):
+            lut.out_ports[0].connect_to(self.output_wires[i][0])
+        self.instances += luts
+
+
+class SixThreeCandidate(FixedShapeCounterCandidate):  # fixed-shape candidate creating SixThree
+    def __init__(self):
+        super().__init__(SixThree, SixThree().input_shape, SixThree().output_shape)  # temp objs
+
+
+class VersalAtom14:  # atom descriptor consumed by VersalAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([4, 1])  # input bits per column, excluding any carry-in
+        self.width = 2  # build_luts() returns two LUTs
+        self.output_width = 2  # summed by VersalAtomCascade to size its output shape
+
+    def build_luts(self):
+        lut_1 = LUT6CY.fromPred(  # operates on FA_sum(A0, A1, A2) together with A3 and A4
+            lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, A2), A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, A2), A3, A4),
+            "atom14_first",
+        )
+        lut_2 = LUT6CY.fromPred(  # operates on FA_carry(A0, A1, A2) together with A3 and A4
+            lambda A0, A1, A2, A3, A4, _: FA_sum(FA_carry(A0, A1, A2), A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(FA_carry(A0, A1, A2), A3, A4),
+            "atom14_second",
+        )
+        return (lut_1, lut_2)  # order matters: VersalAtomCascade indexes them positionally
+
+
+class VersalAtom2:  # atom descriptor consumed by VersalAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([2])  # two input bits in one column, excluding any carry-in
+        self.width = 1  # build_luts() returns one LUT
+        self.output_width = 1  # summed by VersalAtomCascade to size its output shape
+
+    def build_luts(self):
+        lut = LUT6CY.fromPred(  # full adder over A0, A1 and the value on I4
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A4),
+            "atom2_second",
+        )
+        return (lut,)
+
+
+class VersalAtom222:  # atom descriptor consumed by VersalAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([2, 2, 2])  # two input bits in each of three columns
+        self.width = 2  # build_luts() returns two LUTs
+        self.output_width = 3  # summed by VersalAtomCascade to size its output shape
+
+    def build_luts(self):
+        lut_1 = LUT6CY.fromPred(  # no name argument is passed for these two LUTs
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A2, A3, A4),
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, FA_carry(A2, A3, A4)),
+        )
+        lut_2 = LUT6CY.fromPred(  # A2 ^ A3 ^ A4 is used as the third carry operand
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, FA_carry(A2, A3, A2 ^ A3 ^ A4)),
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, FA_carry(A2, A3, A2 ^ A3 ^ A4)),
+        )
+        return (lut_1, lut_2)  # order matters: VersalAtomCascade indexes them positionally
+
+
+class VersalAtomCascade(Counter):  # chains the LUT6CYs of several Versal atoms via O52 -> I4
+    def __init__(self, atoms):
+        self._atoms = atoms  # atom descriptors in chain order; must be non-empty (see += below)
+
+        in_shape = [el for atom in atoms for el in atom.shape]  # concatenate the atoms' columns
+        in_shape[0] += 1  # one extra bit in the first column: the carry-in used in build_hardware
+        in_shape = Shape(in_shape)
+
+        out_shape = Shape([1 for _ in range(sum([atom.output_width for atom in atoms]) + 1)])
+        super().__init__(in_shape, out_shape)
+
+    def build_hardware(self):
+        luts = []
+        for atom in self._atoms:
+            # emit the correct luts
+            luts += atom.build_luts()
+
+        if not luts:
+            return
+
+        # Connect inputs
+        lut_idx = 0  # position of the current atom's first LUT in luts
+        io_idx = 0  # first input/output column of the current atom
+
+        # Carry-in
+        carry = self.input_wires[0][self._atoms[0].shape[0]]  # the extra bit added in __init__
+
+        for atom in self._atoms:
+            if isinstance(atom, VersalAtom2):
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                lut_idx += 1
+                io_idx += 1
+            elif isinstance(atom, VersalAtom222):
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I2)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I3)
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx + 1][1].connect_to(luts[lut_idx].I1)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                # second lut
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx + 1].I2)
+                self.input_wires[io_idx + 1][1].connect_to(luts[lut_idx + 1].I3)
+                self.input_wires[io_idx + 2][0].connect_to(luts[lut_idx + 1].I0)
+                self.input_wires[io_idx + 2][1].connect_to(luts[lut_idx + 1].I1)
+                carry.connect_to(luts[lut_idx + 1].I4)
+                carry = luts[lut_idx + 1].O52
+                # the first LUT's O52 is both chained onward (above) and exported as an output
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                luts[lut_idx].O52.connect_to(self.output_wires[io_idx + 1][0])
+                luts[lut_idx + 1].O51.connect_to(self.output_wires[io_idx + 2][0])
+                lut_idx += 2
+                io_idx += 3  # this atom spans three columns but only two LUTs
+            elif isinstance(atom, VersalAtom14):
+                # first lut
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1)
+                self.input_wires[io_idx][2].connect_to(luts[lut_idx].I2)
+                self.input_wires[io_idx][3].connect_to(luts[lut_idx].I3)
+                carry.connect_to(luts[lut_idx].I4)
+                carry = luts[lut_idx].O52
+
+                # second lut
+                self.input_wires[io_idx][0].connect_to(luts[lut_idx + 1].I0)
+                self.input_wires[io_idx][1].connect_to(luts[lut_idx + 1].I1)
+                self.input_wires[io_idx][2].connect_to(luts[lut_idx + 1].I2)
+                self.input_wires[io_idx + 1][0].connect_to(luts[lut_idx + 1].I3)
+                carry.connect_to(luts[lut_idx + 1].I4)
+                carry = luts[lut_idx + 1].O52
+
+                luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0])
+                luts[lut_idx + 1].O51.connect_to(self.output_wires[io_idx + 1][0])
+
+                lut_idx += 2
+                io_idx += 2
+            else:
+                raise Exception("Error in construction of Versal Atoms")
+        luts[-1].O52.connect_to(self.output_wires[-1][0])  # final O52 -> last output column
+        self.instances += luts
+
+
+class VersalAtomCascadeCandidate(CounterCandidate):  # greedily assembles a VersalAtomCascade
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        def fits_col(idx, height):  # may `height` bits be taken from column idx?
+            return (
+                height <= inputs[idx]
+                and inputs[idx] + outputs[idx] - height + 1 - compression_goal(idx) >= -1
+            )
+
+        atoms = []
+        io_idx = 0  # next input column to cover
+        atom_idx = 0  # LUTs allotted so far; the chain is limited to 4
+        while atom_idx < 4:  # each pass places exactly one atom or stops
+            if atom_idx == 0:  # first atom: heights include the carry-in bit of its first column
+                if fits_col(io_idx, 5) and fits_col(io_idx + 1, 1):
+                    atoms.append(VersalAtom14())
+                    atom_idx += 2
+                    io_idx += 2  # fix: the next test is elif so this pass ends after one atom
+                elif fits_col(io_idx, 3) and fits_col(io_idx + 1, 2) and fits_col(io_idx + 2, 2):
+                    atoms.append(VersalAtom222())
+                    atom_idx += 2
+                    io_idx += 3
+                elif fits_col(io_idx, 3):
+                    atoms.append(VersalAtom2())
+                    atom_idx += 1
+                    io_idx += 1
+                else:
+                    break
+            elif atom_idx < 3:  # room for a two-LUT atom; no carry-in bit is needed any more
+                if fits_col(io_idx, 4) and fits_col(io_idx + 1, 1):
+                    atoms.append(VersalAtom14())
+                    atom_idx += 2
+                    io_idx += 2
+                elif fits_col(io_idx, 2) and fits_col(io_idx + 1, 2) and fits_col(io_idx + 2, 2):
+                    atoms.append(VersalAtom222())
+                    atom_idx += 2
+                    io_idx += 3
+                elif fits_col(io_idx, 2):
+                    atoms.append(VersalAtom2())
+                    atom_idx += 1
+                    io_idx += 1
+                else:
+                    break
+            elif fits_col(io_idx, 2):  # only one LUT left: a single VersalAtom2 at most
+                atoms.append(VersalAtom2())
+                atom_idx += 1
+                io_idx += 1
+            else:
+                break
+        if atoms:  # partial chains are accepted; with no atom at all the result is None
+            return VersalAtomCascade(atoms)
+
+
+class ConstantOne(GateAbsorptionCounter):  # no inputs; one output bit tied to a constant
+    def __init__(self):
+        super().__init__(Shape(tuple()), Shape((1,)))  # empty input shape, single output bit
+
+    def build_hardware(self):
+        Constant(1).connect_to(self.output_wires[0][0])  # no LUT; nothing is added to instances
+
+
+class MuxCYAtom06:  # atom descriptor consumed by MuxCYAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([6, 0])  # six bits in the first column, none in the second
+        self.width = 2  # build_luts() returns two LUTs
+        self.output_width = 2  # not read by MuxCYAtomCascade, which sums `width` instead
+
+    def build_luts(self):
+        # Matches VHDL atom06.vhdl - the (0,6) atom for 6 inputs from column 0
+        #
+        # VHDL lo LUT: INIT => x"6996_9669_9669_6996"
+        #   Uses all 6 inputs x0[5:0]
+        #   O6 = O5 = XOR of all 6 bits (parity function)
+        #
+        # VHDL hi LUT: INIT => x"177E_7EE8" & x"E8E8_E8E8"
+        #   Uses x0[4:0] with I5=1
+        #   O6 = complex carry propagation
+        #   O5 = 0xE8 repeated = FA_carry(I0,I1,I2)
+        #
+        # NOTE(review): an earlier note said this atom is DISABLED in
+        # MuxCYAtomCascadeCandidate, but that class does append MuxCYAtom06()
+        # when 6 (7 for the first atom) bits fit -- confirm it is meant to be live.
+        #
+        # lo LUT: O6 = XOR of all 6 bits, O5 = XOR of A0..A4 only
+        lut_1 = LUT6_2.fromPred(
+            lambda A0, A1, A2, A3, A4, A5: A0 ^ A1 ^ A2 ^ A3 ^ A4,  # O5 (5-input XOR)
+            lambda A0, A1, A2, A3, A4, A5: A0 ^ A1 ^ A2 ^ A3 ^ A4 ^ A5,  # O6 (6-input XOR)
+            "atom06_lo",
+        )
+        # hi LUT: carry chain continuation
+        # O5 = FA_carry(A0,A1,A2) for the generate term
+        # O6 = more complex carry propagation (from VHDL 0x177E7EE8)
+        lut_2 = LUT6_2.fromPred(
+            lambda A0, A1, A2, A3, A4, A5: FA_carry(A0, A1, A2),  # O5 -> DI
+            lambda A0, A1, A2, A3, A4, A5: (
+                FA_carry(FA_sum(A0, A1, A2), A3, A4) ^ FA_carry(A0, A1, A2)
+            ),  # O6 -> S
+            "atom06_hi",
+        )
+        return (lut_1, lut_2)  # order matters: MuxCYAtomCascade indexes them positionally
+
+
+class MuxCYAtom14:  # atom descriptor consumed by MuxCYAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([4, 1])  # four bits in the first column, one in the second
+        self.width = 2  # build_luts() returns two LUTs; summed for the cascade's output shape
+
+    def build_luts(self):
+        # Preußer FPL 2017: (1,4) atom - matches VHDL atom14.vhdl
+        #
+        # CARRY4 primitive: CO = S ? CI : DI, O = S ^ CI
+        #
+        # The key insight from the VHDL reference:
+        #   - O6 (S) computes the propagate signal: XOR of inputs
+        #   - O5 (DI) simply passes through the higher-weight input bit
+        #
+        # This is NOT an AND of the sum/carry with the input!
+        # The VHDL uses INIT patterns:
+        #   lo: x"6996_6996" & x"FF00_FF00"  (O6=0x6996, O5=0xFF00)
+        #   hi: x"17E8_17E8" & x"FF00_FF00"  (O6=0x17E8, O5=0xFF00)
+        #
+        # O5 = 0xFF00 = just passes I3 (the 4th input bit)
+        #
+        # BUGFIX (2026-04-08): Previous implementation incorrectly used:
+        #   O5 = FA_sum(A0,A1,A2) & A3  (WRONG - produces 0xFF96)
+        # Correct implementation:
+        #   O5 = A3  (just pass through - produces 0xFF00)
+        #
+        # lut_1 (position 0): processes x0[3:0] for s0/d0
+        lut_1 = LUT6_2.fromPred(
+            lambda A0, A1, A2, A3, A4, _: A3,  # O5 -> DI = x0[3]
+            lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A2) ^ A3,  # O6 -> S
+            "atom14_0",
+        )
+        # lut_2 (position 1): processes x0[2:0] and x1 for s1/d1
+        # x1 is mapped to I3 (A3)
+        lut_2 = LUT6_2.fromPred(
+            lambda A0, A1, A2, A3, A4, _: A3,  # O5 -> DI = x1
+            lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A2) ^ A3,  # O6 -> S
+            "atom14_1",
+        )
+        return (lut_1, lut_2)  # order matters: MuxCYAtomCascade indexes them positionally
+
+
+class MuxCYAtom2:  # atom descriptor consumed by MuxCYAtomCascade (not itself a Counter)
+    def __init__(self):
+        self.shape = Shape([2])  # two bits in a single column
+        self.width = 1  # build_luts() returns one LUT; summed for the cascade's output shape
+
+    def build_luts(self):
+        # Matches VHDL atom22.vhdl: INIT => x"6666_6666" & x"CCCC_CCCC"
+        #
+        # CARRY4: CO = S ? CI : DI, O = S ^ CI
+        #
+        # The VHDL uses:
+        #   O6 = 0x6666 = I0 ^ I1 (XOR / half-adder sum)
+        #   O5 = 0xCCCC = I1 (just passes through the higher-weight bit)
+        #
+        # BUGFIX (2026-04-08): Previous implementation used O5=A0.
+        # While this happens to produce correct results due to CARRY4
+        # logic simplification, it doesn't match the VHDL reference.
+        # Changed to O5=A1 for consistency with atom22.vhdl.
+        lut = LUT6_2.fromPred(
+            lambda A0, A1, A2, A3, A4, _: A1,  # O5 -> DI = higher-weight bit
+            lambda A0, A1, A2, A3, A4, _: A0 ^ A1,  # O6 -> S (propagate)
+            "atom2",
+        )
+        return (lut,)
+
+
+class MuxCYAtomCascade(Counter):  # atoms' LUT6_2s driving the S/DI pins of one CARRY4
+    def __init__(self, atoms):
+        self._atoms = atoms  # atom descriptors in chain order; must be non-empty (see += below)
+
+        in_shape = [el for atom in atoms for el in atom.shape]  # concatenate the atoms' columns
+        in_shape[0] += 1  # one extra bit in the first column: wired to CARRY4.CI below
+        in_shape = Shape(in_shape)
+
+        out_shape = Shape([1 for _ in range(sum([atom.width for atom in atoms]) + 1)])
+        super().__init__(in_shape, out_shape)
+
+    def build_hardware(self):
+        luts = []
+        for atom in self._atoms:
+            luts += atom.build_luts()
+        muxcy = CARRY4()  # created before the empty check below; unused if luts is empty
+
+        if not luts:
+            return
+
+        # Connect inputs
+        idx = 0  # doubles as input-column index and LUT index (atoms here span equal counts)
+        self.input_wires[0][self._atoms[0].shape[0]].connect_to(muxcy.CI)
+
+        for atom in self._atoms:
+            if isinstance(atom, MuxCYAtom2):
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                idx += 1
+            elif isinstance(atom, MuxCYAtom14):
+                # first lut
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                self.input_wires[idx][2].connect_to(luts[idx].I2)
+                self.input_wires[idx][3].connect_to(luts[idx].I3)
+
+                # second lut
+                self.input_wires[idx][0].connect_to(luts[idx + 1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx + 1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx + 1].I2)
+                self.input_wires[idx + 1][0].connect_to(luts[idx + 1].I3)
+                idx += 2
+            elif isinstance(atom, MuxCYAtom06):
+                # First LUT (atom06_lo): uses all 6 inputs for XOR
+                self.input_wires[idx][0].connect_to(luts[idx].I0)
+                self.input_wires[idx][1].connect_to(luts[idx].I1)
+                self.input_wires[idx][2].connect_to(luts[idx].I2)
+                self.input_wires[idx][3].connect_to(luts[idx].I3)
+                self.input_wires[idx][4].connect_to(luts[idx].I4)
+                self.input_wires[idx][5].connect_to(luts[idx].I5)
+
+                # Second LUT (atom06_hi): uses inputs 0-4 for carry propagation
+                # BUGFIX: was connecting to luts[idx] instead of luts[idx+1]
+                self.input_wires[idx][0].connect_to(luts[idx + 1].I0)
+                self.input_wires[idx][1].connect_to(luts[idx + 1].I1)
+                self.input_wires[idx][2].connect_to(luts[idx + 1].I2)
+                self.input_wires[idx][3].connect_to(luts[idx + 1].I3)
+                self.input_wires[idx][4].connect_to(luts[idx + 1].I4)
+                idx += 2
+            else:
+                raise Exception("Error in construction of MuxCYAtoms")
+
+        # Connect outputs
+        for idx, (lut, di, s, o) in enumerate(  # zip stops at the shortest of the four sequences
+            zip(luts, muxcy.DI.elements, muxcy.S.elements, muxcy.O.elements)
+        ):
+            lut.O6.connect_to(s)
+            lut.O5.connect_to(di)
+            o.connect_to(self.output_wires[idx][0])
+
+        muxcy.CO.elements[-1].connect_to(self.output_wires[-1][0])  # last CO -> last output column
+        self.instances += luts
+        self.instances.append(muxcy)
+
+
+class MuxCYAtomCascadeCandidate(CounterCandidate):  # greedily fills the four slots of a CARRY4
+    def extend_to_fit(self, inputs: Shape, outputs: Shape, compression_goal) -> Counter:
+        def fits_col(idx, height):  # may `height` bits be taken from column idx?
+            return (
+                height <= inputs[idx]
+                and inputs[idx] + outputs[idx] - height + 1 - compression_goal(idx) >= -1
+            )
+
+        atoms = []
+        i = 0  # next column, which is also the number of LUTs allotted so far
+        while i < 4:  # each pass places exactly one atom or stops
+            if i == 0:  # first atom: heights include the carry-in bit of column 0
+                # MuxCYAtom06: 6:3 compressor for column 0 (needs 7 inputs: 6 + carry-in)
+                if fits_col(i, 7):
+                    atoms.append(MuxCYAtom06())
+                    i += 2
+                elif fits_col(i, 5) and fits_col(i + 1, 1):
+                    atoms.append(MuxCYAtom14())
+                    i += 2
+                elif fits_col(i, 3):
+                    atoms.append(MuxCYAtom2())
+                    i += 1
+                else:
+                    break
+            elif i < 3:  # room for a two-LUT atom
+                # MuxCYAtom06: 6:3 compressor for middle columns
+                if fits_col(i, 6):
+                    atoms.append(MuxCYAtom06())
+                    i += 2
+                elif fits_col(i, 4) and fits_col(i + 1, 1):
+                    atoms.append(MuxCYAtom14())
+                    i += 2
+                elif fits_col(i, 2):
+                    atoms.append(MuxCYAtom2())
+                    i += 1
+                else:
+                    break
+            elif fits_col(i, 2):  # only one slot left: a single MuxCYAtom2 at most
+                atoms.append(MuxCYAtom2())
+                i += 1
+            else:
+                break
+        if i == 4:  # only a chain covering all four slots is returned; otherwise None
+            return MuxCYAtomCascade(atoms)
diff --git a/src/finn/compressor/src/graph/final_adder.py b/src/finn/compressor/src/graph/final_adder.py
new file mode 100644
index 0000000000..9b07890ef1
--- /dev/null
+++ b/src/finn/compressor/src/graph/final_adder.py
@@ -0,0 +1,359 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Final adder implementations for compressor output stage
+#############################################################################
+
+from abc import abstractstaticmethod
+from typing import List
+
+from ..utils.shape import Shape
+from .nodes import BlackboxOutput, Counter, Logic, Wire
+from .primitives import CARRY4, LOOKAHEAD8, LUT6_2, LUT6CY
+
+
+def FA_sum(a, b, c):  # full-adder sum bit: XOR of the three inputs
+    return (a ^ b) ^ c
+
+
+def FA_carry(a, b, c):  # full-adder carry bit: truthy when at least two inputs are truthy
+    return (a and b) or (a and c) or (b and c)
+
+
+def ceildiv(a, b):  # ceiling division expressed through floor division of the negation
+    return -((-a) // b)
+
+
+def try_connect(func):  # best-effort call: run func() and ignore an IndexError it raises
+    try:
+        func()
+    except IndexError:  # callers index wires that may not exist; such connections are skipped
+        pass
+
+
+class FinalAdder(Counter):  # common base of the final adders defined in this file
+    @abstractstaticmethod  # NOTE(review): deprecated since Python 3.3 (abc docs)
+    def compression_goal(col):  # overridden per adder; takes a column index
+        pass
+
+    @property
+    def delay(self):  # default of 0; QuaternaryAdder returns 1 when pipelined
+        return 0
+
+
+class QuaternaryAdder(FinalAdder):  # final adder built from LUT6CY + LOOKAHEAD8 primitives
+    @staticmethod
+    def compression_goal(col):
+        return 5 if col <= 1 else 4
+
+    def __init__(self, input_shape: Shape, pipelined: bool = False):
+        self.pipelined = pipelined  # set before super().__init__, which calls build_hardware
+        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
+        super().__init__(input_shape, output_shape)
+
+    @property
+    def delay(self):
+        return 1 if self.pipelined else 0
+
+    def _add_register(self, signal):
+        """
+        Helper that inserts a register (Logic node) after signal and returns the register.
+
+        If signal is a BlackboxOutput, we need an intermediate Wire because
+        the Verilog emitter can't handle Logic with BlackboxOutput as a source.
+        """
+
+        reg = Logic()
+        if isinstance(signal, BlackboxOutput):
+            # Insert a wire between BlackboxOutput and Logic
+            wire = Wire()
+            signal.connect_to(wire)
+            wire.connect_to(reg)
+            self.instances.append(wire)
+        else:
+            signal.connect_to(reg)
+        self.instances.append(reg)
+        return reg
+
+    def build_hardware(self):
+        # Find the limit up to which the quaternary adder is needed.
+        # We construct a two-input adder after this.
+        height_4_until = len(self.input_wires)
+        tail_length = 0  # number of trailing (most significant) columns with at most 2 bits
+        for idx, col in reversed(list(enumerate(self.input_wires))):
+            if len(col) > 2:
+                break
+            else:
+                height_4_until = idx
+                tail_length += 1
+
+        # If tail_length==1, the quaternary adder must not be reduced,
+        # as there would be no savings.
+        if tail_length == 1:
+            height_4_until += 1
+            tail_length = 0
+
+        # Construct necessary hardware
+        luts_top: List[LUT6CY] = []
+        luts_btm: List[LUT6CY] = []
+
+        for i in range(0, height_4_until):  # one top/bottom LUT pair per full-height column
+            luts_top.append(
+                LUT6CY.fromPred(
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, A2), A3, A4),  # S
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, A2), A3, A4),  # ct
+                    "final_adder_top",
+                )
+            )
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_carry(A0, A1, A2), A3, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_carry(A0, A1, A2), A3, A4),  # cb
+                    "final_adder_btm",
+                )
+            )
+        if tail_length:  # transition LUTs between the 4-high part and the two-input chain
+            luts_top.append(
+                LUT6CY.fromPred(
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(A0, A1, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(A0, A1, A4),  # c_btm
+                    "final_adder_top_end",
+                )
+            )
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(FA_sum(A0, A1, False), A3, A4),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(FA_sum(A0, A1, False), A3, A4),  # c_btm
+                    "final_adder_btm_start_two_input_chain",
+                )
+            )
+        for i in range(tail_length - 1):  # remaining bottom-row LUTs of the two-input chain
+            luts_btm.append(
+                LUT6CY.fromPred(
+                    lambda A0, A1, A2, A3, A4, _: FA_sum(
+                        FA_carry(A0, A1, False), FA_sum(A2, A3, False), A4
+                    ),  # out
+                    lambda A0, A1, A2, A3, A4, _: FA_carry(
+                        FA_carry(A0, A1, False), FA_sum(A2, A3, False), A4
+                    ),  # cb
+                    "final_adder_btm_two_input_chain",
+                )
+            )
+
+        l8s_top = []
+        l8s_btm = []
+        for _ in range(ceildiv(len(luts_top), 8)):  # one LOOKAHEAD8 per group of up to 8 LUTs
+            l8s_top.append(LOOKAHEAD8())
+        for _ in range(ceildiv(len(luts_btm), 8)):
+            l8s_btm.append(LOOKAHEAD8())
+
+        # Feed each LUT's O52 and PROP outputs into its LOOKAHEAD8 (c_in_ports[0] is CIN)
+        for i in range(len(luts_top)):
+            luts_top[i].O52.connect_to(l8s_top[i // 8].c_in_ports[i % 8 + 1])
+            luts_top[i].PROP.connect_to(l8s_top[i // 8].p_in_ports[i % 8])
+
+        for i in range(len(luts_btm)):
+            luts_btm[i].O52.connect_to(l8s_btm[i // 8].c_in_ports[i % 8 + 1])
+            luts_btm[i].PROP.connect_to(l8s_btm[i // 8].p_in_ports[i % 8])
+
+        carries_top = []  # carry out of position i: O52 for even i, lookahead output for odd i
+        carries_btm = []
+        for i in range(0, len(luts_top)):
+            if i % 2 == 0:
+                carries_top.append(luts_top[i].O52)
+            if i % 2 == 1:
+                carries_top.append(l8s_top[i // 8].out_ports[i % 8 // 2])
+        for i in range(0, len(luts_btm)):
+            if i % 2 == 0:
+                carries_btm.append(luts_btm[i].O52)
+            if i % 2 == 1:
+                carries_btm.append(l8s_btm[i // 8].out_ports[i % 8 // 2])
+
+        for i in range(0, len(luts_top) - 1):  # carry of position i drives I4 of position i+1
+            carries_top[i].connect_to(luts_top[i + 1].I4)
+        for i in range(0, len(luts_btm) - 1):
+            carries_btm[i].connect_to(luts_btm[i + 1].I4)
+
+        # connect carry-ins between lookahead modules
+        def chain_l8(l8s):
+            for prev, next in zip(l8s, l8s[1:]):
+                prev.COUTH.connect_to(next.CIN)
+
+        chain_l8(l8s_top)
+        chain_l8(l8s_btm)
+
+        # connect carry-in to first lut and lookahead module
+        try_connect(lambda: self.input_wires[0][4].connect_to(luts_top[0].I4))
+        try_connect(lambda: self.input_wires[0][4].connect_to(l8s_top[0].CIN))
+
+        # Bottom row carry-in (optionally through register for pipelining).
+        # try_connect silently skips the connection and carries on
+        # if the input wire is not present. _add_register, however,
+        # will throw if we try to connect a register to a non-existent input wire,
+        # so we check for existence first whenever we want to add a register.
+        if self.pipelined and len(self.input_wires) > 1 and len(self.input_wires[1]) > 4:
+            btm_cin_reg = self._add_register(self.input_wires[1][4])
+            try_connect(lambda: btm_cin_reg.connect_to(luts_btm[0].I4))
+            try_connect(lambda: btm_cin_reg.connect_to(l8s_btm[0].CIN))
+        else:
+            try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4))
+            try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN))
+
+        # downwards connection (optionally through registers for pipelining)
+        for t, d in zip(luts_top[1:], luts_btm):  # top LUT k+1 feeds I3 of bottom LUT k
+            if self.pipelined:
+                reg = self._add_register(t.O51)
+                reg.connect_to(d.I3)
+            else:
+                t.O51.connect_to(d.I3)
+        last_top = len(carries_top) - 1  # equals len(luts_top) - 1
+        if self.pipelined:
+            reg = self._add_register(carries_top[last_top])
+            reg.connect_to(luts_btm[last_top].I3)
+        else:
+            carries_top[last_top].connect_to(luts_btm[last_top].I3)
+
+        # Connect inputs to top and bottom rows
+        # For pipelining: top row connects directly, bottom row through registers
+        for idx, (lb, lt) in enumerate(zip(luts_btm, luts_top[:height_4_until])):
+            # Top row: always direct connection
+            try_connect(lambda: self.input_wires[idx][0].connect_to(lt.I0))
+            try_connect(lambda: self.input_wires[idx][1].connect_to(lt.I1))
+            try_connect(lambda: self.input_wires[idx][2].connect_to(lt.I2))
+            try_connect(lambda: self.input_wires[idx][3].connect_to(lt.I3))
+
+            # Bottom row: through registers if pipelined
+            if self.pipelined:
+                for i, port in enumerate([lb.I0, lb.I1, lb.I2]):
+                    if len(self.input_wires[idx]) > i:
+                        reg = self._add_register(self.input_wires[idx][i])
+                        reg.connect_to(port)
+            else:
+                try_connect(lambda: self.input_wires[idx][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[idx][1].connect_to(lb.I1))
+                try_connect(lambda: self.input_wires[idx][2].connect_to(lb.I2))
+
+        if tail_length:  # wire the transition LUT pair appended above
+            lt = luts_top[height_4_until]
+            lb = luts_btm[height_4_until]
+
+            try_connect(lambda: self.input_wires[height_4_until][0].connect_to(lt.I0))
+            try_connect(lambda: self.input_wires[height_4_until][1].connect_to(lt.I1))
+
+            if self.pipelined:
+                if len(self.input_wires) > height_4_until + 1:
+                    if len(self.input_wires[height_4_until + 1]) > 0:
+                        reg0 = self._add_register(self.input_wires[height_4_until + 1][0])
+                        reg0.connect_to(lb.I0)
+                    if len(self.input_wires[height_4_until + 1]) > 1:
+                        reg1 = self._add_register(self.input_wires[height_4_until + 1][1])
+                        reg1.connect_to(lb.I1)
+            else:
+                try_connect(lambda: self.input_wires[height_4_until + 1][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[height_4_until + 1][1].connect_to(lb.I1))
+
+        for idx, lb in enumerate(luts_btm[height_4_until + 1 :]):  # two-input chain LUTs
+            if self.pipelined:
+                col1 = idx + height_4_until + 1
+                col2 = idx + height_4_until + 2
+                if len(self.input_wires) > col1 and len(self.input_wires[col1]) > 0:
+                    reg0 = self._add_register(self.input_wires[col1][0])
+                    reg0.connect_to(lb.I0)
+                if len(self.input_wires) > col1 and len(self.input_wires[col1]) > 1:
+                    reg1 = self._add_register(self.input_wires[col1][1])
+                    reg1.connect_to(lb.I1)
+                if len(self.input_wires) > col2 and len(self.input_wires[col2]) > 0:
+                    reg2 = self._add_register(self.input_wires[col2][0])
+                    reg2.connect_to(lb.I2)
+                if len(self.input_wires) > col2 and len(self.input_wires[col2]) > 1:
+                    reg3 = self._add_register(self.input_wires[col2][1])
+                    reg3.connect_to(lb.I3)
+            else:
+                try_connect(lambda: self.input_wires[idx + height_4_until + 1][0].connect_to(lb.I0))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 1][1].connect_to(lb.I1))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 2][0].connect_to(lb.I2))
+                try_connect(lambda: self.input_wires[idx + height_4_until + 2][1].connect_to(lb.I3))
+
+        def connect_carry_to_lut(carries, luts):  # NOTE(review): repeats the I4 links made above
+            for carry, lut in zip(carries, luts[1:]):
+                carry.connect_to(lut.I4)
+
+        connect_carry_to_lut(carries_top, luts_top)
+        connect_carry_to_lut(carries_btm, luts_btm)
+
+        # First output bit comes from top row - must be registered when pipelined
+        if self.pipelined:
+            reg = self._add_register(luts_top[0].O51)
+            reg.connect_to(self.output_wires[0][0])
+        else:
+            luts_top[0].O51.connect_to(self.output_wires[0][0])
+
+        for idx, lb in enumerate(luts_btm):  # bottom LUT idx drives output column idx + 1
+            lb.O51.connect_to(self.output_wires[idx + 1][0])
+
+        carries_btm[len(luts_btm) - 1].connect_to(self.output_wires[len(luts_btm) + 1][0])
+
+        if self.pipelined:  # NOTE(review): same I3 port as the carries_top[last_top] link above
+            reg = self._add_register(luts_top[-1].O52)
+            reg.connect_to(luts_btm[len(luts_top) - 1].I3)
+        else:
+            luts_top[-1].O52.connect_to(luts_btm[len(luts_top) - 1].I3)
+
+        self.instances += luts_top + luts_btm + l8s_btm + l8s_top
+
+
+class MuxCYTernaryAdder(FinalAdder):  # final adder built from LUT6_2 + CARRY4 primitives
+    @staticmethod
+    def compression_goal(col):
+        return 5 if col == 0 else 3
+
+    def __init__(self, input_shape: Shape):
+        # Output: one single-bit column per input column, plus two extra columns.
+        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
+        super().__init__(input_shape, output_shape)
+
+    def build_hardware(self):
+        luts = [  # one LUT per input column plus one extra
+            LUT6_2.fromPred(
+                lambda A0, A1, A2, A3, A4, A5: FA_carry(A0, A1, A2),
+                lambda A0, A1, A2, A3, A4, A5: FA_sum(A0, A1, A2) ^ A3,
+            )
+            for _ in range(len(self.input_shape) + 1)
+        ]
+        c4s = [CARRY4() for _ in range(0, len(self.input_shape) + 1, 4)]  # one per 4 LUTs
+        dis = [el for c4 in c4s for el in c4.DI.elements]  # flattened across all CARRY4s
+        ss = [el for c4 in c4s for el in c4.S.elements]
+        cis = [c4.CI for c4 in c4s]
+        os = [el for c4 in c4s for el in c4.O.elements]
+        cos = [el for c4 in c4s for el in c4.CO.elements]
+
+        # Connect CARRY4 together
+        for c4p, c4n in zip(c4s, c4s[1:]):
+            c4p.CO.elements[-1].connect_to(c4n.CI)
+
+        # Connect inputs
+        # Only connect up to the number of available input columns
+        for idx, lut in enumerate(luts[: len(self.input_wires)]):
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][0].connect_to(lut.I0))
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][1].connect_to(lut.I1))
+            try_connect(lambda idx=idx, lut=lut: self.input_wires[idx][2].connect_to(lut.I2))
+        try_connect(lambda: self.input_wires[0][3].connect_to(luts[0].I3))
+        try_connect(lambda: self.input_wires[0][3].connect_to(dis[0]))
+        try_connect(lambda: self.input_wires[0][4].connect_to(cis[0]))
+
+        # Second carry connection
+        for p, n, n_di in zip(luts, luts[1:], dis[1:]):  # O5 of LUT k feeds I3 and DI of k+1
+            p.O5.connect_to(n.I3)
+            p.O5.connect_to(n_di)
+
+        # Connect outputs
+        for lut, s in zip(luts, ss):
+            lut.O6.connect_to(s)
+
+        for idx, o in enumerate(os[: len(luts)]):
+            o.connect_to(self.output_wires[idx][0])
+
+        cos[len(luts) - 1].connect_to(self.output_wires[len(luts)][0])  # final carry-out
+        self.instances += luts + c4s
diff --git a/src/finn/compressor/src/graph/nodes.py b/src/finn/compressor/src/graph/nodes.py
new file mode 100644
index 0000000000..6a2499af23
--- /dev/null
+++ b/src/finn/compressor/src/graph/nodes.py
@@ -0,0 +1,479 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor graph node definitions and delay estimation
+#############################################################################
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple
+
+from ..utils.shape import Shape
+
+"""
+Convention: LSB at index 0.
+"""
+
+
+class Node(ABC):  # root of the graph node hierarchy; accept() is the visitor hook
+    def accept(self, visitor) -> None:  # default: ignore the visitor
+        pass
+
+
+class Connectable(Node):  # node with a single `source` and a list of `target`s
+    target: list[Connectable]
+    source: Connectable
+
+    def __init__(self):
+        self.target = []
+        self.source = None  # stays None until another node connects to this one
+
+    def connect_to(self, target):  # link self -> target; recorded on both ends
+        assert isinstance(target, Connectable), "Target has to be of type Connectible!"
+        self.target.append(target)
+        target.source = self  # replaces any source the target had before
+
+    @property
+    def has_target(self):  # True once at least one target has been attached
+        return bool(self.target)
+
+    @property
+    def has_source(self):
+        return self.source is not None
+
+
+class Constant(Connectable):  # connectable holding a fixed value
+    def __init__(self, value):
+        super().__init__()
+        self.value = str(value)  # always stored in string form
+
+
+class Wire(Connectable):  # connectable dispatched to visit_wire
+    def __init__(self, desired_name=None):
+        super().__init__()
+        self.prefix = ""  # "", "input " or "output " (see the setters below)
+        self.desired_name = desired_name
+
+    def set_to_module_input(self):  # note the trailing space in the prefix
+        self.prefix = "input "
+
+    def set_to_module_output(self):
+        self.prefix = "output "
+
+    def accept(self, visitor) -> None:
+        visitor.visit_wire(self)
+
+
+class Logic(Wire):  # Wire with optional rst/en/init settings; dispatched to visit_logic
+    def __init__(self, *, rst: Connectable = None, en: Connectable = None, init: int = None):
+        self.rst = rst  # keyword-only; all three default to None
+        self.en = en
+        self.init = init
+        super().__init__()
+
+    def accept(self, visitor):  # unlike Wire.accept, returns the visitor's result
+        return visitor.visit_logic(self)
+
+
+class BlackboxVecElement(Connectable):  # element type stored in BlackboxVec.elements
+    pass
+
+
+class BlackboxVec(Node, ABC):  # named vector port made of `width` connectable elements
+    def __init__(self, name, width):
+        self.name = name
+        self.elements = [BlackboxVecElement() for el in range(width)]  # one per bit position
+        super().__init__()
+
+
+class BlackboxInputVec(BlackboxVec):  # vector port dispatched to visit_blackbox_input_vec
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_input_vec(self)
+
+
+class BlackboxOutputVec(BlackboxVec):  # vector port dispatched to visit_blackbox_output_vec
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_output_vec(self)
+
+
+class BlackboxPort(Connectable):  # named scalar port; subclasses define connected/wire
+    def __init__(self, name):
+        self.name = name  # Blackbox.__init__ exposes the port under this attribute name
+        super().__init__()
+
+    @property
+    @abstractmethod
+    def connected(self):  # abstract: whether the port is attached to anything
+        pass
+
+    @property
+    @abstractmethod
+    def wire(self):  # abstract: what the port is attached to
+        pass
+
+
+class BlackboxInput(BlackboxPort):  # input port: may be driven, may never drive
+    def __init__(self, name):
+        super().__init__(name)
+
+    @property
+    def connected(self):  # an input counts as connected once it has a source
+        return self.has_source
+
+    def connect_to(self, target):  # always raises: inputs cannot be a source
+        raise RuntimeError("Blackbox Input cannot act as output.")
+
+    @property
+    def wire(self):  # the single source driving this input (None if unconnected)
+        return self.source
+
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_input(self)
+
+
+class BlackboxOutput(BlackboxPort):  # output port: drives zero or more targets
+    def __init__(self, name):
+        super().__init__(name)
+
+    @property
+    def connected(self):  # an output counts as connected once it has a target
+        return self.has_target
+
+    @property
+    def wire(self):  # returns the whole target list, not a single node
+        return self.target
+
+    def accept(self, visitor) -> None:
+        visitor.visit_blackbox_output(self)
+
+
+class Blackbox(Node):  # instance of a named module with ports and parameters
+    @abstractmethod
+    def __init__(
+        self,
+        module_name: str,
+        in_ports: Tuple[BlackboxInput],
+        out_ports: Tuple[BlackboxOutput],
+        parameters: Dict[str, str],
+    ):
+        self.module_name = module_name
+        self.in_ports = in_ports
+        self.out_ports = out_ports
+        self.parameters = parameters
+        self.annotations = []  # filled through annotate()
+
+        for port in self.in_ports + self.out_ports:  # both sequences must be the same type
+            self.__dict__[port.name] = port  # expose each port as an attribute, e.g. self.I0
+
+    def annotate(self, annotation: str):  # append one annotation string
+        self.annotations.append(annotation)
+
+    def accept(self, visitor):
+        visitor.visit_blackbox(self)
+
+
+class Module(Node):  # node that owns a list of inner instances
+    def __init__(self):
+        self.instances = []  # All inner instances
+        super().__init__()
+
+    @property
+    @abstractmethod
+    def inputs(self):  # abstract; Counter returns a flat list of its input wires
+        pass
+
+    @property
+    @abstractmethod
+    def outputs(self):  # abstract; Counter returns a flat list of its output wires
+        pass
+
+
+class Counter(Module):  # module mapping an input bit shape to an output bit shape
+    def __init__(self, input_shape: Shape, output_shape: Shape):
+        super().__init__()
+        self.input_shape = input_shape
+        self.output_shape = output_shape
+        self.input_wires = self._build_wires(input_shape)  # tuple of columns of Wire objects
+        self.output_wires = self._build_wires(output_shape)
+        self.instances += self.inputs + self.outputs
+
+        self.build_hardware()  # subclass hook; runs at the very end of construction
+
+    def accept(self, visitor) -> None:
+        visitor.visit_counter(self)
+
+    @abstractmethod
+    def build_hardware(self):  # subclasses create and connect their inner instances here
+        pass
+
+    def _build_wires(self, shape: Shape):  # one fresh Wire per bit, grouped by column
+        return tuple([tuple([Wire() for _ in range(col_height)]) for col_height in shape])
+
+    @property
+    def inputs(self):  # input wires flattened column by column
+        return [el for col in self.input_wires for el in col]
+
+    @property
+    def outputs(self):  # output wires flattened column by column
+        return [el for col in self.output_wires for el in col]
+
+    @property
+    def luts(self) -> List[LUT]:  # LUT is defined further down in this module
+        return [inst for inst in self.instances if isinstance(inst, LUT)]
+
+    @property
+    def efficiency(self) -> float:  # (input bits - output bits) / summed LUT size
+        if len(self.luts) == 0 and sum(self.input_shape) - sum(self.output_shape) == 0:
+            return 0
+        diff = sum(self.input_shape) - sum(self.output_shape)
+        denom = sum(LUT.size for LUT in self.luts)  # loop variable shadows the LUT class here
+        return diff / denom  # NOTE(review): ZeroDivisionError if no LUTs but diff != 0
+
+    @property
+    def strength(self) -> float:  # ratio of input bits to output bits
+        return sum(self.input_shape) / sum(self.output_shape)
+
+
+class GateAbsorptionCounter(Counter):  # Counter with a second, complementary input wire set
+    def __init__(self, input_shape: Shape, output_shape: Shape):
+        self.input_wires_complementary = self._build_wires(input_shape)  # needed by `inputs`
+        super().__init__(input_shape, output_shape)
+
+    def accept(self, visitor) -> None:
+        visitor.visit_gate_absorption_counter(self)
+
+    @property
+    def inputs(self):  # regular input wires followed by the complementary ones
+        return [el for col in self.input_wires + self.input_wires_complementary for el in col]
+
+
+class Passthrough(Counter):  # single-bit counter whose output is its input wire
+    def __init__(self):
+        super().__init__(Shape([1]), Shape([1]))
+
+    def build_hardware(self):
+        self.output_wires = self.input_wires  # discard the separately built output wires
+        self.instances = [el for col in self.input_wires for el in col]  # reset instance list
+
+
+class Stage(Node):  # base for stages that expose input/output bit matrices
+    input_shape: Shape
+    output_shape: Shape
+    input_wires: Bitmatrix[Wire]
+    output_wires: Bitmatrix[Wire]
+
+    def connect_to(self, other):  # wire this stage's outputs to other's inputs, bit by bit
+        for col_s, col_t in zip(self.output_wires, other.input_wires):  # zip stops at the shorter
+            for el_s, el_t in zip(col_s, col_t):
+                el_s.connect_to(el_t)
+
+        # TODO: maybe subclass instead?
+        if "output_wires_complementary" in self.__dict__:  # only set on some stages
+            for col_s, col_t in zip(
+                self.output_wires_complementary, other.input_wires_complementary
+            ):
+                for el_s, el_t in zip(col_s, col_t):
+                    el_s.connect_to(el_t)
+
+
+class InputStage(Stage):  # first stage: outputs are the very same matrix as the inputs
+    def __init__(self, shape: Shape, gates: bool = False):
+        self.input_shape = shape
+        self.output_shape = shape
+        self.input_wires = Bitmatrix(shape)
+        self.gates = gates
+        if gates:  # additionally provide a complementary matrix of the same shape
+            self.input_wires_complementary = Bitmatrix(shape)
+            self.output_wires_complementary = self.input_wires_complementary
+
+        self.output_wires = self.input_wires  # alias, not a copy
+
+    def accept(self, visitor) -> None:
+        visitor.visit_input_stage(self)
+
+
+class PipelineStage(Stage):  # puts one Logic node between every input and output bit
+    def __init__(self, shape: Shape):
+        self.input_shape = shape
+        self.output_shape = shape
+        self.input_wires = Bitmatrix(shape)
+        self.output_wires = Bitmatrix(shape)
+        self.instances = []  # the Logic nodes created below
+        for i_c, o_c in zip(self.input_wires, self.output_wires):
+            for i, o in zip(i_c, o_c):
+                lgc = Logic()
+                i.connect_to(lgc)  # input bit -> Logic -> output bit
+                lgc.connect_to(o)
+                self.instances.append(lgc)
+
+    def accept(self, visitor) -> None:
+        visitor.visit_pipeline_stage(self)
+
+
+class CompressionStage(Stage):  # stage made of counters, each placed at a column shift
+    def __init__(self):
+        self.counters_with_shifts = []  # list of (counter, shift) tuples
+        self.input_wires = Bitmatrix()
+        self.output_wires = Bitmatrix()
+
+    @property
+    def input_shape(self):  # sum of the shifted input shapes of all counters
+        return self._shape(lambda x: x.input_shape)
+
+    @property
+    def output_shape(self):  # sum of the shifted output shapes of all counters
+        return self._shape(lambda x: x.output_shape)
+
+    def _shape(self, func):  # func selects which shape of a counter to accumulate
+        shape = Shape(())
+        for ctr, shift in self.counters_with_shifts:
+            shifted_shape = func(ctr) << shift
+            shape = shape + shifted_shape
+        return shape
+
+    def append_counter(self, counter: Counter, shift: int):  # register counter, extend matrices
+        self.counters_with_shifts.append((counter, shift))
+        for source_idx, col in enumerate(counter.input_wires):
+            for wire in col:
+                self.input_wires.add_output(wire, source_idx + shift)  # matrix element -> wire
+        for source_idx, col in enumerate(counter.output_wires):
+            for wire in col:
+                self.output_wires.add_input(wire, source_idx + shift)  # wire -> matrix element
+
+    def accept(self, visitor) -> None:
+        visitor.visit_compression_stage(self)
+
+
+class GateAbsorbedStage(CompressionStage):  # compression stage with complementary inputs
+    def __init__(self):
+        super().__init__()
+        self.input_wires_complementary = Bitmatrix()
+
+    def append_counter(self, counter: GateAbsorptionCounter, shift: int):
+        super().append_counter(counter, shift)  # regular inputs/outputs first
+        for source_idx, col in enumerate(counter.input_wires_complementary):
+            for wire in col:
+                self.input_wires_complementary.add_output(wire, source_idx + shift)
+
+    def accept(self, visitor) -> None:  # dispatches to visit_gate_absorbed_stage
+        visitor.visit_gate_absorbed_stage(self)
+
+
+class Compressor(Node):  # named, ordered collection of stages
+    def __init__(self, name):
+        self.stages = []
+        self.module_name = name
+        self.io = []
+
+    @property
+    def input_shape(self):  # shape of the first stage; IndexError if there are no stages
+        return self.stages[0].input_shape
+
+    @property
+    def output_shape(self):  # shape of the last stage
+        return self.stages[-1].output_shape
+
+    @property
+    def delay(self):  # total delay accumulated over all stages
+        from .accumulator import AccumulatorStage  # noqa: PLC0415
+
+        delay_ = 0
+        for s in self.stages:
+            if isinstance(s, PipelineStage):  # each pipeline stage adds 1
+                delay_ += 1
+            if isinstance(s, AccumulatorStage):  # each accumulator stage adds 1
+                delay_ += 1
+            # Check for pipelined final adders in CompressionStages
+            if isinstance(s, CompressionStage):
+                for counter, _ in s.counters_with_shifts:
+                    if hasattr(counter, "delay"):  # only counters exposing `delay` contribute
+                        delay_ += counter.delay
+        return delay_
+
+    def accept(self, visitor) -> None:
+        visitor.visit_compressor(self)
+
+
+class BitmatrixElement(Connectable):  # one bit of a Bitmatrix, addressed by (column, row)
+    def __init__(self, vector, idx_x, idx_y):
+        self.vector = vector  # the owning Bitmatrix (its `shape` is read by lin_idx)
+        self.idx_2d = (idx_x, idx_y)
+        super().__init__()
+
+    @property
+    def lin_idx(self):  # flat index: heights of all preceding columns plus the row
+        return sum(self.vector.shape[: self.idx_2d[0]]) + self.idx_2d[1]
+
+    def accept(self, visitor):  # elements are not visited individually
+        pass
+
+
+class Bitmatrix(Node):  # list of columns, each a list of BitmatrixElement
+    def __init__(self, shape: Shape = Shape(), name: str = None):
+        self._name = name
+        self.prefix = ""  # "", "input " or "output " (see the setters below)
+        self.connectables = [
+            [BitmatrixElement(self, idx, row) for row in range(col)]
+            for idx, col in enumerate(shape)
+        ]
+        super().__init__()
+
+    def set_to_module_input(self):
+        self.prefix = "input "
+
+    def set_to_module_output(self):
+        self.prefix = "output "
+
+    def __len__(self):  # number of columns
+        return len(self.connectables)
+
+    def __getitem__(self, sel):  # column (or slice of columns) by index
+        return self.connectables[sel]
+
+    def __iter__(self):  # iterates over columns
+        return self.connectables.__iter__()
+
+    def total_size(self):  # number of elements over all columns
+        return sum([len(col) for col in self.connectables])
+
+    @property
+    def shape(self):  # current column heights as a Shape
+        return Shape([len(col) for col in self.connectables])
+
+    def add_output(self, el, col_idx):  # new element in column col_idx that drives el
+        be = self._append_wire(el, col_idx)
+        be.connect_to(el)
+
+    def add_input(self, el, col_idx):  # new element in column col_idx that is driven by el
+        be = self._append_wire(el, col_idx)
+        el.connect_to(be)
+
+    def _append_wire(self, el, col_idx):  # `el` itself is not used here
+        while len(self.connectables) <= col_idx:  # grow with empty columns as needed
+            self.connectables.append([])
+        be = BitmatrixElement(self, col_idx, len(self.connectables[col_idx]))
+        self.connectables[col_idx].append(be)
+        return be
+
+    def accept(self, visitor) -> None:
+        visitor.visit_bitmatrix(self)
+
+
+class LUT(Blackbox):  # blackbox whose only parameter is its INIT code
+    @abstractmethod
+    def __init__(
+        self,
+        module_name,
+        init_code: str,
+        in_ports: Tuple[BlackboxInput],
+        out_ports: Tuple[BlackboxOutput],
+        *,
+        size,
+        desired_name="lut",
+    ):
+        self.desired_name = desired_name
+        self.size = size  # summed by Counter.efficiency as the cost of this LUT
+        super().__init__(module_name, in_ports, out_ports, {"INIT": init_code})
diff --git a/src/finn/compressor/src/graph/primitives.py b/src/finn/compressor/src/graph/primitives.py
new file mode 100644
index 0000000000..bb77faee5c
--- /dev/null
+++ b/src/finn/compressor/src/graph/primitives.py
@@ -0,0 +1,133 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FPGA primitive definitions for compressor (LUTs, carry chains, etc.)
+#############################################################################
+
+from ctypes import c_uint32, c_uint64
+
+from .nodes import (
+    LUT,
+    Blackbox,
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+    Constant,
+)
+
+
+class LUT2(LUT):  # "LUT2" primitive: inputs I0..I1, single output O
+    @classmethod
+    def fromPred(self, predO2, desired_name="lut2"):  # build the INIT code from a predicate
+        res = 0
+        for i in range(4):  # a 2-input truth table has exactly 4 rows
+            inputs = [bool(i & (1 << shmt)) for shmt in range(2)]
+            res = res | (int(predO2(*inputs)) << i)
+        # res now holds the 4-bit INIT value: bit i is predO2 evaluated on row i.
+        init_str = f"""4'h{"{:_x}".format(c_uint32(res).value)}"""
+        return LUT2(init_str, desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(2)]
+        out_ports = [BlackboxOutput("O")]
+        super().__init__(
+            "LUT2", init_code, in_ports, out_ports, desired_name=desired_name, size=0.5
+        )
+
+
+class LUT5(LUT):  # "LUT5" primitive: inputs I0..I4, single output O
+    @classmethod
+    def fromPred(self, predO5, desired_name="lut5"):  # build the INIT code from a predicate
+        res = 0
+        for i in range(32):  # one truth-table row per input combination
+            inputs = [bool(i & (1 << shmt)) for shmt in range(5)]  # bit k of i feeds input k
+            res = res | (int(predO5(*inputs)) << i)
+        init_str = f"""32'h{"{:_x}".format(c_uint32(res).value)}"""
+        return LUT5(init_str, desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(5)]
+        out_ports = [BlackboxOutput("O")]
+        super().__init__(
+            "LUT5", init_code, in_ports, out_ports, desired_name=desired_name, size=0.5
+        )
+
+
+class LUT6(LUT):  # "LUT6" primitive: inputs I0..I5, single output O
+    @classmethod
+    def fromPred(self, predO6, desired_name="lut6"):  # build the INIT code from a predicate
+        res = 0
+        for i in range(64):  # one truth-table row per input combination
+            inputs = [bool(i & (1 << shmt)) for shmt in range(6)]  # bit k of i feeds input k
+            res = res | (int(predO6(*inputs)) << i)
+        init_str = f"""64'h{"{:_x}".format(c_uint64(res).value)}"""
+        return LUT6(init_str, desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(6)]
+        out_ports = [BlackboxOutput("O")]
+        super().__init__("LUT6", init_code, in_ports, out_ports, desired_name=desired_name, size=1)
+
+
+def split_lut_from_pred(predO5, predO6):  # 64-bit INIT string: predO5 low half, predO6 high half
+    res = 0
+    for i in range(32, 64):  # only rows with input bit 5 set are evaluated
+        inputs = [bool(i & (1 << shmt)) for shmt in range(6)]
+        res = res | (int(predO5(*inputs)) << (i - 32)) | (int(predO6(*inputs)) << (i))
+    init_str = f"""64'h{"{:_x}".format(c_uint64(res).value)}"""  # formatted once, after the loop
+    return init_str
+
+
+class LUT6_2(LUT):  # "LUT6_2" primitive: inputs I0..I5, outputs O6 and O5
+    @classmethod
+    def fromPred(self, predO5, predO6, desired_name="lut6_2"):  # INIT from two predicates
+        return LUT6_2(split_lut_from_pred(predO5, predO6), desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(6)]
+        out_ports = [BlackboxOutput("O6"), BlackboxOutput("O5")]
+        super().__init__(
+            "LUT6_2", init_code, in_ports, out_ports, desired_name=desired_name, size=1
+        )
+        Constant("1'b1").connect_to(self.I5)  # I5 is always driven by constant 1
+
+
+class LUT6CY(LUT):  # "LUT6CY" primitive: inputs I0..I4, outputs O51, O52 and PROP
+    @classmethod
+    def fromPred(self, predO51, predO52, desired_name="lut6cy"):  # INIT from two predicates
+        return LUT6CY(split_lut_from_pred(predO51, predO52), desired_name)
+
+    def __init__(self, init_code: str, desired_name):
+        in_ports = [BlackboxInput(f"I{el}") for el in range(5)]
+        out_ports = [BlackboxOutput(f"O5{el+1}") for el in range(2)]  # named "O51" and "O52"
+        out_ports.append(BlackboxOutput("PROP"))
+        super().__init__(
+            "LUT6CY", init_code, in_ports, out_ports, desired_name=desired_name, size=1
+        )
+
+
+class LOOKAHEAD8(Blackbox):  # "LOOKAHEAD8" primitive with all four LOOK* parameters "TRUE"
+    def __init__(self):
+        c_in_ports_str = ["CIN", "CYA", "CYB", "CYC", "CYD", "CYE", "CYF", "CYG", "CYH"]
+        p_in_ports_str = ["PROPA", "PROPB", "PROPC", "PROPD", "PROPE", "PROPF", "PROPG", "PROPH"]
+        out_ports_str = ["COUTB", "COUTD", "COUTF", "COUTH"]
+
+        self.c_in_ports = [BlackboxInput(el) for el in c_in_ports_str]  # index 0 is CIN
+        self.p_in_ports = [BlackboxInput(el) for el in p_in_ports_str]
+        out_ports = [BlackboxOutput(el) for el in out_ports_str]  # one output per two inputs
+        super().__init__(
+            "LOOKAHEAD8",
+            self.c_in_ports + self.p_in_ports,
+            out_ports,
+            {"LOOKB": '"TRUE"', "LOOKD": '"TRUE"', "LOOKF": '"TRUE"', "LOOKH": '"TRUE"'},
+        )
+
+
+class CARRY4(Blackbox):  # "CARRY4" primitive with no parameters
+    def __init__(self):
+        in_ports = [BlackboxInputVec("DI", 4), BlackboxInputVec("S", 4), BlackboxInput("CI")]
+        out_ports = [BlackboxOutputVec("O", 4), BlackboxOutputVec("CO", 4)]  # 4-wide vectors
+        super().__init__("CARRY4", in_ports, out_ports, {})
diff --git a/src/finn/compressor/src/graph/visitor.py b/src/finn/compressor/src/graph/visitor.py
new file mode 100644
index 0000000000..2f8e6950ed
--- /dev/null
+++ b/src/finn/compressor/src/graph/visitor.py
@@ -0,0 +1,72 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Visitor pattern for compressor graph traversal
+#############################################################################
+
+from abc import ABC
+
+from .nodes import (
+    Bitmatrix,
+    Blackbox,
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    Logic,
+    PipelineStage,
+)
+from .primitives import (
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+)
+
+
+class Visitor(ABC):  # base visitor: every hook raises NotImplementedError unless overridden
+    def visit_compressor(self, c: Compressor):
+        raise NotImplementedError
+
+    def visit_input_stage(self, s: InputStage):
+        raise NotImplementedError
+
+    def visit_gate_absorption_stage(self, s: GateAbsorbedStage):
+        raise NotImplementedError  # NOTE(review): emitter.py defines visit_gate_absorbed_stage
+
+    def visit_pipeline_stage(self, s: PipelineStage):
+        raise NotImplementedError
+
+    def visit_compression_stage(self, s: CompressionStage):
+        raise NotImplementedError
+
+    def visit_counter(self, c: Counter):
+        raise NotImplementedError
+
+    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
+        raise NotImplementedError
+
+    def visit_blackbox(self, b: Blackbox):
+        raise NotImplementedError
+
+    def visit_blackbox_input(self, b: BlackboxInput):
+        raise NotImplementedError
+
+    def visit_blackbox_output(self, b: BlackboxOutput):
+        raise NotImplementedError
+
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
+        raise NotImplementedError
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
+        raise NotImplementedError
+
+    def visit_logic(self, lgc: Logic):
+        raise NotImplementedError
+
+    def visit_bitmatrix(self, b: Bitmatrix):
+        raise NotImplementedError
diff --git a/src/finn/compressor/src/main.py b/src/finn/compressor/src/main.py
new file mode 100644
index 0000000000..bac3f98a9d
--- /dev/null
+++ b/src/finn/compressor/src/main.py
@@ -0,0 +1,189 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Main compressor tree generation entry point
+#############################################################################
+
+import argparse
+import time
+from typing import List, Optional
+
+from .passes.compressor_constructor import CompressorConstructor
+from .passes.cost_estimator import CostEstimator
+from .passes.emitter import VerilogGenerator
+from .passes.io_annotator import IOAnnotator
+from .passes.lut_placer import LUTPlacer
+from .passes.printer import CompressorPrinter
+from .passes.wire_inserter import WireInserter
+from .target import SevenSeries, Target, UltraScale, Versal
+from .tests.test_gen import generate_test
+from .tests.tester import tester
+from .utils.shape import Shape
+
+
+def parse_cli():
+    """Parse the command line and invoke generate_compressor with the requested options.
+
+    Prints a message and exits with status -1 on a malformed shape, gate list or constant.
+    """
+    parser = argparse.ArgumentParser(
+        prog="Compressor Generator", description="Generate a Compressor Tree for a given input."
+    )
+    parser.add_argument(
+        "-o", "--output", default="../gen/out.sv", help="Path to store the compressor at."
+    )
+    parser.add_argument("-s", "--shape", required=True, help="Input shape.")
+    parser.add_argument("-a", "--accumulate", action="store_true", help="Enable accumulation.")
+    parser.add_argument(
+        "-w", "--accumulator_width", help="Accumulator width [default: Reduced input shape]."
+    )
+    parser.add_argument(
+        "-g",
+        "--gates",
+        default=None,
+        help="Inline 2-input gates into the compressor. LSB is left. Example: 8,3",
+    )
+    parser.add_argument(
+        "-t",
+        "--target",
+        default="Versal",
+        help="Target FPGA generation.",
+        choices=["Versal", "7-Series", "UltraScale"],
+    )
+    parser.add_argument(
+        "--test", action="store_true", help="Test the generated compressor using Vivado XSim."
+    )
+    parser.add_argument(
+        "-n", "--name", default="comp", help="Name of the generated Systemverilog module."
+    )
+    parser.add_argument(
+        "-p",
+        "--pipeline_every",
+        default=None,
+        help="Insert Pipeline registers every n stages. Default: Purely combinatorial.",
+    )
+    parser.add_argument(
+        "-c", "--constant", default=[], help="Add a constant binary number input. Example: 1011"
+    )
+    args = parser.parse_args()
+
+    try:
+        shape = Shape(int(el) for el in args.shape.split(","))
+    except (ValueError, TypeError):
+        print("Improperly defined shape.")
+        exit(-1)
+
+    gates = []
+    if args.gates:
+        # Checked explicitly: an assert would be stripped when Python runs with -O.
+        if len(args.gates) != sum(shape):
+            print("Length of shape and gate specification do not match.")
+            exit(-1)
+        # One gate character per input bit, handed out column by column.
+        gates_lin = iter(args.gates)
+        gates = [[next(gates_lin) for _ in range(col)] for col in shape]
+
+    constants = []
+    for char in args.constant:
+        try:
+            constants.append(int(char, 2))
+        except ValueError:
+            print("Improperly defined constant.")
+            exit(-1)
+
+    # argparse's `choices` guarantees that args.target is one of these keys.
+    targets = {"Versal": Versal, "7-Series": SevenSeries, "UltraScale": UltraScale}
+    target = targets[args.target]()
+
+    generate_compressor(
+        target,
+        shape,
+        args.name,
+        int(args.pipeline_every) if args.pipeline_every else None,
+        args.accumulate,
+        int(args.accumulator_width) if args.accumulator_width else None,
+        gates,
+        constants,
+        args.output,
+        args.test,
+    )
+
+
+def generate_compressor(
+    target: Target,
+    shape: Shape,
+    name: str,
+    comb_depth: Optional[int],
+    accumulate: bool,
+    accumulator_width: Optional[int],
+    gates: List[List[str]],
+    constants: List[int],  # Each element is a binary number digit.
+    path: str,
+    test: bool,
+    enable: bool = False,
+):
+    """Construct a compressor for `shape`, write its Verilog to `path` and return its delay.
+
+    The graph is built by CompressorConstructor from the candidates of `target` and then
+    visited by the LUTPlacer, WireInserter, IOAnnotator, CostEstimator and VerilogGenerator
+    passes. If `test` is set, a testbench is written to ../gen/test.sv and handed to tester().
+    """
+    start_time = time.time()
+    c = CompressorConstructor()(
+        target.counter_candidates,
+        target.absorbing_counter_candidates,
+        target.final_adder,
+        shape,
+        name,
+        comb_depth=comb_depth,
+        accumulate=accumulate,
+        accumulator_width=accumulator_width,
+        constants=constants,
+        gates=gates,
+        enable=enable,
+    )
+
+    for graph_pass in (LUTPlacer, WireInserter, IOAnnotator):
+        c.accept(graph_pass())
+    cost = CostEstimator()
+    c.accept(cost)
+    emitter = VerilogGenerator()
+    c.accept(emitter)
+
+    with open(path, "w") as f:
+        withprefix = (
+            f"""// Adder generated by the Python Compressor Generator
+// Input shape: {c.input_shape}; Output Shape: {c.output_shape}
+// Pipeline stages: {c.delay}
+// Target Generation: {target.__class__.__name__}
+// Approximate LUTs: {int(cost.luts+0.5)}
+// Accumulation: {"yes" if accumulate else "no"} {f"of width {accumulator_width}"
+                                                  if accumulator_width else ""}
+// Enable mode: {"yes (init values set on accumulator registers)" if enable else "no"}
+// Gates: {gates if gates else "None"}
+        """
+            + emitter.emitter.output
+        )
+        f.write(withprefix)
+
+    # Elapsed wall-clock time; end minus start so the reported value is non-negative.
+    end_time = time.time()
+    print("--%s seconds" % (end_time - start_time))
+
+    c.accept(CompressorPrinter())
+
+    if test:
+        constant = int("".join(str(c) for c in constants), 2) if constants else 0
+        # NOTE(review): the second argument is the literal "comp" rather than `name` - confirm.
+        test = generate_test(shape, "comp", c.delay, gates, accumulate, accumulator_width, constant)
+        with open("../gen/test.sv", "w") as f:
+            f.write(test)
+        tester("../gen/test.sv", path)
+
+    return c.delay
+
+
+if __name__ == "__main__":  # run the command-line interface only when executed, not on import
+    parse_cli()
diff --git a/src/finn/compressor/src/passes/__init__.py b/src/finn/compressor/src/passes/__init__.py
new file mode 100644
index 0000000000..ff4b37ccd2
--- /dev/null
+++ b/src/finn/compressor/src/passes/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor passes package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/passes/compressor_constructor.py b/src/finn/compressor/src/passes/compressor_constructor.py
new file mode 100644
index 0000000000..e7b7ad7192
--- /dev/null
+++ b/src/finn/compressor/src/passes/compressor_constructor.py
@@ -0,0 +1,201 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree constructor with two-pass accumulator handling
+#############################################################################
+
+from typing import List, Tuple
+
+from ..graph.accumulator import AccumulatorStage
+from ..graph.counters.absorption_counter_candidates import (
+    GateAbsorptionCounterCandidate,
+)
+from ..graph.counters.counter_candidates import ConstantOne
+from ..graph.nodes import (
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    InputStage,
+    Passthrough,
+)
+from ..utils.shape import Shape
+from .compressor_pipeliner import CompressorPipeliner
+
+
+class CompressorConstructor:
+    """Builds a Compressor graph stage by stage from counter candidates and a final adder."""
+
+    def adjust_compression_goal_for_constants(self, compression_goal, constants):
+        # Subtract constants, but never go below 2 (minimum achievable by compressor)
+        return lambda x: max(2, compression_goal(x) - (constants[x] if x < len(constants) else 0))
+
+    def get_compression_goal(self, final_adder, accumulate, constants):
+        # Two-pass strategy for accumulate: compress to goal, add constants, then post-check
+        return self.adjust_compression_goal_for_constants(final_adder.compression_goal, constants)
+
+    def add_constants_to_stage(self, s: CompressionStage, constants):
+        """Add constant bits to the compression stage."""
+        for idx, el in enumerate(constants):
+            if el:
+                s.append_counter(ConstantOne(), idx)
+
+    def __call__(
+        self,
+        counter_candidates,
+        absorption_counter_candidates,
+        final_adder,
+        input_shape: Shape,
+        name: str,
+        comb_depth: int = None,
+        accumulate=False,
+        accumulator_width: int = None,
+        constants: Tuple[bool] = tuple(),
+        gates: Tuple[Tuple[str]] = tuple(),
+        enable: bool = False,
+    ) -> Compressor:
+        """Build the Compressor `name` for `input_shape`: append its stages, then connect them."""
+        compression_goal = self.get_compression_goal(final_adder, accumulate, constants)
+
+        c = Compressor(name)
+        c.stages.append(InputStage(input_shape, gates))
+
+        if gates:
+            s = self.construct_absorption_stage(
+                c.stages[-1].output_shape, gates, absorption_counter_candidates
+            )
+            c.stages[-1].connect_to(s)
+            c.stages.append(s)
+
+        while not self.compression_goal_reached(c.stages[-1].output_shape, compression_goal):
+            self.add_compression_stage(c, compression_goal, counter_candidates)
+
+        # Add constants to the graph.
+        if not isinstance(c.stages[-1], CompressionStage) and constants:
+            self.add_compression_stage(c, compression_goal, counter_candidates)
+        self.add_constants_to_stage(c.stages[-1], constants)
+
+        if accumulate:
+
+            def post_const_goal(x):
+                # Leave room for feedback (height 1) within ternary adder capacity
+                return max(2, final_adder.compression_goal(x) - 1)
+
+            while not self.compression_goal_reached(c.stages[-1].output_shape, post_const_goal):
+                self.add_compression_stage(c, post_const_goal, counter_candidates)
+
+        # A falsy comb_depth (None or 0) skips the pipeliner, leaving zero pipeline stages.
+        pipeline_stages = CompressorPipeliner().pipeline(c, comb_depth) if comb_depth else 0
+
+        if accumulate:
+            acc = AccumulatorStage(
+                c.stages[-1].output_shape,
+                final_adder,
+                pipeline_stages,
+                accumulator_width=accumulator_width,
+                enable=enable,
+            )
+            c.stages.append(acc)
+        # if we dont accumulate, we can choose between a pipelined
+        # or non-pipelined quaternary final adder when using Versal.
+        elif max(c.stages[-1].output_shape) > 1:
+            final_stage = CompressionStage()
+            # Final adders that reject the `pipelined` keyword are built without it.
+            try:
+                fa = final_adder(c.stages[-1].output_shape, pipelined=True)
+            except TypeError:
+                fa = final_adder(c.stages[-1].output_shape)
+            final_stage.append_counter(fa, 0)
+            c.stages.append(final_stage)
+
+        # Connect every stage to its successor.
+        for s_p, s_n in zip(c.stages, c.stages[1:]):
+            s_p.connect_to(s_n)
+        return c
+
+    def add_compression_stage(self, compressor: Compressor, compression_goal, counter_candidates):
+        """Add a compression stage. Cannot compress columns with height < 3 (Full Adder = 3:2)."""
+        new_stage = CompressionStage()
+        stage_inputs = compressor.stages[-1].output_shape
+        stage_outputs = Shape()
+
+        i = 0
+        while i < max(len(stage_inputs), len(stage_outputs)):
+
+            def cur_output_height():
+                return (stage_inputs + stage_outputs)[i]
+
+            def cur_input_height():
+                return stage_inputs[i] if len(stage_inputs) > i else 0
+
+            while cur_input_height() >= 3 and cur_output_height() > compression_goal(i):
+                counter = self.schedule_counter(
+                    stage_inputs[i:],
+                    stage_outputs[i:],
+                    lambda x: compression_goal(x + i),
+                    counter_candidates,
+                )
+                stage_inputs = stage_inputs - (counter.input_shape << i)
+                stage_outputs = stage_outputs + (counter.output_shape << i)
+                new_stage.append_counter(counter, i)
+            i += 1
+
+        # pass through all leftover inputs:
+        for i in range(len(stage_inputs)):
+            for j in range(stage_inputs[i]):
+                new_stage.append_counter(Passthrough(), i)
+
+        compressor.stages.append(new_stage)
+
+    def schedule_counter(
+        self, stage_inputs, stage_outputs, compression_goal, counter_candidates
+    ) -> Counter:
+        """Return the fitting candidate with the highest (efficiency, strength).
+
+        Raises ValueError if no candidate can be extended to fit the given shapes.
+        """
+        fitting = [
+            cand.extend_to_fit(stage_inputs, stage_outputs, compression_goal)
+            for cand in counter_candidates
+        ]
+        fitting = [counter for counter in fitting if counter is not None]
+        if not fitting:
+            raise ValueError(
+                f"Could not schedule counter for input shape {stage_inputs}; "
+                f"output shape {stage_outputs}; compression goal {compression_goal(0)}"
+            )
+        return max(fitting, key=lambda x: (x.efficiency, x.strength))
+
+    def compression_goal_reached(self, shape, compression_goal):
+        return all(col <= compression_goal(idx) for idx, col in enumerate(shape))
+
+    def get_best_inlined_counter(self, input_shape, gates, absorption_counters):
+        """Return the fitting absorption counter with the highest (efficiency, strength)."""
+        fitted = (counter.extend_to_fit(input_shape, gates) for counter in absorption_counters)
+        candidates = [candidate for candidate in fitted if candidate]
+        if not candidates:
+            raise ValueError(f"No gate absorption counter fits input shape {input_shape}")
+        return max(candidates, key=lambda x: (x.efficiency, x.strength))
+
+    def construct_absorption_stage(
+        self,
+        input_shape: Shape,
+        gates: List[str],
+        absorption_counters: GateAbsorptionCounterCandidate,
+    ):
+        """Cover every input column with gate absorption counters and return the new stage."""
+        s = GateAbsorbedStage()
+        cur_shape = input_shape
+        cur_gates = gates[:]
+        for idx in range(len(input_shape)):
+            while cur_shape[idx] > 0:
+                best = self.get_best_inlined_counter(
+                    cur_shape[idx:], cur_gates[idx:], absorption_counters
+                )
+                cur_shape = cur_shape - (best.input_shape << idx)
+                for i in range(len(cur_shape)):  # keep only the last cur_shape[i] gates per column
+                    cur_gates[i] = list(reversed(list(reversed(cur_gates[i]))[: cur_shape[i]]))
+                s.append_counter(best, idx)
+        return s
diff --git a/src/finn/compressor/src/passes/compressor_pipeliner.py b/src/finn/compressor/src/passes/compressor_pipeliner.py
new file mode 100644
index 0000000000..3a291d972f
--- /dev/null
+++ b/src/finn/compressor/src/passes/compressor_pipeliner.py
@@ -0,0 +1,37 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree pipelining pass
+#############################################################################
+
+from ..graph.nodes import CompressionStage, Compressor, PipelineStage
+
+
+class CompressorPipeliner:
+    """Inserts PipelineStage nodes between the compression stages of a Compressor."""
+
+    def pipeline(self, c: Compressor, max_combinational_depth: int):
+        """Rebuild `c.stages` with pipeline stages added and return how many were inserted."""
+        depth = 0
+        inserted = 0
+        rebuilt = []
+        last_idx = len(c.stages) - 1
+
+        for idx, stage in enumerate(c.stages):
+            rebuilt.append(stage)
+            if not isinstance(stage, CompressionStage):
+                continue
+            depth += 1
+            # The very last stage is registered one level earlier than the others.
+            limit = max_combinational_depth - 1 if idx == last_idx else max_combinational_depth
+            if depth >= limit:
+                rebuilt.append(PipelineStage(stage.output_shape))
+                depth = 0
+                inserted += 1
+        c.stages = rebuilt
+
+        for prev, nxt in zip(c.stages, c.stages[1:]):
+            prev.connect_to(nxt)
+        return inserted
diff --git a/src/finn/compressor/src/passes/cost_estimator.py b/src/finn/compressor/src/passes/cost_estimator.py
new file mode 100644
index 0000000000..5d0f6f0514
--- /dev/null
+++ b/src/finn/compressor/src/passes/cost_estimator.py
@@ -0,0 +1,41 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Cost estimation pass for compressor resources
+#############################################################################
+
+from ..graph.nodes import (
+    Blackbox,
+    CompressionStage,
+    Compressor,
+    GateAbsorbedStage,
+    PipelineStage,
+)
+from ..graph.primitives import LUT, LUT2, LUT5, LUT6, LUT6_2, LUT6CY
+from .node_iterator import NodeIterator
+
+
+class CostEstimator(NodeIterator):  # tallies stage counts and an approximate LUT count
+    def iter_compressor(self, c: Compressor):
+        self.combinatorial_stages = -1  # Start with -1 to exclude final adder
+        self.pipeline_stages = 0
+        self.luts = 0  # grows in steps of 0.5, so it may hold a fractional value
+
+    def iter_compression_stage(self, s: CompressionStage):
+        self.combinatorial_stages += 1
+
+    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        self.combinatorial_stages += 1
+
+    def iter_pipeline_stage(self, p: PipelineStage):
+        self.pipeline_stages += 1
+
+    def iter_blackbox(self, b: Blackbox):
+        if isinstance(b, (LUT5, LUT2)):  # counted as half a LUT each
+            self.luts += 0.5
+        elif isinstance(b, (LUT6, LUT6CY, LUT6_2)):
+            self.luts += 1
+        elif isinstance(b, LUT):  # any other LUT type: fail loudly instead of miscounting
+            raise RuntimeError(f"No cost function implemented for this LUT type {b}")
diff --git a/src/finn/compressor/src/passes/emitter.py b/src/finn/compressor/src/passes/emitter.py
new file mode 100644
index 0000000000..3e69ead969
--- /dev/null
+++ b/src/finn/compressor/src/passes/emitter.py
@@ -0,0 +1,358 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Verilog emitter for compressor tree
+#############################################################################
+
+from collections import defaultdict
+from contextlib import contextmanager
+from io import StringIO
+from typing import Tuple
+
+from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    Bitmatrix,
+    BitmatrixElement,
+    Blackbox,
+    BlackboxPort,
+    BlackboxVecElement,
+    CompressionStage,
+    Compressor,
+    Connectable,
+    Constant,
+    Counter,
+    GateAbsorbedStage,
+    InputStage,
+    Logic,
+    PipelineStage,
+    Wire,
+)
+from ..graph.primitives import (
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+)
+from ..graph.visitor import Visitor
+
+
+class VerilogEmitter:
+    """In-memory text sink that writes tab indentation at the start of every emitted line."""
+
+    def __init__(self):
+        self._out = StringIO()
+        self._indent_level = 0
+        self._line_start = True
+
+    def emit(self, line=""):
+        prefix = "\t" * self._indent_level if self._line_start else ""
+        self._out.write(prefix + line)
+        self._line_start = False
+
+    def emitln(self, line=""):
+        self.emit(line + "\n")
+        self._line_start = True  # the next emit/emitln starts a fresh, indented line
+
+    @property
+    @contextmanager
+    def indent(self):
+        """Context manager raising the indentation by one level inside its `with` block."""
+        self._indent_level += 1
+        try:
+            yield None
+        finally:
+            self._indent_level -= 1
+
+    @property
+    def output(self):
+        return self._out.getvalue()
+
+    def save_verilog(self, filename):
+        with open(filename, "w") as f:
+            f.write(self.output)
+
+
+class VerilogGenerator(Visitor):
+    """Visitor that renders a Compressor graph as Verilog text through `self.emitter`."""
+
+    def set_name(self, o: object, name):
+        self._names[type(o)][o] = name
+
+    def get_name(self, o: object):
+        """Return the Verilog identifier (or expression) for `o`, assigning one on first use."""
+        if isinstance(o, BlackboxPort):
+            return o.name
+
+        if o in self._names[type(o)]:
+            return self._names[type(o)][o]
+
+        subdict = self._names[type(o)]
+
+        if isinstance(o, Logic):
+            subdict[o] = f"logic_{len(subdict)}"
+        elif isinstance(o, Wire):
+            if o.desired_name and o.desired_name not in subdict.values():
+                subdict[o] = o.desired_name
+            else:
+                if o.desired_name:
+                    print(f"Could not obey desired name: {o.desired_name}")
+                # Always register a name so the lookup at the end cannot raise KeyError.
+                subdict[o] = f"wire_{len(subdict)}"
+        elif isinstance(o, Bitmatrix):
+            subdict[o] = f"bitmatrix_{len(subdict)}"
+        elif isinstance(o, BitmatrixElement):
+            bitmatrix = o.vector
+            return self.get_name(bitmatrix) + f"[{o.lin_idx}]"
+        elif isinstance(o, Constant):
+            return o.value
+        elif isinstance(o, Blackbox):
+            subdict[o] = f"{o.module_name.lower()}_{len(subdict)}"
+        else:
+            raise NotImplementedError(f"get_name cannot handle this type {type(o)}")
+        return subdict[o]
+
+    def visit_compressor(self, c: Compressor):
+        """Reset all per-run state, then emit the module header, every stage and `endmodule`."""
+        self.emitter = VerilogEmitter()
+        self._declared_hardware = set()
+        self._emitted_hardware = set()
+        self._names = defaultdict(dict)
+
+        self.set_name(c.stages[0].input_wires, "in")
+        if hasattr(c.stages[0], "input_wires_complementary"):
+            self.set_name(c.stages[0].input_wires_complementary, "in_2")
+        self.set_name(c.stages[-1].output_wires, "out")
+
+        self.emitter.emitln(f"module {c.module_name}(")
+        with self.emitter.indent:
+            names = sorted(
+                ["input clk"]
+                + [
+                    el.prefix
+                    + (
+                        "logic "
+                        if isinstance(el, Logic)
+                        else f"[{el.total_size()-1}:0] "
+                        if isinstance(el, Bitmatrix)
+                        else ""
+                    )
+                    + self.get_name(el)
+                    for el in c.io
+                ],
+                key=lambda x: "input" not in x,  # stable sort: entries containing "input" first
+            )
+            self._declared_hardware.update(c.io)
+
+            self.emitter.emitln(",\n\t".join(names))
+        self.emitter.emitln(");")
+
+        with self.emitter.indent:
+            for stage in c.stages:
+                stage.accept(self)
+        self.emitter.emitln("endmodule")
+
+    def visit_input_stage(self, s: InputStage):
+        s.input_wires.accept(self)
+        if hasattr(s, "input_wires_complementary"):
+            s.input_wires_complementary.accept(self)
+        # Visit output_wires if they're separate from input_wires (trivial passthrough case)
+        if s.output_wires is not s.input_wires:
+            s.output_wires.accept(self)
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        self.emitter.emitln()
+        self.emitter.emitln("// Accumulator Stage")
+        a.input_wires.accept(self)
+        # Connectables first: sorted() is stable and False sorts before True.
+        for el in sorted(a.instances, key=lambda x: not isinstance(x, Connectable)):
+            el.accept(self)
+        a.output_wires.accept(self)
+
+    def visit_pipeline_stage(self, s: PipelineStage):
+        self.emitter.emitln()
+        self.emitter.emitln("// Pipeline Results..")
+        s.input_wires.accept(self)
+        [el.accept(self) for el in s.instances]
+        s.output_wires.accept(self)
+
+    def visit_compression_stage(self, s: CompressionStage):
+        self.emitter.emitln()
+        self.emitter.emitln(
+            f"// Compression Stage with Input Shape: {s.input_shape} "
+            f"and Output Shape {s.output_shape}"
+        )
+        s.input_wires.accept(self)
+        [c.accept(self) for c, _ in s.counters_with_shifts]
+        s.output_wires.accept(self)
+        self.emitter.emitln()
+
+    def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        self.emitter.emitln()
+        self.emitter.emitln("// Compression Stage with Gate Absorption.")
+        self.emitter.emitln(
+            f"// Input Shape: {g.input_shape} " f"and Output Shape: {g.output_shape}"
+        )
+        g.input_wires.accept(self)
+        g.input_wires_complementary.accept(self)
+        [c.accept(self) for c, _ in g.counters_with_shifts]
+        g.output_wires.accept(self)
+        self.emitter.emitln()
+
+    def visit_counter(self, c: Counter):
+        [el.accept(self) for col in c.input_wires for el in col]
+        [el.accept(self) for col in c.output_wires for el in col]
+        # Connectables first: sorted() is stable and False sorts before True.
+        for el in sorted(c.instances, key=lambda x: not isinstance(x, Connectable)):
+            el.accept(self)
+
+    def visit_gate_absorption_counter(self, c: GateAbsorbedStage):
+        [el.accept(self) for col in c.input_wires_complementary for el in col]
+        self.visit_counter(c)
+
+    def visit_wire(self, w: Wire):
+        """Declare `w` once and, if it is driven by a plain Connectable, emit its assign."""
+        if w in self._emitted_hardware:
+            return
+
+        if w not in self._declared_hardware:
+            self.emitter.emitln(f"uwire {self.get_name(w)};")
+        self._declared_hardware.add(w)
+
+        if w.source not in self._declared_hardware and isinstance(w.source, Wire):
+            w.source.accept(self)
+
+        if (
+            w.has_source
+            and isinstance(w.source, Connectable)
+            and not isinstance(w.source, BlackboxPort)
+            and not isinstance(w.source, BlackboxVecElement)
+        ):
+            self.emitter.emitln(f"assign {self.get_name(w)} = {self.get_name(w.source)};")
+        self._emitted_hardware.add(w)
+
+    def visit_logic(self, lgc: Logic):
+        """Declare the register `lgc` and emit its clocked always_ff block (optional rst/en)."""
+        if lgc in self._emitted_hardware:
+            return
+
+        if lgc not in self._declared_hardware:
+            self.emitter.emit(lgc.prefix)
+            init_str = f" = 1'b{lgc.init}" if lgc.init is not None else ""
+            self.emitter.emitln(
+                f'(* srl_style = "register" *) logic {self.get_name(lgc)}{init_str};'
+            )
+        self._declared_hardware.add(lgc)
+
+        if lgc.source not in self._declared_hardware and isinstance(lgc.source, Wire):
+            lgc.source.accept(self)
+
+        def emit_inner():
+            if lgc.source:
+                self.emitter.emitln(f"{self.get_name(lgc)} <= {self.get_name(lgc.source)};")
+
+        def emit_with_en():
+            if lgc.en:
+                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
+                with self.emitter.indent:
+                    emit_inner()
+                self.emitter.emitln("end")
+            else:
+                emit_inner()
+
+        def emit_with_rst_and_en():
+            if lgc.rst and lgc.en:
+                # En-gated rst: preserve state during stalls
+                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
+                with self.emitter.indent:
+                    self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
+                    with self.emitter.indent:
+                        self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
+                    self.emitter.emitln("end else begin")
+                    with self.emitter.indent:
+                        emit_inner()
+                    self.emitter.emitln("end")
+                self.emitter.emitln("end")
+            elif lgc.rst:
+                self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
+                with self.emitter.indent:
+                    self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
+                self.emitter.emitln("end else begin")
+                with self.emitter.indent:
+                    emit_inner()
+                self.emitter.emitln("end")
+            else:
+                emit_with_en()
+
+        self.emitter.emitln("always_ff @(posedge clk) begin")
+        with self.emitter.indent:
+            emit_with_rst_and_en()
+        self.emitter.emitln("end")
+        self._emitted_hardware.add(lgc)
+
+    def visit_blackbox(self, b: Blackbox):
+        if b.annotations:
+            self.emitter.emitln(f"(* {', '.join(b.annotations)} *)")
+        self.emitter.emitln(f"{b.module_name} #(")
+        with self.emitter.indent:
+            for idx, (key, value) in enumerate(b.parameters.items()):
+                ending = "," if idx != len(b.parameters) - 1 else ""
+                self.emitter.emitln(f".{key}({value}){ending}")
+        self.emitter.emitln(f") {self.get_name(b)} (")
+        with self.emitter.indent:
+            ports = b.out_ports + b.in_ports
+            for idx, port in enumerate(ports):
+                ending = "," if idx != len(ports) - 1 else ""
+                port.accept(self)
+                self.emitter.emitln(ending)
+        self.emitter.emitln(");")
+
+    def visit_blackbox_output(self, b: BlackboxOutput):
+        if b.has_target:
+            self.emitter.emit(f".{b.name}({self.get_name(b.target)})")
+        else:
+            self.emitter.emit(f".{b.name}()")
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
+        self.emitter.emit(f".{b.name}(")
+        self.emitter.emit("{")  # NOTE(review): unconnected elements are skipped - confirm
+        targets = [self.get_name(el.target) for el in b.elements[::-1] if el.target]
+        self.emitter.emit(", ".join(targets))
+        self.emitter.emit("})")
+
+    def visit_blackbox_input(self, b: BlackboxInput):
+        if b.has_source:
+            self.emitter.emit(f".{b.name}({self.get_name(b.source)})")
+        else:
+            self.emitter.emit(f".{b.name}(1'b0)")
+
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
+        self.emitter.emit(f".{b.name}(")
+        self.emitter.emit("{")  # elements are emitted in reverse; unconnected ones become 1'b0
+        sources = [self.get_name(el.source) if el.source else "1'b0" for el in b.elements[::-1]]
+        self.emitter.emit(", ".join(sources))
+        self.emitter.emit("})")
+
+    def emit_blackbox_ports(self, p: Tuple[BlackboxPort]):
+        for idx, port in enumerate(p):
+            separator = "," if idx != len(p) - 1 else ""
+            if port.connected:
+                self.emitter.emitln(
+                    f".{self.get_name(port)}({self.get_name(port.wire)}){separator}"
+                )
+            elif isinstance(port, BlackboxInput):
+                self.emitter.emitln(f".{self.get_name(port)}(1'b0){separator}")
+            else:
+                self.emitter.emitln(f".{self.get_name(port)}(){separator}")
+
+    def visit_bitmatrix(self, b: Bitmatrix):
+        if b not in self._declared_hardware:
+            self.emitter.emitln(f"uwire [{b.total_size()-1}:0] {self.get_name(b)};")
+            self._declared_hardware.add(b)
+
+        if b not in self._emitted_hardware:
+            # Only elements that have a source get an assign statement.
+            for el in (el for col in b for el in col if el.has_source):
+                self.emitter.emitln(f"assign {self.get_name(el)} = {self.get_name(el.source)};")
+            self._emitted_hardware.add(b)
diff --git a/src/finn/compressor/src/passes/io_annotator.py b/src/finn/compressor/src/passes/io_annotator.py
new file mode 100644
index 0000000000..c5e0b66b04
--- /dev/null
+++ b/src/finn/compressor/src/passes/io_annotator.py
@@ -0,0 +1,60 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Input/output annotation pass for compressor
+#############################################################################
+
+from ..graph.nodes import Bitmatrix, Compressor, Logic, Wire
+from .node_iterator import NodeIterator
+
+
+class IOAnnotator(NodeIterator):
+    def visit_compressor(self, c: Compressor):
+        input_wires = c.stages[0].input_wires
+        output_wires = c.stages[-1].output_wires
+
+        # Handle trivial passthrough case where input_wires IS output_wires (same object).
+        # This happens for N=1 compressors where only an InputStage exists.
+        # We need separate Bitmatrix objects for input and output ports.
+        if input_wires is output_wires:
+            new_output = Bitmatrix(input_wires.shape)
+            for in_col, out_col in zip(input_wires, new_output):
+                for in_wire, out_wire in zip(in_col, out_col):
+                    in_wire.connect_to(out_wire)
+            c.stages[-1].output_wires = new_output
+            output_wires = new_output
+
+        input_wires.set_to_module_input()
+        input_wires.name = "in"
+        if c.stages[0].gates:
+            c.stages[0].input_wires_complementary.set_to_module_input()
+            c.stages[0].input_wires_complementary.name = "in_2"
+        output_wires.set_to_module_output()
+        output_wires.name = "out"
+
+        c.io = self.get_all_io(c)
+
+    def get_all_io(self, c: Compressor):
+        finder = IOFinder()
+        c.accept(finder)
+        return list(set(finder.io))
+
+
+class IOFinder(NodeIterator):
+    def iter_compressor(self, c: Compressor):
+        self.connectables = []
+
+    @property
+    def io(self):
+        return [el for el in self.connectables if el.prefix]
+
+    def iter_wire(self, w: Wire):
+        self.connectables.append(w)
+
+    def iter_logic(self, lgc: Logic):
+        self.connectables.append(lgc)
+
+    def iter_bitmatrix(self, b: Bitmatrix):
+        self.connectables.append(b)
diff --git a/src/finn/compressor/src/passes/lut_placer.py b/src/finn/compressor/src/passes/lut_placer.py
new file mode 100644
index 0000000000..68c13bd4b7
--- /dev/null
+++ b/src/finn/compressor/src/passes/lut_placer.py
@@ -0,0 +1,73 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    RLOC placement annotation for compressor LUTs
+#############################################################################
+
+from ..graph.final_adder import FinalAdder
+from ..graph.nodes import Compressor, Counter, GateAbsorptionCounter
+from ..graph.primitives import LUT6CY
+from .node_iterator import NodeIterator
+
+
+class LUTPlacer(NodeIterator):
+    def iter_compressor(self, c: Compressor):
+        self.occupations = []  # Reset placement state for every compressor
+
+    def iter_counter(self, c: Counter):
+        cascades = self._get_ripple_connected_luts(c)
+        self._calculate_and_annotate_placements(cascades)
+
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_counter(g)
+
+    def _get_ripple_connected_luts(self, c: Counter):
+        "Among all LUTs inside a counter, reconstruct all ripple connections."
+        if isinstance(c, FinalAdder):
+            # No manual placement needed, as final adders use the LOOKAHEAD8,
+            # which restricts enforces correct placement itself.
+            return []
+
+        lut6cy_i4s = {lut.I4: lut for lut in c.luts if isinstance(lut, LUT6CY)}
+        lut6cy_o52s = {lut.O52: lut for lut in c.luts if isinstance(lut, LUT6CY)}
+
+        lut_output_to_lut_input = {}
+
+        for input, input_lut in lut6cy_i4s.items():
+            if input.source in lut6cy_o52s:
+                target_lut = lut6cy_o52s[input.source]
+                lut_output_to_lut_input[input_lut] = target_lut
+
+        lut_heads = set(lut_output_to_lut_input.keys()) - set(lut_output_to_lut_input.values())
+        chains = []
+
+        for lut_head in lut_heads:
+            cur = [lut_head]
+            while el := lut_output_to_lut_input.get(cur[-1]):
+                cur.append(el)
+            chains.append(cur[::-1])
+
+        return chains
+
+    def _calculate_and_annotate_placements(self, cascades):
+        for cascade in cascades:
+            for idx, slice_util in enumerate(self.occupations):
+                if len(cascade) + slice_util <= 8:
+                    self._annotate_placements(cascade, idx, self.occupations[idx])
+                    self.occupations[idx] += len(cascade)
+                    break
+            else:
+                self.occupations.append(len(cascade))
+                self._annotate_placements(cascade, len(self.occupations) - 1, 0)
+
+    def _annotate_placements(self, cascade, hu_set, start_idx):
+        assert start_idx + len(cascade) <= 8
+        for i, lut in enumerate(cascade):
+            bel_str = f"{chr(ord('A')+start_idx+i)}5LUT"
+            lut.annotate(f'HU_SET = "hu_set_{hu_set}"')
+            lut.annotate(f'RLOC = "X0Y{hu_set}"')  # Increment Y per SLICE to avoid conflicts
+            lut.annotate(f'BEL = "{bel_str}"')
+            lut.annotate('DONT_TOUCH = "yes"')
+            lut.annotate('IS_BEL_FIXED = "yes"')
diff --git a/src/finn/compressor/src/passes/node_iterator.py b/src/finn/compressor/src/passes/node_iterator.py
new file mode 100644
index 0000000000..919565afe2
--- /dev/null
+++ b/src/finn/compressor/src/passes/node_iterator.py
@@ -0,0 +1,156 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Node iterator pass for compressor graph traversal
+#############################################################################
+
+from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    Bitmatrix,
+    Blackbox,
+    BlackboxInput,
+    BlackboxInputVec,
+    BlackboxOutput,
+    BlackboxOutputVec,
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    Logic,
+    PipelineStage,
+    Wire,
+)
+from ..graph.primitives import LOOKAHEAD8
+from ..graph.visitor import Visitor
+
+
+class NodeIterator(Visitor):
+    def visit_compressor(self, c: Compressor):
+        self.iter_compressor(c)
+        [s.accept(self) for s in c.stages]
+
+    def visit_input_stage(self, s: InputStage):
+        self.iter_input_stage(s)
+        s.input_wires.accept(self)
+        if s.gates:
+            s.input_wires_complementary.accept(self)
+        s.output_wires.accept(self)
+
+    def visit_pipeline_stage(self, s: PipelineStage):
+        self.iter_pipeline_stage(s)
+        s.input_wires.accept(self)
+        s.output_wires.accept(self)
+        [el.accept(self) for el in s.instances]
+
+    def visit_compression_stage(self, s: CompressionStage):
+        self.iter_compression_stage(s)
+        s.input_wires.accept(self)
+        s.output_wires.accept(self)
+        [c.accept(self) for c, _ in s.counters_with_shifts]
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        self.iter_accumulator_stage(a)
+        a.input_wires.accept(self)
+        a.output_wires.accept(self)
+        [c.accept(self) for c in a.instances]
+
+    def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        self.iter_gate_absorbed_stage(g)
+        g.input_wires.accept(self)
+        g.input_wires_complementary.accept(self)
+        g.output_wires.accept(self)
+        [c.accept(self) for c, _ in g.counters_with_shifts]
+
+    def visit_counter(self, c: Counter):
+        self.iter_counter(c)
+        [el.accept(self) for col in c.input_wires for el in col]
+        [el.accept(self) for col in c.output_wires for el in col]
+        [el.accept(self) for el in c.instances]
+
+    def visit_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_gate_absorption_counter(g)
+        [el.accept(self) for col in g.input_wires for el in col]
+        [el.accept(self) for col in g.input_wires_complementary for el in col]
+        [el.accept(self) for col in g.output_wires for el in col]
+        [el.accept(self) for el in g.instances]
+
+    def visit_blackbox(self, b: Blackbox):
+        self.iter_blackbox(b)
+        [p.accept(self) for p in b.in_ports + b.out_ports]
+
+    def visit_blackbox_input(self, b: BlackboxInput):
+        self.iter_blackbox_input(b)  # invoke the hook; a bare attribute access is a no-op
+
+    def visit_blackbox_output(self, b: BlackboxOutput):
+        self.iter_blackbox_output(b)  # invoke the hook; a bare attribute access is a no-op
+
+    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
+        self.iter_blackbox_input_vec(b)  # invoke the hook; a bare attribute access is a no-op
+
+    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
+        self.iter_blackbox_output_vec(b)  # invoke the hook; a bare attribute access is a no-op
+
+    def visit_lookahead8(self, l8: LOOKAHEAD8):
+        self.iter_lookahead8(l8)
+        self.visit_blackbox(l8)
+
+    def visit_wire(self, w: Wire):
+        self.iter_wire(w)
+
+    def visit_logic(self, lgc: Logic):
+        self.iter_logic(lgc)
+
+    def visit_bitmatrix(self, b: Bitmatrix):
+        self.iter_bitmatrix(b)
+
+    def iter_compressor(self, c: Compressor):
+        pass
+
+    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage):
+        pass
+
+    def iter_input_stage(self, s: InputStage):
+        pass
+
+    def iter_accumulator_stage(self, a: AccumulatorStage):
+        pass
+
+    def iter_pipeline_stage(self, s: PipelineStage):
+        pass
+
+    def iter_compression_stage(self, s: CompressionStage):
+        pass
+
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        pass
+
+    def iter_counter(self, c: Counter):
+        pass
+
+    def iter_blackbox(self, b: Blackbox):
+        pass
+
+    def iter_wire(self, w: Wire):
+        pass
+
+    def iter_logic(self, lgc: Logic):
+        pass
+
+    def iter_bitmatrix(self, b: Bitmatrix):
+        pass
+
+    def iter_blackbox_input(self, b: BlackboxInput):
+        pass
+
+    def iter_blackbox_output(self, b: BlackboxOutput):
+        pass
+
+    def iter_blackbox_input_vec(self, b: BlackboxInputVec):
+        pass
+
+    def iter_blackbox_output_vec(self, b: BlackboxOutputVec):
+        pass
diff --git a/src/finn/compressor/src/passes/printer.py b/src/finn/compressor/src/passes/printer.py
new file mode 100644
index 0000000000..a2386fbb72
--- /dev/null
+++ b/src/finn/compressor/src/passes/printer.py
@@ -0,0 +1,61 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tree printer for debugging
+#############################################################################
+
+from ..graph.accumulator import AccumulatorStage
+from ..graph.nodes import (
+    CompressionStage,
+    Compressor,
+    Counter,
+    GateAbsorbedStage,
+    GateAbsorptionCounter,
+    InputStage,
+    PipelineStage,
+)
+from ..graph.visitor import Visitor
+
+
+class CompressorPrinter(Visitor):
+    def visit_compressor(self, c: Compressor):
+        print(f"Compressor <Input: {c.input_shape}, Output: {c.output_shape}> [")
+        for stage in c.stages:
+            stage.accept(self)
+        print("]")
+
+    def visit_compression_stage(self, s: CompressionStage):
+        print(f"\tStage: <in: {s.input_shape}, out: {s.output_shape}> [")
+        for counter, shift in s.counters_with_shifts:
+            print(f"\t\t[xshift={shift:2}] ", end="")
+            counter.accept(self)
+        print("\t]")
+
+    def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
+        print(f"\tStage with Gate Absorption: <in {s.input_shape}, " f"out: {s.output_shape}> [")
+        for counter, shift in s.counters_with_shifts:
+            print(f"\t\t[xshift={shift:2}] ", end="")
+            counter.accept(self)
+        print("\t]")
+
+    def visit_input_stage(self, i: InputStage):
+        print(f"\tInput Stage: <{i.input_shape}>")
+
+    def visit_pipeline_stage(self, p: PipelineStage):
+        print(f"\tPipeline Stage: <{p.input_shape}>")
+
+    def visit_counter(self, c: Counter):
+        print(f"{c.__class__.__name__} <in: {c.input_shape}, out: {c.output_shape}>")
+
+    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
+        self.visit_counter(c)
+
+    def visit_accumulator_stage(self, a: AccumulatorStage):
+        print(f"\tAccumulator: <in: {a.input_shape}, out: {a.output_shape}> [")
+        print("\t\t", end="")
+        for i in a.instances:
+            if isinstance(i, Counter):
+                i.accept(self)
+        print("\t]")
diff --git a/src/finn/compressor/src/passes/wire_inserter.py b/src/finn/compressor/src/passes/wire_inserter.py
new file mode 100644
index 0000000000..b80192f8c5
--- /dev/null
+++ b/src/finn/compressor/src/passes/wire_inserter.py
@@ -0,0 +1,41 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Wire insertion pass for compressor graph
+#############################################################################
+
+from ..graph.nodes import Blackbox, Counter, GateAbsorptionCounter, Wire
+from .node_iterator import NodeIterator
+
+
+# Blackbox outputs might be connected to other blackbox inputs.
+# To express this in Verilog, an extra intermediate wire has to
+# be created between the blackboxes. This pass adds it.
+class WireInserter(NodeIterator):
+    def iter_counter(self, c: Counter):
+        bboxes = [el for el in c.instances if isinstance(el, Blackbox)]
+        for bbox in bboxes:
+            for output in bbox.out_ports:
+                self.insert_wire_at_blackbox_output(output, c)
+
+    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
+        self.iter_counter(g)
+
+    def insert_wire_at_blackbox_output(self, output, counter):
+        if hasattr(output, "elements"):
+            for el in output.elements:
+                self.insert_wire_at_blackbox_output(el, counter)
+            return
+
+        if len(output.target) == 1 and isinstance(output.target[0], Wire):
+            output.target = output.target[0]
+            return
+
+        out_wire = Wire()
+        for input in output.target:
+            out_wire.connect_to(input)
+
+        output.target = out_wire
+        counter.instances.append(out_wire)
diff --git a/src/finn/compressor/src/target.py b/src/finn/compressor/src/target.py
new file mode 100644
index 0000000000..32f1d45818
--- /dev/null
+++ b/src/finn/compressor/src/target.py
@@ -0,0 +1,125 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    FPGA target definitions and gate absorption counter selection
+#############################################################################
+
+from abc import ABC
+from typing import List
+
+from .graph.counters.absorption_counter_candidates import (
+    GateAbsorptionCounterCandidate,
+    MuxCYPredAdderCandidate,
+    MuxCYRippleSumCandidate,
+    RippleSumPredAdderCandidate,
+    SinglePredCandidate,
+    VersalPredAdderCandidate,
+)
+from .graph.counters.counter_candidates import (
+    CounterCandidate,
+    DualRailRippleSumCandidate,
+    FACandidate,
+    FiveTwoCandidate,
+    MuxCYAtomCascadeCandidate,
+    RippleSumCandidate,
+    SixThreeCandidate,
+    TenSixCandidate,
+    VersalAtomCascadeCandidate,
+)
+from .graph.final_adder import FinalAdder, MuxCYTernaryAdder, QuaternaryAdder
+
+
+def resolve_target(fpgapart):
+    """Select the compressor Target object for a Vivado FPGA part string.
+
+    Versal parts yield Versal(), UltraScale/UltraScale+ parts yield UltraScale(),
+    and anything else falls back to SevenSeries().
+    """
+    head4, head5 = fpgapart[0:4], fpgapart[0:5]
+    # Versal families are recognized by a four- or a five-character prefix.
+    versal4 = head4 in ("xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm")
+    if versal4 or head5 in ("xqrvc", "xcv80"):
+        return Versal()
+    # UltraScale/UltraScale+ prefixes:
+    # Kintex US (xcku), Virtex US (xcvu), Zynq US (xczu), defense (xqzu)
+    if head4 in ("xcku", "xcvu", "xczu", "xqzu"):
+        return UltraScale()
+    return SevenSeries()
+
+
+def resolve_target_name(name):
+    """Translate a CLI target name ('Versal', '7-Series', 'UltraScale') into a Target object."""
+    if name == "Versal":
+        return Versal()
+    if name == "7-Series":
+        return SevenSeries()
+    if name == "UltraScale":
+        return UltraScale()
+    # Any other spelling is rejected together with the list of accepted names.
+    raise ValueError(
+        f"Unsupported target: {name!r}. Choose from: ['Versal', '7-Series', 'UltraScale']"
+    )
+
+
+class Target(ABC):
+    counter_candidates: List[CounterCandidate]
+    final_adder: FinalAdder
+    absorbing_counter_candidates: List[GateAbsorptionCounterCandidate]
+
+
+class Versal(Target):
+    def __init__(self):
+        self.counter_candidates = [
+            TenSixCandidate(),
+            FACandidate(),
+            RippleSumCandidate(),
+            DualRailRippleSumCandidate(),
+            FiveTwoCandidate(),
+            SixThreeCandidate(),
+            VersalAtomCascadeCandidate(),
+        ]
+        self.absorbing_counter_candidates = [
+            VersalPredAdderCandidate(),
+            RippleSumPredAdderCandidate(),
+            SinglePredCandidate(),
+        ]
+        self.final_adder = QuaternaryAdder
+
+
+class SevenSeries(Target):
+    def __init__(self):
+        self.counter_candidates = [
+            FACandidate(),
+            FiveTwoCandidate(),
+            SixThreeCandidate(),
+            MuxCYAtomCascadeCandidate(),
+        ]
+        self.final_adder = MuxCYTernaryAdder
+        self.absorbing_counter_candidates = [
+            MuxCYPredAdderCandidate(),
+            MuxCYRippleSumCandidate(),
+            SinglePredCandidate(),
+        ]
+
+
+class UltraScale(Target):
+    """UltraScale/UltraScale+ - reuses 7-Series primitives.
+
+    Vivado maps CARRY4 to CARRY8 transparently.
+    """
+
+    def __init__(self):
+        self.counter_candidates = [
+            FACandidate(),
+            FiveTwoCandidate(),
+            SixThreeCandidate(),
+            MuxCYAtomCascadeCandidate(),
+        ]
+        self.final_adder = MuxCYTernaryAdder
+        self.absorbing_counter_candidates = [
+            MuxCYPredAdderCandidate(),
+            MuxCYRippleSumCandidate(),
+            SinglePredCandidate(),
+        ]
diff --git a/src/finn/compressor/src/tests/__init__.py b/src/finn/compressor/src/tests/__init__.py
new file mode 100644
index 0000000000..b6d457fd32
--- /dev/null
+++ b/src/finn/compressor/src/tests/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor tests package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/tests/test_gen.py b/src/finn/compressor/src/tests/test_gen.py
new file mode 100644
index 0000000000..a07e19e7b1
--- /dev/null
+++ b/src/finn/compressor/src/tests/test_gen.py
@@ -0,0 +1,164 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Test vector generation for compressor verification
+#############################################################################
+
+from itertools import accumulate
+from typing import List
+
+from ..utils.shape import Shape
+
+
+def compressed_width(shape):
+    upper_bound = sum(height * (1 << weight) for weight, height in enumerate(shape))
+    return upper_bound.bit_length()  # bits needed for the sum with every input bit set
+
+
+def flatten_gates(gates: List[List[str]]) -> List[str]:
+    return [gate for column in gates for gate in column]  # column by column, order kept
+
+
+def generate_test(
+    shape: Shape,
+    module_name: str,
+    pipeline_stages: int,
+    gates: List[List[str]],
+    accumulation: bool,
+    accumulator_width: int,
+    constant: int,
+):
+    """Build the SystemVerilog source of a randomized, self-checking testbench module `tb`."""
+    assert isinstance(pipeline_stages, int)
+    if gates:
+        gates = flatten_gates(gates)  # per-column lists -> one flat list indexed below
+    # The stimulus loop below waits on @(posedge clk) for every DUT, so the testbench clock is
+    # always required. (Previously this was truthy only by accident via itertools.accumulate.)
+    has_clk = True
+
+    accumulated_signature = list(accumulate(shape))
+    addends = []
+    for j, col in enumerate(accumulated_signature):
+        for i in range(shape[j]):
+            addends.append(f"\t\tin_reduced += arr_in[{col+i-shape[j]}] << {j};")
+    addends = "\n".join(addends)
+
+    if gates:
+        preds = "".join(
+            [f"\tlocalparam pred_{idx} = 4'h{gate};\n" for idx, gate in enumerate(gates)]
+        )
+        selects = "".join([f"\tlogic [3:0] sel_{idx};\n" for idx, _ in enumerate(gates)])
+        arr_ins = "".join(
+            [
+                f"\t\tsel_{i} = (arr_in_b[{i}]<<1) | arr_in_a[{i}];\n"
+                + f"\t\tarr_in[{i}] = pred_{i}[sel_{i}];\n"
+                for i, _ in enumerate(gates)
+            ]
+        )
+        gates_decl = (
+            f"\tlogic [{sum(shape)-1}:0] arr_in_a;" + f"\tlogic [{sum(shape)-1}:0] arr_in_b;"
+        )
+    accumulator_width = accumulator_width if accumulator_width else compressed_width(shape)
+    acc_decl = f"\tlogic [{accumulator_width-1}:0] acc_base;"
+
+    acc_rst_block = """\t\t\tif (reset == 0) begin
+\t\t\t\tacc_base = 0;
+\t\t\tend else begin
+\t\t\t\tacc_base = reference[0];
+\t\t\tend"""
+
+    return (
+        f"""module tb;
+{gates_decl if gates else ""}
+\tlogic [{sum(shape)-1}:0] arr_in;
+\tlogic [{compressed_width(shape)-1}:0] in_reduced;
+\tlogic [{accumulator_width-1}:0] out;
+\tlogic [{accumulator_width-1}:0] reference [{pipeline_stages}:0];
+{acc_decl if accumulation else ""}
+\t{"logic [4:0] reset;" if accumulation else ""}
+\t{"logic rst;" if accumulation else ""}
+\t{"logic clk = 0;" if has_clk else ""}
+\t{"logic en = 1;" if accumulation else ""}
+
+{preds if gates else ""}
+{selects if gates else ""}
+\talways_comb begin;
+{arr_ins if gates else ""}
+\tend
+
+\t{"always #10ns clk = !clk;" if has_clk else ""}
+
+\talways_comb begin
+\t\t{"reference[0] = acc_base + in_reduced;"
+     if accumulation else "reference[0] = in_reduced;"}
+\tend
+
+\talways_comb begin
+\t\tin_reduced = 0;
+\t\t{"if (en) begin" if accumulation else ""}
+in_reduced += {constant};
+{addends}
+\t\t{"end" if accumulation else ""}
+\tend
+
+\tinitial begin
+\t\t{"acc_base = 0;" if accumulation else ""}
+\t\t{"arr_in_a = 0;" if gates else "arr_in = 0;"}
+\t\t{"arr_in_b = 0;" if gates else ""}
+
+\t\t{"assign rst = reset == 0;" if accumulation else ""}
+\t\t{"reset = 0; #40ns;" if accumulation else ""}
+
+\t\tfor (int i = 0; i < 16000; i += 1) begin
+\t\t\t{"automatic type(reset) xx;" if accumulation else ""}
+\t\t\t{"automatic type(en) zz;" if accumulation else ""}
+
+\t\t\t{"automatic type(arr_in_a) yy;" if gates else "automatic type(arr_in) yy;"}
+\t\t\t{"automatic type(arr_in_b) yz;" if gates else ""}
+
+\t\t\t{"void'(std::randomize(xx));" if accumulation else ""}
+\t\t\t{"reset = xx; " if accumulation else ""}
+\t\t\t{"void'(std::randomize(zz));" if accumulation else ""}
+\t\t\t{"en = zz;" if accumulation else ""}
+
+\t\t\tif (i < 5) yy = 0;
+\t\t\telse if (i < 10) yy = '1;
+\t\t\telse void'(std::randomize(yy));
+\t\t\t{"arr_in_a = yy;" if gates else "arr_in = yy;"}
+
+\t\t\t{"if (i < 5) yz = 0;" if gates else ""}
+\t\t\t{"else if (i < 10) yz = '1;" if gates else ""}
+\t\t\t{"else void'(std::randomize(yz));" if gates else ""}
+\t\t\t{"arr_in_b = yz;" if gates else ""}
+
+\t\t\t@(posedge clk);
+\t\t\tfor (int i = 1; i <= {pipeline_stages}; ++i) begin
+\t\t\t\treference[i] <= reference[i-1];
+\t\t\tend
+
+{acc_rst_block if accumulation else ""}
+\t\t\t#1ns;
+\t\t\tif(^reference[{pipeline_stages}] !== 1'bX) begin
+\t\t\t\tassert(reference[{pipeline_stages}] === out) else begin
+\t\t\t\t\t$error("Mismatch: Ref[%0b] != Out[%0b]", reference[{pipeline_stages}], out);
+\t\t\t\t\t#2ns;
+\t\t\t\t\t$stop;
+\t\t\t\tend
+\t\t\tend
+\t\t#0.01ns;
+
+\t\tend
+\t\t$display("TEST PASSED");
+\t\t$finish();
+\tend
+
+\t{module_name} dut(
+    {".clk(clk)," if pipeline_stages or accumulation else ""}
+    {".rst(rst)," if accumulation else ""}
+    {".in(arr_in_a), .in_2(arr_in_b)," if gates else ".in(arr_in),"}
+    {".en_neg(!en)," if accumulation else ""}
+    .out(out));
+endmodule
+"""
+    ).replace("\n\n", "\n")
diff --git a/src/finn/compressor/src/tests/tester.py b/src/finn/compressor/src/tests/tester.py
new file mode 100644
index 0000000000..5b74cdd789
--- /dev/null
+++ b/src/finn/compressor/src/tests/tester.py
@@ -0,0 +1,43 @@
+#############################################################################
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#############################################################################
+
+"""Vivado XSim wrapper for testing generated compressors."""
+
+import re
+import subprocess
+
+
+def tester(test_loc, comp_loc):
+    """Run Vivado XSim simulation to test a compressor.
+
+    Args:
+        test_loc: Path to testbench SystemVerilog file
+        comp_loc: Path to compressor SystemVerilog file
+    """
+    args = (
+        f"""rm -rf xsim.dir/ &&
+        xvlog -work work -sv ../res/glbl.v {test_loc} {comp_loc} -L unisims_ver --nolog &&
+        xelab -L work -L unisims_ver -relax --nolog glbl tb &&
+        xsim --nolog work.glbl#work.tb -R"""
+    ).replace("\n", " ")
+    print(args)
+    try:
+        ret = subprocess.run(
+            args, capture_output=True, text=True, timeout=300, shell=True, check=True
+        )
+    except subprocess.CalledProcessError as e:
+        if e.returncode == 127:  # the shell's "command not found" exit status
+            raise RuntimeError("Could not call Vivado simulation tools. Did you source Vivado?")
+        raise RuntimeError("Something failed during simulation.")
+    if "$finish called at time" in ret.stdout:
+        print("Simulation SUCCESS!")
+    else:
+        print("ERROR in Compressor Simulation!")
+        found = re.findall("Error:.*\n.*\n", ret.stdout)  # may be empty: no "Error:" line
+        error = found[0].split("\n") if found else ["no 'Error:' line in simulator output", ""]
+        print(f">> {error[0]}\n>> {error[1]}")
+        exit(-2)
diff --git a/src/finn/compressor/src/utils/__init__.py b/src/finn/compressor/src/utils/__init__.py
new file mode 100644
index 0000000000..a5d76914d9
--- /dev/null
+++ b/src/finn/compressor/src/utils/__init__.py
@@ -0,0 +1,8 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Compressor utilities package initialization
+# @author    Simon Gerber <simon.gerber@amd.com>
+#############################################################################
diff --git a/src/finn/compressor/src/utils/mul_comp_map.py b/src/finn/compressor/src/utils/mul_comp_map.py
new file mode 100644
index 0000000000..1cd044ea29
--- /dev/null
+++ b/src/finn/compressor/src/utils/mul_comp_map.py
@@ -0,0 +1,61 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Multiplier-to-compressor input mapping utilities
+#############################################################################
+
+
+class MulCompMap:
+    def __init__(self, na: int, nb: int, sa: bool, sb: bool):
+        """Store the sizes na, nb and the flags sa, sb (presumably operand signedness - confirm)."""
+        self.na = na
+        self.nb = nb
+        self.sa = sa
+        self.sb = sb
+
+    def columns(self):
+        """Return the column count; equals len(self.shape()) whenever shape() succeeds."""
+        return 1 if self.na == 1 and self.nb == 1 else self.nb + self.na - (not self.sb or self.sa)
+
+    def shape(self):
+        """Return the per-column lists of 4-bit codes (8 by default; presumably gate tables)."""
+        (na, nb, sa, sb) = (self.na, self.nb, self.sa, self.sb)
+        res = []
+        if na == 1 and nb == 1:
+            res.append([7 if sa ^ sb else 8])
+        else:
+            col = 0
+            # NOTE(review): the heights below look correct only for na >= nb - confirm with callers.
+            # Ascending right triangle: column heights 1 .. nb
+            while col < nb:
+                col += 1
+                res.append([8] * col)
+            # Central rectangle: columns of full height nb
+            while col < na:
+                col += 1
+                res.append([8] * nb)
+            # Descending left triangle: column heights shrink towards 1
+            while col < nb + na - 1:
+                col += 1
+                res.append([8] * (nb + na - col))
+            # Patch in sign handling
+            if sa:
+                for col in range(na - 1, na + nb - 1):
+                    res[col][0] = ~res[col][0] & 15  # 4-bit complement of the first entry
+            if sb:
+                res[nb].insert(0, 2)
+                for col in range(nb, nb + na - 1):
+                    op = res[col][-1]
+                    res[col][-1] = ((op & 3) << 2) | ((op >> 2) & 3)  # swap 2-bit halves
+                if not sa:
+                    res.append([13])  # one extra top column, only for sb without sa
+        return res
+
+    def absolute_term(self):
+        """Return an integer constant term (presumably the sign-correction offset for shape())."""
+        (na, nb, sa, sb) = (self.na, self.nb, self.sa, self.sb)
+        return (
+            (-1 if sa ^ sb else 0) if na == 1 and nb == 1 else ((-(sa | sb) << nb) | sa) << (na - 1)
+        )
diff --git a/src/finn/compressor/src/utils/shape.py b/src/finn/compressor/src/utils/shape.py
new file mode 100644
index 0000000000..266c6260d1
--- /dev/null
+++ b/src/finn/compressor/src/utils/shape.py
@@ -0,0 +1,57 @@
+#############################################################################
+# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# @brief    Shape representation for compressor bit matrices
+#############################################################################
+
+from itertools import zip_longest
+from typing import Tuple
+
+
+class Shape:
+    """Per-column heights of a bit matrix, held in the tuple `t` (index 0 first)."""
+
+    def __init__(self, t: Tuple[int] = ()):
+        self.t = tuple(t)
+
+    def __len__(self):
+        return len(self.t)
+
+    def __iter__(self):
+        return iter(self.t)
+
+    def __getitem__(self, val):
+        # Integer indices at or past the end read as an empty column.
+        if isinstance(val, int) and val >= len(self.t):
+            return 0
+        picked = self.t[val]
+        # Slices come back as tuples and are wrapped into a Shape again.
+        return picked if isinstance(picked, int) else Shape(picked)
+
+    def __lshift__(self, val):
+        # Prepend `val` empty columns.
+        return Shape([0] * val + list(self.t))
+
+    def __add__(self, val):
+        return self.__binary_arithmetic_operation(val, lambda lhs, rhs: lhs + rhs)
+
+    def __sub__(self, val):
+        return self.__binary_arithmetic_operation(val, lambda lhs, rhs: lhs - rhs)
+
+    def __binary_arithmetic_operation(self, val, op):
+        # An int is applied to every column; a Shape is combined column-wise, zero-padded.
+        if isinstance(val, int):
+            return Shape([op(height, val) for height in self.t])
+        if not isinstance(val, Shape):
+            raise RuntimeError("Unsupported type.")
+        return Shape([op(a, b) for a, b in zip_longest(self.t, val.t, fillvalue=0)])
+
+    def __repr__(self):
+        return f"Shape {self.t[::-1]}"
+
+    def __eq__(self, other):
+        # The shorter operand is zero-padded, so trailing empty columns do not matter.
+        mismatches = (a != b for a, b in zip_longest(self, other, fillvalue=0))
+        return not any(mismatches)
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index 9cd6fc2a9d..3b16688e79 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -29,6 +29,7 @@
 import numpy as np
 import os
 
+from finn.compressor import generate_add_multi_comps, generate_dotp_comp
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
 from finn.util.basic import get_dsp_block, is_versal
@@ -51,6 +52,15 @@ def get_nodeattr_types(self):
         my_attrs = {
             # Double-pumped DSPs enabled
             "pumpedCompute": ("i", False, 0, {0, 1}),
+            # Compressor module name (set by generate_hdl when compressor is used)
+            "comp_module_name": ("s", False, ""),
+            # dotp_comp wrapper module name (set by generate_hdl when dotp compressor is used)
+            "dotp_module_name": ("s", False, ""),
+            # add_multi compressor module names, semicolon-separated
+            "add_multi_comp_names": ("s", False, ""),
+            # add_multi compressor specs for synthesis aggregation
+            # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
+            "add_multi_comp_specs": ("s", False, ""),
         }
         my_attrs.update(MVAU.get_nodeattr_types(self))
         my_attrs.update(RTLBackend.get_nodeattr_types(self))
@@ -160,22 +170,58 @@ def dsp_estimation(self, fpgapart):
             mult_dsp = np.ceil(P / 4) * Q
         return int(mult_dsp)
 
-    def instantiate_ip(self, cmd):
-        # instantiate the RTL IP
-        node_name = self.onnx_node.name
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        sourcefiles = [
+    def _get_rtl_source_files(self, abspath=True):
+        """
+        RTL source files of this node incl. generated compressor files; shared by
+        instantiate_ip() and get_rtl_file_list(). abspath=False drops the directory
+        prefixes, except for mul_comp_map.sv, which is always located via FINN_ROOT.
+        """
+        if abspath:
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+        else:
+            code_gen_dir = ""
+            rtllib_dir = ""
+
+        base_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
-            "add_multi.sv",
         ]
         sourcefiles = [
             os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in sourcefiles]
+        ] + [rtllib_dir + f for f in base_files]
+
+        # dotp_comp path: taken when the comp_module_name attribute is non-empty
+        comp_name = self.get_nodeattr("comp_module_name")
+        if comp_name:
+            comp_hdl_dir = os.path.join(os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
+            dotp_module_name = self.get_nodeattr("dotp_module_name")
+            sourcefiles.append(os.path.join(code_gen_dir, f"{dotp_module_name}.sv"))
+            sourcefiles.append(os.path.join(comp_hdl_dir, "mul_comp_map.sv"))
+            sourcefiles.append(os.path.join(code_gen_dir, comp_name + ".sv"))
+            # Use local mvu_vvu_axi.sv with substituted $DOTP_MODULE_NAME$
+            sourcefiles.append(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"))
+            # dotp_comp path doesn't need add_multi.sv
+        else:
+            # DSP path: local mvu_vvu_axi.sv copy (generate_hdl substitutes a dummy module name)
+            sourcefiles.append(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"))
+            # add_multi.sv always exists in code_gen_dir
+            # (either patched with comps or copy of template)
+            sourcefiles.append(os.path.join(code_gen_dir, "add_multi.sv"))
+            add_multi_names_str = self.get_nodeattr("add_multi_comp_names")
+            if add_multi_names_str:
+                # One generated <name>.sv per semicolon-separated compressor module name
+                for name in add_multi_names_str.split(";"):
+                    sourcefiles.append(os.path.join(code_gen_dir, name + ".sv"))
+
+        return sourcefiles
+
+    def instantiate_ip(self, cmd):
+        # instantiate the RTL IP
+        node_name = self.onnx_node.name
+        sourcefiles = self._get_rtl_source_files(abspath=True)
 
         for f in sourcefiles:
             cmd.append("add_files -norecurse %s" % (f))
@@ -268,6 +314,20 @@ def _resolve_dsp_version(self, dsp_block):
             case _:
                 return 1
 
+    def _is_dotp_comp_eligible(self, fpgapart, ww, aw, pumped_compute):
+        """
+        Decide whether the LUT-based compressor replaces the DSP compute path.
+        True only when not pumped and both operands are small (WW <= 4 and
+        AW <= 4). fpgapart is accepted but not inspected by this check.
+
+        All FPGA families are supported via resolve_target() in the compressor:
+        - Versal: LUT6 + LOOKAHEAD8 primitives
+        - UltraScale+: LUT6_2 + CARRY4 (Vivado maps to CARRY8)
+        - 7-Series: LUT6_2 + CARRY4
+
+        """
+        return not pumped_compute and ww <= 4 and aw <= 4
+
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -286,6 +346,56 @@ def generate_hdl(self, model, fpgapart, clk):
             else 1
         )
         code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights)
+
+        # Extract params from code_gen_dict for compressor generation.
+        simd = int(code_gen_dict["$SIMD$"][0])
+        ww = int(code_gen_dict["$WEIGHT_WIDTH$"][0])
+        aw = int(code_gen_dict["$ACTIVATION_WIDTH$"][0])
+        accu_width = int(code_gen_dict["$ACCU_WIDTH$"][0])
+        signed_act = int(code_gen_dict["$SIGNED_ACTIVATIONS$"][0]) != 0
+        pumped_compute = int(code_gen_dict["$PUMPED_COMPUTE$"][0])
+        version = int(code_gen_dict["$VERSION$"][0])
+
+        # Compressor generation if applicable. The attributes of the path not taken are
+        # cleared so that a re-run cannot leave stale names for _get_rtl_source_files().
+        if self._is_dotp_comp_eligible(fpgapart, ww, aw, pumped_compute):
+            result = generate_dotp_comp(
+                fpgapart, simd, ww, aw, accu_width, signed_act, code_gen_dir
+            )
+            code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(result["comp_delay"])]
+            code_gen_dict["$USE_COMPRESSOR$"] = [str(1)]
+            comp_name = result["comp_name"]
+            dotp_attr = subst_name = result["dotp_module_name"]
+            add_multi_names, add_multi_specs = [], []
+        else:
+            # DSP path: Generate add_multi.sv with compressors
+            result = generate_add_multi_comps(
+                fpgapart, version, simd, ww, aw, accu_width, narrow_weights, code_gen_dir
+            )
+            comp_name, dotp_attr = "", ""
+            # Dummy name: not instantiated since USE_COMPRESSOR=0, but Vivado parses
+            # the entire file, so the placeholder still has to be substituted.
+            subst_name = "dotp_comp"
+            add_multi_names = result["comp_names"] or []
+            add_multi_specs = result.get("comp_specs", []) if add_multi_names else []
+        self.set_nodeattr("comp_module_name", comp_name)
+        self.set_nodeattr("dotp_module_name", dotp_attr)
+        self.set_nodeattr("add_multi_comp_names", ";".join(add_multi_names))
+        # Store compressor specs for synthesis aggregation
+        # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0"
+        specs_str = ";".join(f"{n},{w},{d}" for n, w, d in add_multi_specs)
+        self.set_nodeattr("add_multi_comp_specs", specs_str)
+
+        # Copy mvu_vvu_axi.sv into code_gen_dir with $DOTP_MODULE_NAME$ substituted
+        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+        with open(os.path.join(rtllib_dir, "mvu_vvu_axi.sv"), "r") as f:
+            mvu_vvu_axi_content = f.read()
+        mvu_vvu_axi_content = mvu_vvu_axi_content.replace(
+            "$DOTP_MODULE_NAME$", subst_name
+        )
+        with open(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"), "w") as f:
+            f.write(mvu_vvu_axi_content)
+
         # add general parameters to dictionary
         code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
@@ -351,30 +461,13 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(1)]
+        code_gen_dict["$USE_COMPRESSOR$"] = [str(0)]
 
         return template_path, code_gen_dict
 
     def get_rtl_file_list(self, abspath=False):
-        if abspath:
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
-            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        else:
-            code_gen_dir = ""
-            rtllib_dir = ""
-
-        verilog_files = [
-            "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
-            "replay_buffer.sv",
-            "mvu.sv",
-            "mvu_vvu_8sx9_dsp58.sv",
-            "add_multi.sv",
-        ]
-        verilog_files = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in verilog_files]
-
-        return verilog_files
+        return self._get_rtl_source_files(abspath=abspath)
 
     def get_verilog_paths(self):
         verilog_paths = super().get_verilog_paths()
diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
index 7ef9d9c9b5..cfcbbad8bd 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
@@ -149,17 +149,17 @@ def instantiate_ip(self, cmd):
         node_name = self.onnx_node.name
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-        sourcefiles = [
+        rtllib_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
             "add_multi.sv",
         ]
         sourcefiles = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in sourcefiles]
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+            os.path.join(code_gen_dir, "mvu_vvu_axi.sv"),  # Local copy with substituted placeholder
+        ] + [rtllib_dir + _ for _ in rtllib_files]
 
         for f in sourcefiles:
             cmd.append("add_files -norecurse %s" % (f))
@@ -219,6 +219,17 @@ def generate_hdl(self, model, fpgapart, clk):
         ) as f:
             f.write(template_wrapper)
 
+        # Copy mvu_vvu_axi.sv and substitute placeholder with dummy name
+        # (not used since USE_COMPRESSOR=0, but Vivado parses entire file)
+        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+        with open(os.path.join(rtllib_dir, "mvu_vvu_axi.sv"), "r") as f:
+            mvu_vvu_axi_content = f.read()
+        mvu_vvu_axi_content = mvu_vvu_axi_content.replace(
+            "$DOTP_MODULE_NAME$", "dotp_comp"  # Dummy name, won't be instantiated
+        )
+        with open(os.path.join(code_gen_dir, "mvu_vvu_axi.sv"), "w") as f:
+            f.write(mvu_vvu_axi_content)
+
         if self.get_nodeattr("mem_mode") == "internal_decoupled":
             if self.get_nodeattr("ram_style") == "ultra" and not is_versal(fpgapart):
                 runtime_writeable = self.get_nodeattr("runtime_writeable_weights")
@@ -284,6 +295,8 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(1)]
+        code_gen_dict["$USE_COMPRESSOR$"] = [str(0)]
 
         return template_path, code_gen_dict
 
@@ -295,17 +308,17 @@ def get_rtl_file_list(self, abspath=False):
             code_gen_dir = ""
             rtllib_dir = ""
 
-        verilog_files = [
+        rtllib_files = [
             "mvu_pkg.sv",
-            "mvu_vvu_axi.sv",
             "replay_buffer.sv",
             "mvu.sv",
             "mvu_vvu_8sx9_dsp58.sv",
             "add_multi.sv",
         ]
         verilog_files = [
-            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
-        ] + [rtllib_dir + _ for _ in verilog_files]
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+            os.path.join(code_gen_dir, "mvu_vvu_axi.sv"),  # Local copy with substituted placeholder
+        ] + [rtllib_dir + _ for _ in rtllib_files]
 
         return verilog_files
 
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index dcd2472e0a..9a072b1f30 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import numpy as np
 import warnings
 from onnx import helper
 from qonnx.custom_op.registry import getCustomOp
@@ -34,7 +33,7 @@
 
 from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants
 from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants
-from finn.util.basic import get_dsp_block, is_versal
+from finn.util.basic import is_versal
 
 
 def _determine_impl_style(node, fpgapart, model):
@@ -55,11 +54,7 @@ def _determine_impl_style(node, fpgapart, model):
             return _dwc_determine_impl_style(node)
         if rtl_variant:
             if optype == "MVAU":
-                idt = node_inst.get_input_datatype(0)
-                wdt = node_inst.get_input_datatype(1)
-                inp_width_fit = idt.bitwidth() >= 4
-                weight_width_fit = wdt.bitwidth() >= 4
-                if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node, fpgapart, model):
+                if _mvu_rtl_possible(node, fpgapart, model):
                     return "rtl"
                 else:
                     return "hls"
@@ -139,8 +134,8 @@ def _determine_impl_style(node, fpgapart, model):
                 return "rtl"
             else:
                 warn_str = """There is no RTL variant for %s. The node will automatically be
-                        set to HLS variant. Please check the bit-widths to be <= 8 and ensure the
-                        thresholds are implemented as standalone layer""" % (
+                        set to HLS variant. Ensure thresholds are implemented as standalone layer,
+                        weights are signed, and bitwidths are >= 2""" % (
                     node.name,
                 )
                 warnings.warn(warn_str)
@@ -233,46 +228,29 @@ def _dwc_determine_impl_style(node):
 
 def _mvu_rtl_possible(n, fpgapart, model):
     # Checks whether RTL-based MVU is supported
-    # Currently, for DSP48 we only support computations up to
-    # 8sx8u (8-bit signed weights x 8-bit (un)signed activations)
-    # and for DSP58 we support up to 8sx9s.
-    # Please note, DSP48E1 does only support narrow range for weights
-    # Next to that, embedded thresholding functionality is not supported
-    # and neither binaryxnormode computation.
+    # RTL MVU uses either DSP blocks (for larger bitwidths)
+    # or LUT-based compressor (2<=WW<=4 && 2<=AW<=4)
+    # Weights must be signed, activations can be unsigned or signed
+    # Embedded thresholding and binaryXnorMode are not supported
     node_inst = getCustomOp(n)
     # first check if no Activation or binary xnor mode and return False
     # immediately if one of them is True
     no_activation = node_inst.get_nodeattr("noActivation") == 0
-    not_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1
-    if no_activation or not_binaryxnor_mode:
+    is_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1
+    if no_activation or is_binaryxnor_mode:
         return False
 
-    # check if weights are signed, if not return False
+    idt = node_inst.get_input_datatype(0)
     wdt = node_inst.get_input_datatype(1)
-    if not wdt.signed():
-        return False
 
-    # check which dsp block is available on fpga
-    dsp_block = get_dsp_block(fpgapart)
-    # check if weights are narrow
-    weights = model.get_initializer(n.input[1])
-    # if dynamic input, set minimum of weights to wdt.min()
-    # otherwise set it to the minimum value in the weight matrix
-    if weights is None:
-        weights_min = wdt.min()
-    else:
-        weights_min = np.min(weights)
-    narrow_weights = False if weights_min == wdt.min() else True
-    # if non narrow weights and only DSP48E1 available return False
-    if not narrow_weights and dsp_block == "DSP48E1":
+    if not wdt.signed():
         return False
 
     # if none of the above constraints have been triggered
     # we now check if input and weight data types are in range
     # we only use rtl mvau if the dtypes are at least 2 bit
-    idt = node_inst.get_input_datatype()
-    inp_width_in_range = 2 <= idt.bitwidth()
-    weight_width_in_range = 2 <= wdt.bitwidth()
+    inp_width_in_range = idt.bitwidth() >= 2
+    weight_width_in_range = wdt.bitwidth() >= 2
 
     return inp_width_in_range and weight_width_in_range
 
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index 2f9436c6b0..23d6fa72bf 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -44,6 +44,44 @@ def is_hls_float_op(node, model):
     return False
 
 
+def generate_unified_add_multi(model, build_dir):
+    """
+    Write build_dir/add_multi.sv: the finn-rtllib template with one CATCH_COMP
+    macro call per distinct (N, W, D) spec found on the MVAU_rtl nodes, placed
+    directly ahead of the FINN_GENERATED_COMP_ENTRIES marker line.
+    Raises RuntimeError if the template lacks that marker.
+    """
+    # The set drops specs that several nodes have in common.
+    unique_specs = set()
+    for nd in model.graph.node:
+        if nd.op_type != "MVAU_rtl":
+            continue
+        attr_val = getCustomOp(nd).get_nodeattr("add_multi_comp_specs")
+        if not attr_val:
+            continue
+        # Attribute format: "N,W,D;N,W,D;..."
+        for item in attr_val.split(";"):
+            n, w, d = map(int, item.split(","))
+            unique_specs.add((n, w, d))
+
+    template_path = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/add_multi.sv")
+    with open(template_path, "r") as src:
+        text = src.read()
+
+    marker = "\t// FINN_GENERATED_COMP_ENTRIES\n"
+    if marker not in text:
+        raise RuntimeError(
+            "FINN_GENERATED_COMP_ENTRIES marker not found in finn-rtllib/mvu/add_multi.sv! "
+            "Template file may have been modified."
+        )
+
+    # Sorted order; every macro call sits on its own tab-indented line.
+    macro_calls = "".join(f"\t`CATCH_COMP({n},{w},{d})\n" for n, w, d in sorted(unique_specs))
+
+    with open(os.path.join(build_dir, "add_multi.sv"), "w") as dst:
+        dst.write(text.replace(marker, macro_calls + marker))
+
+
 class SynthOutOfContext(Transformation):
     """Run out-of-context Vivado synthesis on a stitched IP design."""
 
@@ -68,6 +106,11 @@ def file_to_basename(x):
         for file in all_verilog_srcs:
             if any([file.endswith(x) for x in verilog_extensions]):
                 copy2(file, build_dir)
+
+        # Generate unified add_multi.sv with aggregated CATCH_COMP entries
+        # This overwrites any per-node add_multi.sv files that were copied above
+        generate_unified_add_multi(model, build_dir)
+
         # extract additional tcl commands to set up floating-point ips correctly
         float_ip_tcl = []
         for node in model.graph.node:
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index 4128092df1..3c2cbfed28 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -782,9 +782,6 @@ def test_fpgadataflow_rtl_mvau(
     ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
     ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    # if 7 series, force weights to narrow range
-    if part == "xc7z020clg400-1":
-        W = np.clip(W, wdt.min() + 1, wdt.max())
     model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())