From 3183fdd0bffdd88649cc30068c4d1de58a0fddda Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Tue, 14 Apr 2026 12:14:07 +0100 Subject: [PATCH 01/10] [Feature] Add LUT-based compressor tree generator Port of compressor-python library for efficient low-bitwidth dot product computation using LUT primitives instead of DSP blocks. Architecture: - Counter-based compressor trees - Fused accumulation with constant propagation - Target-specific primitive selection (CARRY4/CARRY8/LOOKAHEAD8) FPGA Support: - Versal: Fully functional - 7-Series: Functional without fused accumulation and gate absorption (not ready for mvau integration) - UltraScale/UltraScale+: Not yet implemented Integration scripts for both dotp_comp and add_multi optimization modes included. Implementation: - Python-based compressor graph construction and optimization - SystemVerilog template expansion for RTL generation - mul_comp_map module for partial product broadcasting This commit adds the generator infrastructure only. Integration with FINN's RTL backend follows in subsequent commits. 
--- src/finn/compressor/Makefile | 17 + src/finn/compressor/README.md | 71 ++ src/finn/compressor/__init__.py | 13 + src/finn/compressor/hdl/dotp_comp_template.sv | 154 ++++ src/finn/compressor/hdl/dotp_template.sv | 66 ++ src/finn/compressor/hdl/mul_comp_map.sv | 239 ++++++ src/finn/compressor/src/__init__.py | 8 + src/finn/compressor/src/add_multi_finn.py | 408 ++++++++++ src/finn/compressor/src/benchmark.py | 61 ++ src/finn/compressor/src/dotp.py | 97 +++ src/finn/compressor/src/dotp_finn.py | 264 +++++++ src/finn/compressor/src/evaluation.py | 253 ++++++ src/finn/compressor/src/graph/__init__.py | 8 + src/finn/compressor/src/graph/accumulator.py | 96 +++ .../compressor/src/graph/counters/__init__.py | 8 + .../counters/absorption_counter_candidates.py | 299 +++++++ .../src/graph/counters/counter_candidates.py | 737 ++++++++++++++++++ src/finn/compressor/src/graph/final_adder.py | 364 +++++++++ src/finn/compressor/src/graph/nodes.py | 393 ++++++++++ src/finn/compressor/src/graph/primitives.py | 113 +++ src/finn/compressor/src/graph/visitor.py | 45 ++ src/finn/compressor/src/main.py | 169 ++++ src/finn/compressor/src/passes/__init__.py | 8 + .../src/passes/compressor_constructor.py | 183 +++++ .../src/passes/compressor_pipeliner.py | 33 + .../compressor/src/passes/cost_estimator.py | 35 + src/finn/compressor/src/passes/emitter.py | 317 ++++++++ .../compressor/src/passes/io_annotator.py | 54 ++ src/finn/compressor/src/passes/lut_placer.py | 85 ++ .../compressor/src/passes/node_iterator.py | 123 +++ src/finn/compressor/src/passes/printer.py | 54 ++ .../compressor/src/passes/wire_inserter.py | 40 + src/finn/compressor/src/target.py | 102 +++ src/finn/compressor/src/tests/__init__.py | 8 + src/finn/compressor/src/tests/test_gen.py | 150 ++++ src/finn/compressor/src/tests/tester.py | 41 + src/finn/compressor/src/utils/__init__.py | 8 + src/finn/compressor/src/utils/mul_comp_map.py | 58 ++ src/finn/compressor/src/utils/shape.py | 51 ++ 39 files changed, 5233 
insertions(+) create mode 100644 src/finn/compressor/Makefile create mode 100644 src/finn/compressor/README.md create mode 100644 src/finn/compressor/__init__.py create mode 100644 src/finn/compressor/hdl/dotp_comp_template.sv create mode 100644 src/finn/compressor/hdl/dotp_template.sv create mode 100644 src/finn/compressor/hdl/mul_comp_map.sv create mode 100644 src/finn/compressor/src/__init__.py create mode 100644 src/finn/compressor/src/add_multi_finn.py create mode 100644 src/finn/compressor/src/benchmark.py create mode 100644 src/finn/compressor/src/dotp.py create mode 100644 src/finn/compressor/src/dotp_finn.py create mode 100644 src/finn/compressor/src/evaluation.py create mode 100644 src/finn/compressor/src/graph/__init__.py create mode 100644 src/finn/compressor/src/graph/accumulator.py create mode 100644 src/finn/compressor/src/graph/counters/__init__.py create mode 100644 src/finn/compressor/src/graph/counters/absorption_counter_candidates.py create mode 100644 src/finn/compressor/src/graph/counters/counter_candidates.py create mode 100644 src/finn/compressor/src/graph/final_adder.py create mode 100644 src/finn/compressor/src/graph/nodes.py create mode 100644 src/finn/compressor/src/graph/primitives.py create mode 100644 src/finn/compressor/src/graph/visitor.py create mode 100644 src/finn/compressor/src/main.py create mode 100644 src/finn/compressor/src/passes/__init__.py create mode 100644 src/finn/compressor/src/passes/compressor_constructor.py create mode 100644 src/finn/compressor/src/passes/compressor_pipeliner.py create mode 100644 src/finn/compressor/src/passes/cost_estimator.py create mode 100644 src/finn/compressor/src/passes/emitter.py create mode 100644 src/finn/compressor/src/passes/io_annotator.py create mode 100644 src/finn/compressor/src/passes/lut_placer.py create mode 100644 src/finn/compressor/src/passes/node_iterator.py create mode 100644 src/finn/compressor/src/passes/printer.py create mode 100644 
src/finn/compressor/src/passes/wire_inserter.py create mode 100644 src/finn/compressor/src/target.py create mode 100644 src/finn/compressor/src/tests/__init__.py create mode 100644 src/finn/compressor/src/tests/test_gen.py create mode 100644 src/finn/compressor/src/tests/tester.py create mode 100644 src/finn/compressor/src/utils/__init__.py create mode 100644 src/finn/compressor/src/utils/mul_comp_map.py create mode 100644 src/finn/compressor/src/utils/shape.py diff --git a/src/finn/compressor/Makefile b/src/finn/compressor/Makefile new file mode 100644 index 0000000000..7df3e6963e --- /dev/null +++ b/src/finn/compressor/Makefile @@ -0,0 +1,17 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +# +# @brief Build automation for compressor testing and generation +# @author Simon Gerber +############################################################################# + +# Default: no constant absorption +CA?= +.PHONY: default clean + +default: + ./run_tests.sh $(CA) +clean: + rm -rf *.log *.jou *.vivado .Xil xvlog.pb gen/* diff --git a/src/finn/compressor/README.md b/src/finn/compressor/README.md new file mode 100644 index 0000000000..8c6fbbd2b0 --- /dev/null +++ b/src/finn/compressor/README.md @@ -0,0 +1,71 @@ + + +# Python Compressor Generator +This tool can generate compressor trees for 7-Series, UltraScale(+) and Versal for arbitrary input shapes. + +# Getting started +1. Clone this repository. +2. _No_ further dependencies needed! + +## Usage +Generate a compressor of shape `(12,12,12)` called `comp` and save it under `/gen/comp12_12_12.sv`: + +```python3 src/main.py -s 12,12,12 -n comp -o gen/comp12_12_12.sv``` + +See `python3 src/main.py -h` for details. + +## Features +### Custom Input Shape +The tool can generate compressors for any input shape. A shape is passed as a comma-separated list. 
Each digit indicates a column's height. *LSB* is *left*, *MSB* is *right*. + +### Accumulation +By passing `-a`, the tool generates an accumulator instead of just an adder. The accumulator's width can be specified by `-w`. +### Gate Absorption +If desired, every input to the compressor can be preceded by a two-input gate. These gates can be integrated into the first compression stage. Each gate is specified as a HEX digit. The encoding is the same as Vivado's LUT2 primitive: +| Secondary Input | Primary Input | Output +|-----------------|---------------|---------------- +|0 |0 |(DIGIT >> 0) & 1 +|0 |1 |(DIGIT >> 1) & 1 +|1 |0 |(DIGIT >> 2) & 1 +|1 |1 |(DIGIT >> 3) & 1 + +For example, `8` maps to an AND gate and `6` maps to an XOR gate. + +On the command line, gates can be specified as a flat string like `-g 883ABC`. The *LSB* is *left* and *MSB* is *right*. The leftmost specified gate corresponds to the LSB input in the generated compressor input vector. + +### Target +Generate compressors for either Versal, 7-Series or UltraScale fabrics using `-t {Versal,7-Series,UltraScale}`. + +### Automated Testing +The tool can automatically generate a SystemVerilog testbench to fuzz-test the generated compressors by passing `--test`. For testing, the `xvlog`, `xelab` and `xsim` commands have to be available. + +### Custom Pipeline Depth +Specify the maximum combinational delay for the compressor using `-p MAX_DEPTH`. Note that the final adder, which has at least one single routing delay, cannot be pipelined. + +### Constant Input +In addition to the regular, variable compressor inputs, the tool also supports an additional constant input. It can be specified as a binary number by `-c NUMBER`. + +# Implementation Details - How the Code is Structured +The compressor is internally represented as a graph. Its nodes are defined in `src/graph/nodes.py`. +Compressor construction is done in several passes: +1. 
Create a graph with all scheduled counters and a final adder (in `src/passes/compressor_constructor.py`). + 1. (Optional) Generate a gate absorption stage. + 2. Generate regular compression stages until the compression goal is reached. + 3. Insert pipeline registers between compressor stages. + 4. Build either a final adder or an accumulator as the final stage. +2. Annotate LUT6CY instances with placement constraints so that the LUT Cascade will be utilized (in `src/passes/lut_placer.py`). +3. Replace inexpressible connections: Place wires between connected instantiated modules (in `src/passes/wire_inserter.py`). +4. Annotate input and output signals in the compressor (in `src/passes/io_annotator.py`). +5. Emit generated SystemVerilog source (in `src/passes/emitter.py`) + +## Extending the Tool +### Adding new Counters +Counters without gate absorption are defined in `graph/counters/counter_candidates.py`. +Counters with gate absorption are defined in `graph/counters/absorption_counter_candidates.py`. + +### Adding new Passes +Before adding new passes over the compressor graph, check out if the simple iterator defined in `node_iterator.py` can be inherited to save boilerplate code. diff --git a/src/finn/compressor/__init__.py b/src/finn/compressor/__init__.py new file mode 100644 index 0000000000..38b3d95ea5 --- /dev/null +++ b/src/finn/compressor/__init__.py @@ -0,0 +1,13 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +# +# @brief FINN compressor package initialization +# @author Simon Gerber +############################################################################# + +"""FINN compressor — LUT-based compressor tree generator for MVU.""" + +from .src.dotp_finn import generate_dotp_comp +from .src.add_multi_finn import generate_add_multi_comps diff --git a/src/finn/compressor/hdl/dotp_comp_template.sv b/src/finn/compressor/hdl/dotp_comp_template.sv new file mode 100644 index 0000000000..5cb3119a0e --- /dev/null +++ b/src/finn/compressor/hdl/dotp_comp_template.sv @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief RTL template for dot product compressor with accumulation + * @author Simon Gerber + *****************************************************************************/ + +/** + * LUT-based dot product with fused accumulation. + * Drop-in replacement for DSP-based compute cores in the MVU. + * Uses a generated compressor tree for the reduction. + * + * This file is a TEMPLATE — $COMP_MODULE_NAME$ is substituted + * at code generation time with the config-specific compressor + * module name (e.g. comp_8xs2s2). 
// dotp_comp: PE-parallel dot product core. Each PE lane broadcasts its
// SIMD weight/activation pairs into partial-product bit matrices (via
// mul_comp_map), flattens them column by column and feeds them to one
// generated compressor instance ($COMP_MODULE_NAME$).
module dotp_comp #(
	int unsigned  PE,                       // number of parallel output lanes (one compressor each)
	int unsigned  SIMD,                     // number of weight/activation products summed per lane
	int unsigned  WEIGHT_WIDTH,             // bit width of each weight
	int unsigned  ACTIVATION_WIDTH,         // bit width of each activation
	int unsigned  ACCU_WIDTH,               // bit width of the compressor output / of each p[pe]
	bit           SIGNED_ACTIVATIONS = 0,   // activations are signed (weights are always treated as signed)
	int unsigned  COMP_PIPELINE_DEPTH = 1   // length of the last -> vld shift register; must be >= 1
)(
	// Global Control
	input	logic  clk,
	input	logic  rst,
	input	logic  en,

	// Input
	input	logic  last,
	input	logic  zero,
	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,
	input	logic        [SIMD-1:0][ACTIVATION_WIDTH-1:0]      a,

	// Output
	output	logic  vld,
	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
);

	// The vld shift register below is indexed [1:COMP_PIPELINE_DEPTH],
	// so a depth of 0 cannot be represented.
	initial begin
		if(COMP_PIPELINE_DEPTH < 1) begin
			$error("%m: COMP_PIPELINE_DEPTH (%0d) must be >= 1.", COMP_PIPELINE_DEPTH);
			$finish;
		end
	end

	//-----------------------------------------------------------------------
	// Operand Mapping
	//
	// The `mul_comp_map` interface handles partial-product broadcasting
	// mul_comp_map requires NA >= NB. Weights are always signed.
	// If activations are wider, swap operands so that ia gets the wider one.
	localparam bit SWAPPED = ACTIVATION_WIDTH > WEIGHT_WIDTH;

	localparam int unsigned NA = SWAPPED ? ACTIVATION_WIDTH : WEIGHT_WIDTH;
	localparam int unsigned NB = SWAPPED ? WEIGHT_WIDTH : ACTIVATION_WIDTH;
	localparam bit SIGNED_A = SWAPPED ? SIGNED_ACTIVATIONS : 1;  // weights always signed
	localparam bit SIGNED_B = SWAPPED ? 1 : SIGNED_ACTIVATIONS;

	// Input to Matrix Broadcasting
	// map0 is a reference instance: its output width (NM) and its
	// columns()/height() functions describe the matrix shape used below.
	uwire [NA-1:0] map0_ia = SWAPPED ? NA'(a[0]) : NA'(w[0][0]);
	uwire [NB-1:0] map0_ib = SWAPPED ? NB'(w[0][0]) : NB'(a[0]);
	mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
		map0 (.ia(map0_ia), .ib(map0_ib));
	localparam int unsigned NM = $bits(map0.oa);

	//-----------------------------------------------------------------------
	// Pipeline shift register for last -> vld
	// `last` is shifted by one position per enabled clock; vld is the final
	// stage, i.e. `last` delayed by COMP_PIPELINE_DEPTH enabled cycles.
/* verilator lint_off LITENDIAN */
	logic [1:COMP_PIPELINE_DEPTH] L = '0;
/* verilator lint_on LITENDIAN */
	always_ff @(posedge clk) begin
		if(rst) L <= '0;
		else if(en) begin
			L[1] <= last;
			for(int unsigned i = 2; i <= COMP_PIPELINE_DEPTH; i++)
				L[i] <= L[i-1];
		end
	end
	assign vld = L[COMP_PIPELINE_DEPTH];

	//-----------------------------------------------------------------------
	// PE-parallel compressor instances
	//-----------------------------------------------------------------------
	for(genvar pe = 0; pe < PE; pe++) begin : genPE

		// Partial product matrix broadcasting
		// One mul_comp_map per SIMD element; operands are swapped the same
		// way as for map0 so that ia always receives the wider operand.
		uwire [NM-1:0] oa[SIMD];
		uwire [NM-1:0] ob[SIMD];
		for(genvar i = 0; i < SIMD; i++) begin : genMap
			uwire [NA-1:0] map_ia = SWAPPED ? NA'(a[i]) : NA'(w[pe][i]);
			uwire [NB-1:0] map_ib = SWAPPED ? NB'(w[pe][i]) : NB'(a[i]);
			mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B))
				map_i (.ia(map_ia), .ib(map_ib));
			assign oa[i] = map_i.oa;
			assign ob[i] = map_i.ob;
		end : genMap

		// Flatten all matrices column by column
		// For every column, the bits of all SIMD matrices belonging to that
		// column are placed next to each other. src_idx[k] tracks how far
		// matrix k has been consumed; dst_idx is the running output index.
		logic [SIMD*NM-1:0] comp_a;
		logic [SIMD*NM-1:0] comp_b;
		always_comb begin : blkFlatten
			automatic int unsigned src_idx[SIMD] = '{ default: 0 };
			automatic int unsigned dst_idx = 0;
			for(int unsigned col = 0; col < map0.columns(); col++) begin
				for(int unsigned k = 0; k < SIMD; k++) begin
					for(int unsigned row = 0; row < map0.height(col); row++) begin
						comp_a[dst_idx] = oa[k][src_idx[k]];
						comp_b[dst_idx] = ob[k][src_idx[k]];
						src_idx[k]++;
						dst_idx++;
					end
				end
			end
		end : blkFlatten

		// Compressor with fused accumulation
		// $COMP_MODULE_NAME$ is replaced at code generation time with the
		// config-specific compressor module (e.g. comp_8xs2s2).
		// Note the operand order: comp_b drives `in`, comp_a drives `in_2`.
		// NOTE(review): the exact semantics of the compressor's rst/en_neg
		// ports are defined by the generated module, not visible here.
		uwire [ACCU_WIDTH-1:0] comp_out;
		$COMP_MODULE_NAME$ comp_inst (
			.clk,
			.in(comp_b),
			.in_2(comp_a),
			.rst(rst || last),
			.en_neg(rst || zero),
			.en(en),
			.out(comp_out)
		);

		assign p[pe] = $signed(comp_out);

	end : genPE

	//-----------------------------------------------------------------------
	// Parameter Validation
	//-----------------------------------------------------------------------
	// The $EXPECTED_*$ placeholders are template fields; the check compares
	// the instantiation parameters against them and only warns on mismatch.
	initial begin
		if (SIMD != $EXPECTED_SIMD$ || NA != $EXPECTED_NA$ || NB != $EXPECTED_NB$ ||
			SIGNED_A != $EXPECTED_SIGNED_A$ || SIGNED_B != $EXPECTED_SIGNED_B$ ||
			ACCU_WIDTH != $EXPECTED_ACCU_WIDTH$) begin
			$warning("%m: CRITICAL - dotp_comp parameter mismatch! SIMD=%0d (expected %0d), NA=%0d (expected %0d), NB=%0d (expected %0d), SIGNED_A=%0d (expected %0d), SIGNED_B=%0d (expected %0d), ACCU_WIDTH=%0d (expected %0d)",
				SIMD, $EXPECTED_SIMD$, NA, $EXPECTED_NA$, NB, $EXPECTED_NB$,
				SIGNED_A, $EXPECTED_SIGNED_A$, SIGNED_B, $EXPECTED_SIGNED_B$,
				ACCU_WIDTH, $EXPECTED_ACCU_WIDTH$);
		end
	end

endmodule : dotp_comp
1 + $clog2(N) /*[-N:0]*/ : $clog2(N+1) /*[0:N]*/ +)( + input logic clk, + + input logic [N-1:0][NA-1:0] a, + input logic [N-1:0][NB-1:0] b, + output logic [NP-1:0] p +); + + // Input to Matrix Broadcasting + mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map0 (.ia(a[0]), .ib(b[0])); + localparam int unsigned NM = $bits(map0.oa); + uwire [NM-1:0] oa[N]; + uwire [NM-1:0] ob[N]; + assign oa[0] = map0.oa; + assign ob[0] = map0.ob; + for(genvar i = 1; i < N; i++) begin + mul_comp_map #(.NA(NA), .NB(NB), .SIGNED_A(SIGNED_A), .SIGNED_B(SIGNED_B)) map_i (.ia(a[i]), .ib(b[i])); + assign oa[i] = map_i.oa; + assign ob[i] = map_i.ob; + end + + // Flatten all Matrices Column by Column + logic [N*NM-1:0] comp_a; + logic [N*NM-1:0] comp_b; + always_comb begin + automatic int unsigned src_idx[N] = '{ default: 0 }; + automatic int unsigned dst_idx = 0; + for(int unsigned col = 0; col < map0.columns(); col++) begin + for(int unsigned i = 0; i < N; i++) begin + for(int unsigned row = 0; row < map0.height(col); row++) begin + comp_a[dst_idx] = oa[i][src_idx[i]]; + comp_b[dst_idx] = ob[i][src_idx[i]]; + src_idx[i]++; + dst_idx++; + end + end + end + end + + uwire signed [NP-1:0] comp_p; + uwire signed [NP-1:0] abs_p = {abs_term}; + comp_{n}x{sa}{na}{sb}{nb} comp ( + .clk, + .in(comp_b), .in_2(comp_a), + .out(comp_p) + ); + assign p = comp_p + abs_p; + +endmodule : dotp_{n}x{sa}{na}{sb}{nb} diff --git a/src/finn/compressor/hdl/mul_comp_map.sv b/src/finn/compressor/hdl/mul_comp_map.sv new file mode 100644 index 0000000000..7049c34ea4 --- /dev/null +++ b/src/finn/compressor/hdl/mul_comp_map.sv @@ -0,0 +1,239 @@ +/****************************************************************************** + * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Multiplier-to-compressor mapping module for gate absorption + * @author Thomas B. 
Preußer , Simon Gerber + *****************************************************************************/ + +/** + * Broadcasts multiplication inputs to feed a bit product matrix for compression. + * + * @description + * This interface component broadcasts multiplication inputs to produce a bit + * product matrix like the one below. The output is flattened for the + * ingestion by a compressor with the indicated indices: + * + * [6]a3.b0 [3]a2.b0 [1]a1.b0 [0]a0.b0 + * [10]a3.b1 [7]a2.b1 [4]a1.b1 [2]a0.b1 + * [13]a3.b2 [11]a2.b2 [8]a1.b2 [5]a0.b2 + * [15]a3.b3 [14]a2.b3 [12]a1.b3 [9]a0.b3 + * + * Functions that report the produced shape are provided: + * - columns() - the number of columns in the matrix shape. + * - height(col) - the height of the specified column. + * Additionally, the bit product operator is identified for each index by: + * - gate_op(idx) - the assumed bit product operator as hex LUT code. + * + * In the case of unsigned operands, all bit products must be computed + * as AND gates (8), i.e. m[i] = oa[i] & ob[i]. + * + * The operands can be specified to be signed, which will effect these changes + * to produce the correct functionality: + * + * SIGNED_A + * -------- + * The sign extensions of the multiples of input a are not materialized. + * Instead, this identity with s := a_{NA-1} & b_i is applied: + * s ... s s + * ---------- + * !s + * -1 + * In consequence: + * - The `gate_op()` for the left matrix boundary is identified as NAND (7). + * - The `absolute_term()` function returns a value of + * (-2^NB + 1) * 2^{NA-1} + * that must be added to the matrix sum for the correct product value. + * + * SIGNED_B + * -------- + * The sign extension of input b is not materialized. 
+ * Instead, the multiple of a by the sign bit of b is weighted negatively, + * which expands the produced matrix as follows: + * + * [ 6]a3.b0 [3]a2.b0 [1]a1.b0 [0]a0.b0 + * [11]a3.b1 [ 7]a2.b1 [4]a1.b1 [2]a0.b1 + * [14]a3.b2 [12]a2.b2 [ 8]a1.b2 [5]a0.b2 + * [17]0!b3! [16]a3!b3 [15]a2!b3 [13]a1!b3 [ 9]a0!b3 + * -1 [10] b3 + * ----------------------------------------------------------------------------------- + * [10]a0!b3 [ 6]a3.b0 [3]a2.b0 [1]a1.b0 [0]a0.b0 + * [11]a3.b1 [ 7]a2.b1 [4]a1.b1 [2]a0.b1 + * [14]a3.b2 [12]a2.b2 [ 8]a1.b2 [5]a0.b2 + * [17]0!b3! [16]a3!b3 [15]a2!b3 [13]a1!b3 [ 9]a0.b3 + * -1 + * + * using: + * - a.b := a & b + * - a!b := !a & b + * - a!b! := !(!a & b) + * + * In consequence: + * - The bit sizes of the outputs are wider and the `columns()` count is larger. + * - The `gate_op()` at the shown indices is identified as 2 or D. + * Note that the height of the matrix grows to NB+1 if NA > NB. + * + * SIGNED_A & SIGNED_B + * ------------------- + * Both approaches are combined for a purely signed multiplication: + * + * [10]a0!b3 [ 6]a3.b0! [3]a2.b0 [1]a1.b0 [0]a0.b0 + * [11]a3.b1! [ 7]a2.b1 [4]a1.b1 [2]a0.b1 + * [14]a3.b2! [12]a2.b2 [ 8]a1.b2 [5]a0.b2 + * [16]a3!b3! [15]a2!b3 [13]a1!b3 [ 9]a0.b3 + * -1 -1 -1 -1 + * + * using: + * - a.b := a & b + * - a!b := !a & b + * - a.b! := !( a & b) + * - a!b! := !(!a & b) + * In consequence: + * - The bit sizes of the outputs are wider. + * - The `gate_op()` at the shown indices is properly identified. + * - The `absolute_term()` function returns a value of + * (-2^NB + 1) * 2^{NA-1} + * that must be added to the matrix sum for the correct product value. + * Note that the height of the matrix grows to NB+1 if NA > NB. 
// mul_comp_map: broadcasts the bits of two multiplication operands into two
// flat vectors oa/ob such that bit product i of the partial-product matrix is
// formed from (oa[i], ob[i]) using the gate reported by gate_op(i).
// The shape of the matrix is reported by columns() and height(col).
interface mul_comp_map #(
	int unsigned  NA,	// bit width of multiplicand
	int unsigned  NB,	// bit width of multiplier
	bit  SIGNED_A,	// signed multiplicand
	bit  SIGNED_B,	// signed multiplier

	// Extra bits due to sign handling and total output size
	localparam int unsigned  NX = (NA == 1) || !SIGNED_B? 0 : SIGNED_A? 1 : 2,
	localparam int unsigned  NM = NA*NB + NX
)(
	// Input Operands
	input	logic [NA-1:0]  ia,	// Multiplicand
	input	logic [NB-1:0]  ib	// Multiplier
);
	// Bit Matrix Broadcasts
	logic [NM-1:0]  oa;
	logic [NM-1:0]  ob;


	// Operand length support is not symmetrical.
	initial begin
		if(NA < NB) begin
			$error("%m: Switch multiplication operands.");
			$finish;
		end
	end

	// Number of columns of the produced bit matrix.
	function int unsigned columns();
		return  NA == 1? 1 : NB + NA - (!SIGNED_B || SIGNED_A);
	endfunction : columns

	// Height (number of bit products) of matrix column `col`.
	function int unsigned height(input int unsigned  col);
		if(NA == 1)  return  col < 1;
		else begin
			automatic int unsigned  ret =
				(col <  NB)?      col + 1 :
				(col <  NA)?      NB :
				(col <  NB+NA-1)? NB+NA-1 - col :
				(col == NB+NA-1)? SIGNED_B && !SIGNED_A :
				/* else */        0;
			// The extra element created for a signed multiplier sits in column NB.
			if(SIGNED_B && (col == NB))  ret++;
			return  ret;
		end
	endfunction : height

	// Constant that must be added to the matrix sum to obtain the product.
	function bit signed [NA+NB-1:0] absolute_term();
		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? -1 : 0;
		else begin
			automatic bit signed [NA+NB-1:0]  ret = '{
				NA+NB-1: SIGNED_A || SIGNED_B,
				NA-1:    SIGNED_A,
				default: 0
			};
			return  ret;
		end
	endfunction : absolute_term


	// Beyond the tip of left triangle at column of height 1
	localparam int unsigned  HIGH = NM - (SIGNED_B && !SIGNED_A);

	// Two-input gate (as hex LUT2 code) forming bit product `idx`
	// from oa[idx] and ob[idx].
	function bit [3:0] gate_op(input int unsigned  idx);
		if(NA == 1)  return  SIGNED_A ^^ SIGNED_B? 7 : 8;
		else begin
			automatic bit [3:0]  op = 8;	// AND

			if(SIGNED_B) begin
				automatic bit  inv = 0;
				// Negative weight for sign-bit row
				for(int unsigned  col = 0; col < NB; col++) begin
					if(idx == HIGH-1 - col*(col+1)/2)  inv = 1;
				end
				if(idx == HIGH)  inv = 1;
				// Swapping the two halves of the LUT code inverts the oa input.
				if(inv)  op = { op[1:0], op[3:2] };
				if((idx == HIGH) && !SIGNED_A)  op = ~op;
			end

			if(SIGNED_A) begin
				automatic bit  inv = 0;
				// NAND along left matrix boundary
				for(int unsigned  col = 0; col < NB; col++) begin
					if(idx == HIGH - (col+1)*(col+2)/2 + (SIGNED_B && (col < NB-1)))  inv = 1;
				end
				if(inv)  op = ~op;
			end

			return  op;
		end
	endfunction : gate_op

	//-----------------------------------------------------------------------
	// Broadcast Wiring
	if(NA == 1) begin : genTrivial
		assign	oa[0] = ia[0];
		assign	ob[0] = ib[0];
	end : genTrivial
	// The matrix wiring must be the alternative to the trivial case, not an
	// addition to it: for NA == 1 it would drive oa[0]/ob[0] a second time
	// and, with SIGNED_B, index beyond NM-1 (NX is 0 when NA == 1).
	else begin : genMatrix

		// Feed right triangle going right to left until first full-height column
		for(genvar  col = 0; col < NB; col++) begin
			localparam int unsigned  TOP = col*(col+1)/2;
			for(genvar  row = 0; row <= col; row++) begin
				assign	oa[TOP+row] = ia[col-row];
				assign	ob[TOP+row] = ib[row];
			end
		end

		// Feed central full-height rectangle for NA > NB
		for(genvar  col = 0; col < NA-NB; col++) begin
			localparam int unsigned  TOP = NB*(NB+1)/2 + col*NB + SIGNED_B;
			for(genvar  row = 0; row < NB; row++) begin
				assign	oa[TOP + row] = ia[NB+col - row];
				assign	ob[TOP + row] = ib[row];

			end
		end

		// Feed left triangle going left to right up to last column with a receded height
		for(genvar  col = 0; col < NB-1; col++) begin
			localparam int unsigned  BOT = HIGH - col*(col+1)/2 - 1;
			for(genvar  row = 0; row <= col; row++) begin
				assign	oa[BOT-row] = ia[NA-1-col+row];
				assign	ob[BOT-row] = ib[NB-1-row];
			end
		end

		// Feed extra elements created for sign handling
		if(SIGNED_B) begin
			assign	oa[NB*(NB+1)/2] = ia[0];
			assign	ob[NB*(NB+1)/2] = ib[NB-1];
			if(!SIGNED_A) begin
				assign	oa[HIGH] = 0;
				assign	ob[HIGH] = ib[NB-1];
			end
		end

	end : genMatrix

endinterface : mul_comp_map
# ---------------------------------------------------------------------------
# Python replica of mvu.sv::sliceLanes()
#
# This must mirror the SV implementation exactly. Any change to sliceLanes()
# in mvu.sv requires updating this function as well. The $warning guard in
# add_multi.sv catches divergence at simulation time.
#
# This outsourced computation is required as lane width is relevant to the
# compressor input Shape and thus needs to be known at generation time.

def clog2(n):
    """Return the ceiling of log2(n), matching SystemVerilog $clog2 semantics.

    Values ``n <= 1`` yield 0. The result is computed with integer
    arithmetic: ``math.log2`` converts its argument to a float first, which
    silently rounds large integers (e.g. ``2**53 + 1``) and then returns a
    result that is one too small.

    Parameters
    ----------
    n : int or float
        Value to take the logarithm of.

    Returns
    -------
    int
        Smallest ``k >= 0`` such that ``2**k >= n``.
    """
    if n <= 1:
        return 0
    # ceil(log2(x)) == ceil(log2(ceil(x))) for x > 1, so rounding up first
    # keeps non-integer inputs working while staying exact for integers.
    return (math.ceil(n) - 1).bit_length()


def slice_lanes(version, ww, aw, accu_width, narrow_weights):
    """
    Compute DSP lane offsets — Python replica of mvu.sv::sliceLanes().

    Parameters
    ----------
    version : int
        DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58).
    ww : int
        WEIGHT_WIDTH.
    aw : int
        ACTIVATION_WIDTH.
    accu_width : int
        ACCU_WIDTH.
    narrow_weights : bool
        NARROW_WEIGHTS flag.

    Returns
    -------
    (num_lanes, offsets) : tuple
        num_lanes : int
            number of DSP lanes.
        offsets : list[int]
            lane boundary positions (length num_lanes+1).
    """
    a_width = 25 + 2 * (version > 1)
    p_width = 58 if version == 3 else 48
    min_lane_width = ww + aw - 1
    # One bit is subtracted from the usable width unless narrow_weights is set.
    guard = 0 if narrow_weights else 1

    if a_width == ww:
        num_lanes = 1
    else:
        num_lanes = 1 + (a_width - guard - ww) // min_lane_width

    # Distribute slack bits preferring right lanes
    bit_slack = a_width - guard - ww - (num_lanes - 1) * min_lane_width

    offsets = [0] * (num_lanes + 1)
    for i in range(1, num_lanes):
        # Rounded-up share of the remaining slack for this lane boundary.
        extra = (bit_slack + (num_lanes - 1 - i)) // (num_lanes - i)
        offsets[i] = offsets[i - 1] + min_lane_width + extra
        bit_slack -= extra

    # Last lane bounded by min(ACCU_WIDTH, P_WIDTH)
    offsets[num_lanes] = min(offsets[num_lanes - 1] + accu_width, p_width)

    return num_lanes, offsets


def lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights):
    """
    Compute the lo_width for each DSP lane.

    The width of a lane is the distance between its two neighbouring
    offsets as returned by :func:`slice_lanes`.

    Returns
    -------
    list[int]
        lo_width for lane 0 .. num_lanes-1.
    """
    num_lanes, offsets = slice_lanes(version, ww, aw, accu_width, narrow_weights)
    return [offsets[i + 1] - offsets[i] for i in range(num_lanes)]


def comp_module_name(n, arg_width, delay):
    """
    Return the compressor module name, e.g. 'comp_32u6_d4'.

    Encodes:
        N         — number of unsigned addends (= SIMD)
        ARG_WIDTH — bits per addend (= lo_width from mvu.sv lane slicing)
        delay     — pipeline stages produced by the generator

    The 'u' indicates unsigned, matching the mvu_bench naming convention.
    The delay suffix lets the CATCH_COMP macro in add_multi.sv match on
    minimum pipeline depth (DEPTH >= d).
    """
    return f"comp_{n}u{arg_width}_d{delay}"


def generate_add_multi_comp(target, n, arg_width, pipeline_every, output_dir,
                            name=None):
    """
    Generate a multi-input adder compressor (no accumulation).

    Parameters
    ----------
    target : Target
        FPGA target (Versal, SevenSeries) — selects LUT primitives.
    n : int
        Number of unsigned addends.
    arg_width : int
        Bit width of each addend.
    pipeline_every : int or None
        Insert pipeline registers every N combinational stages.
        None means purely combinational.
    output_dir : str
        Directory for the generated .sv file.
    name : str or None
        Module name override. When None (default), the name is derived
        from (n, arg_width, delay) after generation.

    Returns
    -------
    (name, path, delay) : tuple
        Module name, file path, and pipeline depth of the generated compressor.
    """
    # Shape: W columns each of height N.
    # Each of the N operands contributes 1 bit to each of the W bit-positions,
    # so every column has the same height N.
    shape = Shape([n] * arg_width)

    # First pass: generate with a temporary name to discover the actual delay.
    # The delay depends on the compressor structure and pipeline_every, so we
    # can't know it before generation.
    tmp_name = name if name is not None else f"comp_{n}u{arg_width}"
    tmp_path = os.path.join(output_dir, tmp_name + ".sv")

    delay = generate_compressor(
        target=target,
        shape=shape,
        name=tmp_name,
        comb_depth=pipeline_every,
        accumulate=False,         # Pure adder, no fused accumulation
        accumulator_width=None,   # Not applicable without accumulation
        gates=[],                 # No gate absorption, inputs are complete values
        constants=[],             # No Baugh-Wooley correction, unsigned inputs
        path=tmp_path,
        test=False,
        enable=False,             # No accumulator registers to initialize
    )

    # An explicit name is used verbatim: nothing to rename.
    if name is not None:
        return name, tmp_path, delay

    # Derive final name with delay suffix
    final_name = comp_module_name(n, arg_width, delay)
    final_path = os.path.join(output_dir, final_name + ".sv")

    if final_name != tmp_name:
        # Rename file and replace module name inside it
        with open(tmp_path, "r") as f:
            content = f.read()
        content = content.replace(tmp_name, final_name)
        with open(final_path, "w") as f:
            f.write(content)
        os.remove(tmp_path)

    return final_name, final_path, delay
+ + Returns + ------- + dict with keys: + comp_names : list[str] — generated module names (empty if ineligible) + files : list[str] — paths of all generated/patched files + """ + + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + patched_path = os.path.join(output_dir, "add_multi.sv") + + # Always generate compressors and patch add_multi.sv + target = resolve_target(fpgapart) + + # This is currently a parallel implementation of the lo_width computation in mvu.sv's sliceLanes() function. + # The resulting lo_width values determine the compressor input Shapes, so we need to compute them here in Python at generation time. + # Must be kept in SYNC. + widths = lo_widths_from_mvu_params(version, ww, aw, accu_width, narrow_weights) + + # Generate one compressor per unique (SIMD, lo_width) + generated = {} # (simd, width) -> (name, delay) + for w in widths: + key = (simd, w) + if key not in generated: + name, _path, delay = generate_add_multi_comp( + target, simd, w, + pipeline_every=1, # Max pipelining (match dotp_comp behavior) + output_dir=output_dir) + generated[key] = (name, delay) + + # Copy add_multi.sv to output_dir and inject CATCH_COMP lines + with open(os.path.join(rtllib_dir, "add_multi.sv"), "r") as f: + add_multi_src = f.read() + + catch_lines = "" + comp_specs = [] + for (_n, _w), (name, delay) in generated.items(): + catch_lines += "\t`CATCH_COMP(%d,%d,%d)\n" % (_n, _w, delay) + comp_specs.append((_n, _w, delay)) + + marker = "\t// FINN_GENERATED_COMP_ENTRIES\n" + if marker not in add_multi_src: + raise RuntimeError( + "Cannot find FINN_GENERATED_COMP_ENTRIES marker in add_multi.sv. 
" + "Has the file been modified?") + add_multi_src = add_multi_src.replace(marker, catch_lines + marker) + + with open(patched_path, "w") as f: + f.write(add_multi_src) + + comp_files = [os.path.join(output_dir, name + ".sv") + for (name, _delay) in generated.values()] + + return { + "comp_names": [name for (name, _delay) in generated.values()], + "comp_specs": comp_specs, # [(N, ARG_WIDTH, DELAY), ...] + "files": [patched_path] + comp_files, + } + + +def main(): + parser = argparse.ArgumentParser( + prog="add_multi_finn", + description="Generate a compressor core for FINN's add_multi module." + ) + parser.add_argument('--n', type=int, required=True, + help="Number of unsigned addends (= SIMD)") + parser.add_argument('-t', '--target', default="Versal", + choices=["Versal", "7-Series", "UltraScale"], + help="Target FPGA generation") + parser.add_argument('-p', '--pipeline_every', type=int, default=None, + help="Pipeline registers every N combinational stages") + parser.add_argument('-o', '--output_dir', default="../gen", + help="Output directory for generated files") + parser.add_argument('--name', default=None, + help="Module name override (default: comp_u_d)") + + # Direct mode: explicit arg_width + parser.add_argument('--arg_width', type=int, default=None, + help="Bit width per addend (direct mode)") + + # MVU mode: derive arg_width(s) from MVU parameters + mvu_group = parser.add_argument_group( + 'MVU parameters', + 'When --mvu is given, lo_width values are computed from these ' + 'MVU-level parameters (replicating mvu.sv::sliceLanes).' 
+ ) + mvu_group.add_argument('--mvu', action='store_true', + help="Enable MVU mode: derive arg_width from MVU params") + mvu_group.add_argument('--version', type=int, default=2, + choices=[1, 2, 3], + help="DSP version (1=DSP48E1, 2=DSP48E2, 3=DSP58)") + mvu_group.add_argument('--ww', type=int, default=None, + help="WEIGHT_WIDTH") + mvu_group.add_argument('--aw', type=int, default=None, + help="ACTIVATION_WIDTH") + mvu_group.add_argument('--accu_width', type=int, default=None, + help="ACCU_WIDTH") + mvu_group.add_argument('--narrow_weights', type=int, default=0, + choices=[0, 1], + help="NARROW_WEIGHTS flag (0 or 1)") + + args = parser.parse_args() + + # Validate argument combinations + if not args.mvu and args.arg_width is None: + parser.error("Either --arg_width (direct mode) or --mvu with MVU " + "parameters is required.") + if args.mvu and args.arg_width is not None: + parser.error("--arg_width and --mvu are mutually exclusive.") + if args.mvu: + for param in ('ww', 'aw', 'accu_width'): + if getattr(args, param) is None: + parser.error(f"--mvu requires --{param}") + + target = resolve_target_name(args.target) + os.makedirs(args.output_dir, exist_ok=True) + + if args.mvu: + # MVU mode: compute lo_width per lane, generate unique compressors + simd = args.n + + # For SIMD < 4, the binary adder tree is already optimal. + # A compressor adds structural overhead with no benefit. 
+ if simd < 4: + print(f"SIMD={simd} < 4: binary tree is optimal, no compressors generated.") + return + + widths = lo_widths_from_mvu_params( + args.version, args.ww, args.aw, + args.accu_width, bool(args.narrow_weights) + ) + depth = 3 + clog2(simd) + (1 if simd == 1 else 0) + 1 + add_multi_depth = depth - 4 + + print(f"MVU config: VERSION={args.version} WW={args.ww} AW={args.aw} " + f"ACCU_WIDTH={args.accu_width} NARROW_WEIGHTS={args.narrow_weights}") + print(f" NUM_LANES={len(widths)} PIPELINE_DEPTH={depth} " + f"ADD_MULTI_DEPTH={add_multi_depth}") + print(f" LO_WIDTHs: {widths}") + + # Generate one compressor per unique (N, lo_width) + seen = set() + for lane, w in enumerate(widths): + if (simd, w) in seen: + print(f" Lane {lane}: lo_width={w} — reuses existing module") + continue + seen.add((simd, w)) + + comp_name, comp_path, comp_delay = generate_add_multi_comp( + target, simd, w, + args.pipeline_every, args.output_dir, name=args.name + ) + print(f" Lane {lane}: lo_width={w}") + print(f" Generated: {comp_path}") + print(f" Module: {comp_name}") + print(f" Delay: {comp_delay}") + + else: + # Direct mode: single compressor for explicit arg_width + comp_name, comp_path, comp_delay = generate_add_multi_comp( + target, args.n, args.arg_width, + args.pipeline_every, args.output_dir, name=args.name) + + print(f"Generated compressor core: {comp_path}") + print(f" Module name: {comp_name}") + print(f" Configuration: {args.n} unsigned addends x {args.arg_width} bits") + print(f" Pipeline depth: {comp_delay}") + + +if __name__ == "__main__": + main() diff --git a/src/finn/compressor/src/benchmark.py b/src/finn/compressor/src/benchmark.py new file mode 100644 index 0000000000..b4f7a5969b --- /dev/null +++ b/src/finn/compressor/src/benchmark.py @@ -0,0 +1,61 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. 
# ---------------------------------------------------------------------------
# benchmark.py
# ---------------------------------------------------------------------------

def gmean(numbers):
    """Return the geometric mean of *numbers*.

    Parameters
    ----------
    numbers : sequence of numbers
        Must support len() and be non-empty.

    Raises
    ------
    ValueError
        If *numbers* is empty (the geometric mean is undefined).
    """
    if not numbers:
        raise ValueError("gmean() requires at least one value")
    return reduce(lambda x, y: x * y, numbers) ** (1.0 / len(numbers))


def benchmark():
    """Build a Versal compressor for each reference shape and print its cost.

    For every example the LUT count, the number of combinatorial stages and
    the efficiency (bits removed per LUT) are printed, followed by the
    geometric mean of all LUT counts.
    """
    examples = {
        "128": Shape([128]),
        "256": Shape([256]),
        "512": Shape([512]),
        "128,128": Shape([128, 128]),
        "256,256": Shape([256, 256]),
        "512,512": Shape([512, 512]),
        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
        "LPFP5": Shape([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
                        16, 16, 16, 16, 32]),
        "6-Input": Shape(32*[6]),
        "10-Input": Shape(32*[10]),
        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
    }

    luts = []
    for example_name, example_shape in examples.items():
        target = Versal()
        constructor = CompressorConstructor()
        comp = constructor(target.counter_candidates,
                           target.absorbing_counter_candidates,
                           target.final_adder, example_shape,
                           "comp", 1, True, None, tuple(), [])

        cost = CostEstimator()
        comp.accept(cost)
        eff = (sum(comp.input_shape) - sum(comp.output_shape)) / cost.luts
        luts.append(cost.luts)
        # Note the space after "LUTs": the two literals are concatenated.
        print(f"Example {example_name:<10} uses {cost.luts:<6} LUTs "
              f"for {cost.combinatorial_stages} stages (Efficiency: {eff: 1.2f})")

    luts_gmean = gmean(luts)
    print(f"Geomean {luts_gmean:.6} LUTs")


if __name__ == "__main__":
    benchmark()


# ---------------------------------------------------------------------------
# dotp.py — standalone dot-product compressor generation script
# ---------------------------------------------------------------------------

def _parse_dotp_signature(sig):
    """Split a signature such as '8xs4u2' into (n, na, nb, sa, sb).

    The format is <N>x<u|s><NA><u|s><NB>; sa/sb are True for 's' (signed).
    Raises ValueError when *sig* does not match that format.
    """
    match = re.fullmatch("(\\d+)x([us])(\\d+)([us])(\\d+)", sig)
    if match is None:
        raise ValueError(
            f"Invalid signature '{sig}': expected <N>x<u|s><NA><u|s><NB>, "
            "e.g. 8xs4u2")
    n, sign_a, na, sign_b, nb = match.groups()
    return int(n), int(na), int(nb), sign_a == 's', sign_b == 's'


def _dotp_result_width(n, na, nb, sa, sb):
    """Return the bit width used for the absorbed constant of the dot product."""
    def clog2(x):
        return (x - 1).bit_length()

    if na > 1:
        return clog2(n) + (na if nb == 1 and not sb else na + nb)
    return clog2(n + 1) if sa == sb else 1 + clog2(n)


if __name__ == "__main__":

    # Parse and extract Parameters from Command Line
    if len(sys.argv) < 2:
        sys.exit("usage: dotp.py <N>x<u|s><NA><u|s><NB> [ca] [versal|7series]")
    sig = sys.argv[1]
    try:
        n, na, nb, sa, sb = _parse_dotp_signature(sig)
    except ValueError as exc:
        sys.exit(str(exc))
    if nb > na:
        sys.exit(f"Invalid signature '{sig}': NB ({nb}) must not exceed NA ({na})")

    # Target platform: ca/accu goes in argv[2], target in argv[3] (default versal)
    target_arg = sys.argv[3] if len(sys.argv) > 3 else "versal"
    if target_arg == "7series":
        target = SevenSeries()
        fpga_part = "xc7z020clg400-1"
    else:  # versal (default)
        target = Versal()
        fpga_part = "xcvc1902-vsva2197-2MP-e-S"

    result_width = _dotp_result_width(n, na, nb, sa, sb)

    mul_map = MulCompMap(na, nb, sa, sb)
    shape = [col * n for col in mul_map.shape()]
    print("Shape: ", ' '.join((':'.join((f"{val:x}" for val in col)) for col in shape[::-1])))

    # Absolute Term Contribution
    constants = []
    abs_term = n * mul_map.absolute_term()
    # Move absolute term into absorbed constant if requested
    if len(sys.argv) > 2 and sys.argv[2] == 'ca':
        print("Constant absorption.")
        if abs_term < 0:
            abs_term += 2**result_width
        constants = [(abs_term >> i) & 1 for i in range(result_width)]
        abs_term = 0

    name = "comp_" + sig
    # Write to gen/ relative to this script's parent directory (compressor/)
    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    gen_dir = os.path.join(script_dir, "gen")
    hdl_dir = os.path.join(script_dir, "hdl")
    os.makedirs(gen_dir, exist_ok=True)
    output_path = os.path.join(gen_dir, name + ".sv")
    generate_compressor(
        target=target,
        shape=Shape((len(col) for col in shape)),
        name=name,
        comb_depth=None,
        accumulate=False,
        accumulator_width=None,
        gates=[[f"{val:x}" for val in col] for col in shape],
        constants=constants,
        path=output_path,
        test=False
    )

    # Process templates with absolute paths
    for (src_rel, dst_rel) in (
        ("dotp_template.sv", "dotp_"+sig+".sv"),
        ("dotp_tb_template.sv", "dotp_"+sig+"_tb.sv"),
        ("dotp_template.tcl", "dotp_"+sig+".tcl")
    ):
        src = os.path.join(hdl_dir, src_rel)
        dst = os.path.join(gen_dir, dst_rel)
        # Not every template is necessarily shipped alongside this script;
        # skip the ones that are absent instead of aborting after the
        # compressor itself has already been generated.
        if not os.path.isfile(src):
            print(f"Warning: template {src} not found, skipping {dst_rel}",
                  file=sys.stderr)
            continue
        with open(src, "rt") as fsrc, open(dst, "wt") as fdst:
            for line in fsrc:
                fdst.write(line
                           .replace("{n}", str(n))
                           .replace("{na}", str(na))
                           .replace("{nb}", str(nb))
                           .replace("{sa}", 's' if sa else 'u')
                           .replace("{sb}", 's' if sb else 'u')
                           .replace("{signed_a}", str(int(sa)))
                           .replace("{signed_b}", str(int(sb)))
                           .replace("{abs_term}", str(abs_term))
                           .replace("{part}", fpga_part)
                           # Replace relative paths with absolute paths for TCL
                           .replace("hdl/", hdl_dir + "/")
                           .replace("gen/", gen_dir + "/")
                           )
def expand_template(template_path, output_path, substitutions):
    """Expand a text template by replacing $PLACEHOLDER$ tokens.

    Parameters
    ----------
    template_path : str
        Path of the template file to read.
    output_path : str
        Path of the expanded file to write; its directory must exist.
    substitutions : dict[str, str]
        Maps each placeholder token (including the ``$`` delimiters) to its
        replacement text.

    Raises FileNotFoundError if paths invalid, ValueError if placeholders remain.
    """
    if not os.path.isfile(template_path):
        raise FileNotFoundError(f"Template not found: {template_path}")

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        raise FileNotFoundError(f"Output directory does not exist: {output_dir}")

    # Explicit UTF-8: the result must not depend on the platform locale.
    with open(template_path, "r", encoding="utf-8") as f:
        text = f.read()
    for key, value in substitutions.items():
        text = text.replace(key, value)
    remaining = re.findall(r'\$[A-Z_]+\$', text)
    if remaining:
        raise ValueError(
            f"Unsubstituted placeholders in {output_path}: {remaining}")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)


def compute_params(simd, weight_width, activation_width, signed_activations):
    """Map finn parameters to compressor parameters, respecting NA >= NB.

    Returns
    -------
    (n, na, nb, sa, sb, swapped) : tuple
        n is SIMD; (na, sa) describe the wider operand and (nb, sb) the
        narrower one; swapped is True when activations took the A slot.
    """
    # Weights are always signed in finn
    sa_finn = True
    sb_finn = signed_activations

    # mul_comp_map requires NA >= NB. Swap operands if needed.
    if weight_width >= activation_width:
        na, nb = weight_width, activation_width
        sa, sb = sa_finn, sb_finn
        swapped = False
    else:
        na, nb = activation_width, weight_width
        sa, sb = sb_finn, sa_finn
        swapped = True

    n = simd
    return n, na, nb, sa, sb, swapped


def make_signature(n, sa, na, sb, nb):
    """Build the compressor file signature string, e.g. '8xs2u2'."""
    return f"{n}x{'s' if sa else 'u'}{na}{'s' if sb else 'u'}{nb}"


def comp_module_name(n, sa, na, sb, nb, accu_width):
    """Return the config-specific compressor module name, e.g. 'comp_8xs2s2_a16'."""
    return "comp_" + make_signature(n, sa, na, sb, nb) + f"_a{accu_width}"


def generate_comp_module(target, n, na, nb, sa, sb, accu_width,
                         pipeline_every, output_dir, name=None):
    """Generate the compressor core with fused accumulation.

    When *name* is None (the default), the module is named after its
    configuration signature, e.g. ``comp_8xs2s2_a16``. This keeps module
    names unique across different compressor configurations in the same
    Vivado project.

    Returns
    -------
    (name, path, delay) : tuple
        Module name, path of the generated .sv file, and the value returned
        by generate_compressor() (reported as the pipeline depth).
    """
    if name is None:
        name = comp_module_name(n, sa, na, sb, nb, accu_width)
    m = MulCompMap(na, nb, sa, sb)
    shape_cols = [col * n for col in m.shape()]
    shape = Shape((len(col) for col in shape_cols))
    gates = [[f"{val:x}" for val in col] for col in shape_cols]

    # Absorb abs_term as a constant input to the compressor tree.
    # This ensures the correction is applied every accumulation cycle,
    # not just once at the output.
    abs_term = n * m.absolute_term()
    if abs_term != 0:
        abs_val = abs_term % (1 << accu_width)  # two's complement
        constants = [(abs_val >> i) & 1 for i in range(accu_width)]
    else:
        constants = []

    comp_path = os.path.join(output_dir, name + ".sv")
    delay = generate_compressor(
        target=target,
        shape=shape,
        name=name,
        comb_depth=pipeline_every,
        accumulate=True,
        accumulator_width=accu_width,
        gates=gates,
        constants=constants,
        path=comp_path,
        test=False,
        enable=True,
    )
    return name, comp_path, delay


def _dotp_comp_substitutions(comp_name, simd, na, nb, sa, sb, accu_width):
    """Return the placeholder map used to expand dotp_comp_template.sv."""
    return {
        "$COMP_MODULE_NAME$": comp_name,
        "$EXPECTED_SIMD$": str(simd),
        "$EXPECTED_NA$": str(na),
        "$EXPECTED_NB$": str(nb),
        "$EXPECTED_SIGNED_A$": str(1 if sa else 0),
        "$EXPECTED_SIGNED_B$": str(1 if sb else 0),
        "$EXPECTED_ACCU_WIDTH$": str(accu_width),
    }


def generate_dotp_comp(fpgapart, simd, ww, aw, accu_width, signed_act, output_dir):
    """
    Generate the dotp_comp path: compressor core + expanded template.

    This is the high-level entry point called by FINNs generate_hdl().

    Parameters
    ----------
    fpgapart : str
        FPGA part string (e.g. "xcvc1902-...").
    simd, ww, aw, accu_width : int
        MVU parameters.
    signed_act : bool
        Whether activations are signed.
    output_dir : str
        Directory for generated files (= code_gen_dir).

    Returns
    -------
    dict with keys:
        comp_name  : str  — module name (e.g. "comp_8xs2s2_a16")
        comp_delay : int  — pipeline depth
        files      : list — paths of all generated files
    """
    target = resolve_target(fpgapart)
    n, na, nb, sa, sb, _ = compute_params(simd, ww, aw, signed_act)

    comp_name, comp_path, comp_delay = generate_comp_module(
        target, n, na, nb, sa, sb, accu_width,
        pipeline_every=1,  # Max pipelining
        output_dir=output_dir)

    # Expand dotp_comp template with the generated module name
    src_dir = os.path.dirname(os.path.abspath(__file__))
    compressor_root = os.path.abspath(os.path.join(src_dir, ".."))
    dotp_comp_template = os.path.join(compressor_root, "hdl", "dotp_comp_template.sv")
    dotp_comp_path = os.path.join(output_dir, "dotp_comp.sv")
    expand_template(
        dotp_comp_template, dotp_comp_path,
        _dotp_comp_substitutions(comp_name, simd, na, nb, sa, sb, accu_width))

    return {
        "comp_name": comp_name,
        "comp_delay": comp_delay,
        "files": [dotp_comp_path, comp_path],
    }


def main():
    """Command-line entry point: generate the core and expand the template."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.abspath(os.path.join(script_dir, ".."))
    default_dotp_template = os.path.join(repo_root, "hdl", "dotp_comp_template.sv")

    parser = argparse.ArgumentParser(
        prog="dotp_finn",
        description="Generate a compressor core for FINN's dotp_comp module."
    )
    parser.add_argument('--simd', type=int, required=True, help="SIMD (operand pairs per cycle)")
    parser.add_argument('--ww', type=int, required=True, help="Weight bit width")
    parser.add_argument('--aw', type=int, required=True, help="Activation bit width")
    parser.add_argument('--accu_width', type=int, required=True, help="Accumulator bit width")
    parser.add_argument('--signed_activations', action='store_true',
                        help="Activations are signed")
    parser.add_argument('-t', '--target', default="Versal",
                        choices=["Versal", "7-Series", "UltraScale"],
                        help="Target FPGA generation")
    parser.add_argument('-p', '--pipeline_every', type=int, default=None,
                        help="Pipeline registers every N combinational stages")
    parser.add_argument('-o', '--output_dir', default="../gen",
                        help="Output directory for generated files")
    parser.add_argument('-n', '--name', default=None,
                        help="Module name override "
                             "(default: comp_<signature>_a<accu_width>)")
    parser.add_argument('--dotp-template', default=default_dotp_template,
                        help="Path to dotp_comp template file to expand")
    parser.add_argument('--dotp-output-name', default="dotp_comp.sv",
                        help="Output file name for expanded dotp_comp template")
    parser.add_argument('--skip-dotp-template', action='store_true',
                        help="Skip expanding dotp_comp template")
    args = parser.parse_args()
    target = resolve_target_name(args.target)
    os.makedirs(args.output_dir, exist_ok=True)

    # Compute compressor parameters
    n, na, nb, sa, sb, swapped = compute_params(
        args.simd, args.ww, args.aw, args.signed_activations)

    # Generate the compressor core with fused accumulation
    comp_name, comp_path, comp_delay = generate_comp_module(
        target, n, na, nb, sa, sb, args.accu_width,
        args.pipeline_every, args.output_dir, name=args.name)

    dotp_path = None
    if not args.skip_dotp_template:
        template_path = os.path.abspath(args.dotp_template)
        if not os.path.isfile(template_path):
            raise FileNotFoundError(
                f"dotp template not found: {template_path}. Use --dotp-template or --skip-dotp-template."
            )
        dotp_path = os.path.join(args.output_dir, args.dotp_output_name)
        expand_template(
            template_path,
            dotp_path,
            _dotp_comp_substitutions(
                comp_name, args.simd, na, nb, sa, sb, args.accu_width),
        )

    sig = make_signature(n, sa, na, sb, nb)
    print(f"Generated compressor core: {comp_path}")
    if dotp_path is not None:
        print(f"Expanded dotp template: {dotp_path}")
    print(f" Module name: {comp_name}")
    print(f" Configuration: {sig}")
    print(f" Pipeline depth: {comp_delay}")
    print(f" Operands: {'swapped' if swapped else 'not swapped'} (NA={na} >= NB={nb})")


if __name__ == "__main__":
    main()
def evaluation():
    """Generate three design variants per reference shape and run Vivado.

    For every example shape a combinatorial, an accumulating and a
    gate-absorbing compressor are generated (each followed by a 'sandwich'
    wrapper and a TCL script); all TCL scripts are then run in parallel.

    Raises
    ------
    RuntimeError
        If at least one Vivado run fails or times out. All runs are still
        attempted before the error is raised.
    """
    examples = {
        "128": Shape([128]),
        "256": Shape([256]),
        "512": Shape([512]),
        "128,128": Shape([128, 128]),
        "256,256": Shape([256, 256]),
        "512,512": Shape([512, 512]),
        "Int1": Shape([1, 1, 2, 3, 4, 5, 6, 7, 5, 4, 3, 2, 1]),
        "Int2": Shape([1, 1, 1, 3, 5, 7, 9, 11, 13, 10, 8, 6, 4, 2, 1]),
        "Int3": Shape([1, 1, 1, 1, 5, 9, 13, 17, 21, 25, 20, 16, 12, 8, 4]),
        "Int4": Shape([1, 1, 1, 1, 1, 9, 17, 25, 33, 41, 49, 40, 32, 24, 16, 8]),
        "Int5": Shape([1, 1, 1, 1, 1, 1, 17, 33, 49, 65, 81, 97, 80, 64, 48, 32, 16]),
        "LPFP1": Shape([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
        "LPFP2": Shape([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4]),
        "LPFP3": Shape([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8]),
        "LPFP4": Shape([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16]),
        "LPFP5": Shape([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
                        16, 16, 16, 16, 32]),
        "6x32": Shape(32*[6]),
        "10x32": Shape(32*[10]),
        "Mul16": Shape(list(range(1, 17)) + list(reversed(range(1, 16))))
    }

    filenames = []
    for example_name, example_shape in examples.items():
        print(example_name, example_shape)
        # (file suffix, comb_depth, accumulate, gates argument, wrapper uses gates)
        variants = (
            # combinatorial design
            ("_comb", None, False, None, False),
            # accumulating design
            ("_acc", 1, True, None, False),
            # gate inlined design with accumulation
            ("_gate", 1, True,
             [["8" for el in range(col)] for col in example_shape], True),
        )
        for suffix, comb_depth, accumulate, gates, wrap_gates in variants:
            filename = "../gen/" + example_name + suffix + ".sv"
            generate_compressor(
                target=Versal(),
                shape=example_shape,
                name="comp",
                comb_depth=comb_depth,
                accumulate=accumulate,
                accumulator_width=None,
                gates=gates,
                constants=[],
                path=filename,
                test=True
            )
            generate_wrapper(shape=example_shape, pipelined=accumulate,
                             gates=wrap_gates, accumulation=accumulate,
                             filename=filename)
            filenames.append(filename)

    tclfiles = [emit_eval_tcl_script(el) for el in filenames]

    def call_vivado(filename):
        command = f"""cd ../gen/ &&
            ls &&
            source /proj/xbuilds/released/2023.1/2023.1_0508_1/installs/lin64/Vivado/2023.1/settings64.sh &&
            vivado -mode batch -source {filename.split("/")[-1]}"""
        return subprocess.run(command, shell=True, check=True, timeout=3600,
                              text=True, executable="/bin/bash")

    print("Executing evaluation threads")
    failures = []
    with ThreadPoolExecutor(max_workers=15) as executor:
        # executor.map() only surfaces a worker exception when its result is
        # consumed; collect every future explicitly so that failed or
        # timed-out Vivado runs are reported instead of silently dropped.
        futures = [(tcl, executor.submit(call_vivado, tcl)) for tcl in tclfiles]
        for tcl, future in futures:
            try:
                future.result()
            except (subprocess.CalledProcessError,
                    subprocess.TimeoutExpired, OSError) as exc:
                failures.append((tcl, exc))
    print("Done executing evaluation threads")

    if failures:
        for tcl, exc in failures:
            print(f"Vivado run failed for {tcl}: {exc}")
        raise RuntimeError(
            f"{len(failures)} of {len(tclfiles)} Vivado runs failed")


def generate_wrapper(shape, pipelined, gates, accumulation, filename):
    """Append a registered 'sandwich' wrapper module to *filename*.

    The wrapper registers the inputs and the output of a module named
    ``comp``; the input register is fed as a shift register from a 1-bit
    port.

    Parameters
    ----------
    shape : Shape
        Input shape; its sum is the input register width.
    pipelined : bool
        Accepted for call-site symmetry; not used by the wrapper text.
    gates : bool
        Add the second input vector ``in_2``.
    accumulation : bool
        Add the ``en_neg`` and ``rst`` control inputs.
    filename : str
        File the wrapper is appended to.
    """
    iw = sum(shape)
    ow = compressed_width(shape)

    inputs = ["clk", "in"]
    if gates:
        inputs.append("in_2")

    if accumulation:
        inputs.append("en_neg")
        inputs.append("rst")

    input_str = "\tinput " + ", ".join(inputs) + ",\n"
    output_str = f"\toutput logic [{ow-1}:0] outReg"

    # Optional fragments, empty when the corresponding feature is off.
    ctrl_decl = "logic en_negReg, rstReg;" if accumulation else ""
    in_decl_tail = ", in_2Reg;" if gates else ";"
    rst_ff = "rstReg <= rst;" if accumulation else ""
    en_neg_ff = "en_negReg <= en_neg;" if accumulation else ""
    in_2_ff = "in_2Reg <= {in_2Reg, in_2};" if gates else ""
    gate_port = " .in_2(in_2Reg)," if gates else ""
    ctrl_ports = " .en_neg(en_negReg), .rst(rstReg)," if accumulation else ""

    wrapper_str = (
        "module sandwich(\n"
        + input_str
        + output_str
        + '\n);\n'
        + "\n"
        + f"\t{ctrl_decl}\n"
        + f"\tlogic [{iw-1}:0] inReg{in_decl_tail}\n"
        + f"\twire [{ow-1}:0] out;\n"
        + "\t\n"
        + "\talways_ff @ (posedge clk) begin\n"
        + f"\t\t{rst_ff}\n"
        + f"\t\t{en_neg_ff}\n"
        + "\t\tinReg <= {inReg, in};\n"
        + f"\t\t{in_2_ff}\n"
        + "\t\toutReg <= out;\n"
        + "\tend\n"
        + "\t\n"
        + '\t(* keep_hierarchy = "yes" *)\n'
        + f"\tcomp c(.in(inReg), .clk(clk),{gate_port}{ctrl_ports} .out(out));\n"
        + "\n"
        + "endmodule"
    )
    with open(filename, 'a') as f:
        f.write(wrapper_str)


def emit_eval_tcl_script(compressor_path):
    """Write the Vivado evaluation TCL script next to *compressor_path*.

    The script synthesises the 'sandwich' top module and bisects the clock
    period, writing delay, slice and LUT figures to RESULT_<file>.json.

    Parameters
    ----------
    compressor_path : str
        Path of the generated compressor file ('/'-separated).

    Returns
    -------
    str
        Path of the written TCL file (same stem, '.tcl' extension).
    """
    comps = "set comps { " + str(compressor_path.split("/")[-1]) + " }"
    # Raw string: the TCL text contains backslash sequences (\d, \., \|, \{)
    # that must reach the file verbatim and are not valid Python escapes.
    # "puts" and "remove_files" use double quotes / a bare variable so that
    # TCL substitutes $tr and $comp (braces would suppress substitution).
    script = comps + r"""
set PART xcvc1902-vsva2197-2MP-e-S ; # From VCK190 Evaluation Board

foreach comp $comps {
    read_verilog $comp

    # -----------------------------------------------------------------------------
    # Open new file for current module
    set filename_prefix RESULT_
    set filename_suffix ".json"
    set filename $filename_prefix$comp$filename_suffix
    puts $filename
    set outfile [open $filename w]
    puts $outfile "\{"

    set tm 0.7 ; # Minimum possible ime
    set tt 10.0 ; # Time to Test
    set ts 100.0 ; # Successful Time
    set lc 100000 ; # LUT utilization

    # -----------------------------------------------------------------------------
    # Run synthesis
    synth_design -top sandwich -part $PART

    # -----------------------------------------------------------------------------
    # while loop, updating clock
    while {[expr $ts - $tm] > 0.1} {
        puts "NEW SYNTHESIS RUN WITH FREQ $tt"
        create_clock -name CLK -period $tt [get_port clk]

        # -----------------------------------------------------------------------------
        # Place and route
        opt_design -retarget -propconst -sweep ;
        place_design -directive Explore
        report_utilization -file util_$comp.twrA
        route_design -directive Explore
        report_drc
        report_utilization -hierarchical
        report_timing -setup -hold -max_paths 3 -nworst 3 -input_pins -sort_by group -file $comp.twrA
        report_timing_summary -delay_type min_max -path_type full_clock_expanded -report_unconstrained -check_timing_verbose -max_paths 3 -nworst 3 -significant_digits 3 -input_pins -file $comp.twrA

        # -----------------------------------------------------------------------------
        # Find maximum data path delay and slack
        set f [open $comp.twrA r]
        set file_data [read $f]
        close $f
        if {[regexp { +Data Path Delay: +(\d+\.\d+)} $file_data -> value]} {
            set tr $value
        } {
            error "DATA PATH DELAY NOT FOUND"
        }

        # -----------------------------------------------------------------------------
        # Find LUT and Slice utilization
        set f [open util_$comp.twrA r]
        set file_data [read $f]
        close $f
        if {[regexp {CLB LUTs +\| +(\d+)} $file_data -> value]} {
            set lc $value
        } {
            error "LUT UTILIZATION NOT FOUND"
        }

        if {[regexp {SLICE +\| +(\d+)} $file_data -> value]} {
            set sc $value
        } {
            error "SLICE UTILIZATION NOT FOUND"
        }

        # -----------------------------------------------------------------------------
        # Check if timing was met
        if { $tt < $tr } {
            puts "Timing $tr was NOT met!"
            set tm $tt
            if { $tr < $ts } {
                set ts $tr
            }
        } else {
            set ts $tr
        }
        set tt [expr { ($ts + $tm)/2}]
    }

    puts -nonewline $outfile "\"Delay\": $ts,"
    puts -nonewline $outfile "\"Slice\": $sc,"
    puts -nonewline $outfile "\"LUTS\": $lc" ;

    puts $outfile "\}"
    close $outfile
    remove_files $comp
}
q
"""
    # Only swap a trailing ".sv"; any other path gets ".tcl" appended so the
    # script can never be written over the compressor file itself.
    if compressor_path.endswith(".sv"):
        tclpath = compressor_path[:-len(".sv")] + ".tcl"
    else:
        tclpath = compressor_path + ".tcl"
    with open(tclpath, "w") as f:
        f.write(script)
    return tclpath


if __name__ == "__main__":
    evaluation()
class AccumulatorStage(Stage):
    """Stage that adds its input bit matrix onto a registered running sum.

    A final adder built by ``final_adder`` receives, per column, one bit of
    its own registered output (the feedback loop) plus the registered stage
    inputs. The module inputs ``rst`` and ``en_neg`` are delayed and then
    handed to the feedback and input registers as their ``rst`` argument.

    Args:
        shape: Shape of the bit matrix entering the stage.
        final_adder: Callable that builds the final adder for a given shape.
        preceeding_pipeline_stages: Number of register stages in front of
            this one; sets how far ``rst`` and ``en_neg`` are delayed.
        accumulator_width: Explicit accumulator width; derived from
            ``shape`` when falsy.
        enable: When true, an ``en`` module input gates every register.
    """

    def __init__(self, shape: Shape, final_adder, preceeding_pipeline_stages,
                 accumulator_width = None, enable = False):
        super().__init__()
        self.input_shape = shape
        width = self.get_accumulator_width(accumulator_width)
        self.output_shape = Shape([1] * width)
        self.instances = []
        self.input_wires = Bitmatrix(shape)
        self.output_wires = Bitmatrix(self.output_shape)  # TODO: Make Logic
        self.accumulator_width = width
        self.final_adder_gen = final_adder
        self.preceeding_pipeline_stages = preceeding_pipeline_stages
        self.enable = enable
        self.build_hardware()

    def build_hardware(self):
        """Instantiate the adder, control inputs and registers and wire them."""
        # The adder sees the stage inputs plus one feedback bit per column.
        adder = self.final_adder_gen(self.input_shape + self.output_shape)

        en_neg = Wire(desired_name="en_neg")
        en_neg.set_to_module_input()
        rst = Wire(desired_name="rst")
        rst.set_to_module_input()
        self.instances.extend((en_neg, rst))

        # Optional clock enable signal (for finnlib integration).
        en_wire = None
        if self.enable:
            en_wire = Wire(desired_name="en")
            en_wire.set_to_module_input()
            self.instances.append(en_wire)

        # Delayed copies of reset and enable.
        # The rst delay chain is initialised to 1 in enable mode: with
        # en-gating the registers would miss the initial rst=1 pulse if en=0
        # during global reset, so starting at 1 guarantees the accumulator
        # feedback is zeroed from power-up. The current finn(lib) integration
        # hardwires en to '1, which makes this technically redundant, but the
        # FPGA INIT attribute is free and keeps the design robust should en
        # ever be gated.
        stages = self.preceeding_pipeline_stages
        rst_del = self.delay_signal(rst, stages + 1,
                                    en=en_wire,
                                    init=1 if self.enable else None)
        en_neg_del = self.delay_signal(en_neg, stages, en=en_wire)

        # Registered feedback and registered stage inputs.
        feedback = self.delay_signal(adder.output_wires, cycles=1,
                                     rst=rst_del, en=en_wire, init=0)
        registered_in = self.delay_signal(self.input_wires, cycles=1,
                                          rst=en_neg_del, en=en_wire, init=0)

        # Slot 0 of every adder column takes the feedback bit ...
        for fb_col, adder_col in zip(feedback, adder.input_wires):
            fb_col[0].connect_to(adder_col[0])

        # ... and the remaining slots take the registered inputs.
        for in_col, adder_col in zip(registered_in, adder.input_wires):
            for src, dst in zip(in_col, adder_col[1:]):
                src.connect_to(dst)

        # The adder result is also the stage output.
        for out_col, adder_col in zip(self.output_wires, adder.output_wires):
            for sink, driver in zip(out_col, adder_col):
                driver.connect_to(sink)
        self.instances.append(adder)

    def delay_signal(self, signal, /, cycles=1, rst = None, en = None, init = None):
        """Delay ``signal`` by ``cycles`` ``Logic`` registers.

        Iterables are handled element-wise (recursively), returning a list
        with the same nesting. Every register created is appended to
        ``self.instances``. With ``cycles == 0`` the signal itself is
        returned.
        """
        if isinstance(signal, Iterable):
            return [self.delay_signal(member, cycles, rst, en, init)
                    for member in signal]
        tail = signal
        for _ in range(cycles):
            register = Logic(rst=rst, en=en, init=init)
            tail.connect_to(register)
            self.instances.append(register)
            tail = register
        return tail

    def get_accumulator_width(self, input = None):
        """Return ``input`` if truthy, else the bit length of the largest
        value the input shape can represent (column ``idx`` weighs
        ``2**idx``)."""
        if input:
            return input
        weighted_total = 0
        for idx, height in enumerate(self.input_shape):
            weighted_total += height << idx
        return weighted_total.bit_length()

    def accept(self, visitor):
        """Dispatch to the visitor's accumulator-stage handler."""
        visitor.visit_accumulator_stage(self)
class AbsorbingFA(GateAbsorptionCounter):
    """Full adder over three absorbed two-input gates.

    Two LUT6 instances are generated: one for the sum and one for the carry
    of the three gate results. Gate ``k`` reads ``input_wires[0][k]`` as its
    first operand and ``input_wires_complementary[0][k]`` as its second.

    Args:
        gates: Three hexadecimal gate specification strings (see
            ``gate_string_to_pred``).
    """

    def __init__(self, gates):
        self.gates = [gate_string_to_pred(gate) for gate in gates]
        super().__init__(Shape([3]), Shape([1,1]))

    def build_hardware(self):
        """Create both LUTs and connect gate operands and outputs."""
        lut_sum = LUT6.fromPred(
            lambda I0,I1,I2,I3,I4,I5: fa_sum(
                self.gates[0](I0,I1),
                self.gates[1](I2,I3),
                self.gates[2](I4,I5)))

        lut_carry = LUT6.fromPred(
            lambda I0,I1,I2,I3,I4,I5: fa_carry(
                self.gates[0](I0,I1),
                self.gates[1](I2,I3),
                self.gates[2](I4,I5)))

        # Iterate the LUTs directly; zip() over a single list would yield
        # 1-tuples, which have no I0..I5 ports.
        for lut in (lut_sum, lut_carry):
            # The declared input shape is [3], so gate k lives at index k of
            # both wire sets (same indexing as the other gate absorbers in
            # this file). Even LUT pins carry the first operand, odd pins
            # the complementary one.
            self.input_wires[0][0].connect_to(lut.I0)
            self.input_wires[0][1].connect_to(lut.I2)
            self.input_wires[0][2].connect_to(lut.I4)
            self.input_wires_complementary[0][0].connect_to(lut.I1)
            self.input_wires_complementary[0][1].connect_to(lut.I3)
            self.input_wires_complementary[0][2].connect_to(lut.I5)

        # Drivers connect to sinks: LUT outputs feed the counter outputs.
        lut_sum.O.connect_to(self.output_wires[0][0])
        lut_carry.O.connect_to(self.output_wires[1][0])
        self.instances += [lut_sum, lut_carry]
class VersalPredAdder(GateAbsorptionCounter):
    """Multi-column gate-absorbing adder built from chained LUT6CY cells.

    Every column absorbs two gates and adds their results to the carry
    arriving on ``I4``. ``O51`` drives the column's output bit and ``O52``
    feeds ``I4`` of the next column; the last ``O52`` is the top output bit.

    Args:
        gates: One ``[gate_a, gate_b]`` pair of hexadecimal gate
            specification strings per column.
    """

    def __init__(self, gates: List[List[str]]):
        self.gates = [[gate_string_to_pred(el) for el in col] for col in gates]
        super().__init__(Shape(len(self.gates) * [2]),
                         Shape((len(self.gates)+1) * [1]))

    @staticmethod
    def _column_preds(p1, p2):
        """Return the (sum, carry) predicates for one column.

        Building the lambdas in their own scope pins ``p1``/``p2`` to this
        column. Lambdas written directly in the loop body would look the
        names up at call time and could see a later column's gates if the
        predicates are evaluated after the loop has advanced. The
        six-argument signature is kept unchanged.
        """
        return (
            lambda A0,A1,A2,A3,A4,A5: fa_sum(p1(A0,A1), p2(A2,A3), A4),    # s
            lambda A0,A1,A2,A3,A4,A5: fa_carry(p1(A0,A1), p2(A2,A3), A4),  # c
        )

    def build_hardware(self):
        """Create one LUT6CY per column and wire the ripple chain."""
        luts = []
        for i in range(len(self.gates)):
            pred_sum, pred_carry = self._column_preds(self.gates[i][0],
                                                      self.gates[i][1])
            lut = LUT6CY.fromPred(pred_sum, pred_carry)

            # First operands on I0/I2, complementary operands on I1/I3.
            self.input_wires[i][0].connect_to(lut.I0)
            self.input_wires[i][1].connect_to(lut.I2)
            self.input_wires_complementary[i][0].connect_to(lut.I1)
            self.input_wires_complementary[i][1].connect_to(lut.I3)

            lut.O51.connect_to(self.output_wires[i][0])
            luts.append(lut)

        # NOTE(review): I4 of the first LUT is left unconnected here, whereas
        # the 7-Series variant ties it to constant 0 - confirm this is intended.
        for p, n in zip(luts, luts[1:]):
            p.O52.connect_to(n.I4)
        luts[-1].O52.connect_to(self.output_wires[len(luts)][0])
        self.instances += luts
class RippleSumPredAdder(GateAbsorptionCounter):
    """Single-column gate-absorbing ripple-sum counter built from LUT6CY cells.

    Each LUT absorbs up to two gates of column 0. ``O52`` of one LUT feeds
    ``I4`` of the next, and the last ``O52`` is the column-0 output bit.
    Each LUT's ``O51`` becomes one output bit of column 1.

    Args:
        gates: Hexadecimal gate specification strings for column 0.
    """

    def __init__(self, gates):
        self.gates = [gate_string_to_pred(gate) for gate in gates]
        super().__init__(Shape([len(gates)]), Shape([1, (len(gates)+1)//2]))

    @staticmethod
    def _pair_preds(p1, p2):
        """Return the (carry, sum) predicates for one LUT.

        A dedicated scope pins ``p1``/``p2`` to this LUT. Lambdas created
        directly in the loop would resolve the names at call time and could
        pick up a later iteration's gates if evaluated lazily. The
        six-argument signature is kept unchanged.
        """
        return (
            lambda A0,A1,A2,A3,A4,A5:
                fa_carry(p1(A0,A1), p2(A2,A3), A4),  # c
            lambda A0,A1,A2,A3,A4,A5:
                fa_sum(p1(A0,A1), p2(A2,A3), A4),    # s
        )

    def build_hardware(self):
        """Create the LUT chain and connect gate operands and outputs."""
        luts = []
        for i in range((len(self.gates) + 1) // 2):
            p1 = self.gates[2*i]
            # With an odd gate count the last LUT's second gate is constant 0.
            p2 = (self.gates[2*i+1] if len(self.gates) > 2*i+1
                  else lambda A0,A1: False)
            pred_carry, pred_sum = self._pair_preds(p1, p2)
            luts.append(LUT6CY.fromPred(pred_carry, pred_sum))

        # The running sum ripples through the chain via O52 -> I4.
        for p, n in zip(luts, luts[1:]):
            p.O52.connect_to(n.I4)

        # Even gates use I0/I1 of their LUT, odd gates I2/I3. The first
        # operand comes from input_wires, the second from the complementary set.
        for i, (w1, w2) in enumerate(zip(self.input_wires[0],
                                         self.input_wires_complementary[0])):
            if i % 2 == 0:
                w1.connect_to(luts[i//2].I0)
                w2.connect_to(luts[i//2].I1)
            else:
                w1.connect_to(luts[i//2].I2)
                w2.connect_to(luts[i//2].I3)

        luts[-1].O52.connect_to(self.output_wires[0][0])
        for i, lut in enumerate(luts):
            lut.O51.connect_to(self.output_wires[1][i])
        self.instances += luts
class SinglePred(GateAbsorptionCounter):
    """Counter that maps a single absorbed gate onto one LUT2.

    Args:
        gate: Hexadecimal gate specification string.
    """

    def __init__(self, gate):
        self.gate = gate_string_to_pred(gate)
        super().__init__(Shape([1]), Shape([1]))

    def build_hardware(self):
        """Instantiate the LUT2 and hook up both operands and the result."""
        gate_lut = LUT2.fromPred(self.gate)

        first_operand = self.input_wires[0][0]
        second_operand = self.input_wires_complementary[0][0]
        first_operand.connect_to(gate_lut.I0)
        second_operand.connect_to(gate_lut.I1)

        result = self.output_wires[0][0]
        gate_lut.O.connect_to(result)
        self.instances.append(gate_lut)
class FixedShapeCounterCandidate(CounterCandidate):
    """Candidate for a counter whose input/output shapes never change.

    Args:
        counter: Zero-argument callable (usually the counter class) that
            builds the counter.
        counter_inputs: Input shape consumed by the counter.
        counter_outputs: Output shape produced by the counter.
    """

    def __init__(self, counter, counter_inputs: Shape,
                 counter_outputs: Shape) -> None:
        self.counter = counter
        self.counter_inputs = counter_inputs
        self.counter_outputs = counter_outputs

    def _column_fits(self, col, inputs: Shape, outputs: Shape,
                     compression_goal) -> bool:
        """Check that column ``col`` has enough bits and is not overshot.

        After placing the counter, the column's remaining plus newly
        produced bits may fall at most one below ``compression_goal(col)``.
        """
        needed = self.counter_inputs[col]
        if needed > inputs[col]:
            return False
        remaining = (inputs[col] + outputs[col] - needed
                     + self.counter_outputs[col])
        return remaining - compression_goal(col) >= -1

    def extend_to_fit(self, inputs: Shape, outputs: Shape,
                      compression_goal) -> Counter:
        """Return a fresh counter if it fits every input column, else ``None``.

        Args:
            inputs: Bits still available per column.
            outputs: Bits already produced per column.
            compression_goal: Callable mapping a column index to its target
                height.
        """
        fits_everywhere = all(
            self._column_fits(col, inputs, outputs, compression_goal)
            for col in range(len(self.counter_inputs)))
        if not fits_everywhere:
            return None
        return self.counter()
class FACandidate(FixedShapeCounterCandidate):
    """Fixed-shape candidate wrapping the ``FA`` full-adder counter."""

    def __init__(self):
        # Two throw-away FA instances are used only to read their shapes.
        fa_inputs = FA().input_shape
        fa_outputs = FA().output_shape
        super().__init__(FA, fa_inputs, fa_outputs)
class FiveTwo(Counter):
    """Counter with input shape ``[5, 2]`` and output shape ``[1, 2, 1]``.

    Built from two LUT6_2 cells. Both combine two operands on ``A3``/``A4``
    with a three-input reduction of ``A0..A2``: the first LUT uses the sum
    of that reduction, the second its carry.
    """

    def __init__(self):
        super().__init__(Shape([5, 2]), Shape([1, 2, 1]))

    def build_hardware(self):
        """Create both LUTs and connect them to the counter's wires."""
        def make_lut(reduce3, name):
            # reduce3 is FA_sum or FA_carry applied to A0..A2.
            return LUT6_2.fromPred(
                lambda A0,A1,A2,A3,A4,_: FA_sum(  A3, A4, reduce3(A0, A1, A2)),
                lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, reduce3(A0, A1, A2)),
                name
            )

        lut1 = make_lut(FA_sum, "FiveTwo_1")
        lut2 = make_lut(FA_carry, "FiveTwo_2")

        # First LUT: all five bits of column 0.
        lut1_ports = (lut1.I0, lut1.I1, lut1.I2, lut1.I3, lut1.I4)
        for k, port in enumerate(lut1_ports):
            self.input_wires[0][k].connect_to(port)
        lut1.O5.connect_to(self.output_wires[0][0])
        lut1.O6.connect_to(self.output_wires[1][0])

        # Second LUT: the first three bits of column 0 again, then both
        # bits of column 1 on I3/I4.
        lut2_sources = (
            (0, 0, lut2.I0),
            (0, 1, lut2.I1),
            (0, 2, lut2.I2),
            (1, 0, lut2.I3),
            (1, 1, lut2.I4),
        )
        for col, row, port in lut2_sources:
            self.input_wires[col][row].connect_to(port)
        lut2.O5.connect_to(self.output_wires[1][1])
        lut2.O6.connect_to(self.output_wires[2][0])

        self.instances += (lut1, lut2)
super(DualRailRippleSum, self).__init__(Shape([4*w+1, w+1]), + Shape([1, w+1, w])) + + @property + def width(self): return self._width + + def build_hardware(self): + luts_top = [] + luts_btm = [] + + cascade_top = self.input_wires[0][0] + cascade_btm = self.input_wires[1][0] + + for i in range(0, self._width): + lut_top = LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, + FA_sum(A0, A1, A2)), + lambda A0,A1,A2,A3,A4,_: FA_sum (A3, A4, + FA_sum(A0, A1, A2)), + "dual_rail_top" + ) + lut_btm = LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_carry(A3, A4, + FA_carry(A0, A1, A2)), + lambda A0,A1,A2,A3,A4,_: FA_sum (A3, A4, + FA_carry(A0, A1, A2)), + "dual_rail_btm" + ) + + self.input_wires[0][1+4*i].connect_to(lut_top.I0) + self.input_wires[0][2+4*i].connect_to(lut_top.I1) + self.input_wires[0][3+4*i].connect_to(lut_top.I2) + self.input_wires[0][4+4*i].connect_to(lut_top.I3) + cascade_top.connect_to(lut_top.I4) + lut_top.O51.connect_to(self.output_wires[1][i+1]) + cascade_top = lut_top.O52 + + self.input_wires[0][1+4*i].connect_to(lut_btm.I0) + self.input_wires[0][2+4*i].connect_to(lut_btm.I1) + self.input_wires[0][3+4*i].connect_to(lut_btm.I2) + self.input_wires[1][1+i].connect_to(lut_btm.I3) + cascade_btm.connect_to(lut_btm.I4) + lut_btm.O51.connect_to(self.output_wires[2][i]) + cascade_btm = lut_btm.O52 + + luts_top.append(lut_top) + luts_btm.append(lut_btm) + + if i == self._width - 1: + lut_top.O52.connect_to(self.output_wires[0][0]) + lut_btm.O52.connect_to(self.output_wires[1][0]) + + self.instances += luts_top + luts_btm + +class DualRailRippleSumCandidate(CounterCandidate): + def extend_to_fit(self, inputs: Shape, outputs: Shape, + compression_goal) -> Counter: + max_height_0 = min(MAX_CASCADE_LENGTH, + (inputs[0]-1)//4, + (inputs[0]+outputs[0]-compression_goal(0)+1)//4 + ) if inputs[0] >= 5 else 0 + + max_height_1 = min(MAX_CASCADE_LENGTH, + inputs[1]-1 + ) if inputs[1] >= 2 else 0 + max_height = min(max_height_0, max_height_1, MAX_CASCADE_LENGTH) 
class SixThree(Counter):
    """(6:3) counter: three LUT6 cells emit the binary count of six bits."""

    def __init__(self):
        super().__init__(Shape([6]), Shape([1, 1, 1]))

    def build_hardware(self):
        """Create one LUT6 per result bit, all sharing the six inputs."""
        def count_bit(mask):
            # Predicate that is true when the selected bit of the input
            # popcount is set. The factory pins mask to each LUT.
            return (lambda A0,A1,A2,A3,A4,A5:
                    bool(sum([A0,A1,A2,A3,A4,A5]) & mask))

        specs = ((1, "sixthree_first"),
                 (2, "sixthree_second"),
                 (4, "sixthree_third"))
        luts = tuple(LUT6.fromPred(count_bit(mask), name)
                     for mask, name in specs)

        for lut in luts:
            for pin, port in enumerate(lut.in_ports[:6]):
                self.input_wires[0][pin].connect_to(port)

        for column, lut in enumerate(luts):
            lut.out_ports[0].connect_to(self.output_wires[column][0])
        self.instances += luts
class VersalAtom14:
    """Versal cascade atom taking four bits in one column and one in the next.

    Attributes set in ``__init__``:
        shape: ``Shape([4, 1])``.
        width: 2.
        output_width: 2.
    """

    def __init__(self):
        self.shape = Shape([4,1])
        self.width = 2
        self.output_width = 2

    def build_luts(self):
        """Return the atom's two LUT6CY cells as a tuple ``(first, second)``.

        Both cells add ``A3`` and ``A4`` to a three-input reduction of
        ``A0..A2``: the first uses the reduction's sum, the second its
        carry. The first predicate of each cell is the sum of that
        addition, the second its carry.
        """
        def make_lut(reduce3, name):
            return LUT6CY.fromPred(
                lambda A0,A1,A2,A3,A4,_: FA_sum(  reduce3(A0,A1,A2),A3,A4),
                lambda A0,A1,A2,A3,A4,_: FA_carry(reduce3(A0,A1,A2),A3,A4),
                name
            )

        first = make_lut(FA_sum, "atom14_first")
        second = make_lut(FA_carry, "atom14_second")
        return (first, second)
self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0) + self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1) + carry.connect_to(luts[lut_idx].I4) + carry = luts[lut_idx].O52 + + luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0]) + lut_idx += 1 + io_idx += 1 + elif isinstance(atom, VersalAtom222): + self.input_wires[io_idx][0].connect_to(luts[lut_idx].I2) + self.input_wires[io_idx][1].connect_to(luts[lut_idx].I3) + self.input_wires[io_idx+1][0].connect_to(luts[lut_idx].I0) + self.input_wires[io_idx+1][1].connect_to(luts[lut_idx].I1) + carry.connect_to(luts[lut_idx].I4) + carry = luts[lut_idx].O52 + + # second lut + self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I2) + self.input_wires[io_idx+1][1].connect_to(luts[lut_idx+1].I3) + self.input_wires[io_idx+2][0].connect_to(luts[lut_idx+1].I0) + self.input_wires[io_idx+2][1].connect_to(luts[lut_idx+1].I1) + carry.connect_to(luts[lut_idx+1].I4) + carry = luts[lut_idx+1].O52 + + luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0]) + luts[lut_idx].O52.connect_to(self.output_wires[io_idx+1][0]) + luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+2][0]) + lut_idx += 2 + io_idx += 3 + elif isinstance(atom, VersalAtom14): + # first lut + self.input_wires[io_idx][0].connect_to(luts[lut_idx].I0) + self.input_wires[io_idx][1].connect_to(luts[lut_idx].I1) + self.input_wires[io_idx][2].connect_to(luts[lut_idx].I2) + self.input_wires[io_idx][3].connect_to(luts[lut_idx].I3) + carry.connect_to(luts[lut_idx].I4) + carry = luts[lut_idx].O52 + + # second lut + self.input_wires[io_idx][0].connect_to(luts[lut_idx+1].I0) + self.input_wires[io_idx][1].connect_to(luts[lut_idx+1].I1) + self.input_wires[io_idx][2].connect_to(luts[lut_idx+1].I2) + self.input_wires[io_idx+1][0].connect_to(luts[lut_idx+1].I3) + carry.connect_to(luts[lut_idx+1].I4) + carry = luts[lut_idx+1].O52 + + luts[lut_idx].O51.connect_to(self.output_wires[io_idx][0]) + luts[lut_idx+1].O51.connect_to(self.output_wires[io_idx+1][0]) + + 
class VersalAtomCascadeCandidate(CounterCandidate):
    """Greedy builder for a cascade of Versal atoms up to four LUTs long."""

    def extend_to_fit(self, inputs: Shape, outputs: Shape,
                      compression_goal) -> Counter:
        """Return the longest fitting ``VersalAtomCascade`` or ``None``.

        At every cascade position the atoms are tried in the order
        ``VersalAtom14``, ``VersalAtom222``, ``VersalAtom2``, and exactly one
        of them is placed. The first atom needs one extra bit in its first
        column for the cascade's carry-in. Two-LUT atoms are only considered
        while at least two LUT slots remain.

        Args:
            inputs: Bits still available per column.
            outputs: Bits already produced per column.
            compression_goal: Callable mapping a column index to its target
                height.
        """
        def fits_col(idx, height):
            return (height <= inputs[idx] and
                    inputs[idx] + outputs[idx] - height
                    + 1 - compression_goal(idx) >= -1)

        atoms = []
        io_idx = 0    # next unused input column
        atom_idx = 0  # LUT slots used so far (at most 4)
        while atom_idx < 4:
            # Only the very first atom also consumes the carry-in bit.
            carry_in = 1 if atom_idx == 0 else 0
            two_luts_left = atom_idx < 3

            # Exactly one atom per iteration. The previous code used a
            # plain `if` for the second test in the first-atom branch, so
            # after placing an Atom14 it fell through and re-evaluated the
            # remaining alternatives with first-column thresholds.
            if (two_luts_left and fits_col(io_idx, 4 + carry_in)
                    and fits_col(io_idx+1, 1)):
                atoms.append(VersalAtom14())
                atom_idx += 2
                io_idx += 2
            elif (two_luts_left and fits_col(io_idx, 2 + carry_in)
                    and fits_col(io_idx+1, 2) and fits_col(io_idx+2, 2)):
                atoms.append(VersalAtom222())
                atom_idx += 2
                io_idx += 3
            elif fits_col(io_idx, 2 + carry_in):
                atoms.append(VersalAtom2())
                atom_idx += 1
                io_idx += 1
            else:
                break

        if atoms:
            return VersalAtomCascade(atoms)
        return None
class MuxCYAtom14:
    """7-Series (1,4) atom: four bits of one column plus one of the next.

    Attributes set in ``__init__``:
        shape: ``Shape([4, 1])``.
        width: 2 (number of LUTs / carry-chain positions).
    """

    def __init__(self):
        self.shape = Shape([4,1])
        self.width = 2

    def build_luts(self):
        """Return the two LUT6_2 cells feeding the carry chain.

        Background from the original author's notes (Preusser FPL 2017,
        VHDL reference atom14.vhdl): the CARRY4 computes
        ``CO = S ? CI : DI`` and ``O = S ^ CI``. O6 supplies the propagate
        signal S as an XOR, and O5 supplies DI by passing the A3 input
        straight through - it is deliberately NOT an AND with the
        sum/carry term. The quoted reference INIT values are
        lo: O6=0x6996, O5=0xFF00 and hi: O6=0x17E8, O5=0xFF00.

        Position 0 reduces A0..A2 with the full-adder sum, position 1 with
        the full-adder carry. Per the same notes, A3 is x0[3] at position 0
        and x1 at position 1.
        """
        def make_lut(reduce3, name):
            return LUT6_2.fromPred(
                lambda A0,A1,A2,A3,A4,_: A3,                      # O5 -> DI
                lambda A0,A1,A2,A3,A4,_: reduce3(A0,A1,A2) ^ A3,  # O6 -> S
                name
            )

        position_0 = make_lut(FA_sum, "atom14_0")
        position_1 = make_lut(FA_carry, "atom14_1")
        return (position_0, position_1)
+ lut = LUT6_2.fromPred( + lambda A0,A1,A2,A3,A4,_: A1, # O5 -> DI = higher-weight bit + lambda A0,A1,A2,A3,A4,_: A0 ^ A1, # O6 -> S (propagate) + "atom2" + ) + return (lut,) + +class MuxCYAtomCascade(Counter): + def __init__(self, atoms): + self._atoms = atoms + + in_shape = [el for atom in atoms for el in atom.shape] + in_shape[0] += 1 + in_shape = Shape(in_shape) + + out_shape = Shape([1 for _ + in range(sum([atom.width for atom in atoms]) + 1)]) + super().__init__(in_shape, out_shape) + + def build_hardware(self): + luts = [] + for atom in self._atoms: + luts += atom.build_luts() + muxcy = CARRY4() + + if not luts: + return + + # Connect inputs + idx = 0 + self.input_wires[0][self._atoms[0].shape[0]].connect_to(muxcy.CI) + + for atom in self._atoms: + if isinstance(atom, MuxCYAtom2): + self.input_wires[idx][0].connect_to(luts[idx].I0) + self.input_wires[idx][1].connect_to(luts[idx].I1) + idx += 1 + elif isinstance(atom, MuxCYAtom14): + # first lut + self.input_wires[idx][0].connect_to(luts[idx].I0) + self.input_wires[idx][1].connect_to(luts[idx].I1) + self.input_wires[idx][2].connect_to(luts[idx].I2) + self.input_wires[idx][3].connect_to(luts[idx].I3) + + # second lut + self.input_wires[idx][0].connect_to(luts[idx+1].I0) + self.input_wires[idx][1].connect_to(luts[idx+1].I1) + self.input_wires[idx][2].connect_to(luts[idx+1].I2) + self.input_wires[idx+1][0].connect_to(luts[idx+1].I3) + idx += 2 + elif isinstance(atom, MuxCYAtom06): + # First LUT (atom06_lo): uses all 6 inputs for XOR + self.input_wires[idx][0].connect_to(luts[idx].I0) + self.input_wires[idx][1].connect_to(luts[idx].I1) + self.input_wires[idx][2].connect_to(luts[idx].I2) + self.input_wires[idx][3].connect_to(luts[idx].I3) + self.input_wires[idx][4].connect_to(luts[idx].I4) + self.input_wires[idx][5].connect_to(luts[idx].I5) + + # Second LUT (atom06_hi): uses inputs 0-4 for carry propagation + # BUGFIX: was connecting to luts[idx] instead of luts[idx+1] + 
class MuxCYAtomCascadeCandidate(CounterCandidate):
    """Candidate that tries to fill four carry-chain positions with atoms."""

    def extend_to_fit(self, inputs: Shape, outputs: Shape,
                      compression_goal) -> Counter:
        """Greedily pick atoms for chain positions 0..3.

        At each position the largest atom that fits is chosen
        (MuxCYAtom06, then MuxCYAtom14, then MuxCYAtom2). Position 0 needs
        one more input bit than the later positions for every atom kind
        (per the original comment, the extra bit is the carry-in). Position
        3 can only take a MuxCYAtom2.

        Returns a MuxCYAtomCascade when exactly four positions were filled,
        otherwise None.
        """
        def fits_col(idx, height):
            # The column must hold at least `height` bits ...
            if height > inputs[idx]:
                return False
            # ... and consuming them must not undershoot the goal by more
            # than one bit.
            left_over = inputs[idx] + outputs[idx] - height + 1
            return left_over - compression_goal(idx) >= -1

        atoms = []
        pos = 0
        while pos < 4:
            if pos == 3:
                # Only a single-position atom is tried at the last position.
                if not fits_col(pos, 2):
                    break
                atoms.append(MuxCYAtom2())
                pos += 1
                continue

            # Position 0 requires one extra input bit for each atom kind.
            extra = 1 if pos == 0 else 0
            if fits_col(pos, 6 + extra):
                atoms.append(MuxCYAtom06())
                pos += 2
            elif fits_col(pos, 4 + extra) and fits_col(pos + 1, 1):
                atoms.append(MuxCYAtom14())
                pos += 2
            elif fits_col(pos, 2 + extra):
                atoms.append(MuxCYAtom2())
                pos += 1
            else:
                break

        if pos == 4:
            return MuxCYAtomCascade(atoms)
        return None
def FA_sum(a, b, c):
    """Return the sum bit of a full adder: the XOR of the three inputs."""
    partial = a ^ b
    return partial ^ c


def FA_carry(a, b, c):
    """Return the carry of a full adder: truthy when two inputs are truthy.

    Built from ``and``/``or`` short-circuiting, so the value returned is the
    deciding operand itself (a bool when all inputs are bools).
    """
    if a and b:
        return b
    if a and c:
        return c
    return b and c


def ceildiv(a, b):
    """Return ``a / b`` rounded up, using floor division of the negation."""
    negated_floor = a // -b
    return -negated_floor


def try_connect(func):
    """Call ``func()`` and ignore an IndexError raised by it.

    Used for best-effort wiring: a connection whose source index does not
    exist is silently skipped. Any other exception propagates.
    """
    try:
        func()
    except IndexError:
        return
self.input_wires[i][2].connect_to(luts_top[-1].I4)) + else: + luts_top.append(LUT5.fromPred( + lambda A0,A1,A2,A3,A4: FA_carry(A2, A3, A4) + )) + try_connect(lambda: + self.input_wires[i-1][0].connect_to( + luts_top[-1].I0)) + try_connect(lambda: + self.input_wires[i-1][1].connect_to( + luts_top[-1].I1)) + try_connect(lambda: self.input_wires[i][0].connect_to( + luts_top[-1].I2)) + try_connect(lambda: self.input_wires[i][1].connect_to( + luts_top[-1].I3)) + try_connect(lambda: self.input_wires[i][2].connect_to( + luts_top[-1].I4)) + + for idx, (left, right) in enumerate(zip(luts_top[0::2], + luts_top[1::2])): + left.annotate(f"HLUTNM = final_adder_{idx}") + right.annotate(f"HLUTNM = final_adder_{idx}") + + try_connect(lambda: + self.input_wires[0][3].connect_to(luts_chain[0].I3)) + try_connect(lambda: + self.input_wires[0][4].connect_to(luts_chain[0].I4)) + for i, el in enumerate(luts_chain): + try_connect(lambda: self.input_wires[i][0].connect_to(el.I0)) + try_connect(lambda: self.input_wires[i][1].connect_to(el.I1)) + try_connect(lambda: self.input_wires[i][2].connect_to(el.I2)) + el.PROP.connect_to(l8s[i//8].p_in_ports[i%8]) + el.O51.connect_to(self.output_wires[i][0]) + el.O52.connect_to(l8s[i//8].c_in_ports[i%8+1]) + + for lb, lt in zip(luts_chain[1:], luts_top): + lt.O.connect_to(lb.I3) + + # connect carry-ins between lookahead modules + for prev, next in zip(l8s, l8s[1:]): + prev.COUTH.connect_to(next.CIN) + + # cascade + for i in range(1, len(luts_chain)): + if i % 2 == 0: + l8s[(i-1)//8].out_ports[((i-1)%8)//2].connect_to( + luts_chain[i].I4) + else: + luts_chain[i-1].O52.connect_to(luts_chain[i].I4) + + if len(luts_chain) % 2 == 0: + l8s[(len(luts_chain)-1)//8].out_ports[len(luts_chain)%8//2-1]\ + .connect_to(self.output_wires[len(luts_chain)][0]) + else: + luts_chain[-1].O52.connect_to( + self.output_wires[len(luts_chain)][0]) + self.instances += luts_chain + luts_top + l8s + +class QuaternaryAdder(FinalAdder): + @staticmethod + def 
compression_goal(col): return 5 if col <= 1 else 4 + + def __init__(self, input_shape: Shape): + output_shape = Shape([1 for _ in range(len(input_shape) + 2)]) + super().__init__(input_shape, output_shape) + + def build_hardware(self): + ## Find the limit up to which the quaternary adder is needed. + # We construct a two-input adder after this. + height_4_until = len(self.input_wires) + tail_length = 0 + for idx, col in reversed(list(enumerate(self.input_wires))): + if len(col) > 2: + break + else: + height_4_until = idx + tail_length += 1 + + # If tail_length==1, the quaternary adder must not be reduced, + # as there would be no savings. + if (tail_length == 1): + height_4_until += 1 + tail_length = 0 + + # Construct necessary hardware + luts_top: List[LUT6CY] = [] + luts_btm: List[LUT6CY] = [] + + for i in range(0, height_4_until): + luts_top.append( + LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_sum( + FA_sum(A0, A1, A2), A3, A4), # S + lambda A0,A1,A2,A3,A4,_: FA_carry( + FA_sum(A0, A1, A2), A3, A4), # ct + "final_adder_top" + ) + ) + luts_btm.append( + LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_sum( + FA_carry(A0, A1, A2), A3, A4), # out + lambda A0,A1,A2,A3,A4,_: FA_carry( + FA_carry(A0, A1, A2), A3, A4), #cb + "final_adder_btm" + ) + ) + if (tail_length): + luts_top.append( + LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_sum(A0, A1, A4), # out + lambda A0,A1,A2,A3,A4,_: FA_carry(A0, A1, A4), # c_btm + "final_adder_top_end" + ) + ) + luts_btm.append( + LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: FA_sum(FA_sum(A0, A1, False), + A3, A4), # out + lambda A0,A1,A2,A3,A4,_: FA_carry(FA_sum(A0, A1, False), + A3, A4), # c_btm + "final_adder_btm_start_two_input_chain" + ) + ) + for i in range(tail_length-1): + luts_btm.append( + LUT6CY.fromPred( + lambda A0,A1,A2,A3,A4,_: + FA_sum(FA_carry(A0, A1, False), + FA_sum(A2, A3, False), A4), # out + lambda A0,A1,A2,A3,A4,_: + FA_carry(FA_carry(A0, A1, False), + FA_sum(A2, A3, False), A4), # cb + 
"final_adder_btm_two_input_chain" + ) + ) + + + l8s_top = [] + l8s_btm = [] + for _ in range(ceildiv(len(luts_top), 8)): + l8s_top.append(LOOKAHEAD8()) + for _ in range(ceildiv(len(luts_btm), 8)): + l8s_btm.append(LOOKAHEAD8()) + + # Collect relevant input and output signals + for i in range(len(luts_top)): + luts_top[i].O52.connect_to(l8s_top[i//8].c_in_ports[i%8+1]) + luts_top[i].PROP.connect_to(l8s_top[i//8].p_in_ports[i%8]) + + for i in range(len(luts_btm)): + luts_btm[i].O52.connect_to(l8s_btm[i//8].c_in_ports[i%8+1]) + luts_btm[i].PROP.connect_to(l8s_btm[i//8].p_in_ports[i%8]) + + carries_top = [] + carries_btm = [] + for i in range(0, len(luts_top)): + if i % 2 == 0: + carries_top.append(luts_top[i].O52) + if i % 2 == 1: + carries_top.append(l8s_top[i//8].out_ports[i%8//2]) + for i in range(0, len(luts_btm)): + if i % 2 == 0: + carries_btm.append(luts_btm[i].O52) + if i % 2 == 1: + carries_btm.append(l8s_btm[i//8].out_ports[i%8//2]) + + for i in range(0, len(luts_top)-1): + carries_top[i].connect_to(luts_top[i+1].I4) + for i in range(0, len(luts_btm)-1): + carries_btm[i].connect_to(luts_btm[i+1].I4) + + # connect carry-ins between lookahead modules + def chain_l8(l8s): + for prev, next in zip(l8s, l8s[1:]): + prev.COUTH.connect_to(next.CIN) + + chain_l8(l8s_top) + chain_l8(l8s_btm) + + # connect carry-in to first lut and lookahead module + try_connect(lambda: self.input_wires[0][4].connect_to(luts_top[0].I4)) + try_connect(lambda: self.input_wires[0][4].connect_to(l8s_top[0].CIN)) + + try_connect(lambda: self.input_wires[1][4].connect_to(luts_btm[0].I4)) + try_connect(lambda: self.input_wires[1][4].connect_to(l8s_btm[0].CIN)) + + # downwards connection + for t, d in zip(luts_top[1:], luts_btm): + t.O51.connect_to(d.I3) + last_top = len(carries_top)-1 + carries_top[last_top].connect_to(luts_btm[last_top].I3) + + for idx, (lb, lt) in enumerate(zip(luts_btm, + luts_top[:height_4_until])): + for el in [lb, lt]: + try_connect(lambda: 
class MuxCYTernaryAdder(FinalAdder):
    """Final adder built from LUT6_2 cells driving chained CARRY4 blocks.

    One LUT is created per input column plus one extra. Each LUT's O6 drives
    the S input of its carry-chain position, and its O5 drives both the I3
    and the DI of the next position. The output shape is one bit per column
    for ``len(input_shape) + 2`` columns.
    """

    @staticmethod
    def compression_goal(col):
        """Return the compression goal for ``col``: 5 for column 0, else 3."""
        return 5 if col == 0 else 3

    def __init__(self, input_shape: Shape):
        # The base Counter constructor stores input_shape on the instance
        # before it calls build_hardware(), so nothing has to be kept here.
        output_shape = Shape([1 for _ in range(len(input_shape) + 2)])
        super().__init__(input_shape, output_shape)

    def build_hardware(self):
        """Instantiate the LUTs and CARRY4 blocks and wire them up."""
        n_positions = len(self.input_shape) + 1

        # O5 = carry of the first three inputs, O6 = their sum XOR I3.
        luts = [LUT6_2.fromPred(
            lambda A0, A1, A2, A3, A4, A5: FA_carry(A0, A1, A2),
            lambda A0, A1, A2, A3, A4, A5: FA_sum(A0, A1, A2) ^ A3
        ) for _ in range(n_positions)]
        c4s = [CARRY4() for _ in range(0, n_positions, 4)]

        # Flatten the per-CARRY4 port vectors into one list per port kind so
        # they can be indexed by chain position.
        dis = [el for c4 in c4s for el in c4.DI.elements]
        ss = [el for c4 in c4s for el in c4.S.elements]
        outs = [el for c4 in c4s for el in c4.O.elements]
        cos = [el for c4 in c4s for el in c4.CO.elements]

        # Chain the CARRY4 blocks: last carry-out feeds the next carry-in.
        for c4p, c4n in zip(c4s, c4s[1:]):
            c4p.CO.elements[-1].connect_to(c4n.CI)

        # Connect inputs. Only as many LUTs as there are input columns get
        # column bits; rows missing from a column are skipped by try_connect.
        for idx, lut in enumerate(luts[:len(self.input_wires)]):
            try_connect(lambda idx=idx, lut=lut:
                        self.input_wires[idx][0].connect_to(lut.I0))
            try_connect(lambda idx=idx, lut=lut:
                        self.input_wires[idx][1].connect_to(lut.I1))
            try_connect(lambda idx=idx, lut=lut:
                        self.input_wires[idx][2].connect_to(lut.I2))

        # Column 0 may carry two additional bits: row 3 feeds the first LUT's
        # I3 and the first DI, row 4 feeds the chain's carry-in.
        try_connect(lambda: self.input_wires[0][3].connect_to(luts[0].I3))
        try_connect(lambda: self.input_wires[0][3].connect_to(dis[0]))
        try_connect(lambda: self.input_wires[0][4].connect_to(c4s[0].CI))

        # Second carry connection: each LUT's O5 feeds the next position.
        for prev_lut, next_lut, next_di in zip(luts, luts[1:], dis[1:]):
            prev_lut.O5.connect_to(next_lut.I3)
            prev_lut.O5.connect_to(next_di)

        # Connect outputs.
        for lut, s in zip(luts, ss):
            lut.O6.connect_to(s)

        for idx, o in enumerate(outs[:len(luts)]):
            o.connect_to(self.output_wires[idx][0])

        # The carry-out of the last used position is the top output bit.
        cos[len(luts) - 1].connect_to(self.output_wires[len(luts)][0])
        self.instances += luts + c4s
class Connectable(Node):
    """Graph node that can drive other nodes and be driven by one node.

    ``target`` lists every node this one drives (in connection order);
    ``source`` is the node driving this one, or None when undriven.
    """
    target: list[Connectable]
    source: Connectable

    def __init__(self):
        self.source = None
        self.target = []

    def connect_to(self, target):
        """Register ``target`` as driven by this node.

        The target's ``source`` is overwritten, so the most recent driver
        wins if a node is connected more than once.
        """
        assert isinstance(target, Connectable), (
            "Target has to be of type Connectible!")
        target.source = self
        self.target.append(target)

    @property
    def has_target(self):
        """True when this node drives at least one other node."""
        return len(self.target) > 0

    @property
    def has_source(self):
        """True when some node drives this one."""
        return not (self.source is None)
class Counter(Module):
    """Module with a bit-matrix input shape and output shape.

    The constructor creates one Wire per input and output bit, registers
    them as instances and then calls ``build_hardware()``, which subclasses
    implement to add and connect their primitives.
    """

    def __init__(self, input_shape: Shape, output_shape: Shape):
        super().__init__()
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.input_wires = self._build_wires(input_shape)
        self.output_wires = self._build_wires(output_shape)
        # Inputs first, then outputs, ahead of anything build_hardware adds.
        self.instances += [*self.inputs, *self.outputs]

        self.build_hardware()

    def accept(self, visitor) -> None:
        visitor.visit_counter(self)

    @abstractmethod
    def build_hardware(self):
        """Create and wire the primitives of this counter."""
        pass

    def _build_wires(self, shape: Shape):
        """Return a tuple of columns, each a tuple of fresh Wires."""
        return tuple(
            tuple(Wire() for _ in range(col_height))
            for col_height in shape
        )

    @property
    def inputs(self):
        """All input wires, flattened column by column."""
        flat = []
        for col in self.input_wires:
            flat.extend(col)
        return flat

    @property
    def outputs(self):
        """All output wires, flattened column by column."""
        flat = []
        for col in self.output_wires:
            flat.extend(col)
        return flat

    @property
    def luts(self) -> List[LUT]:
        """The instances of this counter that are LUTs."""
        return [inst for inst in self.instances if isinstance(inst, LUT)]

    @property
    def efficiency(self) -> float:
        """Bits removed (inputs minus outputs) per unit of LUT size.

        Returns 0 for a counter that has no LUTs and removes no bits.
        """
        luts = self.luts
        removed_bits = sum(self.input_shape) - sum(self.output_shape)
        if not luts and removed_bits == 0:
            return 0
        return removed_bits / sum(lut.size for lut in luts)

    @property
    def strength(self) -> float:
        """Ratio of total input bits to total output bits."""
        total_in = sum(self.input_shape)
        total_out = sum(self.output_shape)
        return total_in / total_out
class PipelineStage(Stage):
    """Stage that places one Logic element between each input/output bit.

    Input and output are separate Bitmatrix objects of the same shape; every
    input element drives a Logic, which in turn drives the matching output
    element. The Logic elements are collected in ``instances``.
    """

    def __init__(self, shape: Shape):
        self.input_shape = shape
        self.output_shape = shape
        self.input_wires = Bitmatrix(shape)
        self.output_wires = Bitmatrix(shape)

        stage_logic = []
        for in_col, out_col in zip(self.input_wires, self.output_wires):
            for src, dst in zip(in_col, out_col):
                element = Logic()
                src.connect_to(element)
                element.connect_to(dst)
                stage_logic.append(element)
        self.instances = stage_logic

    def accept(self, visitor) -> None:
        visitor.visit_pipeline_stage(self)
class Compressor(Node):
    """Top-level node: a named, ordered list of stages plus its I/O list."""

    def __init__(self, name):
        self.module_name = name
        self.stages = []
        self.io = []

    @property
    def input_shape(self):
        """Input shape of the first stage."""
        first_stage = self.stages[0]
        return first_stage.input_shape

    @property
    def output_shape(self):
        """Output shape of the last stage."""
        last_stage = self.stages[-1]
        return last_stage.output_shape

    @property
    def delay(self):
        """Count one unit per PipelineStage and one per AccumulatorStage.

        The two checks are independent, so a stage matching both classes
        contributes two.
        """
        if not self.stages:
            return 0
        # Imported lazily, only once there is a stage to inspect.
        from .accumulator import AccumulatorStage
        return sum(
            int(isinstance(stage, PipelineStage))
            + int(isinstance(stage, AccumulatorStage))
            for stage in self.stages
        )

    def accept(self, visitor) -> None:
        visitor.visit_compressor(self)
class LUT(Blackbox):
    """Base class for LUT blackboxes.

    Stores ``desired_name`` and ``size`` on the instance and forwards the
    INIT string to the Blackbox constructor as the ``INIT`` parameter.
    """

    @abstractmethod
    def __init__(self, module_name, init_code: str,
                 in_ports: Tuple[BlackboxInput],
                 out_ports: Tuple[BlackboxOutput],
                 *,
                 size, desired_name = "lut"):
        self.size = size
        self.desired_name = desired_name
        parameters = {"INIT": init_code}
        super().__init__(module_name, in_ports, out_ports, parameters)
def split_lut_from_pred(predO5, predO6):
    """Build the 64-bit INIT string of a dual-output LUT from two predicates.

    Both predicates are evaluated only for addresses 32..63, i.e. with the
    sixth input (bit 5) True. The ``predO5`` results fill INIT bits 0..31 and
    the ``predO6`` results fill INIT bits 32..63. Each predicate receives the
    six address bits as booleans, least significant first.

    Returns a Verilog literal such as ``64'h6666_6666_cccc_cccc``.
    """
    init = 0
    for low_bit in range(32):
        address = low_bit + 32
        pins = [bool(address & (1 << pin)) for pin in range(6)]
        init |= int(predO5(*pins)) << low_bit
        init |= int(predO6(*pins)) << address
    value = c_uint64(init).value
    return f"64'h{value:_x}"
class Visitor(ABC):
    """Base class for passes that traverse the compressor graph.

    Graph nodes dispatch to these hooks from their ``accept`` methods. Every
    hook raises NotImplementedError by default, so a concrete visitor only
    has to override the hooks for the nodes it actually meets.
    """

    def visit_compressor(self, c: Compressor): raise NotImplementedError

    def visit_input_stage(self, s: InputStage): raise NotImplementedError

    def visit_gate_absorption_stage(self, s: GateAbsorbedStage):
        # Historical hook name; kept so existing overrides keep working.
        raise NotImplementedError

    def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
        # This is the name GateAbsorbedStage.accept() actually calls. It
        # falls back to the historical hook so that a visitor overriding
        # either name is dispatched correctly.
        return self.visit_gate_absorption_stage(s)

    def visit_pipeline_stage(self, s: PipelineStage): raise NotImplementedError

    def visit_compression_stage(self, s: CompressionStage): raise NotImplementedError

    def visit_counter(self, c: Counter): raise NotImplementedError

    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
        raise NotImplementedError

    def visit_blackbox(self, b: Blackbox): raise NotImplementedError

    def visit_blackbox_input(self, b: BlackboxInput): raise NotImplementedError

    def visit_blackbox_output(self, b: BlackboxOutput): raise NotImplementedError

    def visit_blackbox_input_vec(self, b: BlackboxInputVec): raise NotImplementedError

    def visit_blackbox_output_vec(self, b: BlackboxOutputVec): raise NotImplementedError

    # Called by Wire.accept(); left unannotated because Wire is not imported
    # by this module.
    def visit_wire(self, w): raise NotImplementedError

    def visit_logic(self, lgc: Logic): raise NotImplementedError

    def visit_bitmatrix(self, b: Bitmatrix): raise NotImplementedError
def parse_cli():
    """Parse the command line and generate the requested compressor.

    Reads the arguments from ``sys.argv``, converts them into the values
    expected by ``generate_compressor`` and calls it. Invalid shape, gate or
    constant specifications print a message and terminate the process with
    exit status -1.
    """
    # Local import: ``exit`` is only injected by the ``site`` module, whereas
    # ``sys.exit`` is always available.
    import sys

    parser = argparse.ArgumentParser(
        prog="Compressor Generator",
        description="Generate a Compressor Tree for a given input."
    )
    parser.add_argument('-o', '--output', default="../gen/out.sv",
                        help="Path to store the compressor at.")
    parser.add_argument('-s', '--shape', required=True, help="Input shape.")
    parser.add_argument('-a', '--accumulate', action='store_true',
                        help="Enable accumulation.")
    parser.add_argument('-w', '--accumulator_width',
                        help="Accumulator width [default: Reduced input shape].")
    parser.add_argument('-g', '--gates', default=None,
                        help="Inline 2-input gates into the compressor. LSB is left. "
                             "Example: 8,3")
    parser.add_argument('-t', '--target', default="Versal",
                        help="Target FPGA generation.", choices=["Versal", "7-Series",
                                                                 "UltraScale"])
    parser.add_argument('--test', action="store_true",
                        help="Test the generated compressor using Vivado XSim.")
    parser.add_argument('-n', '--name', default="comp",
                        help="Name of the generated Systemverilog module.")
    parser.add_argument('-p', '--pipeline_every', default=None,
                        help="Insert Pipeline registers every n stages. Default: "
                             "Purely combinatorial.")
    parser.add_argument('-c', '--constant', default=[], help="Add a constant binary "
                        "number input. Example: 1011")
    args = parser.parse_args()

    try:
        shape = Shape(int(el) for el in args.shape.split(','))
    except (ValueError, TypeError):
        print("Improperly defined shape.")
        sys.exit(-1)

    # Split the flat gate string into one list per column of the shape.
    gates = []
    if args.gates:
        # Explicit check instead of ``assert`` so validation survives ``-O``.
        if len(args.gates) != sum(shape):
            print("Length of shape and gate specification do not match.")
            sys.exit(-1)
        offset = 0
        for col in shape:
            gates.append(list(args.gates[offset:offset + col]))
            offset += col

    # Each character of the constant is one binary digit.
    constants = []
    for char in args.constant:
        try:
            constants.append(int(char, 2))
        except ValueError:
            print("Improperly defined constant.")
            sys.exit(-1)

    if args.target == "Versal":
        target = Versal()
    elif args.target == "7-Series":
        target = SevenSeries()
    elif args.target == "UltraScale":
        target = UltraScale()
    else:
        raise ValueError("Target not currently supported.")

    generate_compressor(
        target,
        shape,
        args.name,
        int(args.pipeline_every) if args.pipeline_every else None,
        args.accumulate,
        int(args.accumulator_width) if args.accumulator_width else None,
        gates,
        constants,
        args.output,
        args.test
    )
def generate_compressor(
        target: Target,
        shape: Shape,
        name: str,
        comb_depth: Optional[int],
        accumulate: bool,
        accumulator_width: Optional[int],
        gates: List[List[str]],
        constants: List[int],  # Each element is a binary number digit.
        path: str,
        test: bool,
        enable: bool = False):
    """Build a compressor, write its SystemVerilog to ``path`` and return its delay.

    The compressor is constructed from the target's counter candidates and
    final adder, then run through the placement, wire-insertion,
    IO-annotation, cost-estimation and emission passes. The emitted module is
    written to ``path`` behind a comment header summarising the configuration.

    Args:
        target: Supplies ``counter_candidates``,
            ``absorbing_counter_candidates`` and ``final_adder``.
        shape: Input shape of the compressor.
        name: Name handed to the constructor for the generated module.
        comb_depth: Passed to the constructor as ``comb_depth``; ``None``
            disables pipelining.
        accumulate: Whether an accumulator stage is generated.
        accumulator_width: Optional accumulator width.
        gates: Per-column gate specification; empty for none.
        constants: Binary digits of a constant to add.
        path: Output file for the generated module.
        test: When true, also write ``../gen/test.sv`` and run the tester.
        enable: Forwarded to the constructor as ``enable``.

    Returns:
        ``c.delay`` of the constructed compressor.
    """
    start_time = time.time()
    constructor = CompressorConstructor()
    c = constructor(target.counter_candidates,
                    target.absorbing_counter_candidates,
                    target.final_adder,
                    shape,
                    name,
                    comb_depth=comb_depth,
                    accumulate=accumulate,
                    accumulator_width=accumulator_width,
                    constants=constants,
                    gates=gates,
                    enable=enable)

    placer = LUTPlacer()
    c.accept(placer)

    wire_inserter = WireInserter()
    c.accept(wire_inserter)

    annotator = IOAnnotator()
    c.accept(annotator)

    cost = CostEstimator()
    c.accept(cost)

    emitter = VerilogGenerator()
    c.accept(emitter)

    accumulation = "yes" if accumulate else "no"
    width_note = f"of width {accumulator_width}" if accumulator_width else ""
    enable_note = ("yes (init values set on accumulator registers)"
                   if enable else "no")
    # The whitespace in front of the closing quotes is part of the header text.
    header = f"""// Adder generated by the Python Compressor Generator
// Input shape: {c.input_shape}; Output Shape: {c.output_shape}
// Pipeline stages: {c.delay}
// Target Generation: {target.__class__.__name__}
// Approximate LUTs: {int(cost.luts+0.5)}
// Accumulation: {accumulation} {width_note}
// Enable mode: {enable_note}
// Gates: {gates if gates else "None"}
    """
    with open(path, 'w') as f:
        f.write(header + emitter.emitter.output)

    end_time = time.time()
    # Elapsed time is end minus start; the reverse order printed a negative value.
    print("--%s seconds" % (end_time - start_time))

    c.accept(CompressorPrinter())

    if test:
        constant = int("".join(str(bit) for bit in constants), 2) if constants else 0
        # NOTE(review): the testbench is generated for the literal "comp"
        # rather than ``name`` - confirm this is intended for other names.
        testbench = generate_test(shape, "comp", c.delay, gates, accumulate,
                                  accumulator_width, constant)
        with open("../gen/test.sv", 'w') as f:
            f.write(testbench)
        tester("../gen/test.sv", path)

    return c.delay
class CompressorConstructor:
    """Builds a ``Compressor`` graph stage by stage for a given input shape."""

    def adjust_compression_goal_for_constants(self, compression_goal, constants):
        """Return a per-column goal that leaves room for the constant bits.

        The constant at column ``x`` (0 when ``x`` is beyond ``constants``) is
        subtracted from ``compression_goal(x)``; the result never drops below 2.
        """
        # Subtract constants, but never go below 2 (minimum achievable by compressor)
        return lambda x: max(2, compression_goal(x) -
                             (constants[x] if x < len(constants) else 0))

    def get_compression_goal(self, final_adder, accumulate, constants):
        """Return the constant-adjusted compression goal of ``final_adder``.

        ``accumulate`` is accepted but not used here; the accumulator headroom
        is handled by a separate post-check in ``__call__``.
        """
        # Two-pass strategy for accumulate: compress to goal, add constants, then post-check
        compression_goal = final_adder.compression_goal
        return self.adjust_compression_goal_for_constants(compression_goal, constants)

    def add_constants_to_stage(self, s: CompressionStage, constants):
        """Append a ``ConstantOne`` counter at every column whose constant bit is set."""
        for idx, el in enumerate(constants):
            if el:
                s.append_counter(ConstantOne(), idx)

    def __call__(self,
                 counter_candidates,
                 absorption_counter_candidates,
                 final_adder,
                 input_shape: Shape,
                 name: str,
                 comb_depth: int = None,
                 accumulate=False,
                 accumulator_width: int = None,
                 constants: Tuple[bool] = tuple(),
                 gates: Tuple[Tuple[str]] = tuple(),
                 enable: bool = False
                 ) -> Compressor:
        """Construct and return the compressor.

        Stages are added in this order: input stage, optional gate-absorption
        stage, compression stages until the goal is met, constants, extra
        compression for accumulation, optional pipelining, and finally either
        an accumulator stage or a final-adder stage. All neighbouring stages
        are connected before returning.
        """
        compression_goal = self.get_compression_goal(final_adder, accumulate, constants)

        c = Compressor(name)
        c.stages.append(InputStage(input_shape, gates))

        if gates:
            s = self.construct_absorption_stage(c.stages[-1].output_shape, gates,
                                                absorption_counter_candidates)
            c.stages[-1].connect_to(s)
            c.stages.append(s)

        # CRITICAL: This loop can hang if compression_goal is unreachable.
        # add_compression_stage cannot compress height-1 or height-2 columns
        # (requires >= 3), so the goal must be achievable under that constraint.
        while not self.compression_goal_reached(c.stages[-1].output_shape,
                                                compression_goal):
            self.add_compression_stage(c, compression_goal, counter_candidates)

        # Add constants to the graph. Constants need a CompressionStage to live in.
        if not isinstance(c.stages[-1], CompressionStage) and constants:
            self.add_compression_stage(c, compression_goal, counter_candidates)
        self.add_constants_to_stage(c.stages[-1], constants)

        # After constants, check if we need additional compression for accumulator mode.
        # The ternary adder receives: compressor_output + feedback (height 1).
        # If any column exceeds final_adder capacity, we need more compression.
        if accumulate:
            def post_const_goal(x):
                # Leave room for feedback (height 1) within ternary adder capacity
                return max(2, final_adder.compression_goal(x) - 1)

            while not self.compression_goal_reached(c.stages[-1].output_shape,
                                                    post_const_goal):
                self.add_compression_stage(c, post_const_goal, counter_candidates)

        if comb_depth:
            pipeliner = CompressorPipeliner()
            pipeline_stages = pipeliner.pipeline(c, comb_depth)
        else:
            pipeline_stages = 0

        if accumulate:
            acc = AccumulatorStage(c.stages[-1].output_shape, final_adder,
                                   pipeline_stages,
                                   accumulator_width=accumulator_width,
                                   enable=enable)
            c.stages.append(acc)
        elif max(c.stages[-1].output_shape) > 1:
            final_stage = CompressionStage()
            final_stage.append_counter(final_adder(c.stages[-1].output_shape), 0)
            c.stages.append(final_stage)

        for s_p, s_n in zip(c.stages, c.stages[1:]):
            s_p.connect_to(s_n)
        return c

    def add_compression_stage(self, compressor: Compressor, compression_goal,
                              counter_candidates):
        """Add a compression stage.

        Cannot compress columns with height < 3 (Full Adder = 3:2). Inputs
        that are not consumed by a counter are passed through unchanged.
        """
        new_stage = CompressionStage()
        stage_inputs = compressor.stages[-1].output_shape
        stage_outputs = Shape()

        i = 0
        while i < max(len(stage_inputs), len(stage_outputs)):
            # The helpers read the current bindings of stage_inputs,
            # stage_outputs and i, which are rebound in the loop below.
            def cur_output_height():
                return (stage_inputs + stage_outputs)[i]

            def cur_input_height():
                return stage_inputs[i] if len(stage_inputs) > i else 0

            while cur_input_height() >= 3 and cur_output_height() > compression_goal(i):
                counter = self.schedule_counter(stage_inputs[i:],
                                                stage_outputs[i:],
                                                lambda x: compression_goal(x+i),
                                                counter_candidates)
                stage_inputs = stage_inputs - (counter.input_shape << i)
                stage_outputs = stage_outputs + (counter.output_shape << i)
                new_stage.append_counter(counter, i)
            i += 1

        # pass through all leftover inputs:
        for col in range(len(stage_inputs)):
            for _ in range(stage_inputs[col]):
                new_stage.append_counter(Passthrough(), col)

        compressor.stages.append(new_stage)

    def schedule_counter(self, stage_inputs, stage_outputs, compression_goal,
                         counter_candidates) -> Counter:
        """Return the best fitting counter by ``(efficiency, strength)``.

        Raises:
            ValueError: If no candidate can be extended to fit.
        """
        fitted = [counter_candid.extend_to_fit(stage_inputs, stage_outputs,
                                               compression_goal)
                  for counter_candid in counter_candidates]
        viable = [counter for counter in fitted if counter is not None]
        if not viable:
            raise ValueError(f"Could not schedule counter for input shape "
                             f"{stage_inputs}; output shape {stage_outputs}; "
                             f"compression goal {compression_goal(0)}")
        return max(viable, key=lambda x: (x.efficiency, x.strength))

    def compression_goal_reached(self, shape, compression_goal):
        """Return True when every column height is within its goal."""
        return all(col <= compression_goal(idx)
                   for idx, col in enumerate(shape))

    def get_best_inlined_counter(self, input_shape, gates, absorption_counters):
        """Return the best gate-absorbing counter by ``(efficiency, strength)``.

        Raises:
            ValueError: If no absorption counter fits the shape and gates.
        """
        candidates = []
        for counter in absorption_counters:
            candidate = counter.extend_to_fit(input_shape, gates)
            if candidate:
                candidates.append(candidate)
        if not candidates:
            raise ValueError(f"Could not find a gate absorption counter for "
                             f"input shape {input_shape}; gates {gates}")
        return max(candidates, key=lambda x: (x.efficiency, x.strength))

    def construct_absorption_stage(self,
                                   input_shape: Shape,
                                   gates: List[str],
                                   absorption_counters: GateAbsorptionCounterCandidate
                                   ):
        """Cover every input bit with gate-absorbing counters and return the stage."""
        s = GateAbsorbedStage()
        cur_shape = input_shape
        cur_gates = gates[:]  # shallow copy; entries are replaced, not mutated
        for idx in range(len(input_shape)):
            while cur_shape[idx] > 0:
                best = self.get_best_inlined_counter(
                    cur_shape[idx:], cur_gates[idx:], absorption_counters)
                cur_shape = cur_shape - (best.input_shape << idx)
                # Keep only the last cur_shape[i] gates of every column.
                for i in range(len(cur_shape)):
                    new = list(reversed(list(reversed(cur_gates[i]))[:cur_shape[i]]))
                    cur_gates[i] = new
                s.append_counter(best, idx)
        return s
class CompressorPipeliner:
    """Inserts pipeline stages into a compressor's stage list."""

    def pipeline(self, c: Compressor, max_combinational_depth: int):
        """Insert ``PipelineStage`` objects and return how many were added.

        A pipeline stage follows a compression stage once the number of
        compression stages since the previous pipeline stage reaches
        ``max_combinational_depth``, or reaches one less than that on the
        last stage of the compressor. The stage list is replaced and all
        neighbouring stages are connected.
        """
        last_idx = len(c.stages) - 1
        depth = 0
        inserted = 0
        rebuilt = []

        for idx, stage in enumerate(c.stages):
            rebuilt.append(stage)
            if not isinstance(stage, CompressionStage):
                continue

            depth += 1
            limit_hit = depth >= max_combinational_depth
            near_limit_at_end = (depth >= max_combinational_depth - 1
                                 and idx == last_idx)
            if limit_hit or near_limit_at_end:
                rebuilt.append(PipelineStage(stage.output_shape))
                depth = 0
                inserted += 1

        c.stages = rebuilt

        for prev_stage, next_stage in zip(c.stages, c.stages[1:]):
            prev_stage.connect_to(next_stage)

        return inserted
class CostEstimator(NodeIterator):
    """Counts stages and estimates the LUT usage of a compressor.

    After the pass, ``combinatorial_stages``, ``pipeline_stages`` and ``luts``
    hold the totals for the visited compressor.
    """

    def iter_compressor(self, c: Compressor):
        self.combinatorial_stages = -1  # Start with -1 to exclude final adder
        self.pipeline_stages = 0
        self.luts = 0

    def iter_compression_stage(self, s: CompressionStage):
        self.combinatorial_stages += 1

    def iter_gate_absorbed_stage(self, g: GateAbsorbedStage):
        self.combinatorial_stages += 1

    def iter_pipeline_stage(self, p: PipelineStage):
        self.pipeline_stages += 1

    def iter_blackbox(self, b: Blackbox):
        """Add the LUT cost of ``b``: 0.5 for LUT5/LUT2, 1 for LUT6/LUT6CY/LUT6_2.

        Raises:
            RuntimeError: For any other ``LUT`` instance.
        """
        if isinstance(b, (LUT5, LUT2)):
            self.luts += 0.5
        elif isinstance(b, (LUT6, LUT6CY, LUT6_2)):
            self.luts += 1
        elif isinstance(b, LUT):
            raise RuntimeError(
                f"No cost function implemented for this LUT type {b}")
class VerilogEmitter:
    """Accumulates emitted text in memory, indenting with one tab per level."""

    def __init__(self):
        self._out = StringIO()
        self._indent_level = 0
        # True when the next write starts a new line and must be indented.
        self._line_start = True

    def emit(self, line=""):
        """Write ``line`` without a trailing newline."""
        if self._line_start:
            self._out.write(self._indent_level * "\t")
            self._line_start = False
        self._out.write(line)

    def emitln(self, line=""):
        """Write ``line`` followed by a newline."""
        if self._line_start:
            self._out.write(self._indent_level * "\t")
        self._out.write(line + "\n")
        self._line_start = True

    @property
    @contextmanager
    def indent(self):
        """Context manager that raises the indent level by one inside its body."""
        try:
            self._indent_level += 1
            yield None
        finally:
            self._indent_level -= 1

    @property
    def output(self):
        """Everything emitted so far, as one string."""
        return self._out.getvalue()

    def save_verilog(self, filename):
        """Write everything emitted so far to ``filename``."""
        # getvalue() returns the whole buffer. Iterating the StringIO would
        # start at the current (end-of-buffer) position and yield nothing.
        with open(filename, "w") as f:
            f.write(self._out.getvalue())
class VerilogGenerator(Visitor):
    """Visitor that emits SystemVerilog for a compressor graph.

    After ``visit_compressor`` has run, the generated text is available as
    ``self.emitter.output``.
    """

    def set_name(self, o: object, name):
        """Force the emitted name of ``o``."""
        self._names[type(o)][o] = name

    def get_name(self, o: object):
        """Return the emitted name of ``o``, allocating one on first use.

        Names are kept per concrete type. A wire's ``desired_name`` is used
        when it is still free; otherwise a message is printed and a generated
        ``wire_<n>`` name is used.

        Raises:
            NotImplementedError: If ``o`` is of an unsupported type.
        """
        if isinstance(o, BlackboxPort):
            return o.name

        if o in self._names[type(o)]:
            return self._names[type(o)][o]

        subdict = self._names[type(o)]

        if isinstance(o, Logic):
            subdict[o] = f"logic_{len(subdict)}"
        elif isinstance(o, Wire):
            desired = o.desired_name
            if desired and desired not in subdict.values():
                subdict[o] = desired
            else:
                if desired:
                    print(f"Could not obey desired name: {desired}")
                # A name must always be stored, otherwise the final lookup
                # below raises KeyError.
                subdict[o] = f"wire_{len(subdict)}"
        elif isinstance(o, Bitmatrix):
            subdict[o] = f"bitmatrix_{len(subdict)}"
        elif isinstance(o, BitmatrixElement):
            bitmatrix = o.vector
            return self.get_name(bitmatrix) + f"[{o.lin_idx}]"
        elif isinstance(o, Constant):
            return o.value
        elif isinstance(o, Blackbox):
            subdict[o] = f"{o.module_name.lower()}_{len(subdict)}"
        else:
            raise NotImplementedError(f"get_name cannot handle this type {type(o)}")
        return subdict[o]

    def visit_compressor(self, c: Compressor):
        """Emit the module header, every stage and ``endmodule``."""
        self.emitter = VerilogEmitter()
        self._declared_hardware = set()
        self._emitted_hardware = set()
        self._names = defaultdict(lambda: {})

        self.set_name(c.stages[0].input_wires, "in")
        if hasattr(c.stages[0], "input_wires_complementary"):
            self.set_name(c.stages[0].input_wires_complementary, "in_2")
        self.set_name(c.stages[-1].output_wires, "out")

        self.emitter.emitln(f"module {c.module_name}(")
        with self.emitter.indent:
            port_decls = ["input clk"]
            for el in c.io:
                if isinstance(el, Logic):
                    kind = "logic "
                elif isinstance(el, Bitmatrix):
                    kind = f"[{el.total_size()-1}:0] "
                else:
                    kind = ""
                port_decls.append(el.prefix + kind + self.get_name(el))
            # Stable sort: inputs first, otherwise keep the order of c.io.
            names = sorted(port_decls, key=lambda x: "input" not in x)
            # Ports are declared by the header, not again in the body.
            self._declared_hardware.update(c.io)

            self.emitter.emitln(",\n\t".join(names))
        self.emitter.emitln(");")

        with self.emitter.indent:
            for stage in c.stages:
                stage.accept(self)
        self.emitter.emitln("endmodule")

    def visit_input_stage(self, s: InputStage):
        s.input_wires.accept(self)
        if hasattr(s, "input_wires_complementary"):
            s.input_wires_complementary.accept(self)
        # Visit output_wires if they're separate from input_wires (trivial passthrough case)
        if s.output_wires is not s.input_wires:
            s.output_wires.accept(self)

    def visit_accumulator_stage(self, a: AccumulatorStage):
        self.emitter.emitln()
        self.emitter.emitln("// Accumulator Stage")
        a.input_wires.accept(self)
        # Connectables first, so they are declared before the other instances.
        for el in sorted(a.instances, key=lambda x: (not isinstance(x, Connectable))):
            el.accept(self)
        a.output_wires.accept(self)

    def visit_pipeline_stage(self, s: PipelineStage):
        self.emitter.emitln()
        self.emitter.emitln("// Pipeline Results..")
        s.input_wires.accept(self)
        for el in s.instances:
            el.accept(self)
        s.output_wires.accept(self)

    def visit_compression_stage(self, s: CompressionStage):
        self.emitter.emitln()
        self.emitter.emitln(f"// Compression Stage with Input Shape: {s.input_shape} "
                            f"and Output Shape {s.output_shape}")
        s.input_wires.accept(self)
        for counter, _ in s.counters_with_shifts:
            counter.accept(self)
        s.output_wires.accept(self)
        self.emitter.emitln()

    def visit_gate_absorbed_stage(self, g: GateAbsorbedStage):
        self.emitter.emitln()
        self.emitter.emitln("// Compression Stage with Gate Absorption.")
        self.emitter.emitln(f"// Input Shape: {g.input_shape} "
                            f"and Output Shape: {g.output_shape}")
        g.input_wires.accept(self)
        g.input_wires_complementary.accept(self)
        for counter, _ in g.counters_with_shifts:
            counter.accept(self)
        g.output_wires.accept(self)
        self.emitter.emitln()

    def visit_counter(self, c: Counter):
        for col in c.input_wires:
            for el in col:
                el.accept(self)
        for col in c.output_wires:
            for el in col:
                el.accept(self)
        # Connectables first, so they are declared before the other instances.
        for el in sorted(c.instances, key=lambda x: not isinstance(x, Connectable)):
            el.accept(self)

    # NOTE(review): the parameter is a gate-absorption counter; the annotation
    # names the stage type, presumably because the counter type is not
    # imported by this module.
    def visit_gate_absorption_counter(self, c: GateAbsorbedStage):
        for col in c.input_wires_complementary:
            for el in col:
                el.accept(self)
        self.visit_counter(c)

    def visit_wire(self, w: Wire):
        """Declare ``w`` once and emit its ``assign`` when it has a plain source."""
        if w in self._emitted_hardware:
            return

        if w not in self._declared_hardware:
            self.emitter.emitln(f"uwire {self.get_name(w)};")
            self._declared_hardware.add(w)

        # NOTE(review): this tests the boolean ``w.has_source`` for membership;
        # ``w.source not in self._declared_hardware`` may have been intended.
        # Kept as is, because changing it alters the emission order.
        if w.has_source not in self._declared_hardware and isinstance(w.source, Wire):
            w.source.accept(self)

        # Blackbox ports drive wires through the port list, not through assign.
        if (w.has_source and isinstance(w.source, Connectable) and
                not isinstance(w.source, BlackboxPort) and
                not isinstance(w.source, BlackboxVecElement)):
            self.emitter.emitln(
                f"assign {self.get_name(w)} = {self.get_name(w.source)};")
        self._emitted_hardware.add(w)

    def visit_logic(self, lgc: Logic):
        """Declare the register ``lgc`` once and emit its ``always_ff`` block."""
        if lgc in self._emitted_hardware:
            return

        if lgc not in self._declared_hardware:
            self.emitter.emit(lgc.prefix)
            init_str = f" = 1'b{lgc.init}" if lgc.init is not None else ""
            self.emitter.emitln(
                f'(* srl_style = "register" *) logic {self.get_name(lgc)}{init_str};')
            self._declared_hardware.add(lgc)

        # NOTE(review): same boolean-membership test as in visit_wire.
        if (lgc.has_source not in self._declared_hardware and
                isinstance(lgc.source, Wire)):
            lgc.source.accept(self)

        def emit_inner():
            if lgc.source:
                self.emitter.emitln(
                    f"{self.get_name(lgc)} <= {self.get_name(lgc.source)};")

        def emit_with_en():
            if lgc.en:
                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
                with self.emitter.indent:
                    emit_inner()
                self.emitter.emitln("end")
            else:
                emit_inner()

        def emit_with_rst_and_en():
            if lgc.rst and lgc.en:
                # En-gated rst: preserve state during stalls
                self.emitter.emitln(f"if ({self.get_name(lgc.en)}) begin")
                with self.emitter.indent:
                    self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
                    with self.emitter.indent:
                        self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
                    self.emitter.emitln("end else begin")
                    with self.emitter.indent:
                        emit_inner()
                    self.emitter.emitln("end")
                self.emitter.emitln("end")
            elif lgc.rst:
                self.emitter.emitln(f"if ({self.get_name(lgc.rst)}) begin")
                with self.emitter.indent:
                    self.emitter.emitln(f"{self.get_name(lgc)} <= 1'b0;")
                self.emitter.emitln("end else begin")
                with self.emitter.indent:
                    emit_inner()
                self.emitter.emitln("end")
            else:
                emit_with_en()

        self.emitter.emitln("always_ff @(posedge clk) begin")
        with self.emitter.indent:
            emit_with_rst_and_en()
        self.emitter.emitln("end")
        self._emitted_hardware.add(lgc)

    def visit_blackbox(self, b: Blackbox):
        """Emit the instantiation of ``b`` with annotations, parameters and ports."""
        if b.annotations:
            self.emitter.emitln(f"(* {', '.join(b.annotations)} *)")
        self.emitter.emitln(f"{b.module_name} #(")
        with self.emitter.indent:
            for idx, (key, value) in enumerate(b.parameters.items()):
                ending = "," if idx != len(b.parameters)-1 else ""
                self.emitter.emitln(f".{key}({value}){ending}")
        self.emitter.emitln(f") {self.get_name(b)} (")
        with self.emitter.indent:
            ports = b.out_ports + b.in_ports
            for idx, port in enumerate(ports):
                ending = "," if idx != len(ports)-1 else ""
                port.accept(self)
                self.emitter.emitln(ending)
        self.emitter.emitln(");")

    def visit_blackbox_output(self, b: BlackboxOutput):
        if b.has_target:
            self.emitter.emit(f".{b.name}({self.get_name(b.target)})")
        else:
            self.emitter.emit(f".{b.name}()")

    def visit_blackbox_output_vec(self, b: BlackboxOutputVec):
        self.emitter.emit(f".{b.name}(")
        self.emitter.emit("{")
        # Elements are emitted in reverse order; elements without a target
        # are left out.
        targets = [self.get_name(el.target) for el in b.elements[::-1] if el.target]
        self.emitter.emit(", ".join(targets))
        self.emitter.emit("})")

    def visit_blackbox_input(self, b: BlackboxInput):
        if b.has_source:
            self.emitter.emit(f".{b.name}({self.get_name(b.source)})")
        else:
            self.emitter.emit(f".{b.name}(1'b0)")

    def visit_blackbox_input_vec(self, b: BlackboxInputVec):
        self.emitter.emit(f".{b.name}(")
        self.emitter.emit("{")
        # Elements are emitted in reverse order; missing sources are tied to 0.
        sources = [self.get_name(el.source)
                   if el.source else "1'b0"
                   for el in b.elements[::-1]]
        self.emitter.emit(", ".join(sources))
        self.emitter.emit("})")

    def emit_blackbox_ports(self, p: Tuple[BlackboxPort]):
        for idx, port in enumerate(p):
            seperator = "," if idx != len(p) - 1 else ""
            if port.connected:
                self.emitter.emitln(
                    f".{self.get_name(port)}({self.get_name(port.wire)}){seperator}")
            elif isinstance(port, BlackboxInput):
                self.emitter.emitln(f".{self.get_name(port)}(1'b0){seperator}")
            else:
                self.emitter.emitln(f".{self.get_name(port)}(){seperator}")

    def visit_bitmatrix(self, b: Bitmatrix):
        """Declare ``b`` once and emit one ``assign`` per element that has a source."""
        if b not in self._declared_hardware:
            self.emitter.emitln(f"uwire [{b.total_size()-1}:0] {self.get_name(b)};")
            self._declared_hardware.add(b)

        if b not in self._emitted_hardware:
            for col in b:
                for el in col:
                    if el.has_source:
                        self.emitter.emitln(
                            f"assign {self.get_name(el)} = {self.get_name(el.source)};")
            self._emitted_hardware.add(b)
class IOAnnotator(NodeIterator):
    """Marks the compressor's module ports and records them in ``c.io``."""

    def visit_compressor(self, c: Compressor):
        """Flag the input and output wires as ports and collect all IO into ``c.io``."""
        input_wires = c.stages[0].input_wires
        output_wires = c.stages[-1].output_wires

        # Handle trivial passthrough case where input_wires IS output_wires (same object).
        # This happens for N=1 compressors where only an InputStage exists.
        # We need separate Bitmatrix objects for input and output ports.
        if input_wires is output_wires:
            new_output = Bitmatrix(input_wires.shape)
            for in_col, out_col in zip(input_wires, new_output):
                for in_wire, out_wire in zip(in_col, out_col):
                    in_wire.connect_to(out_wire)
            c.stages[-1].output_wires = new_output
            output_wires = new_output

        input_wires.set_to_module_input()
        input_wires.name = "in"
        if c.stages[0].gates:
            c.stages[0].input_wires_complementary.set_to_module_input()
            c.stages[0].input_wires_complementary.name = "in_2"
        output_wires.set_to_module_output()
        output_wires.name = "out"

        c.io = self.get_all_io(c)

    def get_all_io(self, c: Compressor):
        """Return every connectable with a non-empty prefix, without duplicates.

        Entries keep the order in which they are first encountered, so the
        result is the same on every run.
        """
        finder = IOFinder()
        c.accept(finder)
        # dict.fromkeys de-duplicates like set() but keeps insertion order.
        return list(dict.fromkeys(finder.io))


class IOFinder(NodeIterator):
    """Collects every wire, logic and bitmatrix seen during traversal."""

    def iter_compressor(self, c: Compressor):
        self.connectables = []

    @property
    def io(self):
        """The collected connectables whose ``prefix`` is truthy."""
        return [el for el in self.connectables if el.prefix]

    def iter_wire(self, w: Wire):
        self.connectables.append(w)

    def iter_logic(self, lgc: Logic):
        self.connectables.append(lgc)

    def iter_bitmatrix(self, b: Bitmatrix):
        self.connectables.append(b)
class LUTPlacer(NodeIterator):
    """Annotates cascaded LUT6CY instances with manual placement constraints."""

    def iter_compressor(self, c: Compressor):
        # Reset placement state for every compressor
        self.occupations = []

    def iter_counter(self, c: Counter):
        # Place LUT6CY instances manually.
        self._calculate_and_annotate_placements(
            self._get_ripple_connected_luts(c))

    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
        self.iter_counter(g)

    def _get_ripple_connected_luts(self, c: Counter):
        """Among all LUTs inside a counter, reconstruct all ripple connections."""
        if isinstance(c, FinalAdder):
            # No manual placement needed, as final adders use the LOOKAHEAD8,
            # which enforces correct placement itself.
            return []

        cy_luts = [lut for lut in c.luts if isinstance(lut, LUT6CY)]
        by_i4 = {lut.I4: lut for lut in cy_luts}
        by_o52 = {lut.O52: lut for lut in cy_luts}

        # Map every LUT to the LUT whose O52 output drives its I4 input.
        driven_by = {}
        for i4_port, consumer in by_i4.items():
            producer = by_o52.get(i4_port.source)
            if producer is not None:
                driven_by[consumer] = producer

        # Heads are driven by a LUT but do not drive another one themselves.
        heads = set(driven_by.keys()) - set(driven_by.values())

        chains = []
        for head in heads:
            chain = [head]
            link = driven_by.get(head)
            while link:
                chain.append(link)
                link = driven_by.get(link)
            chain.reverse()
            chains.append(chain)

        return chains

    def _calculate_and_annotate_placements(self, cascades):
        """Assign each cascade to the first slice with room, else open a new one."""
        for cascade in cascades:
            size = len(cascade)
            slot = next((idx for idx, used in enumerate(self.occupations)
                         if size + used <= 8), None)
            if slot is None:
                self.occupations.append(size)
                self._annotate_placements(cascade, len(self.occupations)-1, 0)
            else:
                self._annotate_placements(cascade, slot, self.occupations[slot])
                self.occupations[slot] += size

    def _annotate_placements(self, cascade, hu_set, start_idx):
        """Annotate LUT6CY placement constraints for carry chain packing.

        Places each cascade (ripple chain) into specific BEL positions within a SLICE.
        Each hu_set represents one SLICE (8 LUTs max). Multiple hu_sets get different
        Y coordinates to avoid placement conflicts.

        Args:
            cascade: List of LUT6CY instances forming a carry ripple chain
            hu_set: SLICE index (0, 1, 2, ...) - maps to RLOC Y coordinate
            start_idx: Starting BEL position within the SLICE (0-7 = A-H)
        """
        assert start_idx + len(cascade) <= 8
        for offset, lut in enumerate(cascade):
            bel_letter = chr(ord('A') + start_idx + offset)
            bel_str = f"{bel_letter}5LUT"
            lut.annotate(f'HU_SET = "hu_set_{hu_set}"')
            # Increment Y per SLICE to avoid conflicts
            lut.annotate(f'RLOC = "X0Y{hu_set}"')
            lut.annotate(f'BEL = "{bel_str}"')
            lut.annotate('DONT_TOUCH = "yes"')
            lut.annotate('IS_BEL_FIXED = "yes"')
s.output_wires.accept(self) + [el.accept(self) for el in s.instances] + + def visit_compression_stage(self, s: CompressionStage): + self.iter_compression_stage(s) + s.input_wires.accept(self) + s.output_wires.accept(self) + [c.accept(self) for c, _ in s.counters_with_shifts] + + def visit_accumulator_stage(self, a: AccumulatorStage): + self.iter_accumulator_stage(a) + a.input_wires.accept(self) + a.output_wires.accept(self) + [c.accept(self) for c in a.instances] + + def visit_gate_absorbed_stage(self, g: GateAbsorbedStage): + self.iter_gate_absorbed_stage(g) + g.input_wires.accept(self) + g.input_wires_complementary.accept(self) + g.output_wires.accept(self) + [c.accept(self) for c, _ in g.counters_with_shifts] + + def visit_counter(self, c: Counter): + self.iter_counter(c) + [el.accept(self) for col in c.input_wires for el in col] + [el.accept(self) for col in c.output_wires for el in col] + [el.accept(self) for el in c.instances] + + def visit_gate_absorption_counter(self, g: GateAbsorptionCounter): + self.iter_gate_absorption_counter(g) + [el.accept(self) for col in g.input_wires for el in col] + [el.accept(self) for col in g.input_wires_complementary for el in col] + [el.accept(self) for col in g.output_wires for el in col] + [el.accept(self) for el in g.instances] + + def visit_blackbox(self, b: Blackbox): + self.iter_blackbox(b) + [p.accept(self) for p in b.in_ports + b.out_ports] + + def visit_blackbox_input(self, b: BlackboxInput): + self.iter_blackbox_input + + def visit_blackbox_output(self, b: BlackboxOutput): + self.iter_blackbox_output + + def visit_blackbox_input_vec(self, b: BlackboxInputVec): + self.iter_blackbox_input_vec + + def visit_blackbox_output_vec(self, b: BlackboxOutputVec): + self.iter_blackbox_output_vec + + def visit_lookahead8(self, l8: LOOKAHEAD8): + self.iter_lookahead8(l8) + self.visit_blackbox(l8) + + def visit_wire(self, w: Wire): self.iter_wire(w) + + def visit_logic(self, lgc: Logic): self.iter_logic(lgc) + + def 
visit_bitmatrix(self, b: Bitmatrix): self.iter_bitmatrix(b) + + def iter_compressor(self, c: Compressor): pass + + def iter_gate_absorbed_stage(self, g: GateAbsorbedStage): pass + + def iter_input_stage(self, s: InputStage): pass + + def iter_accumulator_stage(self, a: AccumulatorStage): pass + + def iter_pipeline_stage(self, s: PipelineStage): pass + + def iter_compression_stage(self, s: CompressionStage): pass + + def iter_gate_absorption_counter(self, g: GateAbsorptionCounter): pass + + def iter_counter(self, c: Counter): pass + + def iter_blackbox(self, b: Blackbox): pass + + def iter_wire(self, w: Wire): pass + + def iter_logic(self, lgc: Logic): pass + + def iter_bitmatrix(self, b: Bitmatrix): pass + + def iter_blackbox_input(self, b: BlackboxInput): pass + + def iter_blackbox_output(self, b: BlackboxOutput): pass + + def iter_blackbox_input_vec(self, b: BlackboxInputVec): pass + + def iter_blackbox_output_vec(self, b: BlackboxOutputVec): pass \ No newline at end of file diff --git a/src/finn/compressor/src/passes/printer.py b/src/finn/compressor/src/passes/printer.py new file mode 100644 index 0000000000..2ebcabe23f --- /dev/null +++ b/src/finn/compressor/src/passes/printer.py @@ -0,0 +1,54 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. 
class CompressorPrinter(Visitor):
    """Debug visitor that prints a tab-indented outline of a compressor to stdout."""

    def visit_compressor(self, c: Compressor):
        print("Compressor [")
        for stage in c.stages:
            stage.accept(self)
        print("]")

    def _print_counter_stage(self, title, counters_with_shifts):
        """Print one bracketed stage listing each counter with its x-shift."""
        print(f"\t{title}: [")
        for counter, shift in counters_with_shifts:
            # The counter prints its own name (and the newline) right after the prefix.
            print(f"\t\t[xshift={shift:2}] ", end="")
            counter.accept(self)
        print("\t]")

    def visit_compression_stage(self, s: CompressionStage):
        self._print_counter_stage("Stage", s.counters_with_shifts)

    def visit_gate_absorbed_stage(self, s: GateAbsorbedStage):
        self._print_counter_stage("Stage with Gate Absorption", s.counters_with_shifts)

    def visit_input_stage(self, i: InputStage):
        print(f"\tInput Stage: <{i.input_shape}>")

    def visit_pipeline_stage(self, p: PipelineStage):
        print(f"\tPipeline Stage: <{p.input_shape}>")

    def visit_counter(self, c: Counter):
        # Class name followed by a single trailing space.
        print(f"{type(c).__name__} ")

    def visit_gate_absorption_counter(self, c: GateAbsorptionCounter):
        self.visit_counter(c)

    def visit_accumulator_stage(self, a: AccumulatorStage):
        print("\tAccumulator: [")
        print("\t\t", end="")
        # Only Counter instances are listed; other instance kinds are skipped.
        for inst in a.instances:
            if isinstance(inst, Counter):
                inst.accept(self)
        print("\t]")
class WireInserter(NodeIterator):
    """Insert explicit wires behind blackbox outputs.

    Blackbox outputs might be connected to other blackbox inputs.  To express
    this in Verilog, an extra intermediate wire has to be created between the
    blackboxes.  This pass adds it.
    """

    def iter_counter(self, c: Counter):
        # Snapshot the blackboxes first: inserting wires appends to c.instances.
        blackboxes = [inst for inst in c.instances if isinstance(inst, Blackbox)]
        for bbox in blackboxes:
            for port in bbox.out_ports:
                self.insert_wire_at_blackbox_output(port, c)

    def iter_gate_absorption_counter(self, g: GateAbsorptionCounter):
        self.iter_counter(g)

    def insert_wire_at_blackbox_output(self, output, counter):
        """Make ``output.target`` a single Wire, creating one if necessary.

        Ports exposing an ``elements`` attribute are handled element by
        element.  A port that already drives exactly one Wire is simply
        re-pointed at that wire; otherwise a new Wire is created, connected
        to every former target, and registered in ``counter.instances``.
        """
        if hasattr(output, "elements"):
            for member in output.elements:
                self.insert_wire_at_blackbox_output(member, counter)
            return

        sinks = output.target
        drives_single_wire = len(sinks) == 1 and isinstance(sinks[0], Wire)
        if drives_single_wire:
            output.target = sinks[0]
        else:
            bridge = Wire()
            for sink in sinks:
                bridge.connect_to(sink)
            output.target = bridge
            counter.instances.append(bridge)
# Part-name prefixes recognised by resolve_target().
_VERSAL_PREFIXES = ("xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm", "xqrvc", "xcv80")
# UltraScale/UltraScale+ prefixes: Kintex US (xcku), Virtex US (xcvu),
# Zynq US (xczu), defense (xqzu)
_ULTRASCALE_PREFIXES = ("xcku", "xcvu", "xczu", "xqzu")


def resolve_target(fpgapart):
    """Map a Vivado FPGA part string to a compressor Target object.

    Returns Versal() for Versal parts, UltraScale() for UltraScale/UltraScale+
    parts, and SevenSeries() for everything else.
    """
    if fpgapart.startswith(_VERSAL_PREFIXES):
        return Versal()
    if fpgapart.startswith(_ULTRASCALE_PREFIXES):
        return UltraScale()
    return SevenSeries()


def resolve_target_name(name):
    """Map a CLI target name ('Versal', '7-Series', 'UltraScale') to a Target object.

    Raises:
        ValueError: If ``name`` is none of the supported target names.
    """
    known_targets = (
        ("Versal", Versal),
        ("7-Series", SevenSeries),
        ("UltraScale", UltraScale),
    )
    for label, factory in known_targets:
        if name == label:
            return factory()
    raise ValueError(f"Unsupported target: {name!r}. Choose from: ['Versal', '7-Series', 'UltraScale']")


class Target(ABC):
    """Base class describing the primitives available on one FPGA family."""

    counter_candidates: "List[CounterCandidate]"
    final_adder: "FinalAdder"
    absorbing_counter_candidates: "List[GateAbsorptionCounterCandidate]"


class Versal(Target):
    """Versal target."""

    def __init__(self):
        self.counter_candidates = [
            TenSixCandidate(),
            FACandidate(),
            RippleSumCandidate(),
            DualRailRippleSumCandidate(),
            FiveTwoCandidate(),
            SixThreeCandidate(),
            VersalAtomCascadeCandidate(),
        ]
        self.absorbing_counter_candidates = [
            VersalPredAdderCandidate(),
            RippleSumPredAdderCandidate(),
            SinglePredCandidate(),
        ]
        # The adder is stored as a class, not as an instance.
        self.final_adder = QuaternaryAdder


class SevenSeries(Target):
    """7-Series target."""

    def __init__(self):
        self.counter_candidates = [
            FACandidate(),
            FiveTwoCandidate(),
            SixThreeCandidate(),
            MuxCYAtomCascadeCandidate(),
        ]
        self.final_adder = MuxCYTernaryAdder
        self.absorbing_counter_candidates = [
            MuxCYPredAdderCandidate(),
            MuxCYRippleSumCandidate(),
            SinglePredCandidate(),
        ]


class UltraScale(Target):
    """UltraScale/UltraScale+ - reuses 7-Series primitives.

    Vivado maps CARRY4 to CARRY8 transparently.
    """

    def __init__(self):
        self.counter_candidates = [
            FACandidate(),
            FiveTwoCandidate(),
            SixThreeCandidate(),
            MuxCYAtomCascadeCandidate(),
        ]
        self.final_adder = MuxCYTernaryAdder
        self.absorbing_counter_candidates = [
            MuxCYPredAdderCandidate(),
            MuxCYRippleSumCandidate(),
            SinglePredCandidate(),
        ]
+# +# SPDX-License-Identifier: BSD-3-Clause +# +# @brief Test vector generation for compressor verification +############################################################################# + +from ..utils.shape import Shape +from itertools import accumulate +from typing import List + +def compressed_width(shape): + max = sum([col * (1 << idx) for idx, col in enumerate(shape)]) + return max.bit_length() + +def flatten_gates(gates: List[List[str]]) -> List[str]: + return [el for col in gates for el in col] + +def generate_test(shape: Shape, module_name: str, pipeline_stages: int, + gates: List[List[str]], accumulation: bool, accumulator_width: int, + constant: int): + assert(type(pipeline_stages) == int) + + if gates: + gates = flatten_gates(gates) + + has_clk = bool(pipeline_stages) or accumulate + + accumulated_signature = list(accumulate(shape)) + addends = [] + for j, col in enumerate(accumulated_signature): + for i in range(shape[j]): + addends.append(f"\t\tin_reduced += arr_in[{col+i-shape[j]}] << {j};") + addends = "\n".join(addends) + + if gates: + preds = "".join([f"\tlocalparam pred_{idx} = 4'h{gate};\n" + for idx, gate in enumerate(gates)]) + selects = "".join([f"\tlogic [3:0] sel_{idx};\n" + for idx, _ in enumerate(gates)]) + arr_ins = "".join([ + f"\t\tsel_{i} = (arr_in_b[{i}]<<1) | arr_in_a[{i}];\n" + + f"\t\tarr_in[{i}] = pred_{i}[sel_{i}];\n" + for i, _ in enumerate(gates)]) + gates_decl = (f"\tlogic [{sum(shape)-1}:0] arr_in_a;" + + f"\tlogic [{sum(shape)-1}:0] arr_in_b;") + accumulator_width = (accumulator_width if accumulator_width + else compressed_width(shape)) + acc_decl = f"\tlogic [{accumulator_width-1}:0] acc_base;" + + acc_rst_block = """\t\t\tif (reset == 0) begin +\t\t\t\tacc_base = 0; +\t\t\tend else begin +\t\t\t\tacc_base = reference[0]; +\t\t\tend""" + + return ( +f"""module tb; +{gates_decl if gates else ""} +\tlogic [{sum(shape)-1}:0] arr_in; +\tlogic [{compressed_width(shape)-1}:0] in_reduced; +\tlogic [{accumulator_width-1}:0] out; 
+\tlogic [{accumulator_width-1}:0] reference [{pipeline_stages}:0]; +{acc_decl if accumulation else ""} +\t{"logic [4:0] reset;" if accumulation else ""} +\t{"logic rst;" if accumulation else ""} +\t{"logic clk = 0;" if has_clk else ""} +\t{"logic en = 1;" if accumulation else ""} + +{preds if gates else ""} +{selects if gates else ""} +\talways_comb begin; +{arr_ins if gates else ""} +\tend + +\t{"always #10ns clk = !clk;" if has_clk else ""} + +\talways_comb begin +\t\t{"reference[0] = acc_base + in_reduced;" + if accumulation else "reference[0] = in_reduced;"} +\tend + +\talways_comb begin +\t\tin_reduced = 0; +\t\t{"if (en) begin" if accumulation else ""} +in_reduced += {constant}; +{addends} +\t\t{"end" if accumulation else ""} +\tend + +\tinitial begin +\t\t{"acc_base = 0;" if accumulation else ""} +\t\t{"arr_in_a = 0;" if gates else "arr_in = 0;"} +\t\t{"arr_in_b = 0;" if gates else ""} + +\t\t{"assign rst = reset == 0;" if accumulation else ""} +\t\t{"reset = 0; #40ns;" if accumulation else ""} + +\t\tfor (int i = 0; i < 16000; i += 1) begin +\t\t\t{"automatic type(reset) xx;" if accumulation else ""} +\t\t\t{"automatic type(en) zz;" if accumulation else ""} + +\t\t\t{"automatic type(arr_in_a) yy;" if gates else "automatic type(arr_in) yy;"} +\t\t\t{"automatic type(arr_in_b) yz;" if gates else ""} + +\t\t\t{"void'(std::randomize(xx));" if accumulation else ""} +\t\t\t{"reset = xx; " if accumulation else ""} +\t\t\t{"void'(std::randomize(zz));" if accumulation else ""} +\t\t\t{"en = zz;" if accumulation else ""} + +\t\t\tif (i < 5) yy = 0; +\t\t\telse if (i < 10) yy = '1; +\t\t\telse void'(std::randomize(yy)); +\t\t\t{"arr_in_a = yy;" if gates else "arr_in = yy;"} + +\t\t\t{"if (i < 5) yz = 0;" if gates else ""} +\t\t\t{"else if (i < 10) yz = '1;" if gates else ""} +\t\t\t{"else void'(std::randomize(yz));" if gates else ""} +\t\t\t{"arr_in_b = yz;" if gates else ""} + +\t\t\t@(posedge clk); +\t\t\tfor (int i = 1; i <= {pipeline_stages}; ++i) begin 
+\t\t\t\treference[i] <= reference[i-1]; +\t\t\tend + +{acc_rst_block if accumulation else ""} +\t\t\t#1ns; +\t\t\tif(^reference[{pipeline_stages}] !== 1'bX) begin +\t\t\t\tassert(reference[{pipeline_stages}] === out) else begin +\t\t\t\t\t$error("Mismatch: Ref[%0b] != Out[%0b]", reference[{pipeline_stages}], out); +\t\t\t\t\t#2ns; +\t\t\t\t\t$stop; +\t\t\t\tend +\t\t\tend +\t\t#0.01ns; + +\t\tend +\t\t$display("TEST PASSED"); +\t\t$finish(); +\tend + +\t{module_name} dut( + {".clk(clk)," if pipeline_stages or accumulation else ""} + {".rst(rst)," if accumulation else ""} + {".in(arr_in_a), .in_2(arr_in_b)," if gates else ".in(arr_in),"} + {".en_neg(!en)," if accumulation else ""} + .out(out)); +endmodule +""").replace("\n\n", "\n") \ No newline at end of file diff --git a/src/finn/compressor/src/tests/tester.py b/src/finn/compressor/src/tests/tester.py new file mode 100644 index 0000000000..3537b97f7a --- /dev/null +++ b/src/finn/compressor/src/tests/tester.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Vivado XSim wrapper for testing generated compressors.""" + +import subprocess +import re + + +def tester(test_loc, comp_loc): + """Run Vivado XSim simulation to test a compressor. + + Args: + test_loc: Path to testbench SystemVerilog file + comp_loc: Path to compressor SystemVerilog file + """ + args = ( + f"""rm -r xsim.dir/ && + xvlog -work work -sv ../res/glbl.v {test_loc} {comp_loc} -L unisims_ver --nolog && + xelab -L work -L unisims_ver -relax --nolog glbl tb && + xsim --nolog work.glbl#work.tb -R""").replace("\n", " ") + print(args) + try: + ret = subprocess.run(args, capture_output=True, text=True, timeout=300, + shell=True, check=True) + except subprocess.CalledProcessError as e: + if e.returncode == 127: + raise RuntimeError( + "Could not call Vivado simulation tools. 
Did you source Vivado?") + else: + raise RuntimeError("Something failed during simulation.") + if "$finish called at time" in ret.stdout: + print("Simulation SUCCESS!") + else: + print("ERROR in Compressor Simulation!") + error = re.findall("Error:.*\n.*\n", ret.stdout)[0].split("\n") + print(f">> {error[0]}\n>> {error[1]}") + exit(-2) diff --git a/src/finn/compressor/src/utils/__init__.py b/src/finn/compressor/src/utils/__init__.py new file mode 100644 index 0000000000..a5d76914d9 --- /dev/null +++ b/src/finn/compressor/src/utils/__init__.py @@ -0,0 +1,8 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +# +# @brief Compressor utilities package initialization +# @author Simon Gerber +############################################################################# diff --git a/src/finn/compressor/src/utils/mul_comp_map.py b/src/finn/compressor/src/utils/mul_comp_map.py new file mode 100644 index 0000000000..951b732be1 --- /dev/null +++ b/src/finn/compressor/src/utils/mul_comp_map.py @@ -0,0 +1,58 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. 
class MulCompMap:
    """Describe the column layout produced for an ``na`` x ``nb`` multiplier.

    ``sa`` / ``sb`` are the sign flags of the two operands.  The entries
    returned by :meth:`shape` are small integer codes, one per partial
    product bit.
    NOTE(review): the codes (8, 7, 2, 13) look like 4-bit gate truth tables
    -- confirm against the consumer.
    """

    def __init__(self, na: int, nb: int, sa: bool, sb: bool):
        self.na = na
        self.nb = nb
        self.sa = sa
        self.sb = sb

    def columns(self):
        """Return the number of columns of the layout built by :meth:`shape`."""
        if self.na == 1 and self.nb == 1:
            return 1
        return self.nb + self.na - (not self.sb or self.sa)

    def shape(self):
        """Return the per-column lists of codes, least significant column first."""
        na, nb, sa, sb = self.na, self.nb, self.sa, self.sb

        # 1x1 special case: a single column holding one code.
        if na == 1 and nb == 1:
            return [[7 if sa ^ sb else 8]]

        # Unsigned baseline: heights rise 1..nb, stay at nb up to column na,
        # then fall off again; every entry starts out as code 8.
        rising_end = max(nb, 0)
        plateau_end = max(rising_end, na)
        cols = [[8] * height for height in range(1, rising_end + 1)]
        cols.extend([8] * nb for _ in range(rising_end, plateau_end))
        cols.extend([8] * (nb + na - c) for c in range(plateau_end + 1, nb + na))

        # Patch in sign handling.
        if sa:
            # Complement the first code (within 4 bits) of the upper columns.
            for c in range(na - 1, na + nb - 1):
                cols[c][0] = ~cols[c][0] & 15
        if sb:
            cols[nb].insert(0, 2)
            for c in range(nb, nb + na - 1):
                code = cols[c][-1]
                # Swap the low and the high bit pair of the 4-bit code.
                cols[c][-1] = ((code & 3) << 2) | ((code >> 2) & 3)
            if not sa:
                cols.append([13])

        return cols

    def absolute_term(self):
        """Return the constant term that goes with the layout of :meth:`shape`."""
        na, nb, sa, sb = self.na, self.nb, self.sa, self.sb

        if na == 1 and nb == 1:
            return -1 if sa ^ sb else 0
        return ((-(sa | sb) << nb) | sa) << (na - 1)
class Shape:
    """Column heights of a compressor bit matrix, least significant column first.

    The heights are stored in the tuple ``t``.  Columns beyond the stored
    length count as 0, both for indexing and for comparison, so
    ``Shape((1, 2)) == Shape((1, 2, 0))``.
    """

    # Equality ignores trailing zero columns, so instances stay unhashable
    # (this is also what Python does implicitly when only __eq__ is defined).
    __hash__ = None

    def __init__(self, t: Tuple[int, ...] = ()):
        self.t = tuple(t)

    def __len__(self):
        return len(self.t)

    def __iter__(self):
        return iter(self.t)

    def __getitem__(self, val):
        """Return one column height, or a new Shape for a slice.

        An integer index at or past the end yields 0 instead of raising.
        """
        if isinstance(val, int) and val >= len(self.t):
            return 0
        item = self.t[val]
        return Shape(item) if isinstance(val, slice) else item

    def __lshift__(self, val):
        """Return a Shape with ``val`` empty columns inserted at the low end."""
        return Shape([0] * val + list(self.t))

    def __add__(self, val):
        return self.__binary_arithmetic_operation(val, lambda x, y: x + y)

    def __sub__(self, val):
        return self.__binary_arithmetic_operation(val, lambda x, y: x - y)

    def __binary_arithmetic_operation(self, val, op):
        """Apply ``op`` column-wise against an int or against another Shape.

        Raises:
            RuntimeError: If ``val`` is neither an int nor a Shape.
        """
        if isinstance(val, Shape):
            # The shorter operand is padded with empty columns.
            zipped = zip_longest(self.t, val.t, fillvalue=0)
            return Shape([op(a, b) for a, b in zipped])
        if isinstance(val, int):
            return Shape([op(el, val) for el in self.t])
        raise RuntimeError("Unsupported type.")

    def __repr__(self):
        # Printed most significant column first.
        return f"Shape {self.t[::-1]}"

    def __eq__(self, other):
        """Compare column by column, treating missing columns as 0.

        Any iterable of heights is accepted.  Non-iterable operands yield
        ``NotImplemented`` (so ``shape == 5`` is False) instead of raising
        TypeError as before.
        """
        try:
            pairs = zip_longest(self, other, fillvalue=0)
        except TypeError:
            return NotImplemented
        return all(col1 == col2 for col1, col2 in pairs)
RTL Datapath Changes (finn-rtllib/mvu/): - mvu_vvu_axi.sv: Add USE_COMPRESSOR parameter and conditional instantiation - add_multi.sv: Add CATCH_COMP macro for generated compressor module instantiation - mvu_vvu_axi_wrapper.v: Propagate COMP_PIPELINE_DEPTH parameter FINN Backend Integration (matrixvectoractivation_rtl.py): - Add compressor eligibility checks (_is_dotp_comp_eligible) - Conditionally generate dotp_comp and add_multi compressor modules - Include generated RTL files in build - Propagate USE_COMPRESSOR and COMP_PIPELINE_DEPTH template variables Versal MVAU can use compressor-based compute instead of DSP blocks. 7-Series and UltraScale+ not yet supported. --- finn-rtllib/mvu/add_multi.sv | 62 ++++++- finn-rtllib/mvu/mvu_vvu_axi.sv | 35 +++- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 5 +- .../rtl/matrixvectoractivation_rtl.py | 155 ++++++++++++++---- 4 files changed, 218 insertions(+), 39 deletions(-) diff --git a/finn-rtllib/mvu/add_multi.sv b/finn-rtllib/mvu/add_multi.sv index 6b45d42e5a..d154ae318d 100644 --- a/finn-rtllib/mvu/add_multi.sv +++ b/finn-rtllib/mvu/add_multi.sv @@ -50,13 +50,65 @@ module add_multi import mvu_pkg::*; #( output logic [SUM_WIDTH-1:0] sum ); - localparam int unsigned L = $clog2(N); // Number of levels with reductions +//--------------------------------------------------------------------------- +// Compressor Path +// +// CATCH_COMP entries instantiate a generated compressor module for a +// specific (N, ARG_WIDTH, delay) triple. The macro transposes arg[i][j] +// to the column-major bit-vector expected by the compressor and pads any +// remaining DEPTH with a shift-register delay. +// +// Generated compressors have no en port — when en=0, upstream holds +// inputs stable and the downstream accumulator does not latch, so +// correctness is preserved. 
- uwire [SUM_WIDTH-1:0] sum0; - if(L < 1) begin : genTrivial +`define CATCH_COMP(n,w,d) \ +else if(!RESET_ZERO && (N == n) && (ARG_WIDTH == w) && (DEPTH >= d) && (0 <= ARG_LO)) begin : genComp``n``u``w``_d``d \ + initial $display("[ADD_MULTI_PATH] COMP N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH); \ +\ + uwire [N*ARG_WIDTH-1:0] in; \ + uwire [SUM_WIDTH -1:0] out; \ + for(genvar i = 0; i < N; i++) begin : genIn \ + for(genvar j = 0; j < ARG_WIDTH; j++) begin : genBit \ + assign in[j*N+i] = arg[i][j]; \ + end : genBit \ + end : genIn \ + comp_``n``u``w``_d``d comp_inst ( \ + .clk, \ + .in, .out \ + ); \ + initial assert($bits(out) >= $bits(comp_inst.out)) else $warning("CATCH_COMP(%0d,%0d,%0d): compressor output width %0d > SUM_WIDTH %0d", n, w, d, $bits(comp_inst.out), SUM_WIDTH); \ +\ + localparam int unsigned COMP_DELAY = d; \ + localparam int unsigned SUM_DELAY = DEPTH - COMP_DELAY; \ + if(SUM_DELAY == 0) assign sum = out; \ + else begin : genDelay \ + logic [SUM_WIDTH-1:0] SumZ[SUM_DELAY] = '{ default: 'x }; \ + always_ff @(posedge clk) begin \ + if(rst) SumZ <= '{ default: 'x }; \ + else begin \ + for(int unsigned i = 0; i < SUM_DELAY-1; i++) SumZ[i] <= SumZ[i+1]; \ + SumZ[SUM_DELAY-1] <= out; \ + end \ + end \ + assign sum = SumZ[0]; \ + end : genDelay \ +end : genComp``n``u``w``_d``d + + if(0) begin end + // FINN_GENERATED_COMP_ENTRIES + +//- Generic Behavioral Addition --------- + else begin : genGeneric + + localparam int unsigned L = $clog2(N); // Tree levels + + logic [SUM_WIDTH-1:0] sum0; + if(L < 1) begin : genPassThrough assign sum0 = arg[0]; - end : genTrivial + end : genPassThrough else begin : genTree + initial $display("[ADD_MULTI_PATH] TREE N=%0d D=%0d W=%0d", N, DEPTH, ARG_WIDTH); localparam int unsigned D = L < DEPTH? L : DEPTH; // Pipeline stages absorbed by tree // Compute the count of decendents for all nodes in the reduction trees. 
@@ -129,4 +181,6 @@ module add_multi import mvu_pkg::*; #( assign sum = SumZ[0]; end : genDelay + end : genGeneric + endmodule : add_multi diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index a890ac9aa3..dca3e9332c 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -64,6 +64,13 @@ module mvu_vvu_axi #( bit FORCE_BEHAVIORAL = 0, bit M_REG_LUT = 1, + // LUT-based compressor tree pipeline depth. This is set by default for maximum Pipelining (inbetween every stage). + int unsigned COMP_PIPELINE_DEPTH = 1, + + // Passed at generation time, whether compressors were generated if deemed worth it. + // Decides wether to use LUT-based compressors instead of DSPs. + bit USE_COMPRESSOR = 0, + // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, @@ -310,7 +317,19 @@ module mvu_vvu_axi #( localparam int unsigned A_WIDTH = 25 + 2*(VERSION > 1); // Width of A datapath localparam int unsigned NUM_LANES = A_WIDTH == WEIGHT_WIDTH? 
1 : 1 + (A_WIDTH - !NARROW_WEIGHTS - WEIGHT_WIDTH) / MIN_LANE_WIDTH; - if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8 + if(USE_COMPRESSOR) begin : genCompressor + dotp_comp #( + .PE(PE), .SIMD(DSP_SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH) + ) core ( + .clk(ap_clk), .rst, .en('1), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + end : genCompressor + else if(!IS_MVU || ((VERSION > 2) && (NUM_LANES <= 3) && (WEIGHT_WIDTH <= 8) && (ACTIVATION_WIDTH <= 9))) begin : genINT8 initial $info("Sidestepping to INT8 mode of DSP58 for %0dx%0d.", WEIGHT_WIDTH, ACTIVATION_WIDTH); mvu_vvu_8sx9_dsp58 #( .IS_MVU(IS_MVU), @@ -343,11 +362,15 @@ module mvu_vvu_axi #( if(1) begin : blkOutput localparam int unsigned CORE_PIPELINE_DEPTH = - VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) : - /* else */ 3 + $clog2(SIMD+1) + (SIMD == 1); - - // This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD. - localparam int unsigned MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH; + USE_COMPRESSOR? COMP_PIPELINE_DEPTH : + VERSION == 3? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) : + /* else */ 3 + $clog2(SIMD+1) + (SIMD == 1); + + // Floor at the DSP-equivalent depth so the compressor path (shallow pipeline) + // still has enough output queue slots to absorb backpressure transients. + localparam int unsigned DSP_PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1); + localparam int unsigned MAX_IN_FLIGHT = + CORE_PIPELINE_DEPTH > DSP_PIPELINE_DEPTH? 
CORE_PIPELINE_DEPTH : DSP_PIPELINE_DEPTH; typedef logic [PE-1:0][ACCU_WIDTH-1:0] output_t; logic signed [$clog2(MAX_IN_FLIGHT+1):0] OPtr = '1; // -1 | 0, 1, ..., MAX_IN_FLIGHT diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 9815d67629..47ffa96ac5 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -45,6 +45,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter NARROW_WEIGHTS = $NARROW_WEIGHTS$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, + parameter COMP_PIPELINE_DEPTH = $COMP_PIPELINE_DEPTH$, + parameter USE_COMPRESSOR = $USE_COMPRESSOR$, // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -81,7 +83,8 @@ mvu_vvu_axi #( `endif .IS_MVU(IS_MVU), .VERSION(VERSION), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN) + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .COMP_PIPELINE_DEPTH(COMP_PIPELINE_DEPTH), .USE_COMPRESSOR(USE_COMPRESSOR) ) inst ( .ap_clk(ap_clk), .ap_clk2x(ap_clk2x), diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 9cd6fc2a9d..d536e105f8 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -28,12 +28,15 @@ import numpy as np import os +import shutil from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_dsp_block, is_versal from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.compressor import 
def _get_rtl_source_files(self, abspath=True):
    """Return the list of RTL source files for this node.

    Includes any generated compressor files.  Shared by instantiate_ip() and
    get_rtl_file_list() to avoid duplicating the list.

    Args:
        abspath: When True, generated files are prefixed with the node's
            ``code_gen_dir_ipgen`` and library files with
            ``$FINN_ROOT/finn-rtllib/mvu/``; when False both prefixes are empty.
    """
    gen_prefix = ""
    lib_prefix = ""
    if abspath:
        gen_prefix = self.get_nodeattr("code_gen_dir_ipgen") + "/"
        lib_prefix = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")

    wrapper = os.path.join(gen_prefix, self.get_nodeattr("gen_top_module") + "_wrapper.v")
    files = [wrapper]
    files.extend(
        lib_prefix + fname
        for fname in (
            "mvu_pkg.sv",
            "mvu_vvu_axi.sv",
            "replay_buffer.sv",
            "mvu.sv",
            "mvu_vvu_8sx9_dsp58.sv",
        )
    )

    dotp_module = self.get_nodeattr("comp_module_name")
    if not dotp_module:
        # DSP path: add_multi.sv always exists in code_gen_dir
        # (either patched with comps or copy of template).
        files.append(os.path.join(gen_prefix, "add_multi.sv"))
        extra_modules = self.get_nodeattr("add_multi_comp_names")
        if extra_modules:
            # Semicolon-separated names of the generated add_multi compressors.
            files.extend(
                os.path.join(gen_prefix, mod + ".sv") for mod in extra_modules.split(";")
            )
        return files

    # dotp_comp path: needs the generated compressor plus the static mapping
    # module from the compressor HDL directory; add_multi.sv is not needed.
    hdl_dir = os.path.join(os.environ["FINN_ROOT"], "src/finn/compressor/hdl/")
    files.append(os.path.join(gen_prefix, "dotp_comp.sv"))
    files.append(os.path.join(hdl_dir, "mul_comp_map.sv"))
    files.append(os.path.join(gen_prefix, dotp_module + ".sv"))
    return files
+ Returns True when: non-pumped, small operands (WW <= 4 and AW <= 4), + and target is Versal or 7-Series (not UltraScale+). + """ + # Check if compressors are force-disabled (for benchmarking) + if self.get_nodeattr("noCompressor"): + return False + if pumped_compute or ww > 4 or aw > 4: + return False + dsp_block = get_dsp_block(fpgapart) + # DSP48E2 (UltraScale+) excluded: no compressor target exists for its + # CARRY8 primitives — generator only supports Versal and 7-Series. + return dsp_block in ("DSP58", "DSP48E1") + + + def _is_add_multi_comp_eligible(self, version, simd): + """ + Check if add_multi lane reductions should use LUT compressors. + Returns True when: not UltraScale+ (version != 2) and SIMD >= 4 + (below 4 inputs, compressors offer no benefit over binary adder tree). + """ + # Check if compressors are force-disabled (for benchmarking) + if self.get_nodeattr("noCompressor"): + return False + # version 2 = DSP48E2 (UltraScale+) blocked for same reason as above. + return version != 2 and simd >= 4 + def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -286,6 +362,46 @@ def generate_hdl(self, model, fpgapart, clk): else 1 ) code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights) + + # Extract params from code_gen_dict for compressor generation. + simd = int(code_gen_dict["$SIMD$"][0]) + ww = int(code_gen_dict["$WEIGHT_WIDTH$"][0]) + aw = int(code_gen_dict["$ACTIVATION_WIDTH$"][0]) + accu_width = int(code_gen_dict["$ACCU_WIDTH$"][0]) + signed_act = int(code_gen_dict["$SIGNED_ACTIVATIONS$"][0]) != 0 + pumped_compute = int(code_gen_dict["$PUMPED_COMPUTE$"][0]) + version = int(code_gen_dict["$VERSION$"][0]) + + # Compressor generation if applicable. 
+ if self._is_dotp_comp_eligible(fpgapart, ww, aw, pumped_compute): + result = generate_dotp_comp( + fpgapart, simd, ww, aw, accu_width, signed_act, code_gen_dir) + code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(result["comp_delay"])] + code_gen_dict["$USE_COMPRESSOR$"] = [str(1)] + self.set_nodeattr("comp_module_name", result["comp_name"]) + else: + # Generate add_multi.sv (either patched with comps or template copy) + # Check if add_multi should use compressors (respects noCompressor attribute) + if self._is_add_multi_comp_eligible(version, simd): + result = generate_add_multi_comps( + fpgapart, version, simd, ww, aw, accu_width, + narrow_weights, code_gen_dir) + if result["comp_names"]: + self.set_nodeattr("add_multi_comp_names", + ";".join(result["comp_names"])) + # Store compressor specs for synthesis aggregation + # Format: "N,W,D;N,W,D;..." e.g. "16,4,0;16,3,0;16,8,0" + specs_str = ";".join( + f"{n},{w},{d}" for n, w, d in result.get("comp_specs", []) + ) + self.set_nodeattr("add_multi_comp_specs", specs_str) + else: + # Compressors disabled: copy template add_multi.sv (binary adder tree) + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + dest = os.path.join(code_gen_dir, "add_multi.sv") + shutil.copy(os.path.join(rtllib_dir, "add_multi.sv"), dest) + result = {"comp_names": [], "files": [dest]} + # add general parameters to dictionary code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed @@ -351,30 +467,13 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + code_gen_dict["$COMP_PIPELINE_DEPTH$"] = [str(1)] + code_gen_dict["$USE_COMPRESSOR$"] = [str(0)] return template_path, code_gen_dict def get_rtl_file_list(self, abspath=False): - if abspath: - code_gen_dir = 
self.get_nodeattr("code_gen_dir_ipgen") + "/" - rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") - else: - code_gen_dir = "" - rtllib_dir = "" - - verilog_files = [ - "mvu_pkg.sv", - "mvu_vvu_axi.sv", - "replay_buffer.sv", - "mvu.sv", - "mvu_vvu_8sx9_dsp58.sv", - "add_multi.sv", - ] - verilog_files = [ - os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v") - ] + [rtllib_dir + _ for _ in verilog_files] - - return verilog_files + return self._get_rtl_source_files(abspath=abspath) def get_verilog_paths(self): verilog_paths = super().get_verilog_paths() From 4be4fd3d268927e7b60021cbd4d1bc2d0f6a997e Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Tue, 14 Apr 2026 12:35:27 +0100 Subject: [PATCH 03/10] [Tests] Add compressor test suite Test infrastructure: - XSim testbench templates (dotp_comp_tb, add_multi_comp_tb, mul_comp_map_tb) - Vivado TCL simulation scripts (dotp_comp, add_multi_comp, dotp) - Test runner scripts: run_tests.sh (21 core configs), run_dotp_comp_tests.sh (8 configs), run_add_multi_comp_tests.sh (8 configs) - Common test utilities (test_common.sh) --- .../hdl/add_multi_comp_tb_template.sv | 142 ++++++++ .../hdl/add_multi_comp_template.tcl | 35 ++ .../compressor/hdl/dotp_comp_tb_template.sv | 303 ++++++++++++++++++ .../compressor/hdl/dotp_comp_template.tcl | 32 ++ src/finn/compressor/hdl/dotp_tb_template.sv | 59 ++++ src/finn/compressor/hdl/dotp_template.tcl | 26 ++ src/finn/compressor/hdl/mul_comp_map_tb.sv | 41 +++ src/finn/compressor/lib/test_common.sh | 101 ++++++ .../compressor/run_add_multi_comp_tests.sh | 135 ++++++++ src/finn/compressor/run_dotp_comp_tests.sh | 143 +++++++++ src/finn/compressor/run_tests.sh | 105 ++++++ 11 files changed, 1122 insertions(+) create mode 100644 src/finn/compressor/hdl/add_multi_comp_tb_template.sv create mode 100644 src/finn/compressor/hdl/add_multi_comp_template.tcl create mode 100644 src/finn/compressor/hdl/dotp_comp_tb_template.sv create mode 100644 
src/finn/compressor/hdl/dotp_comp_template.tcl create mode 100644 src/finn/compressor/hdl/dotp_tb_template.sv create mode 100644 src/finn/compressor/hdl/dotp_template.tcl create mode 100644 src/finn/compressor/hdl/mul_comp_map_tb.sv create mode 100644 src/finn/compressor/lib/test_common.sh create mode 100755 src/finn/compressor/run_add_multi_comp_tests.sh create mode 100755 src/finn/compressor/run_dotp_comp_tests.sh create mode 100755 src/finn/compressor/run_tests.sh diff --git a/src/finn/compressor/hdl/add_multi_comp_tb_template.sv b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv new file mode 100644 index 0000000000..b5327262ce --- /dev/null +++ b/src/finn/compressor/hdl/add_multi_comp_tb_template.sv @@ -0,0 +1,142 @@ +/****************************************************************************** + * Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Testbench template for add_multi compressor cores + * @author Simon Gerber + *****************************************************************************/ + +/** + * Standalone testbench for add_multi compressor (comp_NuW_dD). + * Tests the compressor directly without requiring add_multi.sv. + * + * Template placeholders expanded by run_add_multi_comp_tests.sh: + * {n} - Number of addends + * {arg_width} - Bit width of each addend + * {depth} - Pipeline depth of compressor + * {label} - Configuration label (e.g. n8_w4_p2) + * {comp_module} - Generated compressor module name (e.g. 
comp_8u4_d0) + *****************************************************************************/ + +module add_multi_comp_{label}_tb; + + localparam int unsigned N = {n}; + localparam int unsigned ARG_WIDTH = {arg_width}; + localparam int unsigned DEPTH = {depth}; + localparam int unsigned IN_WIDTH = N * ARG_WIDTH; + // Use same formula as mvu_pkg::sumwidth() for consistency + localparam int unsigned SUM_WIDTH = $clog2(N) + ARG_WIDTH; + localparam int unsigned ROUNDS = 257; + + //----------------------------------------------------------------------- + // Global Control + logic clk = 0; + always #5ns clk = !clk; + + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + bit done = 0; + always_comb begin + if(done) $finish; + end + + //----------------------------------------------------------------------- + // DUT: direct compressor instantiation + logic [IN_WIDTH-1:0] in; + logic [SUM_WIDTH-1:0] out; + + {comp_module} dut ( + .clk, + .in, + .out + ); + + //----------------------------------------------------------------------- + // Transpose function: convert row-major to column-major format. + // + // The compressor expects inputs in column-major (bit-slice) order: + // in[0..N-1] = bit 0 of all N addends + // in[N..2N-1] = bit 1 of all N addends + // ... + // in[(W-1)*N..W*N-1] = bit W-1 of all N addends + // + // This matches the transpose in add_multi.sv CATCH_COMP macro: + // assign in[j*N+i] = arg[i][j]; + // + // Without this transpose, addend bits would be misaligned and produce + // incorrect sums. 
+ //----------------------------------------------------------------------- + function automatic logic [IN_WIDTH-1:0] transpose( + input logic [IN_WIDTH-1:0] row_major + ); + logic [IN_WIDTH-1:0] col_major; + for(int i = 0; i < N; i++) begin + for(int j = 0; j < ARG_WIDTH; j++) begin + col_major[j*N + i] = row_major[i*ARG_WIDTH + j]; + end + end + return col_major; + endfunction + + //----------------------------------------------------------------------- + // Input Feed + int Q[$]; + initial begin + in = 'x; + @(posedge clk iff !rst); + + repeat(ROUNDS) begin + automatic logic [IN_WIDTH-1:0] aa; + automatic int exp = 0; + void'(std::randomize(aa)); + + // Compute expected sum from row-major input + for(int unsigned i = 0; i < N; i++) begin + exp += aa[i*ARG_WIDTH +: ARG_WIDTH]; + end + + // Transpose to column-major before feeding compressor + in <= transpose(aa); + Q.push_back(exp); + @(posedge clk); + end + + in <= 'x; + repeat(DEPTH + 10) @(posedge clk); + + assert(Q.size == 0) else begin + $error("Missing %0d outputs.", Q.size); + end + done = 1; + end + + //----------------------------------------------------------------------- + // Output Checker + int unsigned Checks = 0; + int unsigned Errors = 0; + initial begin + @(posedge clk iff !rst); + repeat(DEPTH) @(posedge clk); + repeat(ROUNDS) @(posedge clk) begin + automatic int exp = Q.pop_front(); + automatic int hav = out; + assert(hav == exp) else begin + $error("Output mismatch %0d instead of %0d.", hav, exp); + $stop; + Errors <= Errors + 1; + end + Checks <= Checks + 1; + end + end + + final begin + $display("Performed %0d checks with %0d errors.", Checks, Errors); + assert(Checks == ROUNDS) else $error("Unexpected number of checks: %0d instead of %0d.", Checks, ROUNDS); + end + +endmodule : add_multi_comp_{label}_tb diff --git a/src/finn/compressor/hdl/add_multi_comp_template.tcl b/src/finn/compressor/hdl/add_multi_comp_template.tcl new file mode 100644 index 0000000000..c32279e518 --- /dev/null +++ 
b/src/finn/compressor/hdl/add_multi_comp_template.tcl @@ -0,0 +1,35 @@ +############################################################################# +# Copyright (C) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +# +# @brief Vivado simulation script for add_multi compressor testbench +# @author Simon Gerber +############################################################################# + +# Vivado batch flow for standalone add_multi compressor test. +# Behavioral simulation only — verifies the generated compressor produces correct sums. +# +# Template placeholders expanded by run_add_multi_comp_tests.sh: +# {label} - Configuration label (e.g. n8_w4_p2) +# {tb} - Testbench module name +# {gen_dir} - Absolute path to gen/