diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index 5126ed3ff4..f960dc08a8 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -118,6 +118,7 @@ RUN pip install pytest-metadata==1.7.0
 RUN pip install pytest-html==3.0.0
 RUN pip install pytest-html-merger==0.0.8
 RUN pip install pytest-cov==4.1.0
+RUN pip install pyyaml==6.0.1
 
 # extra dependencies from other FINN deps
 # installed in Docker image to make entrypoint script go faster
diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile
index 6d51fffd64..cca3436363 100644
--- a/docker/jenkins/Jenkinsfile
+++ b/docker/jenkins/Jenkinsfile
@@ -93,7 +93,7 @@ pipeline {
                 cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
 
                 // Pass in the marker to run with pytest and the XML test results filename
-                runDockerPytestWithMarker("fpgadataflow", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_fpgadataflow")
+                runDockerPytestWithMarker("fpgadataflow", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_fpgadataflow -n ${env.NUM_PYTEST_WORKERS} --dist worksteal")
 
                 // Stash the test results file(s)
                 stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html"
@@ -324,21 +324,17 @@ void runDockerPytestWithMarker(String marker, String testResultsFilename, String
   sh """./run-docker.sh python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}"""
 }
 
-def findBoardBuildFiles(String searchDir, String dirToFind) {
-  def result = sh(script: "find $searchDir -type d -name \"$dirToFind*\"", returnStdout: true).trim()
-  if (result.empty) {
-      error "Directory containing '$dirToFind' not found."
-  }
-	return result
-}
-
 void findCopyZip(String board, String findDir, String copyDir) {
-  def buildDir = findBoardBuildFiles(findDir, "hw_deployment_${board}")
-  sh "cp -r ${buildDir}/${board} ${copyDir}/"
-  dir(copyDir) {
-    sh "zip -r ${board}.zip ${board}/"
-    sh "mkdir -p ${env.ARTIFACT_DIR}/${copyDir}/"
-    sh "cp ${board}.zip ${env.ARTIFACT_DIR}/${copyDir}/"
+  sh "mkdir -p ${copyDir}"
+  try {  // only a failed copy means that the build artifacts are missing
+    sh "cp -r ${findDir}/hw_deployment_*/${board} ${copyDir}/"
+  } catch (err) {
+    error "No ${board} hw_deployment_* build artifacts found in ${findDir}"
+  }
+  dir(copyDir) {  // zip/archive failures surface with their own error
+    sh "zip -r ${board}.zip ${board}/"
+    sh "mkdir -p ${env.ARTIFACT_DIR}/${copyDir}/"
+    sh "cp ${board}.zip ${env.ARTIFACT_DIR}/${copyDir}/"
   }
 }
 
diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v
index 0b01973163..9eec01f81a 100644
--- a/finn-rtllib/fifo/hdl/Q_srl.v
+++ b/finn-rtllib/fifo/hdl/Q_srl.v
@@ -119,6 +119,15 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
    reg       i_b_reg  		    // - true iff !full
 	     /* synthesis syn_allow_retiming=0 */ ;
 
+	// Parameter Checking
+	initial begin
+		if(depth < 2) begin
+			$error("%m: FIFO depth must be two or higher.");
+			$finish;
+		end
+	end
+
+
    assign addr_full_ = (state_==state_more) && (addr_==depth-2);
 						// - queue full
    assign addr_zero_ = (addr==0);		// - queue contains 2 (or 1,0)
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
deleted file mode 100644
index 11cef604e0..0000000000
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ /dev/null
@@ -1,308 +0,0 @@
-// original source:
-// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
-
-
-// Copyright (c) 1999 The Regents of the University of California
-// Copyright (c) 2010 The Regents of the University of Pennsylvania
-// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
-// Copyright (c) 2020 Xilinx
-//
-// Permission to use, copy, modify, and distribute this software and
-// its documentation for any purpose, without fee, and without a
-// written agreement is hereby granted, provided that the above copyright
-// notice and this paragraph and the following two paragraphs appear in
-// all copies.
-//
-// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
-// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
-// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
-// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-//
-// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
-// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
-// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
-//
-
-// Q_srl_oreg3_prefull_SIMPLE.v
-//
-//  - In-page queue with parameterizable depth, bit width
-//  - Stream I/O is triple (data, valid, back-pressure),
-//      with EOS concatenated into the data
-//  - Flow control for input & output is combinationally decoupled
-//  - 2 <= depth <= 256
-//      * (depth >= 2)  is required to decouple I/O flow control,
-//          where empty => no produce,  full => no consume,
-//          and depth 1 would ping-pong between the two at half rate
-//      * (depth <= 256) can be modified
-//           by changing ''synthesis loop_limit X'' below
-//          and changing ''addrwidth'' or its log computation
-//  - 1 <= width
-//  - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
-//      plus output register (for fast output)
-//  - Queue addressing is done by ''addr'' up-down counter
-//  - Queue fullness is checked by comparator (addr==depth)
-//  - Queue fullness                           is pre-computed for next cycle
-//  - Queue input back-pressure                is pre-computed for next cycle
-//  - Queue output valid (state!=state__empty) is pre-computed for next cycle
-//      (necessary since SRL data output reg requires non-boolean state)
-//  - FSM has 3 states (empty, one, more)
-//  - When empty, continue to emit most recently emitted value (for debugging)
-//
-//  - Queue slots used      = / (state==state_empty) ? 0
-//                            | (state==state_one)   ? 1
-//                            \ (state==state_more)  ? addr+2
-//  - Queue slots used     <=  depth
-//  - Queue slots remaining =  depth - used
-//                          = / (state==state_empty) ? depth
-//                            | (state==state_one)   ? depth-1
-//                            \ (state==state_more)  ? depth-2-addr
-//
-//  - Synplify 7.1 / 8.0
-//  - Eylon Caspi,  9/11/03, 8/18/04, 3/29/05
-
-
-`ifdef  Q_srl
-`else
-`define Q_srl
-
-
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
-
-   parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
-   parameter width = 16;   // - width of data (i_d, o_d)
-
-   parameter addrwidth = $clog2(depth);
-
-   input     clock;
-   input     reset;
-
-   input  [width-1:0] i_d;	// - input  stream data (concat data + eos)
-   input              i_v;	// - input  stream valid
-   output             i_r;	// - input  stream ready
-   wire               i_b;  // - input  stream back-pressure
-
-   output [width-1:0] o_d;	// - output stream data (concat data + eos)
-   output             o_v;	// - output stream valid
-   input              o_r;	// - output stream ready
-   wire               o_b;	// - output stream back-pressure
-
-   output [addrwidth:0] count;  // - output number of elems in queue
-   output [addrwidth:0] maxcount;  // - maximum observed count since reset
-
-   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
-   reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
-							//     for data output
-   reg 			  shift_en_;			// - SRL16 shift enable
-   reg    [width-1:0] 	  srl [depth-2:0];		// - SRL16 memory
-   reg 			  shift_en_o_;			// - SRLO  shift enable
-   reg    [width-1:0] 	  srlo_, srlo			// - SRLO  output reg
-			  /* synthesis syn_allow_retiming=0 */ ;
-
-   parameter state_empty = 2'd0;    // - state empty : o_v=0 o_d=UNDEFINED
-   parameter state_one   = 2'd1;    // - state one   : o_v=1 o_d=srlo
-   parameter state_more  = 2'd2;    // - state more  : o_v=1 o_d=srlo
-				    //     #items in srl = addr+2
-
-   reg [1:0] state, state_;	    // - state register
-
-   wire      addr_full_;	    // - true iff addr==depth-2 on NEXT cycle
-   reg       addr_full; 	    // - true iff addr==depth-2
-   wire      addr_zero_;	    // - true iff addr==0
-   wire      o_v_reg_;		    // - true iff state_empty   on NEXT cycle
-   reg       o_v_reg  		    // - true iff state_empty
-	     /* synthesis syn_allow_retiming=0 */ ;
-   wire      i_b_reg_;		    // - true iff !full         on NEXT cycle
-   reg       i_b_reg  		    // - true iff !full
-	     /* synthesis syn_allow_retiming=0 */ ;
-
-   assign addr_full_ = (state_==state_more) && (addr_==depth-2);
-						// - queue full
-   assign addr_zero_ = (addr==0);		// - queue contains 2 (or 1,0)
-   assign o_v_reg_   = (state_!=state_empty);	// - output valid if non-empty
-   assign i_b_reg_   = addr_full_;		// - input bp if full
-   assign o_d = srlo;				// - output data from queue
-   assign o_v = o_v_reg;			// - output valid if non-empty
-   assign i_b = i_b_reg;			// - input bp if full
-   assign maxcount = maxcount_reg;
-
-   assign i_r = !i_b;
-   assign o_b = !o_r;
-
-   assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
-
-   // - ''always'' block with both FFs and SRL16 does not work,
-   //      since FFs need reset but SRL16 does not
-
-   always @(posedge clock) begin	// - seq always: FFs
-      if (reset) begin
-	 state     <= state_empty;
-	 addr      <= 0;
-         addr_full <= 0;
-	 o_v_reg   <= 0;
-
-	 i_b_reg   <= 0;
-	 maxcount_reg <= 0;
-
-      end
-      else begin
-	 state     <= state_;
-	 addr      <= addr_;
-         addr_full <= addr_full_;
-	 o_v_reg   <= o_v_reg_;
-	 i_b_reg   <= i_b_reg_;
-	 maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
-      end
-   end // always @ (posedge clock)
-
-   always @(posedge clock) begin	// - seq always: srlo
-      // - infer enabled output reg at end of shift chain
-      // - input first element from i_d, all subsequent elements from SRL16
-      if (reset) begin
-	 srlo <= 0;
-      end
-      else begin
-	 if (shift_en_o_) begin
-	    srlo <= srlo_;
-	 end
-      end
-   end // always @ (posedge clock)
-
-   always @(posedge clock) begin			// - seq always: srl
-      // - infer enabled SRL16E from shifting srl array
-      // - no reset capability;  srl[] contents undefined on reset
-      if (shift_en_) begin
-	 // synthesis loop_limit 256
-	 for (a_=depth-2; a_>0; a_=a_-1) begin
-	    srl[a_] = srl[a_-1];
-	 end
-	 srl[0] <= i_d;
-      end
-   end // always @ (posedge clock or negedge reset)
-
-   always @* begin					// - combi always
-        srlo_       <=  'bx;
-        shift_en_o_ <= 1'bx;
-        shift_en_   <= 1'bx;
-        addr_       <=  'bx;
-        state_      <= 2'bx;
-      case (state)
-
-	state_empty: begin		    // - (empty, will not produce)
-	      if (i_v) begin		    // - empty & i_v => consume
-		 srlo_       <= i_d;
-		 shift_en_o_ <= 1;
-		 shift_en_   <= 1'bx;
-		 addr_       <= 0;
-		 state_      <= state_one;
-	      end
-	      else	begin		    // - empty & !i_v => idle
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 1'bx;
-		 addr_       <= 0;
-		 state_      <= state_empty;
-	      end
-	end
-
-	state_one: begin		    // - (contains one)
-	      if (i_v && o_b) begin	    // - one & i_v & o_b => consume
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 1;
-		 addr_       <= 0;
-		 state_      <= state_more;
-	      end
-	      else if (i_v && !o_b) begin   // - one & i_v & !o_b => cons+prod
-		 srlo_       <= i_d;
-		 shift_en_o_ <= 1;
-		 shift_en_   <= 1;
-		 addr_       <= 0;
-		 state_      <= state_one;
-	      end
-	      else if (!i_v && o_b) begin   // - one & !i_v & o_b => idle
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 1'bx;
-		 addr_       <= 0;
-		 state_      <= state_one;
-	      end
-	      else if (!i_v && !o_b) begin  // - one & !i_v & !o_b => produce
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 1'bx;
-		 addr_       <= 0;
-		 state_      <= state_empty;
-	      end
-	end // case: state_one
-
-	state_more: begin		    // - (contains more than one)
-	   if (addr_full || (depth==2)) begin
-					    // - (full, will not consume)
-					    // - (full here if depth==2)
-	      if (o_b) begin		    // - full & o_b => idle
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 0;
-		 addr_       <= addr;
-		 state_      <= state_more;
-	      end
-	      else begin		    // - full & !o_b => produce
-		 srlo_       <= srl[addr];
-		 shift_en_o_ <= 1;
-		 shift_en_   <= 0;
-//		 addr_       <= addr-1;
-//		 state_      <= state_more;
-		 addr_       <= addr_zero_ ? 0         : addr-1;
-		 state_      <= addr_zero_ ? state_one : state_more;
-	      end
-	   end
-	   else begin			    // - (mid: neither empty nor full)
-	      if (i_v && o_b) begin	    // - mid & i_v & o_b => consume
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 1;
-		 addr_       <= addr+1;
-		 state_      <= state_more;
-	      end
-	      else if (i_v && !o_b) begin   // - mid & i_v & !o_b => cons+prod
-		 srlo_       <= srl[addr];
-		 shift_en_o_ <= 1;
-		 shift_en_   <= 1;
-		 addr_       <= addr;
-		 state_      <= state_more;
-	      end
-	      else if (!i_v && o_b) begin   // - mid & !i_v & o_b => idle
-		 srlo_       <= 'bx;
-		 shift_en_o_ <= 0;
-		 shift_en_   <= 0;
-		 addr_       <= addr;
-		 state_      <= state_more;
-	      end
-	      else if (!i_v && !o_b) begin  // - mid & !i_v & !o_b => produce
-		 srlo_       <= srl[addr];
-		 shift_en_o_ <= 1;
-		 shift_en_   <= 0;
-		 addr_       <= addr_zero_ ? 0         : addr-1;
-		 state_      <= addr_zero_ ? state_one : state_more;
-	      end
-	   end // else: !if(addr_full)
-	end // case: state_more
-
-	default: begin
-		 srlo_       <=  'bx;
-		 shift_en_o_ <= 1'bx;
-		 shift_en_   <= 1'bx;
-		 addr_       <=  'bx;
-		 state_      <= 2'bx;
-	end // case: default
-
-      endcase // case(state)
-   end // always @ *
-
-endmodule // Q_srl
-
-
-`endif  // `ifdef  Q_srl
diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
index 8b8cff8ee9..507b1022e6 100644
--- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
@@ -484,8 +484,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from shutil import copy\n",
-    "from distutils.dir_util import copy_tree\n",
+    "from shutil import copy, copytree\n",
     "\n",
     "# create directory for deployment files\n",
     "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n",
@@ -503,7 +502,7 @@
     "\n",
     "# driver.py and python libraries\n",
     "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n",
-    "copy_tree(pynq_driver_dir, deployment_dir)"
+    "copytree(pynq_driver_dir, deployment_dir, dirs_exist_ok=True)"
    ]
   },
   {
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
index 675ba23d2d..bb5e357b66 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
@@ -895,8 +895,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from shutil import copy\n",
-    "from distutils.dir_util import copy_tree\n",
+    "from shutil import copy, copytree\n",
     "\n",
     "# create directory for deployment files\n",
     "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n",
@@ -914,7 +913,7 @@
     "\n",
     "# driver.py and python libraries\n",
     "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n",
-    "copy_tree(pynq_driver_dir, deployment_dir)"
+    "copytree(pynq_driver_dir, deployment_dir, dirs_exist_ok=True)"
    ]
   },
   {
diff --git a/run-docker.sh b/run-docker.sh
index ec55299f6c..66ef8f00f2 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -88,7 +88,7 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"}
 : ${XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"}
 : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"}
-: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"}
+: ${FINN_DOCKER_TAG="xilinx/finn:$(git -C "$SCRIPTPATH" describe --always --tags --dirty).$XRT_DEB_VERSION"}
 : ${FINN_DOCKER_PREBUILT="0"}
 : ${FINN_DOCKER_RUN_AS_ROOT="0"}
 : ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"}
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ab2280554c..5163b2dbdb 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -33,7 +33,6 @@
 import shutil
 import warnings
 from copy import deepcopy
-from distutils.dir_util import copy_tree
 from functools import partial
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
@@ -656,7 +655,9 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
             )
         )
         # TODO copy all ip sources into output dir? as zip?
-        copy_tree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
+        shutil.copytree(
+            model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir, dirs_exist_ok=True
+        )
         print("Vivado stitched IP written into " + stitched_ip_dir)
     if VerificationStepType.STITCHED_IP_RTLSIM in cfg._resolve_verification_steps():
         # prepare ip-stitched rtlsim
@@ -761,7 +762,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig):
     if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs:
         driver_dir = cfg.output_dir + "/driver"
         model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform()))
-        copy_tree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
+        shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True)
         print("PYNQ Python driver written into " + driver_dir)
     return model
 
@@ -862,8 +863,8 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
         bitfile_dir = cfg.output_dir + "/bitfile"
         driver_dir = cfg.output_dir + "/driver"
         os.makedirs(deploy_dir, exist_ok=True)
-        copy_tree(bitfile_dir, deploy_dir + "/bitfile")
-        copy_tree(driver_dir, deploy_dir + "/driver")
+        shutil.copytree(bitfile_dir, deploy_dir + "/bitfile", dirs_exist_ok=True)
+        shutil.copytree(driver_dir, deploy_dir + "/driver", dirs_exist_ok=True)
     return model
 
 
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 210b6b7fdd..985ac83ea6 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -29,7 +29,6 @@
 
 import numpy as np
 from qonnx.core.datatype import DataType
-from qonnx.util.basic import roundup_to_integer_multiple
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 
@@ -134,10 +133,6 @@ def execute_node(self, context, graph):
         result = np.concatenate(inp_values, axis=-1)
         context[node.output[0]] = result
 
-    def get_instream_width_padded(self, ind=0):
-        in_width = self.get_instream_width(ind)
-        return roundup_to_integer_multiple(in_width, 8)
-
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         n_inputs = self.get_n_inputs()
diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
index ba44deb898..ad40b62d8c 100644
--- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
@@ -28,6 +28,7 @@
 
 import numpy as np
 import os
+import warnings
 from math import ceil, log2
 from qonnx.core.datatype import DataType
 
@@ -87,31 +88,6 @@ def defines(self, var):
             my_defines.append("#define EmbeddingType %s" % emb_hls_type)
         self.code_gen_dict["$DEFINES$"] = my_defines
 
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "int64_t"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
@@ -273,7 +249,18 @@ def execute_node(self, context, graph):
             )
 
         inp = context[node.input[0]]
-        assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray"
+
+        # Make sure the input has the right container datatype
+        # Note: inp.dtype is a np.dtype instance, compare by value not identity
+        if inp.dtype != np.float32:
+            # Issue a warning to make the user aware of this type-cast
+            warnings.warn(
+                f"{node.name}: Changing input container datatype from "
+                f"{inp.dtype} to {np.float32}"
+            )
+            # Convert the input to float32 as the container datatype
+            inp = inp.astype(np.float32)
+
         assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
         export_idt = self.get_input_datatype()
         odt = self.get_output_datatype()
diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
index f8f27cb647..1b240eeff8 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
@@ -133,10 +133,18 @@ def execute_node(self, context, graph):
         elif mode == "rtlsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             # create a npy file for the input of the node
-            assert (
-                str(inp.dtype) == "float32"
-            ), """Input datatype is
-                not float32 as expected."""
+
+            # Make sure the input has the right container datatype
+            # Note: inp.dtype is a np.dtype instance, compare by value not identity
+            if inp.dtype != np.float32:
+                # Issue a warning to make the user aware of this type-cast
+                warnings.warn(
+                    f"{node.name}: Changing input container datatype from "
+                    f"{inp.dtype} to {np.float32}"
+                )
+                # Convert the input to float32 as the container datatype
+                inp = inp.astype(np.float32)
+
             expected_inp_shape = self.get_folded_input_shape()
             reshaped_input = inp.reshape(expected_inp_shape)
             if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]:
diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py
index 12cb76be4e..8cebf613b1 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding.py
@@ -243,16 +243,29 @@ def execute_node(self, context, graph):
         inp_values = context[node.input[0]]
         th_val = context[node.input[1]]
         out_bias = self.get_nodeattr("ActVal")
-        # MT expects inputs to be in the shape (N,C,H,W) or (N, C)
-        # if 4D then input values in context are (N,H,W,C) and need to
-        # be transposed.
-        # if 2D then inputs can be passed directly to MT function
-        is_4d = len(inp_values.shape) == 4
-        if is_4d:
-            inp_values = np.transpose(inp_values, (0, 3, 1, 2))
+
+        # Consider the data layout for transposing the input into the format
+        # accepted by the multithreshold function above, i.e., the channel
+        # dimension is along the axis with index 1.
+        data_layout = None
+        # If there is no layout annotation, guess based on rank of the tensor
+        # TODO: Currently there is no mechanism here to get the layout
+        #  annotation, we always guess, but this matches the previous behavior.
+        if len(inp_values.shape) < 5:
+            # Maps tensor rank to layout annotation
+            rank_to_layout = {0: None, 1: "C", 2: "NC", 3: "NWC", 4: "NHWC"}
+            # Lookup the layout required by this input shape
+            data_layout = rank_to_layout[len(inp_values.shape)]
+        # Lookup the index of the channel dimension in the data layout
+        # Note: Assumes there is at most one "C" which denotes the channel
+        # dimension; without a known layout (rank 0 or rank >= 5) fall back to axis 1
+        cdim = data_layout.index("C") if data_layout and "C" in data_layout else 1
+        # Rearrange the input to the expected (N, C, ...) layout
+        inp_values = inp_values.swapaxes(cdim, 1)
         y = multithreshold(inp_values, th_val, out_bias=out_bias)
-        if is_4d:
-            y = y.transpose(0, 2, 3, 1)
+        # Rearrange the output back to the original layout
+        y = y.swapaxes(cdim, 1)
+
         act = DataType[self.get_nodeattr("outputDataType")]
         if act == DataType["BIPOLAR"]:
             # binary to bipolar
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 9ed0f51cd4..3f697266ae 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -202,6 +202,8 @@ def apply(self, model):
                     fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
 
                     if fifo_depth > 2 or self.create_shallow_fifos:
+                        # Ensure that create shallow fifo condition doesn't create depth=1 fifos
+                        fifo_depth = max(fifo_depth, 2)
                         # create fifo node
                         fifo_output_tensor = oh.make_tensor_value_info(
                             model.make_new_valueinfo_name(),
@@ -264,6 +266,8 @@ def apply(self, model):
                     fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
 
                     if fifo_depth > 2 or self.create_shallow_fifos:
+                        # Ensure that create shallow fifo condition doesn't create depth=1 fifos
+                        fifo_depth = max(fifo_depth, 2)
                         # create fifo node
                         fifo_input_tensor = oh.make_tensor_value_info(
                             model.make_new_valueinfo_name(),
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 82ee536d50..c3baf80aab 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -539,7 +539,7 @@ def decompose_pow2(x):
     ret_final = []
     for cand_depth in ret_pass2:
         if cand_depth <= max_qsrl_depth:
-            ret_final.append((cand_depth, "rtl"))
+            ret_final.append((max(2, cand_depth), "rtl"))
         else:
             ret_final.append((cand_depth, "vivado"))
 
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 0d3418624a..ac098d708c 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -40,7 +40,6 @@
 import warnings
 from brevitas.export import export_qonnx
 from dataset_loading import cifar, mnist
-from distutils.dir_util import copy_tree
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
@@ -59,7 +58,7 @@
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from qonnx.transformation.merge_onnx_models import MergeONNXModels
 from qonnx.util.cleanup import cleanup as qonnx_cleanup
-from shutil import copy
+from shutil import copy, copytree
 
 import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
 import finn.transformation.streamline.absorb as absorb
@@ -112,8 +111,9 @@
 rtlsim_trace = False
 
 
-def get_checkpoint_name(topology, wbits, abits, step):
-    return build_dir + "/end2end_%s_w%da%d_%s.onnx" % (
+def get_checkpoint_name(board, topology, wbits, abits, step):
+    return build_dir + "/end2end_%s_%s_w%da%d_%s.onnx" % (
+        board,
         topology,
         wbits,
         abits,
@@ -357,7 +357,7 @@ def deploy_based_on_board(model, model_title, topology, wbits, abits, board):
 
     # driver.py and python libraries
     pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir")
-    copy_tree(pynq_driver_dir, deployment_dir)
+    copytree(pynq_driver_dir, deployment_dir, dirs_exist_ok=True)
     model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
 
 
@@ -451,11 +451,16 @@ def pytest_generate_tests(metafunc):
             scenarios.extend(test_cases)
 
     if len(scenarios) > 0:
-        for scenario in scenarios:
+        for i, scenario in enumerate(scenarios):
             idlist.append(scenario[0])
             items = scenario[1].items()
             argnames = [x[0] for x in items]
-            argvalues.append([x[1] for x in items])
+            argvalues_scenario = [x[1] for x in items]
+            argvalues.append(
+                pytest.param(
+                    *argvalues_scenario, marks=pytest.mark.xdist_group(name="bnn_pynq_%d" % i)
+                )
+            )
         metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class")
 
 
@@ -471,7 +476,7 @@ def test_export(self, topology, wbits, abits, board):
         if topology == "lfc" and not (wbits == 1 and abits == 1):
             pytest.skip("Skipping certain lfc configs")
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
-        chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
+        chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "export")
         export_qonnx(model, torch.randn(ishape), chkpt_name, opset_version=13)
         qonnx_cleanup(chkpt_name, out_file=chkpt_name)
         model = ModelWrapper(chkpt_name)
@@ -480,7 +485,7 @@ def test_export(self, topology, wbits, abits, board):
         assert os.path.isfile(chkpt_name)
 
     def test_import_and_tidy(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "export")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(InferShapes())
         model = model.transform(FoldConstants())
@@ -488,17 +493,17 @@ def test_import_and_tidy(self, topology, wbits, abits, board):
         model = model.transform(GiveReadableTensorNames())
         model = model.transform(InferDataTypes())
         model = model.transform(RemoveStaticGraphInputs())
-        chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+        chkpt = get_checkpoint_name(board, topology, wbits, abits, "import_and_tidy")
         model.save(chkpt)
 
     def test_add_pre_and_postproc(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "import_and_tidy")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         global_inp_name = model.graph.input[0].name
         ishape = model.get_tensor_shape(global_inp_name)
         # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
         totensor_pyt = ToTensor()
-        chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc")
+        chkpt_preproc_name = get_checkpoint_name(board, topology, wbits, abits, "preproc")
         export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name, opset_version=13)
         qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)
         pre_model = ModelWrapper(chkpt_preproc_name)
@@ -515,7 +520,7 @@ def test_add_pre_and_postproc(self, topology, wbits, abits, board):
         model.set_tensor_datatype(global_inp_name, DataType["UINT8"])
         # postprocessing: insert Top-1 node at the end
         model = model.transform(InsertTopK(k=1))
-        chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "pre_post")
         # tidy-up again
         model = model.transform(InferShapes())
         model = model.transform(FoldConstants())
@@ -527,7 +532,7 @@ def test_add_pre_and_postproc(self, topology, wbits, abits, board):
         assert os.path.isfile(chkpt_name)
 
     def test_streamline(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "pre_post")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(absorb.AbsorbSignBiasIntoMultiThreshold())
         # move past any reshapes to be able to streamline input scaling
@@ -543,10 +548,10 @@ def test_streamline(self, topology, wbits, abits, board):
         model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
         model = model.transform(InferDataLayouts())
         model = model.transform(RemoveUnusedTensors())
-        model.save(get_checkpoint_name(topology, wbits, abits, "streamline"))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "streamline"))
 
     def test_convert_to_hw_layers(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "streamline")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         if topology == "tfc" and wbits == 1 and abits == 1:
             # use standalone thresholds for tfc-w1a1 to also exercise that option
@@ -568,7 +573,7 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board):
         model = model.transform(absorb.AbsorbConsecutiveTransposes())
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(InferDataLayouts())
-        model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers"))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "convert_to_hw_layers"))
         exp_layer_counts = {
             "tfc": [
                 ("Reshape", 1),
@@ -607,11 +612,11 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board):
 
     def test_specialize_layers(self, topology, wbits, abits, board):
         build_data = get_build_env(board, target_clk_ns)
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "convert_to_hw_layers")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(SpecializeLayers(build_data["part"]))
         model = model.transform(GiveUniqueNodeNames())
-        model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers"))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "specialize_layers"))
         exp_layer_counts = {
             "tfc": [
                 ("Reshape", 1),
@@ -649,45 +654,45 @@ def test_specialize_layers(self, topology, wbits, abits, board):
             assert len(model.get_nodes_by_op_type(op_type)) == exp_count
 
     def test_create_dataflow_partition(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "specialize_layers")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "specialize_layers")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         parent_model = model.transform(CreateDataflowPartition())
-        parent_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        parent_model_chkpt = get_checkpoint_name(board, topology, wbits, abits, "dataflow_parent")
         parent_model.save(parent_model_chkpt)
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
         dataflow_model_filename = sdp_node.get_nodeattr("model")
         dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
-        dataflow_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_model")
+        dataflow_model_chkpt = get_checkpoint_name(board, topology, wbits, abits, "dataflow_model")
         dataflow_model.save(dataflow_model_chkpt)
 
     def test_fold(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "dataflow_model")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "dataflow_model")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         folding_fxn = get_folding_function(topology, wbits, abits)
         model = folding_fxn(model)
-        model.save(get_checkpoint_name(topology, wbits, abits, "fold"))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "fold"))
 
     def test_minimize_bit_width(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "fold")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(MinimizeAccumulatorWidth())
         model = model.transform(MinimizeWeightBitWidth())
         model = model.transform(RoundAndClipThresholds())
-        curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width")
+        curr_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "minimize_bit_width")
         model.save(curr_chkpt_name)
 
     @pytest.mark.slow
     @pytest.mark.vivado
     def test_cppsim(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "minimize_bit_width")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(PrepareCppSim())
         model = model.transform(CompileCppSim())
         model = model.transform(SetExecMode("cppsim"))
-        cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
+        cppsim_chkpt = get_checkpoint_name(board, topology, wbits, abits, "cppsim")
         model.save(cppsim_chkpt)
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        parent_chkpt = get_checkpoint_name(board, topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
             topology, wbits, abits, return_topk=1
         )
@@ -700,17 +705,17 @@ def test_ipgen(self, topology, wbits, abits, board):
         build_data = get_build_env(board, target_clk_ns)
         if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width")
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "minimize_bit_width")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(PrepareIP(build_data["part"], target_clk_ns))
         model = model.transform(HLSSynthIP())
-        model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + board))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "ipgen"))
 
     @pytest.mark.slow
     @pytest.mark.vivado
     def test_set_fifo_depths(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "ipgen")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(board, target_clk_ns)["part"]
         if topology == "cnv" and abits == 2 and board == "Pynq-Z1":
@@ -725,12 +730,12 @@ def test_set_fifo_depths(self, topology, wbits, abits, board):
 
         fifo_layers = model.get_nodes_by_op_type("StreamingFIFO_rtl")
         assert len(fifo_layers) > 0
-        model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "fifodepth"))
 
     @pytest.mark.slow
     @pytest.mark.vivado
     def test_ipstitch_rtlsim(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "fifodepth")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(board, target_clk_ns)["part"]
         model = model.transform(InsertDWC())
@@ -750,9 +755,9 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board):
         if rtlsim_trace:
             model.set_metadata_prop("rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits))
             os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-        rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board)
+        rtlsim_chkpt = get_checkpoint_name(board, topology, wbits, abits, "ipstitch_rtlsim")
         model.save(rtlsim_chkpt)
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        parent_chkpt = get_checkpoint_name(board, topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
             topology, wbits, abits, return_topk=1
         )
@@ -762,7 +767,7 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board):
     @pytest.mark.slow
     @pytest.mark.vivado
     def test_throughput_rtlsim(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "ipstitch_rtlsim")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         n_nodes = len(model.graph.node)
         perf_est = model.analysis(dataflow_performance)
@@ -780,11 +785,11 @@ def test_throughput_rtlsim(self, topology, wbits, abits, board):
     def test_validate_top1(self, topology, wbits, abits, board):
         if "TEST_END2END_VALIDATE_TOP1" not in os.environ:
             pytest.skip("TEST_END2END_VALIDATE_TOP1 not set")
-        prepostproc_chkpt = get_checkpoint_name(topology, wbits, abits, "pre_post")
-        streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline")
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
-        cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
-        rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board)
+        prepostproc_chkpt = get_checkpoint_name(board, topology, wbits, abits, "pre_post")
+        streamline_chkpt = get_checkpoint_name(board, topology, wbits, abits, "streamline")
+        parent_chkpt = get_checkpoint_name(board, topology, wbits, abits, "dataflow_parent")
+        cppsim_chkpt = get_checkpoint_name(board, topology, wbits, abits, "cppsim")
+        rtlsim_chkpt = get_checkpoint_name(board, topology, wbits, abits, "ipstitch_rtlsim")
         dataset = topology2dataset(topology)
         assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80
         assert measure_top1_accuracy(streamline_chkpt, dataset) > 80
@@ -798,11 +803,11 @@ def test_build(self, topology, wbits, abits, board):
         build_data = get_build_env(board, target_clk_ns)
         if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "fifodepth")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(build_data["build_fxn"])
         model = model.transform(AnnotateResources("synth", build_data["part"]))
-        model.save(get_checkpoint_name(topology, wbits, abits, "build_" + board))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "build"))
 
     @pytest.mark.slow
     @pytest.mark.vivado
@@ -811,16 +816,16 @@ def test_make_pynq_driver(self, topology, wbits, abits, board):
         build_data = get_build_env(board, target_clk_ns)
         if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "build")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma"
         model = model.transform(MakePYNQDriver(board_to_driver_platform))
-        model.save(get_checkpoint_name(topology, wbits, abits, "driver_" + board))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "driver"))
 
     def test_deploy(self, topology, wbits, abits, board):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "driver_" + board)
+        prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "driver")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model_title = "%s_w%d_a%d_%s" % ("bnn", wbits, abits, topology)
         deploy_based_on_board(model, model_title, topology, wbits, abits, board)
         # save the model to be able to link it to the parent
-        model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + board))
+        model.save(get_checkpoint_name(board, topology, wbits, abits, "deploy"))
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index 9ee07d57a3..61b9c38ac5 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -79,6 +79,7 @@ def forward(self, x):
         return out_final
 
 
+@pytest.mark.xdist_group(name="end2end_cybsec")
 @pytest.mark.end2end
 def test_end2end_cybsec_mlp_export():
     assets_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/cybsec-mlp"
@@ -143,6 +144,7 @@ def test_end2end_cybsec_mlp_export():
     assert model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"]
 
 
+@pytest.mark.xdist_group(name="end2end_cybsec")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index 4c52277970..e1daf6fc97 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -106,6 +106,7 @@
 first_layer_res_type = "dsp"
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_export():
     # export preprocessing
@@ -163,6 +164,7 @@ def test_end2end_mobilenet_export():
     assert os.path.isfile(build_dir + "/end2end_mobilenet_preproc.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_tidy_and_merge_with_preproc():
     preproc_model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_preproc.onnx")
@@ -185,6 +187,7 @@ def test_end2end_mobilenet_tidy_and_merge_with_preproc():
     model.save(build_dir + "/end2end_mobilenet_tidy.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_streamline():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_tidy.onnx")
@@ -214,6 +217,7 @@ def test_end2end_mobilenet_streamline():
     assert len(model.get_nodes_by_op_type("Mul")) == 0  # no Mul ops remain
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_lowering():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_streamlined.onnx")
@@ -227,6 +231,7 @@ def test_end2end_mobilenet_lowering():
     model.save(build_dir + "/end2end_mobilenet_lowered.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_convert_to_hw_layers():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx")
@@ -243,6 +248,7 @@ def test_end2end_mobilenet_convert_to_hw_layers():
     model.save(build_dir + "/end2end_mobilenet_hw_layers.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_specialize_layers():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx")
@@ -252,6 +258,7 @@ def test_end2end_mobilenet_specialize_layers():
     model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_create_dataflow_partition():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_specialize_layers.onnx")
@@ -265,6 +272,7 @@ def test_end2end_mobilenet_create_dataflow_partition():
     dataflow_model.save(build_dir + "/end2end_mobilenet_dataflow_model.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_folding():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_dataflow_model.onnx")
@@ -348,6 +356,7 @@ def test_end2end_mobilenet_folding():
     model.save(build_dir + "/end2end_mobilenet_folded.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.end2end
 def test_end2end_mobilenet_minimize_bit_width():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
@@ -357,6 +366,7 @@ def test_end2end_mobilenet_minimize_bit_width():
     model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -393,6 +403,7 @@ def test_end2end_mobilenet_cppsim():
     # assert np.isclose(golden_prob, res_cppsim_prob[0, 0, 0, :5]).all()
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -403,6 +414,7 @@ def test_end2end_mobilenet_ipgen():
     model.save(build_dir + "/end2end_mobilenet_hw_ipgen.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -439,6 +451,7 @@ def test_end2end_mobilenet_rtlsim():
     # assert np.isclose(golden_prob, res_rtlsim_prob[0, 0, 0, :5]).all()
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -465,6 +478,7 @@ def test_end2end_mobilenet_set_fifo_depths():
     model.save(build_dir + "/end2end_mobilenet_set_fifo_depths.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -481,6 +495,7 @@ def test_end2end_mobilenet_stitched_ip():
     model.save(build_dir + "/end2end_mobilenet_stitched_ip.onnx")
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
@@ -517,6 +532,7 @@ def test_end2end_mobilenet_stitched_ip_rtlsim():
     # assert np.isclose(golden_prob, res_rtlsim_ip_prob[0, 0, 0, :5]).all()
 
 
+@pytest.mark.xdist_group(name="end2end_mobilenet")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
index 29d2f58e66..eeb7d95a49 100644
--- a/tests/end2end/test_ext_weights.py
+++ b/tests/end2end/test_ext_weights.py
@@ -66,6 +66,7 @@ def get_checkpoint_name(step):
         return build_dir + "/end2end_ext_weights_%s.onnx" % (step)
 
 
+@pytest.mark.xdist_group(name="end2end_ext_weights")
 @pytest.mark.end2end
 def test_end2end_ext_weights_download():
     if not os.path.isfile(onnx_zip_local):
@@ -75,6 +76,7 @@ def test_end2end_ext_weights_download():
     assert os.path.isfile(get_checkpoint_name("download"))
 
 
+@pytest.mark.xdist_group(name="end2end_ext_weights")
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py
index d192755d06..c993b51884 100644
--- a/tests/fpgadataflow/test_split_large_fifos.py
+++ b/tests/fpgadataflow/test_split_large_fifos.py
@@ -63,7 +63,7 @@ def get_folding_cfg(depth=65536):
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.fpgadataflow
-@pytest.mark.parametrize("depth", [16384, 65536, 45000])
+@pytest.mark.parametrize("depth", [16384, 65536, 45000, 1537])
 @pytest.mark.parametrize("force_python_rtlsim", ["True", "False"])
 def test_split_large_fifos(depth, force_python_rtlsim):
     tmp_output_dir = fetch_test_model("tfc")
@@ -103,13 +103,14 @@ def test_split_large_fifos(depth, force_python_rtlsim):
         inst = getCustomOp(fifo_node)
         fifo_depth = inst.get_nodeattr("depth")
         assert fifo_depth == golden_cfg[i % len(golden_cfg)][0]
+        assert fifo_depth > 1
 
     shutil.rmtree(tmp_output_dir)
 
 
 def test_split_large_fifo_configs():
     ret0 = get_fifo_split_configs(513, 256, 32768)
-    assert ret0 == [(512, "vivado"), (1, "rtl")]
+    assert ret0 == [(512, "vivado"), (2, "rtl")]
     ret1 = get_fifo_split_configs(1200, 256, 32768)
     assert ret1 == [(1024, "vivado"), (176, "rtl")]
     ret2 = get_fifo_split_configs(45000, 256, 32768)
diff --git a/tests/notebooks/test_jupyter_notebooks.py b/tests/notebooks/test_jupyter_notebooks.py
index e1415b9066..060bb07238 100644
--- a/tests/notebooks/test_jupyter_notebooks.py
+++ b/tests/notebooks/test_jupyter_notebooks.py
@@ -1,6 +1,7 @@
 import pytest
 
 import nbformat
+import os
 from nbconvert.preprocessors import ExecutePreprocessor
 
 from finn.util.basic import get_finn_root
@@ -12,28 +13,67 @@
 notebook_bnn_dir = get_finn_root() + "/notebooks/end2end_example/bnn-pynq/"
 
 basics_notebooks = [
-    pytest.param(notebook_basic_dir + "0_how_to_work_with_onnx.ipynb"),
-    pytest.param(notebook_basic_dir + "1_brevitas_network_import_via_QONNX.ipynb"),
+    pytest.param(
+        notebook_basic_dir + "0_how_to_work_with_onnx.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
+    pytest.param(
+        notebook_basic_dir + "1_brevitas_network_import_via_QONNX.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
 ]
 
 advanced_notebooks = [
-    pytest.param(notebook_advanced_dir + "0_custom_analysis_pass.ipynb"),
-    pytest.param(notebook_advanced_dir + "1_custom_transformation_pass.ipynb"),
-    pytest.param(notebook_advanced_dir + "2_custom_op.ipynb"),
-    pytest.param(notebook_advanced_dir + "3_folding.ipynb"),
-    pytest.param(notebook_advanced_dir + "4_advanced_builder_settings.ipynb"),
+    pytest.param(
+        notebook_advanced_dir + "0_custom_analysis_pass.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
+    pytest.param(
+        notebook_advanced_dir + "1_custom_transformation_pass.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
+    pytest.param(
+        notebook_advanced_dir + "2_custom_op.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
+    pytest.param(
+        notebook_advanced_dir + "3_folding.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
+    pytest.param(
+        notebook_advanced_dir + "4_advanced_builder_settings.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_general"),
+    ),
 ]
 
 cyber_notebooks = [
-    pytest.param(notebook_cyber_dir + "1-train-mlp-with-brevitas.ipynb"),
-    pytest.param(notebook_cyber_dir + "2-import-into-finn-and-verify.ipynb"),
-    pytest.param(notebook_cyber_dir + "3-build-accelerator-with-finn.ipynb"),
+    pytest.param(
+        notebook_cyber_dir + "1-train-mlp-with-brevitas.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_cybsec"),
+    ),
+    pytest.param(
+        notebook_cyber_dir + "2-import-into-finn-and-verify.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_cybsec"),
+    ),
+    pytest.param(
+        notebook_cyber_dir + "3-build-accelerator-with-finn.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_cybsec"),
+    ),
 ]
 
 bnn_notebooks = [
-    pytest.param(notebook_bnn_dir + "cnv_end2end_example.ipynb"),
-    pytest.param(notebook_bnn_dir + "tfc_end2end_example.ipynb"),
-    pytest.param(notebook_bnn_dir + "tfc_end2end_verification.ipynb"),
+    pytest.param(
+        notebook_bnn_dir + "cnv_end2end_example.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_cnv"),
+    ),
+    pytest.param(
+        notebook_bnn_dir + "tfc_end2end_example.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_tfc"),
+    ),
+    pytest.param(
+        notebook_bnn_dir + "tfc_end2end_verification.ipynb",
+        marks=pytest.mark.xdist_group(name="notebooks_tfc"),
+    ),
 ]
 
 
@@ -41,8 +81,16 @@
 @pytest.mark.parametrize(
     "notebook", basics_notebooks + advanced_notebooks + cyber_notebooks + bnn_notebooks
 )
-def test_notebook_exec(notebook):
+def test_notebook_exec(notebook, request):
     with open(notebook) as f:
+        # Set different NETRON_PORT for each xdist group to avoid conflicts
+        xdist_groups = ["notebooks_general", "notebooks_cybsec", "notebooks_cnv", "notebooks_tfc"]
+        for mark in request.node.own_markers:
+            if mark.name == "xdist_group":
+                group = mark.kwargs["name"]
+                os.environ["NETRON_PORT"] = str(8081 + xdist_groups.index(group))
+                break
+
         nb = nbformat.read(f, as_version=4)
         ep = ExecutePreprocessor(timeout=notebook_timeout_seconds, kernel_name="python3")
         try: