diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index 80ac0b61e6..d188007465 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -50,4 +50,4 @@ jobs: - name: DockerRunQuicktest run: | - docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh + docker run --init --hostname finn_gha -w $(pwd) -v $(pwd):$(pwd) -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh diff --git a/.gitignore b/.gitignore index 225fb5cfa3..be61378730 100644 --- a/.gitignore +++ b/.gitignore @@ -77,9 +77,6 @@ MANIFEST # Per-project virtualenvs .venv*/ -# Jenkins cfg dir -/docker/jenkins_home - # SSH key dir mounted into Docker /ssh_keys/ @@ -96,3 +93,6 @@ MANIFEST # generated files as part of end2end notebooks /notebooks/end2end_example/**/*.onnx + +# downloaded dep repos +/deps/ diff --git a/AUTHORS.rst b/AUTHORS.rst index 1d42d35a3b..d011ce3d7a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,8 +2,9 @@ Contributors ============ -* Yaman Umuroglu (@maltanar) (maintainer) -* Jakoba Petri-Koenig (@auphelia) +* Jakoba Petri-Koenig (@auphelia) (maintainer) +* Thomas Preusser (@preusser) +* Yaman Umuroglu (@maltanar) * Andrea Rigoni (@AndreaRigoni) * Hendrik Borras (@HenniOVP) * Lucian Petrica (@quetric) @@ -22,3 +23,6 @@ Contributors * Javier Duarte (@jmduarte) * Uma Maheshwari (@umav1511) * José Rosa (@pinxau1000) +* Aziz Bahri (@azizb-xlnx) +* Fionn O'Donohoe (@fionnodonohoe-xlnx) +* Matthias Gehre (@mgehre-amd) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f12dafa857..d376a1b42b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ We welcome contributions to FINN. Please follow the steps below and be sure that your contribution complies with our guidelines. -1. Share your proposal via Github issues. If you are looking for some issues to get started with, we have a list of good first issues in the issue tracker. Feel free to ask questions on the FINN gitter channel as well. +1. Share your proposal via Github issues. If you are looking for some issues to get started with, we have a list of good first issues in the issue tracker. Feel free to ask questions in the FINN GitHub discussions as well. We welcome submissions to: @@ -17,7 +17,7 @@ Please follow the steps below and be sure that your contribution complies with o 2. Clone the fork to your local computer using *git clone*. Checkout the branch you want to work on. - 3. Please install pre-commit to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in this file + 3. Please install pre-commit to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in this file 4. Modify the Python source code, Jupyter notebooks and Sphinx documentation etc. as needed. @@ -26,9 +26,9 @@ Please follow the steps below and be sure that your contribution complies with o 6. If you are introducing new functionality, add at least one unit test under the `test/` folder and make sure it passes before you submit the pull request. 7. Submit a pull request by clicking the *pull request* button on your GitHub repo: - 1. The master branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. + 1. 
The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch. 3. We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull request will -be merged to the repository. If you have more questions please contact us via the FINN gitter channel. +be merged to the repository. If you have more questions please contact us. diff --git a/README.md b/README.md index 5f193ae34d..37ff7ddae0 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## What's New in FINN? -* **2021-11-05:** v0.7 is released, introducing QONNX support, three new example networks and many other improvements. Read more on the [v0.7 release blog post](https://xilinx.github.io/finn//2021/11/05/finn-v07-is-released.html). -* **2021-06-15:** v0.6 is released, with ResNet-50 on U250 and ZCU104 MobileNet-v1 in finn-examples showcasing new features plus a lot more. Read more on the [v0.6 release blog post](https://xilinx.github.io/finn//2021/06/15/finn-v06-is-released.html). -* **2020-12-17:** v0.5b (beta) is released, with a new [examples repo](https://github.com/Xilinx/finn-examples) including MobileNet-v1. Read more on the release blog post. +* Please find all news under [GitHub discussions Announcements](https://github.com/Xilinx/finn/discussions/categories/announcements). ## Documentation diff --git a/custom_hls/checksum.cpp b/custom_hls/checksum.cpp new file mode 100644 index 0000000000..3ea3870d35 --- /dev/null +++ b/custom_hls/checksum.cpp @@ -0,0 +1,36 @@ +/****************************************************************************** + * Copyright (c) 2022, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Validation top-level module for checksum component.
+ * @author Thomas B. Preußer
+ *
+ *******************************************************************************/
+#include "checksum.hpp"
+CHECKSUM_TOP(WORDS_PER_FRAME, WORD_SIZE, ITEMS_PER_WORD)
diff --git a/custom_hls/checksum.hpp b/custom_hls/checksum.hpp
new file mode 100644
index 0000000000..77fc14694f
--- /dev/null
+++ b/custom_hls/checksum.hpp
@@ -0,0 +1,134 @@
+/******************************************************************************
+ * Copyright (c) 2022, Xilinx, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Checksum over stream-carried data frames.
+ * @author Thomas B. Preußer
+ *
+ *******************************************************************************/
+#include <hls_stream.h>
+#include <ap_int.h>
+
+
+/**
+ * Computes a checksum over a forwarded stream assumed to carry frames of
+ * N words further subdivided into K subwords.
+ * - Subword slicing can be customized typically by using a lambda.
+ * The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word
+ * type with a member `width` and a range-based slicing operator. It
+ * further assumes a little-endian arrangement of subwords within words
+ * for the canonical subword stream order.
+ * - Subwords wider than 23 bits are folded using bitwise XOR across
+ * slices of 23 bits starting from the LSB.
+ * - The folded subword values are weighted according to their position
+ * in the stream relative to the start of frame by a periodic weight
+ * sequence 1, 2, 3, ...
+ * - The weighted folded subword values are reduced to a checksum by an
+ * accumulation modulo 2^24.
+ * - A checksum is emitted for each completed frame. It is the concatenation
+ * of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum.
+ */
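As a cross-check of the rules in the header comment above, a small software golden model can reproduce the expected checksums. The following is an illustrative Python sketch, not part of this PR; it assumes subwords of at most 23 bits, so the XOR folding step reduces to the identity, and mirrors the reference arithmetic used by the SystemVerilog testbench further below::

    def checksum_golden(subwords, frame_no):
        """Golden model: weight each subword by the periodic sequence
        1, 2, 3, ... of its position and accumulate modulo 2**24."""
        s = 0
        for pos, v in enumerate(subwords):  # subwords: flat list in frame order
            s = (s + ((pos % 3) + 1) * v) % (1 << 24)
        # concatenate the 8-bit frame counter with the 24-bit checksum
        return ((frame_no % 256) << 24) | s

    # example: one frame of 60 words x 4 subwords of 8 bits, as in the testbench
    frame = [(7 * i + 3) % 256 for i in range(60 * 4)]
    print(hex(checksum_golden(frame, 0)))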
+template<typename T, unsigned K> class DefaultSubwordSlicer {
+ static_assert(T::width%K == 0, "Word size must be subword multiple.");
+ static constexpr unsigned W = T::width/K;
+public:
+ ap_uint<W> operator()(T const &x, unsigned const j) const {
+#pragma HLS inline
+ return x((j+1)*W-1, j*W);
+ }
+};
+
+template<
+ unsigned N, // number of data words in a frame
+ unsigned K, // subword count per data word
+ typename T, // type of stream-carried data words
+ typename F = DefaultSubwordSlicer<T, K> // f(T(), j) to extract subwords
+>
+void checksum(
+ hls::stream<T> &src,
+ hls::stream<T> &dst,
+ ap_uint<32> &chk,
+ ap_uint<1> drain, // drain data after checksuming without forward to `dst`
+ F&& f = F()
+) {
+ ap_uint<2> coeff[3] = { 1, 2, 3 };
+ ap_uint<24> s = 0;
+
+ for(unsigned i = 0; i < N; i++) {
+#pragma HLS pipeline II=1 style=flp
+ T const x = src.read();
+
+ // Pass-thru copy
+ if(!drain) dst.write(x);
+
+ // Actual checksum update
+ for(unsigned j = 0; j < K; j++) {
+#pragma HLS unroll
+ auto const v0 = f(x, j);
+ constexpr unsigned W = 1 + (decltype(v0)::width-1)/23;
+ ap_uint<W*23> v = v0;
+ ap_uint< 23> w = 0;
+ for(unsigned k = 0; k < W; k++) {
+ w ^= v(23*k+22, 23*k);
+ }
+ s += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? w : ap_uint<23>(0));
+ }
+
+ // Re-align coefficients
+ for(unsigned j = 0; j < 3; j++) {
+#pragma HLS unroll
+ ap_uint<3> const cc = coeff[j] + ap_uint<3>(K%3);
+ coeff[j] = cc(1, 0) + cc[2];
+ }
+ }
+
+ // Frame counter & output
+ static ap_uint<8> cnt = 0;
+#pragma HLS reset variable=cnt
+ chk = (cnt++, s);
+}
+
+#define CHECKSUM_TOP_(WORDS_PER_FRAME, WORD_SIZE, ITEMS_PER_WORD) \
+ using T = ap_uint<WORD_SIZE>; \
+ void checksum_ ## WORDS_PER_FRAME ## _ ## WORD_SIZE ## _ ## ITEMS_PER_WORD ( \
+ hls::stream<T> &src, \
+ hls::stream<T> &dst, \
+ ap_uint<32> &chk, \
+ ap_uint< 1> drain \
+ ) { \
+ _Pragma("HLS interface port=src axis") \
+ _Pragma("HLS interface port=dst axis") \
+ _Pragma("HLS interface port=chk s_axilite") \
+ _Pragma("HLS interface port=drain s_axilite") \
+ _Pragma("HLS interface port=return ap_ctrl_none") \
+ _Pragma("HLS dataflow disable_start_propagation") \
+ checksum<WORDS_PER_FRAME, ITEMS_PER_WORD>(src, dst, chk, drain); \
+ }
+#define CHECKSUM_TOP(WORDS_PER_FRAME, WORD_SIZE, ITEMS_PER_WORD) \
+ CHECKSUM_TOP_(WORDS_PER_FRAME, WORD_SIZE, ITEMS_PER_WORD)
diff --git a/custom_hls/checksum_tb.sv b/custom_hls/checksum_tb.sv
new file mode 100644
index 0000000000..cec4e1b5bb
--- /dev/null
+++ b/custom_hls/checksum_tb.sv
@@ -0,0 +1,136 @@
+/******************************************************************************
+ * Copyright (c) 2022, Xilinx, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for checksum component. + * @author Thomas B. Preußer + * + *******************************************************************************/ +module checksum_tb; + + //----------------------------------------------------------------------- + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst; + + //----------------------------------------------------------------------- + // DUT + localparam int unsigned N = 60; // words per frame + localparam int unsigned K = 4; // subwords per word + localparam int unsigned W = 8; // subword size + + logic [K-1:0][W-1:0] src_TDATA; + logic src_TVALID; + uwire src_TREADY; + + uwire [K-1:0][W-1:0] dst_TDATA; + uwire dst_TVALID; + logic dst_TREADY; + + uwire [31:0] chk; + uwire chk_vld; + + checksum_top dut ( + .ap_clk(clk), .ap_rst_n(!rst), + .src_TDATA, .src_TVALID, .src_TREADY, + .dst_TDATA, .dst_TVALID, .dst_TREADY, + .chk, .chk_ap_vld(chk_vld), + .ap_local_block(), .ap_local_deadlock() + ); + + //----------------------------------------------------------------------- + // Stimulus + logic [K-1:0][W-1:0] Bypass [$] = {}; + logic [31:0] Checksum[$] = {}; + initial begin + src_TDATA = 'x; + src_TVALID = 0; + + rst = 1; + repeat(9) @(posedge clk); + rst <= 0; + + for(int unsigned r = 0; r < 311; r++) begin + automatic logic [23:0] sum = 0; + src_TVALID <= 1; + for(int unsigned i = 0; i < N; i++) begin + for(int unsigned k = 0; k < K; k++) begin + automatic logic [W-1:0] v = $urandom()>>17; + src_TDATA[k] <= v; + sum += ((K*i+k)%3 + 1) * v; + end + @(posedge clk iff src_TREADY); + Bypass.push_back(src_TDATA); + end + src_TVALID <= 0; + $display("Expect: %02x:%06x", r[7:0], sum); + Checksum.push_back({r, sum}); + end + + repeat(8) @(posedge clk); + $finish; + end + + //----------------------------------------------------------------------- + // Output Validation + + // Drain and check pass-thru stream + assign dst_TREADY = 1; + always_ff @(posedge clk iff dst_TVALID) begin + assert(Bypass.size()) begin + automatic logic [K-1:0][W-1:0] exp = Bypass.pop_front(); + assert(dst_TDATA === exp) else begin + $error("Unexpected output %0x instead of %0x.", dst_TDATA, exp); + $stop; + end + end + else begin + $error("Spurious data output."); + $stop; + end + end + + // Validate checksum reports + always_ff @(posedge clk iff chk_vld) begin + $display("Check: %02x:%06x", chk[31:24], chk[23:0]); + assert(Checksum.size()) begin + automatic logic [31:0] exp = Checksum.pop_front(); + assert(chk === exp) else begin + $error("Unexpected checksum %0x instead of %0x.", chk, exp); + $stop; + end + end + else begin + $error("Spurious checksum output."); + $stop; + end + end + +endmodule : checksum_tb diff --git a/docker/Dockerfile.finn 
b/docker/Dockerfile.finn index 4d03e2fbb5..a3f40d52ef 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -29,8 +29,7 @@ FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime LABEL maintainer="Yaman Umuroglu " -# XRT version to be installed -ARG XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt" +ARG XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt" WORKDIR /workspace @@ -39,24 +38,29 @@ WORKDIR /workspace ENV TZ="Europe/Dublin" RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN apt-get update -RUN apt-get -y upgrade -RUN apt-get install -y build-essential -RUN apt-get install -y libglib2.0-0 -RUN apt-get install -y libsm6 -RUN apt-get install -y libxext6 -RUN apt-get install -y libxrender-dev -RUN apt-get install -y verilator -RUN apt-get install -y nano -RUN apt-get install -y zsh -RUN apt-get install -y rsync -RUN apt-get install -y git -RUN apt-get install -y sshpass -RUN apt-get install -y wget -RUN apt-get install -y sudo -RUN apt-get install -y unzip -RUN apt-get install -y zip +RUN apt-get update && \ + apt-get install -y \ + build-essential \ + libc6-dev-i386 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + verilator \ + nano \ + zsh \ + rsync \ + git \ + openssh-client \ + sshpass \ + wget \ + sudo \ + unzip \ + zip \ + locales \ + lsb-core RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config +RUN locale-gen "en_US.UTF-8" # install XRT RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb @@ -72,11 +76,12 @@ RUN rm requirements.txt RUN pip install pygments==2.4.1 RUN pip install ipykernel==5.5.5 RUN pip install jupyter==1.0.0 +RUN pip install markupsafe==2.0.1 RUN pip install matplotlib==3.3.1 --ignore-installed RUN pip install pytest-dependency==0.5.1 -RUN pip install sphinx==3.1.2 +RUN pip install sphinx==5.0.2 RUN pip install sphinx_rtd_theme==0.5.0 -RUN pip install pytest-xdist==2.0.0 +RUN pip install pytest-xdist[setproctitle]==2.4.0 RUN pip install pytest-parallel==0.1.0 RUN pip install "netron>=5.0.0" RUN pip install pandas==1.1.5 @@ -84,70 +89,21 @@ RUN pip install scikit-learn==0.24.1 RUN pip install tqdm==4.31.1 RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading -# git-based Python repo dependencies -# these are installed in editable mode for easier co-development -ARG FINN_BASE_COMMIT="e8facdd719b55839cca46da2cc4f4a4a372afb41" -ARG QONNX_COMMIT="9f9eff95227cc57aadc6eafcbd44b7acda89f067" -ARG FINN_EXP_COMMIT="af6102769226b82b639f243dc36f065340991513" -ARG BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e" -ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -ARG HLSLIB_COMMIT="966d17d3fddd801927b2167627d23a9a15ed1461" -ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e" -ARG AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" - -# finn-base -RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base -RUN git -C /workspace/finn-base checkout $FINN_BASE_COMMIT -RUN pip install -e /workspace/finn-base -# Install qonnx without dependencies, currently its only dependency is finn-base -RUN git clone https://github.com/fastmachinelearning/qonnx.git /workspace/qonnx -RUN git -C /workspace/qonnx checkout $QONNX_COMMIT -RUN pip install --no-dependencies -e /workspace/qonnx +# extra dependencies from other FINN deps +# installed in Docker image to make entrypoint script go faster # 
finn-experimental -RUN git clone https://github.com/Xilinx/finn-experimental.git /workspace/finn-experimental -RUN git -C /workspace/finn-experimental checkout $FINN_EXP_COMMIT -RUN pip install -e /workspace/finn-experimental +RUN pip install deap==1.3.1 +RUN pip install mip==1.13.0 +RUN pip install networkx==2.8 # brevitas -RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas -RUN git -C /workspace/brevitas checkout $BREVITAS_COMMIT -RUN pip install -e /workspace/brevitas +RUN pip install future-annotations==1.0.0 +RUN pip install dependencies==2.0.1 +RUN pip install tokenize-rt==4.2.1 # pyverilator -RUN git clone https://github.com/maltanar/pyverilator.git /workspace/pyverilator -RUN git -C /workspace/pyverilator checkout $PYVERILATOR_COMMIT -RUN pip install -e /workspace/pyverilator -# other git-based dependencies (non-Python) -# cnpy -RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy -RUN git -C /workspace/cnpy checkout $CNPY_COMMIT -# finn-hlslib -RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib -RUN git -C /workspace/finn-hlslib checkout $HLSLIB_COMMIT -# oh-my-xilinx -RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx -RUN git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT -# board files -RUN cd /tmp; \ - wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip; \ - wget -q https://dpoauwgwqsy2x.cloudfront.net/Download/pynq-z2.zip; \ - unzip -q pynq-z1.zip; \ - unzip -q pynq-z2.zip; \ - mkdir /workspace/board_files; \ - mv pynq-z1/ /workspace/board_files/; \ - mv pynq-z2/ /workspace/board_files/; \ - rm pynq-z1.zip; \ - rm pynq-z2.zip; \ - git clone https://github.com/Avnet/bdf.git /workspace/avnet-bdf; \ - git -C /workspace/avnet-bdf checkout $AVNET_BDF_COMMIT; \ - mv /workspace/avnet-bdf/* /workspace/board_files/; - +RUN pip install tclwrapper==0.0.1 # extra environment variables for FINN compiler ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache" -ENV PATH "${PATH}:/workspace/oh-my-xilinx" -ENV OHMYXILINX "/workspace/oh-my-xilinx" - -WORKDIR /workspace/finn COPY docker/finn_entrypoint.sh /usr/local/bin/ COPY docker/quicktest.sh /usr/local/bin/ diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index a2312d025b..b5c702111a 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -28,11 +28,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
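The dependency clones deleted from the Dockerfile above do not simply disappear: as the developers.rst changes later in this diff explain, run-docker.sh now runs fetch-repos.sh on the host to place the repos under deps/ at pinned commits before the container starts. A rough Python sketch of that pin-and-fetch pattern (illustrative only; the actual script is a shell script, and the commit hash below is a placeholder rather than a real pin)::

    import os
    import subprocess

    def fetch_repo(url, commit, dest):
        """Clone `url` into `dest` if missing, then check out the pinned commit."""
        if not os.path.isdir(dest):
            subprocess.run(["git", "clone", url, dest], check=True)
        subprocess.run(["git", "-C", dest, "fetch"], check=True)
        subprocess.run(["git", "-C", dest, "checkout", commit], check=True)

    # example with a known dependency repo and a placeholder commit hash
    fetch_repo(
        "https://github.com/fastmachinelearning/qonnx.git",
        "0000000000000000000000000000000000000000",  # placeholder pin
        "deps/qonnx",
    )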
-export FINN_ROOT=/workspace/finn
export HOME=/tmp/home_dir
export SHELL=/bin/bash
+export LANG="en_US.UTF-8"
+export LC_ALL="en_US.UTF-8"
+export LANGUAGE="en_US:en"
# colorful terminal output
export PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '
+export PATH=$PATH:$OHMYXILINX
YELLOW='\033[0;33m'
GREEN='\033[0;32m'
@@ -51,12 +54,21 @@ recho () {
echo -e "${RED}ERROR: $1${NC}"
}
-if [ -f "$FINN_ROOT/setup.py" ];then
+# qonnx
+pip install --user -e ${FINN_ROOT}/deps/qonnx
+# finn-experimental
+pip install --user -e ${FINN_ROOT}/deps/finn-experimental
+# brevitas
+pip install --user -e ${FINN_ROOT}/deps/brevitas
+# pyverilator
+pip install --user -e ${FINN_ROOT}/deps/pyverilator
+
+if [ -f "${FINN_ROOT}/setup.py" ];then
# run pip install for finn
- pip install --user -e $FINN_ROOT
+ pip install --user -e ${FINN_ROOT}
else
- recho "Unable to find FINN source code in /workspace/finn"
- recho "Ensure you have passed -v <path-to-finn-repo>:/workspace/finn to the docker run command"
+ recho "Unable to find FINN source code in ${FINN_ROOT}"
+ recho "Ensure you have passed -v <path-to-finn-repo>:<path-to-finn-repo> to the docker run command"
exit -1
fi
@@ -90,5 +102,17 @@ else
fi
fi
+if [ -f "$HLS_PATH/settings64.sh" ];then
+ # source Vitis HLS env.vars
+ source $HLS_PATH/settings64.sh
+ gecho "Found Vitis HLS at $HLS_PATH"
+else
+ yecho "Unable to find $HLS_PATH/settings64.sh"
+ yecho "Functionality dependent on Vitis HLS will not be available."
+ yecho "Please note that FINN needs at least version 2020.2 for Vitis HLS support."
+ yecho "If you need Vitis HLS, ensure HLS_PATH is set correctly and mounted into the Docker container."
+fi
+
+export PATH=$PATH:$HOME/.local/bin
# execute the provided command(s) as root
exec "$@"
diff --git a/docker/jenkins/Dockerfile.jenkins b/docker/jenkins/Dockerfile.jenkins
deleted file mode 100644
index e1939b642e..0000000000
--- a/docker/jenkins/Dockerfile.jenkins
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM jenkins/jenkins:lts
-# if we want to install via apt
-USER root
-RUN apt-get update
-RUN apt-get install -y gnupg-agent curl ca-certificates apt-transport-https software-properties-common
-RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -
-RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
-RUN apt-get update
-RUN apt-get install -y docker-ce-cli
-# drop back to the regular jenkins user - good practice
-USER jenkins
diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile
index f321194189..ad533efa5d 100644
--- a/docker/jenkins/Jenkinsfile
+++ b/docker/jenkins/Jenkinsfile
@@ -1,107 +1,45 @@
-pipeline {
- agent any
- parameters {
- string(name: 'FINN_CI_BRANCH', defaultValue: '', description: 'FINN branch to build')
- string(name: 'FINN_XILINX_PATH', defaultValue: '', description: 'Path to Xilinx tool installation')
- string(name: 'FINN_XILINX_VERSION', defaultValue: '2020.1', description: 'Xilinx tool version')
- string(name: 'PYNQ_BOARD', defaultValue: 'Pynq-Z1', description: 'PYNQ board type')
- string(name: 'PYNQ_IP', defaultValue: '', description: 'PYNQ board IP address')
- string(name: 'PYNQ_USERNAME', defaultValue: 'xilinx', description: 'PYNQ board username')
- string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password')
- string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory')
- string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel
transformations') - // main test: everything except rtlsim and end2end tests, parallel run with xdist, no parallel transformations to save on memory - string(name: 'DOCKER_CMD_MAIN', defaultValue: """python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n auto" """, description: 'Main test command') - // rtlsim tests: parallel run with pytest-parallel, no parallel transformations to save on memory - string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command') - // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations - string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command') - // allow specifying where to mount the cloned folder from, since Jenkins and FINN may be running in separate containers - string(name: 'WORKSPACE_MOUNT', defaultValue: '/var/jenkins_home/workspace/finn', description: 'Path to Jenkins workspace mount') +node { + def app + stage('Clone repository') { + /* Let's make sure we have the repository cloned to our workspace */ + checkout scm } - environment { - DOCKER_TAG='finn_ci:$BUILD_ID' - DOCKER_INST_NAME='finn_ci' - BUILD_PATH='/tmp/finn_ci' - VIVADO_PATH=${params.FINN_XILINX_PATH}/Vivado/${params.FINN_XILINX_VERSION} - VITIS_PATH=${params.FINN_XILINX_PATH}/Vitis/${params.FINN_XILINX_VERSION} - } - stages { - stage("Clone") { - steps { - git branch: "${params.FINN_CI_BRANCH}", url: 'https://github.com/Xilinx/finn.git' + withEnv([ + "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.1_0420_0327/installs/lin64", + "FINN_XILINX_VERSION=2022.1", + "FINN_DOCKER_TAG=xilinx/finn:jenkins", + "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", + "PLATFORM_REPO_PATHS=/opt/xilinx/dsa" + ]){ + parallel firstBranch: { + stage('Brevitas export') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh python setup.py test --addopts -mbrevitas_export") + } } - } - stage('Build') { - steps { - sh """ - docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \ - --build-arg BUILD_PATH=$BUILD_PATH \ - . 
- """ + }, secondBranch: { + stage('Streamlining transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh python setup.py test --addopts -mstreamline") + } } - } - stage('test-main') { - steps { - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh """ - docker run --init \ - --hostname $DOCKER_INST_NAME \ - -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ - -v ${params.FINN_XILINX_PATH}:${params.FINN_XILINX_PATH}:ro \ - -e NUM_DEFAULT_WORKERS=1 \ - -e FINN_INST_NAME=$DOCKER_INST_NAME \ - -e VIVADO_PATH=$VIVADO_PATH \ - -e VITIS_PATH=$VITIS_PATH \ - -e PYNQ_BOARD=${params.PYNQ_BOARD} \ - -e PYNQ_IP=${params.PYNQ_IP} \ - -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \ - -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \ - -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \ - $DOCKER_TAG ${params.DOCKER_CMD_MAIN} - """} + }, thirdBranch: { + stage('Util functions') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh python setup.py test --addopts -mutil") + } } - } - stage('test-rtlsim') { - steps { - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh """ - docker run --init \ - --hostname $DOCKER_INST_NAME \ - -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ - -v $VIVADO_PATH:$VIVADO_PATH:ro \ - -e NUM_DEFAULT_WORKERS=1 \ - -e FINN_INST_NAME=$DOCKER_INST_NAME \ - -e VIVADO_PATH=$VIVADO_PATH \ - -e VITIS_PATH=$VITIS_PATH \ - -e PYNQ_BOARD=${params.PYNQ_BOARD} \ - -e PYNQ_IP=${params.PYNQ_IP} \ - -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \ - -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \ - -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \ - $DOCKER_TAG ${params.DOCKER_CMD_RTLSIM} - """} + }, fourthBranch: { + stage('General transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh python setup.py test --addopts -mtransform") + } } - } - stage('test-end2end') { - steps { - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - sh """ - docker run --init \ - --hostname $DOCKER_INST_NAME \ - -v ${params.WORKSPACE_MOUNT}:/workspace/finn \ - -v $VIVADO_PATH:$VIVADO_PATH:ro \ - -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \ - -e FINN_INST_NAME=$DOCKER_INST_NAME \ - -e VIVADO_PATH=$VIVADO_PATH \ - -e VITIS_PATH=$VITIS_PATH \ - -e PYNQ_BOARD=${params.PYNQ_BOARD} \ - -e PYNQ_IP=${params.PYNQ_IP} \ - -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \ - -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \ - -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \ - $DOCKER_TAG ${params.DOCKER_CMD_END2END} - """ } + }, fifthBranch: { + stage('Fpgadataflow transformations and simulations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh python setup.py test --addopts -mfpgadataflow") + } } } } diff --git a/docker/jenkins/launch-jenkins.sh b/docker/jenkins/launch-jenkins.sh deleted file mode 100755 index 64dc1ec73f..0000000000 --- a/docker/jenkins/launch-jenkins.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# defaults, can be overriden by environment variables -# user to run Jenkins as -- see NOTE below regarding Docker access permissions -: ${JENKINS_USER=jenkins} -# port for Jenkins on host machine -: ${JENKINS_PORT=8080} -# make Jenkins config persistent by mounting into this folder -: ${JENKINS_HOME=$(pwd)/jenkins_home} - -mkdir -p $JENKINS_HOME - -# build a Jenkins Docker image that also has the Docker CLI installed -docker build -t finn_jenkins -f Dockerfile.jenkins . - -# launch Docker container mounted to local Docker socket -# NOTE: we allow customizing the user (e.g. 
as root) to work around permission -# issues, may not al -docker run -u $JENKINS_USER -p $JENKINS_PORT:8080 -v /var/run/docker.sock:/var/run/docker.sock -v $JENKINS_HOME:/var/jenkins_home finn_jenkins diff --git a/docker/quicktest.sh b/docker/quicktest.sh index b4ad37232f..f625f2b1ef 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -2,7 +2,7 @@ : ${PYTEST_PARALLEL=auto} -cd $FINN_ROOT +cd $FINN_ROOT/finn # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow or board) with pytest-xdist" diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 408b14fd2b..304aa30854 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -8,7 +8,7 @@ Brevitas Export :scale: 70% :align: center -FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. +FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. Two of the Brevitas-exported ONNX variants can be ingested by FINN: * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index ccb891a0ab..12e01db554 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -41,7 +41,7 @@ To use it, first create a folder with the necessary configuration and model file 2. Put your ONNX model to be converted under ``dataflow_build_dir/model.onnx``. The filename is important and must exactly be ``model.onnx``. 3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``. - Read more about the build configuration options on :py:mod:``finn.builder.build_dataflow_config.DataflowBuildConfig``. + Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json`` 4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``. @@ -55,7 +55,7 @@ Now you can invoke the simple dataflow build as follows: ./run-docker.sh build_dataflow Depending on the chosen output products, the dataflow build will run for a while -as it go through numerous steps: +as it goes through numerous steps: .. code-block:: none @@ -186,20 +186,23 @@ This is possible by using the `build_custom` entry as follows: outside the FINN repo folder for cleaner separation. Let's call this folder ``custom_build_dir``. -2. Create a ``custom_build_dir/build.py`` file that will perform the build when -executed. You should also put any ONNX model(s) or other Python modules you -may want to include in your build flow in this folder (so that they get mounted -into the Docker container while building). 
Besides the filename and data placement, +2. Create one or more Python files under this directory that perform the build(s) +you would like when executed, for instance ``custom_build_dir/build.py`` and +``custom_build_dir/build_quick.py``. +You should also put any ONNX model(s) or other +Python modules you may want to include in your build flow in this folder (so that they get +mounted into the Docker container while building). Besides the data placement, you have complete freedom on how to implement the build flow here, including calling the steps from the simple dataflow build mode above, making calls to FINN library functions, preprocessing and altering models, building several variants etc. -You can find a basic example of build.py under ``src/finn/qnn-data/build_dataflow/build.py``. +You can find a basic example of a build flow under ``src/finn/qnn-data/build_dataflow/build.py``. -You can launch the custom build flow using: +You can launch the desired custom build flow using: :: - ./run-docker.sh build_custom + ./run-docker.sh build_custom This will mount the specified folder into the FINN Docker container and launch -your ``build.py``. +the build flow. If ```` is not specified it will default to ``build`` +and thus execute ``build.py``. If it is specified, it will be ``.py``. diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index 508cd86a31..b152dfef66 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -63,40 +63,43 @@ Docker images If you want to add new dependencies (packages, repos) to FINN it's important to understand how we handle this in Docker. -There are currently two Docker images used in FINN: - -* The finn.dev image, used for deploying and developing the FINN compiler. Details described below. -* The finn.ci image, which is used for continuous integration testing. Almost identical to finn.dev image, key differences are no user setup and fewer packages installed (e.g. no Jupyter). The finn.dev image is built and launched as follows: -1. run-docker.sh launches the build of the Docker image with `docker build` +1. run-docker.sh launches fetch-repos.sh to checkout dependency git repos at correct commit hashes (unless ``FINN_SKIP_DEP_REPOS=1``) -2. Docker image is built from docker/Dockerfile.finn_dev using the following steps: +2. run-docker.sh launches the build of the Docker image with `docker build` (unless ``FINN_DOCKER_PREBUILT=1``). Docker image is built from docker/Dockerfile.finn using the following steps: * Base: PyTorch dev image * Set up apt dependencies: apt-get install a few packages for verilator and * Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed. - * Do user setup: Switch to the same user running the container to avoid running as root. - * Clone dependency repos: These include Brevitas, finn-hlslib, finn-base, pyverilator and oh-my-xilinx. The correct commit version will be checked out by the entrypoint script. * Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument. 3. Docker image is ready, run-docker.sh can now launch a container from this image with `docker run`. It sets up certain environment variables and volume mounts: * Vivado/Vitis is mounted from the host into the container (on the same path). 
- * The finn root folder is mounted under /workspace/finn. This allows modifying the source code on the host and testing inside the container. + * The finn root folder is mounted into the container (on the same path). This allows modifying the source code on the host and testing inside the container. * The build folder is mounted under /tmp/finn_dev_username (can be overridden by defining FINN_HOST_BUILD_DIR). This will be used for generated files. Mounting on the host allows easy examination of the generated files, and keeping the generated files after the container exits. * Various environment variables are set up for use inside the container. See the run-docker.sh script for a complete list. 4. Entrypoint script (docker/finn_entrypoint.sh) upon launching container performs the following: - * Update and checkout the dependency repos at specified commits. * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available. * Download PYNQ board files into the finn root directory, unless they already exist. * Source Vitits settings64.sh if Vitis is mounted. 5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options. +(Re-)launching builds outside of Docker +======================================== + +It is possible to launch builds for FINN-generated HLS IP and stitched-IP folders outside of the Docker container. +This may be necessary for visual inspection of the generated designs inside the Vivado GUI, if you run into licensing +issues during synthesis, or other environmental problems. +Simply set the ``FINN_ROOT`` environment variable to the location where the FINN compiler is installed on the host +computer, and you should be able to launch the various .tcl scripts or .xpr project files without using the FINN +Docker container as well. + Linting ======= @@ -118,16 +121,16 @@ The checks are configured in .pre-commit-config.yaml under the repo root. Testing ======= -Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/master/tests. +Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests. These tests can be roughly grouped into three categories: - * Unit tests: targeting unit functionality, e.g. a single transformation. Example: https://github.com/Xilinx/finn/blob/master/tests/transformation/streamline/test_sign_to_thres.py tests the expected behavior of the `ConvertSignToThres` transformation pass. + * Unit tests: targeting unit functionality, e.g. a single transformation. Example: https://github.com/Xilinx/finn/blob/main/tests/transformation/streamline/test_sign_to_thres.py tests the expected behavior of the `ConvertSignToThres` transformation pass. - * Small-scale integration tests: targeting a group of related classes or functions that to test how they behave together. Example: https://github.com/Xilinx/finn/blob/master/tests/fpgadataflow/test_convert_to_hls_conv_layer.py sets up variants of ONNX Conv nodes that are first lowered and then converted to FINN HLS layers. + * Small-scale integration tests: targeting a group of related classes or functions that to test how they behave together. 
Example: https://github.com/Xilinx/finn/blob/main/tests/fpgadataflow/test_convert_to_hls_conv_layer.py sets up variants of ONNX Conv nodes that are first lowered and then converted to FINN HLS layers. - * End-to-end tests: testing a typical 'end-to-end' compilation flow in FINN, where one end is a trained QNN and the other end is a hardware implementation. These tests can be quite large and are typically broken into several steps that depend on prior ones. Examples: https://github.com/Xilinx/finn/tree/master/tests/end2end + * End-to-end tests: testing a typical 'end-to-end' compilation flow in FINN, where one end is a trained QNN and the other end is a hardware implementation. These tests can be quite large and are typically broken into several steps that depend on prior ones. Examples: https://github.com/Xilinx/finn/tree/main/tests/end2end -Additionally, finn-base, brevitas and finn-hlslib also include their own test suites. +Additionally, qonnx, brevitas and finn-hlslib also include their own test suites. The full FINN compiler test suite (which will take several hours to run and require a PYNQ board) can be executed by: diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index a51d56d771..bc5c523071 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -11,7 +11,7 @@ As you can see in the picture, FINN has a high modularity and has the property t The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS and Vivado IPI (orange section). There is also a section for testing and verification in software (red section) and the hardware generation and deployment on the PYNQ board (yellow section). -This example flow is covered in the `end2end_example `_ Jupyter notebooks. +This example flow is covered in the `end2end_example `_ Jupyter notebooks. For a more detailed overview about the different flow sections, please have a look at the corresponding pages: .. toctree:: diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst index 3f1ae0d603..ee58926578 100644 --- a/docs/finn/example_networks.rst +++ b/docs/finn/example_networks.rst @@ -13,22 +13,16 @@ compiler. End-to-end Integration tests ============================ -The FINN compiler uses `several pre-trained QNNs `_ +The FINN compiler uses `several pre-trained QNNs `_ that serve as both examples and testcases. * TFC, SFC, LFC... are fully-connected networks trained on the MNIST dataset * CNV is a convolutional network trained on the CIFAR-10 dataset * w\_a\_ refers to the quantization used for the weights (w) and activations (a) in bits -These networks are built end-to-end as part of the `FINN integration tests `_ , +These networks are built end-to-end as part of the `FINN integration tests `_ , and the key performance indicators (FPGA resource, frames per second...) are automatically posted to the dashboard below. -To implement a new network, you can use the `integration test code `_ +To implement a new network, you can use the `integration test code `_ as a starting point, as well as the `relevant Jupyter notebooks -`_. - -.. 
image:: https://firebasestorage.googleapis.com/v0/b/drive-assets.google.com.a.appspot.com/o/Asset%20-%20Drive%20Icon512.png?alt=media - :width: 50px - :align: left - -`FINN end-to-end integration tests dashboard on Google Drive `_ +`_. diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index e426bdb4e2..ef4457f53a 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -1,8 +1,8 @@ .. _faq: -*********************** +*************************** Frequently Asked Questions -*********************** +*************************** Can't find the answer to your question here? Check `FINN GitHub Discussions `_. @@ -75,7 +75,7 @@ Why does FINN-generated architectures need FIFOs between layers? See https://github.com/Xilinx/finn/discussions/383 How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers? - This is done with the ``resType="dsp"`` attribute on ``StreamingFCLayer`` and ``Vector_Vector_Activate`` instances. + This is done with the ``resType="dsp"`` attribute on ``MatrixVectorActivation`` and ``Vector_Vector_Activate`` instances. When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). This is a good idea for layers with more weight/input act bits and high PE*SIMD. @@ -84,7 +84,7 @@ How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particu How do I tell FINN to utilize a particular type of memory resource in particular layers? This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see - which modes are supported (`example for StreamingFCLayer `_). + which modes are supported (`example for MatrixVectorActivation `_). When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). See the `MobileNet-v1 build config for ZCU104 in finn-examples `_ for reference. @@ -100,7 +100,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian? If you need to do this manually, first examine how the `FINN PYNQ Python drivers `_ do this – notice how the input data is first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be - fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation: + fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. Why does FIFO sizing take so long for my network? Is something wrong? The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index af7a05751b..40425c119f 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -8,7 +8,7 @@ Quickstart ========== 1. Install Docker to run `without root `_ -2. 
Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2020.1``) +2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``) 3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned 4. Execute ``./run-docker.sh quicktest`` to verify your installation. 5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup. @@ -47,7 +47,7 @@ by using the "advanced mode" described in the :ref:`command_line` section. Running FINN in Docker ====================== -FINN only running inside a Docker container, and comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started. +FINN runs inside a Docker container, it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started. You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well. If you want to use prebuilt images, read :ref:`Using a prebuilt image`. The ``run-docker.sh`` script that can be launched in the following modes: @@ -82,9 +82,11 @@ FINN comes with numerous Jupyter notebook tutorials, which you can launch with: bash ./run-docker.sh notebook This will launch the `Jupyter notebook `_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones. -.. note:: The link will look something like this (the token you get will be different): -http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc. -The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments. + +.. note:: + The link will look something like this (the token you get will be different): + http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc. + The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments. Environment variables @@ -94,7 +96,7 @@ Prior to running the `run-docker.sh` script, there are several environment varia These are summarized below: * (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``) -* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2020.1``) +* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.1``) * (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA). * (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``). 
* (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time @@ -113,6 +115,7 @@ These are summarized below: * (optional) ``FINN_DOCKER_RUN_AS_ROOT`` (default 0) if set to 1 then run Docker container as root, default is the current user. * (optional) ``FINN_DOCKER_GPU`` (autodetected) if not 0 then expose all Nvidia GPUs or those selected by ``NVIDIA_VISIBLE_DEVICES`` to Docker container for accelerated DNN training. Requires `Nvidia Container Toolkit `_ * (optional) ``FINN_DOCKER_EXTRA`` (default "") pass extra arguments to the ``docker run`` command when executing ``./run-docker.sh`` +* (optional) ``FINN_SKIP_DEP_REPOS`` (default "0") skips the download of FINN dependency repos (uses the ones already downloaded under deps/. * (optional) ``NVIDIA_VISIBLE_DEVICES`` (default "") specifies specific Nvidia GPUs to use in Docker container. Possible values are a comma-separated list of GPU UUID(s) or index(es) e.g. ``0,1,2``, ``all``, ``none``, or void/empty/unset. * (optional) ``DOCKER_BUILDKIT`` (default "1") enables `Docker BuildKit `_ for faster Docker image rebuilding (recommended). @@ -120,8 +123,8 @@ General FINN Docker tips ************************ * Several folders including the root directory of the FINN compiler and the ``FINN_HOST_BUILD_DIR`` will be mounted into the Docker container and can be used to exchange files. * Do not use ``sudo`` to launch the FINN Docker. Instead, setup Docker to run `without root `_. -* If you want a new terminal on an already-running container, you can do this with `docker exec -it bash`. -* The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up. +* If you want a new terminal on an already-running container, you can do this with ``docker exec -it bash``. +* The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the finn compiler folder (which is mounted from the host computer) or otherwise backed up. Using a prebuilt image ********************** @@ -137,8 +140,10 @@ If you are having trouble building the Docker image or need offline access, you Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. -As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards `_ using PYNQ and Vitis, see instructions below for Alveo setup. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. + +.. warning:: + In previous FINN versions (v0.4b - v0.7) we had support for `Xilinx Alveo boards `_ using PYNQ and Vitis 2020.1, see instructions below for Alveo setup that works with older versions. 
Please note that with the new release with Vitis 2022.1, we do only have experimental support to automatically deployment for Alveo cards. **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. @@ -154,7 +159,7 @@ Start on the target side: Continue on the host side (replace the ```` and ```` with the IP address and username of your board from the first step): 1. Launch the Docker container from where you cloned finn with ``./run-docker.sh`` -2. Go into the `ssh_keys` directory (e.g. ``cd /workspace/finn/ssh_keys``) +2. Go into the `ssh_keys` directory (e.g. ``cd /path/to/finn/ssh_keys``) 3. Run ``ssh-keygen`` to create a key pair e.g. ``id_rsa`` private and ``id_rsa.pub`` public key 4. Run ``ssh-copy-id -i id_rsa.pub @`` to install the keys on the remote system 5. Test that you can ``ssh @`` without having to enter the password. Pass the ``-v`` flag to the ssh command if it doesn't work to help you debug. @@ -200,11 +205,10 @@ System Requirements * Ubuntu 18.04 with ``bash`` installed * Docker `without root `_ -* A working Vivado 2020.1 installation +* A working Vitis/Vivado 2022.1 installation * ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_ * *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts. * *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ -* *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ ) We also recommend running the FINN compiler on a system with sufficiently strong hardware: diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index d03fc400bd..2a64b87943 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -9,12 +9,14 @@ Hardware Build and Deployment :align: center A model where all layers have been converted to HLS layers can be processed by -FINN to build a bitfile targeting either a Zynq or Alveo system. +FINN to build a bitfile and driver targeting a Zynq system or to generate a Vivado IP Integrator (IPI) +design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. + Hardware Build ============== -Internally, the hardware build consists of the following steps: +Internally, the hardware build for Zynq devices consists of the following steps: 1. Driver generation 2. DMA and DWC node insertion @@ -22,12 +24,9 @@ Internally, the hardware build consists of the following steps: 4. FIFO insertion and IP generation 5. Vivado/Vitis project generation and synthesis -.. note:: **In previous FINN releases it was necessary to step through the -individual sub-steps for hardware build manually by calling each transformation. -The hardware build transformations `ZynqBuild` and `VitisBuild` now execute all -necessary sub-transformations. For more control over the build process, the -transformations listed below can still be called individually. -** +.. note:: + In previous FINN releases it was necessary to step through the individual sub-steps for hardware build manually by calling each transformation. 
The hardware build transformation `ZynqBuild` now executes all necessary sub-transformations. For more control over the build process, the transformations listed below can still be called individually. + Driver Generation ------------------ @@ -60,9 +59,7 @@ This is accomplished by the :py:mod:`finn.transformation.fpgadataflow.floorplan. and :py:mod:`finn.transformation.fpgadataflow.create_dataflow_partition.CreateDataflowPartition` transformations. -.. note:: **For Vitis, each partition will be compiled as a separate kernel, -and linked together afterwards. For Zynq, each partition will become an IP -block. ** +.. note:: For Vitis, each partition will be compiled as a separate kernel, and linked together afterwards. For Zynq, each partition will become an IP block. FIFO Insertion and IP Generation diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png index 05031ff9a5..704e5e5bda 100644 Binary files a/docs/finn/img/repo-structure.png and b/docs/finn/img/repo-structure.png differ diff --git a/docs/finn/index.rst b/docs/finn/index.rst index 751b105bb4..c13bf81cec 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -33,9 +33,7 @@ More FINN Resources * `The FINN examples repository `_ -* `List of publications `_ - -* `Roadmap `_ +* `List of publications `_ .. toctree:: :maxdepth: 5 diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index 9305f78402..0b33affc76 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -1,8 +1,8 @@ .. _internals: -********* +********** Internals -********* +********** Intermediate Representation: QONNX and FINN-ONNX ================================================ @@ -14,16 +14,18 @@ FINN uses `ONNX `_ as an intermediate representati Custom Quantization Annotations =============================== -ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`finn.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit. +ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN-ONNX uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`qonnx.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit. Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num. -When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors.
See (:py:mod:`finn.util.basic.sanitize_quant_values`) for more information. +When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`qonnx.util.basic.sanitize_quant_values`) for more information. This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0. +.. note:: In QONNX the quantization is represented differently; for details please check the `QONNX repository `_ . + Custom Operations/Nodes ======================= -FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. +FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look at the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly at the module :py:mod:`finn.custom_op`. .. note:: See the description of `this PR `_ for more on how the operator wrapper library is organized. @@ -39,7 +41,7 @@ To verify correct operation of FINN-ONNX graphs, FINN provides its own ONNX exec ModelWrapper ============ -FINN provides a ModelWrapper class (:py:mod:`finn.core.modelwrapper.ModelWrapper`) as a thin wrapper around ONNX to make it easier to analyze and manipulate ONNX graphs. This wrapper provides many helper functions, while still giving full access to the ONNX protobuf representation. +FINN provides a ModelWrapper class (:py:mod:`qonnx.core.modelwrapper.ModelWrapper`) as a thin wrapper around ONNX to make it easier to analyze and manipulate ONNX graphs. This wrapper provides many helper functions, while still giving full access to the ONNX protobuf representation. Some of the helper functions are described in more detail below. @@ -48,7 +50,7 @@ Create a ModelWrapper instance The ModelWrapper instance can be created using a model in .onnx format or by directly passing a ModelProto instance to the wrapper. The code block below gives an example of how to use the wrapper on a model in .onnx format.
:: - from finn.core.modelwrapper import ModelWrapper + from qonnx.core.modelwrapper import ModelWrapper model = ModelWrapper("model.onnx") Access the ONNX GraphProto through ModelWrapper @@ -116,9 +118,9 @@ As mentioned above there are FINN DataTypes additional to the container datatype model.get_tensor_datatype(tensor_list[2]) # set tensor datatype of third tensor in model tensor list - from finn.core.datatype import DataType + from qonnx.core.datatype import DataType - finn_dtype = DataType.BIPOLAR + finn_dtype = DataType["BIPOLAR"] model.set_tensor_datatype(tensor_list[2], finn_dtype) ModelWrapper contains two helper functions for tensor initializers, one to determine the current initializer and one to set the initializer of a tensor. If there is no initializer, None is returned. @@ -127,7 +129,7 @@ ModelWrapper contains two helper functions for tensor initializers, one to deter # get tensor initializer of third tensor in model tensor list model.get_initializer(tensor_list[2]) -ModelWrapper contains more useful functions, if you are interested please have a look at the ModelWrapper module (:py:mod:`finn.core.modelwrapper.ModelWrapper`) directly. +ModelWrapper contains more useful functions, if you are interested please have a look at the ModelWrapper module (:py:mod:`qonnx.core.modelwrapper.ModelWrapper`) directly. .. _analysis_pass: @@ -146,16 +148,18 @@ A transformation passes changes (transforms) the given model, it gets the model .. _mem_mode: -StreamingFCLayer *mem_mode* -=========================== +MatrixVectorActivation *mem_mode* +================================== -FINN supports two types of the so-called *mem_mode* attrıbute for the node StreamingFCLayer. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently two settings for the *mem_mode* are supported in FINN: +FINN supports three types of the so-called *mem_mode* attribute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently three settings for the *mem_mode* are supported in FINN: * "const" * "decoupled" -The following picture shows the idea behind the two modes. +* "external" + +The following picture shows the idea behind the "const" and "decoupled" modes. .. image:: img/mem_mode.png :scale: 55% @@ -163,7 +167,7 @@ Const mode ---------- -In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `StreamingFCLayer_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. +In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU.
The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. Advantages: @@ -185,7 +189,7 @@ In *decoupled* mode a different variant of the MVAU with three ports is used. Be Advantages: -* better control over the used memory primivites used (see the ram_style attribute in StreamingFCLayer) +* better control over the memory primitives used (see the ram_style attribute in MatrixVectorActivation) * potentially faster HLS synthesis time since weight array shape is no longer part of HLS synthesis diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst index f5c64e76a4..566eda5bac 100644 --- a/docs/finn/nw_prep.rst +++ b/docs/finn/nw_prep.rst @@ -17,13 +17,13 @@ Various transformations are involved in the network preparation. The following i Tidy-up transformations ======================= -These transformations do not appear in the diagram above, but are applied in many steps in the FINN flow to postprocess the model after a transformation and/or prepare it for the next transformation. They ensure that all information is set and behave like a "tidy-up". These transformations are the following: +These transformations do not appear in the diagram above, but are applied in many steps in the FINN flow to postprocess the model after a transformation and/or prepare it for the next transformation. They ensure that all information is set and behave like a "tidy-up". These transformations are located in the `QONNX repository `_ and can be imported: -* :py:mod:`finn.transformation.general.GiveReadableTensorNames` and :py:mod:`finn.transformation.general.GiveUniqueNodeNames` +* :py:mod:`qonnx.transformation.general.GiveReadableTensorNames` and :py:mod:`qonnx.transformation.general.GiveUniqueNodeNames` -* :py:mod:`finn.transformation.infer_datatypes.InferDataTypes` and :py:mod:`finn.transformation.infer_shapes.InferShapes` +* :py:mod:`qonnx.transformation.infer_datatypes.InferDataTypes` and :py:mod:`qonnx.transformation.infer_shapes.InferShapes` -* :py:mod:`finn.transformation.fold_constants.FoldConstants` +* :py:mod:`qonnx.transformation.fold_constants.FoldConstants` Streamlining Transformations ============================ @@ -35,7 +35,7 @@ After this transformation the ONNX model is streamlined and contains now custom Convert to HLS Layers ===================== -Pairs of binary XNORPopcountMatMul layers are converted to StreamingFCLayers and following Multithreshold layers are absorbed into the Matrix-Vector-Activate-Unit (MVAU). The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MVAU can be implemented in two different modes, *const* and *decoupled*, see chapter :ref:`mem_mode`. +In this step standard or custom layers are converted to HLS layers. HLS layers are layers that directly correspond to a finn-hlslib function call. For example, pairs of binary XNORPopcountMatMul and MultiThreshold layers are converted to MatrixVectorActivation layers. The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MatrixVectorActivation layer can be implemented in three different modes, *const*, *decoupled* (see chapter :ref:`mem_mode`) and *external*.
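As a rough sketch (the file names are placeholders, and the exact transformation class to call, here ``InferBinaryMatrixVectorActivation``, depends on the layer types present in the model and on the FINN version), this conversion step can be driven from Python like this::

    from qonnx.core.modelwrapper import ModelWrapper
    import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

    # "model.onnx" is a placeholder for a streamlined FINN-ONNX model
    model = ModelWrapper("model.onnx")
    # infer MatrixVectorActivation HLS layers from binary XNORPopcountMatMul
    # (+ MultiThreshold) pairs; mem_mode selects the weight access style
    model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode="decoupled"))
    model.save("model_hls.onnx")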
Dataflow Partitioning ===================== @@ -43,8 +43,8 @@ In the next step the graph is split and the part consisting of HLS layers is further processed in the FINN flow. The parent graph containing the non-HLS layers remains. The PE and SIMD are set to 1 by default, so the result is a network of only HLS layers with maximum folding. The model can be verified using the *cppsim* simulation. It is a simulation using C++ and is described in more detail in chapter :ref:`verification`. Folding -======= +========= To adjust the folding, the values for PE and SIMD can be increased to also achieve an increase in performance. The result can be verified using the same simulation flow as for the network with maximum folding (*cppsim* using C++), for details please have a look at chapter :ref:`verification`. -The result is a network of HLS layers with desired folding and it can be passed to :ref:`vivado_synth`. +The result is a network of HLS layers with the desired folding and it can be passed to :ref:`hw_build`. diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst index 1de42ac32b..f2321dbee7 100644 --- a/docs/finn/source_code/finn.analysis.rst +++ b/docs/finn/source_code/finn.analysis.rst @@ -15,26 +15,26 @@ Submodules Analysis Passes =============== -finn.analysis.base +qonnx.analysis.base ----------------------------- -.. automodule:: finn.analysis.base +.. automodule:: qonnx.analysis.base :members: :undoc-members: :show-inheritance: -finn.analysis.inference\_cost ------------------------------ +qonnx.analysis.inference\_cost +------------------------------- -.. automodule:: finn.analysis.inference_cost +.. automodule:: qonnx.analysis.inference_cost :members: :undoc-members: :show-inheritance: -finn.analysis.topology +qonnx.analysis.topology ----------------------------- -.. automodule:: finn.analysis.topology +.. automodule:: qonnx.analysis.topology :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 86afd5a106..4e3de458e1 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -5,34 +5,34 @@ Core Modules ======= -finn.core.data\_layout +qonnx.core.data\_layout ------------------------- -.. automodule:: finn.core.data_layout +.. automodule:: qonnx.core.data_layout :members: :undoc-members: :show-inheritance: -finn.core.datatype +qonnx.core.datatype ------------------------- -.. automodule:: finn.core.datatype +.. automodule:: qonnx.core.datatype :members: :undoc-members: :show-inheritance: -finn.core.execute\_custom\_node +qonnx.core.execute\_custom\_node -------------------------------------- -.. automodule:: finn.core.execute_custom_node +.. automodule:: qonnx.core.execute_custom_node :members: :undoc-members: :show-inheritance: -finn.core.modelwrapper +qonnx.core.modelwrapper ----------------------------- -.. automodule:: finn.core.modelwrapper +..
automodule:: qonnx.core.modelwrapper :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 34a6285f22..cc56ea603e 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -22,7 +22,7 @@ finn.custom\_op.fpgadataflow.addstreams\_batch :show-inheritance: finn.custom\_op.fpgadataflow.channelwise\_op\_batch ------------------------------------------------ +----------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch :members: @@ -55,7 +55,7 @@ finn.custom\_op.fpgadataflow.downsampler :show-inheritance: finn.custom\_op.fpgadataflow.duplicatestreams\_batch ------------------------------------------------ +------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch :members: @@ -71,7 +71,7 @@ finn.custom\_op.fpgadataflow.fmpadding\_batch :show-inheritance: finn.custom\_op.fpgadataflow.globalaccpool\_batch ------------------------------------------------ +--------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch :members: @@ -127,10 +127,10 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.streamingfclayer\_batch +finn.custom\_op.fpgadataflow.matrixvectoractivation ----------------------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingfclayer_batch +.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation :members: :undoc-members: :show-inheritance: @@ -160,7 +160,7 @@ finn.custom\_op.fpgadataflow.templates :show-inheritance: finn.custom\_op.fpgadataflow.thresholding\_batch ------------------------------------------------ +------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.thresholding_batch :members: @@ -184,10 +184,10 @@ finn.custom\_op.fpgadataflow.upsampler :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.vector\_vector\_activate\_batch ------------------------------------------------ +finn.custom\_op.fpgadataflow.vectorvectoractivation +----------------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.vector_vector_activate_batch +.. automodule:: finn.custom_op.fpgadataflow.vectorvectoractivation :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.general.rst b/docs/finn/source_code/finn.custom_op.general.rst deleted file mode 100644 index 87749fd69e..0000000000 --- a/docs/finn/source_code/finn.custom_op.general.rst +++ /dev/null @@ -1,86 +0,0 @@ -************************ -Custom Op - General -************************ - -General Custom Ops -=================== - -finn.custom\_op.general.bipolar_quant --------------------------------------- - -.. automodule:: finn.custom_op.general.bipolar_quant - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.debugmarker ------------------------------------ - -.. automodule:: finn.custom_op.general.debugmarker - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.genericpartition ------------------------------------------ - -.. 
automodule:: finn.custom_op.general.genericpartition - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.im2col ------------------------------- - -.. automodule:: finn.custom_op.general.im2col - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.maxpoolnhwc ------------------------------------- - -.. automodule:: finn.custom_op.general.maxpoolnhwc - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.multithreshold ---------------------------------------- - -.. automodule:: finn.custom_op.general.multithreshold - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.quant ------------------------------- - -.. automodule:: finn.custom_op.general.quant - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.quantavgpool2d --------------------------------------- - -.. automodule:: finn.custom_op.general.quantavgpool2d - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.trunc ------------------------------- - -.. automodule:: finn.custom_op.general.trunc - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.general.xnorpopcount -------------------------------------- - -.. automodule:: finn.custom_op.general.xnorpopcount - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst index 1ee3e1dce1..20d90a7bb5 100644 --- a/docs/finn/source_code/finn.custom_op.rst +++ b/docs/finn/source_code/finn.custom_op.rst @@ -9,7 +9,7 @@ Submodules :maxdepth: 2 finn.custom_op.fpgadataflow - finn.custom_op.general + qonnx.custom_op.general Custom Op Nodes =============== @@ -17,15 +17,15 @@ Custom Op Nodes Base Class ---------- -.. automodule:: finn.custom_op.base +.. automodule:: qonnx.custom_op.base :members: :undoc-members: :show-inheritance: -finn.custom\_op.registry +qonnx.custom\_op.registry ------------------------- -.. automodule:: finn.custom_op.registry +.. automodule:: qonnx.custom_op.registry :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst index 607ac636a4..5547a46623 100644 --- a/docs/finn/source_code/finn.rst +++ b/docs/finn/source_code/finn.rst @@ -3,7 +3,7 @@ FINN API ******** The FINN sources are divided into different modules. They are listed below. -.. note:: **Some of these functions and modules are located in the `finn-base` repository.** +.. note:: **Some of these functions and modules are located in the `qonnx` repository.** Modules ======= diff --git a/docs/finn/source_code/finn.transformation.qonnx.rst b/docs/finn/source_code/finn.transformation.qonnx.rst index 8320e19efb..1332639b1d 100644 --- a/docs/finn/source_code/finn.transformation.qonnx.rst +++ b/docs/finn/source_code/finn.transformation.qonnx.rst @@ -1,4 +1,4 @@ -*********************** +************************ Transformation - QONNX ************************ diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index cffb0fd0f9..6a28eeedb2 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -25,147 +25,147 @@ Base Class :undoc-members: :show-inheritance: -finn.transformation.batchnorm\_to\_affine +qonnx.transformation.batchnorm\_to\_affine ------------------------------------------------ -.. automodule:: finn.transformation.batchnorm_to_affine +.. 
automodule:: qonnx.transformation.batchnorm_to_affine :members: :undoc-members: :show-inheritance: -finn.transformation.bipolar\_to\_xnor +qonnx.transformation.bipolar\_to\_xnor -------------------------------------------- -.. automodule:: finn.transformation.bipolar_to_xnor +.. automodule:: qonnx.transformation.bipolar_to_xnor :members: :undoc-members: :show-inheritance: -finn.transformation.change\_3d\_tensors\_to\_4d +qonnx.transformation.change\_3d\_tensors\_to\_4d ------------------------------------------------ -.. automodule:: finn.transformation.change_3d_tensors_to_4d +.. automodule:: qonnx.transformation.change_3d_tensors_to_4d :members: :undoc-members: :show-inheritance: -finn.transformation.change\_datalayout +qonnx.transformation.change\_datalayout -------------------------------------------- -.. automodule:: finn.transformation.change_datalayout +.. automodule:: qonnx.transformation.change_datalayout :members: :undoc-members: :show-inheritance: -finn.transformation.create\_generic\_partitions +qonnx.transformation.create\_generic\_partitions ------------------------------------------------ -.. automodule:: finn.transformation.create_generic_partitions +.. automodule:: qonnx.transformation.create_generic_partitions :members: :undoc-members: :show-inheritance: -finn.transformation.double\_to\_single\_float +qonnx.transformation.double\_to\_single\_float ---------------------------------------------------- -.. automodule:: finn.transformation.double_to_single_float +.. automodule:: qonnx.transformation.double_to_single_float :members: :undoc-members: :show-inheritance: -finn.transformation.extend\_partition +qonnx.transformation.extend\_partition ------------------------------------------ -.. automodule:: finn.transformation.extend_partition +.. automodule:: qonnx.transformation.extend_partition :members: :undoc-members: :show-inheritance: -finn.transformation.extract\_conv\_bias +qonnx.transformation.extract\_conv\_bias ------------------------------------------ -.. automodule:: finn.transformation.extract_conv_bias +.. automodule:: qonnx.transformation.extract_conv_bias :members: :undoc-members: :show-inheritance: -finn.transformation.fold\_constants +qonnx.transformation.fold\_constants ------------------------------------------ -.. automodule:: finn.transformation.fold_constants +.. automodule:: qonnx.transformation.fold_constants :members: :undoc-members: :show-inheritance: -finn.transformation.gemm\_to\_matmul +qonnx.transformation.gemm\_to\_matmul ------------------------------------------ -.. automodule:: finn.transformation.gemm_to_matmul +.. automodule:: qonnx.transformation.gemm_to_matmul :members: :undoc-members: :show-inheritance: -finn.transformation.general +qonnx.transformation.general ---------------------------------- -.. automodule:: finn.transformation.general +.. automodule:: qonnx.transformation.general :members: :undoc-members: :show-inheritance: -finn.transformation.infer\_data\_layouts +qonnx.transformation.infer\_data\_layouts ------------------------------------------- -.. automodule:: finn.transformation.infer_data_layouts +.. automodule:: qonnx.transformation.infer_data_layouts :members: :undoc-members: :show-inheritance: -finn.transformation.infer\_datatypes +qonnx.transformation.infer\_datatypes ------------------------------------------- -.. automodule:: finn.transformation.infer_datatypes +.. 
automodule:: qonnx.transformation.infer_datatypes :members: :undoc-members: :show-inheritance: -finn.transformation.infer\_shapes +qonnx.transformation.infer\_shapes ---------------------------------------- -.. automodule:: finn.transformation.infer_shapes +.. automodule:: qonnx.transformation.infer_shapes :members: :undoc-members: :show-inheritance: -finn.transformation.insert\_topk +qonnx.transformation.insert\_topk --------------------------------------- -.. automodule:: finn.transformation.insert_topk +.. automodule:: qonnx.transformation.insert_topk :members: :undoc-members: :show-inheritance: -finn.transformation.lower\_convs\_to\_matmul +qonnx.transformation.lower\_convs\_to\_matmul --------------------------------------------------- -.. automodule:: finn.transformation.lower_convs_to_matmul +.. automodule:: qonnx.transformation.lower_convs_to_matmul :members: :undoc-members: :show-inheritance: -finn.transformation.make\_input\_chanlast +qonnx.transformation.make\_input\_chanlast ------------------------------------------ -.. automodule:: finn.transformation.make_input_chanlast +.. automodule:: qonnx.transformation.make_input_chanlast :members: :undoc-members: :show-inheritance: -finn.transformation.merge\_onnx\_models +qonnx.transformation.merge\_onnx\_models ---------------------------------------- -.. automodule:: finn.transformation.merge_onnx_models +.. automodule:: qonnx.transformation.merge_onnx_models :members: :undoc-members: :show-inheritance: @@ -179,10 +179,10 @@ finn.transformation.move\_reshape :undoc-members: :show-inheritance: -finn.transformation.remove +qonnx.transformation.remove ------------------------------------- -.. automodule:: finn.transformation.remove +.. automodule:: qonnx.transformation.remove :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 62b72c2ac8..8dffa01632 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -5,24 +5,33 @@ Util Utility Modules =============== -finn.util.basic +qonnx.util.basic ---------------------- -.. automodule:: finn.util.basic +.. automodule:: qonnx.util.basic :members: :undoc-members: :show-inheritance: -finn.util.config ----------------- -.. automodule:: finn.util.config +qonnx.util.config +-------------------- + +.. automodule:: qonnx.util.config :members: :undoc-members: :show-inheritance: +finn.util.basic +---------------------- + +.. automodule:: finn.util.basic + :members: + :undoc-members: + :show-inheritance: + finn.util.create ----------------- +------------------ .. automodule:: finn.util.create :members: @@ -63,11 +72,10 @@ finn.util.imagenet :undoc-members: :show-inheritance: - -finn.util.onnx +qonnx.util.onnx --------------------- -.. automodule:: finn.util.onnx +.. automodule:: qonnx.util.onnx :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/qonnx.custom_op.general.rst b/docs/finn/source_code/qonnx.custom_op.general.rst new file mode 100644 index 0000000000..84609971ed --- /dev/null +++ b/docs/finn/source_code/qonnx.custom_op.general.rst @@ -0,0 +1,86 @@ +************************ +Custom Op - General +************************ + +General Custom Ops +=================== + +qonnx.custom\_op.general.bipolar_quant +-------------------------------------- + +.. automodule:: qonnx.custom_op.general.bipolar_quant + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.debugmarker +------------------------------------ + +.. 
automodule:: qonnx.custom_op.general.debugmarker + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.genericpartition +----------------------------------------- + +.. automodule:: qonnx.custom_op.general.genericpartition + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.im2col +------------------------------- + +.. automodule:: qonnx.custom_op.general.im2col + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.maxpoolnhwc +------------------------------------ + +.. automodule:: qonnx.custom_op.general.maxpoolnhwc + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.multithreshold +--------------------------------------- + +.. automodule:: qonnx.custom_op.general.multithreshold + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.quant +------------------------------ + +.. automodule:: qonnx.custom_op.general.quant + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.quantavgpool2d +--------------------------------------- + +.. automodule:: qonnx.custom_op.general.quantavgpool2d + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.trunc +------------------------------ + +.. automodule:: qonnx.custom_op.general.trunc + :members: + :undoc-members: + :show-inheritance: + +qonnx.custom\_op.general.xnorpopcount +------------------------------------- + +.. automodule:: qonnx.custom_op.general.xnorpopcount + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 4c260ecfb1..110f77c5b1 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -5,7 +5,7 @@ Tutorials ********* FINN provides several Jupyter notebooks that can help to get familiar with the basics, the internals and the end-to-end flow in FINN. -All Jupyter notebooks can be found in the repo in the `notebook folder `_. +All Jupyter notebooks can be found in the repo in the `notebook folder `_. Basics ====== @@ -23,7 +23,7 @@ The notebooks in this folder should give a basic insight into FINN, how to get s End-to-End Flow =============== -There are two groups of notebooks currently available under `the end2end_example directory `_ : +There are two groups of notebooks currently available under `the end2end_example directory `_ : * ``cybersecurity`` shows how to train a quantized MLP with Brevitas and deploy it with FINN using the :ref:`command_line` build system. diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index 7c636941ad..e1a9ac4b31 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -8,7 +8,7 @@ Functional Verification :scale: 70% :align: center -This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder `_. +This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder `_. When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. 
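As a minimal sketch (the model file name is a placeholder and the input is random data; in practice you would feed real test vectors), executing a model through this module looks like::

    import numpy as np
    from qonnx.core.modelwrapper import ModelWrapper
    from finn.core.onnx_exec import execute_onnx

    model = ModelWrapper("model.onnx")  # placeholder file name
    iname = model.graph.input[0].name
    ishape = model.get_tensor_shape(iname)
    # input_dict maps input tensor names to numpy arrays
    input_dict = {iname: np.random.randn(*ishape).astype(np.float32)}
    # output_dict maps output tensor names to the computed results
    output_dict = execute_onnx(model, input_dict)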
The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods: diff --git a/docs/finn/vivado_synth.rst b/docs/finn/vivado_synth.rst deleted file mode 100644 index ca8b8ad655..0000000000 --- a/docs/finn/vivado_synth.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _vivado_synth: - -************************* -Vivado HLS and Vivado IPI -************************* - -.. image:: img/vivado-synth.png - :scale: 70% - :align: center - -In this step the system is handed over to Vivado. To do this, IP blocks are created from each layer using Vivado HLS and then stitched together using Vivado IP Integrator. This creates a Vivado design of the entire network. The design can be verified using `PyVerilator `_ either on the network with the unstitched IP blocks or on the stitched IP. The generated verilog files are passed to PyVerilator and in this way the model can be emulated. This procedure is called *rtlsim* in FINN flow and details can be found in the chapter :ref:`verification`. - -Once the model is in the form of a stitched IP, it can be passed to the next flow step :ref:`pynq_deploy`. diff --git a/fetch-repos.sh b/fetch-repos.sh new file mode 100755 index 0000000000..1fb830e349 --- /dev/null +++ b/fetch-repos.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright (c) 2020-2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +QONNX_COMMIT="4a4826641db8d34619d31eac155fe95af11692eb" +FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" +BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" +PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" +CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" +HLSLIB_COMMIT="e9946e5e56acd85837e8e79224d2bb60764bed69" +OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" +AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" +XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" +EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232" + +QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" +FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" +BREVITAS_URL="https://github.com/Xilinx/brevitas.git" +PYVERILATOR_URL="https://github.com/maltanar/pyverilator.git" +CNPY_URL="https://github.com/rogersce/cnpy.git" +HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git" +OMX_URL="https://github.com/maltanar/oh-my-xilinx.git" +AVNET_BDF_URL="https://github.com/Avnet/bdf.git" +XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" + +QONNX_DIR="qonnx" +FINN_EXP_DIR="finn-experimental" +BREVITAS_DIR="brevitas" +PYVERILATOR_DIR="pyverilator" +CNPY_DIR="cnpy" +HLSLIB_DIR="finn-hlslib" +OMX_DIR="oh-my-xilinx" +AVNET_BDF_DIR="avnet-bdf" +XIL_BDF_DIR="xil-bdf" + +# absolute path to this script, e.g. /home/user/bin/foo.sh +SCRIPT=$(readlink -f "$0") +# absolute path this script is in, thus /home/user/bin +SCRIPTPATH=$(dirname "$SCRIPT") + +fetch_repo() { + # URL for git repo to be cloned + REPO_URL=$1 + # commit hash for repo + REPO_COMMIT=$2 + # directory to clone to under deps/ + REPO_DIR=$3 + # absolute path for the repo local copy + CLONE_TO=$SCRIPTPATH/deps/$REPO_DIR + + # clone repo if dir not found + if [ ! -d "$CLONE_TO" ]; then + git clone $REPO_URL $CLONE_TO + fi + # verify and try to pull repo if not at correct commit + CURRENT_COMMIT=$(git -C $CLONE_TO rev-parse HEAD) + if [ $CURRENT_COMMIT != $REPO_COMMIT ]; then + git -C $CLONE_TO pull + # checkout the expected commit + git -C $CLONE_TO checkout $REPO_COMMIT + fi + # verify one last time + CURRENT_COMMIT=$(git -C $CLONE_TO rev-parse HEAD) + if [ $CURRENT_COMMIT == $REPO_COMMIT ]; then + echo "Successfully checked out $REPO_DIR at commit $CURRENT_COMMIT" + else + echo "Could not check out $REPO_DIR. Check your internet connection and try again." + fi +} + +fetch_board_files() { + echo "Downloading and extracting board files..." + mkdir -p "$SCRIPTPATH/deps/board_files" + OLD_PWD=$(pwd) + cd "$SCRIPTPATH/deps/board_files" + wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip + wget -q https://dpoauwgwqsy2x.cloudfront.net/Download/pynq-z2.zip + unzip -q pynq-z1.zip + unzip -q pynq-z2.zip + cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/ + cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/; + cd $OLD_PWD +} + +fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR +fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR +fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR +fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR +fetch_repo $CNPY_URL $CNPY_COMMIT $CNPY_DIR +fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR +fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR +fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR +fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR + +# download extra Pynq board files and extract if needed +if [ ! 
-d "$SCRIPTPATH/deps/board_files" ]; then + fetch_board_files +else + cd $SCRIPTPATH + BOARD_FILES_MD5=$(find deps/board_files/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1) + if [ "$BOARD_FILES_MD5" = "$EXP_BOARD_FILES_MD5" ]; then + echo "Verified board files folder content md5: $BOARD_FILES_MD5" + else + echo "Board files folder content mismatch, removing and re-downloading" + rm -rf deps/board_files/ + fetch_board_files + fi +fi diff --git a/finn-rtllib/axi_info/component.xml b/finn-rtllib/axi_info/component.xml new file mode 100644 index 0000000000..d22637534f --- /dev/null +++ b/finn-rtllib/axi_info/component.xml @@ -0,0 +1,708 @@ + + + AMD + user + axi_info_top + 1.0 + + + s_axi + + + + + + + + + AWADDR + + + s_axi_AWADDR + + + + + AWVALID + + + s_axi_AWVALID + + + + + AWREADY + + + s_axi_AWREADY + + + + + WDATA + + + s_axi_WDATA + + + + + WSTRB + + + s_axi_WSTRB + + + + + WVALID + + + s_axi_WVALID + + + + + WREADY + + + s_axi_WREADY + + + + + BRESP + + + s_axi_BRESP + + + + + BVALID + + + s_axi_BVALID + + + + + BREADY + + + s_axi_BREADY + + + + + ARADDR + + + s_axi_ARADDR + + + + + ARVALID + + + s_axi_ARVALID + + + + + ARREADY + + + s_axi_ARREADY + + + + + RDATA + + + s_axi_RDATA + + + + + RRESP + + + s_axi_RRESP + + + + + RVALID + + + s_axi_RVALID + + + + + RREADY + + + s_axi_RREADY + + + + + + ap_rst_n + + + + + + + RST + + + ap_rst_n + + + + + + POLARITY + ACTIVE_LOW + + + + + ap_clk + + + + + + + CLK + + + ap_clk + + + + + + ASSOCIATED_RESET + ap_rst_n + + + ASSOCIATED_BUSIF + s_axi + + + + + + + s_axi + s_axi + + reg0 + reg0 + 0x0 + 0x1000 + 32 + register + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + SystemVerilog + axi_info_top + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + 7d682dfc + + + + + xilinx_anylanguagebehavioralsimulation + Simulation + :vivado.xilinx.com:simulation + SystemVerilog + axi_info_top + + xilinx_anylanguagebehavioralsimulation_view_fileset + + + + viewChecksum + 7d682dfc + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + e11f9727 + + + + + + + ap_clk + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + ap_rst_n + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_AWVALID + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_AWREADY + + out + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_AWADDR + + in + + 4 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_WVALID + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_WREADY + + out + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_WDATA + + in + + 31 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_WSTRB + + in + + 3 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 1 + + + + + s_axi_BVALID + + out + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_BREADY + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_BRESP + + out + + 1 + 0 + + + + logic + 
xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_ARVALID + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_ARREADY + + out + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_ARADDR + + in + + 4 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_RVALID + + out + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_RREADY + + in + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axi_RDATA + + out + + 31 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axi_RRESP + + out + + 1 + 0 + + + + logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + + + SIG_CUSTOMER + Sig Customer + 0 + + + SIG_APPLICATION + Sig Application + 0 + + + VERSION + Version + 0 + + + CHECKSUM_COUNT + Checksum Count + 0 + + + + + + choice_list_9d8b0d81 + ACTIVE_HIGH + ACTIVE_LOW + + + + + xilinx_anylanguagesynthesis_view_fileset + + hdl/axi_info.sv + systemVerilogSource + + + hdl/axi_info_top.sv + systemVerilogSource + CHECKSUM_ec9ff0da + + + + xilinx_anylanguagebehavioralsimulation_view_fileset + + hdl/axi_info.sv + systemVerilogSource + + + hdl/axi_info_top.sv + systemVerilogSource + + + + xilinx_xpgui_view_fileset + + xgui/axi_info_top_v1_0.tcl + tclSource + CHECKSUM_e11f9727 + XGUI_VERSION_2 + + + + axi_info_top_v1_0 + + + SIG_CUSTOMER + Sig Customer + 0 + + + SIG_APPLICATION + Sig Application + 0 + + + VERSION + Version + 0 + + + CHECKSUM_COUNT + Checksum Count + 0 + + + Component_Name + axi_info_top_v1_0 + + + + + + virtex7 + qvirtex7 + versal + kintex7 + kintex7l + qkintex7 + qkintex7l + akintex7 + artix7 + artix7l + aartix7 + qartix7 + zynq + qzynq + azynq + spartan7 + aspartan7 + virtexu + zynquplus + virtexuplus + virtexuplusHBM + virtexuplus58g + kintexuplus + artixuplus + kintexu + + + /UserIP + + axi_info_top_v1_0 + package_project + 5 + 2022-05-30T14:16:13Z + + + 2022.1 + + + + + + + + + diff --git a/finn-rtllib/axi_info/hdl/axi_info.sv b/finn-rtllib/axi_info/hdl/axi_info.sv new file mode 100644 index 0000000000..c0f35730c7 --- /dev/null +++ b/finn-rtllib/axi_info/hdl/axi_info.sv @@ -0,0 +1,119 @@ +/****************************************************************************** + * Copyright (c) 2022, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Read-only exposure of compiled-in info data on AXI-lite. + * @author Thomas B. Preußer + * + *******************************************************************************/ +module axi_info #( + int unsigned N, + int unsigned S_AXI_DATA_WIDTH = 32, + bit [S_AXI_DATA_WIDTH-1:0] DATA[N] +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axi_AWVALID, + output logic s_axi_AWREADY, + input logic [$clog2(N)+1:0] s_axi_AWADDR, + + input logic s_axi_WVALID, + output logic s_axi_WREADY, + input logic [S_AXI_DATA_WIDTH -1:0] s_axi_WDATA, + input logic [S_AXI_DATA_WIDTH/8-1:0] s_axi_WSTRB, + + output logic s_axi_BVALID, + input logic s_axi_BREADY, + output logic [1:0] s_axi_BRESP, + + // Reading + input logic s_axi_ARVALID, + output logic s_axi_ARREADY, + input logic [$clog2(N)+1:0] s_axi_ARADDR, + + output logic s_axi_RVALID, + input logic s_axi_RREADY, + output logic [S_AXI_DATA_WIDTH-1:0] s_axi_RDATA, + output logic [ 1:0] s_axi_RRESP +); + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + //----------------------------------------------------------------------- + // Error out all Writes + if(1) begin : blkKillWrites + logic WABusy = 0; + logic WDBusy = 0; + uwire clr = rst || (WABusy && WDBusy && s_axi_BREADY); + always_ff @(posedge clk) begin : blockName + if(clr) begin + WABusy <= 0; + WDBusy <= 0; + end + else begin + WABusy <= WABusy || s_axi_AWVALID; + WDBusy <= WDBusy || s_axi_WVALID; + end + end + assign s_axi_AWREADY = !WABusy; + assign s_axi_WREADY = !WDBusy; + assign s_axi_BVALID = WABusy && WDBusy; + assign s_axi_BRESP = '1; // DECERR + + end : blkKillWrites + + //----------------------------------------------------------------------- + // Answer Reads + if(1) begin : blkRead + logic RValid = 0; + logic [S_AXI_DATA_WIDTH-1:0] RData;// = 'x; + always_ff @(posedge clk) begin + if(rst) begin + RValid <= 0; + RData <= 'x; + end + else if(s_axi_ARREADY) begin + automatic logic [$left(s_axi_ARADDR):2] addr_eff = s_axi_ARADDR[$left(s_axi_ARADDR):2]; + RValid <= s_axi_ARVALID; + RData <= (addr_eff < N)? DATA[addr_eff] : 32'hDEADDEAD; + end + end + assign s_axi_ARREADY = !RValid || s_axi_RREADY; + assign s_axi_RVALID = RValid; + assign s_axi_RDATA = RData; + assign s_axi_RRESP = '0; // OKAY + + end : blkRead + +endmodule : axi_info diff --git a/finn-rtllib/axi_info/hdl/axi_info_top.sv b/finn-rtllib/axi_info/hdl/axi_info_top.sv new file mode 100644 index 0000000000..ab2cfc8bed --- /dev/null +++ b/finn-rtllib/axi_info/hdl/axi_info_top.sv @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2022, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + * + *******************************************************************************/ +module axi_info_top #( + bit [31:0] SIG_CUSTOMER, + bit [31:0] SIG_APPLICATION, + bit [31:0] VERSION, + bit [31:0] CHECKSUM_COUNT +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axi_AWVALID, + output logic s_axi_AWREADY, + input logic [4:0] s_axi_AWADDR, + + input logic s_axi_WVALID, + output logic s_axi_WREADY, + input logic [31:0] s_axi_WDATA, + input logic [ 3:0] s_axi_WSTRB, + + output logic s_axi_BVALID, + input logic s_axi_BREADY, + output logic [1:0] s_axi_BRESP, + + // Reading + input logic s_axi_ARVALID, + output logic s_axi_ARREADY, + input logic [4:0] s_axi_ARADDR, + + output logic s_axi_RVALID, + input logic s_axi_RREADY, + output logic [31:0] s_axi_RDATA, + output logic [ 1:0] s_axi_RRESP +); + + axi_info #( + .N(6), + .S_AXI_DATA_WIDTH(32), + .DATA('{ + 32'h4649_4E4E, + SIG_CUSTOMER, + SIG_APPLICATION, + VERSION, + 32'h0, + CHECKSUM_COUNT + }) + ) inst ( + //- Global Control ------------------ + .ap_clk, .ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + .s_axi_AWVALID, .s_axi_AWREADY, .s_axi_AWADDR, + .s_axi_WVALID, .s_axi_WREADY, .s_axi_WDATA, .s_axi_WSTRB, + .s_axi_BVALID, .s_axi_BREADY, .s_axi_BRESP, + // Reading + .s_axi_ARVALID, .s_axi_ARREADY, .s_axi_ARADDR, + .s_axi_RVALID, .s_axi_RREADY, .s_axi_RDATA, .s_axi_RRESP + ); + +endmodule : axi_info_top diff --git a/finn-rtllib/axi_info/xgui/axi_info_top_v1_0.tcl b/finn-rtllib/axi_info/xgui/axi_info_top_v1_0.tcl new file mode 100644 index 0000000000..76ab1a5c5b --- /dev/null +++ b/finn-rtllib/axi_info/xgui/axi_info_top_v1_0.tcl @@ -0,0 +1,69 @@ +# Definitional proc to organize widgets for parameters. 
+proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + ipgui::add_param $IPINST -name "CHECKSUM_COUNT" -parent ${Page_0} + ipgui::add_param $IPINST -name "SIG_APPLICATION" -parent ${Page_0} + ipgui::add_param $IPINST -name "SIG_CUSTOMER" -parent ${Page_0} + ipgui::add_param $IPINST -name "VERSION" -parent ${Page_0} + + +} + +proc update_PARAM_VALUE.CHECKSUM_COUNT { PARAM_VALUE.CHECKSUM_COUNT } { + # Procedure called to update CHECKSUM_COUNT when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.CHECKSUM_COUNT { PARAM_VALUE.CHECKSUM_COUNT } { + # Procedure called to validate CHECKSUM_COUNT + return true +} + +proc update_PARAM_VALUE.SIG_APPLICATION { PARAM_VALUE.SIG_APPLICATION } { + # Procedure called to update SIG_APPLICATION when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.SIG_APPLICATION { PARAM_VALUE.SIG_APPLICATION } { + # Procedure called to validate SIG_APPLICATION + return true +} + +proc update_PARAM_VALUE.SIG_CUSTOMER { PARAM_VALUE.SIG_CUSTOMER } { + # Procedure called to update SIG_CUSTOMER when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.SIG_CUSTOMER { PARAM_VALUE.SIG_CUSTOMER } { + # Procedure called to validate SIG_CUSTOMER + return true +} + +proc update_PARAM_VALUE.VERSION { PARAM_VALUE.VERSION } { + # Procedure called to update VERSION when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.VERSION { PARAM_VALUE.VERSION } { + # Procedure called to validate VERSION + return true +} + + +proc update_MODELPARAM_VALUE.SIG_CUSTOMER { MODELPARAM_VALUE.SIG_CUSTOMER PARAM_VALUE.SIG_CUSTOMER } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.SIG_CUSTOMER}] ${MODELPARAM_VALUE.SIG_CUSTOMER} +} + +proc update_MODELPARAM_VALUE.SIG_APPLICATION { MODELPARAM_VALUE.SIG_APPLICATION PARAM_VALUE.SIG_APPLICATION } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.SIG_APPLICATION}] ${MODELPARAM_VALUE.SIG_APPLICATION} +} + +proc update_MODELPARAM_VALUE.VERSION { MODELPARAM_VALUE.VERSION PARAM_VALUE.VERSION } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.VERSION}] ${MODELPARAM_VALUE.VERSION} +} + +proc update_MODELPARAM_VALUE.CHECKSUM_COUNT { MODELPARAM_VALUE.CHECKSUM_COUNT PARAM_VALUE.CHECKSUM_COUNT } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.CHECKSUM_COUNT}] ${MODELPARAM_VALUE.CHECKSUM_COUNT} +} diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml index 1e5b710dc8..63a8540a76 100644 --- a/finn-rtllib/memstream/component.xml +++ b/finn-rtllib/memstream/component.xml @@ -1677,6 +1677,7 @@ qzynq qzynqplus versal + versalprime virtex7 virtexu virtexuplus diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v index 6bb3a97115..c9b8770aaa 100644 --- a/finn-rtllib/memstream/hdl/memstream_singleblock.v +++ b/finn-rtllib/memstream/hdl/memstream_singleblock.v @@ -192,7 +192,11 @@ end else begin: bypass reg [MEM_WIDTH-1:0] singleval[0:0]; 
initial begin - $readmemh({MEM_INIT,"memblock_0.dat"}, singleval, 0, 0); + `ifdef SYNTHESIS + $readmemh({MEM_INIT,"memblock_synth_0.dat"}, singleval, 0, 0); + `else + $readmemh({MEM_INIT,"memblock_sim_0.dat"}, singleval, 0, 0); + `endif end always @(posedge aclk) diff --git a/finn-rtllib/memstream/hdl/ramb18_sdp.v b/finn-rtllib/memstream/hdl/ramb18_sdp.v index 63a349f7d5..8d2fbf9a98 100644 --- a/finn-rtllib/memstream/hdl/ramb18_sdp.v +++ b/finn-rtllib/memstream/hdl/ramb18_sdp.v @@ -71,15 +71,15 @@ initial begin //MEM_INIT path must be terminated by / `ifdef SYNTHESIS if (ID < 10) - $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1); + $readmemh({MEM_INIT,"memblock_synth_",idx+8'd48,".dat"}, mem, 0, DEPTH-1); else - $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1); + $readmemh({MEM_INIT,"memblock_synth_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1); `else $sformat(idx,"%0d",ID); if (ID < 10) - $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1); + $readmemh({MEM_INIT,"memblock_sim_",idx[7:0],".dat"}, mem, 0, DEPTH-1); else - $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1); + $readmemh({MEM_INIT,"memblock_sim_",idx,".dat"}, mem, 0, DEPTH-1); `endif end diff --git a/notebooks/FCLayer_graph.onnx b/notebooks/FCLayer_graph.onnx deleted file mode 100644 index 950c78a9de..0000000000 Binary files a/notebooks/FCLayer_graph.onnx and /dev/null differ diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb index 617bfa0897..a4ad32ed7f 100644 --- a/notebooks/advanced/0_custom_analysis_pass.ipynb +++ b/notebooks/advanced/0_custom_analysis_pass.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -48,38 +48,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '../LFCW1A1.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "showInNetron(\"../LFCW1A1.onnx\")" ] @@ -93,11 +64,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "model = ModelWrapper('../LFCW1A1.onnx')" ] }, @@ -110,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -140,20 +111,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " def analysis(self, analysis_fxn):\n", - " \"\"\"Runs given anaylsis_fxn on this model and return resulting dict.\"\"\"\n", - " return analysis_fxn(self)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "showSrc(ModelWrapper.analysis)" ] @@ -167,17 +127,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Shape': 1, 'Gather': 1, 'Unsqueeze': 5, 'Concat': 1, 'Reshape': 1, 'Mul': 5, 'Sub': 1, 'Sign': 4, 'MatMul': 4, 'BatchNormalization': 
3, 'Squeeze': 3}\n" - ] - } - ], + "outputs": [], "source": [ "print(model.analysis(count_equal_nodes))" ] @@ -199,7 +151,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb index 9d9bc74633..e40a534af5 100644 --- a/notebooks/advanced/1_custom_transformation_pass.ipynb +++ b/notebooks/advanced/1_custom_transformation_pass.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -42,34 +42,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " def transform(self, transformation, make_deepcopy=True):\n", - " \"\"\"Applies given Transformation repeatedly until no more changes can be made\n", - " and returns a transformed ModelWrapper instance.\n", - "\n", - " If make_deepcopy is specified, operates on a new (deep)copy of model.\n", - " \"\"\"\n", - " transformed_model = self\n", - " if make_deepcopy:\n", - " transformed_model = copy.deepcopy(self)\n", - " model_was_changed = True\n", - " while model_was_changed:\n", - " (transformed_model, model_was_changed) = transformation.apply(\n", - " transformed_model\n", - " )\n", - " return transformed_model\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "showSrc(ModelWrapper.transform)" ] }, { "cell_type": "markdown", "source": [ "When the function is called, the model, the name of the transformation and, if required, the flag make_deepcopy are passed. It is also possible not to make a copy of the model. In this case `make_deepcopy` must be set to False. Then the branch `if make_deepcopy:` would not be taken and no copy of the model would be made. \n", "\n", - "The unchanged model is first passed to the variable `transformed_model` to pass this variable on to the transformation later. \n", + "Additionally, the attribute `fix_float64` of the model is checked and, if it is set to `True`, all double values are converted to float. This ensures that the model functions correctly.\n", + "\n", + "The unchanged model is passed to the variable `transformed_model` to pass this variable on to the transformation later. \n", "\n", - "`model_was_changed` indicates whether the transformation needs to be applied more then once. Because it needs to be applied at least one time `model_was_changed` is first set to True and then depending on the return values of the transformation function the transformation can be applied more then once. \n", + "`model_was_changed` indicates whether the transformation needs to be applied more than once. Because it needs to be applied at least once, `model_was_changed` is first set to True; then, depending on the return values of the transformation function, the transformation can be applied more than once. \n", "\n", "**Important**: Due to the structure of this function, `model_was_changed` must be set to False at some point.
Otherwise the loop is infinite.\n", " \n", @@ -98,29 +77,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class Transformation(ABC):\n", - " \"\"\"Transformation class all transformations are based on. Contains only\n", - " abstract method apply() every transformation has to fill.\"\"\"\n", - "\n", - " def __init__(self):\n", - " super().__init__()\n", - "\n", - " @abstractmethod\n", - " def apply(self, model):\n", - " pass\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "from finn.transformation.base import Transformation\n", + "from qonnx.transformation.base import Transformation\n", "\n", "showSrc(Transformation)" ] @@ -145,61 +106,32 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import onnx\n", "onnx_model = onnx.load('../LFCW1A1.onnx')\n", - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "onnx_model = ModelWrapper(onnx_model)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '../LFCW1A1.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "showInNetron('../LFCW1A1.onnx')" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.base import Transformation\n", + "from qonnx.transformation.base import Transformation\n", "\n", "class ConvertSubToAdd(Transformation):\n", " def apply(self, model):\n", @@ -232,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -242,40 +174,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/tmp/LFCW1A1_changed.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "showInNetron('/tmp/LFCW1A1_changed.onnx')" ] @@ -291,68 +192,11 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class NodeLocalTransformation(Transformation):\n", - " \"\"\"\n", - " Parent class for transformations, which can be executed locally to one node\n", - " by accessing and modifying the attributes of only that node.\n", - " This class can then automatically parallelize the transformation.\n", - " Transformations sublcassing NodeLocalTransformation must implement the\n", - " abstract method applyNodeLocal().\n", - "\n", - " To control the degree of parallelization, specify the num_workers argument\n", - " in the constructor, using one of the following values:\n", - " * None: use NUM_DEFAULT_WORKERS environment variable\n", - " * 0: use all available CPU cores\n", - " * (any other int>0): set number of parallel workers\n", - " 
\"\"\"\n", - "\n", - " def __init__(self, num_workers=None):\n", - " super().__init__()\n", - " if num_workers is None:\n", - " self._num_workers = get_num_default_workers()\n", - " else:\n", - " self._num_workers = num_workers\n", - " assert self._num_workers >= 0, \"Number of workers must be nonnegative.\"\n", - " if self._num_workers == 0:\n", - " self._num_workers = mp.cpu_count()\n", - "\n", - " @abstractmethod\n", - " def applyNodeLocal(self, node):\n", - " pass\n", - "\n", - " def apply(self, model):\n", - " # Remove old nodes from the current model\n", - " old_nodes = []\n", - " for i in range(len(model.graph.node)):\n", - " old_nodes.append(model.graph.node.pop())\n", - "\n", - " # Execute transformation in parallel\n", - " with mp.Pool(self._num_workers) as p:\n", - " new_nodes_and_bool = p.map(self.applyNodeLocal, old_nodes, chunksize=1)\n", - "\n", - " # extract nodes and check if the transformation needs to run again\n", - " # Note: .pop() had initially reversed the node order\n", - " run_again = False\n", - " for node, run in reversed(new_nodes_and_bool):\n", - " # Reattach new nodes to old model\n", - " model.graph.node.append(node)\n", - " if run is True:\n", - " run_again = True\n", - "\n", - " return (model, run_again)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "from finn.transformation.base import NodeLocalTransformation\n", + "from qonnx.transformation.base import NodeLocalTransformation\n", "\n", "showSrc(NodeLocalTransformation)" ] @@ -363,66 +207,16 @@ "source": [ "Transformations that are to be executed in parallel must have the method `applyNodeLocal()` implemented. Please note that the transformation is only executed on a single node, the parallel transformations do not have access to the entire model or tensors. Parallelization has the advantage that especially time-consuming transformations such as compilation can be executed more effectively. \n", "\n", - "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 1 by default, this can be increased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n", + "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n", "\n", "In the following we want to take a closer look at the implementation using the compile transformation as example." 
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class CompileCppSim(NodeLocalTransformation):\n", - " \"\"\"For every node: compile C++ code in node attribute \"code_gen_dir_cppsim\"\n", - " and save path to executables in node attribute \"executable_path\".\n", - " All nodes in the graph must have the fpgadataflow backend attribute.\n", - "\n", - " To use these executables, exec_mode must be set to \"cppsim\" (using transformation\n", - " SetExecMode) and the model has to be executed using execute_onnx() from\n", - " finn.core.onnx_exec\n", - "\n", - " * num_workers (int or None) number of parallel workers, see documentation in\n", - " NodeLocalTransformation for more details.\n", - " \"\"\"\n", - "\n", - " def __init__(self, num_workers=None):\n", - " super().__init__(num_workers=num_workers)\n", - "\n", - " def applyNodeLocal(self, node):\n", - " op_type = node.op_type\n", - " if is_fpgadataflow_node(node) is True:\n", - " try:\n", - " # lookup op_type in registry of CustomOps\n", - " inst = registry.getCustomOp(node)\n", - " # ensure that code is generated\n", - " assert (\n", - " inst.get_nodeattr(\"code_gen_dir_cppsim\") != \"\"\n", - " ), \"\"\"Node\n", - " attribute \"code_gen_dir_cppsim\" is not set. Please run\n", - " Transformation PrepareCppSim first.\"\"\"\n", - " # call the compilation function for this node\n", - " inst.compile_singlenode_code()\n", - " # ensure that executable path is now set\n", - " assert (\n", - " inst.get_nodeattr(\"executable_path\") != \"\"\n", - " ), \"\"\"Transformation\n", - " compile was not successful, there is no path to executables set\n", - " in node attribute \"executable_path\".\"\"\"\n", - " except KeyError:\n", - " # exception if op_type is not supported\n", - " raise Exception(\n", - " \"Custom op_type %s is currently not supported.\" % op_type\n", - " )\n", - " return (node, False)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim\n", "\n", @@ -453,7 +247,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index 57f2601c73..c27f8bdca7 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -6,7 +6,7 @@ "source": [ "# Introduction to custom ops in FINN\n", "\n", - "Suppose that you want to introduce a new (custom) operation type into the FINN. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", + "Suppose that you want to introduce a new (custom) operation type into the FINN compiler. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", "\n", "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vivado HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... 
execution and code generation paths for custom nodes.\n", "\n", @@ -23,64 +23,16 @@ "\n", "2. `CustomOp` subclasses need to implement the methods below (those not starting with underscore).\n", "\n", - "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/general/im2col.py), the ONNX node must use `domain=finn.custom_op.general` since its module is located at `finn/custom_op/general/im2col.py`." + "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/general/im2col.py), the ONNX node must use `domain=qonnx.custom_op.general` since its module is located at `finn/custom_op/general/im2col.py`." ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['__abstractmethods__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__slots__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_abc_impl',\n", - " 'execute_node',\n", - " 'get_nodeattr',\n", - " 'get_nodeattr_allowed_values',\n", - " 'get_nodeattr_def',\n", - " 'get_nodeattr_types',\n", - " 'infer_node_datatype',\n", - " 'make_shape_compatible_op',\n", - " 'set_nodeattr',\n", - " 'verify_node']" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from finn.custom_op.base import CustomOp\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.base import CustomOp\n", "dir(CustomOp)" ] }, @@ -95,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -183,11 +135,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import finn.custom_op.general as general\n", + "import qonnx.custom_op.general as general\n", "general.custom_op[\"MyPythonPowerOp\"] = MyPythonPowerOp" ] }, @@ -200,27 +152,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'DebugMarker': finn.custom_op.general.debugmarker.DebugMarker,\n", - " 'QuantAvgPool2d': finn.custom_op.general.quantavgpool2d.QuantAvgPool2d,\n", - " 'MaxPoolNHWC': finn.custom_op.general.maxpoolnhwc.MaxPoolNHWC,\n", - " 'GenericPartition': finn.custom_op.general.genericpartition.GenericPartition,\n", - " 'MultiThreshold': finn.custom_op.general.multithreshold.MultiThreshold,\n", - " 'XnorPopcountMatMul': finn.custom_op.general.xnorpopcount.XnorPopcountMatMul,\n", - " 'Im2Col': finn.custom_op.general.im2col.Im2Col,\n", - " 'MyPythonPowerOp': __main__.MyPythonPowerOp}" - ] - }, - "execution_count": 4, - "metadata": {}, - 
"output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "general.custom_op" ] @@ -238,11 +172,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "from onnx import TensorProto\n", "\n", "def make_graph(ishape, exp, op_type = \"MyPythonPowerOp\"):\n", @@ -261,7 +195,7 @@ " # name of output tensor\n", " [\"outp\"],\n", " # specify domain s.t. FINN can find our op under this submodule\n", - " domain=\"finn.custom_op.general\",\n", + " domain=\"qonnx.custom_op.general\",\n", " # set up attributes\n", " exponent = int(exp),\n", " exec_mode = \"python\"\n", @@ -283,34 +217,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[input: \"inp\"\n", - "output: \"outp\"\n", - "op_type: \"MyPythonPowerOp\"\n", - "attribute {\n", - " name: \"exec_mode\"\n", - " s: \"python\"\n", - " type: STRING\n", - "}\n", - "attribute {\n", - " name: \"exponent\"\n", - " i: 2\n", - " type: INT\n", - "}\n", - "domain: \"finn.custom_op.general\"\n", - "]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# generate a small graph with our custom op\n", "input_shape = (1, 2, 4)\n", @@ -327,24 +236,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[[ 0., -3., 1., -8.],\n", - " [ 2., -2., -4., -8.]]], dtype=float32)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from finn.core.datatype import DataType\n", - "from finn.util.basic import gen_finn_dt_tensor\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.core.datatype import DataType\n", + "from qonnx.util.basic import gen_finn_dt_tensor\n", "\n", "# generate a random input of e.g signed 4-bit values\n", "random_input = gen_finn_dt_tensor(DataType[\"INT4\"], input_shape)\n", @@ -360,21 +257,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'outp': array([[[ 0., 9., 1., 64.],\n", - " [ 4., 4., 16., 64.]]], dtype=float32)}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.core.onnx_exec import execute_onnx\n", "\n", @@ -406,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -521,34 +406,9 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[input: \"inp\"\n", - "output: \"outp\"\n", - "op_type: \"MyMixedPowerOp\"\n", - "attribute {\n", - " name: \"exec_mode\"\n", - " s: \"python\"\n", - " type: STRING\n", - "}\n", - "attribute {\n", - " name: \"exponent\"\n", - " i: 2\n", - " type: INT\n", - "}\n", - "domain: \"finn.custom_op.general\"\n", - "]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# register our new op\n", "general.custom_op[\"MyMixedPowerOp\"] = MyMixedPowerOp\n", @@ 
-567,21 +427,11 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Available functions: ['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', 'execute_node', 'get_nodeattr', 'get_nodeattr_allowed_values', 'get_nodeattr_def', 'get_nodeattr_types', 'infer_node_datatype', 'make_shape_compatible_op', 'my_custom_cpp_gen', 'onnx_node', 'set_nodeattr', 'verify_node']\n", - "codegen_dir: \n", - "exec_mode: python\n" - ] - } - ], - "source": [ - "from finn.custom_op.registry import getCustomOp\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.registry import getCustomOp\n", "\n", "# get FINN wrapper for this node, with all the functionality\n", "op_inst = getCustomOp(mixedop_graph.model.graph.node[0])\n", @@ -602,13 +452,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#from finn.transformation.base import Transformation\n", + "#from qonnx.transformation.base import Transformation\n", "# can derive from NodeLocalTransformation for faster (parallel) execution\n", - "from finn.transformation.base import NodeLocalTransformation\n", + "from qonnx.transformation.base import NodeLocalTransformation\n", "import os\n", "\n", "class MyNodeLocalCodeGen(NodeLocalTransformation):\n", @@ -641,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -657,17 +507,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/tmp/finn_dev_maltanar/my_custom_oppswiou3i\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "new_op_inst = getCustomOp(mixedop_graph_new.graph.node[0])\n", "codegen_dir = new_op_inst.get_nodeattr(\"codegen_dir\")\n", @@ -683,17 +525,9 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "compile.sh node_model\ttop.cpp\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ls {codegen_dir}" ] @@ -707,39 +541,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "#include \r\n", - "#include \r\n", - "using namespace std;\r\n", - "#define EXPONENT 2\r\n", - "\r\n", - "int main(int argc, char **argv) {\r\n", - " ifstream infile(\"input.txt\");\r\n", - " ofstream outfile(\"output.txt\");\r\n", - " \r\n", - " float elem;\r\n", - " while (infile >> elem)\r\n", - " {\r\n", - " float res = 1.0;\r\n", - " for(int i=0; i < EXPONENT; i++) {\r\n", - " res *= elem;\r\n", - " }\r\n", - " outfile << res << \"\\n\";\r\n", - " }\r\n", - "\r\n", - " return 0;\r\n", - "}\r\n", - " " - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! 
cat {codegen_dir}/top.cpp" ] @@ -757,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -766,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -775,26 +579,16 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "49\r\n", - "64\r\n", - "81\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! cat {codegen_dir}/output.txt" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -812,21 +606,9 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[[-6., 3., 2., -5.],\n", - " [ 5., 2., 0., -2.]]], dtype=float32)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# generate a random input of e.g signed 4-bit values\n", "random_input = gen_finn_dt_tensor(DataType[\"INT4\"], input_shape)\n", @@ -842,21 +624,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'outp': array([[[36., 9., 4., 25.],\n", - " [25., 4., 0., 4.]]], dtype=float32)}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# run with FINN's execute_onnx, custom node will use Python execution\n", "new_op_inst.set_nodeattr(\"exec_mode\", \"python\")\n", @@ -874,34 +644,15 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'outp': array([[[36., 9., 4., 25.],\n", - " [25., 4., 0., 4.]]])}" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# run with FINN's execute_onnx, custom node will use c++ execution\n", "new_op_inst.set_nodeattr(\"exec_mode\", \"c++\")\n", "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", "ret" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb index 58f53c3298..514efd1693 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ @@ -180,40 +180,9 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '/tmp/simple_model.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron('/tmp/simple_model.onnx')" ] @@ -229,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -252,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -270,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -289,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -308,29 +277,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The output of the ONNX model is: \n", - "[[22. 13. 21. 8.]\n", - " [ 0. 8. 11. 1.]\n", - " [ 3. 12. 8. 2.]\n", - " [ 0. 6. 1. 4.]]\n", - "\n", - "The output of the reference function is: \n", - "[[22. 13. 21. 8.]\n", - " [ 0. 8. 11. 1.]\n", - " [ 3. 12. 8. 2.]\n", - " [ 0. 6. 1. 4.]]\n", - "\n", - "The results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ref_output= expected_output(in1_values, in2_values, in3_values)\n", "print(\"The output of the ONNX model is: \\n{}\".format(output[0]))\n", @@ -364,16 +313,16 @@ "source": [ "In the following we assume that we do not know the appearance of the model, so we first try to identify whether there are two consecutive adders in the graph and then convert them into a sum node. \n", "\n", - "Here we make use of FINN. FINN provides a thin wrapper around the model which provides several additional helper functions to manipulate the graph. The code can be found [here](https://github.com/Xilinx/finn/blob/master/src/finn/core/modelwrapper.py)." + "Here we make use of FINN. FINN provides a thin wrapper around the model which provides several additional helper functions to manipulate the graph. The so called `ModelWrapper` can be found in the QONNX repository which contains a lot of functionality that is used by FINN, you can find it [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/core/modelwrapper.py)." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "finn_model = ModelWrapper(onnx_model)" ] }, @@ -386,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -410,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -433,19 +382,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found adder node: Add1\n", - "Found adder node: Add2\n", - "Found adder node: Add3\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "add_nodes = identify_adder_nodes(finn_model)\n", "for node in add_nodes:\n", @@ -456,48 +395,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Among other helper functions, `ModelWrapper` offers two functions that can help to determine the preceding and succeeding node of a node. However, these functions are not getting a node as input, but can determine the consumer or producer of a tensor. We write two functions that uses these helper functions to determine the previous and the next node of a node." + "Among other helper functions, `ModelWrapper` offers two functions that can help to determine the preceding and succeeding node of a node: `find_direct_successors` and `find_direct_predecessors`. So we can use one of them to define a function to find adder pairs." ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "def find_predecessor(model, node):\n", - " predecessors = []\n", - " for i in range(len(node.input)):\n", - " producer = model.find_producer(node.input[i])\n", - " predecessors.append(producer)\n", - " return predecessors\n", - " \n", - "\n", - "def find_successor(model, node):\n", - " successors = []\n", - " for i in range(len(node.output)):\n", - " consumer = model.find_consumer(node.output[i])\n", - " successors.append(consumer)\n", - " return successors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first function uses `find_producer` from `ModelWrapper` to create a list of the producers of the inputs of the given node. So the returned list is indirectly filled with the predecessors of the node. The second function works in a similar way, `find_consumer` from `ModelWrapper` is used to find the consumers of the output tensors of the node and so a list with the successors can be created. " - ] - }, - { - "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def adder_pair(model, node):\n", " adder_pairs = []\n", " node_pair = []\n", - " successor_list = find_successor(model, node)\n", + " successor_list = model.find_direct_successors(node)\n", " \n", " for successor in successor_list:\n", " if successor.op_type == \"Add\":\n", @@ -505,34 +415,23 @@ " node_pair.append(successor)\n", " adder_pairs.append((node_pair))\n", " node_pair = []\n", - " return adder_pairs\n", - " " + " return adder_pairs " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The function gets a node and the model as input. Two empty lists are created to be filled with a list of adder node pairs that can be returned as result of the function. 
Then the function `find_successor` is used to return all of the successors of the node. If one of the successors is an adder node, the node is saved in `node_pair` together with the successive adder node and put in the list `adder_pairs`. Then the temporary list is cleaned and can be filled with the next adder node pair. Since it is theoretically possible for an adder node to have more than one subsequent adder node, a list of lists is created. This list of the node with all its successive adder nodes is returned.\n", + "The function gets a node and the model as input. Two empty lists are created to be filled with a list of adder node pairs that can be returned as result of the function. Then the function `find_direct_successors` is used to return all of the successors of the node. If one of the successors is an adder node, the node is saved in `node_pair` together with the successive adder node and put in the list `adder_pairs`. Then the temporary list is cleaned and can be filled with the next adder node pair. Since it is theoretically possible for an adder node to have more than one subsequent adder node, a list of lists is created. This list of the node with all its successive adder nodes is returned.\n", "\n", "So now we can find out which adder node has an adder node as successor. Since the model is known, one adder pair (Add1+Add2) should be found when applying the function to the previously determined adder node list (`add_nodes`)." ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found following pair that could be replaced by a sum node:\n", - "Add1\n", - "Add2\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "for node in add_nodes:\n", " add_pairs = adder_pair(finn_model, node)\n", @@ -556,18 +455,9 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The new node gets the following inputs: \n", - "['in1', 'in2', 'in3']\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "input_list = []\n", "for i in range(len(substitute_pair)):\n", @@ -591,7 +481,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -602,12 +492,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The summary node can be created with this information." + "The sum node can be created with this information." 
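The code cell that performs this step is elided by the hunk above; presumably it uses `onnx.helper`, roughly as in the following hedged sketch (reusing the second adder's output tensor as the new node's output is an assumption):

```python
from onnx import helper

# assumption: the output tensor of the second adder in the pair
# becomes the output of the new node
output_name = substitute_pair[1].output[0]

# ONNX's Sum op is variadic, so all gathered inputs of the
# adder pair can feed a single node
sum_node = helper.make_node("Sum", inputs=input_list, outputs=[output_name])
```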
] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -628,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -656,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -666,40 +556,9 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/tmp/simple_model1.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron('/tmp/simple_model1.onnx')" ] @@ -713,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -723,29 +582,9 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The output of the manipulated ONNX model is: \n", - "[[22. 13. 21. 8.]\n", - " [ 0. 8. 11. 1.]\n", - " [ 3. 12. 8. 2.]\n", - " [ 0. 6. 1. 4.]]\n", - "\n", - "The output of the reference function is: \n", - "[[22. 13. 21. 8.]\n", - " [ 0. 8. 11. 1.]\n", - " [ 3. 12. 8. 2.]\n", - " [ 0. 6. 1. 4.]]\n", - "\n", - "The results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"The output of the manipulated ONNX model is: \\n{}\".format(output[0]))\n", "print(\"\\nThe output of the reference function is: \\n{}\".format(ref_output))\n", @@ -773,7 +612,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import.ipynb index b6d6c3bdfd..5fb29754dc 100644 --- a/notebooks/basics/1_brevitas_network_import.ipynb +++ b/notebooks/basics/1_brevitas_network_import.ipynb @@ -80,7 +80,7 @@ "from pkgutil import get_data\n", "import onnx\n", "import onnx.numpy_helper as nph\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_tensor_npy = nph.to_array(input_tensor)\n", "input_tensor_pyt = torch.from_numpy(input_tensor_npy).float()\n", @@ -181,7 +181,7 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "model = ModelWrapper(export_onnx_path)\n", "model.graph.node[8]" ] @@ -240,8 +240,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fold_constants import FoldConstants\n", - "from finn.transformation.infer_shapes import InferShapes\n", + "from qonnx.transformation.fold_constants import FoldConstants\n", + "from qonnx.transformation.infer_shapes import InferShapes\n", "model = model.transform(InferShapes())\n", "model = model.transform(FoldConstants())\n", "export_onnx_path_transformed = \"/tmp/LFCW1A1-clean.onnx\"\n", diff --git 
a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 2d668f3e04..a2747e3921 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -55,14 +55,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from finn.util.basic import make_build_dir\n", "from finn.util.visualization import showInNetron\n", + "import os\n", " \n", - "build_dir = \"/workspace/finn\"" + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -76,28 +77,17 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/brevitas/src/brevitas_examples/bnn_pynq/models/CNV.py:106: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " x = 2.0 * x - torch.tensor([1.0], device=x.device)\n", - "/workspace/brevitas/src/brevitas/quant_tensor/__init__.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " training = torch.tensor(training, dtype=torch.bool)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import onnx\n", "from finn.util.test import get_test_model_trained\n", "import brevitas.onnx as bo\n", - "from finn.core.modelwrapper import ModelWrapper\n", - "from finn.transformation.infer_shapes import InferShapes\n", - "from finn.transformation.fold_constants import FoldConstants\n", - "from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.transformation.infer_shapes import InferShapes\n", + "from qonnx.transformation.fold_constants import FoldConstants\n", + "from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", "\n", "cnv = get_test_model_trained(\"CNV\", 1, 1)\n", "bo.export_finn_onnx(cnv, (1, 3, 32, 32), build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n", @@ -119,38 +109,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '/workspace/finn/end2end_cnv_w1a1_tidy.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir+\"/end2end_cnv_w1a1_tidy.onnx\")" ] @@ -159,7 +120,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see that the network is composed of a repeating convolution-convolution-maxpool layer pattern to extract features using 3x3 convolution kernels (with weights binarized) and `Sign` activations, followed by fully connected layers acting as the classifier. 
Also notice the initial `MultiThreshold` layer at the beginning of the network, which is quantizing float inputs to 8-bit ones." + "You can see that the network is composed of a repeating convolution-convolution-maxpool layer pattern to extract features using 3x3 convolution kernels (with weights binarized), followed by fully connected layers acting as the classifier. Also notice the initial `MultiThreshold` layer at the beginning of the network, which is quantizing float inputs to 8-bit ones." ] }, { @@ -173,22 +134,13 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/finn-base/src/finn/transformation/infer_data_layouts.py:114: UserWarning: Assuming 4D input is NCHW\n", - " warnings.warn(\"Assuming 4D input is NCHW\")\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.util.pytorch import ToTensor\n", - "from finn.transformation.merge_onnx_models import MergeONNXModels\n", - "from finn.core.datatype import DataType\n", + "from qonnx.transformation.merge_onnx_models import MergeONNXModels\n", + "from qonnx.core.datatype import DataType\n", "\n", "model = ModelWrapper(build_dir+\"/end2end_cnv_w1a1_tidy.onnx\")\n", "global_inp_name = model.graph.input[0].name\n", @@ -208,12 +160,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.insert_topk import InsertTopK\n", - "from finn.transformation.infer_datatypes import InferDataTypes\n", + "from qonnx.transformation.insert_topk import InsertTopK\n", + "from qonnx.transformation.infer_datatypes import InferDataTypes\n", "\n", "# postprocessing: insert Top-1 node at the end\n", "model = model.transform(InsertTopK(k=1))\n", @@ -230,39 +182,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/end2end_cnv_w1a1_pre_post.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir+\"/end2end_cnv_w1a1_pre_post.onnx\")" ] @@ -280,22 +202,24 @@ "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vivado HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", "\n", "\n", - "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. **The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. 
We hope to rectify this in future releases.**" + "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", + "\n", + "**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from finn.transformation.streamline import Streamline\n", - "from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul\n", - "from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount\n", + "from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul\n", + "from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount\n", "import finn.transformation.streamline.absorb as absorb\n", "from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants\n", - "from finn.transformation.infer_data_layouts import InferDataLayouts\n", - "from finn.transformation.general import RemoveUnusedTensors\n", + "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", + "from qonnx.transformation.general import RemoveUnusedTensors\n", "\n", "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_pre_post.onnx\")\n", "model = model.transform(MoveScalarLinearPastInvariants())\n", @@ -328,39 +252,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/end2end_cnv_w1a1_streamlined.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir+\"/end2end_cnv_w1a1_streamlined.onnx\")" ] @@ -376,33 +270,24 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/finn/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py:591: UserWarning: Clipping some thresholds in \n", - " warnings.warn(\"Clipping some thresholds in %s\" % self.onnx_node.name)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", ")\n", "from finn.transformation.move_reshape import RemoveCNVtoFCFlatten\n", - "from finn.custom_op.registry import getCustomOp\n", - "from finn.transformation.infer_data_layouts import InferDataLayouts\n", + "from qonnx.custom_op.registry import getCustomOp\n", + "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", "\n", "# choose the memory mode for the MVTU units, decoupled or const\n", "mem_mode = \"decoupled\"\n", "\n", "model = 
ModelWrapper(build_dir + \"/end2end_cnv_w1a1_streamlined.onnx\")\n", - "model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))\n", - "model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))\n", + "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))\n", + "model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))\n", "# TopK to LabelSelect\n", "model = model.transform(to_hls.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", @@ -429,46 +314,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. That `Reshape` is essentialy a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `StreamingFCLayer_Batch` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations in hlslib. *Note that the StreamingFCLayer instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three StreamingFCLayer instances implement actual FC layers.*" + "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network, where the convolutional part of the network ends and the fully-connected layers start. That `Reshape` is essentially a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations in hlslib. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name.
The final three MatrixVectorActivation instances implement actual FC layers.*" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/end2end_cnv_w1a1_dataflow_parent.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir + \"/end2end_cnv_w1a1_dataflow_parent.onnx\")" ] @@ -482,39 +335,9 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/end2end_cnv_w1a1_dataflow_model.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] @@ -528,12 +351,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")\n", - "fc_layers = model.get_nodes_by_op_type(\"StreamingFCLayer_Batch\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", " (16, 3, 128),\n", @@ -567,44 +390,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `StreamingFCLayer_Batch`." + "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `MatrixVectorActivation`." 
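The loop that applies these folding tuples is elided by the hunk above; under the assumption that each tuple is set through `getCustomOp` (the attribute names `PE` and `SIMD` come from the surrounding text, while `inFIFODepth` is assumed and may differ between FINN versions), it would look roughly like this:

```python
# sketch: apply one (PE, SIMD, in_fifo_depth) tuple per MatrixVectorActivation layer
for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)                   # output-channel parallelism
    fcl_inst.set_nodeattr("SIMD", simd)               # input-channel parallelism
    fcl_inst.set_nodeattr("inFIFODepth", ififodepth)  # depth hint for the input FIFO
```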
] }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/end2end_cnv_w1a1_folded.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir + \"/end2end_cnv_w1a1_folded.onnx\")" ] @@ -627,29 +420,41 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/finn/src/finn/transformation/fpgadataflow/floorplan.py:107: UserWarning: 32 nodes have no entry in the provided floorplan, SLR was set to -1\n", - " warnings.warn(\n", - "/workspace/finn/src/finn/transformation/fpgadataflow/insert_fifo.py:154: UserWarning: Overriding input FIFO depth to 32\n", - " warnings.warn(\"Overriding input FIFO depth to 32\")\n", - "/workspace/finn/src/finn/transformation/fpgadataflow/insert_fifo.py:200: UserWarning: Overriding output FIFO depth to 32\n", - " warnings.warn(\"Overriding output FIFO depth to 32\")\n" - ] - } - ], - "source": [ - "test_pynq_board = \"Pynq-Z2\"\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_pynq_board = \"Pynq-Z1\"\n", "target_clk_ns = 10\n", "\n", "from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild\n", "model = ModelWrapper(build_dir+\"/end2end_cnv_w1a1_folded.onnx\")\n", - "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))\n", + "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n", + "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "model.save(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")" ] }, @@ -659,35 +464,22 @@ "source": [ "## 5. 
Deployment and Remote Execution\n", "\n", - "Now that we're done with the hardware generation, we can generate a Python driver for accelerator and copy the necessary files onto our PYNQ board.\n", + "Now that we're done with the hardware generation, we can copy the necessary files onto our PYNQ board.\n", "\n", "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 4.19.0-xilinx-v2019.1 armv7l)\r\n", - "\r\n", - " * Super-optimized for small spaces - read how we shrank the memory\r\n", - " footprint of MicroK8s to make it the smallest full K8s around.\r\n", - "\r\n", - " https://ubuntu.com/blog/microk8s-memory-optimisation\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "\n", "# set up the following values according to your own environment\n", "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = os.getenv(\"PYNQ_IP\", \"192.168.2.99\")\n", + "ip = \"192.168.2.99\"\n", "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", "port = os.getenv(\"PYNQ_PORT\", 22)\n", @@ -701,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -714,20 +506,9 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/home/xilinx/finn_dev_jduarte/pynq_deployment_yrxnwrak'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n", "target_dir_pynq" @@ -735,24 +516,9 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 4240\r\n", - "-rw-rw-r-- 1 xilinx xilinx 18616 Jun 28 20:42 driver_base.py\r\n", - "-rw-r--r-- 1 xilinx xilinx 4868 Jun 28 20:42 driver.py\r\n", - "drwxr-xr-x 4 xilinx xilinx 4096 Jun 28 20:42 finn\r\n", - "-rw-r--r-- 1 xilinx xilinx 4045671 Jun 28 20:42 resizer.bit\r\n", - "-rw-r--r-- 1 xilinx xilinx 247083 Jun 28 20:42 resizer.hwh\r\n", - "drwxr-xr-x 2 xilinx xilinx 4096 Jun 28 20:42 runtime_weights\r\n", - "-rw-rw-r-- 1 xilinx xilinx 4107 Jun 28 20:42 validate.py\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! 
ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'" ] @@ -766,32 +532,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD5CAYAAADhukOtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAe8klEQVR4nO2da4yc53Xf/2feuex9Z5dLLpdXURJlRVZiSqFVO1EV2akDRUkgGwhcu4ChAEYUBBEQA+kHwQVqF+gHp6ht+EPhgq5VK4ZrWbUtSEiE1LYcRDDsSKJu1IW6ULxIJJdcksu97+zcTj/MyKXU5//sksudpf38fwDB2efs875nnnnPvLPPf8455u4QQvz6k1tvB4QQnUHBLkQiKNiFSAQFuxCJoGAXIhEU7EIkQn41k83sDgBfA5AB+B/u/qXY7/d3533DQDF8rPh5Ltq3mKTo4Lbouci06PH40eJGj70Px/wP2yx2MjIHAGLK7KXJttyP2NHcL/4aaB2TrQenGX3Sl+ZH7NkxSzPiBvNxer6OxaVG0MlLDnYzywD8NwAfA3AcwNNm9qi7v8LmbBgo4gv/7vrw8bxJz1UshN20HA+IanWJ2uqNGj9XMfxmBACNZthHj7wqlmtQWy6jJnitlx8T/JiFYiU4nkVeastx/xvNOrXV6vw1azZJUBj3ox6+RgEAS+x4WC5wwz7G3tSrVX59NBqRdYxcw7nIa1Yl19U8X3osVMPH+/ZPTkR8uHRuAXDI3Q+7exXAgwDuWsXxhBBryGqCfSuAty/4+Xh7TAhxBbLmG3Rmdo+Z7Tez/XOLkc8lQog1ZTXBfgLA9gt+3tYeexfuvs/d97r73r7uVe0HCiFWwWqC/WkAu81sl5kVAXwKwKOXxy0hxOXmkm+17l43s3sB/B+0pLf73f3l6BwYquT9xX2RTyS7lSXwHesc+FZ3Ph/ZIb8ExcsKfNJStUpt9WbEx4j0lkV28fNkmjX5DjPqXLmI7SI3I/5XrSs43shKfE7seA2+HtbkPhpRE7oir1neuC2XjygXtcgaG/8T1skae0RnyLKwjzFlYlWfq939MQCPreYYQojOoG/QCZEICnYhEkHBLkQiKNiFSAQFuxCJ0OFvuTicJVY4l3+8EZ5jDS7VNGtc8sq6IzIOeDIDk7yaEemnWChQW925rVmLPLfI+er1sM0imVy5iMxnGU8M8iwsrwHAYiMssZ06x+Wp+Sr3cW6Oz8ucr0d/V3gdi8Zf54GebmrrLnEJrZnj11wuKqOFfeRXB1BjyVcR7U13diESQcEuRCIo2IVIBAW7EImgYBciETq6G2/uyDfIrnsW2S0mSRylLJIfn49tS0YSHUiCAQCaCFOPFQvLcT8KRb7ru/mq66htZuostZ09txA+V57vqucQSU6p80tk0bn/B4+FffTSMJ1Ty3hiU7WP7/zPTU9S24mJqeB4X4k/r8ap8BwA2DHK13FDP1/HrnysnFX4Oi5GLuEGUSBi5bZ0ZxciERTsQiSCgl2IRFCwC5EICnYhEkHBLkQirEO517A0YPkyn0HkhHqsA0eOy3LVOk9YKEZqpDUapFZYJDEFESmkGKmD9q/+zceo7Zmf/4LaTk6dC47PRyS0eoNLXseOn6G2Iyd495FSeSw4vm10F53jpX5qq+b561Lo20ht9cpccPzcxEk6p6fM5cHjc6eprUJqJQLAaD9Pa+kphBNhGrWwjAoArIlPpJOX7uxCpIKCXYhEULALkQgKdiESQcEuRCIo2IVIhFVJb2Z2FMAsgAaAurvvjf1+03JYyoXllemFHjqvQdoTDfVxeW0g43JYPlKPrRmR5ZisQevqIZ5Ft7Bwntp++vePUNvpKV6v7/Rc+HzHTvBzHRt/m9qyrj5qa2QD1NY7MBIcL/Tw4+W7eBZdKdKSqSvHpcOz1XBbsbFtO+icyuI8tR05wqW3yekKtWXGn/dVG8O2QoNLecbqMkak3suhs3/E3XnOpRDiikAf44VIhNUGuwP4kZk9Y2b3XA6HhBBrw2o/xt/q7ifMbBOAH5vZq+7+xIW/0H4TuAcAhvp5lQ8hxNqyqju7u59o/z8B4GEAtwR+Z5+773X3vX3d6/BVfCEEgFUEu5n1mln/O48B/AGAly6XY0KIy8tqbrWjAB5ub/XnAfwvd//H2IR603BmMZzhM1kr03lP/Pyfg+O/sZtLLh95f1j6AYChSHHLJslsA4AcadOTy/GMpobztkURNQlHjh2htslFngHmPUPB8ayPSz+5oVlq6y4PUlu1wqWmKmmvNDDEX7OBPm6bOHWK2mbO84KT/cXwJd7VzWW+t85zcanQv4nazpx6i9r6TvM13jwQ9qXbIpmKpAgrIrLyJQe7ux8G8IFLnS+E6CyS3oRIBAW7EImgYBciERTsQiSCgl2IROhsr7eshPxguODgwjn+vlMrhgsKTi6EpTAAWKjy3mADRZ7Z1iR9t9rG4HCW8Yy9SpVLPGd48hrOznIJMFYQcWhjOJtrvjlD54yA+5hFMtGqBb6Olfmw1FSZ437sHN1AbQtEQgOACZLZBgBWCMuU05O8mCMiBUQX53lGXFbk18HEDM86HCfZcjtH+PWdYwlxsRaH3CSE+HVCwS5EIijYhUgEBbsQiaBgFyIROrob39Xdi/f91v+XBQsAOP4vr9F5fYPh3fhbPhw+FgD0ZMeorUp2igEgl+dJLVYI70w3vEzn9G/aTm3PHzhEbX1lvjO9def7qc1z4d3nQmTnvLkUbhkFANVqpMVWZK0yksTx8gsH6JyBUqRFUi9PkumN1LU7eSpcM65OlBUAyMgOPgAM9XN1YrrBk57OT3LbkVPTwfEto5vpnDxTlCLZVbqzC5EICnYhEkHBLkQiKNiFSAQFuxCJoGAXIhE6Kr3lsjx6BsOS0s6rr6PzFolqsWPXtXTOSI1LK1NHuCxXiyTCNOrhRIdbbvs4nbPjat4Ra9dvHqW2Z557gdqG+rgkc3IiXD8t77yMd6nAJS/wZcRcJClkmtSFG+rl54qcCo2IVDayMSzNAsBSLfx6nj0flrsAwCItu/ojdfLyGQ+naoUn3hx++3hwfGOZy3y7t4XbqHnk/q07uxCJoGAXIhEU7EIkgoJdiERQsAuRCAp2IRJhWenNzO4H8McAJtz9xvbYMIDvAbgKwFEAn3R3XmTrnWPlcshK4Qylk6cP0nl7fvuDwfHeQV7zK5s9QW2NeqRFTqTW2eG3w9l
ytw6F6+oBAHq2UVN/L5djuvI8k6s7Uuusq0gytiJ11bZuGaO2V958k9qKRV7nb2Y2vFZXbdtN51x3/Q3UNjnJL6++gTK1nTw1ERy3HK/vVh7iNf6mI7Xksohk191TprbF2fB1cIhcbwDQXQyfq1aPZClSy//jWwDueM/YfQAed/fdAB5v/yyEuIJZNtjb/dbf+w2JuwA80H78AICPX163hBCXm0v9m33U3cfbj0+h1dFVCHEFs+oNOnd3RL7paGb3mNl+M9s/Pc1rhgsh1pZLDfbTZjYGAO3/w7sgANx9n7vvdfe9g4MDl3g6IcRqudRgfxTA3e3HdwN45PK4I4RYK1YivX0XwO0ARszsOIAvAPgSgIfM7LMAjgH45EpOZpah0BW+u1cqvCDi0lI47a0QkaB6evmniN5IS6NSxrPe+vLhfk3f2vdNOudP/u291FaYP0VtxVIkeynHfdx19dbg+MTkSTqnMsez1zZvGqG2yRkuHS5Vw6/n1dfyTMVrruWZj9PPPUtt87Nz1DYzH/ax3uAS1eJiuB0TAJTLg9TWcC6VDZR5tl+9Gn49sxzvD3Z8PPxhukqy/IAVBLu7f5qYfn+5uUKIKwd9g06IRFCwC5EICnYhEkHBLkQiKNiFSISOFpyEGSwLSxALEfmnsrAYHC9EenLNnuNZXsi49FYAL0Q4Vg5nSr1xkPdsO3mc27DA5bBjx49S202beY+7rTvDxSi3TPBvNM8f4gU4h0tlausvc1nu8OGjwfGxLWFpEACmZvg3LGsRqez0Gd6rrukWHLdIcciFiPRmOX5dhc/UojdSqBLNcJZd0cLXPQBUz4VlW4+U7dSdXYhEULALkQgKdiESQcEuRCIo2IVIBAW7EInQWenNAZCeXZlzaWVsJNwfrqeLS28/PcALJQ5FivLtHubZSV2lsOxSzHOp5szEUWprLvHihTuu4UUss8jz7hkYCo6PjPLCl+cmedbYdCSzrRFRNzeS/mv5iFxaIdlfQDyba7HCs8PqxEk2DgCVJZ6BWa/z++OGkU3UZsavq6KFr5+SRfoOejjjsxApeqk7uxCJoGAXIhEU7EIkgoJdiERQsAuRCB3djTcDCvlwMslgH09OKfeHbdbku5UzzhMPzp7nKQsj/XxJeovhHdVGLlwjDwCOnjxKbaNDvJ7Zzmt5K6QKPx2eeibcRuvEON/57+8L7+ADQKHAWzy9fOgt7gi5jzQj95elyG783DxPCikP83ZNdZIIM36aFkRGbz9/XfIZTzTp6eE1EYusLRcA1MKJPI35KTpldFN/cDxf4G2tdGcXIhEU7EIkgoJdiERQsAuRCAp2IRJBwS5EIqyk/dP9AP4YwIS739ge+yKAPwdwpv1rn3f3x1ZywszCUsjmTeHaaS0niYwTSYAY28YTSfZH5LAp45KdZ+E6eYMjPKlicIAnQBS6wvIJAFwVkd76BsOJQQDwP+//dnB8IbJWM4uT1LawyGsDFiJXz+ah8POuTPJ6d/Mk0QgABgf46/Lqa29Q2+nTZ4LjM5GWUeUyf2IDvX3UljnXRAtVvo4ZqUW4sZcfb7ArHEf5yO17JXf2bwG4IzD+VXff0/63okAXQqwfywa7uz8BgL/1CyF+JVjN3+z3mtkBM7vfzPhXsIQQVwSXGuxfB3ANgD0AxgF8mf2imd1jZvvNbP/U1NQlnk4IsVouKdjd/bS7N9y9CeAbAGjXAnff5+573X1vuVy+RDeFEKvlkoLdzMYu+PETAF66PO4IIdaKlUhv3wVwO4ARMzsO4AsAbjezPWhVlTsK4C9WcrJcLkezfwaGuPRWb4TdLOV5JtF1u3ZQ2/5nuOQ1U7iW2po2Gxwf3crltVcO/gu1/c7v/Rm1/eLnfN78fKRNUvVscHzi1Nt0Tuw9f67GbXlwaWgoF86y29rNfZ8+wyW0esa3hUY3cVujEc6kW4y0eKos8rp785EaevUml/NqlRPUtqkQzujb0sez6Jbq4Tmxu/eywe7unw4Mf3O5eUKIKwt9g06IRFCwC5EICnYhEkHBLkQiKNiFSISOFpzM5XLo7QtnLw2NjNB5dQu7WckV6ZyuvgFqK5d5QcG33j5Fbbd+8P1hP+Z4O6me/nDWFQCMnzhObYdef53a6g3enihH6g3Oz0zTOf0bxqhteprLUIN9vBjl+667MTj+9Auv0jnPvnqU2m69/Q+prVDkEtXhQ4eC49Oz/HnFimJWFrm8tnOUS7rdvbyg6vBweJ7neQHOejVc+NJJVimgO7sQyaBgFyIRFOxCJIKCXYhEULALkQgKdiESoaPSm3sTzXpY8hgc5oX85hfDhQgXGrzvVpbx97Ed27dR2+sv88yr6YWwxNbXyzPstl9DTTj2Oi++eOLkOLV9+MMfpLaFhbA01L9lK50zvIUX53xrkktli0tcciz2hvuvDWzcTufc1M9flzNnwv3QAODosReobX4xLFNOTXMJbePGjdQ26Px12dnHJdFNA7wHW8HCmYDVGu9v10skthx4TOjOLkQiKNiFSAQFuxCJoGAXIhEU7EIkQkd345v1GmbPhXczuyO1vZYq4V1Oa3L3zfiu5Mgwb5/0eu4wtU1Mhlv4nMv4rvRgH6+td/2NPCHn8DFeM67GuyRhaiasduzevZvO2b2LSwbHxnkCzcsvv0ht586Gk1OKJa66DPXxRJLjL3NV4NQ5XtfOSLJUFmm9FWsdtpPnmWBHP08M6srxpJalSvj6aTZ5bcNanRyPX/a6swuRCgp2IRJBwS5EIijYhUgEBbsQiaBgFyIRVtL+aTuAvwMwitbG/j53/5qZDQP4HoCr0GoB9Ul3D/f8abO0tITDh8LS1o7dv0HndeXC0luzyhMF8l0RGSRi6+/n0lDfQLiu3fXXv4/O+cmPHqO2hWle765neBO1HTo+QW3bt4WTcna972Y6p1Tkl8HVO3iSz9Qkf7lfORhOKGo61w1PTPFEkhmSDAUAlQaXbWemwlLkps086eatc7w+3fB2LpeeK3E/0OTPbaoefm6e59fpEjleFTzhZiV39jqAv3H3GwB8CMBfmdkNAO4D8Li77wbwePtnIcQVyrLB7u7j7v5s+/EsgIMAtgK4C8AD7V97AMDH18hHIcRl4KL+ZjezqwDcBOBJAKPuv0zuPYXWx3whxBXKioPdzPoA/ADA59z9Xd9PdHcH+aKemd1jZvvNbP/sLC8YIIRYW1YU7GZWQCvQv+PuP2wPnzazsbZ9DEBw18jd97n7XnffG9v8EkKsLcsGu5kZWv3YD7r7Vy4wPQrg7vbjuwE8cvndE0JcLlaS9fa7AD4D4EUze7499nkAXwLwkJl9FsAxAJ9c7kALS3U8fygsG+248RY6r4lwtpmxzB8AaPL0n5nZWWqbmjpLbRuG9wTH77zjI3TOng9cT20P/fBhajPjEsrg4BC1bd0SlpT6Bsp0TlYPry8ADG/ml8jYrhq1TXeHZaPnXuD14sbneEqZF3g7r8HNPItx5JqwVJZFZK2Gcz9e83D7MgA4dIrLg8WMH3OxUgmOL0Qu73ozfH3MNnh24LLB7u4/A8A8/f3l5gshrg
z0DTohEkHBLkQiKNiFSAQFuxCJoGAXIhE6WnCy0jC8Pt0dtJ1t8AKAXghLE7kqL4boRJoAgFyO27aM8Wyzf/074cyxrgKXXHbt5G2X/uhPP0Vt33/4H6jt7Cn+vMenw8ULK5VDdE4RXOOZXOS2Q8d41h6qYVnOR3iG4NCmcJFKAGhGKim2vvNF5nWFj9m0cCFKAKhF2opNN/i5ugr8mF15Lr3NWzjLrlbg5/JmeH0bEclWd3YhEkHBLkQiKNiFSAQFuxCJoGAXIhEU7EIkQkelt6WG4fWp8PvLIz/jfcP27BwJjm8u8gyknkIkW2sz7782NsKzq665mhQpdF5McPzMOWq7/0Eurz37/CvUxnrfAQBNBHT+vu4NfrxGia9HI8eloTzCEms9Ig3Vc+E5ANAVu1IjWWqVavh5e47PyUcy4rIm7+vnFS5T1sHnFZphHzPjr1m1FvY/0uJQd3YhUkHBLkQiKNiFSAQFuxCJoGAXIhE6uhvfgGEuF04WePzZ1+m8N94Mt4y647dvoHOu2cLb9Bw5HG5NBAC3ffBGausiiQmzVb7D/NA/Pk1tz71yktoW6pFWQpHd4lwh/P7djNTkyxnfRY7tWjeaPAFoieww1xp8jhmvabeESFKI8+eWz5Od7ozf53p6eEJLEdz/Bt9wR8N4qDXIxHqNvy7F/nJw3HL8PLqzC5EICnYhEkHBLkQiKNiFSAQFuxCJoGAXIhGWld7MbDuAv0OrJbMD2OfuXzOzLwL4cwBn2r/6eXd/LHqyfB4bRjYGbZPnuXwyfn4qOP7zF3irm0ZtZ8QTLq1s3EySXQBYFpbDntr/Ep3zDz/9BbUtNXnNNeS59JbLXfx7dGOJJ7t4RJZrRuS1mOTFWigV8vySs4xLmMj4a5aPzMuy8PliTUazyPrmnMuDjUiyUTMiHTLNbvNmLh/3D4Rtb5Yi68Q9+CV1AH/j7s+aWT+AZ8zsx23bV939v67gGEKIdWYlvd7GAYy3H8+a2UEAvGSqEOKK5KI+D5rZVQBuAvBke+heMztgZvebGW8tKoRYd1Yc7GbWB+AHAD7n7jMAvg7gGgB70Lrzf5nMu8fM9pvZ/voib5UshFhbVhTs1qrC/wMA33H3HwKAu59294a7NwF8A0Cwwbq773P3ve6+N9/NG0EIIdaWZYPdzAzANwEcdPevXDA+dsGvfQIA35IWQqw7K9mN/10AnwHwopk93x77PIBPm9ketOS4owD+YrkDmRmVSQoFLjXVK2E54ejpGTpnaf4gtd1283XU1l0eo7bpSlgi+ecn99M5FeeZS7U6l3FKJZ7Z1ozUQVtYCLcSipFFMrKMJ70h0pEJJSJ5xbKyELFZicuU3d28dl2eSH21SEbZ7Pw8tTUiMuVSnb8ug0PhOooAMDoWtvVFCu8tzob/JPbItbGS3fifAQi95FFNXQhxZaFv0AmRCAp2IRJBwS5EIijYhUgEBbsQidDRgpNwR7NOsqhiGUNZWIaqgmc7TcwtUduzr/FCj3cucGll1sNyx4nz/JuBpT6eXVVf4P5Xlrj/PT0RqYm0vYodz3Lcj1ykXVMsg82JjOaR+0shIjfO1Xj2XbXOpTImy8Uy9mIS2nyk9VZfmctr5Y285Vi1Hj7ma6/yrM4CyUasVbl/urMLkQgKdiESQcEuRCIo2IVIBAW7EImgYBciETosvQFgWUPO5Y4sCxfrazqXhRo5XuDv6ASXyu5/iOf3fPT2vcHxIyfPBMcBYKERK0IYkaG6eOHArMhtPaSHWbGby1qLs1y6imWHeUSiKpCMrSzPX7PYubJIUclYH7vFhbmLnhM7V3lomNo2jPKMybPnJqlt6uyp8PhbvCfhtbt2hQ0RSVF3diESQcEuRCIo2IVIBAW7EImgYBciERTsQiRCR6W3LJ9huFwO2ioVLofNL4YzeYoZz/6qR2ShXKS45RNPHaC2IyfD2XLT87xw5OTcIrWRZCcAQG9vJFsuUlSwVAo/t3xEruvq5hllWSQjLl/gx2yQ+0g9InlZxObOfWzU+PpXa+FF7u7iUuTIhg3UNjTC5bVqJHNzqRgpHkn6szXzXD6er4Svq2ZEwtadXYhEULALkQgKdiESQcEuRCIo2IVIhGV3482sC8ATAErt3/++u3/BzHYBeBDABgDPAPiMu0f2lwFvOpbILmIp8raz1AjvthYyvhtc55vI8Bw/Wa6b74IfIwkvuUhyR73Gd5hjikGlUqG2+Uh7ohx5bmyXHgB6i3zXtzuSQJPLcf+LXeHzdffw9a1WeSLM2UmeSNIEn5cvhNdjaKCXzhkdLlPb5s08EWZqntf5m506T21z01PB8fIwP9fZM2eD4/VIMtFK7uxLAD7q7h9Aqz3zHWb2IQB/C+Cr7n4tgPMAPruCYwkh1ollg91bvJMnWGj/cwAfBfD99vgDAD6+Fg4KIS4PK+3PnrU7uE4A+DGANwFMuf+yRelxAFvXxEMhxGVhRcHu7g133wNgG4BbAFy/0hOY2T1mtt/M9tcWeItlIcTaclG78e4+BeCfAHwYQNnsl429twE4Qebsc/e97r630DOwGl+FEKtg2WA3s41mVm4/7gbwMQAH0Qr6P23/2t0AHlkjH4UQl4GVJMKMAXjAzDK03hwecve/N7NXADxoZv8ZwHMAvrncgZrNJpYWw5JSKTM6r4d42azxJJNI1yI0wSWjWCJBk7SbqlcjCRwN/rxiLYhitmYkEYZJb+fPc+lnMrKOA31cohqM1GMbILXwusClvEaTS1d5iyTrlPiLvVQJH7OU569L7Fz1hemIjfs/N3WO2pokWaerxCXRCquTZ5HnRS1t3P0AgJsC44fR+vtdCPErgL5BJ0QiKNiFSAQFuxCJoGAXIhEU7EIkgsUknst+MrMzAI61fxwBEE7d6Szy493Ij3fzq+bHTnffGDJ0NNjfdWKz/e4ebp4mP+SH/LjsfuhjvBCJoGAXIhHWM9j3reO5L0R+vBv58W5+bfxYt7/ZhRCdRR/jhUiEdQl2M7vDzF4zs0Nmdt96+ND246iZvWhmz5vZ/g6e934zmzCzly4YGzazH5vZG+3/h9bJjy+a2Yn2mjxvZnd2wI/tZvZPZvaKmb1sZn/dHu/omkT86OiamFmXmT1lZi+0/fhP7fFdZvZkO26+Z2a84moId+/oPwAZWmWtrgZQBPACgBs67Ufbl6MARtbhvLcBuBnASxeM/RcA97Uf3wfgb9fJjy8C+PcdXo8xADe3H/cDeB3ADZ1ek4gfHV0TAAagr/24AOBJAB8C8BCAT7XH/zuAv7yY467Hnf0WAIfc/bC3Sk8/COCudfBj3XD3JwC8tzbyXWgV7gQ6VMCT+NFx3H3c3Z9tP55FqzjKVnR4TSJ+dBRvcdmLvK5HsG8F8PYFP69nsUoH8CMze8bM7lknH95h1N3H249PARhdR1/uNbMD7Y/5a/7nxIWY2VVo1U94Euu4Ju/xA+jwmqxFkdfUN+hudfebAfwhgL8ys9vW2yGg9c6O1hvRevB1ANeg1SNgHMCXO3ViM+sD8AMAn3P3d1Un7eSaBPzo+Jr4Koq8MtYj2E8A2
H7Bz7RY5Vrj7ifa/08AeBjrW3nntJmNAUD7/4n1cMLdT7cvtCaAb6BDa2JmBbQC7Dvu/sP2cMfXJOTHeq1J+9xTuMgir4z1CPanAexu7ywWAXwKwKOddsLMes2s/53HAP4AwEvxWWvKo2gV7gTWsYDnO8HV5hPowJqYmaFVw/Cgu3/lAlNH14T50ek1WbMir53aYXzPbuOdaO10vgngP6yTD1ejpQS8AODlTvoB4LtofRysofW312fR6pn3OIA3APwEwPA6+fFtAC8COIBWsI11wI9b0fqIfgDA8+1/d3Z6TSJ+dHRNAPwWWkVcD6D1xvIfL7hmnwJwCMD/BlC6mOPqG3RCJELqG3RCJIOCXYhEULALkQgKdiESQcEuRCIo2IVIBAW7EImgYBciEf4vt7E0CllzrOkAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import pkg_resources as pk\n", "import matplotlib.pyplot as plt\n", @@ -812,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -829,20 +572,9 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[3.]], dtype=float32)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ret[oname]" ] @@ -874,20 +606,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[sudo] password for xilinx: Requirement already satisfied: dataset_loading from git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading in /usr/local/lib/python3.6/dist-packages\n", - "Requirement already satisfied: Pillow in /usr/lib/python3/dist-packages (from dataset_loading)\n", - "Requirement already satisfied: scipy in /usr/lib/python3/dist-packages (from dataset_loading)\n", - "Connection to 99.121.248.96 closed.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'" ] @@ -905,31 +626,9 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[sudo] password for xilinx: Tar File found in dest_dir. Not Downloading again\n", - "Extracting Python CIFAR10 data.\n", - "Files extracted\n", - "batch 1 / 10 : total OK 851 NOK 149\n", - "batch 2 / 10 : total OK 1683 NOK 317\n", - "batch 3 / 10 : total OK 2522 NOK 478\n", - "batch 4 / 10 : total OK 3370 NOK 630\n", - "batch 5 / 10 : total OK 4207 NOK 793\n", - "batch 6 / 10 : total OK 5044 NOK 956\n", - "batch 7 / 10 : total OK 5887 NOK 1113\n", - "batch 8 / 10 : total OK 6728 NOK 1272\n", - "batch 9 / 10 : total OK 7570 NOK 1430\n", - "batch 10 / 10 : total OK 8419 NOK 1581\n", - "Final accuracy: 84.190000\n", - "Connection to 99.121.248.96 closed.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset cifar10 --batchsize 1000'" ] @@ -940,13 +639,6 @@ "source": [ "We see that the final top-1 accuracy is 84.19%, which is very close to the 84.22% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). 
" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a1a8450225..a6f05df309 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -42,15 +42,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from finn.util.visualization import showSrc, showInNetron\n", "from finn.util.basic import make_build_dir\n", - "\n", + "import os\n", " \n", - "build_dir = \"/workspace/finn\"" + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -70,78 +70,38 @@ "metadata": {}, "source": [ "## 1. Brevitas export \n", - "FINN expects an ONNX model as input. This can be a model trained with [Brevitas](https://github.com/Xilinx/brevitas). Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several [example Brevitas networks](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). To show the FINN end-to-end flow, we'll use the TFC-w1a1 model as example network.\n", + "FINN expects an ONNX model as input. This can be a model trained with [Brevitas](https://github.com/Xilinx/brevitas). Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several [example Brevitas networks](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). To show the FINN end-to-end flow, we'll use the TFC-w1a1 model as example network.\n", "\n", "First a few things have to be imported. Then the model can be loaded with the pretrained weights." ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: \"https://github.com/Xilinx/brevitas/releases/download/bnn_pynq-r1/tfc_1w1a-45185b4d.pth\" to /home/maltanar/.cache/torch/checkpoints/tfc_1w1a-45185b4d.pth\n", - "100%|██████████| 249073/249073 [00:00<00:00, 767315.58it/s]\n", - "/workspace/brevitas/brevitas_examples/bnn_pynq/models/FC.py:84: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " x = 2.0 * x - torch.tensor([1.0], device=x.device)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import onnx\n", "from finn.util.test import get_test_model_trained\n", "import brevitas.onnx as bo\n", "\n", "tfc = get_test_model_trained(\"TFC\", 1, 1)\n", - "bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\")" + "bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The model was now exported, loaded with the pretrained weights and saved under the name \"lfc_w1_a1.onnx\".\n", + "The model was now exported, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. 
For example, you can click on the individual nodes and view the properties." ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir+\"/tfc_w1_a1.onnx\")" ] }, @@ -150,16 +110,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that FINN `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx); this repository contains functionality that is used in FINN." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")" ] }, @@ -243,14 +203,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", - "from finn.transformation.infer_shapes import InferShapes\n", - "from finn.transformation.infer_datatypes import InferDataTypes\n", - "from finn.transformation.fold_constants import FoldConstants\n", + "from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", + "from qonnx.transformation.infer_shapes import InferShapes\n", + "from qonnx.transformation.infer_datatypes import InferDataTypes\n", + "from qonnx.transformation.fold_constants import FoldConstants\n", "\n", "model = model.transform(InferShapes())\n", "model = model.transform(FoldConstants())\n", @@ -271,40 +231,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_tidy.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir+\"/tfc_w1_a1_tidy.onnx\")" ] }, @@ -319,57 +248,18 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operations into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. 
Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/brevitas_examples/bnn_pynq/trainer.py#L85), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L104), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor`) and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." ] }, { "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_with_preproc.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/finn/src/finn/transformation/infer_data_layouts.py:113: UserWarning: Assuming 4D input is NCHW\n", - " warnings.warn(\"Assuming 4D input is NCHW\")\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.util.pytorch import ToTensor\n", - "from finn.transformation.merge_onnx_models import MergeONNXModels\n", - "from finn.core.datatype import DataType\n", + "from qonnx.transformation.merge_onnx_models import MergeONNXModels\n", + "from qonnx.core.datatype import DataType\n", "\n", "model = ModelWrapper(build_dir+\"/tfc_w1_a1_tidy.onnx\")\n", "global_inp_name = model.graph.input[0].name\n", @@ -401,42 +291,11 @@ }, { "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_pre_post.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from finn.transformation.insert_topk import InsertTopK\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.transformation.insert_topk import InsertTopK\n", "\n", "# postprocessing: insert Top-1 node at the end\n", "model = model.transform(InsertTopK(k=1))\n", @@ -472,49 +331,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "class Streamline(Transformation):\n", - " \"\"\"Apply the streamlining transform, see arXiv:1709.04060.\"\"\"\n", - "\n", - " def apply(self, 
model):\n", - " streamline_transformations = [\n", - " ConvertSubToAdd(),\n", - " ConvertDivToMul(),\n", - " BatchNormToAffine(),\n", - " ConvertSignToThres(),\n", - " AbsorbSignBiasIntoMultiThreshold(),\n", - " MoveAddPastMul(),\n", - " MoveScalarAddPastMatMul(),\n", - " MoveAddPastConv(),\n", - " MoveScalarMulPastMatMul(),\n", - " MoveScalarMulPastConv(),\n", - " MoveAddPastMul(),\n", - " CollapseRepeatedAdd(),\n", - " CollapseRepeatedMul(),\n", - " AbsorbAddIntoMultiThreshold(),\n", - " FactorOutMulSignMagnitude(),\n", - " AbsorbMulIntoMultiThreshold(),\n", - " Absorb1BitMulIntoMatMul(),\n", - " Absorb1BitMulIntoConv(),\n", - " RoundAndClipThresholds(),\n", - " ]\n", - " for trn in streamline_transformations:\n", - " model = model.transform(trn)\n", - " model = model.transform(RemoveIdentityOps())\n", - " model = model.transform(GiveUniqueNodeNames())\n", - " model = model.transform(GiveReadableTensorNames())\n", - " model = model.transform(InferDataTypes())\n", - " return (model, False)\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.transformation.streamline import Streamline\n", "showSrc(Streamline)" @@ -531,40 +350,9 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_streamlined.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants\n", "import finn.transformation.streamline.absorb as absorb\n", @@ -582,7 +370,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see that the network has become simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers, and the `Sign` nodes have been replaced with `MultiThreshold` nodes instead. \n", + "You can see that the network has become simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers. \n", "\n", "**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. 
We hope to rectify this in future releases.**\n", "\n", @@ -591,45 +379,14 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1a1_ready_for_hls_conversion.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount\n", "from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds\n", - "from finn.transformation.infer_data_layouts import InferDataLayouts\n", - "from finn.transformation.general import RemoveUnusedTensors\n", + "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", + "from qonnx.transformation.general import RemoveUnusedTensors\n", "\n", "model = model.transform(ConvertBipolarMatMulToXnorPopcount())\n", "model = model.transform(absorb.AbsorbAddIntoMultiThreshold())\n", @@ -658,60 +415,27 @@ "metadata": {}, "source": [ "### Conversion to HLS layers \n", - "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). In our case this transformation converts pairs of binary XnorPopcountMatMul layers to StreamingFCLayer_Batch layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n", + "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n", "\n", - "Below is the code for the transformation and the network is visualized using netron to create the new structure with `StreamingFCLayer_Batch` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/fclayer.html#_CPPv4I_j_j_j_j000_i_i000E22StreamingFCLayer_BatchvRN3hls6streamI7ap_uintI9InStreamWEEERN3hls6streamI7ap_uintI10OutStreamWEEERK2TWRK2TAKjRK1R) library." + "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MatrixVectorActivation` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/matrixvector.html) library." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Note:** The transformation `to_hls.InferBinaryStreamingFCLayer` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. For details please have a look on the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals." + "**Note:** The transformation `to_hls.InferBinaryMatrixVectorActivation` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. 
For details please have a look at the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals." ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_hls_layers.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "model = model.transform(to_hls.InferBinaryStreamingFCLayer(\"decoupled\"))\n", + "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(\"decoupled\"))\n", "# TopK to LabelSelect\n", "model = model.transform(to_hls.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", @@ -724,7 +448,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Each StreamingFCLayer_Batch node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network." + "Each MatrixVectorActivation node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating a Dataflow Partition \n", "\n", - "In the graph above, you can see that there is a mixture of FINN HLS layers (StreamingFCLayer_Batch) with regular ONNX layers (Reshape, Mul, Add). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition:" + "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layer (Reshape). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition."
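As a reference for the partitioning step described above, here is a minimal sketch of the flow; it matches the notebook cells that follow in this diff, with `model` assumed to hold the graph after the to_hls conversions.

```python
# Sketch: split off the HLS layers, then follow the placeholder node's
# "model" attribute to the extracted, dataflow-only child graph.
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp

parent_model = model.transform(CreateDataflowPartition())
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
dataflow_model_filename = getCustomOp(sdp_node).get_nodeattr("model")
dataflow_model = ModelWrapper(dataflow_model_filename)  # continue on the child
```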
] }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_dataflow_parent.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n", "\n", @@ -785,47 +478,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the StreamingFCLayer instances have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" + "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" ] }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/tmp/finn_dev_maltanar/dataflow_partition0_q1ym9aul/df_model.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from finn.custom_op.registry import getCustomOp\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.registry import getCustomOp\n", "sdp_node = parent_model.get_nodes_by_op_type(\"StreamingDataflowPartition\")[0]\n", "sdp_node = getCustomOp(sdp_node)\n", "dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n", @@ -836,12 +498,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see all the extracted `StreamingFCLayer` instances have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." + "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -856,65 +518,23 @@ "\n", "*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n", "\n", - "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a StreamingFCLayer_Batch operation. This is where the Netron visualization helps us, in the above diagram we can see that the first four nodes are StreamingFCLayer_Batch. 
So as an example we extract the first node." + "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us: in the above diagram we can see that the model contains four MatrixVectorActivation nodes. So as an example we extract the second node of the graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/master/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." ] }, { "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CustomOp wrapper is of class Thresholding_Batch\n" - ] - }, - { - "data": { - "text/plain": [ - "{'PE': ('i', True, 0),\n", - " 'NumChannels': ('i', True, 0),\n", - " 'ram_style': ('s', False, 'distributed'),\n", - " 'inputDataType': ('s', True, ''),\n", - " 'outputDataType': ('s', True, ''),\n", - " 'inFIFODepth': ('i', False, 2),\n", - " 'outFIFODepth': ('i', False, 2),\n", - " 'numInputVectors': ('ints', False, [1]),\n", - " 'ActVal': ('i', False, 0),\n", - " 'backend': ('s', True, 'fpgadataflow'),\n", - " 'code_gen_dir_cppsim': ('s', False, ''),\n", - " 'code_gen_dir_ipgen': ('s', False, ''),\n", - " 'executable_path': ('s', False, ''),\n", - " 'ipgen_path': ('s', False, ''),\n", - " 'ip_path': ('s', False, ''),\n", - " 'ip_vlnv': ('s', False, ''),\n", - " 'exec_mode': ('s', False, ''),\n", - " 'cycles_rtlsim': ('i', False, 0),\n", - " 'cycles_estimate': ('i', False, 0),\n", - " 'rtlsim_trace': ('s', False, ''),\n", - " 'res_estimate': ('s', False, ''),\n", - " 'res_hls': ('s', False, ''),\n", - " 'res_synth': ('s', False, ''),\n", - " 'rtlsim_so': ('s', False, ''),\n", - " 'partition_id': ('i', False, 0)}" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fc0 = model.graph.node[0]\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fc0 = model.graph.node[1]\n", "fc0w = getCustomOp(fc0)\n", "\n", "print(\"CustomOp wrapper is of class \" + fc0w.__class__.__name__)\n", @@ -926,17 +546,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints.\n", + "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints. 
There are also a lot of additional attributes that can be set for this node type.\n", "**In this notebook we are setting the folding factors and FIFO depths manually, but in a future version we will support determining the folding factors given an FPGA resource budget according to the analytical model from the [FINN-R paper](https://arxiv.org/pdf/1809.04570).**" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "fc_layers = model.get_nodes_by_op_type(\"StreamingFCLayer_Batch\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", " (16, 49, 16, 64, \"block\"),\n", @@ -977,42 +597,9 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_set_folding_factors.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model.save(build_dir+\"/tfc_w1_a1_set_folding_factors.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_set_folding_factors.onnx\")" @@ -1038,17 +625,9 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dict_keys(['Ultra96', 'Pynq-Z1', 'Pynq-Z2', 'ZCU102', 'ZCU104'])\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# print the names of the supported PYNQ boards\n", "from finn.util.basic import pynq_part_map\n", @@ -1057,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1076,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1085,9 +664,26 @@ "model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator." 
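To summarize the hardware build steps introduced in this hunk, here is a minimal sketch under the notebook's defaults (Pynq-Z1 board, 10 ns clock); `model` is assumed to be the folded `ModelWrapper`, and the save filename mirrors the one used later in this diff.

```python
# Sketch, assuming `model` is the folded ModelWrapper and `build_dir` is set.
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver

model = model.transform(ZynqBuild(platform="Pynq-Z1", period_ns=10))
model = model.transform(MakePYNQDriver("zynq-iodma"))  # generate the PYNQ driver
model.save(build_dir + "/tfc_w1_a1_post_synthesis.onnx")
```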
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n", + "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + ] + }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1105,40 +701,9 @@ }, { "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/workspace/finn/tfc_w1_a1_post_synthesis.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 99, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showInNetron(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")" ] @@ -1152,40 +717,9 @@ }, { "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Stopping http://0.0.0.0:8081\n", - "Serving '/tmp/finn_dev_maltanar/dataflow_partition2_b6c72_s0/df_model.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n", "sdp_node_middle = getCustomOp(model.graph.node[1])\n", @@ -1203,34 +737,9 @@ }, { "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[key: \"pynq_driver_dir\"\n", - "value: \"/tmp/finn_dev_maltanar/pynq_driver_kl300vbh\"\n", - ", key: \"vivado_stitch_proj\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_stitch_proj_yy5ixo91\"\n", - ", key: \"clk_ns\"\n", - "value: \"10\"\n", - ", key: \"wrapper_filename\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_stitch_proj_yy5ixo91/finn_vivado_stitch_proj.srcs/sources_1/bd/StreamingDataflowPartition_1/hdl/StreamingDataflowPartition_1_wrapper.v\"\n", - ", key: \"vivado_stitch_vlnv\"\n", - "value: \"xilinx_finn:finn:StreamingDataflowPartition_1:1.0\"\n", - ", key: \"vivado_stitch_ifnames\"\n", - "value: \"{\\'clk\\': [\\'ap_clk\\'], \\'rst\\': [\\'ap_rst_n\\'], \\'s_axis\\': [\\'s_axis_0\\'], \\'m_axis\\': [\\'m_axis_0\\'], \\'aximm\\': [], \\'axilite\\': []}\"\n", - ", key: \"platform\"\n", - "value: \"zynq-iodma\"\n", - "]" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = ModelWrapper(postsynth_layers)\n", "model.model.metadata_props" @@ -1252,32 +761,9 @@ }, { "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[key: \"pynq_driver_dir\"\n", - "value: \"/tmp/finn_dev_maltanar/pynq_driver_kl300vbh\"\n", - ", key: \"vivado_pynq_proj\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f\"\n", - ", key: \"bitfile\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/resizer.bit\"\n", - ", key: \"hw_handoff\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/resizer.hwh\"\n", - ", key: \"vivado_synth_rpt\"\n", - "value: 
\"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/synth_report.xml\"\n", - ", key: \"platform\"\n", - "value: \"zynq-iodma\"\n", - "]" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n", "model.model.metadata_props" @@ -1292,20 +778,9 @@ }, { "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NA\t\t\t finn_zynq_link.runs resizer.bit\t vivado.jou\r\n", - "finn_zynq_link.cache\t finn_zynq_link.srcs resizer.hwh\t vivado.log\r\n", - "finn_zynq_link.hw\t finn_zynq_link.xpr synth_project.sh\r\n", - "finn_zynq_link.ip_user_files ip_config.tcl\t synth_report.xml\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ls {model.get_metadata_prop(\"vivado_pynq_proj\")}" ] @@ -1344,27 +819,15 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 5.4.0-xilinx-v2020.1 armv7l)\r\n", - "\r\n", - " * Pure upstream Kubernetes 1.21, smallest, simplest cluster ops!\r\n", - "\r\n", - " https://microk8s.io/\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "\n", "# set up the following values according to your own environment\n", "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = os.getenv(\"PYNQ_IP\", \"192.168.2.99\")\n", + "ip = \"192.168.2.99\"\n", "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", "port = os.getenv(\"PYNQ_PORT\", 22)\n", @@ -1378,7 +841,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1397,68 +860,18 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[key: \"pynq_driver_dir\"\n", - "value: \"/tmp/finn_dev_maltanar/pynq_driver_kl300vbh\"\n", - ", key: \"vivado_pynq_proj\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f\"\n", - ", key: \"bitfile\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/resizer.bit\"\n", - ", key: \"hw_handoff\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/resizer.hwh\"\n", - ", key: \"vivado_synth_rpt\"\n", - "value: \"/tmp/finn_dev_maltanar/vivado_zynq_proj_kdf60v6f/synth_report.xml\"\n", - ", key: \"platform\"\n", - "value: \"zynq-iodma\"\n", - ", key: \"pynq_ip\"\n", - "value: \"192.168.2.99\"\n", - ", key: \"pynq_port\"\n", - "value: \"22\"\n", - ", key: \"pynq_username\"\n", - "value: \"xilinx\"\n", - ", key: \"pynq_password\"\n", - "value: \"xilinx\"\n", - ", key: \"pynq_target_dir\"\n", - "value: \"/home/xilinx/finn_tfc_end2end_example\"\n", - ", key: \"pynq_deployment_dir\"\n", - "value: \"/tmp/finn_dev_maltanar/pynq_deployment_3wrnn2sp\"\n", - ", key: \"pynq_deploy_dir\"\n", - "value: \"/tmp/finn_dev_maltanar/pynq_deployment_3wrnn2sp\"\n", - ", key: \"exec_mode\"\n", - "value: \"remote_pynq\"\n", - "]" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model.model.metadata_props" ] }, { "cell_type": "code", - 
"execution_count": 106, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/home/xilinx/finn_tfc_end2end_example/pynq_deployment_3wrnn2sp'" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n", "target_dir_pynq" @@ -1466,27 +879,9 @@ }, { "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 4236\r\n", - "-rw-r--r-- 1 xilinx xilinx 8490 Sep 21 11:06 driver.py\r\n", - "drwxr-xr-x 4 xilinx xilinx 4096 Sep 21 11:06 finn\r\n", - "-rw-r--r-- 1 xilinx xilinx 3264 Sep 21 12:05 input.npy\r\n", - "-rw-r--r-- 1 root root 205 Sep 21 12:34 nw_metrics.txt\r\n", - "-rw-r--r-- 1 root root 84 Sep 21 12:06 output.npy\r\n", - "drwxrwxr-x 2 xilinx xilinx 4096 Sep 21 11:34 __pycache__\r\n", - "-rw-r--r-- 1 xilinx xilinx 4045671 Sep 21 11:06 resizer.bit\r\n", - "-rw-r--r-- 1 xilinx xilinx 246211 Sep 21 11:06 resizer.hwh\r\n", - "-rw-r--r-- 1 root root 32 Sep 21 12:34 sds_trace_data.dat\r\n", - "-rw-r--r-- 1 xilinx xilinx 1727 Sep 21 11:06 validate.py\r\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'" ] @@ -1500,55 +895,24 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAARYElEQVR4nO3dfYyVZXrH8d/FoDAw8iYRCaisG/5QqmUbgk1KyOKmxlUMbKJm/aPauAmarMmqTVqz/UOSaqJVa/pH3YStL9CsmiWoq0a7a82mWo1GNFQQW1CULGR4E5H3t+HqH/NgZ3We6549z3nOc9z7+0kmM3Ouec65OTM/zsv13Pdt7i4Af/xGNT0AAJ1B2IFMEHYgE4QdyARhBzIxupM3Zma89Z+ZUaPKH09OnTpV23VXvf6enp6wPjAw0PJ1183dbbjLK4XdzK6U9M+SeiT9q7vfV+X6cmU27O/mS6k/6ip/eKNHx38CqcCk6r29vaW1Q4cOhcem9PX1hfUDBw6U1lIt50mTJoX1zz77LKx3o5afxptZj6R/kfR9SRdLusHMLm7XwAC0V5XX7PMlfeTuW9z9uKSnJS1pz7AAtFuVsM+Q9Lsh328rLvs9ZrbMzNaa2doKtwWgotrfoHP3FZJWSLxBBzSpyiP7dknnDfl+ZnEZgC5UJezvSJptZt8yszMl/VDS8+0ZFoB2a/lpvLufNLPbJP1ag623x9z9g7aNLCPjx48P6wcPHmz5useMGRPWjx07FtZTbcFx48aF9ai9lmoppqSOj9prqT76vn37WhlSV6v0mt3dX5L0UpvGAqBGnC4LZIKwA5kg7EAmCDuQCcIOZIKwA5mwTq4um+vpsqled6qXffTo0bA+duzYlo9Nia676vWfffbZYb3qNNLofp06dWp47O7du8N6amrwyZMnw3qdyuaz88gOZIKwA5kg7EAmCDuQCcIOZIKwA5mg9fYNkGrNVfkd1nnddUtNDa6yem1q6m5qanCTS03TegMyR9iBTBB2IBOEHcgEYQcyQdiBTBB2IBP02TvgrLPOCuvRbqOSNHHixLB+4sSJ0lpqN9LUFNbPP/88rC9YsCCs33rrraW1VC/6jjvuCOtbt24N601OM20SfXYgc4QdyARhBzJB2IFMEHYgE4QdyARhBzJBn/0b4JFHHgnrUS871Wuuuox1b29vWI+ktk2+5JJLwvqmTZvC+vHjx0trZ5xxRnhsdO6ClP53HzlyJKzXqazPXmnLZjP7VNIBSQOSTrr7vCrXB6A+lcJeWOTue9pwPQBqxGt2IBNVw+6SfmNm75rZsuF+wMyWmdlaM1tb8bYAVFD1afwCd99uZudIesXM/sfdXxv6A+6+QtIKiTfogCZVemR39+3F512SnpU0vx2DAtB+LYfdzMab2Vmnv5Z0haQN7RoYgPaq8jR+mqRniz7taElPuvu/t2VUf2RSWzYvWrQorF922WVhPeqVHzx4MDw21W/u6+sL66nzNKI566m11x999NGWr1uS7rzzztLaW2+9FR5b93bSTWg57O6+RdKftnEsAGpE6w3IBGEHMkHYgUwQdiAThB3IBFNcu0Bqqubs2bPD+v79+0trEyZMCI+NpoFK6SmwVbZ8TrX9UlJLcO/du7e0tnTp0vDYdevWhfVUSzLV8qwTS0kDmSPsQCYIO5AJwg5kgrADmSDsQCYIO5CJdiw42TFRT7fOfnBK6thU/ZZbbgnrq1atCuszZ85s+bZTffZ77rknrK9evTqsn3nmmaW1K664Ijz2wQcfDOuprbCj2168eHF47LZt28L6nj3fvDVWeWQHMkHYgUwQdiAThB3IBGEHMkHYgUwQ
[... several kilobytes of base64-encoded PNG data (the MNIST test digit rendered by plt.imshow) elided ...]\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from pkgutil import get_data\n", "import onnx.numpy_helper as nph\n", "import matplotlib.pyplot as plt\n", "\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "x = nph.to_array(onnx.load_tensor_from_string(raw_i))\n", "plt.imshow(x.reshape(28,28), cmap='gray')" ] }, { "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Expected network input shape is [1, 784]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")\n", "iname = model.graph.input[0].name\n", @@ -1566,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1579,20 +943,9 @@ }, { "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.]], dtype=float32)" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ret[oname]" ] @@ -1612,7 +965,7 @@ "\n", "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n", "\n", - "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n", + "**Ensure that your PYNQ board has a working internet connecting for the next steps, since there is some downloading involved.**\n", "\n", "To validate the accuracy, we first need to install the [`dataset-loading`](https://github.com/fbcotter/dataset_loading) Python package to the PYNQ board. This will give us a convenient way of downloading and accessing the MNIST dataset.\n", "\n", @@ -1624,22 +977,9 @@ }, { "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[sudo] password for xilinx: Collecting git+https://github.com/fbcotter/dataset_loading.git@0.0.4\n", - " Cloning https://github.com/fbcotter/dataset_loading.git (to 0.0.4) to /tmp/pip-hhwx4j3n-build\n", - " Requirement already satisfied (use --upgrade to upgrade): dataset-loading==0.0.4 from git+https://github.com/fbcotter/dataset_loading.git@0.0.4 in /usr/local/lib/python3.6/dist-packages\n", - "Requirement already satisfied: Pillow in /usr/lib/python3/dist-packages (from dataset-loading==0.0.4)\n", - "Requirement already satisfied: scipy in /usr/lib/python3/dist-packages (from dataset-loading==0.0.4)\n", - "Connection to 192.168.2.99 closed.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! 
ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'" ] @@ -1657,36 +997,9 @@ }, { "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[sudo] password for xilinx: Looking for Train Imgs\n", - "Tar File found in data_dir. Not Downloading again\n", - "Looking for Train Labels\n", - "Tar File found in data_dir. Not Downloading again\n", - "Looking for Test Imgs\n", - "Tar File found in data_dir. Not Downloading again\n", - "Looking for Test Labels\n", - "Tar File found in data_dir. Not Downloading again\n", - "batch 0 / 10 : total OK 913 NOK 87\n", - "batch 1 / 10 : total OK 1800 NOK 200\n", - "batch 2 / 10 : total OK 2714 NOK 286\n", - "batch 3 / 10 : total OK 3619 NOK 381\n", - "batch 4 / 10 : total OK 4535 NOK 465\n", - "batch 5 / 10 : total OK 5488 NOK 512\n", - "batch 6 / 10 : total OK 6438 NOK 562\n", - "batch 7 / 10 : total OK 7399 NOK 601\n", - "batch 8 / 10 : total OK 8371 NOK 629\n", - "batch 9 / 10 : total OK 9296 NOK 704\n", - "Final accuracy: 92.960000\n", - "Connection to 192.168.2.99 closed.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'" ] @@ -1695,7 +1008,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the final top-1 accuracy is 92.96%, which is very close to the 93.17% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). " + "We see that the final top-1 accuracy is 92.96%, which is very close to the 93.17% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). " ] }, { @@ -1709,23 +1022,9 @@ }, { "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Network metrics:\n", - "runtime[ms]: 10.43391227722168\n", - "throughput[images/s]: 958413.2714850444\n", - "DRAM_in_bandwidth[Mb/s]: 751.3960048442748\n", - "DRAM_out_bandwidth[Mb/s]: 0.9584132714850445\n", - "fclk[mhz]: 100.0\n", - "N: 10000\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.core.throughput_test import throughput_test_remote\n", "\n", @@ -1745,17 +1044,9 @@ }, { "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "We reach approximately 61% of the ideal performance.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "II = 64\n", "# frequency in MHz\n", @@ -1774,13 +1065,6 @@ "source": [ "The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases." 
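For reference, the "approximately 61% of the ideal performance" line printed by the removed output follows directly from the clock frequency and the folding factor. A minimal sketch of that arithmetic, reusing the fclk[mhz], II and throughput[images/s] values that appear in the outputs above (the variable names are ours):

```python
# Ideal-performance estimate: with the slowest layer folded to an initiation
# interval (II) of 64, the accelerator accepts one image every 64 clock cycles.
fclk_mhz = 100.0             # fclk[mhz] from the nw_metrics output above
ii = 64                      # II set during folding earlier in the flow
measured = 958413.27         # throughput[images/s] from the removed output

ideal = fclk_mhz * 1e6 / ii  # = 1562500 images/s
print("We reach approximately %d%% of the ideal performance."
      % round(100 * measured / ideal))
```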
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 4a5d3dd07a..813127197e 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -28,14 +28,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from finn.util.basic import make_build_dir\n", "from finn.util.visualization import showSrc, showInNetron\n", - " \n", - "build_dir = \"/workspace/finn\"" + "import os\n", + "\n", + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -47,22 +48,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[-1.119972 , -1.7596636, 0.8423852, -1.0705007, -1.3218282,\n", - " -1.5030646, -1.4598225, -1.2803943, -1.0334575, -1.7878995]],\n", - " dtype=float32)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from pkgutil import get_data\n", "import onnx\n", @@ -71,7 +59,7 @@ "from finn.util.test import get_test_model_trained\n", "\n", "fc = get_test_model_trained(\"TFC\", 1, 1)\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()\n", "output_golden = fc.forward(input_brevitas).detach().numpy()\n", @@ -84,51 +72,18 @@ "source": [ "## Simulation using Python \n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (backend $\\neq$ \"fpgadataflow\") this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", "\n", - "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution functions are defined. The following is an example of the execution function of a XNOR popcount node.\n" + "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. 
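As an aside to the paragraph above: a standard ONNX node can be exercised with onnxruntime on its own, which is what FINN's executor does for non-custom nodes. A self-contained sketch (the node choice, tensor names and pinned opset are purely illustrative):

```python
# Run a single standard ONNX node (Relu) through onnxruntime.
import numpy as np
import onnx
from onnx import TensorProto, helper
import onnxruntime as rt

node = helper.make_node("Relu", ["x"], ["y"])
graph = helper.make_graph(
    [node], "single_node_demo",
    [helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 4])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 4])],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
sess = rt.InferenceSession(model.SerializeToString(),
                           providers=["CPUExecutionProvider"])
print(sess.run(None, {"x": np.array([[-1.0, 0.5, -2.0, 3.0]],
                                    dtype=np.float32)})[0])
# -> [[0.  0.5 0.  3. ]]
```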
The following is an example of the execution function of a XNOR popcount node.\n" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def xnorpopcountmatmul(inp0, inp1):\n", - " \"\"\"Simulates XNOR-popcount matrix multiplication as a regular bipolar\n", - " matrix multiplication followed by some post processing.\"\"\"\n", - " # extract the operand shapes\n", - " # (M, K0) = inp0.shape\n", - " # (K1, N) = inp1.shape\n", - " K0 = inp0.shape[-1]\n", - " K1 = inp1.shape[0]\n", - " # make sure shapes are compatible with matmul\n", - " assert K0 == K1, \"Matrix shapes are not compatible with matmul.\"\n", - " K = K0\n", - " # convert binary inputs to bipolar\n", - " inp0_bipolar = 2.0 * inp0 - 1.0\n", - " inp1_bipolar = 2.0 * inp1 - 1.0\n", - " # call regular numpy matrix multiplication\n", - " out = np.matmul(inp0_bipolar, inp1_bipolar)\n", - " # XNOR-popcount does not produce the regular dot product result --\n", - " # it returns the number of +1s after XNOR. let P be the number of +1s\n", - " # and N be the number of -1s. XNOR-popcount returns P, whereas the\n", - " # regular dot product result from numpy is P-N, so we need to apply\n", - " # some correction.\n", - " # out = P-N\n", - " # K = P+N\n", - " # out + K = 2P, so P = (out + K)/2\n", - " return (out + K) * 0.5\n", - "\n" - ] - } - ], - "source": [ - "from finn.custom_op.general.xnorpopcount import xnorpopcountmatmul\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.general.xnorpopcount import xnorpopcountmatmul\n", "showSrc(xnorpopcountmatmul)" ] }, @@ -145,12 +100,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "\n", "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" @@ -158,25 +113,17 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import finn.core.onnx_exec as oxe\n", - "output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n", + "output_dict = oxe.execute_onnx(model_for_sim, input_dict, return_full_exec_context=False)\n", "output_pysim = output_dict[list(output_dict.keys())[0]]\n", "\n", "\n", "\n", - "if np.isclose(output_pysim, output_golden, atol=1e-3).all():\n", + "if np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", " print(\"Results are the same!\")\n", "else:\n", " print(\"The results are not the same!\")" @@ -195,12 +142,12 @@ "source": [ "## Simulation (cppsim) using C++\n", "\n", - "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in an .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. 
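To make the XNOR-popcount identity from the listing above concrete: with K bit positions, the bipolar dot product equals P - N while K = P + N, so the number of matching bits is P = (out + K) / 2. A tiny numeric check (inputs invented for illustration):

```python
import numpy as np

a = np.array([[1.0, 0.0, 1.0, 1.0]])        # binary row vector, K = 4
b = np.array([[1.0], [1.0], [0.0], [1.0]])  # binary column vector

K = a.shape[-1]
out = np.matmul(2.0 * a - 1.0, 2.0 * b - 1.0)  # bipolar dot product = P - N
print((out + K) * 0.5)                         # [[2.]] -> two bit positions agree
```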
For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -218,13 +165,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim\n", "from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim\n", - "from finn.transformation.general import GiveUniqueNodeNames\n", + "from qonnx.transformation.general import GiveUniqueNodeNames\n", "\n", "model_for_cppsim = model_for_cppsim.transform(GiveUniqueNodeNames())\n", "model_for_cppsim = model_for_cppsim.transform(PrepareCppSim())\n", @@ -240,38 +187,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving '/workspace/finn/tfc_w1_a1_for_cppsim.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model_for_cppsim.save(build_dir+\"/tfc_w1_a1_for_cppsim.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_for_cppsim.onnx\")" @@ -290,20 +208,11 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "compile.sh\t\t\t memblock_0.dat thresh.h\r\n", - "execute_StreamingFCLayer_Batch.cpp node_model\t weights.npy\r\n" - ] - } - ], - "source": [ - "from finn.custom_op.registry import getCustomOp\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.registry import getCustomOp\n", "\n", "fc0 = model_for_cppsim.graph.node[1]\n", "fc0w = getCustomOp(fc0)\n", @@ -327,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -341,33 +250,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before the model can be executed using `execute_onnx`, we integrate the child model in the parent model. The function reads then the `exec_mode` and writes the input into the correct directory in a .npy file. 
To be able to read this in C++, there is an additional .hpp file ([npy2apintstream.hpp](https://github.com/Xilinx/finn/blob/master/src/finn/data/cpp/npy2apintstream.hpp)) in FINN, which uses cnpy to read .npy files and convert them into streams, or to read a stream and write it into an .npy. [cnpy](https://github.com/rogersce/cnpy) is a helper to read and write .npy and .npz formates in C++.\n", + "Before the model can be executed using `execute_onnx`, we integrate the child model in the parent model. The function reads then the `exec_mode` and writes the input into the correct directory in a .npy file. To be able to read this in C++, there is an additional .hpp file ([npy2apintstream.hpp](https://github.com/Xilinx/finn/blob/main/src/finn/qnn-data/cpp/npy2apintstream.hpp)) in FINN, which uses cnpy to read .npy files and convert them into streams, or to read a stream and write it into an .npy. [cnpy](https://github.com/rogersce/cnpy) is a helper to read and write .npy and .npz formates in C++.\n", "\n", "The result is again compared to the \"golden\" output." ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "parent_model = ModelWrapper(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n", - "sdp_node = parent_model.graph.node[2]\n", + "sdp_node = parent_model.graph.node[1]\n", "child_model = build_dir + \"/tfc_w1_a1_for_cppsim.onnx\"\n", "getCustomOp(sdp_node).set_nodeattr(\"model\", child_model)\n", "output_dict = oxe.execute_onnx(parent_model, input_dict)\n", "output_cppsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_cppsim, output_golden, atol=1e-3).all():\n", + "if np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", " print(\"Results are the same!\")\n", "else:\n", " print(\"The results are not the same!\")" @@ -404,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -428,49 +329,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next step is to load the parent model and set the node attribute `model` in the StreamingDataflowPartition node (`sdp_node`). Afterwards the `exec_mode` is set in the parent model in each node." + "The next step is to load the parent model and set the node attribute `model` in the StreamingDataflowPartition node (`sdp_node`). Afterwards the `exec_mode` is set in the parent model in each node and the model can be executed." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# parent model\n", "model_for_rtlsim = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_parent.onnx\")\n", "# reference child model\n", - "sdp_node = getCustomOp(model_for_rtlsim.graph.node[2])\n", + "sdp_node = getCustomOp(model_for_rtlsim.graph.node[1])\n", "sdp_node.set_nodeattr(\"model\", build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n", "\n", "model_for_rtlsim = model_for_rtlsim.transform(SetExecMode(\"rtlsim\"))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because the necessary files for the emulation are already generated in Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb), in the next step the execution of the model can be done directly." 
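Since the parent/child wiring just described recurs in every simulation mode of this notebook, a condensed sketch of the pattern may help; it assumes build_dir, input_dict and the child model produced in the cells above:

```python
# Point the StreamingDataflowPartition node of the parent at a child model,
# then execute the parent; FINN dispatches into the child automatically.
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp
import finn.core.onnx_exec as oxe

parent_model = ModelWrapper(build_dir + "/tfc_w1_a1_dataflow_parent.onnx")
sdp_node = parent_model.graph.node[1]  # the StreamingDataflowPartition node
getCustomOp(sdp_node).set_nodeattr("model",
                                   build_dir + "/tfc_w1_a1_for_cppsim.onnx")
output_dict = oxe.execute_onnx(parent_model, input_dict)
output = output_dict[list(output_dict.keys())[0]]
```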
- ] - }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n", "output_rtlsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_rtlsim, output_golden, atol=1e-3).all():\n", + "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", " print(\"Results are the same!\")\n", "else:\n", " print(\"The results are not the same!\")" @@ -487,24 +373,9 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_3\n", - " warnings.warn(\"Using pre-existing IP for %s\" % node.name)\n", - "/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_1\n", - " warnings.warn(\"Using pre-existing IP for %s\" % node.name)\n", - "/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_2\n", - " warnings.warn(\"Using pre-existing IP for %s\" % node.name)\n", - "/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_0\n", - " warnings.warn(\"Using pre-existing IP for %s\" % node.name)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n", @@ -519,43 +390,30 @@ "child_model = child_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))\n", "child_model = child_model.transform(PrepareRTLSim())\n", "child_model.set_metadata_prop(\"exec_mode\",\"rtlsim\")\n", - "child_model.save(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")" + "child_model.save(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\");" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# parent model\n", "model_for_rtlsim = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_parent.onnx\")\n", "# reference child model\n", - "sdp_node = getCustomOp(model_for_rtlsim.graph.node[2])\n", + "sdp_node = getCustomOp(model_for_rtlsim.graph.node[1])\n", "sdp_node.set_nodeattr(\"model\", build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results are the same!\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n", - "output_rtlsim = output_dict[list(output_dict.keys())[0]]\n", - "\n", - "if np.isclose(output_rtlsim, output_golden, atol=1e-3).all():\n", - " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "output_rtlsim = output_dict[list(output_dict.keys())[0]]" ] }, { @@ -563,7 +421,12 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "if np.isclose(output_rtlsim, 
np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + " print(\"Results are the same!\")\n", + "else:\n", + " print(\"The results are not the same!\")" + ] } ], "metadata": { @@ -582,7 +445,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 2c9f4a99ed..5625a6f1c2 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -47,6 +47,7 @@ "-------------\n", "\n", "* [Load the UNSW_NB15 Dataset](#load_dataset) \n", + "* [Define a PyTorch Device](#define_pytorch_device)\n", "* [Define the Quantized MLP Model](#define_quantized_mlp)\n", "* [Define Train and Test Methods](#train_test)\n", " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", @@ -57,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +77,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load the UNSW_NB15 Dataset \n", + "# Load the UNSW_NB15 Dataset \n", "\n", "### Dataset Quantization \n", "\n", @@ -103,27 +104,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-10-12 15:49:17-- https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1\n", - "Resolving zenodo.org (zenodo.org)... 137.138.76.77\n", - "Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 13391907 (13M) [application/octet-stream]\n", - "Saving to: ‘unsw_nb15_binarized.npz’\n", - "\n", - "unsw_nb15_binarized 100%[===================>] 12.77M 3.56MB/s in 3.7s \n", - "\n", - "2021-10-12 15:49:22 (3.44 MB/s) - ‘unsw_nb15_binarized.npz’ saved [13391907/13391907]\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "! wget -O unsw_nb15_binarized.npz https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1" ] @@ -137,18 +120,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Samples in each set: train = 175341, test = 82332\n", - "Shape of one input sample: torch.Size([593])\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "from torch.utils.data import TensorDataset\n", @@ -183,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -198,18 +172,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input shape for 1 batch: torch.Size([1000, 593])\n", - "Label shape for 1 batch: torch.Size([1000])\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "count = 0\n", "for x,y in train_quantized_loader:\n", @@ -220,6 +185,25 @@ " break" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define a PyTorch Device \n", + "\n", + "GPUs can significantly speed-up training of deep neural networks. 
We check for availability of a GPU and if so define it as target device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "print(\"Target device: \" + str(device))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -236,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -258,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -282,7 +266,9 @@ " nn.Dropout(0.5),\n", " QuantReLU(bit_width=act_bit_width),\n", " QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n", - ")\n" + ")\n", + "\n", + "model.to(device)" ] }, { @@ -302,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -313,6 +299,7 @@ " \n", " for i, data in enumerate(train_loader, 0): \n", " inputs, target = data\n", + " inputs, target = inputs.to(device), target.to(device)\n", " optimizer.zero_grad() \n", " \n", " # forward pass\n", @@ -324,14 +311,14 @@ " optimizer.step()\n", " \n", " # keep track of loss value\n", - " losses.append(loss.data.numpy()) \n", + " losses.append(loss.data.cpu().numpy()) \n", " \n", " return losses" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -347,12 +334,13 @@ " with torch.no_grad():\n", " for data in test_loader:\n", " inputs, target = data\n", + " inputs, target = inputs.to(device), target.to(device)\n", " output_orig = model(inputs.float())\n", " # run the output through sigmoid\n", " output = torch.sigmoid(output_orig) \n", " # compare against a threshold of 0.5 to generate 0/1\n", - " pred = (output.detach().numpy() > 0.5) * 1\n", - " target = target.float()\n", + " pred = (output.detach().cpu().numpy() > 0.5) * 1\n", + " target = target.cpu().float()\n", " y_true.extend(target.tolist()) \n", " y_pred.extend(pred.reshape(-1).tolist())\n", " \n", @@ -384,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -402,30 +390,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# loss criterion and optimizer\n", - "criterion = nn.BCEWithLogitsLoss()\n", + "criterion = nn.BCEWithLogitsLoss().to(device)\n", "optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Training loss = 0.132918 test accuracy = 0.798341: 100%|██████████| 10/10 [00:44<00:00, 4.45s/it]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "from sklearn.metrics import accuracy_score\n", @@ -450,24 +428,9 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAofElEQVR4nO3de3Rd5X3m8e+jo5slSzq2JRMsHWMbTIi5SLSGXEsTSlpIO8C0uUCbhLRpmXRKmpY2DWlmpR2mWSuFTpJ2SjowJSSZkFJCksaTQsiNQNKEBBOMb9yM8N1g+Spbsu6/+eNsiSMhyTq2js+R9HzW0tLe776c3z4herz3u/e7FRGYmZlNVVmxCzAzs5nFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJ0ASQ9Ium66182zhjdL2jnd+zU7nvJiF2B2qkg6mjNbA/QCg8n8f4mIu6e6r4i4ohDrms0EDg6bMyJi/vC0pK3A70fEd8euJ6k8IgZOZW1mM4kvVdmcN3zJR9JHJL0I3CVpgaRvSuqQdDCZbsnZ5geSfj+Zfp+kH0n6u2TdFyRdcYLrLpf0iKQjkr4r6TZJX5ricbwm+axDkjZJujJn2dskbU72u0vSnyftjcmxHZJ0QNIPJfnvgk3K/4GYZb0KWAicAVxP9v8bdyXzS4FjwD9Osv1rgWeARuAW4E5JOoF1vwz8DFgE/DXwnqkUL6kC+H/At4HFwAeBuyW9OlnlTrKX4+qA84DvJ+1/BuwEmoDTgL8EPA6RTcrBYZY1BPxVRPRGxLGI2B8RX42I7og4AnwC+OVJtt8WEf8nIgaBLwCnk/1DPOV1JS0FLgI+HhF9EfEjYM0U638dMB/4ZLLt94FvAtcmy/uBVZLqI+JgRPw8p/104IyI6I+IH4YHsLPjcHCYZXVERM/wjKQaSbdL2iapE3gESEtKTbD9i8MTEdGdTM7Pc90lwIGcNoAdU6x/CbAjIoZy2rYBzcn0bwFvA7ZJeljS65P2W4EtwLcltUu6aYqfZ3OYg8Msa+y/sv8MeDXw2oioBy5J2ie6/DQd9gALJdXktGWmuO1uIDOmf2IpsAsgIh6LiKvIXsb6N+DepP1IRPxZRKwArgRulPQrJ3cYNts5OMzGV0e2X+OQpIXAXxX6AyNiG7AW+GtJlclZwX+a4uY/BbqBv5BUIenNybb3JPv6HUkNEdEPdJK9NIek35B0VtLHcpjs7clD436CWcLBYTa+zwDzgH3Ao8C3TtHn/g7wemA/8DfAv5J93mRSEdFHNiiuIFvzZ4H3RsTTySrvAbYml90+kHwOwErgu8BR4CfAZyPioWk7GpuV5H4ws9Il6V+BpyOi4Gc8ZlPlMw6zEiLpIklnSiqTdDlwFdk+CbOS4SfHzUrLq4CvkX2OYyfwhxHxRHFLMhvNl6rMzCwvvlRlZmZ5mROXqhobG2PZsmXFLsPMbEZ5/PHH90VE09j2OREcy5YtY+3atcUuw8xsRpG0bbx2X6oyM7O8ODjMzCwvDg4zM8uLg8PMzPLi4DAzs7w4OMzMLC8ODjMzy4uDYxLfWLeLLz067m3MZmZzloNjEt/a+CJ3PNJe7DLMzEqKg2MSrZk02w90c6Crr9ilmJmVDAfHJFpb0gA8ueNQUeswMyslDo5JXNDSQJlgnYPDzGyEg2MStVXlrFxcx5M7DxW7FDOzkuHgOI62TJondxzCL7wyM8tycBxHaybNwe5+th/oLnYpZmYloaDBIelySc9I2iLppnGWf0DSBknrJP1I0qqcZR9NtntG0q9NdZ/TrTXTALifw8xsWMGCQ1IKuA24AlgFXJsbDIkvR8T5EdEG3AJ8Ktl2FXANcC5wOfBZSakp7nNavfq0OqoryhwcZmaJQp5xXAxsiYj2iOgD7gGuyl0hIjpzZmuB4Y6Eq4B7IqI3Il4AtiT7O+4+p1t5qozzmxt8S66ZWaKQwdEM7MiZ35m0jSLpjyQ9T/aM44+Ps+2U9pns93pJayWt7ejoOOGDgOzzHBt3d9I/OHRS+zEzmw2K3jkeEbdFxJnAR4D/No37vSMiVkfE6qamV7xrPS9tS9P0DQzx9J4j01SdmdnMVcjg2AVkcuZbkraJ3ANcfZxt893ntBh+gnydn+cwMytocDwGrJS0XFIl2c7uNbkrSFqZM/vrwHPJ9BrgGklVkpYDK4GfTWWfhdCyYB6LaitZt/1QoT/KzKzklRdqxxExIOkG4EEgBXwuIjZJuhlYGxFrgBskXQb0AweB65JtN0m6F9gMDAB/FBGDAOPts1DHMExS9kFAn3GYmRUuOAAi4n7g/jFtH8+Z/tAk234C+MRU9nkqtGbSfP+ZvXT29FNfXXGqP97MrGQUvXN8pmjLpImADTsPF7sUM7OicnBM0QUtfoLczAwcHFOWrqlkeWOtHwQ0sznPwZGHtkyadR4p18zmOAdHHlpbGth7pJcXO3uKXYqZWdE4OPLQmkkDfpWsmc1tDo48rFpST0VKPOHgMLM5zMGRh6ryFKtOr/cZh5nNaQ6OPLVm0mzYeZjBIXeQm9nc5ODIU1smTVffIFv2Hi12KWZmReHgyJM7yM1srnNw5Gn5olrqq8s9xLqZzVkOjjyVlYnWTNpDrJvZnOXgOAGtLWmeeekIx/oGi12Kmdkp5+A4AW2ZNINDwcbdHinXzOYeB8cJuCCTHSnXHeRmNhc5OE7A4rpqmtPzPMS6mc1JDo4TNDxSrpnZXOPgOEGtmQZ2HjzGvqO9xS7FzOyUKmhwSLpc0jOStki6aZzlN0raLGm9pO9JOiNpf4ukdTk/PZKuTpZ9XtILOcvaCnkME2ltSQOw3s9zmNkcU7DgkJQCbgOuAFYB10paNWa1J4DVEXEBcB9wC0BEPBQRbRHRBlwKdAPfztnuw8PLI2JdoY5hMue3NFAm/DyHmc05hTzjuBjYEhHtEdEH3ANclbtCEhDdyeyjQMs4+3k78EDOeiWhprKcs0+rY91O35JrZnNLIYOjGdiRM78zaZvI+4EHxmm/BviXMW2fSC5vfVpS1Xg7k3S9pLWS1nZ0dORT95S1ZdI86VfJmtkcUxKd45LeDawGbh3TfjpwPvBgTvNHgXOAi4CFwEfG22dE3BERqyNidVNTU0HqbsukOXysn637S+pkyMysoAoZHLuATM58S9I2iqTLgI8BV0bE2FuU3gl8PSL6hxsiYk9k9QJ3kb0kVhQeKdfM5qJCBsdjwEpJyyVVkr3ktCZ3BUkXAreTDY294+zjWsZcpkrOQpAk4Gpg4/SXPjVnn1ZHTWXKz3OY2ZxSXqgdR8SApBvIXmZKAZ+LiE2SbgbWRsQaspem5gNfyeYA2yPiSgBJy8iesTw8Ztd3S2oCBKwDPlCoYzieVJk4r7nBwWFmc0rBggMgIu4H7h/T9vGc6csm2XYr43SmR8Sl01jiSWvLpPn8f2ylb2CIyvKS6DIyMyso/6U7SW2ZNH2DQzy1p7PYpZiZnRIOjpM00kHuJ8jNbI5wcJykJQ3VNM6vcj+Hmc0ZDo6TJMkj5ZrZnOLgmAZtmQbaO7o4fKz/+Cubmc
1wDo5pMNzPscHjVpnZHODgmAYXJEOsr9txsLiFmJmdAg6OadAwr4IVTbWs2+EzDjOb/Rwc02S4g9wj5ZrZbOfgmCZtmTT7jvay+3BPsUsxMysoB8c0GX6VrEfKNbPZzsExTV5zej2VqTIHh5nNeg6OaVJZXsaqJfU84eAws1nOwTGN2jJpNuw8zMDgULFLMTMrGAfHNGrLpDnWP8hze48WuxQzs4JxcEwjv0rWzOYCB8c0WraohoZ5FR5i3cxmNQfHNJJEaybNE9sPFbsUM7OCcXBMs7aWBp596QjdfQPFLsXMrCAKGhySLpf0jKQtkm4aZ/mNkjZLWi/pe5LOyFk2KGld8rMmp325pJ8m+/xXSZWFPIZ8tS1NMxSwcZdfJWtms1PBgkNSCrgNuAJYBVwradWY1Z4AVkfEBcB9wC05y45FRFvyc2VO+98Cn46Is4CDwPsLdQwnwiPlmtlsV8gzjouBLRHRHhF9wD3AVbkrRMRDEdGdzD4KtEy2Q0kCLiUbMgBfAK6ezqJPVuP8KloWzONJj5RrZrNUIYOjGdiRM78zaZvI+4EHcuarJa2V9Kikq5O2RcChiBjuQJhwn5KuT7Zf29HRcUIHcKL8Klkzm81KonNc0ruB1cCtOc1nRMRq4LeBz0g6M599RsQdEbE6IlY3NTVNY7XH15ZJs+vQMTqO9J7SzzUzOxUKGRy7gEzOfEvSNoqky4CPAVdGxMhf2ojYlfxuB34AXAjsB9KSyifbZ7H5QUAzm80KGRyPASuTu6AqgWuANbkrSLoQuJ1saOzNaV8gqSqZbgTeCGyO7FuSHgLenqx6HfCNAh7DCTlvSQOpMvlBQDOblQoWHEk/xA3Ag8BTwL0RsUnSzZKG75K6FZgPfGXMbbevAdZKepJsUHwyIjYnyz4C3ChpC9k+jzsLdQwnal5lilefVud+DjOblcqPv8qJi4j7gfvHtH08Z/qyCbb7MXD+BMvayd6xVdJaM2n+ff1uhoaCsjIVuxwzs2lTEp3js9GFmTSdPQNs3d9V7FLMzKaVg6NAhjvIfbnKzGYbB0eBnLV4PrWVKd9ZZWazjoOjQFJl4vyWBp9xmNms4+AooNZMms17OukdGCx2KWZm08bBUUAXZtL0DwZP7TlS7FLMzKaNg6OARjrIt3ukXDObPRwcBfSq+moW11Xx5E6PlGtms4eDo4Ak0ZZJ+84qM5tVHBwF1ppJ076vi8Pd/cUuxcxsWjg4CqxteKRcD3hoZrOEg6PAzm9pQPIQ62Y2ezg4Cqy+uoIzm+b7QUAzmzUcHKdAa0uaJ3ceIvs6ETOzmc3BcQq0LU2z72gfuw4dK3YpZmYnzcFxCrS1pAGPlGtms4OD4xQ45/Q6KsvL3EFuZrOCg+MUqEiVcd6Sep9xmNms4OA4RVozaTbsOszA4FCxSzEzOylTCg5JtZLKkumzJV0pqWIK210u6RlJWyTdNM7yGyVtlrRe0vcknZG0t0n6iaRNybJ35WzzeUkvSFqX/LRN+WiLqC2Tpqd/iGdfOlrsUszMTspUzzgeAaolNQPfBt4DfH6yDSSlgNuAK4BVwLWSVo1Z7QlgdURcANwH3JK0dwPvjYhzgcuBz0hK52z34YhoS37WTfEYiqrNr5I1s1liqsGhiOgGfhP4bES8Azj3ONtcDGyJiPaI6APuAa7KXSEiHkr2C/Ao0JK0PxsRzyXTu4G9QNMUay1JSxfWsKCmwh3kZjbjTTk4JL0e+B3g35O21HG2aQZ25MzvTNom8n7ggXE++GKgEng+p/kTySWsT0uqmqDg6yWtlbS2o6PjOKUWniRaM2mPWWVmM95Ug+NPgI8CX4+ITZJWAA9NVxGS3g2sBm4d03468H+B342I4V7ljwLnABcBC4GPjLfPiLgjIlZHxOqmptI4WWltSfPsS0fo6h0odilmZidsSsEREQ9HxJUR8bdJJ/m+iPjj42y2C8jkzLckbaNIugz4GHBlRPTmtNeTPbv5WEQ8mlPLnsjqBe4ie0lsRmjLpBkK2LDLL3Yys5lrqndVfVlSvaRaYCOwWdKHj7PZY8BKScslVQLXAGvG7PdC4HayobE3p70S+DrwxYi4b8w2pye/BVyd1DMjDL9K1v0cZjaTTfVS1aqI6CT7h/oBYDnZO6smFBEDwA3Ag8BTwL3JZa6bJV2ZrHYrMB/4SnJr7XCwvBO4BHjfOLfd3i1pA7ABaAT+ZorHUHQLaytZurDGd1aZ2YxWPsX1KpLnNq4G/jEi+iUdd6jXiLgfuH9M28dzpi+bYLsvAV+aYNmlU6y5JLVl0qzdeqDYZZiZnbCpnnHcDmwFaoFHkgf1OgtV1GzWmkmz+3APezt7il2KmdkJmWrn+D9ERHNEvC3pmN4GvKXAtc1KbZkGwA8CmtnMNdXO8QZJnxp+LkLS/yR79mF5OndJA+Vl8vMcZjZjTfVS1eeAI2Q7rd9J9jLVXYUqajarrkhxzul1PuMwsxlrqp3jZ0bEb+XM/3dJ6wpQz5zQ2pJmzbrdDA0FZWUqdjlmZnmZ6hnHMUlvGp6R9EbA70E9QW2ZNEd6B2jf11XsUszM8jbVM44PAF+U1JDMHwSuK0xJs1/uSLlnLZ5f3GLMzPI01buqnoyIVuAC4IKIuBCY0c9TFNOKpvnMryr3E+RmNiPl9QbAiOhMniAHuLEA9cwJqTJxQUuD76wysxnpZF4d617dk9CaSfPUnk56+geLXYqZWV5OJjiOO+SITay1JU3/YLB5jx/AN7OZZdLOcUlHGD8gBMwrSEVzxIVL00B2pNxfWLqguMWYmeVh0uCIiLpTVchcc1p9Na+qr/aDgGY245zMpSo7SW2ZtO+sMrMZx8FRRK2ZNFv3d3Oou6/YpZiZTZmDo4haPVKumc1ADo4iuqAljQRP7vA7yM1s5nBwFNH8qnJWLp7vBwHNbEZxcBRZa0uadTsOEeHHYsxsZihocEi6XNIzkrZIummc5TdK2ixpvaTvJa+kHV52naTnkp/rctp/UdKGZJ//IGlGP8HetjTNga4+dh70YMNmNjMULDgkpYDbgCuAVcC1klaNWe0JYHVEXADcB9ySbLsQ+CvgtcDFwF9JGn5K7p+APwBWJj+XF+oYToXWljQAT7iD3MxmiEKecVwMbImI9ojoA+4BrspdISIeiojuZPZRoCWZ/jXgOxFxICIOAt8BLpd0OlAfEY9G9trOF4GrC3gMBffqV9VRVV7m5znMbMYoZHA0Azty5ncmbRN5P/DAcbZtTqaPu09J1w+/I72joyPP0k+dilQZ5zc3ODjMbMYoic5xSe8GVgO3Ttc+I+KOiFgdEaubmpqma7cF0ZpJs2HXYfoHh4pdipnZcRUyOHYBmZz5lqRtFEmXAR8DroyI3uNsu4uXL2dNuM+Zpi2TpndgiGdePFLsUszMjquQwfEYsFLSckmVwDXAmtwVJF0I3E42NPbmLHoQ+FVJC5JO8V8FHoyIPUCnpNcld1O9F/hGAY/hlBh+layf5zCzmaBgwRERA8ANZEPgKeDeiNgk6WZJVyar3QrMB74iaZ2kNcm2B4D/QTZ8HgNuTtoA/
ivwz8AW4Hle7heZsVoWzGNhbSXrth8qdilmZsc16bDqJysi7gfuH9P28ZzpyybZ9nPA58ZpXwucN41lFp2k7Ei5PuMwsxmgJDrHLfs8x3N7j3K0d6DYpZiZTcrBUSJaMw1EwHqfdZhZiXNwlIiRDnKPlGtmJc7BUSLSNZUsW1TjBwHNrOQ5OEpIaybtlzqZWclzcJSQtkyaFzt7ePFwT7FLMTObkIOjhLQm/Rw+6zCzUubgKCGrTq+nIiU/z2FmJc3BUUKqK1K85vR6d5CbWUlzcJSY1pY063ceZnDIr5I1s9Lk4CgxbZk0R3sHaO84WuxSzMzG5eAoMe4gN7NS5+AoMSsaa6mrLndwmFnJcnCUmLIy0drikXLNrHQ5OEpQa6aBp/ccoad/sNilmJm9goOjBLW2pBkYCjbt9oCHZlZ6HBwlqG2kg9zBYWalx8FRghbXV7OkodoPAppZSXJwlCiPlGtmpaqgwSHpcknPSNoi6aZxll8i6eeSBiS9Paf9LZLW5fz0SLo6WfZ5SS/kLGsr5DEUS1smzfYD3Rzo6it2KWZmoxQsOCSlgNuAK4BVwLWSVo1ZbTvwPuDLuY0R8VBEtEVEG3Ap0A18O2eVDw8vj4h1hTmC4modeSPgoaLWYWY2ViHPOC4GtkREe0T0AfcAV+WuEBFbI2I9MDTJft4OPBAR3YUrtfSc39xAmfwEuZmVnkIGRzOwI2d+Z9KWr2uAfxnT9glJ6yV9WlLViRZYymqryjn7tDo/CGhmJaekO8clnQ6cDzyY0/xR4BzgImAh8JEJtr1e0lpJazs6OgpeayG0tqR5cschIjxSrpmVjkIGxy4gkzPfkrTl453A1yOif7ghIvZEVi9wF9lLYq8QEXdExOqIWN3U1JTnx5aGtqVpDnb3s/3AnLpKZ2YlrpDB8RiwUtJySZVkLzmtyXMf1zLmMlVyFoIkAVcDG0++1NLU2pIG3M9hZqWlYMEREQPADWQvMz0F3BsRmyTdLOlKAEkXSdoJvAO4XdKm4e0lLSN7xvLwmF3fLWkDsAFoBP6mUMdQbGefNp95FSkHh5mVlPJC7jwi7gfuH9P28Zzpx8hewhpv262M05keEZdOb5WlqzxVxvnNDb4l18xKSkl3jlt2pNyNuzvpH5zsjmUzs1PHwVHiWjNp+gaGeHrPkWKXYmYGODhK3shIuX6ew8xKhIOjxDWn59Gcnsct33qaf/jecxzp6T/+RmZmBeTgKHGS+MLvXcwbzlzEp77zLJfc8hC3P/w8x/r8dkAzKw7NhaeSV69eHWvXri12GSdt/c5DfOo7z/KDZzponF/FDW85k2tfu5Sq8lSxSzOzWUjS4xGx+hXtDo6ZZ+3WA/zdt5/h0fYDLGmo5oZLV/KO1S1UpHwCaWbTx8Exi4Jj2I+37OPWbz/DE9sPsXRhDX9y2UquamsmVaZil2Zms8BEweF/os5gbzirka/94Ru4630XUVddzo33Psmvfvphvrl+N0NDs/8fBGZWHA6OGU4SbzlnMd/84Jv43+/+BVJl4oYvP8Hb/uGHfGfzSx5Z18ymnYNjlpDE5eedzgMfuoS/v6aNnv5B/uCLa7n6tv/gkWc7HCBmNm0cHLNMqkxc1dbMd2/8ZW75rQvYd7SP937uZ7zr9kf5afv+YpdnZrOAO8dnud6BQe59bAf/6/tb2Hukl19a2ciNbz2bC5cuKHZpZlbifFfVHA2OYT39g3zp0W189gfPc6Crj8tes5g/fevZnLukodilmVmJcnDM8eAY1tU7wOd/vJXbH36ezp4Bfv380/nTt67krMV1xS7NzEqMg8PBMcrhY/3c+cN27vzRCxzrH+TqtmY+dNlKzlhUW+zSzKxEODgcHOM60NXH7Q8/zxd+spX+weAdv9jCB39lJc3pecUuzcyKzMHh4JjU3s4ePvuD5/nyT7cDcO3FGf7oLWexuL66yJWZWbE4OBwcU7Lr0DH+8fvP8ZW1O0mVievesIwP/PKZLKytLHZpZnaKOTgcHHnZtr+Lv//uc3x93S5qKlL83puW8/u/tIKGeRXFLs3MTpGiBIeky4G/B1LAP0fEJ8csvwT4DHABcE1E3JezbBDYkMxuj4grk/blwD3AIuBx4D0R0TdZHQ6OE/fcS0f4zHef49837KGuupxLVjZxbnM95zc3cN6SBhb4TMRs1jrlwSEpBTwLvBXYCTwGXBsRm3PWWQbUA38OrBkTHEcjYv44+70X+FpE3CPpfwNPRsQ/TVaLg+Pkbdp9mP/zSDuPbz/IjgPHRtqb0/M4LwmSc5MwaaqrKmKlZjZdJgqO8gJ+5sXAlohoTwq4B7gKGAmOiNiaLBuayg4lCbgU+O2k6QvAXwOTBoedvHOXNPCZay4E4FB3H5t2d7Jh12E27jrMpt2dPLjppZF1X1VfzXnN9Zy7pCF7ZtLcwGn1VWT/5zOzma6QwdEM7MiZ3wm8No/tqyWtBQaAT0bEv5G9PHUoIgZy9tk83saSrgeuB1i6dGl+lduk0jWVvPGsRt54VuNIW2dPP5t3d7IxCZONuzv53tN7GT6hbZxfxXnN9Zy3JBsk5zXX05ye5zAxm4EKGRwn64yI2CVpBfB9SRuAw1PdOCLuAO6A7KWqAtVoifrqCl63YhGvW7FopK2rd4Cn9mTDZMOuTjbtPswPn9vHYPKukAU1FZzX3JBzZlLP0oU1DhOzElfI4NgFZHLmW5K2KYmIXcnvdkk/AC4EvgqkJZUnZx157dNOrdqqclYvW8jqZQtH2nr6B7NhsruTjTsPs3H3Ye78UTv9g9kwqasuT85K6pMzkwaWL6qlzG81NCsZhQyOx4CVyV1Qu4BreLlvYlKSFgDdEdErqRF4I3BLRISkh4C3k72z6jrgGwWp3gqiuiLFhUsXjBqdt3dgkGdfPMrG3YfZsOswm3Yd5gs/2UbfQLbrq7YyxblLGkbu5jp3SQPLGmuoKk8V6zDM5rRC3477NrK326aAz0XEJyTdDKyNiDWSLgK+DiwAeoAXI+JcSW8AbgeGyL4z5DMRcWeyzxVkQ2Mh8ATw7ojonawO31U18/QPDvHcS9kwGe432bynk57+bJiUCTILa1jeWMuKxvmsaKplRWMtK5rmuyPebJr4AUAHx4w3MDhE+74uNu0+zAsdXTy/r4v2ji5e2Hd0JFAAaipT2UBpms/yxlrObMqGy/KmWuZXlXK3nllpKcbtuGbTqjxVxtmn1XH2aaOHgB8aCl7s7BkJkec7umjf18W6HQf55vrd5P7baHFdFSuaalneOD8bKMl0ZsE8ylN+IabZVDg4bMYrKxNL0vNYkp7Hm1Y2jlrW0z/I9gPdtHdkA+WFfV20dxzlgY17ONTdP7JeRUosXVgzEijDZywrmmpZVFvpS19mORwcNqtVV6TGPUsBONjVR/u+o7QnZyjtHUd5YV8XjzzbQd/gy5e+6qrLWdE0nzMba0ddAlveWMu8SnfQ29zj4LA5a0FtJb9Yu5BfPGPhqPbBoWDXwWMjofLCvi7a9x3lJ+37+doTo+/+flV9NWcsqmHZolrOaEx+L6rhjEXuT7HZy/9l
m42RKhNLF9WwdFENb3716GXdfQPJ5a5soGzb3822/V187+m97Ds6+ua+xvlVLEtCZFmyv2WLalm2qJaGGo8ybDOXg8MsDzWV5dlnSpY0vGLZ0d4Btu3PhsnW/V1s29fNtgNd/Pj5fXz15z2j1k3XVIwEytjfC92nYiXOwWE2TeZXTRwqw530W/flBMv+bh7fdpD/9+RuhnLu/KqrKueMxrGBkr0EtrjOz6hY8Tk4zE6ByTrpewcG2XnwGNv2d7F1X3c2YPZ3sXl3Jw9ufJGBnFSZV5FK+lCG+1NqWTS/kqGhYGAoGBz5PUT/4Oj5gaFgcPDl9fqHhkbNj1ovmR8YHMpZFgyM7DPb3p8zX1NZzuK6KhbXV3NafRWnJb8X11WzuL6KRbVVpDx0zKzg4DArsqryFGc2zefMple8foaBwSF2H+pJzlC62Jr0qTzf0cVDT4+++ysfqTJRnvykykR5qmykLfd3xSvas/NVFeU5+8i2He0dYPfhHtbtOMT+rle+Wy1VJprmV2XDpL6axXU54VJfzWlJwCysqfTYZCXOwWFWwspTZSMd9dA0atlg8uDjwa4+ylPDf9zLcsLg5T/qYwOh0Je7+gaG6Djay0udPezt7GXvkR5e6uzhpc5s244D3azdeoCDOc/SjBxzmUaduSyuywmXnLYFNRW+bFckDg6zGSpVJprT82hOzyt2Ka9QWV42pdp6+gfpOJINlr1JqLx05OXAeWFfF4+2H+DwsVcGTGWqjKa6qpHLYsNhU1ddTlV5GVXlqezvipenqytSLy+rKBuZriwv82W0PDg4zKxoqitSZBbWkFlYM+l6Pf2D2WAZFTAvTz+39yg/2rKPIz0Dk+5nMhUpvRw25WVUDYdMRU7bmMAZG0zD4VRZXkZFaswZYHJJcOzZ38hZYWr89orUmPXKVPRLeQ4OMyt51RWpnEt2EzvWN0hX3wC9A0P09g9mf4+dHhiktz873TPSnvzuz5ketd0gR3sH2H+0b2R5T866w68AOFUkXhk8EwTUnddddNzvLV8ODjObNeZVpooyDMzQUNA3ODqYcu9G6x97d9pQzvzgOO2D49/l9vKdba9sf+VnZNerqpj+wTsdHGZmJ6msTFSXpaiuSAGzf1QAjyNtZmZ5cXCYmVleHBxmZpYXB4eZmeWloMEh6XJJz0jaIummcZZfIunnkgYkvT2nvU3STyRtkrRe0rtyln1e0guS1iU/bYU8BjMzG61gd1VJSgG3AW8FdgKPSVoTEZtzVtsOvA/48zGbdwPvjYjnJC0BHpf0YEQcSpZ/OCLuK1TtZmY2sULejnsxsCUi2gEk3QNcBYwER0RsTZaNenomIp7Nmd4taS/ZgXoOFbBeMzObgkJeqmoGduTM70za8iLpYqASeD6n+RPJJaxPS6qaYLvrJa2VtLajoyPfjzUzswmU9AOAkk4H/i9wXUQMn5V8FHiRbJjcAXwEuHnsthFxR7IcSR2Stp1gGY3AvhPcdjby9/Eyfxej+fsYbTZ8H2eM11jI4NgFZHLmW5K2KZFUD/w78LGIeHS4PSL2JJO9ku7ilf0jrxARTcdbZ5I61kbE6hPdfrbx9/Eyfxej+fsYbTZ/H4W8VPUYsFLSckmVwDXAmqlsmKz/deCLYzvBk7MQlB2I/2pg43QWbWZmkytYcETEAHAD8CDwFHBvRGySdLOkKwEkXSRpJ/AO4HZJm5LN3wlcArxvnNtu75a0AdhA9lTwbwp1DGZm9kqKiOOvNYdJuj7pLzH8feTydzGav4/RZvP34eAwM7O8eMgRMzPLi4PDzMzy4uCYxPHG2porJGUkPSRpczJ+2IeKXVMpkJSS9ISkbxa7lmKTlJZ0n6SnJT0l6fXFrqlYJP1p8v+TjZL+RVJ1sWuabg6OCeSMtXUFsAq4VtKq4lZVNAPAn0XEKuB1wB/N4e8i14fI3jFo8PfAtyLiHKCVOfq9SGoG/hhYHRHnASmyjyLMKg6OiY2MtRURfcDwWFtzTkTsiYifJ9NHyP5RyHv4mNlEUgvw68A/F7uWYpPUQPb2+TsBIqIvZ0DSuagcmCepHKgBdhe5nmnn4JjYtIy1NdtIWgZcCPy0yKUU22eAvwCGjrPeXLAc6ADuSi7d/bOk2mIXVQwRsQv4O7Ijf+8BDkfEt4tb1fRzcNiUSZoPfBX4k4joLHY9xSLpN4C9EfF4sWspEeXALwD/FBEXAl3AnOwTlLSA7JWJ5cASoFbSu4tb1fRzcEzspMbamm0kVZANjbsj4mvFrqfI3ghcKWkr2UuYl0r6UnFLKqqdwM6IGD4LvY9skMxFlwEvRERHRPQDXwPeUOSapp2DY2InPNbWbJOMC3Yn8FREfKrY9RRbRHw0IloiYhnZ/y6+HxGz7l+VUxURLwI7JL06afoVct67M8dsB14nqSb5/82vMAtvFCjpYdWLKSIGJA2PtZUCPhcRm46z2Wz1RuA9wAZJ65K2v4yI+4tXkpWYD5IdR64SaAd+t8j1FEVE/FTSfcDPyd6N+ATJ6x1mEw85YmZmefGlKjMzy4uDw8zM8uLgMDOzvDg4zMwsLw4OMzPLi4PD7DgkHU1+L5P029O8778cM//j6dy/WSE4OMymbhmQV3AkA91NZlRwRMSse8rYZh8Hh9nUfRL4JUnrkncupCTdKukxSesl/RcASW+W9ENJa0ieoJb0b5IeT97TcH3S9kmyo6iuk3R30jZ8dqNk3xslbZD0rpx9/yDn3Rd3J08oI+mTyTtT1kv6u1P+7dic4SfHzabuJuDPI+I3AJIAOBwRF0mqAv5D0vBIqL8AnBcRLyTzvxcRByTNAx6T9NWIuEnSDRHRNs5n/SbQRvbdFo3JNo8kyy4EziU7XPd/AG+U9BTwn4FzIiIkpaf30M1e5jMOsxP3q8B7k2FYfgosAlYmy36WExoAfyzpSeBRsoNnrmRybwL+JSIGI+Il4GHgopx974yIIWAd2Utoh4Ee4E5Jvwl0n+SxmU3IwWF24gR8MCLakp/lOe9e6BpZSXoz2VFTXx8RrWTHLzqZ14n25kwPAuURMUD25WP3Ab8BfOsk9m82KQeH2dQdAepy5h8E/jAZch5JZ0/wAqMG4GBEdEs6h+zrd4f1D28/xg+BdyX9KE1k37D3s4kKS96V0pAMPPmnZC9xmRWE+zjMpm49MJhccvo82fdsLwN+nnRQdwBXj7Pdt4APJP0Qz5C9XDXsDmC9pJ9HxO/ktH8deD3wJBDAX0TEi0nwjKcO+IakarJnQjee0BGaTYFHxzUzs7z4UpWZmeXFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJlZXv4/QAgzW/yBXxUAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", @@ -478,22 +441,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAot0lEQVR4nO3deXxcZ33v8c9Xuyx5X+LEexI7djYSUMOSsrQ0kFJIKFDqpBRyWxrgEkqhhZv09lIaXuXSjcItebUNNMAtS6CBUtObktKyFgq1Q0xCrHHiOIttNIls2dZYsvbf/eMcyWN5JI1tjWak+b5fr3nNOc9Z5qeJc37znOc5z6OIwMzMbLyacgdgZmaVyQnCzMwKcoIwM7OCnCDMzKwgJwgzMyvICcLMzApygjAzs4KcIGzWk3Qs7zUi6Xje+q+dwfm+JenNpYjVbDapK3cAZmcrIlpHlyU9Abw5Iv6tfBGVlqS6iBgqdxw297kGYXOWpBpJt0p6TNIhSV+UtCTd1iTpM2n5EUnbJZ0j6Y+BFwIfS2sgH5vg3P8gKSvpqKTvSLokb1uzpL+Q9GS6/T8kNafbflbS99PP3CfpprT8pFqLpJsk/Ufeekh6u6RHgUfTso+m5+iWdL+kF+btXyvp99O/PZduXyPpDkl/Me5v2SbpXWf/jdtc4wRhc9k7gFcDLwbOAw4Dd6Tb3gQsBNYAS4G3Ascj4n8C3wVuiYjWiLhlgnP/C7ARWAH8CPhs3rY/B54DvABYArwXGJG0Lj3ur4DlwBXAztP4e14NPBe4OF3fnp5jCfA54B8kNaXb3g3cALwCWAD8BtALfBq4QVINgKRlwC+kx5udxLeYbC57K8mFfj+ApPcDT0n6dWCQJDFcGBEPAvefzokj4q7R5fS8hyUtBHIkF+PnRcSBdJfvp/vdCPxbRHw+LT+Uvor1vyOiKy+Gz+Rt+wtJfwBcBPwYeDPw3ojYnW7/8ehnSjoKvBT4OrAV+FZEPH0acViVcA3C5rJ1wD+mt3OOAO3AMHAO8PfAfcDdkn4q6U8l1Rdz0vT2zYfS2zfdwBPppmXpqwl4rMChayYoL9a+cXH8nqT29DbWEZIa0bIiPuvTwBvS5TeQfBdmp3CCsLlsH/CLEbEo79UUEQciYjAi/igiLia5FfRK4I3pcVMNcXwjcD3JrZmFwPq0XMBBoA+4YIJ4CpUD9ADz8tZXFthnLK60veG9wOuBxRGxCDiaxjDVZ30GuF7Ss4AtwFcm2M+qnBOEzWV/A/xxeu8fScslXZ8u/5ykyyTVAt0kt5xG0uOeBs6f5LzzgX6S20PzgA+OboiIEeAu4MOSzktrG8+X1EjSTvELkl4vqU7SUklXpIfuBF4jaZ6kC4HfnOJvmw8MAZ1AnaT3kbQ1jPoE8AFJG5W4XNLSNMb9JO0Xfw98KSKOT/FZVqWcIGwu+yiwDfhXSTngBySNvJD8Qr+HJDm0A9/mxK2WjwKvk3RY0v8pcN7/CzwJHAB2pefN93vAQyQX4S7gT4CaiHiKpNH4d9PyncCz0mP+EhggSU6f5uRG70LuA74GPJLG0sfJt6A+DHwR+Nf0b/w7oDlv+6eBy/DtJZuEPGGQWfWR9CKSW03rwhcBm4BrEGZVJm2MfyfwCScHm4wThFkVkbQFOAKcC3ykrMFYxfMtJjMzK8g1CDMzK2jOPEm9bNmyWL9+fbnDMDObVe6///6DEbG80LY5kyDWr1/Pjh07yh2GmdmsIunJibb5FpOZmRXkBGFmZgU5QZiZWUFOEGZmVpAThJmZFeQEYWZmBZU0QUi6VtJuSXsk3Vpg+1pJ35T0gKQHJb0ib9tt6XG7Jb28lHGamdmpSvYcRDrO/h3ANcB+YLukbRGxK2+3PwC+GBF/Leli4F5gfbq8FbiEZC7hf5O0KSKGSxWvmRUWETzd3c+D+4/w6DPHaKyrYdG8BhbPqx97XzyvgYXN9dTUaOoT2qxRygflrgL2RMReAEl3k8zClZ8gghOTnCwEfpouXw/cHRH9wOOS9qTn+88SxmtmwDPdfTy4/ygPHTjx6sz1T3mcBAubk2SxaN7J7yeSSd5yS7Ktqb52Bv4qOxOlTBCrOHkCk/2cmKxl1PtJJnN5B9BCMoXj6LH5k7DsT8tOIulm4GaAtWvXTkvQZtWkM9fPTw4cTRPCER46cJSnu5NkUCO4YHkrL9y4jMtXLeSy1QvZvHIBQ8PB4d4BjhwfTN57BzjcM5i8946WDfJ0dx+7szkO9w7QOzBx5b+pviZNJg0saq5nccvJNZNCtZUFzfXUVkFtpW9wmEM9A3QdG+BQTz+Hjg3Q1TPAwZ5+usaWB1i/dB4f3XrltH9+uYfauAH4VET8haTnA38v6dJiD46IO4E7Adra2jwsrdkkDh3rT2oEebWDjqN9QPLr//xlLbzggmVcliaDi89dQEtj4UvEwnn1p/XZ/UPDHEmTx6nJJFk+kiaW3dkcR3oHOXJ8kOGRwv9bS9DaUMf8pjrmN9WzoDl5T9ZPLC8Y9z5W3lxPS0Mt0swmmb7BYbp6BjiUXvBPLA/QlSaAQz3ptmMD9EyQWOtrxdKWRpa0NLC0tYFzFzYX3O9slTJBHADW5K2vTsvy/SZwLUBE/KekJmBZkcea2QQO9wycuEWUJoQDR05MPX3+shau2rAkSQarFnLJqoW0TpAMpkNjXS3nLKjlnAVNRR8zMhLk+odOTSY9SfLI9Q3SfXyIXN8gub4hnsn18VjnELm+IbqPDzI0QXIZVSNobTw5mUyVaOY31bMgr7y2RnT1JL/kD/UMcOhY/ynLB9Nf+l09AxzrHyoYS32tkot9SyNLWxtYt3Te2PLSloaxRLC0pZElrQ3Mb6ybkeRWygSxHdgoaQPJxX0rcOO4fZ4CXgp8Kp3IpIlkEvZtwOckfZikkXoj8F8ljNVszOGeAXbuP8IDTx1h574j9PYPsaA5uTAk78mFJHk/dX1+Ux31tTPXg/xo7yA/+enJt4n2dZ1IBuuXzuPZ6xbzphes47JVi7hk1QIWNJ1eDaAcamrEwuZ6FjbXs27p6R0bEfQNjiRJpO9EEulO33Nj70nZaKL56ZE+cv25sW0T1WCmUlcjlrY2sKSlkaUtyQV/SUsDy1qTX/3JcrJ9SUsDC5pm5oJ/ukqWICJiSNItJJOr1wJ3RcTDkm4HdkTENpLJ2z8u6V0k
DdY3pVMgPizpiyQN2kPA292DyUphcHiE3dkcDzx1mAeeOsID+47w+MEeIPmFuemc+SxpaeCZXB97nhm9mAwy1XVjXkNtgURy9gmmu2+Qn4y7TfTkod6x7WuXzOPyVYv4teeu4/K0ZrCwufKTwXSTRHNDLc0NtaxYMPX+hUQEvQPDYwmlUKIZHgkWz2sY+6W/tLWyL/ina87MKNfW1hYe7tumkj3alySDfUd44KnDPHTgKH2DIwAsa23kyrWLkteaxVy+emHBe/CjF47RX56jSWNsPX+5b7DAflP/Mh2fYFoa63iqq3cseQGsXtw81l5w+apFXLpqAYvmNUzvF2ZznqT7I6Kt0LZyN1KblUzf4DAPHTjKA08dZue+5JbRaKNsQ20Nl6xawI1XrRtLCqsWNRf1q08SLY11tDTWce7C04/rTBLM4d4BNp3Tyuues5pL03aDJS1OBlZaThB2kogg291Ha2MdrTPUEDYdIoInDvWyc196q+ipI7R3dI81VK5Z0szPrF+SJoPFbDl3Po115el/f7YJxmymOEHYSe7evo/bvvwQkPzKXtxSnzakpe/z0vfWBpbMaxhrcFvSkvRRr5uhxtnuvkF+nNYKRmsIh3sHAWhpqOVZaxbxlhefz5VrFnPF2kUsa22ckbjM5hInCDvJ/U8eZvG8et72kgvo6hmkK+2r3dUzwEOHj9DVM0B3X+GuepA8STvaLW9xS9JwN/q+ZNzykpYG5jVM/U9weCR45OncWDJ4YN8RHus8RkTSH37jilZedvFKrkhvFW1cMb8qHqIyKzUnCDtJJtvNpasWcvOLLphwn8HhEQ73DNDVmzzh2dU7MPbAz+HepA/44Z4B9nX1Jr/sewYm7JPeVF+T1ERGu/zNO1Fj6RkYZudTR/jx/iNjT+IuaWngyjWLuP5Z53Hl2sVcvmbhrOiyaTYbOUHYmKHhER55+hg3vWD9pPvV19awYkETK4p86Cki6O4bGquJdKUJZPTp0bGaSu8gjx88NvYEaV2NuOS8BfzKc1Zz5drFXLl2EWuXzJs17SJms50ThI154lAPA0MjbF45f1rPK5144GnDspaijukbHEaibA3JZuYEYXnaO3IAbF55hk8WTSOP8GlWfp5RzsZkst3U1YgLVhT3K9/M5jYnCBuT6chxwfJW39YxM8AJwvJksjk2nzu97Q9mNns5QRgAR48PcuDI8YpofzCzyuAEYQDszqYN1K5BmFnKCcIA2J3tBmCLaxBmlnKCMADaszkWzavnnAUes8jMEk4QBkCmo5vNK+f7KWUzG+MEYYyMBLuzOTdQm9lJnCCM/YeP0zMwPO1DbJjZ7OYEYbSnDdSbz3UNwsxOKGmCkHStpN2S9ki6tcD2v5S0M309IulI3rbhvG3bShlntct05JBg0zmt5Q7FzCpIyQbrk1QL3AFcA+wHtkvaFhG7RveJiHfl7f8O4Mq8UxyPiCtKFZ+dkMl2s35pS1GT95hZ9ShlDeIqYE9E7I2IAeBu4PpJ9r8B+HwJ47EJZLI5tz+Y2SlKmSBWAfvy1venZaeQtA7YAHwjr7hJ0g5JP5D06gmOuzndZ0dnZ+c0hV1degeGeOJQj3swmdkpKqWReitwT0QM55Wti4g24EbgI5JOmQMzIu6MiLaIaFu+fPlMxTqnPPJ0Mrezh9gws/FKmSAOAGvy1lenZYVsZdztpYg4kL7vBb7Fye0TNk0yHR5iw8wKK2WC2A5slLRBUgNJEjilN5KkzcBi4D/zyhZLakyXlwFXA7vGH2tnL5PN0dJQy+rFzeUOxcwqTMm6rUTEkKRbgPuAWuCuiHhY0u3AjogYTRZbgbsjIvIO3wL8raQRkiT2ofzeTzZ92ju6uWjlfGpqPMSGmZ2spP0aI+Je4N5xZe8bt/7+Asd9H7islLEZRASZbI5fuvzccodiZhWoUhqprQyy3X0cPT7IFndxNbMCnCCqWKZjdJIgN1Cb2amcIKrY6BhMF7kGYWYFOEFUsUxHjlWLmlnQVF/uUMysAjlBVLFMtpstfkDOzCbgBFGl+oeGeazTQ2yY2cScIKrUnmeOMTwSHmLDzCbkBFGlxnowuQZhZhNwgqhSmWw3jXU1rF86r9yhmFmFcoKoUplsjk3nzKeu1v8EzKwwXx2qVHuHJwkys8k5QVShzlw/B4/1+wlqM5uUE0QV2p1NGqg9BpOZTcYJogplPMSGmRXBCaIKtXfkWDG/kaWtjeUOxcwqmBNEFcpku93+YGZTcoKoMkPDIzz69DG3P5jZlJwgqszjB3sYGB7xEBtmNiUniCrTnvUQG2ZWnJImCEnXStotaY+kWwts/0tJO9PXI5KO5G17k6RH09ebShlnNcl0dFNXIy5Y3lruUMyswtWV6sSSaoE7gGuA/cB2SdsiYtfoPhHxrrz93wFcmS4vAf4QaAMCuD899nCp4q0WmWyOC1e00lDnyqOZTa6UV4mrgD0RsTciBoC7gesn2f8G4PPp8suBr0dEV5oUvg5cW8JYq0amo9tDbJhZUUqZIFYB+/LW96dlp5C0DtgAfON0jpV0s6QdknZ0dnZOS9Bz2dHeQX56tM9dXM2sKJVyn2ErcE9EDJ/OQRFxZ0S0RUTb8uXLSxTa3DH6BLVrEGZWjFImiAPAmrz11WlZIVs5cXvpdI+1ImVGx2ByDcLMilDKBLEd2Chpg6QGkiSwbfxOkjYDi4H/zCu+D3iZpMWSFgMvS8vsLGSy3SyeV8+K+R5iw8ymVrJeTBExJOkWkgt7LXBXRDws6XZgR0SMJoutwN0REXnHdkn6AEmSAbg9IrpKFWu1yGRzbF65AEnlDsXMZoGSJQiAiLgXuHdc2fvGrb9/gmPvAu4qWXBVZmQk2J3N8as/s2bqnc3MqJxGaiuxfYd76R0YdgO1mRXNCaJKtHd4iA0zOz1OEFUik+1Ggk3nuAZhZsVxgqgSmY4cG5a20NxQW+5QzGyWcIKoEskkQa49mFnxJuzFJOk1RRzfl/ZUsgrW0z/Ek129vObZq8sdipnNIpN1c/048E/AZJ3mX8S4bqxWeR55OkeEh9gws9MzWYL4l4j4jckOlvSZaY7HSsBDbJjZmZiwDSIi3jDVwcXsY+WX6eimtbGOVYuayx2Kmc0iRTdSS7pQ0mckfUnS80sZlE2v9myOi1bOp6bGQ2yYWfEma6Ruioi+vKIPAO9Nl78KXFHCuGyaRASZjm5e9azzyh2Kmc0yk9UgvirpjXnrg8B6YB1wWvM2WPl0HO2ju2/IkwSZ2WmbLEFcCyyQ9DVJLwJ+j2Qq0F8Gfm0mgrOzNzpJ0Bb3YDKz0zThLaZ0drePSfp74H8BbwP+ICIem6ng7OyNjsG0yQnCzE7TZG0QzwXeAwwAHwSOA38s6QDwgYg4MiMR2lnJZHOsXtzMgqb6codiZrPMZM9B/C3wCqAV+GREXA1slfRi4Askt5uswmU6uj2Cq5mdkcnaIIY40Sg9MFoYEd+OCCeHWaBvcJi9B3vY4jGYzOwMTFaDuBF4C0lyeOMk+1mF2vPMMYZHwjUIMzsjkzVSPwL87gzGYtNsdIgNj+JqZmd
[... base64-encoded PNG data for a matplotlib figure output removed from the notebook, elided ...]\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "acc_per_epoch = [np.mean(acc_per_epoch) for acc_per_epoch in running_test_acc]\n", "display_loss_plot(acc_per_epoch, title=\"Test accuracy\", ylabel=\"Accuracy [%]\")" @@ -501,27 +451,16 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.798340863819657" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "test(model, test_quantized_loader)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -540,23 +479,16 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import torch\n", "\n", + "# Make sure the model is on CPU before loading a pretrained state_dict\n", + "model = model.cpu()\n", + "\n", + "# Load pretrained weights\n", "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", "\n", "model.load_state_dict(trained_state_dict, strict=False)" @@ -564,23 +496,14 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9188772287810328" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ + "# Move the model back to it's target device\n", + "model.to(device)\n", + "\n", + "# Test for accuracy\n", "test(model, test_quantized_loader)" ] }, @@ -600,6 +523,16 @@ "Sometimes, it's desirable to make some changes to our trained network prior to export (this is known in general as \"network surgery\"). This depends on the model and is not generally necessary, but in this case we want to make a couple of changes to get better results with FINN." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Move the model to CPU before surgery\n", + "model = model.cpu()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -609,20 +542,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(64, 593)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from copy import deepcopy\n", "\n", @@ -634,20 +556,9 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(64, 600)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -658,20 +569,9 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([64, 600])" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "modified_model[0].weight.data = torch.from_numpy(W_new)\n", "modified_model[0].weight.shape" @@ -690,11 +590,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from brevitas.core.quant import QuantType\n", "from brevitas.nn import QuantIdentity\n", "\n", "\n", @@ -702,23 +601,27 @@ " def __init__(self, my_pretrained_model):\n", " super(CybSecMLPForExport, self).__init__()\n", " self.pretrained = my_pretrained_model\n", - " self.qnt_output = QuantIdentity(quant_type=QuantType.BINARY, bit_width=1, min_val=-1.0, max_val=1.0)\n", + " self.qnt_output = QuantIdentity(\n", + " quant_type='binary', \n", + " scaling_impl_type='const',\n", + " bit_width=1, min_val=-1.0, max_val=1.0)\n", " \n", " def forward(self, x):\n", " # assume x contains bipolar {-1,1} elems\n", " # shift from {-1,1} -> {0,1} since that is the\n", " # input range for the trained network\n", - " x = (x + torch.tensor([1.0])) / 2.0 \n", + " x = (x + torch.tensor([1.0]).to(x.device)) / 2.0 \n", " out_original = self.pretrained(x)\n", " out_final = self.qnt_output(out_original) # output as {-1,1} \n", " return out_final\n", "\n", - "model_for_export = CybSecMLPForExport(modified_model)" + "model_for_export = CybSecMLPForExport(modified_model)\n", + "model_for_export.to(device)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -731,16 +634,17 @@ " with torch.no_grad():\n", " for data in test_loader:\n", " inputs, target = data\n", + " inputs, target = inputs.to(device), target.to(device)\n", " # pad inputs to 600 elements\n", - " input_padded = np.pad(inputs, [(0,0), (0,7)])\n", + " input_padded = torch.nn.functional.pad(inputs, (0,7,0,0))\n", " # convert inputs to {-1,+1}\n", - " input_scaled = 2*input_padded - 1\n", + " input_scaled = 2 * input_padded - 1\n", " # run the model\n", - " output = model(torch.from_numpy(input_scaled).float())\n", - " y_pred.extend(list(output.flatten()))\n", + " output = model(input_scaled.float())\n", + " y_pred.extend(list(output.flatten().cpu().numpy()))\n", " # make targets bipolar {-1,+1}\n", - " expected = 2*target.float() - 1\n", - " expected = expected.detach().numpy()\n", + " expected = 2 
* target.float() - 1\n", + " expected = expected.cpu().numpy()\n", " y_true.extend(list(expected.flatten()))\n", " \n", " return accuracy_score(y_true, y_pred)" @@ -748,20 +652,9 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9188772287810328" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "test_padded_bipolar(model_for_export, test_quantized_loader)" ] @@ -780,35 +673,16 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model saved to cybsec-mlp-ready.onnx\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":15: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " x = (x + torch.tensor([1.0])) / 2.0\n", - "/workspace/brevitas/src/brevitas/quant_tensor/__init__.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " training = torch.tensor(training, dtype=torch.bool)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import brevitas.onnx as bo\n", "from brevitas.quant_tensor import QuantTensor\n", "\n", "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", "input_shape = (1, 600)\n", + "\n", "# create a QuantTensor instance to mark input as bipolar during export\n", "input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32)\n", "input_a = 2 * input_a - 1\n", @@ -818,6 +692,10 @@ " input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True\n", ")\n", "\n", + "#Move to CPU before export\n", + "model_for_export.cpu()\n", + "\n", + "# Export to ONNX\n", "bo.export_finn_onnx(\n", " model_for_export, export_path=ready_model_filename, input_t=input_qt\n", ")\n", @@ -837,44 +715,15 @@ "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial `Add` and `Div` layers)\n", "* Brevitas `QuantLinear` layers are exported to ONNX as `MatMul`. 
We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n", "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype: INT2`\n", - "* The quantized activations are exported as `MultiThreshold` nodes with `domain=finn.custom_op.general`\n", + "* The quantized activations are exported as `MultiThreshold` nodes with `domain=qonnx.custom_op.general`\n", "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`)" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving 'cybsec-mlp-ready.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from finn.util.visualization import showInNetron\n", "\n", @@ -888,13 +737,6 @@ "## That's it! \n", "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index a0fef1ab61..370312c77e 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -53,16 +53,16 @@ "source": [ "# 1. Import model into FINN with ModelWrapper \n", "\n", - "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#finn.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model."
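A minimal sketch of this kind of introspection, assuming the exported cybsec-mlp-ready.onnx from above (node and tensor names are whatever the exporter assigned):

from qonnx.core.modelwrapper import ModelWrapper

model = ModelWrapper("cybsec-mlp-ready.onnx")
# quantized activations: expect domain=qonnx.custom_op.general on each node
for node in model.get_nodes_by_op_type("MultiThreshold"):
    print(node.name, node.domain)
# MatMul weight inputs (second input of each node): expect the INT2 annotation
for node in model.get_nodes_by_op_type("MatMul"):
    print(node.name, model.get_tensor_datatype(node.input[1]))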
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", "\n", "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", "model_for_sim = ModelWrapper(ready_model_filename)" @@ -77,85 +77,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_model_proto',\n", - " 'analysis',\n", - " 'check_all_tensor_shapes_specified',\n", - " 'check_compatibility',\n", - " 'cleanup',\n", - " 'find_consumer',\n", - " 'find_consumers',\n", - " 'find_direct_predecessors',\n", - " 'find_direct_successors',\n", - " 'find_producer',\n", - " 'find_upstream',\n", - " 'get_all_tensor_names',\n", - " 'get_finn_nodes',\n", - " 'get_initializer',\n", - " 'get_metadata_prop',\n", - " 'get_node_index',\n", - " 'get_nodes_by_op_type',\n", - " 'get_non_finn_nodes',\n", - " 'get_tensor_datatype',\n", - " 'get_tensor_fanout',\n", - " 'get_tensor_layout',\n", - " 'get_tensor_shape',\n", - " 'get_tensor_sparsity',\n", - " 'get_tensor_valueinfo',\n", - " 'graph',\n", - " 'is_fork_node',\n", - " 'is_join_node',\n", - " 'make_empty_exec_context',\n", - " 'make_new_valueinfo_name',\n", - " 'model',\n", - " 'rename_tensor',\n", - " 'save',\n", - " 'set_initializer',\n", - " 'set_metadata_prop',\n", - " 'set_tensor_datatype',\n", - " 'set_tensor_layout',\n", - " 'set_tensor_shape',\n", - " 'set_tensor_sparsity',\n", - " 'temporary_fix_oldstyle_domain',\n", - " 'transform']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dir(model_for_sim)" ] @@ -169,26 +93,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input tensor name: 0\n", - "Output tensor name: 73\n", - "Input tensor shape: [1, 600]\n", - "Output tensor shape: [1, 1]\n", - "Input tensor datatype: BIPOLAR\n", - "Output tensor datatype: FLOAT32\n", - "List of node operator types in the graph: \n", - "['Mul', 'Add', 'Div', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'MultiThreshold']\n" - ] - } - ], + "outputs": [], "source": [ - "from finn.core.datatype import DataType\n", + "from qonnx.core.datatype import DataType\n", "\n", "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", @@ -226,14 +135,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.general import GiveReadableTensorNames, 
GiveUniqueNodeNames, RemoveStaticGraphInputs\n", - "from finn.transformation.infer_shapes import InferShapes\n", - "from finn.transformation.infer_datatypes import InferDataTypes\n", - "from finn.transformation.fold_constants import FoldConstants\n", + "from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", + "from qonnx.transformation.infer_shapes import InferShapes\n", + "from qonnx.transformation.infer_datatypes import InferDataTypes\n", + "from qonnx.transformation.fold_constants import FoldConstants\n", "\n", "model_for_sim = model_for_sim.transform(InferShapes())\n", "model_for_sim = model_for_sim.transform(FoldConstants())\n", @@ -262,38 +171,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Serving 'cybsec-mlp-verification.onnx' at http://0.0.0.0:8081\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from finn.util.visualization import showInNetron\n", "\n", @@ -311,20 +191,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 593])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from torch.utils.data import TensorDataset\n", @@ -356,20 +225,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "input_size = 593 \n", "hidden1 = 64 \n", @@ -409,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -441,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -476,17 +334,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ok 100 nok 0: 100%|██████████| 100/100 [00:21<00:00, 4.72it/s]\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from tqdm import trange\n", @@ -511,17 +361,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\n" - ] - } - ], + "outputs": [], "source": [ "if ok == n_verification_inputs:\n", " print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n", @@ -535,13 +377,6 @@ "source": [ "This concludes our second notebook. In the next one, we'll take the ONNX model we just verified all the way down to FPGA hardware with the FINN compiler." 
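For reference, a hedged sketch of a single-sample execution through FINN's ONNX executor at this point; model_for_sim is the tidied model from above, and sample_inp stands in for one (1, 600) bipolar test tensor:

import finn.core.onnx_exec as oxe

iname = model_for_sim.graph.input[0].name
oname = model_for_sim.graph.output[0].name
# wrap the sample in the named-input dict the executor expects
input_dict = {iname: sample_inp.reshape(1, 600).numpy()}
output_dict = oxe.execute_onnx(model_for_sim, input_dict)
finn_output = output_dict[oname]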
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 551c321534..33adb68dc8 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -106,17 +106,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Previous run results deleted!\n" - ] - } - ], + "outputs": [], "source": [ "import finn.builder.build_dataflow as build\n", "import finn.builder.build_dataflow_config as build_cfg\n", @@ -148,40 +140,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Building dataflow accelerator from cybsec-mlp-ready.onnx\n", - "Intermediate outputs will be generated in /tmp/finn_dev_ubuntu\n", - "Final outputs will be generated in output_estimates_only\n", - "Build log is at output_estimates_only/build_dataflow.log\n", - "Running step: step_tidy_up [1/7]\n", - "Running step: step_streamline [2/7]\n", - "Running step: step_convert_to_hls [3/7]\n", - "Running step: step_create_dataflow_partition [4/7]\n", - "Running step: step_target_fps_parallelization [5/7]\n", - "Running step: step_apply_folding_config [6/7]\n", - "Running step: step_generate_estimate_reports [7/7]\n", - "Completed successfully\n", - "CPU times: user 1.84 s, sys: 599 ms, total: 2.44 s\n", - "Wall time: 1.77 s\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "build.build_dataflow_cfg(model_file, cfg_estimates)" @@ -196,36 +157,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "build_dataflow.log intermediate_models report time_per_step.json\r\n" - ] - } - ], + "outputs": [], "source": [ "! ls {estimates_output_dir}" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "estimate_layer_config_alternatives.json estimate_network_performance.json\r\n", - "estimate_layer_cycles.json\t\t op_and_param_counts.json\r\n", - "estimate_layer_resources.json\r\n" - ] - } - ], + "outputs": [], "source": [ "! ls {estimates_output_dir}/report" ] @@ -239,23 +182,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\r\n", - " \"critical_path_cycles\": 252,\r\n", - " \"max_cycles\": 64,\r\n", - " \"max_cycles_node_name\": \"StreamingFCLayer_Batch_1\",\r\n", - " \"estimated_throughput_fps\": 1562500.0,\r\n", - " \"estimated_latency_ns\": 2520.0\r\n", - "}" - ] - } - ], + "outputs": [], "source": [ "! 
cat {estimates_output_dir}/report/estimate_network_performance.json" ] @@ -269,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -282,23 +211,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'StreamingFCLayer_Batch_0': 60,\n", - " 'StreamingFCLayer_Batch_1': 64,\n", - " 'StreamingFCLayer_Batch_2': 64,\n", - " 'StreamingFCLayer_Batch_3': 64}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "read_json_dict(estimates_output_dir + \"/report/estimate_layer_cycles.json\")" ] @@ -314,44 +229,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'StreamingFCLayer_Batch_0': {'BRAM_18K': 36,\n", - " 'BRAM_efficiency': 0.11574074074074074,\n", - " 'LUT': 8184,\n", - " 'URAM': 0,\n", - " 'URAM_efficiency': 1,\n", - " 'DSP': 0},\n", - " 'StreamingFCLayer_Batch_1': {'BRAM_18K': 4,\n", - " 'BRAM_efficiency': 0.1111111111111111,\n", - " 'LUT': 1217,\n", - " 'URAM': 0,\n", - " 'URAM_efficiency': 1,\n", - " 'DSP': 0},\n", - " 'StreamingFCLayer_Batch_2': {'BRAM_18K': 4,\n", - " 'BRAM_efficiency': 0.1111111111111111,\n", - " 'LUT': 1217,\n", - " 'URAM': 0,\n", - " 'URAM_efficiency': 1,\n", - " 'DSP': 0},\n", - " 'StreamingFCLayer_Batch_3': {'BRAM_18K': 1,\n", - " 'BRAM_efficiency': 0.006944444444444444,\n", - " 'LUT': 341,\n", - " 'URAM': 0,\n", - " 'URAM_efficiency': 1,\n", - " 'DSP': 0},\n", - " 'total': {'BRAM_18K': 45.0, 'LUT': 10959.0, 'URAM': 0.0, 'DSP': 0.0}}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "read_json_dict(estimates_output_dir + \"/report/estimate_layer_resources.json\")" ] @@ -375,7 +255,7 @@ "\n", "**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. 
While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**\n", "\n", - "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_StreamingFCLayer_Batch_XXXXXX`\n", + "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`\n", " \n", "* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`\n", " " @@ -383,17 +263,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Previous run results deleted!\n" - ] - } - ], + "outputs": [], "source": [ "import finn.builder.build_dataflow as build\n", "import finn.builder.build_dataflow_config as build_cfg\n", @@ -425,49 +297,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Building dataflow accelerator from cybsec-mlp-ready.onnx\n", - "Intermediate outputs will be generated in /tmp/finn_dev_ubuntu\n", - "Final outputs will be generated in output_ipstitch_ooc_rtlsim\n", - "Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log\n", - "Running step: step_tidy_up [1/16]\n", - "Running step: step_streamline [2/16]\n", - "Running step: step_convert_to_hls [3/16]\n", - "Running step: step_create_dataflow_partition [4/16]\n", - "Running step: step_target_fps_parallelization [5/16]\n", - "Running step: step_apply_folding_config [6/16]\n", - "Running step: step_generate_estimate_reports [7/16]\n", - "Running step: step_hls_codegen [8/16]\n", - "Running step: step_hls_ipgen [9/16]\n", - "Running step: step_set_fifo_depths [10/16]\n", - "Running step: step_create_stitched_ip [11/16]\n", - "Running step: step_measure_rtlsim_performance [12/16]\n", - "Running step: step_make_pynq_driver [13/16]\n", - "Running step: step_out_of_context_synthesis [14/16]\n", - "Running step: step_synthesize_bitfile [15/16]\n", - "Running step: step_deployment_package [16/16]\n", - "Completed successfully\n", - "CPU times: user 4.76 s, sys: 710 ms, total: 5.47 s\n", - "Wall time: 8min 5s\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "build.build_dataflow_cfg(model_file, cfg_stitched_ip)" @@ -489,22 +321,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "all_verilog_srcs.txt\t\t finn_vivado_stitch_proj.xpr\r\n", - "finn_vivado_stitch_proj.cache\t ip\r\n", - "finn_vivado_stitch_proj.hw\t make_project.sh\r\n", - "finn_vivado_stitch_proj.ip_user_files make_project.tcl\r\n", - "finn_vivado_stitch_proj.sim\t vivado.jou\r\n", - "finn_vivado_stitch_proj.srcs\t vivado.log\r\n" - ] - } - ], + "outputs": [], "source": [ "! 
ls {rtlsim_output_dir}/stitched_ip" ] @@ -518,18 +337,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "estimate_layer_resources_hls.json rtlsim_performance.json\r\n", - "ooc_synth_and_timing.json\r\n" - ] - } - ], + "outputs": [], "source": [ "! ls {rtlsim_output_dir}/report" ] @@ -543,27 +353,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\r\n", - " \"vivado_proj_folder\": \"/tmp/finn_dev_ubuntu/synth_out_of_context_iut077er/results_finn_design_wrapper\",\r\n", - " \"LUT\": 8667.0,\r\n", - " \"FF\": 9063.0,\r\n", - " \"DSP\": 0.0,\r\n", - " \"BRAM\": 22.0,\r\n", - " \"WNS\": 0.946,\r\n", - " \"\": 0,\r\n", - " \"fmax_mhz\": 110.44842058758559,\r\n", - " \"estimated_throughput_fps\": 1725756.5716810247\r\n", - "}" - ] - } - ], + "outputs": [], "source": [ "! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json" ] @@ -577,26 +369,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\r\n", - " \"cycles\": 643,\r\n", - " \"runtime[ms]\": 0.00643,\r\n", - " \"throughput[images/s]\": 1088646.967340591,\r\n", - " \"DRAM_in_bandwidth[Mb/s]\": 81.64852255054431,\r\n", - " \"DRAM_out_bandwidth[Mb/s]\": 0.13608087091757387,\r\n", - " \"fclk[mhz]\": 100.0,\r\n", - " \"N\": 7,\r\n", - " \"latency_cycles\": 211\r\n", - "}" - ] - } - ], + "outputs": [], "source": [ "! cat {rtlsim_output_dir}/report/rtlsim_performance.json" ] @@ -610,62 +385,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\r\n", - " \"Defaults\": {},\r\n", - " \"StreamingFIFO_0\": {\r\n", - " \"ram_style\": \"auto\",\r\n", - " \"depth\": 32,\r\n", - " \"impl_style\": \"rtl\"\r\n", - " },\r\n", - " \"StreamingFCLayer_Batch_0\": {\r\n", - " \"PE\": 16,\r\n", - " \"SIMD\": 40,\r\n", - " \"ram_style\": \"auto\",\r\n", - " \"resType\": \"lut\",\r\n", - " \"mem_mode\": \"decoupled\",\r\n", - " \"runtime_writeable_weights\": 0\r\n", - " },\r\n", - " \"StreamingDataWidthConverter_Batch_0\": {\r\n", - " \"impl_style\": \"hls\"\r\n", - " },\r\n", - " \"StreamingFCLayer_Batch_1\": {\r\n", - " \"PE\": 1,\r\n", - " \"SIMD\": 64,\r\n", - " \"ram_style\": \"auto\",\r\n", - " \"resType\": \"lut\",\r\n", - " \"mem_mode\": \"decoupled\",\r\n", - " \"runtime_writeable_weights\": 0\r\n", - " },\r\n", - " \"StreamingDataWidthConverter_Batch_1\": {\r\n", - " \"impl_style\": \"hls\"\r\n", - " },\r\n", - " \"StreamingFCLayer_Batch_2\": {\r\n", - " \"PE\": 1,\r\n", - " \"SIMD\": 64,\r\n", - " \"ram_style\": \"auto\",\r\n", - " \"resType\": \"lut\",\r\n", - " \"mem_mode\": \"decoupled\",\r\n", - " \"runtime_writeable_weights\": 0\r\n", - " },\r\n", - " \"StreamingFCLayer_Batch_3\": {\r\n", - " \"PE\": 1,\r\n", - " \"SIMD\": 1,\r\n", - " \"ram_style\": \"auto\",\r\n", - " \"resType\": \"lut\",\r\n", - " \"mem_mode\": \"decoupled\",\r\n", - " \"runtime_writeable_weights\": 0\r\n", - " }\r\n", - "}" - ] - } - ], + "outputs": [], "source": [ "! 
cat {rtlsim_output_dir}/final_hw_config.json" ] @@ -681,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -716,49 +438,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Building dataflow accelerator from cybsec-mlp-ready.onnx\n", - "Intermediate outputs will be generated in /tmp/finn_dev_ubuntu\n", - "Final outputs will be generated in output_final\n", - "Build log is at output_final/build_dataflow.log\n", - "Running step: step_tidy_up [1/16]\n", - "Running step: step_streamline [2/16]\n", - "Running step: step_convert_to_hls [3/16]\n", - "Running step: step_create_dataflow_partition [4/16]\n", - "Running step: step_target_fps_parallelization [5/16]\n", - "Running step: step_apply_folding_config [6/16]\n", - "Running step: step_generate_estimate_reports [7/16]\n", - "Running step: step_hls_codegen [8/16]\n", - "Running step: step_hls_ipgen [9/16]\n", - "Running step: step_set_fifo_depths [10/16]\n", - "Running step: step_create_stitched_ip [11/16]\n", - "Running step: step_measure_rtlsim_performance [12/16]\n", - "Running step: step_make_pynq_driver [13/16]\n", - "Running step: step_out_of_context_synthesis [14/16]\n", - "Running step: step_synthesize_bitfile [15/16]\n", - "Running step: step_deployment_package [16/16]\n", - "Completed successfully\n", - "CPU times: user 4.47 s, sys: 766 ms, total: 5.24 s\n", - "Wall time: 22min 13s\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#%%time\n", "#build.build_dataflow_cfg(model_file, cfg)" @@ -773,17 +455,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "finn-accel.bit\tfinn-accel.hwh\r\n" - ] - } - ], + "outputs": [], "source": [ "#! ls {final_output_dir}/bitfile" ] @@ -797,17 +471,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "driver.py driver_base.py finn runtime_weights validate.py\r\n" - ] - } - ], + "outputs": [], "source": [ "#! ls {final_output_dir}/driver" ] @@ -821,18 +487,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "estimate_layer_resources_hls.json post_synth_resources.xml\r\n", - "post_route_timing.rpt\r\n" - ] - } - ], + "outputs": [], "source": [ "#! ls {final_output_dir}/report" ] @@ -846,17 +503,9 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "bitfile driver\r\n" - ] - } - ], + "outputs": [], "source": [ "#! 
ls {final_output_dir}/deploy" ] @@ -874,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -883,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -892,38 +541,18 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "driver.py\tfinn\t\t unsw_nb15_binarized.npz validate.py\r\n", - "driver_base.py\truntime_weights validate-unsw-nb15.py\r\n" - ] - } - ], + "outputs": [], "source": [ "#! ls {final_output_dir}/deploy/driver" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/workspace/finn/notebooks/end2end_example/cybersecurity/deploy-on-pynq.zip'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#from shutil import make_archive\n", "#make_archive('deploy-on-pynq', 'zip', final_output_dir+\"/deploy\")" @@ -991,13 +620,6 @@ "\n", "Finally, we can see that `throughput[images/s]`, which is the pure hardware throughput without any software and data movement overheads, is close to 1M inferences per second." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1016,7 +638,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index a55c3f0aa0..7e6f7990e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,14 @@ bitstring==3.1.7 clize==4.1.1 -dataclasses-json==0.5.2 +dataclasses-json==0.5.7 docrep==0.2.7 future==0.18.2 gspread==3.6.0 numpy==1.22.0 -onnx==1.7.0 +onnx==1.11.0 onnxoptimizer -onnxruntime==1.4.0 -pre-commit==2.6.0 +onnxruntime==1.11.1 +pre-commit==2.9.2 protobuf==3.20.1 pyscaffold==3.2.1 scipy==1.5.2 diff --git a/run-docker.sh b/run-docker.sh index 2abd67f067..381be35293 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. # All rights reserved. 
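As a cross-check on that throughput claim, the numbers in the rtlsim report shown earlier are self-consistent; a sketch assuming the output_ipstitch_ooc_rtlsim directory from the stitched-IP build above:

import json

with open("output_ipstitch_ooc_rtlsim/report/rtlsim_performance.json") as f:
    perf = json.load(f)
# throughput[images/s] is N images divided by runtime:
# 7 images in 0.00643 ms comes out near 1.09M images/s
print(perf["N"] / (perf["runtime[ms]"] * 1e-3))
print(perf["throughput[images/s]"])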
# # Redistribution and use in source and binary forms, with or without @@ -86,13 +86,15 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${ALVEO_BOARD="U250"} : ${ALVEO_TARGET_DIR="/tmp"} : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"} -: ${XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt"} +: ${XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"} : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"} -: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --tags --dirty).$XRT_DEB_VERSION"} +: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"} : ${FINN_DOCKER_PREBUILT="0"} : ${FINN_DOCKER_RUN_AS_ROOT="0"} : ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"} : ${FINN_DOCKER_EXTRA=""} +: ${FINN_SKIP_DEP_REPOS="0"} +: ${OHMYXILINX="${SCRIPTPATH}/deps/oh-my-xilinx"} : ${NVIDIA_VISIBLE_DEVICES=""} : ${DOCKER_BUILDKIT="1"} @@ -118,24 +120,29 @@ elif [ "$1" = "notebook" ]; then FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT " elif [ "$1" = "build_dataflow" ]; then BUILD_DATAFLOW_DIR=$(readlink -f "$2") - FINN_DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR " + FINN_DOCKER_EXTRA+="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR " DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_dataflow for folder $BUILD_DATAFLOW_DIR" DOCKER_CMD="build_dataflow $BUILD_DATAFLOW_DIR" elif [ "$1" = "build_custom" ]; then BUILD_CUSTOM_DIR=$(readlink -f "$2") - FINN_DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR " + FLOW_NAME=${3:-build} + FINN_DOCKER_EXTRA+="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR " DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build - gecho "Running build_custom: $BUILD_CUSTOM_DIR/build.py" - DOCKER_CMD="python -mpdb -cc -cq build.py" + gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py" + DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py" +elif [ -z "$1" ]; then + gecho "Running container only" + DOCKER_CMD="bash" + DOCKER_INTERACTIVE="-it" else - gecho "Running container only" - DOCKER_CMD="bash" - DOCKER_INTERACTIVE="-it" + gecho "Running container with passed arguments" + DOCKER_CMD="$@" fi + if [ "$FINN_DOCKER_GPU" != 0 ];then gecho "nvidia-docker detected, enabling GPUs" if [ ! 
-z "$NVIDIA_VISIBLE_DEVICES" ];then @@ -161,6 +168,11 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE" gecho "Using default PYNQ board $PYNQ_BOARD" +# Ensure git-based deps are checked out at correct commit +if [ "$FINN_SKIP_DEP_REPOS" = "0" ]; then + ./fetch-repos.sh +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ]; then # Need to ensure this is done within the finn/ root folder: @@ -175,10 +187,11 @@ fi DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --tty --init " DOCKER_EXEC+="--hostname $DOCKER_INST_NAME " DOCKER_EXEC+="-e SHELL=/bin/bash " -DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn " +DOCKER_EXEC+="-w $SCRIPTPATH " +DOCKER_EXEC+="-v $SCRIPTPATH:$SCRIPTPATH " DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_HOST_BUILD_DIR " DOCKER_EXEC+="-e FINN_BUILD_DIR=$FINN_HOST_BUILD_DIR " -DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" " +DOCKER_EXEC+="-e FINN_ROOT="$SCRIPTPATH" " DOCKER_EXEC+="-e LOCALHOST_URL=$LOCALHOST_URL " DOCKER_EXEC+="-e VIVADO_IP_CACHE=$VIVADO_IP_CACHE " DOCKER_EXEC+="-e PYNQ_BOARD=$PYNQ_BOARD " @@ -186,6 +199,7 @@ DOCKER_EXEC+="-e PYNQ_IP=$PYNQ_IP " DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME " DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD " DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR " +DOCKER_EXEC+="-e OHMYXILINX=$OHMYXILINX " DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " @@ -204,11 +218,15 @@ fi if [ ! -z "$FINN_XILINX_PATH" ];then VIVADO_PATH="$FINN_XILINX_PATH/Vivado/$FINN_XILINX_VERSION" VITIS_PATH="$FINN_XILINX_PATH/Vitis/$FINN_XILINX_VERSION" + HLS_PATH="$FINN_XILINX_PATH/Vitis_HLS/$FINN_XILINX_VERSION" DOCKER_EXEC+="-v $FINN_XILINX_PATH:$FINN_XILINX_PATH " if [ -d "$VIVADO_PATH" ];then DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" " DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH " fi + if [ -d "$HLS_PATH" ];then + DOCKER_EXEC+="-e HLS_PATH=$HLS_PATH " + fi if [ -d "$VITIS_PATH" ];then DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH " fi diff --git a/setup.cfg b/setup.cfg index 96618e0ffc..0823981aeb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,7 +76,7 @@ exclude = docs = finn-base==0.0.3 docutils==0.17.1 - dataclasses-json==0.5.2 + dataclasses-json==0.5.7 gspread==3.6.0 pytest netron @@ -120,6 +120,12 @@ markers = vivado: mark tests that require Vivado or Vivado HLS vitis: mark tests that require Vitis board: mark tests that require a PYNQ board + brevitas_export : mark tests that test brevitas export functionality + streamline: mark tests that test streamlining functionality + util: mark tests that test util functions + transform: mark tests that test transformations (before hls layers) + fpgadataflow: mark tests related to hls layers + end2end: mark tests that run the end2end flow norecursedirs = dist build diff --git a/src/finn/analysis/__init__.py b/src/finn/analysis/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/analysis/fpgadataflow/__init__.py b/src/finn/analysis/fpgadataflow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index dafe8a9f89..5726702666 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.registry import getCustomOp +from qonnx.custom_op.registry import getCustomOp + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py index bb1cad56da..e1517ec636 100644 --- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py +++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry +import qonnx.custom_op.registry as registry + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py index 9ba99fb546..d57b660bce 100644 --- a/src/finn/analysis/fpgadataflow/floorplan_params.py +++ b/src/finn/analysis/fpgadataflow/floorplan_params.py @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.registry import getCustomOp +from qonnx.custom_op.registry import getCustomOp + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index aff99efd80..4d921438f6 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -26,10 +26,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import qonnx.custom_op.registry as registry import warnings import xml.etree.ElementTree as ET -import finn.custom_op.registry as registry from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/analysis/fpgadataflow/op_and_param_counts.py b/src/finn/analysis/fpgadataflow/op_and_param_counts.py index 27c6dfd997..0bc9655c0d 100644 --- a/src/finn/analysis/fpgadataflow/op_and_param_counts.py +++ b/src/finn/analysis/fpgadataflow/op_and_param_counts.py @@ -26,8 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
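These analysis passes all follow the same migrated import pattern; a condensed, illustrative pass (not verbatim from any one file) built on the new qonnx-based imports:

import qonnx.custom_op.registry as registry

from finn.util.fpgadataflow import is_fpgadataflow_node


def exp_cycles_by_name(model):
    # expected cycles per fpgadataflow node, keyed by node name
    cycle_dict = {}
    for node in model.graph.node:
        if is_fpgadataflow_node(node):
            inst = registry.getCustomOp(node)
            cycle_dict[node.name] = int(inst.get_exp_cycles())
    return cycle_dict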
-import finn.custom_op.registry as registry -from finn.util.basic import is_finn_op +import qonnx.custom_op.registry as registry +from qonnx.util.basic import is_finn_op def aggregate_dict_keys(res_dict): diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 4b81791094..8b9c5d2a04 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -28,9 +28,9 @@ import os import xml.etree.ElementTree as ET +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.move_reshape import _is_fpgadataflow_node diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index 31cfeb76a6..406496bc0e 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry +import qonnx.custom_op.registry as registry + from finn.util.fpgadataflow import is_fpgadataflow_node @@ -62,8 +63,8 @@ def res_estimation_complete(model): op_type = node.op_type inst = registry.getCustomOp(node) if ( - op_type == "StreamingFCLayer_Batch" - or op_type == "Vector_Vector_Activate_Batch" + op_type == "MatrixVectorActivation" + or op_type == "VectorVectorActivation" ): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] diff --git a/src/finn/analysis/verify_custom_nodes.py b/src/finn/analysis/verify_custom_nodes.py index 62dac2827f..83a985e71f 100644 --- a/src/finn/analysis/verify_custom_nodes.py +++ b/src/finn/analysis/verify_custom_nodes.py @@ -26,8 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry -from finn.util.basic import is_finn_op +import qonnx.custom_op.registry as registry +from qonnx.util.basic import is_finn_op def verify_nodes(model): diff --git a/src/finn/builder/__init__.py b/src/finn/builder/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index c4664a5471..238083f653 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -34,13 +34,13 @@ import sys import time import traceback +from qonnx.core.modelwrapper import ModelWrapper from finn.builder.build_dataflow_config import ( DataflowBuildConfig, default_build_dataflow_steps, ) from finn.builder.build_dataflow_steps import build_dataflow_step_lookup -from finn.core.modelwrapper import ModelWrapper # adapted from https://stackoverflow.com/a/39215961 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 106c221203..8d587636cd 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -59,7 +59,7 @@ class DataflowOutputType(str, Enum): class ComputeEngineMemMode(str, Enum): """Memory mode for generated compute engines. 
See - https://finn.readthedocs.io/en/latest/internals.html#streamingfclayer-mem-mode + https://finn.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode for more information.""" CONST = "const" @@ -166,7 +166,7 @@ class DataflowBuildConfig: #: If the parallelization attributes (PE, SIMD) are part of the config, #: this will override the automatically generated parallelization #: attributes inferred from target_fps (if any) - #: Will be applied with :py:mod:`finn.transformation.general.ApplyConfig` + #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig` folding_config_file: Optional[str] = None #: (Optional) Target inference performance in frames per second. @@ -209,6 +209,10 @@ class DataflowBuildConfig: #: the full list of layer IP build directories. By default, synthesis will not run. stitched_ip_gen_dcp: Optional[bool] = False + #: Insert a signature node into the stitched IP to read/write information + #: to the design, e.g. customer signature, application signature, version + signature: Optional[List[int]] = None + #: (Optional) Control the maximum width of the per-PE MVAU stream while #: exploring the parallelization attributes to reach target_fps #: Only relevant if target_fps is specified. @@ -218,8 +222,8 @@ class DataflowBuildConfig: #: (Optional) Whether thresholding layers (which implement quantized #: activations in FINN) will be implemented as stand-alone HLS layers, - #: instead of being part of StreamingFCLayer. This gives larger flexibility, - #: and makes it possible to have runtime-writable thresholds. + #: instead of being part of the MatrixVectorActivation layer. This gives + #: greater flexibility, and makes it possible to have runtime-writable thresholds. standalone_thresholds: Optional[bool] = False #: Target board, only needed for generating full bitfiles where the FINN @@ -261,7 +265,7 @@ class DataflowBuildConfig: #: Path to JSON config file assigning each layer to an SLR.
#: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` - #: Will be applied with :py:mod:`finn.transformation.general.ApplyConfig` + #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig` vitis_floorplan_file: Optional[str] = None #: Vitis optimization strategy @@ -376,6 +380,6 @@ def _resolve_verification_io_pair(self): ) verify_expected_output_npy = np.load(self.verify_expected_output_npy) return ( - verify_input_npy.astype(np.float32), - verify_expected_output_npy.astype(np.float32), + verify_input_npy, + verify_expected_output_npy, ) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index bf3701f12d..e6873fb8de 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -31,7 +31,23 @@ import os from copy import deepcopy from distutils.dir_util import copy_tree +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, + RemoveStaticGraphInputs, + RemoveUnusedTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.cleanup import cleanup_model +from qonnx.util.config import extract_model_config_to_json from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls @@ -53,12 +69,9 @@ ShellFlowType, VerificationStepType, ) -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx +from finn.core.rtlsim_exec import rtlsim_exec from finn.core.throughput_test import throughput_test_rtlsim -from finn.custom_op.registry import getCustomOp -from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount -from finn.transformation.fold_constants import FoldConstants from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_dataflow_partition import ( @@ -83,17 +96,6 @@ from finn.transformation.fpgadataflow.set_folding import SetFolding from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild -from finn.transformation.general import ( - ApplyConfig, - GiveReadableTensorNames, - GiveUniqueNodeNames, - RemoveStaticGraphInputs, - RemoveUnusedTensors, -) -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.qonnx.quant_act_to_multithreshold import ( @@ -101,14 +103,19 @@ ) from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC -from finn.util.basic import get_rtlsim_trace_depth -from finn.util.config import 
extract_model_config_to_json -from finn.util.pyverilator import pyverilate_get_liveness_threshold_cycles +from finn.util.basic import ( + get_rtlsim_trace_depth, + pyverilate_get_liveness_threshold_cycles, +) from finn.util.test import execute_parent def verify_step( - model: ModelWrapper, cfg: DataflowBuildConfig, step_name: str, need_parent: bool + model: ModelWrapper, + cfg: DataflowBuildConfig, + step_name: str, + need_parent: bool, + rtlsim_pre_hook=None, ): print("Running verification for " + step_name) verify_out_dir = cfg.output_dir + "/verification_output" @@ -131,7 +138,10 @@ def verify_step( inp_tensor_name = model.graph.input[0].name out_tensor_name = model.graph.output[0].name inp_dict = {inp_tensor_name: in_npy} - out_dict = execute_onnx(model, inp_dict, True) + if rtlsim_pre_hook is not None: + out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook) + else: + out_dict = execute_onnx(model, inp_dict, True) out_npy = out_dict[out_tensor_name] res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all() res_to_str = {True: "SUCCESS", False: "FAIL"} @@ -282,9 +292,9 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): # doing this first causes all threshold layers to be standalone model = model.transform(to_hls.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) # TopK to LabelSelect model = model.transform(to_hls.InferLabelSelectLayer()) # input quantization (if any) as standalone threshold @@ -503,6 +513,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): cfg._resolve_fpga_part(), cfg.synth_clk_period_ns, vitis=cfg.stitched_ip_gen_dcp, + signature=cfg.signature, ) ) # TODO copy all ip sources into output dir? as zip? diff --git a/src/finn/core/__init__.py b/src/finn/core/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py new file mode 100644 index 0000000000..2695113661 --- /dev/null +++ b/src/finn/core/onnx_exec.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
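For context, a hypothetical build configuration exercising the new signature option that step_create_stitched_ip now forwards to CreateStitchedIP (all values below are placeholders):

import finn.builder.build_dataflow_config as build_cfg

cfg = build_cfg.DataflowBuildConfig(
    output_dir="output_ipstitch_ooc_rtlsim",
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    # example signature words to read back from the stitched IP at runtime
    signature=[0xC0FFEE00, 0x00000001],
    generate_outputs=[build_cfg.DataflowOutputType.STITCHED_IP],
)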
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import copy
+import numpy as np
+import qonnx.analysis.topology as ta
+from qonnx.core.onnx_exec import execute_onnx as execute_onnx_base
+
+from finn.core.remote_exec import remote_exec
+from finn.core.rtlsim_exec import rtlsim_exec
+
+
+def execute_onnx(
+    model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
+):
+    """Executes given ONNX ModelWrapper with given named inputs.
+    If return_full_exec_context is False, a dict of named outputs is returned
+    as indicated by model.graph.output.
+    If return_full_exec_context is True, the full set of tensors used by the
+    execution (including inputs, weights, activations and final outputs)
+    will be returned as a dict.
+    When start_node and end_node are set to None, the whole graph is executed.
+    If they are set to particular ONNX nodes, only the subgraph between (and
+    including) those nodes is executed.
+    """
+
+    # check if model has an execution mode set
+    # if None, execute model using the QONNX-provided execute_onnx impl
+    # if set to "remote_pynq", execute model on PYNQ board
+    # if set to "rtlsim", execute model using pyverilator
+    model_exec_mode = model.get_metadata_prop("exec_mode")
+    if (model_exec_mode is None) or (model_exec_mode == ""):
+        return execute_onnx_base(
+            model, input_dict, return_full_exec_context, start_node, end_node
+        )
+
+    if not model.check_all_tensor_shapes_specified():
+        raise Exception("Found unspecified tensor shapes, try infer_shapes")
+    ret = model.analysis(ta.nodes_topologically_sorted)
+    assert (
+        ret["nodes_topologically_sorted"] is True
+    ), """Nodes must be
+    topologically sorted."""
+
+    graph = model.graph
+    # first, we need to make sure that every variable required by the graph has
+    # some buffer associated with it. this includes graph inputs (which includes
+    # the input data as well as the trained parameters) and the graph ValueInfo
+    # (intermediate tensors between layers)
+    # this is provided by the execution_context, which is a dict of np.ndarray
+    execution_context = model.make_empty_exec_context()
+    # fill in any inputs provided to this function
+    for inp_name in input_dict.keys():
+        if inp_name in execution_context:
+            if execution_context[inp_name].shape == input_dict[inp_name].shape:
+                execution_context[inp_name] = input_dict[inp_name]
+            else:
+                raise Exception(
+                    "Shape mismatch for provided input %s: found %s expected %s "
+                    % (
+                        inp_name,
+                        str(execution_context[inp_name].shape),
+                        str(input_dict[inp_name].shape),
+                    )
+                )
+
+    # dispatch to the backend selected by the exec_mode metadata property
+    # (the unset/empty case was already handled above by the QONNX executor)
+    if model_exec_mode == "remote_pynq":
+        # use remote exec metadata built into model to execute on a remote PYNQ
+        remote_exec(model, execution_context)
+    elif model_exec_mode == "rtlsim":
+        # use stitched IP for rtlsim
+        rtlsim_exec(model, execution_context)
+    else:
+        raise Exception(
+            """Metadata property "exec_mode" is set to an unknown value.
+        Can be left unset or has to be set to "remote_pynq" for remote execution
+        on PYNQ board or "rtlsim" for execution using pyverilator!"""
+        )
+
+    if return_full_exec_context:
+        return execution_context
+    else:
+        # provide outputs as dict
+        output_dict = dict()
+        for out_tensor in graph.output:
+            out_name = out_tensor.name
+            output_dict[out_name] = execution_context[out_name]
+        return output_dict
+
+
+def execute_onnx_and_make_model(model, input_dict):
+    """Executes given ONNX ModelWrapper with given named inputs and returns a new
+    ModelWrapper where an initializer is provided for each tensor as taken from
+    the execution. This new model is useful for debugging, since it contains
+    all the intermediate activation values."""
+
+    # retrieve the full execution context
+    execution_context = execute_onnx(model, input_dict, True)
+    new_model = copy.deepcopy(model)
+    # create value_info entries and initializers for everything
+    for i in execution_context.keys():
+        new_model.set_initializer(i, execution_context[i])
+    for vi in new_model.graph.value_info:
+        new_model.graph.output.append(vi)
+    return new_model
+
+
+def compare_execution(
+    model_a,
+    model_b,
+    input_dict,
+    compare_fxn=lambda x, y: np.isclose(x, y, atol=1e-3).all(),
+):
+    """Executes two ONNX models and compares their outputs using the given
+    function, which should take in two tensors and return a Boolean."""
+    # compare values from first output tensors produced
+    res_a = list(execute_onnx(model_a, input_dict).items())[0][1]
+    res_b = list(execute_onnx(model_b, input_dict).items())[0][1]
+    return compare_fxn(res_a, res_b)
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
new file mode 100644
index 0000000000..f487b48f86
--- /dev/null
+++ b/src/finn/core/remote_exec.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
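A usage sketch of the exec_mode dispatch defined in onnx_exec.py above; the model path and the zero-valued input are placeholders, and the rtlsim case assumes a stitched-IP project has already been built for the model:

```python
import numpy as np
from qonnx.core.modelwrapper import ModelWrapper
from finn.core.onnx_exec import execute_onnx

model = ModelWrapper("model.onnx")  # placeholder path
iname = model.graph.input[0].name
inp = np.zeros(model.get_tensor_shape(iname), dtype=np.float32)

# exec_mode unset: falls through to the QONNX node-by-node executor
outputs = execute_onnx(model, {iname: inp})

# exec_mode set: the same call now drives the stitched-IP rtlsim backend
model.set_metadata_prop("exec_mode", "rtlsim")
full_ctx = execute_onnx(model, {iname: inp}, return_full_exec_context=True)
```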
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import subprocess +import warnings + + +def remote_exec(model, execution_context): + """Executes the given model remotely on the pynq board. The metadata properties + related to the pynq board have to be set. The execution context contains the + input values.""" + # TODO fix for multi input-output + pynq_ip = model.get_metadata_prop("pynq_ip") + pynq_port = int(model.get_metadata_prop("pynq_port")) + pynq_username = model.get_metadata_prop("pynq_username") + pynq_password = model.get_metadata_prop("pynq_password") + pynq_target_dir = model.get_metadata_prop("pynq_target_dir") + deployment_dir = model.get_metadata_prop("pynq_deploy_dir") + platform = model.get_metadata_prop("platform") + assert platform in ["alveo", "zynq-iodma"] + bitfile = model.get_metadata_prop("bitfile") + bitfile = os.path.basename(bitfile) + if pynq_password == "": + if "zynq" in platform: + raise Exception("PYNQ board remote exec needs password for sudo") + else: + local_prefix = "" # assume we are using an ssh key + warnings.warn("Empty password, make sure you've set up an ssh key") + else: + local_prefix = "sshpass -p %s " % pynq_password + + if platform == "alveo": + # Alveo can run without sudo + remote_prefix = "" + elif "zynq" in platform: + # PYNQ Zynq boards need to execute with sudo + remote_prefix = "echo %s | sudo -S " % pynq_password + + inp = execution_context[model.graph.input[0].name] + # make copy of array before saving it + inp = inp.copy() + batchsize = inp.shape[0] + np.save(os.path.join(deployment_dir, "input.npy"), inp) + # extracting last folder of absolute path (deployment_dir) + deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) + # copy input to PYNQ board + cmd = local_prefix + "scp -P{} -r {}/input.npy {}@{}:{}/{}".format( + pynq_port, + deployment_dir, + pynq_username, + pynq_ip, + pynq_target_dir, + deployment_folder, + ) + bash_command = ["/bin/bash", "-c", cmd] + process_scp_in = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_scp_in.communicate() + + # use 
platform attribute for correct remote execution + if platform == "alveo": + remote_cmd = "bash -ic 'bash alveo_run.sh execute %d' \"" % batchsize + else: + remote_cmd = ( + "python3.6 driver.py --exec_mode=execute --batchsize={} " + "--bitfile={} --inputfile=input.npy --outputfile=output.npy " + '--platform={} "' + ).format(batchsize, bitfile, platform) + cmd = ( + local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd + ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) + bash_command = ["/bin/bash", "-c", cmd] + process_exec_accel = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_exec_accel.communicate() + # remove stale output file from local dir, if any + try: + os.remove("{}/output.npy".format(deployment_dir)) + except FileNotFoundError: + pass + # copy generated output to local + cmd = local_prefix + "scp -P{} {}@{}:{}/{}/output.npy {}".format( + pynq_port, + pynq_username, + pynq_ip, + pynq_target_dir, + deployment_folder, + deployment_dir, + ) + bash_command = ["/bin/bash", "-c", cmd] + process_scp_out = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_scp_out.communicate() + outp = np.load("{}/output.npy".format(deployment_dir)) + execution_context[model.graph.output[0].name] = outp diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py new file mode 100644 index 0000000000..d45c972928 --- /dev/null +++ b/src/finn/core/rtlsim_exec.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
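remote_exec reads everything it needs from model metadata. A sketch of the properties a deployed model is expected to carry before the call; every value below is a placeholder for an actual board setup, and `model` is assumed to be the deployed ModelWrapper:

```python
board_props = {
    "pynq_ip": "192.168.2.99",
    "pynq_port": "22",
    "pynq_username": "xilinx",
    "pynq_password": "xilinx",
    "pynq_target_dir": "/home/xilinx/finn_dev",
    "pynq_deploy_dir": "/tmp/finn_deploy",  # local staging directory
    "platform": "zynq-iodma",               # or "alveo"
    "bitfile": "resizer.bit",
    "exec_mode": "remote_pynq",
}
for name, value in board_props.items():
    model.set_metadata_prop(name, value)
```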
+ +import os +from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io +from qonnx.custom_op.registry import getCustomOp + +from finn.util.basic import pyverilate_get_liveness_threshold_cycles +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.util.pyverilator import pyverilate_stitched_ip + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None): + """Use PyVerilator to execute given model with stitched IP. The execution + context contains the input values. Hook functions can be optionally + specified to observe/alter the state of the circuit, receiving the + PyVerilator sim object as their first argument: + - pre_hook : hook function to be called before sim start (after reset) + - post_hook : hook function to be called after sim end + """ + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + # ensure stitched ip project already exists + assert os.path.isfile( + model.get_metadata_prop("wrapper_filename") + ), """The + file name from metadata property "wrapper_filename" doesn't exist.""" + assert os.path.isdir( + model.get_metadata_prop("vivado_stitch_proj") + ), """The + directory from metadata property "vivado_stitch_proj" doesn't exist""" + trace_file = model.get_metadata_prop("rtlsim_trace") + if trace_file is None: + trace_file = "" + extra_verilator_args = model.get_metadata_prop("extra_verilator_args") + if extra_verilator_args is None: + extra_verilator_args = [] + else: + extra_verilator_args = eval(extra_verilator_args) + + # extract i/o info to prepare io_dict + io_dict = {"inputs": {}, "outputs": {}} + if_dict = eval(model.get_metadata_prop("vivado_stitch_ifnames")) + # go over and prepare inputs + for i, i_vi in enumerate(model.graph.input): + i_name = i_vi.name + i_tensor = execution_context[i_name] + i_dt = model.get_tensor_datatype(i_name) + first_node_onnx = model.find_consumer(i_name) + first_node = getCustomOp(first_node_onnx) + node_inp_ind = list(first_node_onnx.input).index(i_name) + if node_inp_ind == 0: + # default node input (input 0) + i_stream_w = first_node.get_instream_width() + i_folded_shape = first_node.get_folded_input_shape() + else: + # not input 0; node must support specifying inp index + # for these functions + i_stream_w = first_node.get_instream_width(node_inp_ind) + i_folded_shape = first_node.get_folded_input_shape(node_inp_ind) + batchsize = i_tensor.shape[0] + # override batch size for input + i_folded_shape = list(i_folded_shape) + i_folded_shape[0] = batchsize + i_folded_shape = tuple(i_folded_shape) + # TODO any other layout transformations need to happen here! 
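+        # (the reshape below assumes the tensor already has the layout the
+        #  first node expects; folding only regroups the innermost elements)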
+
+        i_tensor = i_tensor.reshape(i_folded_shape)
+        # pack input for rtlsim
+        packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
+        # add to io_dict
+        if_name = if_dict["s_axis"][i][0]
+        io_dict["inputs"][if_name] = packed_input
+    # go over outputs to determine how many values will be produced
+    num_out_values = 0
+    o_tensor_info = []
+    for o, o_vi in enumerate(model.graph.output):
+        # output in io_dict just needs an empty list
+        if_name = if_dict["m_axis"][o][0]
+        io_dict["outputs"][if_name] = []
+        # extract output shape
+        o_name = o_vi.name
+        o_shape = model.get_tensor_shape(o_name)
+        o_dt = model.get_tensor_datatype(o_name)
+        last_node = getCustomOp(model.find_producer(o_name))
+        o_folded_shape = last_node.get_folded_output_shape()
+        # override batch size from actual input
+        o_shape = list(o_shape)
+        o_shape[0] = batchsize
+        o_shape = tuple(o_shape)
+        o_folded_shape = list(o_folded_shape)
+        o_folded_shape[0] = batchsize
+        o_folded_shape = tuple(o_folded_shape)
+        o_stream_w = last_node.get_outstream_width()
+        o_tensor_info.append((o_stream_w, o_dt, o_folded_shape, o_shape))
+        num_out_values += batchsize * last_node.get_number_output_values()
+
+    # prepare pyverilator model
+    rtlsim_so = model.get_metadata_prop("rtlsim_so")
+    if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
+        sim = pyverilate_stitched_ip(model, extra_verilator_args=extra_verilator_args)
+        model.set_metadata_prop("rtlsim_so", sim.lib._name)
+    else:
+        sim = PyVerilator(rtlsim_so, auto_eval=False)
+
+    # reset and call rtlsim, including any pre/post hooks
+    reset_rtlsim(sim)
+    if pre_hook is not None:
+        pre_hook(sim)
+    n_cycles = rtlsim_multi_io(
+        sim,
+        io_dict,
+        num_out_values,
+        trace_file=trace_file,
+        sname="_",
+        liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+    )
+    if post_hook is not None:
+        post_hook(sim)
+
+    # unpack outputs and put back into execution context
+    for o, o_vi in enumerate(model.graph.output):
+        o_name = o_vi.name
+        if_name = if_dict["m_axis"][o][0]
+        o_stream_w, o_dt, o_folded_shape, o_shape = o_tensor_info[o]
+        packed_output = io_dict["outputs"][if_name]
+        o_folded_tensor = rtlsim_output_to_npy(
+            packed_output, None, o_dt, o_folded_shape, o_stream_w, o_dt.bitwidth()
+        )
+        execution_context[o_name] = o_folded_tensor.reshape(o_shape)
+
+    model.set_metadata_prop("cycles_rtlsim", str(n_cycles))
+    # return the context so callers (e.g. verify_step) can read outputs directly
+    return execution_context
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
new file mode 100644
index 0000000000..3533fd1339
--- /dev/null
+++ b/src/finn/core/throughput_test.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
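A sketch of driving rtlsim_exec directly with both hooks, assuming `model` is a stitched-IP ModelWrapper whose rtlsim project already exists; the hook bodies are illustrative only:

```python
from finn.core.rtlsim_exec import rtlsim_exec

def pre(sim):
    # runs once after reset, before any input is streamed in; sim is the
    # PyVerilator object, so signals could be observed or forced here
    print("starting stitched-IP simulation")

def post(sim):
    # runs after all expected output values have been produced
    print("simulation done")

ctx = model.make_empty_exec_context()
iname = model.graph.input[0].name
# ctx[iname] is assumed to be filled with a correctly shaped input here
rtlsim_exec(model, ctx, pre_hook=pre, post_hook=post)
print("cycles taken:", model.get_metadata_prop("cycles_rtlsim"))
```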
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import subprocess +import warnings +from qonnx.util.basic import gen_finn_dt_tensor + +from finn.core.rtlsim_exec import rtlsim_exec + + +def throughput_test_remote(model, batchsize=1000, timeout=None): + """Runs the throughput test for the given model remotely on the pynq board. + The metadata properties related to the pynq board have to be set. + Additionally a timeout for the SSH communication can be set. + Returns a dictionary with results of the throughput test. Returns None + if the test fails.""" + + pynq_ip = model.get_metadata_prop("pynq_ip") + pynq_port = int(model.get_metadata_prop("pynq_port")) + pynq_username = model.get_metadata_prop("pynq_username") + pynq_password = model.get_metadata_prop("pynq_password") + pynq_target_dir = model.get_metadata_prop("pynq_target_dir") + deployment_dir = model.get_metadata_prop("pynq_deploy_dir") + # extracting last folder of absolute path (deployment_dir) + deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) + platform = model.get_metadata_prop("platform") + assert platform in ["alveo", "zynq-iodma"] + bitfile = model.get_metadata_prop("bitfile") + bitfile = os.path.basename(bitfile) + if pynq_password == "": + if "zynq" in platform: + raise Exception("PYNQ board remote exec needs password for sudo") + else: + local_prefix = "" # assume we are using an ssh key + warnings.warn("Empty password, make sure you've set up an ssh key") + else: + local_prefix = "sshpass -p %s " % pynq_password + + if platform == "alveo": + # Alveo can run without sudo but needs correct environment + remote_prefix = "conda activate finn-pynq-alveo; " + elif "zynq" in platform: + # PYNQ Zynq boards need to execute with sudo + remote_prefix = "echo %s | sudo -S " % pynq_password + + # use platform attribute for correct remote execution + if platform == "alveo": + remote_cmd = "bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize + else: + remote_cmd = ( + "python3.6 driver.py --exec_mode=throughput_test --batchsize={} " + "--bitfile={} --inputfile=input.npy --outputfile=output.npy " + '--platform={} "' + ).format(batchsize, bitfile, platform) + cmd = ( + local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd + ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) + bash_command = ["/bin/bash", "-c", cmd] + process_throughput_test = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_throughput_test.communicate(timeout=timeout) + + # remove any pre-existing metrics file + try: + os.remove("{}/nw_metrics.txt".format(deployment_dir)) + except FileNotFoundError: + pass + + cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format( + pynq_port, + pynq_username, + 
pynq_ip, + pynq_target_dir, + deployment_folder, + deployment_dir, + ) + bash_command = ["/bin/bash", "-c", cmd] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate(timeout=timeout) + + try: + with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: + res = eval(file.read()) + return res + except FileNotFoundError: + return None + + +def throughput_test_rtlsim(model, batchsize=100): + """Runs a throughput test for the given IP-stitched model. When combined + with tracing, useful to determine bottlenecks and required FIFO sizes.""" + + assert ( + model.get_metadata_prop("exec_mode") == "rtlsim" + ), """Top-level exec_mode + metadata_prop must be set to rtlsim""" + + # make empty exec context and insert random inputs + ctx = model.make_empty_exec_context() + i_bytes = 0 + for i_vi in model.graph.input: + # create random input + iname = i_vi.name + ishape = model.get_tensor_shape(iname) + ishape_batch = ishape + ishape_batch[0] = batchsize + idt = model.get_tensor_datatype(iname) + dummy_input = gen_finn_dt_tensor(idt, ishape_batch) + ctx[iname] = dummy_input + i_bytes += (np.prod(ishape_batch) * idt.bitwidth()) / 8 + + # compute total output size as well + o_bytes = 0 + for o_vi in model.graph.output: + oname = o_vi.name + oshape = model.get_tensor_shape(oname) + oshape_batch = oshape + oshape_batch[0] = batchsize + odt = model.get_tensor_datatype(oname) + o_bytes += (np.prod(oshape_batch) * odt.bitwidth()) / 8 + + # remove liveness threshold, launch rtlsim + os.environ["LIVENESS_THRESHOLD"] = "-1" + rtlsim_exec(model, ctx) + # extract metrics + cycles = int(model.get_metadata_prop("cycles_rtlsim")) + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + runtime_s = (cycles * clk_ns) * (10**-9) + res = dict() + res["cycles"] = cycles + res["runtime[ms]"] = runtime_s * 1000 + res["throughput[images/s]"] = batchsize / runtime_s + res["DRAM_in_bandwidth[MB/s]"] = i_bytes * 0.000001 / runtime_s + res["DRAM_out_bandwidth[MB/s]"] = o_bytes * 0.000001 / runtime_s + res["fclk[mhz]"] = fclk_mhz + res["N"] = batchsize + + return res diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 417a505898..2c7c86c64e 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -28,6 +28,8 @@ from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch +from finn.custom_op.fpgadataflow.checksum import CheckSum +from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) @@ -41,6 +43,7 @@ from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup +from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, @@ -48,15 +51,12 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) -from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from 
finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch -from finn.custom_op.fpgadataflow.vector_vector_activate_batch import ( - Vector_Vector_Activate_Batch, -) +from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation custom_op = dict() @@ -64,7 +64,7 @@ # registered and plug in correctly into the infrastructure custom_op["DownSampler"] = DownSampler custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch -custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch +custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["TLastMarker"] = TLastMarker @@ -77,9 +77,11 @@ custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch -custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch +custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch custom_op["Lookup"] = Lookup +custom_op["StreamingConcat"] = StreamingConcat +custom_op["CheckSum"] = CheckSum diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py index fa80e47485..13a4c5892c 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py @@ -29,8 +29,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -56,7 +56,7 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) ishape = tuple(vecs + [ich]) @@ -166,7 +166,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -211,9 +210,8 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -340,14 +338,22 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=in1") - 
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() - intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded())) + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py index 4961f61482..3ed76db298 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -30,8 +30,8 @@ import os import warnings from math import ceil +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, @@ -51,7 +51,7 @@ def get_smallest_possible(vals): """Returns smallest (fewest bits) possible DataType that can represent value. Prefers unsigned integers where possible.""" - vals = np.array(vals) + vals = np.array(vals, dtype=np.float64) for v in vals: assert int(v) == v, "Error float value" @@ -350,13 +350,13 @@ def generate_params(self, model, path): # get desired function func = self.get_nodeattr("Func") if func == "cmp_le": - func_str = "comp::less_equal" + func_str = "comp::less_equal<%s, %s>" % (idt_hls, pdt_hls) elif func == "cmp_ge": - func_str = "std::greater_equal" + func_str = "comp::greater_equal<%s, %s>" % (idt_hls, pdt_hls) elif func == "add": - func_str = "std::plus" + func_str = "comp::add<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) elif func == "mul": - func_str = "std::multiplies" + func_str = "comp::mul<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) else: raise Exception( """Invalid value for attribute Func! Is currently set to: {} @@ -373,7 +373,7 @@ def generate_params(self, model, path): idt_hls, pdt_hls, odt_hls, - "%s<%s>" % (func_str, odt_hls), + func_str, ) ) f_params.write(parameters_hls_code) @@ -431,11 +431,8 @@ def execute_node(self, context, graph): out = 2 * out - 1 context[node.output[0]] = out assert ( - context[node.output[0]].shape == self.get_folded_output_shape() + context[node.output[0]].shape == self.get_normal_output_shape() ), """Output shape is not as expected""" - # reshape output to have expected shape - oshape = self.get_normal_output_shape() - context[node.output[0]] = context[node.output[0]].reshape(*oshape) elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -514,18 +511,15 @@ def docompute(self): # should ImgDim be defined or just filled in here like we do now? 
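+        # the two spatial dims previously passed separately (ImgDim_h, ImgDim_w)
+        # are collapsed into a single spatial_dim = ImgDim_h * ImgDim_w to match
+        # the updated Thresholding_Batch template signature used below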
ishape = self.get_folded_input_shape() if len(ishape) == 3: - imgdim_h = 1 - imgdim_w = 1 + spatial_dim = 1 elif len(ishape) == 5: - imgdim_h = ishape[1] - imgdim_w = ishape[2] + spatial_dim = ishape[1] * ishape[2] else: raise Exception("""Unexpeted input shape""") self.code_gen_dict["$DOCOMPUTE$"] = [ - """Thresholding_Batch<{}, {}, NumChannels1, PE1, {}, {}> + """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> (in0, out, threshs, numReps);""".format( - imgdim_h, - imgdim_w, + spatial_dim, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) @@ -574,8 +568,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py new file mode 100644 index 0000000000..bde285eb0d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/checksum.py @@ -0,0 +1,333 @@ +# Copyright (c) 2022, Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
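As a sanity check on the metrics assembled by throughput_test_rtlsim above, the arithmetic with made-up numbers (a 100-image batch taking 50000 cycles at a 5 ns clock):

```python
cycles = 50000      # reported by rtlsim via "cycles_rtlsim"
clk_ns = 5.0        # clock period from the "clk_ns" metadata property
batchsize = 100

runtime_s = (cycles * clk_ns) * (10**-9)   # 2.5e-4 s
fclk_mhz = 1 / (clk_ns * 0.001)            # 200.0 MHz
throughput = batchsize / runtime_s         # 400000.0 images/s
print(runtime_s, fclk_mhz, throughput)
```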
+ +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class CheckSum(HLSCustomOp): + """Class that corresponds to custom_hls checksum function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + # number of data words in a frame + "words_per_frame": ("i", True, 0), + # subword count per data word + "items_per_word": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # folded shape of input/output + "folded_shape": ("ints", True, []), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + # here same as input data type + return DataType[self.get_nodeattr("inputDataType")] + + def get_instream_width(self): + dtype = DataType[self.get_nodeattr("inputDataType")] + folded_shape = self.get_nodeattr("folded_shape") + in_width = folded_shape[-1] * dtype.bitwidth() + return in_width + + def get_outstream_width(self): + return self.get_instream_width() + + def get_folded_input_shape(self): + return self.get_nodeattr("folded_shape") + + def get_folded_output_shape(self): + return self.get_nodeattr("folded_shape") + + def get_normal_input_shape(self): + # derive normal shape from folded shape + # checksum nodes are inserted in between fpgadataflow nodes + # the folded shape could be for example (1, nf, pe) + # with nf (neuron folding): mh // pe + # the normal input shape is in this case (1, mh) + # so to achieve this the two inner dimensions are multiplied + # and together with all previous dimensions + # this gives the normal input shape + + folded_shape = self.get_nodeattr("folded_shape") + # extract inner dimension + inner_dim = folded_shape[-1] + # multiply with the next inner dimension + folding_factor = folded_shape[-2] * inner_dim + normal_ishape = [] + # create the normal_ishape + for i in range(len(folded_shape) - 2): + normal_ishape.append(folded_shape[i]) + normal_ishape.append(folding_factor) + + return normal_ishape + + def get_ap_int_max_w(self): + return max(super().get_ap_int_max_w(), 32) + + def get_normal_output_shape(self): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def npy_to_dynamic_output(self, context): + super().npy_to_dynamic_output(context) + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + output_checksum = 
np.load("{}/output_checksum.npy".format(code_gen_dir)) + context[node.output[1]] = output_checksum + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + inp = context[node.input[0]] + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + if mode == "cppsim": + self.dynamic_input_to_npy(context, 1) + self.exec_precompiled_singlenode_model() + self.npy_to_dynamic_output(context) + elif mode == "rtlsim": + # create a npy file for the input of the node + assert ( + str(inp.dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = inp.reshape(expected_inp_shape) + if DataType[self.get_nodeattr("inputDataType")] == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = DataType[self.get_nodeattr("inputDataType")] + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {}
+                has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "checksum.hpp"']
+
+    def defines(self, var):
+        items_per_word = self.get_nodeattr("items_per_word")
+        words_per_frame = self.get_nodeattr("words_per_frame")
+        word_size = self.get_instream_width()
+        my_defines = []
+        my_defines.append("#define WORDS_PER_FRAME {}".format(words_per_frame))
+        my_defines.append("#define ITEMS_PER_WORD {}".format(items_per_word))
+        my_defines.append("#define WORD_SIZE {}".format(word_size))
+        self.code_gen_dict["$DEFINES$"] = my_defines
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<32> chk;")
+        # set drain = false for cppsim
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<1> drain = false;")
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """checksum<WORDS_PER_FRAME, ITEMS_PER_WORD>(in0, out, chk, drain);"""
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = tuple(self.get_folded_output_shape())
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            ),
+            "std::vector<unsigned int> checksum(1);",
+            "checksum[0] = chk;",
+            'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");'
+            % code_gen_dir,
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """using T = ap_uint<WORD_SIZE>;\n void {}(hls::stream<T> &in0,
+            hls::stream<T> &out, ap_uint<32> &chk, ap_uint<1> &drain)""".format(
+                self.onnx_node.name
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS interface axis port=in0 name=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS interface axis port=out name=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS interface s_axilite port=chk bundle=checksum"
+        )
+
self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS interface s_axilite port=drain bundle=checksum" + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS interface ap_ctrl_none port=return" + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS dataflow disable_start_propagation" + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + # expose axilite interface + intf_names["axilite"] = ["s_axi_checksum"] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py new file mode 100644 index 0000000000..5fcf9cf96c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -0,0 +1,376 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingConcat(HLSCustomOp): + """Streaming concatenation node with dynamically generated HLS. 
+    Only supports concatenating along the last axis."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # number of elements from each stream to concat
+            "ElemsPerStream": ("ints", True, []),
+            # FINN DataTypes for inputs; output datatype inferred from input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors for non-concat axes, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_n_inputs(self):
+        return len(self.get_nodeattr("ElemsPerStream"))
+
+    def get_total_elems(self):
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        return int(np.sum(elems_per_stream))
+
+    def get_normal_input_shape(self, ind=0):
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        elems = elems_per_stream[ind]
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [elems])
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        return self.get_normal_input_shape(ind)
+
+    def get_normal_output_shape(self):
+        total_elems = self.get_total_elems()
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        return tuple(vecs + [total_elems])
+
+    def get_folded_output_shape(self):
+        return self.get_normal_output_shape()
+
+    def make_shape_compatible_op(self, model):
+        # check all input shapes
+        for i, inp in enumerate(self.onnx_node.input):
+            exp_ishape = self.get_normal_input_shape(i)
+            ishape = tuple(model.get_tensor_shape(inp))
+            assert ishape == exp_ishape, "Unexpected shape for " + inp
+        oshape = self.get_normal_output_shape()
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        # check all input datatypes
+        for i, inp in enumerate(self.onnx_node.input):
+            idt = model.get_tensor_datatype(inp)
+            assert idt == self.get_input_datatype()
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        # input dt identical for all inputs
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        elems = elems_per_stream[ind]
+        ibits = self.get_input_datatype().bitwidth()
+        return elems * ibits
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        total_elems = self.get_total_elems()
+        out_width = total_elems * obits
+        return out_width
+
+    def get_number_output_values(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_exp_cycles(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def generate_params(self, model, path):
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        inp_streams = []
+        commands = []
+        idt = self.get_input_datatype()
+        total_elems = self.get_total_elems()
+        total_bw = idt.bitwidth() * total_elems
+        for (i, elems) in enumerate(elems_per_stream):
+            bw = idt.bitwidth() * elems
+            inp_stream = "hls::stream<ap_uint<%d> > &in%d" % (bw, i)
+            inp_streams.append(inp_stream)
+            cmd = "in%d.read()" % i
+            commands.append(cmd)
+        out_stream = "hls::stream<ap_uint<%d> > &out" % (total_bw)
+        inp_streams.append(out_stream)
+
+        impl_hls_code = []
+        impl_hls_code.append("void StreamingConcat(")
impl_hls_code.append(",".join(inp_streams)) + impl_hls_code.append(", unsigned int numReps) {") + impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") + impl_hls_code.append("#pragma HLS PIPELINE II=1") + impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) + # FIXME: the order of streams for concatenation works out differently + # for cppsim vs rtlsim, addressed via reversing the order of commands + # for now + impl_hls_code.append("#ifdef __SYNTHESIS__") + impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") + impl_hls_code.append("#else") + impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") + impl_hls_code.append("#endif") + impl_hls_code.append("out.write(out_elem);") + impl_hls_code.append("}") + impl_hls_code.append("}") + impl_hls_code = "\n".join(impl_hls_code) + + impl_filename = "{}/concat_impl.hpp".format(path) + f_impl = open(impl_filename, "w") + f_impl.write(impl_hls_code) + f_impl.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + n_inps = len(self.onnx_node.input) + ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] + folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + export_idt = self.get_input_datatype() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + for i in range(n_inps): + inp = context[node.input[i]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] + # reshape input into folded form + inp = inp.reshape(folded_ishapes[i]) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + io_dict = {"inputs": {}, "outputs": {"out": []}} + for i in range(n_inps): + nbits = self.get_instream_width(i) + rtlsim_inp = npy_to_rtlsim_input( + "%s/input_%d.npy" % (code_gen_dir, i), + export_idt, + nbits, + reverse_inner=True, + ) + io_dict["inputs"]["in%d" % i] = rtlsim_inp + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + self.rtlsim_multi_io(sim, io_dict) + rtlsim_output = io_dict["outputs"]["out"] + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value 
for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                mode
+            )
+        )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']
+
+    def defines(self, var):
+        num_reps = self.get_nodeattr("numInputVectors")
+        num_reps = np.prod(num_reps)
+        self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps]
+
+    def read_npy_data(self):
+        n_inputs = self.get_n_inputs()
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        idt = self.get_input_datatype()
+        idt_bw = idt.bitwidth()
+        elem_hls_type = idt.get_hls_datatype_str()
+        elem_bits = idt_bw
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            npy_in = "%s/input_%d.npy" % (code_gen_dir, i)
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", in%d);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, i)
+            )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        n_inputs = self.get_n_inputs()
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            stream_name = "in%d" % i
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<%s> %s ("%s");'
+                % (packed_hls_type, stream_name, stream_name)
+            )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        n_inputs = self.get_n_inputs()
+        in_stream_names = ["in%d" % x for x in range(n_inputs)]
+        in_stream_names = ",".join(in_stream_names)
+        comp_call = "StreamingConcat(%s, out, NumReps);" % (in_stream_names)
+        self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        n_inputs = self.get_n_inputs()
+        in_streams = []
+        for i in range(n_inputs):
+            iwidth = self.get_instream_width(i)
+            in_streams.append("hls::stream<ap_uint<%d>> &in%d" % (iwidth, i))
+        in_streams = ",".join(in_streams)
+        total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
+        out_stream = "hls::stream<ap_uint<%d>> &out" % (total_width)
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        n_inputs = self.get_n_inputs()
+        pragmas = []
+        for i in range(n_inputs):
+            pragmas.append(
+                "#pragma HLS INTERFACE axis port=in%d name=in%d_%s"
+                % (i, i, self.hls_sname())
+            )
+        self.code_gen_dict["$PRAGMAS$"] = pragmas
+        self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def get_instream_width_padded(self, ind=0): + in_width = self.get_instream_width(ind) + return roundup_to_integer_multiple(in_width, 8) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + n_inputs = self.get_n_inputs() + sname = self.hls_sname() + intf_names["s_axis"] = [] + for i in range(n_inputs): + intf_names["s_axis"].append( + ("in%d_%s" % (i, sname), self.get_instream_width_padded(i)) + ) + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index a401883684..251a9882c5 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -29,10 +29,10 @@ import math import numpy as np import os +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.custom_op.general.im2col import compute_conv_output_dim from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator: @@ -286,7 +286,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() # TODO ensure codegen dir exists if mode == "cppsim": @@ -325,10 +324,9 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -489,8 +487,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py index e43d73b1cd..aba74baecc 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py @@ -29,10 +29,11 @@ import math import numpy as np import os +import warnings +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.custom_op.general.im2col import compute_conv_output_dim from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # This operation should only be used 
for 1D convolutions. Either the @@ -85,6 +86,7 @@ def get_nodeattr_types(self): "distributed", {"auto", "block", "distributed", "ultra"}, ), + "parallel_window": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -181,18 +183,36 @@ def get_number_output_values(self): num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems + def get_swu_variant(self): + # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used + # We have 5 variants: ConvolutionInputGenerator_1D_parallel, + # ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D, + # ConvolutionInputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride + is_dws = self.get_nodeattr("depthwise") + is_strided = np.prod(self.get_nodeattr("Stride")) > 1 + is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 + is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 + if self.use_parallel_window_output(): + return "ConvolutionInputGenerator_1D_parallel" + if not is_dws: + return "ConvolutionInputGenerator_1D" + if is_dws: + if (is_strided and not is_stride_2) or (is_dilated): + return "ConvolutionInputGenerator_1D_dws_naive" + elif is_stride_2: + return "ConvolutionInputGenerator_1D_dws_stride" + else: + return "ConvolutionInputGenerator_1D_dws" + def get_1d_conv_attrs_normalized(self): # support both (1, D) and (D, 1) cases transparently: # For the kernel, presenting the input data of size D as # [H, W] = [Y, X] = [1, D] or [D, 1] - # effectively gives the same result. Because the - # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only - # supports dilation>1 along the X-axis and the - # ConvolutionInputGenerator_NonSquare only works for stride>1 along the - # X-axis, we are working with the following assumption: - # the dummy ('1') dimension is the Y-dimension, i.e. - # images and kernels (and their attributes) of dimension - # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + # effectively gives the same result. + # For consistency and ease of programming, this function + # returns the attributes of the layer as follows: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. 
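# Aside, not part of the patch: the decision table in get_swu_variant() above,
# condensed into a self-contained sketch (plain arguments stand in for the
# node-attribute API; pick_swu_variant is a hypothetical helper name).
import numpy as np

def pick_swu_variant(depthwise, stride, dilation, parallel_ok):
    # mirrors get_swu_variant(): parallel window first, then the
    # non-depthwise case, then the three depthwise cases keyed on
    # total stride and dilation
    if parallel_ok:
        return "ConvolutionInputGenerator_1D_parallel"
    if not depthwise:
        return "ConvolutionInputGenerator_1D"
    s = np.prod(stride)
    if (s > 1 and s != 2) or np.prod(dilation) > 1:
        return "ConvolutionInputGenerator_1D_dws_naive"
    return (
        "ConvolutionInputGenerator_1D_dws_stride"
        if s == 2
        else "ConvolutionInputGenerator_1D_dws"
    )

# e.g. a depthwise kernel with stride (1, 2) and no dilation:
assert pick_swu_variant(True, (1, 2), (1, 1), False) == (
    "ConvolutionInputGenerator_1D_dws_stride"
)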
ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") @@ -217,56 +237,94 @@ def use_parallel_window_output(self): dilation = self.get_nodeattr("Dilation") stride_h, stride_w = stride dilation_h, dilation_w = dilation + ram_style = self.get_nodeattr("ram_style") - if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"): - if self.get_nodeattr("depthwise") == 0: - if stride_h == 1 and stride_w == 1: - if dilation_h == 1 and dilation_w == 1: - return True - - return False + fully_unfolded = self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels") + non_dws = self.get_nodeattr("depthwise") == 0 + no_stride = stride_h == 1 and stride_w == 1 + no_dilation = dilation_h == 1 and dilation_w == 1 + supported_ram_style = ram_style in ["auto", "distributed"] + if self.get_nodeattr("parallel_window") == 1: + if ( + fully_unfolded + and non_dws + and no_stride + and no_dilation + and supported_ram_style + ): + return True + else: + warnings.warn( + "{}: Parallel window output variant is not supported for this node,\ + please inspect requirements in use_parallel_window_output method\ + of the custom_op".format( + self.onnx_node.name + ) + ) + return False + else: + return False def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") ( ifm_ch, - ifm_dim, - ofm_dim, - k, - stride, - dilation, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], ) = self.get_1d_conv_attrs_normalized() - ifm_dim_h, ifm_dim_w = ifm_dim - ofm_dim_h, ofm_dim_w = ofm_dim - k_h, k_w = k - stride_h, stride_w = stride - dilation_h, dilation_w = dilation # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 + # mmv = 1 # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - if self.use_parallel_window_output(): + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": exp_cycles = k_w + ofm_dim_w - else: - cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv - cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) - max_cycles = max(cycles_write_block, cycles_read_block) + elif swu_variant == "ConvolutionInputGenerator_1D": + exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: exp_cycles = ( - ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + 1 + + ofm_dim_w * k_w * ifm_ch / simd + + (ifm_ch / simd) * (k_w - 1) + - (k_w - 1) ) + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + cycles_read_block = ifm_dim_w * ifm_ch / simd + cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd + exp_cycles = cycles_read_block + cycles_write_block return int(exp_cycles) def bram_estimation(self): - # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + return 0 if ram_style == "block" or ram_style == "auto": - ram_depth = ifm_dim * ifm_ch / simd + if 
swu_variant == "ConvolutionInputGenerator_1D": + ram_depth = (k_w - 1) * ifm_ch / simd + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + ram_depth = ifm_dim_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + ram_depth = k_w * ifm_ch / simd if ram_depth <= 512: ram_width = 36 elif ram_depth <= 1024: @@ -279,53 +337,80 @@ def bram_estimation(self): ram_width = 2 else: ram_width = 1 - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) - * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) - ) + width_mul = math.ceil( + simd * self.get_input_datatype().bitwidth() / ram_width ) + depth_mul = math.ceil(ram_depth / 18432) + return width_mul * depth_mul else: return 0 def lut_estimation(self): - # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") - if ram_style == "distributed": - ram_luts = int( - (k + stride) - * ( - simd - * self.get_input_datatype().bitwidth() - * math.ceil(ifm_dim * ifm_ch / simd / 64) - ) + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + ram_luts = math.ceil( + simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64 ) + elif ram_style == "distributed": + if swu_variant == "ConvolutionInputGenerator_1D": + ram_luts = math.ceil( + self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64 + ) + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + ram_luts = math.ceil( + self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64 + ) + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + ram_luts = math.ceil( + self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64 + ) else: ram_luts = 0 return 300 + ram_luts def uram_estimation(self): - # NOTE: not tested for correctness simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = np.prod(self.get_nodeattr("IFMDim")) - k = np.prod(self.get_nodeattr("ConvKernelDim")) - stride = np.prod(self.get_nodeattr("Stride")) + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 64) - * math.ceil(ifm_dim * ifm_ch / simd / 4096) - ) - ) + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + return 0 + elif ram_style == "ultra": + if swu_variant == "ConvolutionInputGenerator_1D": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant in [ + 
"ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) + return width_mul * depth_mul else: return 0 @@ -335,7 +420,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() # TODO ensure codegen dir exists if mode == "cppsim": @@ -374,10 +458,9 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -423,89 +506,83 @@ def defines(self, var): numReps = 1 ( ifm_ch, - ifm_dim, - ofm_dim, - k, - stride, - dilation, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], ) = self.get_1d_conv_attrs_normalized() simd = self.get_nodeattr("SIMD") ifm_precision = self.get_input_datatype().bitwidth() - ifm_dim_y, ifm_dim_x = ifm_dim - ofm_dim_y, ofm_dim_x = ofm_dim - k_y, k_x = k - dilation_y, dilation_x = dilation - # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function - # of ConvInpGen must be created with [stride_y, stride_x] = [S, S]. - # TODO: changes in finn-hlslib (slidingwindow.h) - stride_y = np.prod(stride) - stride_x = np.prod(stride) - - if dilation_x > 1: - assert ( - dilation_y == 1 - ), "Dilation value greater than 1 along y-axis is not yet supported" + swu_variant = self.get_swu_variant() + + if swu_variant in [ + "ConvolutionInputGenerator_1D_parallel", + "ConvolutionInputGenerator_1D", + "ConvolutionInputGenerator_1D_dws_stride", + ]: self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n - #define ConvKernelDim1_y {}\n #define IFMChannels1 {}\n #define Input_precision1 {}\n #define IFMDim1_x {}\n - #define IFMDim1_y {}\n #define OFMDim1_x {}\n - #define OFMDim1_y {}\n - #define SIMD1 {}\n #define Stride1_x {}\n - #define Stride1_y {}\n - #define Dilation1_x {}\n - #define Dilation1_y {}\n + #define SIMD1 {}\n #define numReps {} """.format( - k_x, - k_y, + k_w, ifm_ch, ifm_precision, - ifm_dim_x, - ifm_dim_y, - ofm_dim_x, - ofm_dim_y, + ifm_dim_w, + ofm_dim_w, + stride_w, simd, - stride_x, - stride_y, - dilation_x, - dilation_y, numReps, ) ] - else: - ofm_dim = self.get_nodeattr("OFMDim") + if swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n - #define ConvKernelDim1_y {}\n #define IFMChannels1 {}\n #define Input_precision1 {}\n #define IFMDim1_x {}\n - #define IFMDim1_y {}\n #define OFMDim1_x {}\n - #define OFMDim1_y {}\n #define SIMD1 {}\n + #define numReps {} + """.format( + k_w, + ifm_ch, + ifm_precision, + ifm_dim_w, + ofm_dim_w, + simd, + numReps, + ) + ] + if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + self.code_gen_dict["$DEFINES$"] = [ + """ + #define ConvKernelDim1_x {}\n + #define IFMChannels1 {}\n + #define Input_precision1 {}\n + #define IFMDim1_x {}\n + #define OFMDim1_x {}\n #define Stride1_x {}\n - #define Stride1_y {}\n + #define Dilation1_x {}\n + #define 
SIMD1 {}\n #define numReps {} """.format( - k_x, - k_y, + k_w, ifm_ch, ifm_precision, - ifm_dim_x, - ifm_dim_y, - ofm_dim_x, - ofm_dim_y, + ifm_dim_w, + ofm_dim_w, + stride_w, + dilation_w, simd, - stride_x, - stride_y, numReps, ) ] @@ -546,49 +623,49 @@ def docompute(self): "ultra": "ap_resource_uram()", } hls_ram_style = map_to_hls_ram_style[ram_style] + swu_variant = self.get_swu_variant() # check which ConvolutionInputGenerator is needed - if self.use_parallel_window_output(): - hls_call = "ConvolutionInputGenerator_1D_parallel" + if swu_variant == "ConvolutionInputGenerator_1D_parallel": self.code_gen_dict["$DOCOMPUTE$"] = [ """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1, + IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1> (in0, out, numReps, {});""".format( - hls_call, hls_ram_style + swu_variant, hls_ram_style + ) + ] + if swu_variant == "ConvolutionInputGenerator_1D": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1, + IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1> (in0, out, numReps, {});""".format( + swu_variant, hls_ram_style + ) + ] + if swu_variant == "ConvolutionInputGenerator_1D_dws": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1, + IFMDim1_x, OFMDim1_x, SIMD1> (in0, out, numReps, {});""".format( + swu_variant, hls_ram_style + ) + ] + if swu_variant == "ConvolutionInputGenerator_1D_dws_stride": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1, + IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1> (in0, out, numReps, {});""".format( + swu_variant, hls_ram_style + ) + ] + if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1, + IFMDim1_x, OFMDim1_x, Stride1_x, Dilation1_x, SIMD1> (in0, out, numReps, {});""".format( + swu_variant, hls_ram_style ) ] - else: - hls_call = "ConvolutionInputGenerator_NonSquare" - dilation_h, dilation_w = self.get_nodeattr("Dilation") - if dilation_h > 1 or dilation_w > 1: - hls_call += "_Dilated" - if self.get_nodeattr("depthwise") == 1: - hls_call += "_dws" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, - IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, - Stride1_x, Stride1_y, Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format( - hls_call, hls_ram_style - ) - ] - elif self.get_nodeattr("depthwise") == 1: - hls_call += "_dws" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, - IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, - Stride1_x, Stride1_y> (in0, out, numReps, {});""".format( - hls_call, hls_ram_style - ) - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, - IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, - Stride1_x, Stride1_y> (in0, out, numReps, {});""".format( - hls_call, hls_ram_style - ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -646,8 +723,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index 124b3e4645..da29a524b6 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -29,8 +29,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -248,8 +248,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis 
port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -260,7 +264,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -291,9 +294,8 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py index 3b0fa55b00..04ca45e7f1 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -29,9 +29,8 @@ import numpy as np import os import warnings -from onnx import TensorProto, helper +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -46,6 +45,8 @@ def get_nodeattr_types(self): my_attrs = { "NumChannels": ("i", True, 0), "PE": ("i", True, 0), + # how many duplicated output streams to create + "NumOutputStreams": ("i", True, 0), # FINN DataTypes for input "inputDataType": ("s", True, ""), # number of input vectors, examples: @@ -57,6 +58,9 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_num_output_streams(self): + return self.get_nodeattr("NumOutputStreams") + def get_normal_input_shape(self): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) @@ -72,36 +76,27 @@ def get_folded_input_shape(self): folded_ishape = tuple(vecs + [folds, pe]) return folded_ishape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index return self.get_normal_input_shape() - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index return self.get_folded_input_shape() def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) assert ishape == exp_ishape, "Unexpected input shape." 
+ num_out = self.get_num_output_streams() + assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" oshape = self.get_normal_output_shape() - values = np.zeros(oshape).astype(np.float32) - split_input = np.concatenate((values, values), axis=0) - - split_in = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape - ) - - model.graph.value_info.append(split_in) # requires clean up - model.set_initializer(split_in.name, split_input) - - shape_comp_node = helper.make_node( - "Split", - inputs=[split_in.name], - outputs=[self.onnx_node.output[0], self.onnx_node.output[1]], - axis=0, - ) - - return shape_comp_node + ret = super().make_const_shape_op(oshape) + ret.output[:] = self.onnx_node.output + return ret @@ -115,8 +110,8 @@ def infer_node_datatype(self, model): warnings.warn(warn_str) self.set_nodeattr("inputDataType", idt.name) odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - model.set_tensor_datatype(self.onnx_node.output[1], odt) + for my_out in self.onnx_node.output: + model.set_tensor_datatype(my_out, odt) @@ -133,6 +128,7 @@ def verify_node(self): self.get_nodeattr("executable_path") self.get_nodeattr("NumChannels") self.get_nodeattr("PE") + self.get_nodeattr("NumOutputStreams") self.get_nodeattr("inputDataType") info_messages.append("All necessary attributes exist") except Exception: @@ -165,19 +161,53 @@ def get_outstream_width(self): return out_width def get_number_output_values(self): - return 2 * np.prod(self.get_folded_output_shape()[1:-1]) + return self.get_num_output_streams() * np.prod( + self.get_folded_output_shape()[1:-1] + ) def get_exp_cycles(self): # Channels/PE * batch size * fmdim * fmdim return np.prod(self.get_folded_output_shape()[:-1]) + def generate_params(self, model, path): + n_outputs = self.get_num_output_streams() + inp_streams = [] + commands = [] + o_stream_w = self.get_outstream_width() + i_stream_w = self.get_instream_width() + in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w) + inp_streams.append(in_stream) + commands.append("ap_uint<%d> e = in0.read();" % i_stream_w) + iters = self.get_number_output_values() // self.get_num_output_streams() + for i in range(n_outputs): + out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i) + inp_streams.append(out_stream) + cmd = "out%d.write(e);" % i + commands.append(cmd) + + impl_hls_code = [] + impl_hls_code.append("void DuplicateStreamsCustom(") + impl_hls_code.append(",".join(inp_streams)) + impl_hls_code.append(") {") + impl_hls_code.append("for(unsigned int i = 0; i < %d; i++) {" % iters) + impl_hls_code.append("#pragma HLS PIPELINE II=1") + impl_hls_code.append("\n".join(commands)) + impl_hls_code.append("}") + impl_hls_code.append("}") + impl_hls_code = "\n".join(impl_hls_code) + + impl_filename = "{}/duplicate_impl.hpp".format(path) + f_impl = open(impl_filename, "w") + f_impl.write(impl_hls_code) + f_impl.close() + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() + n_outputs = self.get_num_output_streams() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -205,17 +235,14 @@ def execute_node(self, context, graph): # execute the precompiled model 
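# For orientation, reconstructed by hand from the string-building code in
# generate_params() above (the 32-bit width, two outputs and trip count of 16
# are illustrative; the real values come from the node attributes), the
# emitted duplicate_impl.hpp looks roughly like:
#
#   void DuplicateStreamsCustom(hls::stream<ap_uint<32> > &in0,
#   hls::stream<ap_uint<32> > &out0,
#   hls::stream<ap_uint<32> > &out1) {
#   for(unsigned int i = 0; i < 16; i++) {
#   #pragma HLS PIPELINE II=1
#   ap_uint<32> e = in0.read();
#   out0.write(e);
#   out1.write(e);
#   }
#   }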
super().exec_precompiled_singlenode_model() # load output npy file - super().npy_to_dynamic_outputs(context, ["output0.npy", "output1.npy"]) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim \ - did not produce expected ofolded utput shape" - assert ( - context[node.output[1]].shape == folded_oshape - ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - context[node.output[1]] = context[node.output[1]].reshape(*exp_oshape) + super().npy_to_dynamic_outputs( + context, ["output%d.npy" % i for i in range(n_outputs)] + ) + for i in range(n_outputs): + assert ( + context[node.output[i]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -226,41 +253,30 @@ def execute_node(self, context, graph): super().toggle_clk(sim) rtlsim_dict = { "inputs": {"in0": rtlsim_inp}, - "outputs": {"out0": [], "out1": []}, + "outputs": {}, } + for i in range(n_outputs): + rtlsim_dict["outputs"]["out%d" % i] = [] self.rtlsim_multi_io(sim, rtlsim_dict) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() out_shape = self.get_folded_output_shape() + for i in range(n_outputs): + out_npy_path = "%s/output%d.npy" % (code_gen_dir, i) + rtlsim_output_to_npy( + rtlsim_dict["outputs"]["out%d" % i], + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + ) + # load and reshape output 0 + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[i]] = output - out_npy_path = "{}/output0.npy".format(code_gen_dir) - rtlsim_output_to_npy( - rtlsim_dict["outputs"]["out0"], - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - ) - # load and reshape output 0 - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - out_npy_path = "{}/output1.npy".format(code_gen_dir) - rtlsim_output_to_npy( - rtlsim_dict["outputs"]["out1"], - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - ) - # load and reshape output 1 - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[1]] = output else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} @@ -277,7 +293,7 @@ def execute_node(self, context, graph): ), """Output1 shape doesn't match expected shape.""" def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + self.code_gen_dict["$GLOBALS$"] = ['#include "duplicate_impl.hpp"'] def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] @@ -298,24 +314,23 @@ def read_npy_data(self): ) def strm_decl(self): + n_outputs = self.get_num_output_streams() self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> out0 ("out0");'.format(self.get_outstream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> out1 ("out1");'.format(self.get_outstream_width()) - ) + for i in range(n_outputs): + out_name = "out%d" % i + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<%d>> %s ("%s");' + % (self.get_outstream_width(), out_name, out_name) + ) def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format( - self.get_outstream_width(), - self.get_number_output_values() // 2, - ) - ] + n_outputs = self.get_num_output_streams() + ostreams = ["out%d" % x for x in range(n_outputs)] + dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams)) + self.code_gen_dict["$DOCOMPUTE$"] = [dc] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -325,62 +340,71 @@ def dataoutstrm(self): packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" - npy_out = "%s/output0.npy" % code_gen_dir - npy_out1 = "%s/output1.npy" % code_gen_dir + n_outputs = self.get_num_output_streams() oshape = self.get_folded_output_shape() oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out0, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, + outstrm_code = [] + + for i in range(n_outputs): + out_name = "out%d" % i + npy_out = "%s/output%d.npy" % (code_gen_dir, i) + outstrm_code.append( + 'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + out_name, + oshape_cpp_str, + npy_out, + ) ) - ] - self.code_gen_dict["$DATAOUTSTREAM$"] += [ - 'apintstream2npy<%s, %s, %d, %s>(out1, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out1, - ) - ] + self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code def save_as_npy(self): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): + n_outputs = self.get_num_output_streams() + inp_streams = [] + o_stream_w = self.get_outstream_width() + i_stream_w = self.get_instream_width() + in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w) + inp_streams.append(in_stream) + for i in range(n_outputs): + out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i) + inp_streams.append(out_stream) + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out0, - hls::stream<ap_uint<{}>> &out1)""".format( + """void {}({})""".format( self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), - self.get_outstream_width(), + ",".join(inp_streams), ) ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis 
port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0") - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out1") + n_outputs = self.get_num_output_streams() + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + for i in range(n_outputs): + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out%d name=out%d_%s" + % (i, i, self.hls_sname()) + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() - intf_names["m_axis"] = [ - ("out0_V_V", self.get_outstream_width_padded()), - ("out1_V_V", self.get_outstream_width_padded()), - ] + n_outputs = self.get_num_output_streams() + sname = self.hls_sname() + intf_names["m_axis"] = [] + for i in range(n_outputs): + intf_names["m_axis"].append( + ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) + ) return intf_names diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index 8ac30524eb..d69ea471ea 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -29,8 +29,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -312,8 +312,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -324,7 +328,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -355,9 +358,8 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index 6d4a55ee5c..adafa7dcf3 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -29,8 +29,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -185,7 +185,6 @@ 
def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -215,10 +214,9 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -331,8 +329,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 3aac7f6b45..9978ab0c71 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -25,25 +25,23 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# namespace package, extend path import numpy as np import os import subprocess from abc import abstractmethod +from pyverilator.util.axi_utils import rtlsim_multi_io +from qonnx.core.datatype import DataType +from qonnx.custom_op.base import CustomOp +from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.base import CustomOp from finn.util.basic import ( CppBuilder, get_rtlsim_trace_depth, make_build_dir, - roundup_to_integer_multiple, -) -from finn.util.hls import CallHLS -from finn.util.pyverilator import ( pyverilate_get_liveness_threshold_cycles, - rtlsim_multi_io, ) +from finn.util.hls import CallHLS from . import templates @@ -112,13 +110,15 @@ def get_nodeattr_types(self): # input and output FIFO depths "inFIFODepth": ("i", False, 2), "outFIFODepth": ("i", False, 2), + "output_hook": ("s", False, ""), } def get_verilog_top_module_name(self): "Return the Verilog top module name for this node." 
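# Aside with a hypothetical node name, not part of the patch: with the Vitis
# HLS stream suffix returned by hls_sname() ("V"), a node named
# "StreamingConcat_0" now yields the top module "StreamingConcat_0" and AXI
# stream interfaces "in0_V"/"out_V" (previously "in0_V_V"/"out_V_V" under the
# old Vivado HLS naming, with a doubled top module name).
def sketch_intf_names(node_name, sname="V"):
    # condensed from get_verilog_top_module_intf_names() for one input
    # stream and one output stream
    return {
        "top": node_name,
        "s_axis": ["in0_" + sname],
        "m_axis": ["out_" + sname],
    }

assert sketch_intf_names("StreamingConcat_0")["m_axis"] == ["out_V"]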
node = self.onnx_node - prefixed_top_name = "%s_%s" % (node.name, node.name) + prefixed_top_name = node.name + return prefixed_top_name def get_verilog_top_module_intf_names(self): @@ -133,8 +133,9 @@ def get_verilog_top_module_intf_names(self): intf_names = {} intf_names["clk"] = ["ap_clk"] intf_names["rst"] = ["ap_rst_n"] - intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())] - intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())] + sname = self.hls_sname() + intf_names["s_axis"] = [("in0_" + sname, self.get_instream_width_padded())] + intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] intf_names["aximm"] = [] intf_names["axilite"] = [] return intf_names @@ -290,10 +291,9 @@ def code_generation_ipgen(self, model, fpgapart, clk): self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] self.code_gen_dict["$FPGAPART$"] = [fpgapart] - self.code_gen_dict["$FINNHLSLIBDIR$"] = ["/workspace/finn-hlslib"] - self.code_gen_dict["$FINNHLSCUSTOMDIR$"] = ["/workspace/finn/custom_hls"] self.code_gen_dict["$TOPFXN$"] = [node.name] self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] + self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() template = self.ipgentcl_template @@ -308,13 +308,24 @@ def code_generation_ipgen(self, model, fpgapart, clk): f.close() self.code_gen_dict.clear() + def ipgen_default_directives(self): + """Return list of default HLS synthesis directives""" + + default_directives = [ + "set_param hls.enable_hidden_option_error false", + "config_compile -disable_unroll_code_size_check -pipeline_style flp", + "config_interface -m_axi_addr64", + "config_rtl -module_auto_prefix", + "config_rtl -deadlock_detection none", + ] + return default_directives + def ipgen_extra_directives(self): "Return a list of extra tcl directives for HLS synthesis." 
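# A backend subclass can override the hook below to append further tcl
# directives after the defaults listed above; a minimal sketch (the directive
# shown is just one valid Vitis HLS command, not something this patch adds):
#
#   def ipgen_extra_directives(self):
#       return ["config_compile -unsafe_math_optimizations"]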
return [] def ipgen_singlenode_code(self): - """Builds the bash script for ip generation using the CallHLS from - finn.util.hls.""" + """Builds the bash script for IP generation using the CallHLS utility.""" node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") builder = CallHLS() @@ -372,15 +383,15 @@ def compile_singlenode_code(self): builder = CppBuilder() # to enable additional debug features please uncomment the next line # builder.append_includes("-DDEBUG") - builder.append_includes("-I/workspace/finn/src/finn/qnn-data/cpp") - builder.append_includes("-I/workspace/cnpy/") - builder.append_includes("-I/workspace/finn-hlslib") - builder.append_includes("-I/workspace/finn/custom_hls") - builder.append_includes("-I{}/include".format(os.environ["VIVADO_PATH"])) - builder.append_includes("--std=c++11") + builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") + builder.append_includes("-I$FINN_ROOT/deps/cnpy/") + builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + builder.append_includes("-I$FINN_ROOT/custom_hls") + builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("--std=c++14") builder.append_includes("-O3") builder.append_sources(code_gen_dir + "/*.cpp") - builder.append_sources("/workspace/cnpy/cnpy.cpp") + builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") builder.append_includes("-lz") builder.set_executable_path(code_gen_dir + "/node_model") builder.build(code_gen_dir) @@ -402,10 +413,22 @@ def dynamic_input_to_npy(self, context, count): # assuming dynamic inputs start from 0 for in_ind in range(count): current_input_name = node.input[in_ind] - # make copy before saving array - input_array = context[current_input_name].copy() + input_array = context[current_input_name] + if in_ind == 0: + expected_inp_shape = self.get_folded_input_shape() + idt = self.get_input_datatype() + else: + expected_inp_shape = self.get_folded_input_shape(in_ind) + idt = self.get_input_datatype(in_ind) + reshaped_input = input_array.reshape(expected_inp_shape) + if idt == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + # make copy before saving the array + reshaped_input = reshaped_input.copy() np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), input_array + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, ) def npy_to_dynamic_output(self, context): @@ -414,7 +437,8 @@ def npy_to_dynamic_output(self, context): node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") output = np.load("{}/output.npy".format(code_gen_dir)) - context[node.output[0]] = output + exp_shape = self.get_normal_output_shape() + context[node.output[0]] = output.reshape(exp_shape) def npy_to_dynamic_outputs(self, context, npy_list): """Reads the output from .npy files generated from cppsim and places @@ -425,7 +449,11 @@ def npy_to_dynamic_outputs(self, context, npy_list): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") for i in range(len(npy_list)): output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) - context[node.output[i]] = output + if i == 0: + exp_shape = self.get_normal_output_shape() + else: + exp_shape = self.get_normal_output_shape(i) + context[node.output[i]] = output.reshape(exp_shape) def exec_precompiled_singlenode_model(self): """Executes precompiled executable.""" @@ -453,6 +481,12 @@ def toggle_clk(self, sim): sim.io.ap_clk = 1 sim.io.ap_clk = 0 + def hls_sname(self): + """Get the naming 
convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" + def rtlsim(self, sim, inp, inp2=None): """Runs the pyverilator simulation by passing the input values to the simulation, toggle the clock and observing the execution time. Function contains also an @@ -466,7 +500,18 @@ def rtlsim(self, sim, inp, inp2=None): sim.start_vcd_trace(trace_file) inputs = inp outputs = [] - sim.io.out_V_V_TREADY = 1 + sname = self.hls_sname() + o_ready = "out_" + sname + "_TREADY" + o_valid = "out_" + sname + "_TVALID" + o_data = "out_" + sname + "_TDATA" + in0_ready = "in0_" + sname + "_TREADY" + in0_valid = "in0_" + sname + "_TVALID" + in0_data = "in0_" + sname + "_TDATA" + in1_ready = "in1_" + sname + "_TREADY" + in1_valid = "in1_" + sname + "_TVALID" + in1_data = "in1_" + sname + "_TDATA" + + sim.io[o_ready] = 1 # observe if output is completely calculated # observation_count will contain the number of cycles the calculation ran @@ -481,19 +526,19 @@ def rtlsim(self, sim, inp, inp2=None): liveness_threshold = pyverilate_get_liveness_threshold_cycles() while not (output_observed): - sim.io.in0_V_V_TVALID = 1 if len(inputs) > 0 else 0 - sim.io.in0_V_V_TDATA = inputs[0] if len(inputs) > 0 else 0 - if sim.io.in0_V_V_TREADY == 1 and sim.io.in0_V_V_TVALID == 1: + sim.io[in0_valid] = 1 if len(inputs) > 0 else 0 + sim.io[in0_data] = inputs[0] if len(inputs) > 0 else 0 + if sim.io[in0_ready] == 1 and sim.io[in0_valid] == 1: inputs = inputs[1:] if inp2 is not None: - sim.io.in1_V_V_TVALID = 1 if len(inp2) > 0 else 0 - sim.io.in1_V_V_TDATA = inp2[0] if len(inp2) > 0 else 0 - if sim.io.in1_V_V_TREADY == 1 and sim.io.in1_V_V_TVALID == 1: + sim.io[in1_valid] = 1 if len(inp2) > 0 else 0 + sim.io[in1_data] = inp2[0] if len(inp2) > 0 else 0 + if sim.io[in1_ready] == 1 and sim.io[in1_valid] == 1: inp2 = inp2[1:] - if sim.io.out_V_V_TVALID == 1 and sim.io.out_V_V_TREADY == 1: - outputs = outputs + [sim.io.out_V_V_TDATA] + if sim.io[o_valid] == 1 and sim.io[o_ready] == 1: + outputs = outputs + [sim.io[o_data]] sim.io.ap_clk = 1 sim.io.ap_clk = 0 @@ -525,11 +570,21 @@ def rtlsim(self, sim, inp, inp2=None): def rtlsim_multi_io(self, sim, io_dict): "Run rtlsim for this node, supports multiple i/o streams." + # signal name + sname = "_" + self.hls_sname() + "_" + trace_file = self.get_nodeattr("rtlsim_trace") if trace_file == "default": trace_file = self.onnx_node.name + ".vcd" num_out_values = self.get_number_output_values() - total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) def execute_node(self, context, graph): @@ -580,7 +635,7 @@ def defines(self, var): be filled by every node. var: makes it possible to reuse the function for different c++ code generation. - I.e. if set to "ipgen" in StreamingFCLayer_Batch additional PRAGMA defines are + I.e. 
if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are added.""" pass diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py index 802c7e7851..33ee1d359c 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -29,8 +29,8 @@ import math import numpy as np import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # the IODMA inerfaces a memory-mapped AXI interface and an AXI stream @@ -83,11 +83,14 @@ def get_nodeattr_types(self): "NumChannels": ("i", True, 0), # FINN input datatype "dataType": ("s", True, ""), - # Stream parameters + # Width of input or output stream "streamWidth": ("i", False, 32), # DMA-specific parameters + # width of axi-mm interface "intfWidth": ("i", False, 32), + # burst mode for axi-mm interface (wrap used for DRAM weights) "burstMode": ("s", False, "increment", {"wrap", "increment"}), + # IODMA direction: in = read from DRAM, out = write to DRAM "direction": ("s", False, "in", {"in", "out"}), # shape describing input vecs per execution "numInputVectors": ("ints", False, [1]), @@ -224,20 +227,19 @@ def get_ap_int_max_w(self): def docompute(self): direction = self.get_nodeattr("direction") mode = self.get_nodeattr("burstMode") + dwc_func = "StreamingDataWidthConverter_Batch" if direction == "in": if mode == "wrap": func = "Mem2Stream_Batch_external_wmem" else: func = "Mem2Stream_Batch" - dwc_func = "WidthAdjustedOutputStream" elif direction == "out": func = "Stream2Mem_Batch" - dwc_func = "WidthAdjustedInputStream" else: raise ValueError("Invalid IODMA direction, please set to in or out") # define templates for instantiation dma_inst_template = func + "(%s, %s, numReps);" - dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);" + dwc_inst_template = dwc_func + "<%d, %d, %d>(%s, %s, numReps);" # do stream infrastructure and instantiations intfw = self.get_nodeattr("intfWidth") strmw = self.get_nodeattr("streamWidth") @@ -246,22 +248,65 @@ def docompute(self): # because we use WidthAdjustedInputStream, dtype_bits = self.get_input_datatype().bitwidth() total_bits = dtype_bits * np.prod(self.get_normal_input_shape()) + if direction == "in": - self.code_gen_dict["$DOCOMPUTE$"] = [ - dwc_inst_template - % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"), - dwc_inst_template - % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"), - dma_inst_template % ("in0", "dwc_intfw"), - ] + # AXI MM -> IODMA -> (DWCs) -> out + # DWCs depend on AXI MM and out interface width + if strmw == intfw: + # case 0: AXI MM width = out width, no DWCs needed + self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + elif (strmw % intfw == 0) or (intfw % strmw == 0): + # case 1: AXI MM width divisible by out width or vice versa + # single DWC + single extra stream needed + self.code_gen_dict["$DOCOMPUTE$"] = [ + "hls::stream > dma2dwc;" % intfw, + dma_inst_template % ("in0", "dma2dwc"), + dwc_inst_template + % (intfw, strmw, total_bits // intfw, "dma2dwc", "out"), + ] + else: + # case 2: AXI MM width not divisible by out width or vice versa + # need 2 DWCs (going through the least common multiple width) + # and 2 streams + self.code_gen_dict["$DOCOMPUTE$"] = [ + "hls::stream > dma2lcm;" % intfw, + "hls::stream > lcm2out;" % width_lcm, + dma_inst_template % ("in0", "dma2lcm"), + dwc_inst_template + % (intfw, width_lcm, total_bits // 
intfw, "dma2lcm", "lcm2out"), + dwc_inst_template + % (width_lcm, strmw, total_bits // width_lcm, "lcm2out", "out"), + ] + elif direction == "out": + # in0 -> (DWCs) -> IODMA -> AXI MM + # DWCs depend on AXI MM and out interface width + if strmw == intfw: + # case 0: in width = AXI MM width, no DWCs needed + self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + elif (strmw % intfw == 0) or (intfw % strmw == 0): + # case 1: AXI MM width divisible by in width or vice versa + # single DWC + single extra stream needed + self.code_gen_dict["$DOCOMPUTE$"] = [ + "hls::stream > dwc2dma;" % intfw, + dwc_inst_template + % (strmw, intfw, total_bits // strmw, "in0", "dwc2dma"), + dma_inst_template % ("dwc2dma", "out"), + ] + else: + # case 2: AXI MM width not divisible by out width or vice versa + # need 2 DWCs (going through the least common multiple width) + # and 2 streams + self.code_gen_dict["$DOCOMPUTE$"] = [ + "hls::stream > in2lcm;" % width_lcm, + "hls::stream > lcm2dma;" % intfw, + dwc_inst_template + % (strmw, width_lcm, total_bits // strmw, "in0", "in2lcm"), + dwc_inst_template + % (width_lcm, intfw, total_bits // width_lcm, "in2lcm", "lcm2dma"), + dma_inst_template % ("lcm2dma", "out"), + ] else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - dwc_inst_template - % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"), - dwc_inst_template - % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"), - dma_inst_template % ("dwc_intfw", "out"), - ] + raise Exception("Unknown IODMA direction: %s" % direction) def blackboxfunction(self): packed_ibits = self.get_instream_width() @@ -304,11 +349,11 @@ def pragmas(self): "#pragma HLS INTERFACE s_axilite port=in0 bundle=control" ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out" + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) elif direction == "out": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in0" + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() ) if intfname == "": self.code_gen_dict["$PRAGMAS$"].append( diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py index 1eb5962fdb..3e27ee0111 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py @@ -29,10 +29,10 @@ import numpy as np import os from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import roundup_to_integer_multiple from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -182,7 +182,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -212,10 +211,9 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + did not produce expected output shape" elif mode == 
"rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -342,8 +340,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index 27be06bdfa..d90fa0f05a 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -29,13 +29,14 @@ import numpy as np import os import warnings -from math import ceil +from math import ceil, log2 +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -58,6 +59,13 @@ def get_nodeattr_types(self): "InputType": ("s", True, ""), # Input shape "InputShape": ("ints", False, [1]), + # Memory mode + # const : parameters baked into bitfile (BRAM) + # external : lookup performed in external memory over AXI MM + "mem_mode": ("s", False, "const", ["const", "external"]), + # Width for AXI-MM interface + # only relevant when mem_mode="external" + "ext_mem_width": ("i", False, 32), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -72,7 +80,8 @@ def get_normal_input_shape(self): def get_normal_output_shape(self): ishape = self.get_normal_input_shape() - oshape = list(ishape) + [self.get_nodeattr("EmbeddingDim")] + emb_dim = self.get_nodeattr("EmbeddingDim") + oshape = list(ishape) + [emb_dim] return tuple(oshape) def get_folded_input_shape(self): @@ -81,7 +90,23 @@ def get_folded_input_shape(self): return tuple(folded_ishape) def get_folded_output_shape(self): - return self.get_normal_output_shape() + ishape = self.get_normal_input_shape() + mem_mode = self.get_nodeattr("mem_mode") + emb_dim = self.get_nodeattr("EmbeddingDim") + if mem_mode == "const": + oshape = list(ishape) + [emb_dim] + elif mem_mode == "external": + ext_mem_width = self.get_nodeattr("ext_mem_width") + bits_per_emb_elem = self.get_output_datatype().bitwidth() + assert ext_mem_width % bits_per_emb_elem == 0 + emb_elems_per_ext_mem_width = ext_mem_width // bits_per_emb_elem + oshape = list(ishape) + [ + emb_dim // emb_elems_per_ext_mem_width, + emb_elems_per_ext_mem_width, + ] + else: + raise Exception("Unrecognized mem_mode:" + mem_mode) + return tuple(oshape) def make_shape_compatible_op(self, model): exp_ishape = tuple(self.get_normal_input_shape()) @@ -123,17 +148,20 @@ def get_instream_width(self): return ibits def get_outstream_width(self): + folded_oshape = self.get_folded_output_shape() obits = self.get_output_datatype().bitwidth() - ofm_ch = self.get_nodeattr("EmbeddingDim") - return obits * ofm_ch + return obits * folded_oshape[-1] def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) def global_includes(self): - global_incls = ['#include "lookup.hpp"'] - global_incls.append('#include "embeddings.hpp"') + mem_mode = self.get_nodeattr("mem_mode") + global_incls = [] + if 
mem_mode == "const": + global_incls.append('#include "lookup.hpp"') + global_incls.append('#include "embeddings.hpp"') self.code_gen_dict["$GLOBALS$"] = global_incls def defines(self, var): @@ -142,14 +170,26 @@ def defines(self, var): elem_hls_type = dtype.get_hls_datatype_str() emb_type = DataType[self.get_nodeattr("EmbeddingType")] emb_hls_type = emb_type.get_hls_datatype_str() + emb_dim = self.get_nodeattr("EmbeddingDim") + mem_mode = self.get_nodeattr("mem_mode") my_defines = [] - my_defines.append( - "#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings") - ) - my_defines.append("#define EmbeddingDim %d" % self.get_nodeattr("EmbeddingDim")) my_defines.append("#define NumInputs %d" % n_inputs) - my_defines.append("#define InputType %s" % elem_hls_type) - my_defines.append("#define EmbeddingType %s" % emb_hls_type) + if mem_mode == "external": + ext_mem_width = self.get_nodeattr("ext_mem_width") + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + my_defines.append("#define MemBits %d" % ext_mem_width) + my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) + my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) + my_defines.append("#define T_SRC %s" % elem_hls_type) + my_defines.append("#define T_DST ap_uint") + elif mem_mode == "const": + my_defines.append( + "#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings") + ) + my_defines.append("#define EmbeddingDim %d" % emb_dim) + my_defines.append("#define InputType %s" % elem_hls_type) + my_defines.append("#define EmbeddingType %s" % emb_hls_type) self.code_gen_dict["$DEFINES$"] = my_defines def read_npy_data(self): @@ -186,7 +226,7 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", %s);' % ( packed_hls_type, elem_hls_type, @@ -194,6 +234,7 @@ def dataoutstrm(self): npy_type, oshape_cpp_str, npy_out, + "false", ) ] @@ -210,43 +251,115 @@ def strm_decl(self): ) def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup(in0, out, embeddings);""" - ] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup(in0, out, embeddings);""" + ] + elif mem_mode == "external": + hls_impl = """ + if(!in0.empty()) { + ap_uint const base = + (in0.read(), ap_uint(0)); + for(unsigned j = 0; j < EmbeddingSize; j++) { +#pragma HLS PIPELINE II=1 + out.write(mem[base+j]); + } + } + """ + self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl] def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") ibits = self.get_instream_width() packed_input_hls_type = "ap_uint<%d>" % ibits obits = self.get_outstream_width() packed_output_hls_type = "ap_uint<%d>" % obits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_input_hls_type, packed_output_hls_type) - ] + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_input_hls_type, packed_output_hls_type) + ] + elif mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void " + + self.onnx_node.name + + "(hls::stream &in0, hls::stream &out, " + + "T_DST const *const mem)" + ] def pragmas(self): - my_pragmas = ["#pragma HLS INTERFACE 
axis port=in0"] - my_pragmas.append("#pragma HLS INTERFACE axis port=out") + mem_mode = self.get_nodeattr("mem_mode") + my_pragmas = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + my_pragmas.append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if mem_mode == "const": + my_pragmas.append( + "#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM" + ) + elif mem_mode == "external": + my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) self.code_gen_dict["$PRAGMAS$"] = my_pragmas def generate_params(self, model, path): - code_gen_dir = path + mem_mode = self.get_nodeattr("mem_mode") embeddings = model.get_initializer(self.onnx_node.input[1]) - weight_filename = "{}/embeddings.hpp".format(code_gen_dir) - edt = DataType[self.get_nodeattr("EmbeddingType")] - # obits = self.get_outstream_width() - # packed_output_hls_type = "ap_uint<%d>" % obits - assert np.vectorize(edt.allowed)( - embeddings - ).all(), "Embeddings can't be expressed with type %s" % str(edt) - embeddings_hls_code = numpy_to_hls_code( - embeddings, edt, "embeddings", True, False - ) - f_thresh = open(weight_filename, "w") - f_thresh.write(embeddings_hls_code) - f_thresh.close() + if mem_mode == "const": + code_gen_dir = path + weight_filename = "{}/embeddings.hpp".format(code_gen_dir) + edt = DataType[self.get_nodeattr("EmbeddingType")] + # obits = self.get_outstream_width() + # packed_output_hls_type = "ap_uint<%d>" % obits + assert np.vectorize(edt.allowed)( + embeddings + ).all(), "Embeddings can't be expressed with type %s" % str(edt) + # reverse innertmost dim in embeddings to remain compatible with + # how we normally encode the data in FINN + embeddings_rev = np.flip(embeddings, -1) + embeddings_hls_code = numpy_to_hls_code( + embeddings_rev, edt, "embeddings", True, False + ) + f_thresh = open(weight_filename, "w") + f_thresh.write(embeddings_hls_code) + f_thresh.close() + elif mem_mode == "external": + edt = DataType[self.get_nodeattr("EmbeddingType")] + ext_mem_width = self.get_nodeattr("ext_mem_width") + assert edt.bitwidth() == 8, ( + "Lookup with mem_mode=external " + + "only works with 8-bit embeddings but found " + + str(edt) + ) + emb_dim = self.get_nodeattr("EmbeddingDim") + # need to zero-pad embeddings in external mode for burst alignment + # compute how much padding we need + emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) + pad_amount = align_factor - emb_dim + embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) + # reshape for packing the innermost dim + embeddings_padded = embeddings_padded.reshape( + -1, emb_elems_per_ext_mem_width + ) + weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) + ret = pack_innermost_dim_as_hex_string( + embeddings_padded, edt, ext_mem_width, True, prefix="" + ) + with open(weight_filename, "w") as f: + for current_line in ret: + f.write(current_line + "\n") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") @@ -255,6 +368,10 @@ def execute_node(self, context, graph): exp_oshape = 
tuple(self.get_normal_output_shape()) folded_ishape = tuple(self.get_folded_input_shape()) folded_oshape = tuple(self.get_folded_output_shape()) + mem_mode = self.get_nodeattr("mem_mode") + assert ( + mem_mode == "const" + ), "Only mem_mode=const is supported for simulation of Lookup layer" if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -306,7 +423,7 @@ def execute_node(self, context, graph): out_shape, packed_bits, target_bits, - reverse_inner=False, + reverse_inner=True, ) # load and reshape output output = np.load(out_npy_path) @@ -324,10 +441,16 @@ def execute_node(self, context, graph): ), """Output shape doesn't match expected shape.""" def bram_estimation(self): - # current calculation assumes embeddings always stored in BRAM_18Ks - width_factor = ceil(self.get_outstream_width() / 16) - depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024) - return width_factor * depth_factor + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + # current calculation assumes embeddings always stored in BRAM_18Ks + # when mem_mode is const + width_factor = ceil(self.get_outstream_width() / 16) + depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024) + return width_factor * depth_factor + else: + # TODO can we estimate BRAMs for the DMA engine? + return 0 def bram_efficiency_estimation(self): bram16_est = self.bram_estimation() @@ -336,3 +459,20 @@ def bram_efficiency_estimation(self): ebits = self.get_outstream_width() * self.get_nodeattr("NumEmbeddings") bram16_est_capacity = bram16_est * 18 * 1024 return ebits / bram16_est_capacity + + def get_ap_int_max_w(self): + parent_max = super().get_ap_int_max_w() + mem_mode = self.get_nodeattr("mem_mode") + ext_mem_width = self.get_nodeattr("ext_mem_width") + if mem_mode == "external": + return max(ext_mem_width, parent_max) + else: + return parent_max + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "external": + intf_names["axilite"] = ["s_axi_control"] + intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py similarity index 95% rename from src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py rename to src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 896e7c2925..9d2717dc8c 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -31,14 +31,14 @@ import os import textwrap import warnings - -from finn.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import ( +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -48,7 +48,7 @@ from . 
import templates -# ONNX i/o tensor shape assumptions for StreamingFCLayer: +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) # (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) @@ -56,8 +56,9 @@ # the ... here can be any shape (representing groups of vectors) -class StreamingFCLayer_Batch(HLSCustomOp): - """Class that corresponds to finn-hls StreamingFCLayer_Batch function.""" +class MatrixVectorActivation(HLSCustomOp): + """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch + function.""" def __init__(self, onnx_node): super().__init__(onnx_node) @@ -192,7 +193,7 @@ def verify_node(self): info_messages.append("All necessary attributes exist") except Exception: info_messages.append( - """The required StreamingFCLayer attributes do not exist.""" + """The required MatrixVectorActivation attributes do not exist.""" ) # verify the number of inputs depending on noActivation value @@ -204,7 +205,7 @@ def verify_node(self): info_messages.append("The number of inputs is correct") else: info_messages.append( - """StreamingFCLayer_Batch needs in no + """MatrixVectorActivation needs in no activation mode 2 inputs (data input and weights)""" ) elif no_act == 0: @@ -212,7 +213,7 @@ def verify_node(self): info_messages.append("The number of inputs is correct") else: info_messages.append( - """StreamingFCLayer_Batch needs 3 inputs + """MatrixVectorActivation needs 3 inputs (data input and weights and threshold values)""" ) else: @@ -393,9 +394,16 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) - def get_input_datatype(self): + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") def get_weight_datatype(self): """Returns FINN DataType of weights.""" @@ -811,16 +819,28 @@ def generate_params(self, model, path): self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) if mem_mode == "decoupled": # also save weights as Verilog .dat file - weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir) + # note that we provide two different .dat files, one for synth + and one for simulation. 
this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( + code_gen_dir + ) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl_sim + ) ram_style = self.get_nodeattr("ram_style") if ram_style == "ultra": # UltraRAM must have no memory initializer, or only zeroes # otherwise BRAM will be inferred instead of URAM # as a workaround we provide a zero-weight init here - # TODO handle this in Verilog with an if statement - weights = np.zeros_like(weights) + synth_weights = np.zeros_like(weights, dtype=np.float32) + else: + synth_weights = weights self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl + synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth ) else: raise Exception( @@ -871,7 +891,7 @@ def generate_params(self, model, path): tdt_hls, odt_hls, self.get_nodeattr("ActVal"), - "comp::less_equal<%s>" % tdt_hls, + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), ) ) f_thresh.write(thresholds_hls_code) @@ -921,7 +941,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for StreamingFCLayer") + raise Exception("Unexpected input found for MatrixVectorActivation") in_ind += 1 if mode == "cppsim": @@ -935,11 +955,8 @@ def execute_node(self, context, graph): out = 2 * out - 1 context[node.output[0]] = out assert ( - context[node.output[0]].shape == self.get_folded_output_shape() - ), """Output shape is not as expected""" - # reshape output to have expected shape - oshape = self.get_normal_output_shape() - context[node.output[0]] = context[node.output[0]].reshape(*oshape) + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -994,16 +1011,12 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] - pass - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - else: + if mem_mode not in ["const", "decoupled", "external"]: raise Exception( """Please set mem_mode to "const", "decoupled", or "external", currently no other parameter value is supported!""" ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -1015,7 +1028,7 @@ def defines(self, var): MW = self.get_nodeattr("MW") condition = SIMD >= (MW / 1024) msg = ( - f"HLS synthesis of StreamingFCLayer_Batch requires: " + f"HLS synthesis of MatrixVectorActivation requires: " f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " f"and MW={MW} for node: {self.onnx_node.name}." 
) @@ -1107,11 +1120,9 @@ def docompute(self): else: threshs = "threshs" if mem_mode == "const": - node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<MW1, MH1, SIMD1, PE1, MMV1, {}, {}, {}> + """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, MMV1, {}, {}, {}> (in0, out, weights, {}, numReps, {});""".format( - node.op_type, tmpl_args["TSrcI"], tmpl_args["TDstI"], tmpl_args["TWeightI"], @@ -1210,8 +1221,12 @@ def blackboxfunction(self): def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) in_fifo_depth = self.get_nodeattr("inFIFODepth") out_fifo_depth = self.get_nodeattr("outFIFODepth") # insert depth pragmas only if specified @@ -1239,7 +1254,8 @@ def pragmas(self): ) elif mem_mode == "decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights" + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS stream depth=8 variable=weights" @@ -1302,6 +1318,7 @@ def code_generation_ipi(self): runtime_writable == 1 ), "Layer with URAM weights must have runtime_writeable_weights=1" node_name = self.onnx_node.name + sname = self.hls_sname() # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] @@ -1355,8 +1372,8 @@ def code_generation_ipi(self): ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " - "[get_bd_intf_pins %s/%s/weights_V_V]" - % (node_name, strm_inst, node_name, node_name) + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" @@ -1404,15 +1421,16 @@ def code_generation_ipi(self): # base class impl sufficient for const/external modes return super().code_generation_ipi() else: - raise Exception("Unrecognized mem_mode for StreamingFCLayer") + raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append( - ("weights_V_V", self.get_weightstream_width_padded()) + ("weights_" + sname, self.get_weightstream_width_padded()) ) if mem_mode == "decoupled": # only expose axilite interface if attribute is set diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index ba8a446f2c..43373eab17 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -28,8 +28,8 @@ import numpy as np import os +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -198,7 +198,8 @@ def verify_node(self): return info_messages def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + 
self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "maxpool.h"'] self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"'] def defines(self, var): @@ -322,8 +323,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -334,7 +339,6 @@ def execute_node(self, context, graph): exp_ishape = self.get_normal_input_shape() folded_ishape = self.get_folded_input_shape() exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() # TODO ensure codegen dir exists if mode == "cppsim": @@ -368,9 +372,8 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py index cf065cf156..2ae6d92b88 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py +++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py @@ -26,7 +26,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.base import CustomOp +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.base import CustomOp + +from finn.core.onnx_exec import execute_onnx # TODO move StreamingDataflowPartition to HLSCustomOp base class @@ -48,6 +51,7 @@ def get_nodeattr_types(self): "device_id": ("i", False, 0), "mem_port": ("s", False, ""), "instance_name": ("s", False, ""), + "return_full_exec_context": ("i", False, 0), } def make_shape_compatible_op(self, model): @@ -57,8 +61,26 @@ def infer_node_datatype(self, model): pass def execute_node(self, context, graph): - # TODO add RPC execution with synthesized bitfile? 
- # whole-design rtlsim with PyVerilator may also be an alternative + model = ModelWrapper(self.get_nodeattr("model")) + return_full_exec_context = self.get_nodeattr("return_full_exec_context") == 1 + node = self.onnx_node + inp_ctx = dict(filter(lambda x: x[0] in node.input, context.items())) + # inputs may have been renamed in partition + for i, old_iname in enumerate(node.input): + new_iname = model.graph.input[i].name + if old_iname != new_iname: + inp_ctx[new_iname] = inp_ctx[old_iname] + del inp_ctx[old_iname] + ret = execute_onnx(model, inp_ctx, return_full_exec_context) + # outputs may have been renamed in partition + for i, node_oname in enumerate(node.output): + model_oname = model.graph.output[i].name + context[node_oname] = ret[model_oname] + # prefix and insert exec context entries + if return_full_exec_context: + for tname in ret.keys(): + if tname not in [x.name for x in model.graph.output]: + context[node.name + "_" + tname] = ret[tname] pass def verify_node(self): diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 1791706afa..1e6b72e4d5 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -30,8 +30,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -76,24 +76,30 @@ def get_normal_output_shape(self): oshape = self.get_nodeattr("shape") return oshape + def check_divisible_iowidths(self): + impl_style = self.get_nodeattr("impl_style") + if impl_style == "hls": + # when using impl_style = hls must have the following + # if inWidth > outWidth: inWidth % outWidth = 0 + # if inWidth < outWidth: outWidth % inWidth = 0 + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + if iwidth > owidth: + assert ( + iwidth % owidth == 0 + ), """DWC InWidth is bigger than OutWidth and is not divisible by it. + Please adjust PE and SIMD values so that InWidth % OutWidth = 0 + or alternatively use impl_style = vivado""" + else: + assert ( + owidth % iwidth == 0 + ), """DWC OutWidth is bigger than InWidth and is not divisible by it. + Please adjust PE and SIMD values so that OutWidth % InWidth = 0 + or alternatively use impl_style = vivado""" + def get_folded_input_shape(self): - # for correct functionality of the dwc node the - # following must apply: - # if inWidth > outWidth: inWidth % outWidth = 0 - # if inWidth < outWidth: outWidth % inWidth = 0 + self.check_divisible_iowidths() iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - if iwidth > owidth: - assert ( - iwidth % owidth == 0 - ), """InWidth is bigger than OutWidth and is not divisible by it. - Please adjust PE and SIMD values so that InWidth % OutWidth = 0""" - else: - assert ( - owidth % iwidth == 0 - ), """OutWidth is bigger than InWidth and is not divisible by it. 
- Please adjust PE and SIMD values so that OutWidth % InWidth = 0""" - ishape = self.get_normal_input_shape() dummy_t = np.random.randn(*ishape) ibits = self.get_input_datatype().bitwidth() @@ -112,23 +118,8 @@ def get_folded_input_shape(self): return dummy_t.shape def get_folded_output_shape(self): - # for correct functionality of the dwc node the - # following must apply: - # if inWidth > outWidth: inWidth % outWidth = 0 - # if inWidth < outWidth: outWidth % inWidth = 0 - iwidth = self.get_nodeattr("inWidth") + self.check_divisible_iowidths() owidth = self.get_nodeattr("outWidth") - if iwidth > owidth: - assert ( - iwidth % owidth == 0 - ), """InWidth is bigger than OutWidth and is not divisible by it. - Please adjust PE and SIMD values so that InWidth % OutWidth = 0""" - else: - assert ( - owidth % iwidth == 0 - ), """OutWidth is bigger than InWidth and is not divisible by it. - Please adjust PE and SIMD values so that OutWidth % InWidth = 0""" - oshape = self.get_normal_output_shape() dummy_t = np.random.randn(*oshape) obits = self.get_output_datatype().bitwidth() @@ -287,22 +278,29 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + impl_style = self.get_nodeattr("impl_style") node = self.onnx_node exp_shape = self.get_normal_input_shape() folded_ishape = self.get_folded_input_shape() # TODO ensure codegen dir exists if mode == "cppsim": + assert impl_style == "hls", "DWC cppsim only possible when impl_style==hls" code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") elif mode == "rtlsim": + assert impl_style == "hls", "DWC rtlsim only possible when impl_style==hls" code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") else: raise Exception( diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 91f6ed5b8d..a7c3cd0be5 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -30,10 +30,11 @@ import os import subprocess import warnings +from qonnx.core.datatype import DataType from shutil import copy -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_finn_root from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from . 
import templates @@ -110,7 +111,7 @@ def code_generation_ipgen(self, model, fpgapart, clk): ) os.makedirs(verilog_dir) # copy Q_srl.v from finn-rtllib to verilog directory - memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/" + memstream_dir = get_finn_root() + "/finn-rtllib/memstream/hdl/" Q_file = os.path.join(memstream_dir, "Q_srl.v") copy(Q_file, verilog_dir) @@ -128,6 +129,7 @@ def code_generation_ipgen(self, model, fpgapart, clk): self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)] self.code_gen_dict["$WIDTH$"] = [str(in_width)] self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))] + self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] template = self.strm_fifo_wrapper @@ -152,6 +154,7 @@ def ipgen_singlenode_code(self): # note: setting the root dir as absolute can cause path problems # the ipgen script will be invoked from the sources dir so root_dir=. is OK self.code_gen_dict["$VERILOG_DIR$"] = ["."] + self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] for key in self.code_gen_dict: # transform list into long string separated by '\n' code_gen_line = "\n".join(self.code_gen_dict[key]) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py old mode 100644 new mode 100755 index 1e66a5c204..882b40a0aa --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -29,12 +29,15 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.custom_op.general.im2col import compute_conv_output_dim from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +# TODO: consider splitting this into separate implementations for 1D and 2D +# similar to what we do for ConvolutionInputGenerator + class StreamingMaxPool_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" @@ -44,6 +47,10 @@ def get_nodeattr_types(self): "ImgDim": ("ints", True, []), # [H, W] = [Y, X] "PoolDim": ("ints", True, []), # [H, W] = [Y, X] "NumChannels": ("i", True, 0), + # parallelism control - only supported for 1D maxpool + "PE": ("i", False, 0), + # round up (instead of down) output size - only supported for 1D maxpool + "CeilMode": ("i", False, 0), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), } @@ -82,24 +89,30 @@ def get_normal_input_shape(self): return ishape def get_folded_input_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_input_shape()) - ret.insert(-1, 1) - return tuple(ret) + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + if self.is_1d(): + folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) + else: + folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) + return folded_ishape def get_normal_output_shape(self): ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") k_h, k_w = tuple(self.get_nodeattr("PoolDim")) ifm_ch = self.get_nodeattr("NumChannels") - stride_h = k_h - stride_w = k_w - pad = 0 - assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % 
PoolDim_h == 0" - assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad) + ceil_mode = self.get_nodeattr("CeilMode") + if not self.is_1d(): + assert ( + ifm_dim_h % k_h == 0 + ), "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" + assert ( + ifm_dim_w % k_w == 0 + ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" + ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) return oshape @@ -107,8 +120,15 @@ def get_folded_output_shape(self): # even though there is no folding in the current hlslib op, # insert a time multiplexing axis to remain compatible with the # shapes produced by the rest of the dataflow pipeline + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) ret = list(self.get_normal_output_shape()) - ret.insert(-1, 1) + if self.is_1d(): + ret[-1] = nf + ret.append(pe) + else: + ret.insert(-1, 1) return tuple(ret) def get_number_output_values(self): @@ -118,20 +138,35 @@ def get_number_output_values(self): def get_exp_cycles(self): # derived from StreamingMaxPool_Batch loop nest ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + + warnings.warn( + """Estimated latency for layer {} can be lower than + actual latency!""".format( + self.onnx_node.name + ) + ) if self.is_1d(): - return int(ifm_dim[1] + k[1]) + _, _, _, nf, _ = self.get_folded_output_shape() + ceil_mode = self.get_nodeattr("CeilMode") + ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + exp_cycles = ofm_dim * nf * (k[1] + 1) + return int(exp_cycles) else: # TODO: adjust inaccurate formula - return int(ifm_dim[1] * (ifm_dim[1] + (ifm_dim[1] / k[1]))) + return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") ifm_ch = self.get_nodeattr("NumChannels") - in_width = int(dt_bits * ifm_ch) + if self.is_1d(): + in_width = int(dt_bits * pe) + else: + in_width = int(dt_bits * ifm_ch) return in_width def get_outstream_width(self): - """For streaming maxpool out stream with is the same as in stream width""" + """For streaming maxpool out stream width is the same as in stream width""" return self.get_instream_width() def make_shape_compatible_op(self, model): @@ -176,18 +211,34 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] def defines(self, var): - numReps = 2 + numReps = 1 ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + ceil_mode = self.get_nodeattr("CeilMode") + output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - numReps, - ) - ] + if self.is_1d(): + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define PE {}\n #define OutputSize {} + \n #define numReps {}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + output_size, + numReps, + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define numReps 
{}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + numReps, + ) + ] def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -222,22 +273,27 @@ def docompute(self): if self.is_1d(): raise Exception("Binary 1d MaxPool not implemented on HLS backend") else: - op = "StreamingMaxPool_Batch" + op = "StreamingMaxPool" self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out, numReps);" % (op) + "%s(in0, out);" % (op) ] else: - if self.is_1d(): - op = "StreamingMaxPool_Precision_Batch_1d" - else: - op = "StreamingMaxPool_Precision_Batch" dtype = self.get_input_datatype() dtype_hls = dtype.get_hls_datatype_str() minval_str = str(int(dtype.min())) - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out, numReps);" - % (op, dtype_hls, minval_str) - ] + if self.is_1d(): + op = "StreamingMaxPool_Precision_1d" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """%s(in0, out);""" + % (op, dtype_hls, minval_str) + ] + else: + op = "StreamingMaxPool_Precision" + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0, out);" + % (op, dtype_hls, minval_str) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -278,8 +334,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -289,7 +349,7 @@ def execute_node(self, context, graph): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() + folded_ishape = self.get_folded_input_shape() # TODO ensure codegen dir exists if mode == "cppsim": @@ -316,9 +376,8 @@ def execute_node(self, context, graph): export_idt = DataType["BINARY"] else: export_idt = self.get_input_datatype() - # no reshaping for input since assuming no folding on input - # make copy before saving array - reshaped_input = inp.copy() + + reshaped_input = inp.reshape(folded_ishape) np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) if mode == "cppsim": @@ -327,10 +386,9 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim \ - did not produce expected ofolded utput shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -367,4 +425,4 @@ def execute_node(self, context, graph): assert ( context[node.output[0]].shape == exp_oshape ), """Output - shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch).""" + shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e253348598..e73fa9bb28 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -86,23 +86,21 @@ set config_hwsrcdir "$HWSRCDIR$" puts "HW source dir: $config_hwsrcdir" set config_proj_part "$FPGAPART$" 
- -set config_bnnlibdir "$FINNHLSLIBDIR$" -set config_customhlsdir "$FINNHLSCUSTOMDIR$" - +set config_bnnlibdir "$::env(FINN_ROOT)/deps/finn-hlslib" +puts "finn-hlslib dir: $config_bnnlibdir" +set config_customhlsdir "$::env(FINN_ROOT)/custom_hls" +puts "custom HLS dir: $config_customhlsdir" set config_toplevelfxn "$TOPFXN$" set config_clkperiod $CLKPERIOD$ open_project $config_proj_name -add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++0x -I$config_bnnlibdir -I$config_customhlsdir" +add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir" set_top $config_toplevelfxn open_solution sol1 set_part $config_proj_part -config_compile -ignore_long_run_time -disable_unroll_code_size_check -config_interface -m_axi_addr64 -config_rtl -auto_prefix +$DEFAULT_DIRECTIVES$ $EXTRA_DIRECTIVES$ create_clock -period $config_clkperiod -name default @@ -116,22 +114,22 @@ module $TOPNAME$( ap_clk, ap_rst_n, -in0_V_V_TDATA, -in0_V_V_TVALID, -in0_V_V_TREADY, -out_V_V_TDATA, -out_V_V_TVALID, -out_V_V_TREADY +in0_$HLS_SNAME$_TDATA, +in0_$HLS_SNAME$_TVALID, +in0_$HLS_SNAME$_TREADY, +out_$HLS_SNAME$_TDATA, +out_$HLS_SNAME$_TVALID, +out_$HLS_SNAME$_TREADY ); input ap_clk; input ap_rst_n; -input $IN_RANGE$ in0_V_V_TDATA; -input in0_V_V_TVALID; -output in0_V_V_TREADY; -output $OUT_RANGE$ out_V_V_TDATA; -output out_V_V_TVALID; -input out_V_V_TREADY; +input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; +input in0_$HLS_SNAME$_TVALID; +output in0_$HLS_SNAME$_TREADY; +output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; +output out_$HLS_SNAME$_TVALID; +input out_$HLS_SNAME$_TREADY; reg [31:0] config_address = 0; reg config_ce = 0; @@ -198,15 +196,15 @@ ( .ap_clk(ap_clk), //input .ap_rst_n(ap_rst_n), //input -.in0_V_V_TDATA(in0_V_V_TDATA), //$IN_RANGE$ input -.in0_V_V_TVALID(in0_V_V_TVALID), //input -.in0_V_V_TREADY(in0_V_V_TREADY), //output -.weights_V_V_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input -.weights_V_V_TVALID(m_axis_0_tvalid), //input -.weights_V_V_TREADY(m_axis_0_tready), //output -.out_V_V_TDATA(out_V_V_TDATA), //$OUT_RANGE$ output -.out_V_V_TVALID(out_V_V_TVALID), //output -.out_V_V_TREADY(out_V_V_TREADY) //input +.in0_$HLS_SNAME$_TDATA(in0_$HLS_SNAME$_TDATA), //$IN_RANGE$ input +.in0_$HLS_SNAME$_TVALID(in0_$HLS_SNAME$_TVALID), //input +.in0_$HLS_SNAME$_TREADY(in0_$HLS_SNAME$_TREADY), //output +.weights_$HLS_SNAME$_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input +.weights_$HLS_SNAME$_TVALID(m_axis_0_tvalid), //input +.weights_$HLS_SNAME$_TREADY(m_axis_0_tready), //output +.out_$HLS_SNAME$_TDATA(out_$HLS_SNAME$_TDATA), //$OUT_RANGE$ output +.out_$HLS_SNAME$_TVALID(out_$HLS_SNAME$_TVALID), //output +.out_$HLS_SNAME$_TREADY(out_$HLS_SNAME$_TREADY) //input ); endmodule @@ -248,6 +246,8 @@ kintex7l Production \ kintexu Production \ kintexuplus Production \ + versal Production \ + versalprime Production \ virtex7 Production \ virtexu Production \ virtexuplus Production \ @@ -301,10 +301,10 @@ ## Infer interfaces ipx::infer_bus_interface ap_clk xilinx.com:signal:clock_rtl:1.0 [ipx::current_core] ipx::infer_bus_interface ap_rst_n xilinx.com:signal:reset_rtl:1.0 [ipx::current_core] -ipx::infer_bus_interface {in0_V_V_TDATA in0_V_V_TVALID in0_V_V_TREADY} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] -ipx::infer_bus_interface {out_V_V_TREADY out_V_V_TDATA out_V_V_TVALID} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] -ipx::associate_bus_interfaces -busif in0_V_V -clock ap_clk [ipx::current_core] -ipx::associate_bus_interfaces -busif out_V_V -clock ap_clk 
[ipx::current_core] +ipx::infer_bus_interface {in0_$HLS_SNAME$_TDATA in0_$HLS_SNAME$_TVALID in0_$HLS_SNAME$_TREADY} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] +ipx::infer_bus_interface {out_$HLS_SNAME$_TREADY out_$HLS_SNAME$_TDATA out_$HLS_SNAME$_TVALID} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core] +ipx::associate_bus_interfaces -busif in0_$HLS_SNAME$ -clock ap_clk [ipx::current_core] +ipx::associate_bus_interfaces -busif out_$HLS_SNAME$ -clock ap_clk [ipx::current_core] ## Finalize set_property core_revision 2 [ipx::current_core] @@ -319,23 +319,23 @@ ap_clk, ap_rst_n, count, -in0_V_V_TDATA, -in0_V_V_TVALID, -in0_V_V_TREADY, -out_V_V_TDATA, -out_V_V_TVALID, -out_V_V_TREADY +in0_$HLS_SNAME$_TDATA, +in0_$HLS_SNAME$_TVALID, +in0_$HLS_SNAME$_TREADY, +out_$HLS_SNAME$_TDATA, +out_$HLS_SNAME$_TVALID, +out_$HLS_SNAME$_TREADY ); input ap_clk; input ap_rst_n; output $COUNT_RANGE$ count; -input $IN_RANGE$ in0_V_V_TDATA; -input in0_V_V_TVALID; -output in0_V_V_TREADY; -output $OUT_RANGE$ out_V_V_TDATA; -output out_V_V_TVALID; -input out_V_V_TREADY; +input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; +input in0_$HLS_SNAME$_TVALID; +output in0_$HLS_SNAME$_TREADY; +output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; +output out_$HLS_SNAME$_TVALID; +input out_$HLS_SNAME$_TREADY; Q_srl #( .depth($DEPTH$), @@ -346,12 +346,12 @@ .clock(ap_clk), .reset(!ap_rst_n), .count(count), - .i_d(in0_V_V_TDATA), - .i_v(in0_V_V_TVALID), - .i_r(in0_V_V_TREADY), - .o_d(out_V_V_TDATA), - .o_v(out_V_V_TVALID), - .o_r(out_V_V_TREADY) + .i_d(in0_$HLS_SNAME$_TDATA), + .i_v(in0_$HLS_SNAME$_TVALID), + .i_r(in0_$HLS_SNAME$_TREADY), + .o_d(out_$HLS_SNAME$_TDATA), + .o_v(out_$HLS_SNAME$_TVALID), + .o_r(out_$HLS_SNAME$_TREADY) ); endmodule diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 610139f44e..5383cc1f4b 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -31,13 +31,13 @@ import textwrap import warnings from math import ceil, log2 - -from finn.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import ( +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -389,7 +389,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): tdt_hls, odt_hls, self.get_nodeattr("ActVal"), - "comp::less_equal<%s>" % tdt_hls, + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), ) ) f_thresh.write(thresholds_hls_code) @@ -465,9 +465,26 @@ def generate_params(self, model, path): weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) # also save weights as Verilog .dat file - weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir) + # note that we provide two different .dat files, one for synth + and one for simulation. 
this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim + ) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_thresholds = np.zeros_like(thresholds, dtype=np.float32) + else: + synth_thresholds = thresholds self.make_weight_file( - thresholds, "decoupled_verilog_dat", weight_filename_rtl + synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth ) else: raise Exception("Unrecognized mem_mode") @@ -528,12 +545,10 @@ def execute_node(self, context, graph): out = context[node.output[0]] out = 2 * out - 1 context[node.output[0]] = out + oshape = self.get_normal_output_shape() assert ( - context[node.output[0]].shape == self.get_folded_output_shape() + context[node.output[0]].shape == oshape ), """Output shape is not as expected""" - # reshape output to have expected shape - oshape = self.get_normal_output_shape() - context[node.output[0]] = context[node.output[0]].reshape(*oshape) elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -589,7 +604,7 @@ def global_includes(self): # TODO check and add whatever missing def defines(self, var): numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = numInputVectors[0] + numReps = int(np.prod(numInputVectors)) self.code_gen_dict["$DEFINES$"] = [ """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format( self.get_nodeattr("NumChannels"), @@ -660,34 +675,28 @@ def docompute(self): # TODO: why put some template parameters into defines and not others? # should ImgDim be defined or just filled in here like we do now? 
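(In the docompute rewrite below, the separate ImgDimH/ImgDimW template parameters are collapsed into a single repetition count derived from numInputVectors. A quick sketch of the equivalence, with hypothetical attribute values:

    import numpy as np

    num_input_vectors = [1, 7, 7]  # hypothetical numInputVectors attribute
    total_spatial_size = int(np.prod(num_input_vectors))  # 49
    # the generated HLS call now processes 49 pixel vectors per inference,
    # matching the old ImgDimH (7) x ImgDimW (7) nested iteration
)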
node = self.onnx_node - ishape = self.get_folded_input_shape() - if len(ishape) == 3: - imgdimh = 1 - imgdimw = 1 - elif len(ishape) == 5: - imgdimh = ishape[1] - imgdimw = ishape[2] - else: - raise Exception("""Unexpected input shape""") + inp_vecs = self.get_nodeattr("numInputVectors") + total_spatial_size = int(np.prod(inp_vecs)) mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, NumChannels1, PE1, {}, {}> + """{}<{}, NumChannels1, PE1, {}, {}> (in0, out, threshs, numReps);""".format( node.op_type, - imgdimh, - imgdimw, + total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) ] elif mem_mode == "decoupled": + # note that numReps is set to 1 in the invocation below, since + # - for cppsim the repetition comes from the threshold stream reader+input + # - for synth the unit runs continuously anyway (ap_ctrl_none) self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> - (in0, out, weights, numReps);""".format( + """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> + (in0, out, weights, 1);""".format( "Thresholding_Stream_Batch", - imgdimh, - imgdimw, + total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) @@ -753,8 +762,12 @@ def blackboxfunction(self): raise Exception("Unrecognized mem_mode") def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -805,7 +818,8 @@ def pragmas(self): ) elif self.get_nodeattr("mem_mode") == "decoupled": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights" + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() ) def code_generation_ipi(self): @@ -815,6 +829,7 @@ def code_generation_ipi(self): if mem_mode == "decoupled": node_name = self.onnx_node.name runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + sname = self.hls_sname() # create a hierarchy for this layer, with the same port names clk_name = self.get_verilog_top_module_intf_names()["clk"][0] rst_name = self.get_verilog_top_module_intf_names()["rst"][0] @@ -868,8 +883,8 @@ def code_generation_ipi(self): ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " - "[get_bd_intf_pins %s/%s/weights_V_V]" - % (node_name, strm_inst, node_name, node_name) + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" @@ -940,3 +955,8 @@ def get_op_and_param_counts(self): thres_count = out_features * num_steps ret_dict[thres_param_type] = thres_count return ret_dict + + def ipgen_extra_directives(self): + "Return a list of extra tcl directives for HLS synthesis." 
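+        # "config_compile -pipeline_style frp" selects the free-running
+        # pipeline style in Vitis HLS (the default is the stalling style,
+        # stp); presumably chosen here to keep the streaming thresholding
+        # core advancing without per-stage stall handshakes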
+ + return ["config_compile -pipeline_style frp"] diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 70edaee9cf..7386aa7e63 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -198,8 +198,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) dyn_iters = self.get_nodeattr("DynIters") if dyn_iters == 1: @@ -244,12 +248,9 @@ def strm_decl(self): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() stream_width = self.get_nodeattr("StreamWidth") - if self.get_nodeattr("Direction") == "in": - intf_names["s_axis"] = [("in0", stream_width)] - intf_names["m_axis"] = [("out_V_V", stream_width)] - else: - intf_names["s_axis"] = [("in0_V_V", stream_width)] - intf_names["m_axis"] = [("out_r", stream_width)] + sname = self.hls_sname() + intf_names["s_axis"] = [("in0_" + sname, stream_width)] + intf_names["m_axis"] = [("out_" + sname, stream_width)] if self.get_nodeattr("DynIters") == 1: intf_names["axilite"] = ["s_axi_control"] return intf_names diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index 7114cd83ed..b62e4f2f67 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -29,8 +29,8 @@ import numpy as np import os import warnings +from qonnx.core.datatype import DataType -from finn.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -231,8 +231,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py similarity index 96% rename from src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py rename to src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index f50c5d1ef6..27b23dd328 100644 --- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -30,14 +30,14 @@ import numpy as np import os import warnings - -from finn.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import ( +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( 
npy_to_rtlsim_input, numpy_to_hls_code, @@ -45,7 +45,7 @@ ) -class Vector_Vector_Activate_Batch(HLSCustomOp): +class VectorVectorActivation(HLSCustomOp): """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" def __init__(self, onnx_node): @@ -379,7 +379,7 @@ def generate_params(self, model, path): tdt_hls, odt_hls, self.get_nodeattr("ActVal"), - "comp::less_equal<%s>" % tdt_hls, + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), ) ) f_thresh.write(thresholds_hls_code) @@ -422,9 +422,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception( - "Unexpected input found for Vector_Vector_Activate_Unit" - ) + raise Exception("Unexpected input found for VectorVectorActivation") in_ind += 1 if mode == "cppsim": @@ -433,11 +431,8 @@ # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == self.get_folded_output_shape() - ), """Output shape is not as expected""" - # reshape output to have expected shape - oshape = self.get_normal_output_shape() - context[node.output[0]] = context[node.output[0]].reshape(*oshape) + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() @@ -526,11 +521,9 @@ def docompute(self): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}> + """Vector_Vector_Activate_Batch<Channels1, Kernel1, SIMD1, PE1, 1, {}, {}, {}> (in0, out, weights, {}, numReps, {});""".format( - node.op_type, tmpl_args["TSrcI"], tmpl_args["TDstI"], tmpl_args["TWeightI"], @@ -579,8 +572,12 @@ def blackboxfunction(self): ] def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) in_fifo_depth = self.get_nodeattr("inFIFODepth") out_fifo_depth = self.get_nodeattr("outFIFODepth") # insert depth pragmas only if specified diff --git a/src/finn/qnn-data/build_dataflow/folding_config.json b/src/finn/qnn-data/build_dataflow/folding_config.json index 1fbe289608..95167f1a30 100644 --- a/src/finn/qnn-data/build_dataflow/folding_config.json +++ b/src/finn/qnn-data/build_dataflow/folding_config.json @@ -4,22 +4,22 @@ "PE": 49, "ram_style": "distributed" }, - "StreamingFCLayer_Batch_0": { + "MatrixVectorActivation_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "StreamingFCLayer_Batch_1": { + "MatrixVectorActivation_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "StreamingFCLayer_Batch_2": { + "MatrixVectorActivation_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "StreamingFCLayer_Batch_3": { + "MatrixVectorActivation_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" diff --git a/src/finn/qnn-data/mdd-data/finn_design.mdd b/src/finn/qnn-data/mdd-data/finn_design.mdd new file mode 100644 index 0000000000..0be2da6e8f --- /dev/null +++ b/src/finn/qnn-data/mdd-data/finn_design.mdd @@ -0,0 +1,36 @@ +# Copyright (c) 2022 Xilinx, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +OPTION psf_version = 2.1; + +BEGIN driver finn_design + OPTION supported_peripherals = (finn_design); + OPTION driver_state = ACTIVE; + OPTION VERSION = 1.0; + OPTION NAME = finn_design; +END driver diff --git a/src/finn/qnn-data/mdd-data/finn_design.tcl b/src/finn/qnn-data/mdd-data/finn_design.tcl new file mode 100644 index 0000000000..d4915d468d --- /dev/null +++ b/src/finn/qnn-data/mdd-data/finn_design.tcl @@ -0,0 +1,60 @@ +# Copyright (c) 2022 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# @brief Address range defines for FINN IP. +# @author Thomas B. 
Preußer +## + +proc generate {drv_handle} { + # Bounds of all exposed slave address ranges to xparameters.h + set file_handle [hsi::utils::open_include_file "xparameters.h"] + foreach drv [hsi::get_drivers -filter "NAME==[common::get_property NAME $drv_handle]"] { + generate_memrange_parameters $drv $file_handle + } + close $file_handle +} + +proc generate_memrange_parameters {drv_handle file_handle} { + # Collect unique slave interfaces to custom module + array unset ranges + foreach mem_range [hsi::get_mem_ranges -of_object [hsi::get_cells -hier [hsi::get_sw_processor]] $drv_handle] { + set ranges([common::get_property SLAVE_INTERFACE $mem_range]) [list \ + [common::get_property BASE_NAME $mem_range] \ + [common::get_property BASE_VALUE $mem_range] \ + [common::get_property HIGH_NAME $mem_range] \ + [common::get_property HIGH_VALUE $mem_range] \ + ] + } + + # Produce defines for the address range bounds + set prefix "XPAR_[string toupper $drv_handle]" + foreach {key val} [array get ranges] { + puts $file_handle "#define [format "%s_%s_%s" $prefix $key [lindex $val 0]] [lindex $val 1]" + puts $file_handle "#define [format "%s_%s_%s" $prefix $key [lindex $val 2]] [lindex $val 3]" + } + puts $file_handle "" +} diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index b6dd835080..2096760580 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -31,9 +31,9 @@ import time from pynq import Overlay, allocate from pynq.ps import Clocks +from qonnx.core.datatype import DataType +from qonnx.util.basic import gen_finn_dt_tensor -from finn.core.datatype import DataType -from finn.util.basic import gen_finn_dt_tensor from finn.util.data_packing import ( finnpy_to_packed_bytearray, packed_bytearray_to_finnpy, @@ -439,13 +439,13 @@ def throughput_test(self): total_in = 0 for i in range(self.num_inputs): total_in += np.prod(self.ishape_packed(i)) - res["DRAM_in_bandwidth[Mb/s]"] = total_in * 0.000001 / runtime + res["DRAM_in_bandwidth[MB/s]"] = total_in * 0.000001 / runtime total_out = 0 for o in range(self.num_outputs): total_out += np.prod(self.oshape_packed(o)) - res["DRAM_out_bandwidth[Mb/s]"] = total_out * 0.000001 / runtime + res["DRAM_out_bandwidth[MB/s]"] = total_out * 0.000001 / runtime for iwdma, iwbuf, iwdma_name in self.external_weights: - res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = ( + res["DRAM_extw_%s_bandwidth[MB/s]" % iwdma_name] = ( self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime ) if self.platform == "zynq-iodma": diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json index 299a8be815..442ea72d9a 100644 --- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json +++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json @@ -4,22 +4,22 @@ "PE": 49, "ram_style": "distributed" }, - "StreamingFCLayer_Batch_0": { + "MatrixVectorActivation_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "StreamingFCLayer_Batch_1": { + "MatrixVectorActivation_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "StreamingFCLayer_Batch_2": { + "MatrixVectorActivation_2": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "StreamingFCLayer_Batch_3": { + "MatrixVectorActivation_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" diff --git a/src/finn/transformation/__init__.py b/src/finn/transformation/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/src/finn/transformation/fpgadataflow/__init__.py b/src/finn/transformation/fpgadataflow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py index 5ab491dd10..7befad7aa7 100644 --- a/src/finn/transformation/fpgadataflow/annotate_cycles.py +++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py @@ -26,10 +26,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation +import qonnx.custom_op.registry as registry +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + from finn.transformation.move_reshape import _is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index d9089cbeba..0cc4234c8c 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -26,13 +26,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry +import qonnx.custom_op.registry as registry +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.move_reshape import _is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py index f59f4bdeab..1d0efaf4bb 100644 --- a/src/finn/transformation/fpgadataflow/cleanup.py +++ b/src/finn/transformation/fpgadataflow/cleanup.py @@ -27,10 +27,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import qonnx.custom_op.registry as registry import shutil +from qonnx.transformation.base import Transformation -import finn.custom_op.registry as registry -from finn.transformation.base import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index 5f7c534b45..da337caa62 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -26,8 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import finn.custom_op.registry as registry -from finn.transformation.base import NodeLocalTransformation +import qonnx.custom_op.registry as registry +from qonnx.transformation.base import NodeLocalTransformation + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 113ccb93b8..f0bd5fbd06 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -28,21 +28,21 @@ import numpy as np +import qonnx.core.data_layout as DataLayout import warnings from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import SortGraph +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name +from qonnx.util.onnx import nchw_to_nhwc -import finn.core.data_layout as DataLayout -from finn.core.datatype import DataType -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, ) -from finn.transformation.general import SortGraph -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import get_by_name -from finn.util.onnx import nchw_to_nhwc class InferConvInpGen(Transformation): @@ -197,15 +197,15 @@ def apply(self, model): depthwise=depthwise, name="ConvolutionInputGenerator_" + n.name, ) - else: # non-square images and/or kernels + else: # 1D images and/or kernels assert is_1d_convolution, ( "%s: ConvolutionInputGenerator1D works only for 1D convs" % n.name ) if dilation_h > 1 or dilation_w > 1: - assert stride_h == 1 and stride_w == 1, ( - """%s: Stride value of greater than 1 is not supported for convolutions - with dilation value greater than 1""" + assert depthwise == 1, ( + """%s: Dilation value > 1 is only supported for + 1D depthwise separable convolutions""" % n.name ) ConvInpGen_node = helper.make_node( @@ -339,20 +339,27 @@ def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type == "MaxPoolNHWC": - mp_input = n.input[0] - mp_output = n.output[0] + if node.op_type == "MaxPoolNHWC": + mp_input = node.input[0] + mp_output = node.output[0] mp_in_shape = model.get_tensor_shape(mp_input) # mp_out_shape = model.get_tensor_shape(mp_output) dt = model.get_tensor_datatype(mp_input) - mp_inst = getCustomOp(n) + mp_inst = getCustomOp(node) k_h, k_w = mp_inst.get_nodeattr("kernel_shape") ifm_ch = mp_in_shape[-1] ifm_dim_h = mp_in_shape[1] ifm_dim_w = mp_in_shape[2] - if ifm_dim_h % k_h == 0 and ifm_dim_w % k_w == 0: + pe = 1 + ceil_mode = mp_inst.get_nodeattr("ceil_mode") + is_1d = (ifm_dim_h == 1 and k_h == 1) or (ifm_dim_w == 1 and k_w == 1) + is_divisible = (ifm_dim_h % k_h == 0) or (ifm_dim_w % k_w == 0) + is_bipolar = dt == DataType["BIPOLAR"] + pass_1d = is_1d and (not is_bipolar) + pass_2d = (not is_1d) and is_divisible + if pass_1d or pass_2d: # create equivalent StreamingMaxPool_Batch node new_node = helper.make_node( "StreamingMaxPool_Batch", @@ -364,12 +371,16 @@ def apply(self,
model): NumChannels=ifm_ch, ImgDim=(ifm_dim_h, ifm_dim_w), dataType=dt.name, - name="StreamingMaxPool_Batch_" + n.name, + PE=pe, + CeilMode=ceil_mode, + name="StreamingMaxPool_Batch_" + node.name, ) graph.node.insert(node_ind, new_node) # remove old nodes - graph.node.remove(n) + graph.node.remove(node) graph_modified = True + else: + warnings.warn(node.name + ": could not convert to HLS") if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) @@ -385,62 +396,57 @@ def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: - # extract pool parameters + if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: + node_input = node.input[0] + ishape = model.get_tensor_shape(node_input) + node_output = node.output[0] + idt = model.get_tensor_datatype(node_input) + oshape = model.get_tensor_shape(node_output) + # only support 4D input tensors (1D convs need extra dummy dim) + if len(ishape) != 4: + continue - if n.op_type == "MaxPool": - k = get_by_name(n.attribute, "kernel_shape").ints[-1] - stride = get_by_name(n.attribute, "strides").ints[-1] - # assumed datalayout + # extract pool parameters + if node.op_type == "MaxPool": + kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) + sh, sw = list(get_by_name(node.attribute, "strides").ints) dlayout = "NCHW" - elif n.op_type == "QuantAvgPool2d": - inst = getCustomOp(n) - k = inst.get_nodeattr("kernel") - stride = inst.get_nodeattr("stride") + elif node.op_type == "QuantAvgPool2d": + inst = getCustomOp(node) + # QuantAvgPool2d has a single scalar attribute + # for kernel size and stride (implicit square) + kh = kw = inst.get_nodeattr("kernel") + sh = sw = inst.get_nodeattr("stride") dlayout = inst.get_nodeattr("data_layout") - elif n.op_type == "MaxPoolNHWC": - inst = getCustomOp(n) - k_shape = inst.get_nodeattr("kernel_shape") - strides = inst.get_nodeattr("strides") - assert k_shape[0] == k_shape[1] - assert strides[0] == strides[1] - k = k_shape[0] - stride = strides[0] + elif node.op_type == "MaxPoolNHWC": + inst = getCustomOp(node) + kh, kw = inst.get_nodeattr("kernel_shape") + sh, sw = inst.get_nodeattr("strides") dlayout = "NHWC" try: - pad = get_by_name(n.attribute, "pads").ints[-1] + pad = list(get_by_name(node.attribute, "pads").ints) except AttributeError: - pad = 0 - - node_input = n.input[0] - node_output = n.output[0] - idt = model.get_tensor_datatype(node_input) + pad = [0, 0, 0, 0] if not idt.is_integer(): continue - if k < stride: + if (kh < sh) or (kw < sw): + # TODO check/implement swg support continue - elif k == stride: - warnings.warn( - n.name - + """: Inferring Pool_Batch node for k == stride. - This case can be optimized. 
- For example, for MaxPool run InferStreamingMaxPool before - InferPool_Batch """ - ) odt = model.get_tensor_datatype(node_output) if dlayout == "NCHW": - ifm_ch = model.get_tensor_shape(n.input[0])[1] + _, ifm_ch, ifm_h, ifm_w = ishape + _, ofm_ch, ofm_h, ofm_w = oshape + elif dlayout == "NHWC": + _, ifm_h, ifm_w, ifm_ch = ishape + _, ofm_h, ofm_w, ofm_ch = oshape else: - ifm_ch = model.get_tensor_shape(n.input[0])[-1] - ofm_ch = ifm_ch - ifm_dim = model.get_tensor_shape(n.input[0])[-2] - ofm_dim = model.get_tensor_shape(n.output[0])[-2] + raise Exception("Unknown dlayout: " + str(dlayout)) # if data layout NCHW, we need transpose nodes surrounding # the hls layer @@ -449,7 +455,7 @@ def apply(self, model): inp_trans_out = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - (1, ifm_dim, ifm_dim, ifm_ch), # NHWC + (1, ifm_h, ifm_w, ifm_ch), # NHWC ) graph.value_info.append(inp_trans_out) inp_trans_out = inp_trans_out.name @@ -458,7 +464,7 @@ def apply(self, model): pool_output = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - (1, ofm_dim, ofm_dim, ofm_ch), + (1, ofm_h, ofm_w, ofm_ch), ) graph.value_info.append(pool_output) pool_output = pool_output.name @@ -467,7 +473,7 @@ def apply(self, model): im2col_out = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - (1, ofm_dim, ofm_dim, ifm_ch * k * k), + (1, ofm_h, ofm_w, ifm_ch * kh * kw), ) graph.value_info.append(im2col_out) im2col_out = im2col_out.name @@ -485,24 +491,28 @@ def apply(self, model): pool_output = node_output accum_bits = 0 - pool_size_param = k + pool_size_param = 0 # will be overridden if needed pad_value = 0 - if n.op_type in ["MaxPool", "MaxPoolNHWC"]: + if node.op_type in ["MaxPool", "MaxPoolNHWC"]: pool_fxn = "MaxPool" odt = idt pad_value = idt.min() - elif n.op_type == "QuantAvgPool2d": + elif node.op_type == "QuantAvgPool2d": assert odt.is_integer(), """Output data type for QuantAvgPool2d needs to be integer""" - assert pad == 0, "Padding is not supported for QuantAvgPool2d" - inst = getCustomOp(n) + assert all( + x == 0 for x in pad + ), "Padding is not supported for QuantAvgPool2d" + inst = getCustomOp(node) pool_fxn = "QuantAvgPool" pool_size_param = inst.get_shifts() accum_bits = inst.get_accum_size() else: raise Exception( - "pad_value and pool_fxn not configured for {}".format(n.op_type) + "pad_value and pool_fxn not configured for {}".format( + node.op_type + ) ) # format input tensor @@ -510,14 +520,14 @@ def apply(self, model): "Im2Col", [im2col_in], [im2col_out], - domain="finn.custom_op.general", - stride=[stride, stride], - kernel_size=[k, k], - pad_amount=[pad, pad, pad, pad], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[kh, kw], + pad_amount=pad, pad_value=pad_value, depthwise=1, - input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), - name="Im2Col_" + n.name, + input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), + name="Im2Col_" + node.name, ) # Warning PE has to be equal to ifm_ch until Im2Col is replaced by @@ -534,13 +544,13 @@ def apply(self, model): OutputDataType=odt.name, Channels=ifm_ch, PE=ifm_ch, - KernelSize=k, + KernelSize=[kh, kw], Function=pool_fxn, - OutImgDim=ofm_dim, + OutImgDims=[ofm_h, ofm_w], AccumBits=accum_bits, Size=pool_size_param, BatchSize=1, - name="Pool_Batch_" + n.name, + name="Pool_Batch_" + node.name, ) if dlayout == "NCHW": @@ -559,7 +569,7 @@ def apply(self, model): graph.node.insert(node_ind, im2col_node)
graph.node.insert(node_ind + 1, pool_node) # remove old node - graph.node.remove(n) + graph.node.remove(node) graph_modified = True if graph_modified: @@ -568,9 +578,9 @@ def apply(self, model): return (model, graph_modified) -class InferBinaryStreamingFCLayer(Transformation): +class InferBinaryMatrixVectorActivation(Transformation): """Convert XnorPopcountMatMul layers to - StreamingFCLayer_Batch layers. Any immediately following MultiThreshold + MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.""" def __init__(self, mem_mode="const"): @@ -640,9 +650,9 @@ def apply(self, model): actval = odt.min() model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new StreamingFCLayer node + # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -671,9 +681,9 @@ def apply(self, model): odt = model.get_tensor_datatype(mm_output) model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new StreamingFCLayer node + # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -703,9 +713,9 @@ def apply(self, model): return (model, graph_modified) -class InferQuantizedStreamingFCLayer(Transformation): +class InferQuantizedMatrixVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to - StreamingFCLayer_Batch layers. Any immediately following MultiThreshold + MatrixVectorActivation layers. 
Any immediately following MultiThreshold layers will also be absorbed into the MVTU.""" def __init__(self, mem_mode="const"): @@ -783,9 +793,9 @@ def apply(self, model): # remove bias for bipolar, since # binary->bipolar is achieved by reinterpretation actval = 0 - # create and insert new StreamingFCLayer node + # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -802,7 +812,7 @@ def apply(self, model): noActivation=0, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, - name="StreamingFCLayer_Batch_" + n.name, + name="MatrixVectorActivation_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -814,9 +824,9 @@ def apply(self, model): odt = model.get_tensor_datatype(mm_output) model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new StreamingFCLayer node + # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -833,7 +843,7 @@ def apply(self, model): noActivation=1, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, - name="StreamingFCLayer_Batch_" + n.name, + name="MatrixVectorActivation_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -846,9 +856,9 @@ def apply(self, model): return (model, graph_modified) -class InferVVAU(Transformation): +class InferVectorVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to - Vector_Vector_Activate_Batch layers, if the sparsity annotation + VectorVectorActivation layers, if the sparsity annotation of the weight matrix indicates that the MatMul layer belongs to a depthwise convolution. 
Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" @@ -898,7 +908,7 @@ def apply(self, model): W = W.transpose(0, 3, 1, 2) # now we can extract the values using a for loop over the channels # and fill a zero numpy array in the correct shape - w_tensor = np.zeros((channels, 1, k_h, k_w)) + w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) for ch in range(channels): w_tensor[ch][0] = W[ch][ch] model.set_initializer(mm_weight, w_tensor) @@ -935,9 +945,9 @@ def apply(self, model): ) model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new Vector_Vector_Activate_Batch node + # create and insert new VectorVectorActivation node new_node = helper.make_node( - "Vector_Vector_Activate_Batch", + "VectorVectorActivation", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -952,7 +962,7 @@ def apply(self, model): outputDataType=odt.name, ActVal=actval, noActivation=0, - name="Vector_Vector_Activate_Batch_" + n.name, + name="VectorVectorActivation_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -966,7 +976,7 @@ def apply(self, model): model.set_tensor_shape(mm_output, mm_out_shape) # create and insert new VVAU node new_node = helper.make_node( - "Vector_Vector_Activate_Batch", + "VectorVectorActivation", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -981,7 +991,7 @@ def apply(self, model): outputDataType=odt.name, ActVal=0, noActivation=1, - name="Vector_Vector_Activate_Batch_" + n.name, + name="VectorVectorActivation_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node @@ -1146,7 +1156,7 @@ def apply(self, model): # create node with no parallelization first pe = 1 - # create and insert new StreamingFCLayer node + # create and insert new AddStreams_Batch node new_node = helper.make_node( "AddStreams_Batch", [in0, in1], @@ -1180,8 +1190,9 @@ def apply(self, model): for node in graph.node: node_ind += 1 successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) == 2: + if successors is not None and len(successors) >= 2: output_tensor = node.output[0] + n_outputs = len(successors) dt = model.get_tensor_datatype(output_tensor) @@ -1192,7 +1203,7 @@ def apply(self, model): # create clone tensors out_shape = model.get_tensor_shape(output_tensor) out_tensor_clones = [] - for i in range(2): + for i in range(n_outputs): clone = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape ) @@ -1215,6 +1226,7 @@ def apply(self, model): PE=pe, inputDataType=dt.name, numInputVectors=vecs, + NumOutputStreams=n_outputs, name="DuplicateStreams_Batch_" + node.name, ) @@ -1247,7 +1259,7 @@ class InferChannelwiseLinearLayer(Transformation): def get_smallest_possible(self, vals): """Returns smallest (fewest bits) possible DataType that can represent value. 
Prefers unsigned integers where possible.""" - vals = np.array(vals) + vals = np.array(vals, dtype=np.float64) for v in vals: assert int(v) == v, "Error float value" @@ -1430,7 +1442,7 @@ def apply(self, model): k = model.get_initializer(k_input)[0] - # create and insert new StreamingFCLayer node + # create and insert new LabelSelect_Batch node new_node = helper.make_node( "LabelSelect_Batch", [fc_input], @@ -1523,7 +1535,9 @@ def apply(self, model): model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] ) model.graph.value_info.append(mul_value) - model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2]))) + model.set_initializer( + mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) + ) new_mul = helper.make_node( "Mul", [pool_out, mul_value.name], @@ -1593,3 +1607,60 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferConcatLayer(Transformation): + """Convert suitable Concat nodes (operating on last/-1 axis) + into StreamingConcat HLS layers.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Concat": + ishape = model.get_tensor_shape(node.input[0]) + axis = get_by_name(node.attribute, "axis") + if (axis is None) or (ishape is None): + continue + axis = axis.i + last_axis = len(ishape) - 1 + # skip conversion if not using last axis + if (axis != -1) and (axis != last_axis): + continue + # check datatype coherence + dt0 = model.get_tensor_datatype(node.input[0]) + if dt0 is None: + continue + dt_coherent = all( + [model.get_tensor_datatype(x) == dt0 for x in node.input] + ) + if not dt_coherent: + continue + # skip conversion if inputs are not integers + if not dt0.is_integer(): + continue + # ready for conversion + elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input] + inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) + new_node = helper.make_node( + "StreamingConcat", + node.input, + node.output, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name="Concat_" + node.name, + ElemsPerStream=elems_per_stream, + inputDataType=dt0.name, + numInputVectors=inp_vec, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 9b2577bc2b..07d6961be3 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -26,12 +26,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.create_generic_partitions import PartitionFromLambda +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.create_generic_partitions import PartitionFromLambda +from qonnx.util.basic import get_by_name + from finn.transformation.fpgadataflow.externalize_params import ExternalizeParams -from finn.util.basic import get_by_name, make_build_dir +from finn.util.basic import make_build_dir class CreateDataflowPartition(Transformation): diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 327c7867fe..a3f98b8a58 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -26,18 +26,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pkg_resources as pk + import json import multiprocessing as mp import os import subprocess import warnings +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.util.basic import get_num_default_workers +from shutil import copytree -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) -from finn.util.basic import get_num_default_workers, make_build_dir +from finn.util.basic import make_build_dir from finn.util.fpgadataflow import is_fpgadataflow_node @@ -51,7 +55,7 @@ def is_external_input(model, node, i): if model.get_initializer(node.input[i]) is None: return True else: - if node.op_type == "StreamingFCLayer_Batch": + if node.op_type == "MatrixVectorActivation": if node_inst.get_nodeattr("mem_mode") == "external": return True return False @@ -61,7 +65,9 @@ def is_external_output(model, node, i): # indicate whether output i of node should be made external # True only if output is unconnected consumers = model.find_consumers(node.output[i]) - if consumers is None: + if consumers == []: + # TODO should ideally check if tensor is in top-level + # outputs return True return False @@ -80,12 +86,15 @@ class CreateStitchedIP(Transformation): The packaged block design IP can be found under the ip subdirectory. 
""" - def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False): + def __init__( + self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[] + ): super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns self.ip_name = ip_name self.vitis = vitis + self.signature = signature self.has_aximm = False self.has_m_axis = False self.m_axis_idx = 0 @@ -157,11 +166,20 @@ def connect_axi(self, node): "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, aximm_intf_name[0][0]) ) + ext_if_name = "m_axi_gmem%d" % (len(self.intf_names["aximm"])) + self.connect_cmds.append( + "set_property name %s [get_bd_intf_ports m_axi_gmem_0]" % ext_if_name + ) + self.connect_cmds.append("assign_bd_address") + seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name) + self.connect_cmds.append( + "set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name) + ) + # TODO should propagate this information from the node instead of 4G self.connect_cmds.append( - "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]" + "set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name) ) - self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])] - assert self.has_aximm is False, "Currently limited to one AXI-MM interface" + self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])] self.has_aximm = True def connect_m_axis_external(self, node, idx=None): @@ -210,12 +228,65 @@ def connect_s_axis_external(self, node, idx=None): ) self.s_axis_idx += 1 + def insert_signature(self, checksum_count): + signature_vlnv = "AMD:user:axi_info_top:1.0" + signature_name = "axi_info_top0" + self.create_cmds.append( + "create_bd_cell -type ip -vlnv %s %s" % (signature_vlnv, signature_name) + ) + self.create_cmds.append( + "set_property -dict [list " + "CONFIG.SIG_CUSTOMER {%s} " + "CONFIG.SIG_APPLICATION {%s} " + "CONFIG.VERSION {%s} " + "CONFIG.CHECKSUM_COUNT {%s} " + "] [get_bd_cells %s]" + % ( + self.signature[0], + self.signature[1], + self.signature[2], + checksum_count, + signature_name, + ) + ) + # set clk and reset + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" + % signature_name + ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" + % signature_name + ) + fclk_mhz = 1 / (self.clk_ns * 0.001) + fclk_hz = fclk_mhz * 1000000 + self.connect_cmds.append( + "set_property -dict [list " + "CONFIG.FREQ_HZ {%f} " + "CONFIG.CLK_DOMAIN {ap_clk} " + "] [get_bd_intf_pins %s/s_axi]" + % ( + fclk_hz, + signature_name, + ) + ) + # make axilite interface external + self.connect_cmds.append( + "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name + ) + self.connect_cmds.append( + "set_property name s_axis_info [get_bd_intf_ports s_axi_0]" + ) + self.connect_cmds.append("assign_bd_address") + def apply(self, model): # ensure non-relative readmemh .dat files model = model.transform(ReplaceVerilogRelPaths()) ip_dirs = ["list"] # add RTL streamer IP - ip_dirs.append("/workspace/finn/finn-rtllib/memstream") + ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") + if self.signature: + ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: warnings.warn( """First node is not StreamingFIFO or IODMA. 
@@ -254,10 +325,32 @@ def apply(self, model): "[get_bd_intf_pins %s/%s]" % (producer.name, src_intf_name, node.name, dst_intf_name) ) + + # process external inputs and outputs in top-level graph input order + for input in model.graph.input: + inp_name = input.name + inp_cons = model.find_consumers(inp_name) + assert inp_cons != [], "No consumer for input " + inp_name + assert len(inp_cons) == 1, "Multiple consumers for input " + inp_name + node = inp_cons[0] + node_inst = getCustomOp(node) + for i in range(len(node.input)): + if node.input[i] == inp_name: + self.connect_s_axis_external(node, idx=i) + for output in model.graph.output: + out_name = output.name + node = model.find_producer(out_name) + assert node is not None, "No producer for output " + out_name + node_inst = getCustomOp(node) for i in range(len(node.output)): - if is_external_output(model, node, i): + if node.output[i] == out_name: self.connect_m_axis_external(node, idx=i) + if self.signature: + # extract number of checksum layers from graph + checksum_layers = model.get_nodes_by_op_type("checksum") + self.insert_signature(len(checksum_layers)) + # create a temporary folder for the project prjname = "finn_vivado_stitch_proj" vivado_stitch_proj_dir = make_build_dir(prefix="vivado_stitch_proj_") @@ -330,6 +423,13 @@ def apply(self, model): ) % (vivado_stitch_proj_dir, block_vendor, block_library, block_name) ) + # in some cases, the IP packager seems to infer an aperture of 64K or 4G, + # preventing address assignment of the DDR_LOW and/or DDR_HIGH segments + # the following is a hotfix to remove this aperture during IODMA packaging + tcl.append( + "ipx::remove_segment -quiet m_axi_gmem0:APERTURE_0 " + "[ipx::get_address_spaces m_axi_gmem0 -of_objects [ipx::current_core]]" + ) tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv) tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv) # mark bus interface params as user-resolvable to avoid FREQ_MHZ mismatches @@ -414,12 +514,28 @@ def apply(self, model): "ipx::add_file dcp/%s.dcp " "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name ) + # add a rudimentary driver mdd to get correct ranges in xparameters.h later on + example_data_dir = pk.resource_filename("finn.qnn-data", "mdd-data/") + copytree(example_data_dir, vivado_stitch_proj_dir + "/data") + tcl.append("file copy -force data ip/") + tcl.append("ipx::add_file_group -type software_driver {} [ipx::current_core]") + tcl.append( + "set_property type mdd [ipx::add_file data/finn_design.mdd " + "[ipx::get_file_groups xilinx_softwaredriver -of_objects " + "[ipx::current_core]]]" + ) + tcl.append( + "set_property type tclSource [ipx::add_file data/finn_design.tcl " + "[ipx::get_file_groups xilinx_softwaredriver -of_objects " + "[ipx::current_core]]]" + ) tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv) tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv) # export list of used Verilog files (for rtlsim later on) tcl.append( - "set all_v_files [get_files -filter {FILE_TYPE == Verilog " - + "&& USED_IN_SYNTHESIS == 1} ]" + "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " + + "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog " + + '|| FILE_TYPE == "Verilog Header")}]' ) v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir tcl.append("set fp [open %s w]" % v_file_list) @@ -441,4 +557,13 @@ def apply(self, model): bash_command = ["bash", make_project_sh] process_compile = subprocess.Popen(bash_command,
stdout=subprocess.PIPE) process_compile.communicate() + # wrapper may be created in different location depending on Vivado version + if not os.path.isfile(wrapper_filename): + # check in alternative location (.gen instead of .srcs) + wrapper_filename_alt = wrapper_filename.replace(".srcs", ".gen") + if os.path.isfile(wrapper_filename_alt): + model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) + else: + raise Exception("CreateStitchedIP failed, no wrapper HDL found.") + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py index dcb66a8538..732b82c675 100644 --- a/src/finn/transformation/fpgadataflow/externalize_params.py +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -27,8 +27,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.transformation.base import Transformation -from finn.util.basic import get_by_name +from qonnx.transformation.base import Transformation +from qonnx.util.basic import get_by_name class ExternalizeParams(Transformation): diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 2bda788313..6792017223 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -28,12 +28,13 @@ import json import warnings +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import ApplyConfig +from qonnx.util.basic import get_by_name from finn.analysis.fpgadataflow.floorplan_params import floorplan_params -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.general import ApplyConfig -from finn.util.basic import get_by_name, make_build_dir +from finn.util.basic import make_build_dir class Floorplan(Transformation): @@ -151,7 +152,7 @@ def apply(self, model): partition_cnt += 1 continue elif not ( - node.op_type == "StreamingFCLayer_Batch" + node.op_type == "MatrixVectorActivation" and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index 2a7d9e9066..1fede06678 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -27,10 +27,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os +import qonnx.custom_op.registry as registry import warnings +from qonnx.transformation.base import NodeLocalTransformation -import finn.custom_op.registry as registry -from finn.transformation.base import NodeLocalTransformation from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 58efe65eb5..51da7958b1 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -1,9 +1,9 @@ import warnings from onnx import TensorProto from onnx import helper as oh +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node @@ -46,7 +46,7 @@ def apply(self, model): if _suitable_node(n): for output_name in n.output: consumers = model.find_consumers(output_name) - if consumers is None: + if consumers == []: continue if len(consumers) > 1: warnings.warn( @@ -62,7 +62,7 @@ def apply(self, model): # If FC and external mem, it could be connected to input 1 if ( - consumer.op_type == "StreamingFCLayer_Batch" + consumer.op_type == "MatrixVectorActivation" and n1.get_nodeattr("mem_mode") == "external" ): # get input idx diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index c8bb716922..b378a06ff6 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -1,10 +1,38 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ import numpy as np import warnings from onnx import TensorProto from onnx import helper as oh +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node @@ -62,7 +90,7 @@ def apply(self, model): if _suitable_node(n): for n_output in n.output: consumers = model.find_consumers(n_output) - if consumers is None: + if consumers == []: continue if len(consumers) > 1: warnings.warn( diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py new file mode 100644 index 0000000000..21ec3f049f --- /dev/null +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +from onnx import TensorProto +from onnx import helper as oh +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames + +from finn.util.fpgadataflow import is_fpgadataflow_node + + +def _is_hook_node(node): + if node.op_type in ["CheckSum"]: + return True + else: + return False + + +def _suitable_node(node): + if node is not None: + if is_fpgadataflow_node(node) is True: + if _is_hook_node(node) is False: + return True + else: + return False + else: + return False + else: + return False + + +class InsertHook(Transformation): + """Inserting hook layer after each layer that has the node attribute + 'output_hook' specified""" + + def __init__(self): + super().__init__() + + def apply(self, model): + list_supported_hooks = ["checksum"] + graph = model.graph + node_ind = -1 + graph_modified = False + for n in graph.node: + node_ind += 1 + if _suitable_node(n): + for output_name in n.output: + consumers = model.find_consumers(output_name) + assert len(consumers) <= 1, ( + n.name + + ": HLS node with fan-out higher than 1 cannot be stitched" + ) + n0 = getCustomOp(n) + n0_hook = n0.get_nodeattr("output_hook") + if n0_hook in list_supported_hooks: + if n0_hook == "checksum": + if len(consumers) == 1: + if consumers[0].op_type == "CheckSum": + continue + n0_normal_oshape = n0.get_normal_output_shape() + n0_folded_oshape = n0.get_folded_output_shape() + n0_odt = n0.get_output_datatype() + items_per_word = n0.get_nodeattr("PE") + words_per_frame = np.prod(n0_folded_oshape[:-1]) + chk_otensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0_normal_oshape, + ) + chk_result = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + [1], + ) + chk_node = oh.make_node( + "CheckSum", + [output_name], + outputs=[chk_otensor.name, chk_result.name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + words_per_frame=words_per_frame, + items_per_word=items_per_word, + inputDataType=str(n0_odt.name), + folded_shape=n0_folded_oshape, + ) + # insert checksum node + graph.node.insert(node_ind + 1, chk_node) + # insert newly-created tensors + graph.value_info.append(chk_otensor) + graph.value_info.append(chk_result) + + # set chk output tensor as new input tensor of second node + if len(consumers) == 1: + consumers[0].input[0] = chk_otensor.name + else: + model.graph.output.pop() + model.graph.output.append(chk_otensor) + model.graph.value_info.remove(chk_otensor) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + graph_modified = True + return (model, graph_modified) + + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index d0ef270816..4b4eb6362f 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -30,18 +30,27 @@ import numpy as np from onnx import TensorProto from onnx import helper as oh - -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.general import SortGraph -from finn.util.basic import get_by_name +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import SortGraph +from 
qonnx.util.basic import get_by_name class InsertIODMA(Transformation): - """Insert DMA nodes on all inputs and outputs.""" + """Insert DMA nodes on inputs and outputs, or as specified by filters in + the constructor.""" - def __init__(self, max_intfwidth=32): + def __init__( + self, + max_intfwidth=32, + insert_input=True, + insert_output=True, + insert_extmemw=True, + ): super().__init__() + self.insert_input = insert_input + self.insert_output = insert_output + self.insert_extmemw = insert_extmemw assert ( 2 ** math.log2(max_intfwidth) == max_intfwidth ), "max_intfwidth must be a power of 2" @@ -59,16 +68,16 @@ def get_mem_init(self, weights, pe, simd): . """ - # TODO: refactor this into streamingfclayer_batch.py, could go into + # TODO: refactor this into matrixvectoractivation.py, could go into # make_weight_file except it doesn't write a file but returns a npy # array instead w_shape = weights.shape - assert len(w_shape) == 2, "weights withincorrect number of dims" + assert len(w_shape) == 2, "weights with incorrect number of dims" inp_w, out_w = w_shape assert out_w % pe == 0, "Malformed weight matrix" assert inp_w % simd == 0, "Malformed weight matrix" - reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd) + reshaped_w = np.zeros(inp_w * out_w, dtype=np.float32).reshape(-1, pe * simd) addr = 0 for fr in range(out_w // pe): @@ -94,152 +103,163 @@ def apply(self, model): get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow" for x in all_nodes ) - # parse streamingfclayers looking for external weights with no attached IODMA - fc_extw_nodes = list( - filter( - lambda x: x.op_type == "StreamingFCLayer_Batch" - and getCustomOp(x).get_nodeattr("mem_mode") == "external" - and model.find_producer(x.input[1]) is None, - all_nodes, - ) - ) # insert IODMAs for graph inputs - graph_in_names = [x.name for x in model.graph.input] - for graph_in_name in graph_in_names: - first_node = model.find_consumer(graph_in_name) - if first_node.op_type == "IODMA": - # IODMA already inserted for this input - continue - else: - in_shape = model.get_tensor_shape(graph_in_name) - in_dtype = model.get_tensor_datatype(graph_in_name) - first_node_inst = getCustomOp(first_node) - in_folded_shape = first_node_inst.get_folded_input_shape() - # take advantage of AXI stream width padding for DMA alignment - # (AXI streams are always padded to 8 bits) - # this is the width of stream output expected from the DMA - padded_instream_width = first_node_inst.get_instream_width_padded() - padded_instream_bytes = padded_instream_width // 8 + if self.insert_input: + graph_in_names = [x.name for x in model.graph.input] + for graph_in_name in graph_in_names: + first_node = model.find_consumer(graph_in_name) + if first_node.op_type == "IODMA": + # IODMA already inserted for this input + continue + else: + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + first_node_inst = getCustomOp(first_node) + in_folded_shape = first_node_inst.get_folded_input_shape() + # take advantage of AXI stream width padding for DMA alignment + # (AXI streams are always padded to 8 bits) + # this is the width of stream output expected from the DMA + padded_instream_width = first_node_inst.get_instream_width_padded() + padded_instream_bytes = padded_instream_width // 8 + # determine the feasible interface width + transfer_bits = padded_instream_width * np.prod( + in_folded_shape[:-1] + ) + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert ( + intfwidth % 8 == 0 + 
), "No feasible interface width for transfer size" + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute first node input + # FIXME: currently always using 8-bit dtypes to work around the + # padding problems for i/o DMA + first_node.input[0] = first_node_in.name + dma_node = oh.make_node( + "IODMA", + [graph_in_name], + [first_node_in.name], + numInputVectors=in_folded_shape[:-1], + NumChannels=padded_instream_bytes, + dataType="UINT8", + intfWidth=intfwidth, + streamWidth=padded_instream_width, + direction="in", + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ) + model.graph.node.insert(0, dma_node) + modified = True + # insert IODMAs for graph outputs + if self.insert_output: + graph_out_names = [x.name for x in model.graph.output] + for graph_out_name in graph_out_names: + final_node = model.find_producer(graph_out_name) + if final_node.op_type == "IODMA": + continue + else: + out_shape = model.get_tensor_shape(graph_out_name) + out_dtype = model.get_tensor_datatype(graph_out_name) + final_node_inst = getCustomOp(final_node) + out_folded_shape = final_node_inst.get_folded_output_shape() + # take advantage of AXI stream width padding for DMA alignment + # (AXI streams are always padded to 8 bits) + # this is the width of stream input to DMA + padded_outstream_width = ( + final_node_inst.get_outstream_width_padded() + ) + padded_outstream_bytes = padded_outstream_width // 8 + # determine the feasible interface width + transfer_bits = padded_outstream_width * np.prod( + out_folded_shape[:-1] + ) + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert ( + intfwidth % 8 == 0 + ), "No feasible interface width for transfer size" + # make new buffer + final_node_out = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(final_node_out) + model.set_tensor_datatype(final_node_out.name, out_dtype) + # reroute final node output to final_node_out_name + final_node.output[0] = final_node_out.name + # FIXME: currently always using 8-bit dtypes to work around the + # padding problems for i/o DMA + dma_node = oh.make_node( + "IODMA", + [final_node_out.name], + [graph_out_name], + numInputVectors=out_folded_shape[:-1], + NumChannels=padded_outstream_bytes, + dataType="UINT8", + intfWidth=intfwidth, + streamWidth=padded_outstream_width, + direction="out", + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ) + model.graph.node.append(dma_node) + modified = True + if self.insert_extmemw: + # parse matrixvectoractivation layers looking for external weights with no + # attached IODMA + fc_extw_nodes = list( + filter( + lambda x: x.op_type == "MatrixVectorActivation" + and getCustomOp(x).get_nodeattr("mem_mode") == "external" + and model.find_producer(x.input[1]) is None, + all_nodes, + ) + ) + for fc_node in fc_extw_nodes: + fc_inst = getCustomOp(fc_node) + fc_w_name = fc_node.input[1] + w_shape = model.get_tensor_shape(fc_w_name) + w_dtype = model.get_tensor_datatype(fc_w_name) # determine the feasible interface width - transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) + transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() intfwidth = math.gcd(transfer_bits, self.max_intfwidth) assert ( intfwidth % 8 == 0 ), "No feasible interface width for transfer size" + # calculate width 
of stream output from DMA + pe = get_by_name(fc_node.attribute, "PE").i + simd = get_by_name(fc_node.attribute, "SIMD").i + streamWidth = fc_inst.get_weightstream_width_padded() # make new buffer - first_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + W = model.get_initializer(fc_w_name) + iodma_mem = self.get_mem_init(W, pe, simd) + model.set_initializer(fc_w_name, iodma_mem) + + fc_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape ) - model.graph.value_info.append(first_node_in) - model.set_tensor_datatype(first_node_in.name, in_dtype) - # reroute first node input - # FIXME: currently always using 8-bit dtypes to work around the - # padding problems for i/o DMA - first_node.input[0] = first_node_in.name + model.graph.value_info.append(fc_node_in) + model.set_tensor_datatype(fc_node_in.name, w_dtype) + model.set_initializer(fc_node_in.name, W) dma_node = oh.make_node( "IODMA", - [graph_in_name], - [first_node_in.name], - numInputVectors=in_folded_shape[:-1], - NumChannels=padded_instream_bytes, - dataType="UINT8", + [fc_w_name], + [fc_node_in.name], + numInputVectors=[iodma_mem.shape[0]], + NumChannels=pe * simd, + dataType=str(w_dtype.name), intfWidth=intfwidth, - streamWidth=padded_instream_width, + streamWidth=streamWidth, direction="in", + burstMode="wrap", domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", ) + fc_node.input[1] = fc_node_in.name model.graph.node.insert(0, dma_node) modified = True - # insert IODMAs for graph outputs - graph_out_names = [x.name for x in model.graph.output] - for graph_out_name in graph_out_names: - final_node = model.find_producer(graph_out_name) - if final_node.op_type == "IODMA": - continue - else: - out_shape = model.get_tensor_shape(graph_out_name) - out_dtype = model.get_tensor_datatype(graph_out_name) - final_node_inst = getCustomOp(final_node) - out_folded_shape = final_node_inst.get_folded_output_shape() - # take advantage of AXI stream width padding for DMA alignment - # (AXI streams are always padded to 8 bits) - # this is the width of stream input to DMA - padded_outstream_width = final_node_inst.get_outstream_width_padded() - padded_outstream_bytes = padded_outstream_width // 8 - # determine the feasible interface width - transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1]) - intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" - # make new buffer - final_node_out = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(final_node_out) - model.set_tensor_datatype(final_node_out.name, out_dtype) - # reroute final node output to final_node_out_name - final_node.output[0] = final_node_out.name - # FIXME: currently always using 8-bit dtypes to work around the - # padding problems for i/o DMA - dma_node = oh.make_node( - "IODMA", - [final_node_out.name], - [graph_out_name], - numInputVectors=out_folded_shape[:-1], - NumChannels=padded_outstream_bytes, - dataType="UINT8", - intfWidth=intfwidth, - streamWidth=padded_outstream_width, - direction="out", - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ) - model.graph.node.append(dma_node) - modified = True - - for fc_node in fc_extw_nodes: - fc_inst = getCustomOp(fc_node) - fc_w_name = fc_node.input[1] - w_shape = model.get_tensor_shape(fc_w_name) - w_dtype = 
model.get_tensor_datatype(fc_w_name) - # determine the feasible interface width - transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() - intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert intfwidth % 8 == 0, "No feasible interface width for transfer size" - # calculate width of stream output from DMA - pe = get_by_name(fc_node.attribute, "PE").i - simd = get_by_name(fc_node.attribute, "SIMD").i - streamWidth = fc_inst.get_weightstream_width_padded() - # make new buffer - W = model.get_initializer(fc_w_name) - iodma_mem = self.get_mem_init(W, pe, simd) - model.set_initializer(fc_w_name, iodma_mem) - - fc_node_in = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape - ) - model.graph.value_info.append(fc_node_in) - model.set_tensor_datatype(fc_node_in.name, w_dtype) - model.set_initializer(fc_node_in.name, W) - dma_node = oh.make_node( - "IODMA", - [fc_w_name], - [fc_node_in.name], - numInputVectors=[iodma_mem.shape[0]], - NumChannels=pe * simd, - dataType=str(w_dtype.name), - intfWidth=intfwidth, - streamWidth=streamWidth, - direction="in", - burstMode="wrap", - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ) - fc_node.input[1] = fc_node_in.name - model.graph.node.insert(0, dma_node) - modified = True if modified: model = model.transform(SortGraph()) return (model, modified) diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 34cb61346d..1610916eb6 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -29,10 +29,9 @@ import numpy as np from onnx import TensorProto from onnx import helper as oh - -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.util.basic import get_by_name +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.util.basic import get_by_name class InsertTLastMarker(Transformation): @@ -97,7 +96,7 @@ def apply(self, model): first_node = model.find_consumers(graph_in_name) # skip if no consumers (this may be the case for unused initializers) # TODO: fix this with a cleanup transform - if first_node is None: + if first_node == []: continue assert len(first_node) == 1, "Input fans out to multiple nodes" first_node = first_node[0] @@ -106,7 +105,7 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type == "StreamingFCLayer_Batch" + first_node.op_type == "MatrixVectorActivation" and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): @@ -123,7 +122,7 @@ def apply(self, model): inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: if ( - first_node.op_type == "StreamingFCLayer_Batch" + first_node.op_type == "MatrixVectorActivation" and inp_idx == 1 ): stream_width = int(custom_op.get_weightstream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py index d43d81716a..d4684dc83c 100644 --- a/src/finn/transformation/fpgadataflow/make_deployment.py +++ b/src/finn/transformation/fpgadataflow/make_deployment.py @@ -29,10 +29,10 @@ import os import subprocess from distutils.dir_util import copy_tree +from qonnx.transformation.base import Transformation from shutil import copy 
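For reference, a minimal standalone sketch of the interface-width rule used by the InsertIODMA hunks earlier in this patch: the AXI master width is the greatest common divisor of the total transfer size (in bits) and the maximum allowed width, and must stay byte-aligned. Function name and example values below are illustrative, not the FINN API.

```python
import math

import numpy as np


def feasible_intf_width(stream_width_padded, folded_shape, max_intfwidth=64):
    # total bits moved per inference = padded stream width x number of beats
    transfer_bits = stream_width_padded * int(np.prod(folded_shape[:-1]))
    # widest byte-aligned width that evenly divides the transfer, capped at max
    intfwidth = math.gcd(transfer_bits, max_intfwidth)
    assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
    return intfwidth


print(feasible_intf_width(24, (1, 10, 10, 3)))  # -> 32
```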
import finn.transformation.fpgadataflow.templates as templates -from finn.transformation.base import Transformation from finn.util.basic import make_build_dir diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 2c3bd7ee59..dce98e54a3 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -31,19 +31,17 @@ import numpy as np import os +import qonnx import shutil import warnings +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple -import finn.core.datatype as dtp +import finn.util import finn.util.data_packing as dpk -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.util.basic import ( - gen_finn_dt_tensor, - make_build_dir, - roundup_to_integer_multiple, -) +from finn.util.basic import make_build_dir from finn.util.data_packing import ( hexstring2npbytearray, pack_innermost_dim_as_hex_string, @@ -101,6 +99,42 @@ def apply(self, model): ) driver_base_py = pynq_driver_dir + "/driver_base.py" shutil.copy(driver_base_template, driver_base_py) + # driver depends on qonnx and finn packages + # extract individual source files and copy to driver folder + qonnx_target_path = pynq_driver_dir + "/qonnx" + finn_target_path = pynq_driver_dir + "/finn" + os.makedirs(qonnx_target_path + "/core", exist_ok=True) + os.makedirs(qonnx_target_path + "/util", exist_ok=True) + os.makedirs(finn_target_path + "/util", exist_ok=True) + qonnx_path = qonnx.__path__[0] + finn_util_path = finn.util.__path__[0] + files_to_copy = [] + files_to_copy.append( + (qonnx_path + "/core/datatype.py", qonnx_target_path + "/core/datatype.py") + ) + files_to_copy.append( + (qonnx_path + "/core/__init__.py", qonnx_target_path + "/core/__init__.py") + ) + files_to_copy.append( + (qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py") + ) + files_to_copy.append( + (qonnx_path + "/util/__init__.py", qonnx_target_path + "/util/__init__.py") + ) + files_to_copy.append( + ( + finn_util_path + "/data_packing.py", + finn_target_path + "/util/data_packing.py", + ) + ) + files_to_copy.append( + ( + finn_util_path + "/__init__.py", + finn_target_path + "/util/__init__.py", + ) + ) + for (src_file, target_file) in files_to_copy: + shutil.copy(src_file, target_file) # extract input-output shapes from the graph # TODO convert this to an analysis pass? idt = [] @@ -264,20 +298,6 @@ def apply(self, model): ) shutil.copy(validate_template, validate_py) - # copy all the dependencies into the driver folder - # driver imports utils/data_packing and core/datatype - # both of which are in finn-base - # e.g. /workspace/finn-base/src/finn/util/data_packing.py - dpk_root = dpk.__file__ - # e.g. /workspace/finn-base/src/finn/util - dpk_root = dpk_root.replace("data_packing.py", "") - # e.g. /workspace/finn-base/src/finn/core/datatype.py - dtp_root = dtp.__file__ - # e.g. 
/workspace/finn-base/src/finn/core - dtp_root = dtp_root.replace("datatype.py", "") - shutil.copytree(dpk_root, pynq_driver_dir + "/finn/util") - shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core") - # generate weight files for runtime-writable layers for sdp_ind, sdp_node in enumerate(model.graph.node): @@ -288,7 +308,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]: + if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 84d587b6ce..a589cb039c 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -28,11 +28,13 @@ import os import subprocess +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts from shutil import copy -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) @@ -43,8 +45,6 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts from finn.util.basic import make_build_dir, pynq_part_map from . 
import templates
@@ -62,13 +62,13 @@ def collect_ip_dirs(model, ipstitch_path):
         ), """The directory
         that should contain the generated ip blocks
         doesn't exist."""
         ip_dirs += [ip_dir_value]
-        if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
             if node_inst.get_nodeattr("mem_mode") == "decoupled":
                 need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
     if need_memstreamer:
         # add RTL streamer IP
-        ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
+        ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream")
     return ip_dirs
@@ -152,11 +152,13 @@ def apply(self, model):
         # define kernel instances
         # name kernels connected to graph inputs as idmaxx
         # name kernels connected to graph outputs as odmaxx
-        if producer is None or consumer is None:
+        if (producer is None) or (consumer == []):
+            # TODO not a good way of checking for external in/out
+            # should look at the list of top-level in/out instead
             if producer is None:
                 instance_names[node.name] = "idma" + str(idma_idx)
                 idma_idx += 1
-            elif consumer is None:
+            elif consumer == []:
                 instance_names[node.name] = "odma" + str(odma_idx)
                 odma_idx += 1
             config.append(
@@ -279,10 +281,16 @@ def apply(self, model):
         copy(bitfile_name, deploy_bitfile_name)
         # set bitfile attribute
         model.set_metadata_prop("bitfile", deploy_bitfile_name)
-        hwh_name = (
+        hwh_name_alts = [
             vivado_pynq_proj_dir
-            + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh"
-        )
+            + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh",
+            vivado_pynq_proj_dir
+            + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh",
+        ]
+        hwh_name = None
+        for hwh_name_cand in hwh_name_alts:
+            if os.path.isfile(hwh_name_cand):
+                hwh_name = hwh_name_cand
-        if not os.path.isfile(hwh_name):
+        if hwh_name is None:
             raise Exception(
                 "Synthesis failed, no bitfile found. Check logs under %s"
diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py
index 0a0c45b6be..bc020ca428 100644
--- a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py
+++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py
@@ -26,8 +26,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
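Several hunks in this patch swap `is None` checks for `== []` because qonnx's ModelWrapper.find_consumers returns an empty list, never None, when nothing reads a tensor. A self-contained illustration of the check; FakeModel is a stand-in for the qonnx class, not its implementation:

```python
class FakeModel:
    """Stand-in for qonnx's ModelWrapper, for illustration only."""

    def __init__(self, consumers_by_tensor):
        self.consumers_by_tensor = consumers_by_tensor

    def find_consumers(self, tensor_name):
        # returns an empty list (never None) when no node reads the tensor
        return self.consumers_by_tensor.get(tensor_name, [])


m = FakeModel({"act0": ["MatrixVectorActivation_0"]})
assert m.find_consumers("act0") != []  # tensor feeds another node
assert m.find_consumers("global_out") == []  # graph output: empty, not None
```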
-from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 8b332972ca..07021c1e8d 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -29,10 +29,11 @@ import copy import multiprocessing as mp import os +import qonnx.custom_op.registry as registry +from qonnx.transformation.base import Transformation +from qonnx.util.basic import get_num_default_workers -import finn.custom_op.registry as registry -from finn.transformation.base import Transformation -from finn.util.basic import get_num_default_workers, make_build_dir +from finn.util.basic import make_build_dir from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index 4fdcf3939f..2ebd6310f0 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -27,10 +27,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import qonnx.custom_op.registry as registry import warnings +from qonnx.transformation.base import Transformation -import finn.custom_op.registry as registry -from finn.transformation.base import Transformation from finn.util.basic import make_build_dir from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py index 66799ff429..645d86cf14 100644 --- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -26,8 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry -from finn.transformation.base import NodeLocalTransformation +import qonnx.custom_op.registry as registry +from qonnx.transformation.base import NodeLocalTransformation + from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py index 7850d37423..4e7970caa0 100644 --- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py +++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py @@ -27,9 +27,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
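PrepareCppSim (hunk above) fans per-node C++ code generation out across a process pool sized by qonnx's get_num_default_workers(). A rough sketch of that pattern; the worker body and the NUM_DEFAULT_WORKERS environment variable with its default of 1 are assumptions here, not the actual transform:

```python
import multiprocessing as mp
import os


def get_num_default_workers():
    # assumed behavior: NUM_DEFAULT_WORKERS env var, defaulting to 1
    return int(os.getenv("NUM_DEFAULT_WORKERS", 1))


def prepare_node(node_name):
    # placeholder for per-node C++ codegen/compile work
    return node_name + ": codegen done"


if __name__ == "__main__":
    nodes = ["MatrixVectorActivation_0", "Thresholding_Batch_0"]
    with mp.Pool(get_num_default_workers()) as pool:
        print(pool.map(prepare_node, nodes))
```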
import os +import qonnx.custom_op.registry as registry +from qonnx.transformation.base import Transformation -import finn.custom_op.registry as registry -from finn.transformation.base import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index caf891bc44..a08d153cb2 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -26,8 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import finn.custom_op.registry as registry -from finn.transformation.base import Transformation +import qonnx.custom_op.registry as registry +from qonnx.transformation.base import Transformation + from finn.util.fpgadataflow import is_fpgadataflow_node diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index ce7cf7bc58..0139c71666 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -29,19 +29,20 @@ import math import numpy as np import warnings +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.pyverilator import pyverilate_stitched_ip, reset_rtlsim, toggle_clk +from finn.util.pyverilator import pyverilate_stitched_ip def reset_implementation(node): @@ -99,7 +100,7 @@ def apply(self, model): # bypass shallow fifos shallow_fifos.append(node) consumers = model.find_consumers(node.output[0]) - if consumers is None: + if consumers == []: producer = model.find_producer(node.input[0]) for idx, inp in enumerate(producer.output): if inp == node.input[0]: @@ -137,7 +138,7 @@ class CapConvolutionFIFODepths(Transformation): Background: The simulation-based rtlsim_exec tends to overestimate the required depth of FIFOs between the ConvolutionInputGenerator (here called SWG) and the - StreamingFCLayer (here called MVAU). As the SWG has an internal buffer of 1 + MatrixVectorActivation (here called MVAU). As the SWG has an internal buffer of 1 image row, we use this as a rule of thumb to set FIFO depth to be no larger than 1 row. 
""" @@ -152,7 +153,7 @@ def apply(self, model): # TODO move this to own transformation for node in model.graph.node: # look for following pattern: - # ConvolutionInputGenerator -> StreamingFIFO -> StreamingFCLayer + # ConvolutionInputGenerator -> StreamingFIFO -> MatrixVectorActivation if node.op_type == "StreamingFIFO": fifo_prod = model.find_producer(node.input[0]) fifo_cons = model.find_consumer(node.output[0]) @@ -162,7 +163,7 @@ def apply(self, model): continue if fifo_cons is None: continue - if fifo_cons.op_type != "StreamingFCLayer_Batch": + if fifo_cons.op_type != "MatrixVectorActivation": continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -247,7 +248,7 @@ def apply(self, model): node = getCustomOp(node) node.set_nodeattr("inFIFODepth", self.max_depth) node.set_nodeattr("outFIFODepth", self.max_depth) - if node.onnx_node.op_type == "StreamingFCLayer_Batch": + if node.onnx_node.op_type == "MatrixVectorActivation": mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -377,7 +378,7 @@ def apply(self, model): getCustomOp(node).set_nodeattr("outFIFODepth", 0) # for every FC node we changed from external to decoupled, # change back and reset implementation - if node.op_type == "StreamingFCLayer_Batch": + if node.op_type == "MatrixVectorActivation": if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 64d7a08072..62131b7ac3 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -28,12 +28,12 @@ import numpy as np import warnings +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveUniqueNodeNames from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles -from finn.transformation.general import GiveUniqueNodeNames from finn.util.fpgadataflow import is_fpgadataflow_node @@ -62,13 +62,13 @@ class SetFolding(Transformation): Notable exceptions and special behavior: - * When folding dense convolution/FC compute engines (StreamingFCLayer_Batch), + * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), which have two attributes (PE and SIMD): * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max (configurable in the SetFolding initializer, defaults to 36) * then increases PE until the target is met or max PE reached - * When folding depthwise convolutions ("VVAU"/Vector_Vector_Activate_Batch) + * When folding depthwise convolutions ("VVAU"/VectorVectorActivation) or spatial reduction ops (Pool_Batch): * the producer of the node is expected to be a ConvolutionInputGenerator with depthwise=1, whose SIMD value will be set equal to the PE value of @@ -107,13 +107,13 @@ def apply(self, model): simd_ops = ["DownSampler", "FMPadding_Batch", "ConvolutionInputGenerator"] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["Vector_Vector_Activate_Batch", "Pool_Batch"] + depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] 
for node in graph.node: if not is_fpgadataflow_node(node): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "StreamingFCLayer_Batch": + if op_type == "MatrixVectorActivation": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) @@ -155,7 +155,7 @@ def apply(self, model): pe = node_inst.get_nodeattr("PE") swu_node_inst.set_nodeattr("SIMD", pe) else: - if op_type == "Vector_Vector_Activate_Batch": + if op_type == "VectorVectorActivation": ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_Batch": ksize = node_inst.get_nodeattr("KernelSize") diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py index 49cd6c82bc..6070cce636 100644 --- a/src/finn/transformation/fpgadataflow/synth_ooc.py +++ b/src/finn/transformation/fpgadataflow/synth_ooc.py @@ -27,9 +27,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +from qonnx.transformation.base import Transformation from shutil import copy2 -from finn.transformation.base import Transformation from finn.util.basic import make_build_dir from finn.util.vivado import out_of_context_synth @@ -52,7 +52,7 @@ def file_to_basename(x): top_module_name = model.get_metadata_prop("wrapper_filename") top_module_name = file_to_basename(top_module_name).strip(".v") build_dir = make_build_dir("synth_out_of_context_") - verilog_extensions = [".v", ".vh"] + verilog_extensions = [".v", ".sv", ".vh"] with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: all_verilog_srcs = f.read().split() for file in all_verilog_srcs: diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index 31dd22573e..05ee6ad920 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -60,7 +60,7 @@ import argparse import numpy as np import os -from finn.core.datatype import DataType +from qonnx.core.datatype import DataType from driver_base import FINNExampleOverlay # dictionary describing the I/O of the FINN-generated accelerator diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index a12f359c7d..78bcdea0d7 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -103,8 +103,8 @@ # set board part repo paths to find PYNQ-Z1/Z2 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] set paths_param [get_param board.repoPaths] -lappend paths_prop /workspace/board_files -lappend paths_param /workspace/board_files +lappend paths_prop $::env(FINN_ROOT)/deps/board_files +lappend paths_param $::env(FINN_ROOT)/deps/board_files set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] set_param board.repoPaths $paths_param @@ -114,6 +114,9 @@ } elseif {$BOARD == "ZCU102"} { set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "RFSoC2x2"} { + set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" } elseif {$BOARD == "Ultra96"} { set_property board_part avnet.com:ultra96v1:part0:1.2 [current_project] set ZYNQ_TYPE "zynq_us+" @@ -129,7 +132,7 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { - create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ps + create_bd_cell -type ip 
-vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps] @@ -182,7 +185,7 @@ #align base address to range set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)] #perform assignment - assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg] -offset $offset -range $range + assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg*] -offset $offset -range $range #advance base address set axi_peripheral_base [expr $offset + $range] } diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index a286532141..855b30fe95 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -30,10 +30,15 @@ import os import subprocess from enum import Enum +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + RemoveUnusedTensors, +) -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) @@ -44,12 +49,6 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.general import ( - GiveReadableTensorNames, - GiveUniqueNodeNames, - RemoveUnusedTensors, -) -from finn.transformation.infer_data_layouts import InferDataLayouts from finn.util.basic import make_build_dir from . 
import templates
@@ -214,11 +213,13 @@ def apply(self, model):
         # define kernel instances
         # name kernels connected to graph inputs as idmaxx
         # name kernels connected to graph outputs as odmaxx
+        # TODO not a good way of checking for external in/out
+        # check top-level in/out list instead
         if producer is None:
             instance_names[node.name] = "idma" + str(idma_idx)
             config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
             idma_idx += 1
-        elif consumer is None:
+        elif consumer == []:
             instance_names[node.name] = "odma" + str(odma_idx)
             config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
             odma_idx += 1
@@ -392,8 +393,6 @@ def __init__(
     def apply(self, model):
         _check_vitis_envvars()
-        # first infer layouts
-        model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [InsertIODMA(512), InsertDWC()]
         for trn in prep_transforms:
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index 6c9a297337..cec04a182b 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -1,8 +1,7 @@
 import warnings
-
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name, is_finn_op
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.base import Transformation
+from qonnx.util.basic import get_by_name, is_finn_op


 def _is_fpgadataflow_node(node):
@@ -51,7 +50,7 @@ def apply(self, model):
             producer = model.find_producer(transp_node.input[0])
             if _is_fpgadataflow_node(producer) is True:
                 consumer = model.find_consumer(n.output[0])
-                if consumer.op_type == "StreamingFCLayer_Batch":
+                if consumer.op_type == "MatrixVectorActivation":
                     fc_inst = getCustomOp(consumer)
                     mw = fc_inst.get_nodeattr("MW")
                     mh = fc_inst.get_nodeattr("MH")
diff --git a/src/finn/transformation/qonnx/__init__.py b/src/finn/transformation/qonnx/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
index 70656e4d09..967a127636 100644
--- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
+++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
@@ -26,12 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
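The surrounding convert_qonnx_to_finn.py hunk re-sources the conversion sub-transforms from qonnx. For orientation, a hedged usage sketch of the top-level transform; the file names are examples only:

```python
from qonnx.core.modelwrapper import ModelWrapper

from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

model = ModelWrapper("model_qonnx.onnx")  # hypothetical exported QONNX model
model = model.transform(ConvertQONNXtoFINN())  # Quant nodes -> MultiThreshold
model.save("model_finn.onnx")
```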
+from qonnx.transformation.base import Transformation +from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv +from qonnx.transformation.gemm_to_matmul import GemmToMatMul +from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit +from qonnx.transformation.remove import RemoveIdentityOps -from finn.transformation.base import Transformation -from finn.transformation.extract_conv_bias import ExtractBiasFromConv -from finn.transformation.gemm_to_matmul import GemmToMatMul -from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights from finn.transformation.qonnx.infer_quant_avg_pool_2d import ( AvgPoolAndTruncToQuantAvgPool, @@ -40,7 +41,6 @@ ConvertQuantActToMultiThreshold, default_filter_function_generator, ) -from finn.transformation.remove import RemoveIdentityOps class ConvertQONNXtoFINN(Transformation): diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index 12c854d3ba..80b6042d03 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -27,14 +27,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import qonnx.core.onnx_exec as oxe from onnx import TensorProto, helper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit - -import finn.core.onnx_exec as oxe -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.remove import remove_node_and_rewire +from qonnx.transformation.remove import remove_node_and_rewire class FoldQuantWeights(Transformation): @@ -103,7 +102,7 @@ def apply(self, model): model.set_initializer(node_out, q_node_output) else: # Check next operator type - mul_like_nodes = ["Mul", "Div", "Conv", "MatMul"] + mul_like_nodes = ["Mul", "Div", "Conv", "MatMul", "Gather"] add_like_nodes = ["Add", "Sub"] all_supported_ops = mul_like_nodes.copy() all_supported_ops.extend(add_like_nodes) @@ -146,11 +145,14 @@ def apply(self, model): model.set_initializer(mul_tensor.name, scale) successor = model.find_consumers(node_out) - if successor is None: + if successor == []: raise RuntimeError( "Can only constant fold scaled Quant weights " "if a successor exists." 
) + assert ( + len(successor) == 1 + ), "Only implemented for a single consumer" successor = successor[0] succ_output_name = successor.output[0] diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py index c234bd38d9..5a3f176f1f 100644 --- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py +++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py @@ -29,13 +29,12 @@ import math from onnx import TensorProto, helper - -from finn.core.datatype import DataType -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import get_by_name +from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name def _get_signed_from_upstream(model, trunc_node): @@ -274,7 +273,7 @@ def apply(self, model): "QuantAvgPool2d", [act_scale_div_tensor.name], [act_scale_mul_tensor.name], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", stride=stride, kernel=k_s, ibits=ibits, diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index c8bde7fea8..a50a585077 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -29,16 +29,17 @@ import numpy as np from abc import ABC, abstractmethod from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp +np_default_dtype = np.float32 class QuantActBaseHandler(ABC): """Base class for converting quantized activation expressed in the QONNX dialect to the FINN ONNX dialect. :param model: The model on which this handler should operate. - :type model: class: `finn.core.modelwrapper.ModelWrapper` + :type model: class: `qonnx.core.modelwrapper.ModelWrapper` :param quant_node: The Quant node which a given handler should replace. :param quant_node_index: The index of the Quant node in the given model. 
:type quant_node_index: `int` @@ -146,7 +147,7 @@ def replace_quant_node(self): [n.input[0], thresh_tensor.name], [n.output[0]], out_dtype="FLOAT32", - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", ) graph.node.insert(running_node_index, outp_trans_node) running_node_index += 1 @@ -164,17 +165,16 @@ def replace_quant_node(self): if scale_scalar and bias_scalar and self._q_node.op_type == "BipolarQuant": # Get Quant parameters mul_scale = np.atleast_1d(mul_scale) - # ONNX only accepts 64bit floats as attributes - mul_scale = mul_scale.astype(dtype=np.float64) adder_bias = np.atleast_1d(adder_bias) - adder_bias = adder_bias.astype(dtype=np.float64) # Set Bias and scale - mt_inst.set_nodeattr("out_scale", mul_scale[0]) + # note calls to .item() to get Python float instead of numpy float + # ONNX attribute setting fails otherwise + mt_inst.set_nodeattr("out_scale", mul_scale[0].item()) # FINN applies scale first then bias, # which is the other way around in Brevitas, # we thus need to adjust the bias in the MultiThreshold node - finn_bias = adder_bias[0] * mul_scale[0] + finn_bias = adder_bias[0].item() * mul_scale[0].item() mt_inst.set_nodeattr("out_bias", finn_bias) # Set the output data type @@ -190,8 +190,7 @@ def replace_quant_node(self): zero_bias = False if bias_scalar: adder_bias = np.atleast_1d(adder_bias) - # ONNX only accepts 64bit floats as attributes - adder_bias = adder_bias.astype(dtype=np.float64)[0] + adder_bias = adder_bias[0] add_shape = tuple() if adder_bias == 0.0: zero_bias = True @@ -234,7 +233,7 @@ def replace_quant_node(self): unity_scale = False if scale_scalar: mul_scale = np.atleast_1d(mul_scale) - mul_scale = mul_scale.astype(dtype=np.float64)[0] + mul_scale = mul_scale[0] mul_shape = tuple() if mul_scale == 1.0: unity_scale = True @@ -313,7 +312,7 @@ def _calculate_act_bias(self): # No bias allowed for Relu activations, see: https://github.com/Xilinx/ # brevitas/blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/ # export/onnx/finn/handler/act.py#L48 - bias = np.array([0.0]) + bias = np.array([0.0], dtype=np_default_dtype) return bias def _calculate_thresholds(self): @@ -339,7 +338,9 @@ def _calculate_thresholds(self): num_scale_channels = flat_scale.shape[0] step = np.abs(flat_scale).astype(np.float32) min_threshold = step / 2 - thresholds = np.empty((num_scale_channels, num_thresholds)).astype(np.float32) + thresholds = np.empty( + (num_scale_channels, num_thresholds), dtype=np_default_dtype + ) for c in range(num_scale_channels): for t in range(num_thresholds): thresholds[c][t] = min_threshold[c] + step[c] * t @@ -438,13 +439,13 @@ def _calculate_act_bias(self): # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/ # onnx/finn/handler/act.py#L64 if bit_width == 1.0: - bias = np.array([-0.5]) + bias = np.array([-0.5], dtype=np_default_dtype) else: if narrow: min_non_scaled_val = -(2 ** (bit_width - 1) - 1) else: min_non_scaled_val = -(2 ** (bit_width - 1)) - bias = np.array([min_non_scaled_val]) + bias = np.array([min_non_scaled_val], dtype=np_default_dtype) return bias def _calculate_thresholds(self): @@ -463,7 +464,7 @@ def _calculate_thresholds(self): # blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/ # export/onnx/finn/handler/act.py#L76 if bit_width == 1.0: - thresholds = np.empty([1, 1]) + thresholds = np.empty([1, 1], dtype=np_default_dtype) thresholds[0] = 0 return thresholds else: @@ -477,7 +478,9 @@ def _calculate_thresholds(self): num_scale_channels = flat_scale.shape[0] step = np.abs(flat_scale) 
half_step = step / 2.0 - thresholds = np.empty((num_scale_channels, num_thresholds)) + thresholds = np.empty( + (num_scale_channels, num_thresholds), dtype=np_default_dtype + ) # compute the value of the smallest threshold, we'll neg-bias all # generated thresholds by this much min_threshold = -half_step - step * ((num_thresholds // 2) - 1) diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index 29ba93dfcf..c52d69b0f0 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -28,8 +28,8 @@ import warnings +from qonnx.transformation.base import Transformation -from finn.transformation.base import Transformation from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py index d0ec26a4d1..2e68de698b 100644 --- a/src/finn/transformation/streamline/__init__.py +++ b/src/finn/transformation/streamline/__init__.py @@ -30,16 +30,17 @@ __path__ = extend_path(__path__, __name__) -from finn.transformation.base import Transformation -from finn.transformation.batchnorm_to_affine import BatchNormToAffine -from finn.transformation.general import ( +from qonnx.transformation.base import Transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.general import ( ConvertDivToMul, ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, ) -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.remove import RemoveIdentityOps +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.remove import RemoveIdentityOps + from finn.transformation.streamline.absorb import ( Absorb1BitMulIntoConv, Absorb1BitMulIntoMatMul, diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 97ae3b51a8..0299c4f4d8 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -27,16 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
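The _calculate_thresholds hunks above build a (channels x thresholds) grid in np.float32, starting at a half-step minimum and stepping by the quantization scale. A worked numeric example of that construction; the scales and threshold count are chosen arbitrarily:

```python
import numpy as np

scale = np.array([0.5, 0.25], dtype=np.float32)  # example per-channel scales
num_thresholds = 3  # e.g. a 2-bit unsigned activation: 2**2 - 1 levels
step = np.abs(scale)
min_threshold = step / 2
thresholds = np.empty((scale.shape[0], num_thresholds), dtype=np.float32)
for c in range(scale.shape[0]):
    for t in range(num_thresholds):
        thresholds[c][t] = min_threshold[c] + step[c] * t
print(thresholds)  # [[0.25 0.75 1.25] [0.125 0.375 0.625]]
```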
import numpy as np +import qonnx.core.data_layout as DataLayout import warnings from onnx import helper as oh - -import finn.core.data_layout as DataLayout -from finn.core.datatype import DataType -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import get_by_name +from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name class AbsorbSignBiasIntoMultiThreshold(Transformation): @@ -627,10 +626,9 @@ def apply(self, model): graph.node.insert(node_ind + 1, new_transpose) # rewire nodes final_t_cands = model.find_consumers(mt_cand.output[0]) - if final_t_cands is not None: - # rewire next nodes' inputs - for final_t_cand in final_t_cands: - final_t_cand.input[0] = trans_output + # rewire next nodes' inputs + for final_t_cand in final_t_cands: + final_t_cand.input[0] = trans_output mt_cand.output[0] = trans_input graph_modified = True if graph_modified: diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index 92c48c84ff..d297110186 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -27,10 +27,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from onnx import helper as oh - -from finn.core.datatype import DataType -from finn.transformation.base import Transformation -from finn.transformation.infer_shapes import InferShapes +from qonnx.core.datatype import DataType +from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_shapes import InferShapes class CollapseRepeatedOp(Transformation): diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 0cdd6651d9..9ff8a2173c 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -27,20 +27,19 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
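The streamline __init__ hunk above now pulls its building blocks from qonnx. A hedged sketch of chaining a few of those transforms by hand, using only imports shown in this patch; the model file names are illustrative:

```python
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine
from qonnx.transformation.general import ConvertDivToMul, ConvertSubToAdd
from qonnx.transformation.infer_datatypes import InferDataTypes

model = ModelWrapper("model_finn.onnx")  # hypothetical input model
transforms = [ConvertSubToAdd(), ConvertDivToMul(), BatchNormToAffine(), InferDataTypes()]
for trn in transforms:
    model = model.transform(trn)
model.save("model_streamlined.onnx")
```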
import numpy as np +import qonnx.core.data_layout as DataLayout import warnings from onnx import TensorProto from onnx import helper as oh - -import finn.core.data_layout as DataLayout -from finn.core.datatype import DataType -from finn.core.onnx_exec import execute_node -from finn.custom_op.registry import getCustomOp -from finn.transformation.base import Transformation -from finn.transformation.general import SortGraph -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import get_by_name +from qonnx.core.datatype import DataType +from qonnx.core.onnx_exec import execute_node +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import SortGraph +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name class MoveAddPastMul(Transformation): @@ -670,8 +669,15 @@ def apply(self, model): if consumer is not None and consumer.op_type == "Transpose": perms = list(get_by_name(consumer.attribute, "perm").ints) if perms == [0, 2, 3, 1]: + ceil_mode = get_by_name(n.attribute, "ceil_mode") + if ceil_mode is not None: + ceil_mode = ceil_mode.i + else: + ceil_mode = ( + 0 # default to ceil_mode=0 (equivalent to np.floor) + ) n.op_type = "MaxPoolNHWC" - n.domain = "finn.custom_op.general" + n.domain = "qonnx.custom_op.general" start_name = n.input[0] mid_name = consumer.input[0] end_name = consumer.output[0] @@ -683,14 +689,22 @@ def apply(self, model): n.output[0] = end_name model.set_tensor_shape(mid_name, (b, hi, wi, c)) model.set_tensor_shape(end_name, (b, ho, wo, c)) + getCustomOp(n).set_nodeattr("ceil_mode", ceil_mode) graph.node.remove(consumer) graph.node.insert(node_ind - 1, consumer) graph_modified = True elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + ceil_mode = get_by_name(n.attribute, "ceil_mode") + if ceil_mode is not None: + ceil_mode = ceil_mode.i + else: + ceil_mode = ( + 0 # default to ceil_mode=0 (equivalent to np.floor) + ) n.op_type = "MaxPoolNHWC" - n.domain = "finn.custom_op.general" + n.domain = "qonnx.custom_op.general" start_name = producer.input[0] mid_name = n.input[0] end_name = n.output[0] @@ -702,6 +716,7 @@ def apply(self, model): n.output[0] = mid_name model.set_tensor_shape(mid_name, (b, ho, wo, c)) model.set_tensor_shape(end_name, (b, c, ho, wo)) + getCustomOp(n).set_nodeattr("ceil_mode", ceil_mode) graph.node.remove(producer) graph.node.insert(node_ind, producer) graph_modified = True @@ -739,6 +754,7 @@ def apply(self, model): # Check case when branches are empty and go # to the same node consumers = model.find_consumers(n.output[0]) + assert len(consumers) > 1, "Must have >1 consumer" unique_consumer = True for consum_node in consumers[1:]: if consumers[0] != consum_node: diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index ba476504a4..601dab04cb 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -27,8 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
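The MakeMaxPoolNHWC changes above read the optional ONNX ceil_mode attribute and default it to 0 (floor) when absent, carrying it over to the rewritten MaxPoolNHWC node. A standalone check of that attribute handling; the helper name is ours, not FINN's:

```python
from onnx import helper


def read_ceil_mode(maxpool_node):
    # same job as the get_by_name(...) lookup above; absent attribute means 0
    for attr in maxpool_node.attribute:
        if attr.name == "ceil_mode":
            return attr.i
    return 0  # ONNX default: floor


n = helper.make_node("MaxPool", ["x"], ["y"], kernel_shape=[2, 2])
print(read_ceil_mode(n))  # -> 0
n = helper.make_node("MaxPool", ["x"], ["y"], kernel_shape=[2, 2], ceil_mode=1)
print(read_ceil_mode(n))  # -> 1
```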
import numpy as np - -from finn.transformation.base import Transformation +from qonnx.transformation.base import Transformation class RoundAndClipThresholds(Transformation): diff --git a/src/finn/transformation/streamline/sign_to_thres.py b/src/finn/transformation/streamline/sign_to_thres.py index 61d7eb3543..eafc071fb6 100644 --- a/src/finn/transformation/streamline/sign_to_thres.py +++ b/src/finn/transformation/streamline/sign_to_thres.py @@ -28,9 +28,8 @@ import numpy as np from onnx import helper as oh - -from finn.core.datatype import DataType -from finn.transformation.base import Transformation +from qonnx.core.datatype import DataType +from qonnx.transformation.base import Transformation class ConvertSignToThres(Transformation): @@ -60,7 +59,7 @@ def apply(self, model): "MultiThreshold", [sign_in_name, thres_param_name], [sign_out_name], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", out_scale=2.0, out_bias=-1.0, out_dtype="BIPOLAR", diff --git a/src/finn/util/__init__.py b/src/finn/util/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py new file mode 100644 index 0000000000..4aba87216c --- /dev/null +++ b/src/finn/util/basic.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
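ConvertSignToThres above maps a Sign node to a single-threshold MultiThreshold with out_scale=2.0 and out_bias=-1.0. A small arithmetic check of why that reproduces a bipolar sign, assuming the usual >= threshold comparison; the helper below is a simplified stand-in for qonnx's MultiThreshold execution:

```python
import numpy as np


def multithreshold_1d(x, thresholds, out_scale, out_bias):
    # count how many thresholds each value crosses, then scale and bias
    q = (x.reshape(-1, 1) >= thresholds).sum(axis=1).astype(np.float32)
    return out_scale * q + out_bias


x = np.array([-0.7, 0.3, 2.0], dtype=np.float32)
print(multithreshold_1d(x, np.array([0.0]), 2.0, -1.0))  # [-1.  1.  1.]
```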
+
+import os
+import subprocess
+import sys
+import tempfile
+
+# mapping from PYNQ board names to FPGA part names
+pynq_part_map = dict()
+pynq_part_map["Ultra96"] = "xczu3eg-sbva484-1-e"
+pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1"
+pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1"
+pynq_part_map["ZCU102"] = "xczu9eg-ffvb1156-2-e"
+pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
+pynq_part_map["ZCU111"] = "xczu28dr-ffvg1517-2-e"
+pynq_part_map["RFSoC2x2"] = "xczu28dr-ffvg1517-2-e"
+
+# native AXI HP port width (in bits) for PYNQ boards
+pynq_native_port_width = dict()
+pynq_native_port_width["Pynq-Z1"] = 64
+pynq_native_port_width["Pynq-Z2"] = 64
+pynq_native_port_width["Ultra96"] = 128
+pynq_native_port_width["ZCU102"] = 128
+pynq_native_port_width["ZCU104"] = 128
+pynq_native_port_width["ZCU111"] = 128
+pynq_native_port_width["RFSoC2x2"] = 128
+
+# Alveo device and platform mappings
+alveo_part_map = dict()
+alveo_part_map["U50"] = "xcu50-fsvh2104-2L-e"
+alveo_part_map["U200"] = "xcu200-fsgd2104-2-e"
+alveo_part_map["U250"] = "xcu250-figd2104-2L-e"
+alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e"
+
+alveo_default_platform = dict()
+alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3"
+alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2"
+alveo_default_platform["U250"] = "xilinx_u250_gen3x16_xdma_2_1_202010_1"
+alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3"
+
+
+def get_rtlsim_trace_depth():
+    """Return the trace depth for rtlsim via PyVerilator. Controllable
+    via the RTLSIM_TRACE_DEPTH environment variable. If the environment
+    variable is undefined, the default value of 1 is returned. A trace depth
+    of 1 will only show top-level signals and yield smaller .vcd files.
+
+    The following depth values are of interest for whole-network stitched IP
+    rtlsim:
+    - level 1 shows top-level input/output streams
+    - level 2 shows per-layer input/output streams
+    - level 3 shows per full-layer I/O including FIFO count signals
+    """
+
+    try:
+        return int(os.environ["RTLSIM_TRACE_DEPTH"])
+    except KeyError:
+        return 1
+
+
+def get_remote_vivado():
+    """Return the address of the remote Vivado synthesis server, as set by the
+    REMOTE_VIVADO environment variable; otherwise return None."""
+
+    try:
+        return os.environ["REMOTE_VIVADO"]
+    except KeyError:
+        return None
+
+
+def get_finn_root():
+    "Return the root directory that FINN is cloned into."
+
+    try:
+        return os.environ["FINN_ROOT"]
+    except KeyError:
+        raise Exception(
+            """Environment variable FINN_ROOT must be set
+        correctly. Please ensure you have launched the Docker container correctly.
+        """
+        )
+
+
+def pyverilate_get_liveness_threshold_cycles():
+    """Return the number of no-output cycles rtlsim will wait before assuming
+    the simulation is not finishing and throwing an exception."""
+
+    return int(os.getenv("LIVENESS_THRESHOLD", 10000))
+
+
+def make_build_dir(prefix=""):
+    """Creates a folder with given prefix to be used as a build dir.
+    Use this function instead of tempfile.mkdtemp to ensure any generated files
+    will survive on the host after the FINN Docker container exits."""
+    try:
+        tmpdir = tempfile.mkdtemp(prefix=prefix)
+        newdir = tmpdir.replace("/tmp", os.environ["FINN_BUILD_DIR"])
+        os.makedirs(newdir)
+        return newdir
+    except KeyError:
+        raise Exception(
+            """Environment variable FINN_BUILD_DIR must be set
+        correctly. Please ensure you have launched the Docker container correctly.
+        """
+        )
+
+
+class CppBuilder:
+    """Builds the g++ compiler command to produce the executable of the C++ code
+    in code_gen_dir which is passed to the function build() of this class."""
+
+    def __init__(self):
+        self.include_paths = []
+        self.cpp_files = []
+        self.executable_path = ""
+        self.code_gen_dir = ""
+        self.compile_components = []
+        self.compile_script = ""
+
+    def append_includes(self, library_path):
+        """Adds given library path to include_paths list."""
+        self.include_paths.append(library_path)
+
+    def append_sources(self, cpp_file):
+        """Adds given C++ file to cpp_files list."""
+        self.cpp_files.append(cpp_file)
+
+    def set_executable_path(self, path):
+        """Sets member variable "executable_path" to given path."""
+        self.executable_path = path
+
+    def build(self, code_gen_dir):
+        """Builds the g++ compiler command according to entries in include_paths
+        and cpp_files lists. Saves it in a bash script in the given folder and
+        executes it."""
+        # raise error if includes are empty
+        self.code_gen_dir = code_gen_dir
+        self.compile_components.append("g++ -o " + str(self.executable_path))
+        for cpp_file in self.cpp_files:
+            self.compile_components.append(cpp_file)
+        for lib in self.include_paths:
+            self.compile_components.append(lib)
+        bash_compile = ""
+        for component in self.compile_components:
+            bash_compile += str(component) + " "
+        self.compile_script = str(self.code_gen_dir) + "/compile.sh"
+        with open(self.compile_script, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write(bash_compile + "\n")
+        bash_command = ["bash", self.compile_script]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+
+
+def launch_process_helper(args, proc_env=None, cwd=None):
+    """Helper function to launch a process in a way that facilitates logging
+    stdout/stderr with Python loggers.
+    Returns (cmd_out, cmd_err)."""
+    if proc_env is None:
+        proc_env = os.environ.copy()
+    with subprocess.Popen(
+        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=proc_env, cwd=cwd
+    ) as proc:
+        (cmd_out, cmd_err) = proc.communicate()
+    if cmd_out is not None:
+        cmd_out = cmd_out.decode("utf-8")
+        sys.stdout.write(cmd_out)
+    if cmd_err is not None:
+        cmd_err = cmd_err.decode("utf-8")
+        sys.stderr.write(cmd_err)
+    return (cmd_out, cmd_err)
+
+
+def which(program):
+    "Python equivalent of the shell cmd 'which'."
+ + # source: + # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None diff --git a/src/finn/util/create.py b/src/finn/util/create.py index 62229a69b6..a8c2e67b38 100644 --- a/src/finn/util/create.py +++ b/src/finn/util/create.py @@ -28,10 +28,9 @@ import numpy as np from onnx import TensorProto, helper - -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor def hls_random_mlp_maker(layer_spec): @@ -117,7 +116,7 @@ def hls_mlp_maker(layer_spec): model.graph.output.append(global_out) # there are two ways to implement bipolar weights and inputs for - # StreamingFC: + # MatrixVectorActivation: # - specify their datatypes as such # - specify their datatypes as BINARY as use binaryXnorMode if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: @@ -144,7 +143,7 @@ def hls_mlp_maker(layer_spec): actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", node_inp_list, [current_out_name], domain="finn.custom_op.fpgadataflow", diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py new file mode 100644 index 0000000000..65478d2540 --- /dev/null +++ b/src/finn/util/data_packing.py @@ -0,0 +1,456 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
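[Editor's note: the packing helpers defined below convert between FINN's float-container tensors and bit-packed hex/byte representations; a quick round trip using two of them, assuming qonnx and finn are installed (values follow the docstring examples below):]

```python
import numpy as np
from qonnx.core.datatype import DataType
from finn.util.data_packing import array2hexstring, hexstring2npbytearray

# four 1-bit values pack into one hex digit, zero-padded here to 8 bits
h = array2hexstring([1, 1, 1, 0], DataType["BINARY"], 8)
assert h == "0x0e"
# and back: each pair of hex digits becomes one uint8 byte
assert (hexstring2npbytearray(h) == np.asarray([14], dtype=np.uint8)).all()
```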
+ +import binascii +import numpy as np +import os +import sys +from bitstring import BitArray +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + + +def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False): + """ + Pack given one-dimensional NumPy array with FINN DataType dtype into a hex + string. + Any BIPOLAR values will be converted to a single bit with a 0 representing + -1. + pad_to_nbits is used to prepend leading zeros to ensure packed strings of + fixed width. The minimum value for pad_to_nbits is 4, since a single hex + digit is four bits. reverse can be used to reverse the array prior to + packing. + + Examples: + + array2hexstring([1, 1, 1, 0], DataType["BINARY"], 4) = "0xe" + + array2hexstring([1, 1, 1, 0], DataType["BINARY"], 8) = "0x0e" + + array2hexstring([1, 1, 0, 1], DataType["BINARY"], 4, reverse=True) = "0xb" + + array2hexstring([1, 1, 1, 0], DataType["BINARY"], 8, reverse=True) = "0x07" + """ + if pad_to_nbits < 4: + pad_to_nbits = 4 + # ensure input is a numpy array with float values + if type(array) != np.ndarray or array.dtype != np.float32: + # try to convert to a float numpy array (container dtype is float) + array = np.asarray(array, dtype=np.float32) + # ensure one-dimensional array to pack + assert array.ndim == 1, "The given array is not one-dimensional." + if dtype == DataType["BIPOLAR"]: + # convert bipolar values to binary + array = (array + 1) / 2 + dtype = DataType["BINARY"] + # reverse prior to packing, if desired + if reverse: + array = np.flip(array, -1) + lineval = BitArray(length=0) + bw = dtype.bitwidth() + # special handling for fixed point: rescale, then pack as integers + if dtype.is_fixed_point(): + sf = dtype.scale_factor() + array = array / sf + # replace dtype with signed integer equivalent + dtype = DataType["INT" + str(bw)] + for val in array: + # ensure that this value is permitted by chosen dtype + assert dtype.allowed(val), "This value is not permitted by chosen dtype." + if dtype.is_integer(): + if dtype.signed(): + lineval.append(BitArray(int=int(val), length=bw)) + else: + lineval.append(BitArray(uint=int(val), length=bw)) + else: + lineval.append(BitArray(float=val, length=bw)) + if pad_to_nbits >= lineval.len: + # extend to the desired output width (a minimum of 4 bits) + lineval.prepend(BitArray(length=pad_to_nbits - lineval.len)) + else: + raise Exception("Number of bits is greater than pad_to_nbits") + # represent as hex + return prefix + lineval.hex + + +def hexstring2npbytearray(hexstring, remove_prefix="0x"): + """Convert a hex string into a NumPy array of dtype uint8. + + Example: + + hexstring2npbytearray("0f01") = array([15, 1], dtype=uint8) + """ + # remove prefix if found + if hexstring.startswith(remove_prefix): + lrp = len(remove_prefix) + hexstring = hexstring[lrp:] + # use Python's built-in bytearray + return np.asarray(bytearray.fromhex(hexstring), dtype=np.uint8) + + +def npbytearray2hexstring(npbytearray, prefix="0x"): + """Convert a NumPy array of uint8 dtype into a hex string. + + Example: + + npbytearray2hexstring(array([15, 1], dtype=uint8)) = "0x0f01" + """ + return prefix + binascii.hexlify(bytearray(npbytearray)).decode("utf-8") + + +def pack_innermost_dim_as_hex_string( + ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x" +): + """Pack the innermost dimension of the given numpy ndarray into hex + strings using array2hexstring. 
+ + Examples: + + A = [[1, 1, 1, 0], [0, 1, 1, 0]] + + eA = ["0e", "06"] + + pack_innermost_dim_as_hex_string(A, DataType["BINARY"], 8) == eA + + B = [[[3, 3], [3, 3]], [[1, 3], [3, 1]]] + + eB = [[ "0f", "0f"], ["07", "0d"]] + + pack_innermost_dim_as_hex_string(B, DataType["UINT2"], 8) == eB + """ + + if type(ndarray) != np.ndarray or ndarray.dtype != np.float32: + # try to convert to a float numpy array (container dtype is float) + ndarray = np.asarray(ndarray, dtype=np.float32) + + def fun(x): + return array2hexstring( + x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix + ) + + return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray) + + +def unpack_innermost_dim_from_hex_string( + ndarray, dtype, out_shape, packedBits, reverse_inner=False +): + """Convert a NumPy array of hex strings into a FINN NumPy array by unpacking + the hex strings into the specified data type. out_shape can be specified + such that any padding in the packing dimension is removed. If reverse_inner + is set, the innermost unpacked dimension will be reversed.""" + + if type(ndarray) != np.ndarray: + raise Exception( + """unpack_innermost_dim_from_hex_string needs ndarray + as input""" + ) + if ndarray.dtype.kind not in {"U", "S"}: + raise Exception( + """unpack_innermost_dim_from_hex_string needs ndarray of + hex strings as input""" + ) + # convert ndarray into flattened list + data = ndarray.flatten().tolist() + targetBits = dtype.bitwidth() + # calculate outer and inner dim shapes + outer_dim_elems = 1 + for dim in range(len(out_shape) - 1): + outer_dim_elems = outer_dim_elems * out_shape[dim] + inner_dim_elems = out_shape[-1] + + array = [] + if dtype.is_fixed_point(): + # convert fixed point as signed integer + conv_dtype = DataType["INT" + str(targetBits)] + else: + conv_dtype = dtype + for outer_elem in range(outer_dim_elems): + ar_list = [] + ar_elem = data[0] + data.pop(0) + ar_elem = ar_elem.split("x") + ar_elem_bin = bin(int(ar_elem[1], 16))[2:].zfill(packedBits) + ar_elem_bin = [int(x) for x in ar_elem_bin] + + ar_elem_bin.reverse() + for i in range(inner_dim_elems): + upper_limit = (i + 1) * targetBits + lower_limit = i * targetBits + elem = ar_elem_bin[lower_limit:upper_limit] + elem.reverse() + elem_str = "".join(map(str, elem)) + if conv_dtype == DataType["FLOAT32"]: + ar_list.append(BitArray(bin=elem_str).float) + elif conv_dtype.is_integer(): + ar_list.append(int(elem_str, 2)) + else: + raise Exception("Not implemented for conv_dtype " + conv_dtype.name) + # reverse inner dimension back to "normal" positions + if reverse_inner is False: + ar_list.reverse() + + # interpret output values correctly + + # interpret values as bipolar + if conv_dtype == DataType["BIPOLAR"]: + ar_list = [2 * x - 1 for x in ar_list] + # interpret values as signed values + elif conv_dtype.name.startswith("INT"): + mask = 2 ** (conv_dtype.bitwidth() - 1) + ar_list = [-(x & mask) + (x & ~mask) for x in ar_list] + + array.append(ar_list) + array = np.asarray(array, dtype=np.float32).reshape(out_shape) + if dtype.is_fixed_point(): + # convert signed integer to fixed point by applying scale + array = array * dtype.scale_factor() + return array + + +def numpy_to_hls_code( + ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False +): + """Return C++ code representation of a numpy ndarray with FINN DataType + dtype, using hls_var_name as the resulting C++ variable name. If + pack_innermost_dim is specified, the innermost dimension of the ndarray + will be packed into a hex string using array2hexstring. 
If no_decl is + set to True, no variable name and type will be generated as part of the + emitted string. + """ + hls_dtype = dtype.get_hls_datatype_str() + if type(ndarray) != np.ndarray or ndarray.dtype != np.float32: + # try to convert to a float numpy array (container dtype is float) + ndarray = np.asarray(ndarray, dtype=np.float32) + if pack_innermost_dim: + idimlen = ndarray.shape[-1] + idimbits = idimlen * dtype.bitwidth() + idimbits = roundup_to_integer_multiple(idimbits, 4) + ndarray = pack_innermost_dim_as_hex_string(ndarray, dtype, idimbits) + hls_dtype = "ap_uint<%d>" % idimbits + ndims = ndarray.ndim + # add type string and variable name + # e.g. "const ap_uint<64>" "weightMem0" + ret = "%s %s" % (hls_dtype, hls_var_name) + # add dimensions + for d in range(ndims): + ret += "[%d]" % ndarray.shape[d] + orig_printops = np.get_printoptions() + np.set_printoptions(threshold=sys.maxsize) + + # define a function to convert a single element into a C++ init string + # a single element can be a hex string if we are using packing + def elem2str(x): + if type(x) == str or type(x) == np.str_ or type(x) == np.str: + return '%s("%s", 16)' % (hls_dtype, x) + elif type(x) == np.float32: + if dtype.is_integer(): + return str(int(x)) + else: + return str(x) + else: + raise Exception("Unsupported type for numpy_to_hls_code") + + strarr = np.array2string(ndarray, separator=", ", formatter={"all": elem2str}) + np.set_printoptions(**orig_printops) + strarr = strarr.replace("[", "{").replace("]", "}") + if no_decl: + ret = strarr + ";" + else: + ret = ret + " = \n" + strarr + ";" + return ret + + +def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=True): + """Convert the multidimensional NumPy array of integers (stored as floats) + from input_file into a flattened sequence of Python arbitrary-precision + integers, packing the innermost dimension. See + finn.util.basic.pack_innermost_dim_as_hex_string() for more info on how the + packing works. If reverse_inner is set, the innermost dimension will be + reversed prior to packing.""" + pad_to_nbits = roundup_to_integer_multiple(pad_to_nbits, 4) + if issubclass(type(input_file), np.ndarray): + inp = input_file + elif os.path.isfile(input_file): + inp = np.load(input_file) + else: + raise Exception("input_file must be ndarray or filename for .npy") + if inp.shape[-1] == 1 and input_dtype.is_integer(): + packed_data = inp.flatten().astype(input_dtype.to_numpy_dt()) + packed_data = [int(x) for x in packed_data] + else: + packed_data = pack_innermost_dim_as_hex_string( + inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner + ) + packed_data = packed_data.flatten() + packed_data = [int(x[2:], 16) for x in packed_data] + return packed_data + + +def rtlsim_output_to_npy( + output, path, dtype, shape, packedBits, targetBits, reverse_inner=True +): + """Convert a flattened sequence of Python arbitrary-precision integers + output into a NumPy array, saved as npy file at path. Each arbitrary-precision + integer is assumed to be a packed array of targetBits-bit elements, which + will be unpacked as the innermost dimension of the NumPy array. If path is + not None it will also be saved as a npy file.""" + + # TODO should have its own testbench? 
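[Editor's note: to make the emitted C++ concrete, here is a hedged illustration of numpy_to_hls_code on a tiny UINT2 tensor; the expected output was worked out by hand from the packing rules above, so treat the exact whitespace as approximate:]

```python
import numpy as np
from qonnx.core.datatype import DataType
from finn.util.data_packing import numpy_to_hls_code

weights = np.asarray([[1, 2], [3, 0]], dtype=np.float32)
print(numpy_to_hls_code(weights, DataType["UINT2"], "weightMem0"))
# each 2-element row packs into a 4-bit hex string, printed roughly as:
#   ap_uint<4> weightMem0[2] =
#   {ap_uint<4>("0x6", 16), ap_uint<4>("0xc", 16)};
```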
+ output = np.asarray([hex(int(x)) for x in output]) + out_array = unpack_innermost_dim_from_hex_string( + output, dtype, shape, packedBits=packedBits, reverse_inner=reverse_inner + ) + # make copy before saving the array + out_array = out_array.copy() + if path is not None: + np.save(path, out_array) + return out_array + + +def finnpy_to_packed_bytearray( + ndarray, dtype, reverse_inner=False, reverse_endian=False, fast_mode=False +): + """Given a numpy ndarray with FINN DataType dtype, pack the innermost + dimension and return the packed representation as an ndarray of uint8. + The packed innermost dimension will be padded to the nearest multiple + of 8 bits. The returned ndarray has the same number of dimensions as the + input. + + If fast_mode is enabled, will attempt to use shortcuts to save + on runtime for certain cases: + * 8-bit ndarray -> 8-bit + * ndarray -> 1-bit and total bits % 8 == 0 + This mode is currently not well-tested, use at your own risk! + """ + + # handle fast_mode cases (currently only called from driver): + if issubclass(type(ndarray), np.ndarray) and fast_mode: + inp_is_byte = ndarray.dtype in [np.uint8, np.int8] + out_is_byte = dtype.bitwidth() == 8 + double_reverse = reverse_inner and reverse_endian + # fast mode case: byte -> byte: cast + if inp_is_byte and out_is_byte and double_reverse: + return ndarray.view(np.uint8) + # fast mode case: xxx -> bit with nbits % 8 == 0: np.packbits + out_is_bit = dtype.bitwidth() == 1 + bits = dtype.bitwidth() * ndarray.shape[-1] + bits_padded = roundup_to_integer_multiple(bits, 8) + no_pad = bits_padded == bits + if out_is_bit and no_pad and double_reverse: + in_as_int8 = ndarray.astype(np.int8) + # bipolar -> binary if needed + if dtype == DataType["BIPOLAR"]: + in_as_int8 = (in_as_int8 + 1) // 2 + # reverse inner + in_as_int8 = np.flip(in_as_int8, axis=-1) + # pack with numpy + packed_data = np.packbits(in_as_int8, axis=-1) + # reverse endianness and return + return np.flip(packed_data, axis=-1) + + if (not issubclass(type(ndarray), np.ndarray)) or ndarray.dtype != np.float32: + # try to convert to a float numpy array (container dtype is float) + ndarray = np.asarray(ndarray, dtype=np.float32) + # pack innermost dim to hex strings padded to 8 bits + bits = dtype.bitwidth() * ndarray.shape[-1] + bits_padded = roundup_to_integer_multiple(bits, 8) + packed_hexstring = pack_innermost_dim_as_hex_string( + ndarray, dtype, bits_padded, reverse_inner=reverse_inner + ) + + def fn(x): + return np.asarray(list(map(hexstring2npbytearray, x))) + + if packed_hexstring.ndim == 0: + # scalar, call hexstring2npbytearray directly + ret = hexstring2npbytearray(np.asscalar(packed_hexstring)) + else: + # convert ndarray of hex strings to byte array + ret = np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring) + if reverse_endian: + # reverse the endianness of packing dimension + ret = np.flip(ret, axis=-1) + return ret + + +def packed_bytearray_to_finnpy( + packed_bytearray, + dtype, + output_shape=None, + reverse_inner=False, + reverse_endian=False, + fast_mode=False, +): + """Given a packed numpy uint8 ndarray, unpack it into a FINN array of + given DataType. + + output_shape can be specified to remove padding from the + packed dimension, or set to None to be inferred from the input. + + If fast_mode is enabled, will attempt to use shortcuts (casting) to save + on runtime for certain cases. + This mode is currently not well-tested, use at your own risk. 
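[Editor's note: a concrete view of the finnpy_to_packed_bytearray contract defined above, worked by hand from its packing rules (a sketch, not from the source):]

```python
import numpy as np
from qonnx.core.datatype import DataType
from finn.util.data_packing import finnpy_to_packed_bytearray

# eight 1-bit values pack into exactly one byte per innermost row
a = np.asarray([[1, 0, 1, 0, 1, 0, 1, 0]], dtype=np.float32)
packed = finnpy_to_packed_bytearray(a, DataType["BINARY"])
assert packed.dtype == np.uint8 and (packed == [0b10101010]).all()
```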
+
+    """
+
+    if (
+        not issubclass(type(packed_bytearray), np.ndarray)
+    ) or packed_bytearray.dtype != np.uint8:
+        raise Exception("packed_bytearray_to_finnpy needs NumPy uint8 arrays")
+    if packed_bytearray.ndim == 0:
+        raise Exception("packed_bytearray_to_finnpy expects at least 1D ndarray")
+    packed_dim = packed_bytearray.ndim - 1
+    packed_bits = packed_bytearray.shape[packed_dim] * 8
+    target_bits = dtype.bitwidth()
+    if output_shape is None:
+        # determine output shape from input shape
+        assert (
+            packed_bits % target_bits == 0
+        ), """packed_bits is not divisible by
+        target_bits."""
+        n_target_elems = packed_bits // target_bits
+        output_shape = packed_bytearray.shape[:-1] + (n_target_elems,)
+    # handle no-packing cases (if fast_mode) via casting to save on compute
+    out_is_byte = target_bits in [8, 16]
+    double_reverse = reverse_inner and reverse_endian
+    if out_is_byte and double_reverse and fast_mode:
+        no_unpad = np.prod(packed_bytearray.shape) == np.prod(output_shape)
+        if no_unpad:
+            as_np_type = packed_bytearray.view(dtype.to_numpy_dt())
+            return as_np_type.reshape(output_shape).astype(np.float32)
+    if reverse_endian:
+        packed_bytearray = np.flip(packed_bytearray, axis=-1)
+    # convert innermost dim of byte array to hex strings
+    packed_hexstring = np.apply_along_axis(
+        npbytearray2hexstring, packed_dim, packed_bytearray
+    )
+    ret = unpack_innermost_dim_from_hex_string(
+        packed_hexstring, dtype, output_shape, packed_bits, reverse_inner
+    )
+
+    return ret
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
new file mode 100644
index 0000000000..769ddb9465
--- /dev/null
+++ b/src/finn/util/fpgadataflow.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.util.basic import get_by_name, is_finn_op
+
+
+def is_fpgadataflow_node(node):
+    """Returns True if given node is fpgadataflow node.
Otherwise False.""" + is_node = False + if node is not None: + if is_finn_op(node.domain): + n_backend = get_by_name(node.attribute, "backend") + if n_backend is not None: + backend_value = n_backend.s.decode("UTF-8") + if backend_value == "fpgadataflow": + is_node = True + + return is_node diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py index 5a904ed7c8..d525437300 100644 --- a/src/finn/util/gdrive.py +++ b/src/finn/util/gdrive.py @@ -31,9 +31,11 @@ import warnings from datetime import datetime +from finn.util.basic import get_finn_root + def upload_to_end2end_dashboard(data_dict): - gdrive_key = "/workspace/finn/gdrive-key/service_account.json" + gdrive_key = get_finn_root() + "/gdrive-key/service_account.json" if not os.path.isfile(gdrive_key): warnings.warn("Google Drive key not found, skipping dashboard upload") return diff --git a/src/finn/util/hls.py b/src/finn/util/hls.py new file mode 100644 index 0000000000..52ed121a43 --- /dev/null +++ b/src/finn/util/hls.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import os +import subprocess + +from finn.util.basic import which + + +class CallHLS: + """Call vitis_hls to run HLS build tcl scripts.""" + + def __init__(self): + self.tcl_script = "" + self.ipgen_path = "" + self.code_gen_dir = "" + self.ipgen_script = "" + + def append_tcl(self, tcl_script): + """Sets the tcl script to be executed.""" + self.tcl_script = tcl_script + + def set_ipgen_path(self, path): + """Sets member variable ipgen_path to given path.""" + self.ipgen_path = path + + def build(self, code_gen_dir): + """Builds the bash script with given parameters and saves it in given folder. 
+ To guarantee the generation in the correct folder the bash script contains a + cd command.""" + assert which("vitis_hls") is not None, "vitis_hls not found in PATH" + self.code_gen_dir = code_gen_dir + self.ipgen_script = str(self.code_gen_dir) + "/ipgen.sh" + working_dir = os.environ["PWD"] + f = open(self.ipgen_script, "w") + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(code_gen_dir)) + f.write("vitis_hls %s\n" % (self.tcl_script)) + f.write("cd {}\n".format(working_dir)) + f.close() + bash_command = ["bash", self.ipgen_script] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() diff --git a/src/finn/util/imagenet.py b/src/finn/util/imagenet.py index abd412e8d9..b4548bb352 100644 --- a/src/finn/util/imagenet.py +++ b/src/finn/util/imagenet.py @@ -29,8 +29,8 @@ import numpy as np import os from PIL import Image +from qonnx.core.data_layout import NCHW, NHWC -from finn.core.data_layout import NCHW, NHWC from finn.util.test import crop_center, resize_smaller_side diff --git a/src/finn/util/platforms.py b/src/finn/util/platforms.py new file mode 100644 index 0000000000..8212cb5712 --- /dev/null +++ b/src/finn/util/platforms.py @@ -0,0 +1,480 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
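[Editor's note: stepping back to CallHLS above, its intended call pattern mirrors CppBuilder; a hedged sketch (the tcl script and project paths are placeholders, not from the source):]

```python
from finn.util.hls import CallHLS

ip_gen = CallHLS()
ip_gen.append_tcl("/path/to/code_gen/hls_syn.tcl")       # placeholder tcl script
ip_gen.set_ipgen_path("/path/to/code_gen/project.prj")   # read back by callers
# writes ipgen.sh (which cd's into the folder first) and runs vitis_hls
ip_gen.build("/path/to/code_gen")
```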
+ +import numpy as np +from abc import abstractmethod + +# contains the amount of available FPGA resources for several +# Xilinx platforms, as well as certain resource limit guidelines +# for creating designs that can achieve timing closure + +# explicit value for res types/costs we don't care about +DONT_CARE = -1 +# recommended resource limits from Xilinx for timing closure +# respectively for LUT, FF, BRAM_18K, URAM, DSP res types +DEFAULT_RES_LIMITS = np.array([0.7, 0.5, 0.80, 0.80, 0.80]) +DEFAULT_AVG_CONSTRAINTS = [((2, 3, 4), 0.7)] # + +# resources required to instantiate certain infrastructure components +# such as memory controllers and network interfaces +DDR_RESOURCE_REQUIREMENTS = { + "LUT": 33256, + "FF": 44889, + "BRAM_18K": 199, + "URAM": 0, + "DSP": 3, +} +HBM_RESOURCE_REQUIREMENTS = { + "LUT": 10718, + "FF": 21793, + "BRAM_18K": 8, + "URAM": 0, + "DSP": 0, +} + +# we assume use of VNx Alveo UDP stack +# see: https://gitenterprise.xilinx.com/mruiznog/vitis_network_layer +ETH_RESOURCE_REQUIREMENTS = { + "LUT": 35219, + "FF": 86269, + "BRAM_18K": 183, + "URAM": 0, + "DSP": 0, +} + + +class Platform: + def __init__( + self, + nslr=1, + ndevices=1, + sll_count=[], + hbm_slr=-1, + ddr_slr=[0], + eth_slr=0, + eth_gbps=0, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + self.nslr = nslr + self.sll_count = sll_count + self.eth_slr = eth_slr + self.eth_gbps = eth_gbps + self.ndevices = ndevices + self.hbm_slr = hbm_slr + self.ddr_slr = ddr_slr + # limits must be a np.array either of + # the same shape as compute_resources + # or broadcastable to it + self.res_limits = limits + # list of tuples of the form ( tuple of resource positions to avg, limit ) + self.avg_constraints = avg_constraints + + @property + @abstractmethod + def compute_resources(self): + pass + + @property + def guide_resources(self): + guide = [] + # TODO: assert limits is of correct size + guide_res = ( + np.tile(np.array(self.compute_resources), (self.ndevices, 1)) + ).astype(int) + for i in range(self.nslr * self.ndevices): + # when in multi-FPGA mode, subtract cost of UDP connection from eth_slr + local_slr = i % self.nslr + if self.ndevices > 1 and local_slr == self.eth_slr: + guide_res[i][0] -= ETH_RESOURCE_REQUIREMENTS["LUT"] + guide_res[i][1] -= ETH_RESOURCE_REQUIREMENTS["FF"] + guide_res[i][2] -= ETH_RESOURCE_REQUIREMENTS["BRAM_18K"] + guide_res[i][3] -= ETH_RESOURCE_REQUIREMENTS["URAM"] + guide_res[i][4] -= ETH_RESOURCE_REQUIREMENTS["DSP"] + # subtract the cost of memory controllers + # if we have a choice between DDR and HBM, use HBM + if local_slr == self.hbm_slr: + guide_res[i][0] -= HBM_RESOURCE_REQUIREMENTS["LUT"] + guide_res[i][1] -= HBM_RESOURCE_REQUIREMENTS["FF"] + guide_res[i][2] -= HBM_RESOURCE_REQUIREMENTS["BRAM_18K"] + guide_res[i][3] -= HBM_RESOURCE_REQUIREMENTS["URAM"] + guide_res[i][4] -= HBM_RESOURCE_REQUIREMENTS["DSP"] + elif local_slr in self.ddr_slr: + guide_res[i][0] -= DDR_RESOURCE_REQUIREMENTS["LUT"] + guide_res[i][1] -= DDR_RESOURCE_REQUIREMENTS["FF"] + guide_res[i][2] -= DDR_RESOURCE_REQUIREMENTS["BRAM_18K"] + guide_res[i][3] -= DDR_RESOURCE_REQUIREMENTS["URAM"] + guide_res[i][4] -= DDR_RESOURCE_REQUIREMENTS["DSP"] + guide.append(list(guide_res[i])) + return guide + + @property + def resource_count_dict(self): + res = dict() + for i in range(self.nslr * self.ndevices): + slr_res = dict() + slr_res["LUT"] = self.compute_resources[i % self.nslr][0] + slr_res["FF"] = self.compute_resources[i % self.nslr][1] + slr_res["BRAM_18K"] = self.compute_resources[i 
% self.nslr][2] + slr_res["URAM"] = self.compute_resources[i % self.nslr][3] + slr_res["DSP"] = self.compute_resources[i % self.nslr][4] + res["slr" + str(i)] = slr_res + return res + + @property + def compute_connection_cost(self): + x = np.full((self.nslr * self.ndevices, self.nslr * self.ndevices), DONT_CARE) + # build connection cost matrix for one device's SLRs + xlocal = np.full((self.nslr, self.nslr), DONT_CARE) + for i in range(self.nslr): + for j in range(self.nslr): + if i == j: + xlocal[i][j] = 0 + elif abs(i - j) == 1: + xlocal[i][j] = 1 + # tile connection cost matrices for entire system + for i in range(self.ndevices): + x[ + i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr + ] = xlocal + # set cost for ethernet connections, assuming daisy-chaining + for i in range(self.ndevices - 1): + x[i * self.nslr + self.eth_slr][(i + 1) * self.nslr + self.eth_slr] = 10 + x[(i + 1) * self.nslr + self.eth_slr][i * self.nslr + self.eth_slr] = 10 + return x + + @property + def compute_connection_resource(self): + sll = np.full((self.nslr * self.ndevices, self.nslr * self.ndevices), 0) + # build connection resource matrix for one device's SLRs + slllocal = np.full((self.nslr, self.nslr), -1) + for i in range(self.nslr): + for j in range(self.nslr): + if i == j: + # no SLL constraint when going from one SLR to itself + slllocal[i][j] = -1 + else: + slllocal[i][j] = self.sll_count[i][j] + # tile connection cost matrices for entire system + for i in range(self.ndevices): + sll[ + i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr + ] = slllocal + # set cost for ethernet connections, assuming daisy-chaining + eth = np.full((self.nslr * self.ndevices, self.nslr * self.ndevices), 0) + # no Eth throughput constraints from one SLR to itself + for i in range(self.ndevices * self.nslr): + eth[i][i] = -1 + # apply symmetric ETH throughput constraints between the SLRs that have GTXes + for i in range(self.ndevices - 1): + eth[i * self.nslr + self.eth_slr][ + (i + 1) * self.nslr + self.eth_slr + ] = self.eth_gbps * (10**9) + eth[(i + 1) * self.nslr + self.eth_slr][ + i * self.nslr + self.eth_slr + ] = self.eth_gbps * (10**9) + # pack sll and eth info in one list-of-list-of-tuple structure + constraints = [] + for i in range(self.ndevices * self.nslr): + constraints_line = [] + for j in range(self.ndevices * self.nslr): + # make sure not to constrain both resources at the same time + # constrain for Eth throughput between SLRs on different devices + # constrain for SLLs between SLRs on same device + is_offchip = i // self.nslr != j // self.nslr + constraints_line.append( + (-1 if is_offchip else sll[i][j], eth[i][j] if is_offchip else -1) + ) + constraints.append(constraints_line) + return constraints + + def map_device_to_slr(self, idx): + """Given a global SLR index, return device id and local slr index""" + assert idx <= self.nslr * self.ndevices + return (idx % self.nslr, idx // self.nslr) + + +class Zynq7020_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + super(Zynq7020_Platform, self).__init__( + nslr=1, + ndevices=ndevices, + sll_count=[[0]], + ddr_slr=[], + eth_slr=0, + eth_gbps=1, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + return [[53200, 2 * 53200, 280, 0, 220] for i in range(1)] + + +class ZU3EG_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + 
avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + super(ZU3EG_Platform, self).__init__( + nslr=1, + ndevices=ndevices, + sll_count=[[0]], + ddr_slr=[], + eth_slr=0, + eth_gbps=1, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + return [[71000, 2 * 71000, 412, 0, 360] for i in range(1)] + + +class ZU7EV_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + super(ZU7EV_Platform, self).__init__( + nslr=1, + ndevices=ndevices, + sll_count=[[0]], + ddr_slr=[], + eth_slr=0, + eth_gbps=1, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + return [[230000, 2 * 230000, 610, 92, 1728] for i in range(1)] + + +class ZU9EG_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + super(ZU9EG_Platform, self).__init__( + nslr=1, + ndevices=ndevices, + sll_count=[[0]], + ddr_slr=[], + eth_slr=0, + eth_gbps=1, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + return [[274000, 2 * 274000, 1824, 0, 2520] for i in range(1)] + + +class ZU28DR_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + super(ZU28DR_Platform, self).__init__( + nslr=1, + ndevices=ndevices, + sll_count=[[0]], + ddr_slr=[], + eth_slr=0, + eth_gbps=1, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + return [[425000, 2 * 425000, 2160, 80, 4272] for i in range(1)] + + +class Alveo_NxU50_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + # according to Vivado: 23040 SLR0 <-> SLR1 + sll_counts = [[0, 5000], [5000, 0]] + super(Alveo_NxU50_Platform, self).__init__( + nslr=2, + ndevices=ndevices, + sll_count=sll_counts, + ddr_slr=[], + hbm_slr=0, + eth_slr=1, + eth_gbps=100, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + # According to UG1120: + # U50 has identical resource counts on both SLRs + # return [[365000,2*365000,2*564, 304, 2580] for i in range(2)] + # we observe from Vivado that the resource counts are actually: + return [ + [374400, 2 * 374400, 2 * 564, 304, 2592], + [368160, 2 * 368160, 2 * 564, 304, 2760], + ] + + +class Alveo_NxU200_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + sll_counts = [[0, 5000, 0], [5000, 0, 5000], [0, 5000, 0]] + super(Alveo_NxU200_Platform, self).__init__( + nslr=3, + ndevices=ndevices, + sll_count=sll_counts, + ddr_slr=[0, 2], + eth_slr=2, + eth_gbps=100, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + # According to UG1120: + # return [[355000, 723000, 2*638, 320, 2265], + # [160000, 331000, 2*326, 160, 1317], + # [355000, 723000, 2*638, 320, 2265]] + # we observe from Vivado that the resource counts are actually: + return [ + [385920, 2 * 385920, 2 * 714, 320, 2268], + [199680, 2 * 199680, 2 * 420, 160, 1320], + [385920, 2 * 385920, 2 * 714, 320, 2268], + ] + + +class Alveo_NxU250_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + sll_counts = [ + [0, 5000, 0, 0], + [5000, 0, 5000, 0], + 
[0, 5000, 0, 5000], + [0, 0, 5000, 0], + ] + super(Alveo_NxU250_Platform, self).__init__( + nslr=4, + ndevices=ndevices, + sll_count=sll_counts, + ddr_slr=[0, 1, 2, 3], + eth_slr=3, + eth_gbps=100, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + # According to UG1120: + # U250 has identical resource counts on all 4 SLRs: + # return [[345000,2*345000,2*500, 320, 2877] for i in range(4)] + # we observe from Vivado that the resource counts are actually: + return [[375000, 2 * 375000, 2 * 576, 320, 2880] for i in range(4)] + + +class Alveo_NxU280_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + sll_counts = [[0, 5000, 0], [5000, 0, 5000], [0, 5000, 0]] + super(Alveo_NxU280_Platform, self).__init__( + nslr=3, + ndevices=ndevices, + sll_count=sll_counts, + ddr_slr=[0, 1], + hbm_slr=0, + eth_slr=2, + eth_gbps=100, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + # according to UG1120 + # return [[369000, 746000, 2*507, 320, 2733], + # [333000, 675000, 2*468, 320, 2877], + # [367000, 729000, 2*512, 320, 2880]] + # observed from Vivado: + return [ + [400800, 2 * 400800, 2 * 600, 320, 2736], + [382080, 2 * 382080, 2 * 576, 320, 2880], + [380640, 2 * 380640, 2 * 576, 320, 2880], + ] + + +platforms = dict() +platforms["U50"] = Alveo_NxU50_Platform +platforms["U200"] = Alveo_NxU200_Platform +platforms["U250"] = Alveo_NxU250_Platform +platforms["U280"] = Alveo_NxU280_Platform +platforms["Pynq-Z1"] = Zynq7020_Platform +platforms["Pynq-Z2"] = Zynq7020_Platform +platforms["Ultra96"] = ZU3EG_Platform +platforms["ZCU104"] = ZU7EV_Platform +platforms["ZCU102"] = ZU9EG_Platform +platforms["ZCU111"] = ZU28DR_Platform diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py new file mode 100644 index 0000000000..f6a51da8e4 --- /dev/null +++ b/src/finn/util/pyverilator.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
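[Editor's note: the platforms registry that closes the file above is keyed by the same board names as pynq_part_map / alveo_part_map; a hedged usage sketch of the resource-query API it exposes:]

```python
from finn.util.platforms import platforms

# instantiate a platform description and inspect its per-SLR resources
u250 = platforms["U250"]()
print(u250.resource_count_dict["slr0"])
# -> {'LUT': 375000, 'FF': 750000, 'BRAM_18K': 1152, 'URAM': 320, 'DSP': 2880}
print(len(u250.guide_resources))  # one entry per SLR, infra costs subtracted
```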
+ +import os +from pyverilator import PyVerilator + +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + + +def pyverilate_stitched_ip( + model, + read_internal_signals=True, + disable_common_warnings=True, + extra_verilator_args=[], +): + """Given a model with stitched IP, return a PyVerilator sim object. + Trace depth is also controllable, see get_rtlsim_trace_depth() + + :param read_internal_signals If set, it will be possible to examine the + internal (not only port) signals of the Verilog module, but this may + slow down compilation and emulation. + + :param disable_common_warnings If set, disable the set of warnings that + Vivado-HLS-generated Verilog typically triggers in Verilator + (which can be very verbose otherwise) + + """ + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") + with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: + all_verilog_srcs = f.read().split() + + def file_to_dir(x): + return os.path.dirname(os.path.realpath(x)) + + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + build_dir = make_build_dir("pyverilator_ipstitched_") + + # dump all Verilog code to a single file + # this is because large models with many files require + # a verilator command line too long for bash on most systems + # NOTE: there are duplicates in this list, and some files + # are identical but in multiple directories (regslice_core.v) + + # remove duplicates from list by doing list -> set -> list + all_verilog_files = list( + set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs)) + ) + + # remove all but one instances of regslice_core.v + filtered_verilog_files = [] + remove_entry = False + for vfile in all_verilog_files: + if "regslice_core" in vfile: + if not remove_entry: + filtered_verilog_files.append(vfile) + remove_entry = True + else: + filtered_verilog_files.append(vfile) + + # concatenate all verilog code into a single file + with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf: + for vfile in filtered_verilog_files: + with open(vfile) as rf: + wf.write("//Added from " + vfile + "\n\n") + wf.write(rf.read()) + + verilator_args = [] + # disable common verilator warnings that should be harmless but commonly occur + # in large quantities for Vivado HLS-generated verilog code + if disable_common_warnings: + verilator_args += ["-Wno-STMTDLY"] + verilator_args += ["-Wno-PINMISSING"] + verilator_args += ["-Wno-IMPLICIT"] + verilator_args += ["-Wno-WIDTH"] + verilator_args += ["-Wno-COMBDLY"] + # force inlining of all submodules to ensure we can read internal signals properly + if read_internal_signals: + verilator_args += ["--inline-mult", "0"] + + sim = PyVerilator.build( + top_module_file_name, + verilog_path=[vivado_stitch_proj_dir], + build_dir=build_dir, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=top_module_name, + auto_eval=False, + read_internal_signals=read_internal_signals, + extra_args=verilator_args + extra_verilator_args, + ) + return sim diff --git a/src/finn/util/test.py b/src/finn/util/test.py index 9c5462ae7f..f5d3b1c30b 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -38,10 +38,10 @@ import warnings from brevitas_examples import bnn_pynq, imagenet_classification from pkgutil import 
get_data +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map @@ -144,7 +144,7 @@ def get_example_input(topology): "Get example numpy input tensor for given topology." if "fc" in topology: - raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") + raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") onnx_tensor = onnx.load_tensor_from_string(raw_i) return nph.to_array(onnx_tensor) elif topology == "cnv": diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index 6a5a68f099..aaeb3ab920 100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -27,10 +27,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import multiprocessing as mp +from qonnx.util.basic import get_num_default_workers from vcdvcd import VCDVCD -from finn.util.basic import get_num_default_workers - # string patterns to search for to find particular interfaces # streaming interfaces vname = "TVALID" diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py new file mode 100644 index 0000000000..bc8ca40d88 --- /dev/null +++ b/src/finn/util/vivado.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +from finn.util.basic import launch_process_helper, which + + +def out_of_context_synth( + verilog_dir, + top_name, + fpga_part="xczu3eg-sbva484-1-e", + clk_name="ap_clk_0", + clk_period_ns=5.0, +): + "Run out-of-context Vivado synthesis, return resources and slack." 
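[Editor's note: for context on the return contract of out_of_context_synth, a hedged example; the directory and top-level module name are placeholders, and the result keys other than fmax_mhz depend on the res.txt emitted by the vivadocompile script:]

```python
from finn.util.vivado import out_of_context_synth

# assumes vivado is on PATH and OHMYXILINX points at the oh-my-xilinx scripts
res = out_of_context_synth(
    "/path/to/verilog", "finn_design_wrapper",
    fpga_part="xc7z020clg400-1", clk_period_ns=10.0,
)
print(res["fmax_mhz"])  # derived from WNS as 1000.0 / (clk_period_ns - WNS)
```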
+ + # ensure that the OH_MY_XILINX envvar is set + if "OHMYXILINX" not in os.environ: + raise Exception("The environment variable OHMYXILINX is not defined.") + # ensure that vivado is in PATH: source $VIVADO_PATH/settings64.sh + if which("vivado") is None: + raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.") + omx_path = os.environ["OHMYXILINX"] + script = "vivadocompile.sh" + # vivadocompile.sh + call_omx = "zsh %s/%s %s %s %s %f" % ( + omx_path, + script, + top_name, + clk_name, + fpga_part, + float(clk_period_ns), + ) + call_omx = call_omx.split() + launch_process_helper(call_omx, proc_env=os.environ.copy(), cwd=verilog_dir) + + vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name) + res_counts_path = vivado_proj_folder + "/res.txt" + + with open(res_counts_path, "r") as myfile: + res_data = myfile.read().split("\n") + ret = {} + ret["vivado_proj_folder"] = vivado_proj_folder + for res_line in res_data: + res_fields = res_line.split("=") + print(res_fields) + try: + ret[res_fields[0]] = float(res_fields[1]) + except ValueError: + ret[res_fields[0]] = 0 + except IndexError: + ret[res_fields[0]] = 0 + if ret["WNS"] == 0: + ret["fmax_mhz"] = 0 + else: + ret["fmax_mhz"] = 1000.0 / (clk_period_ns - ret["WNS"]) + return ret diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py index 1b38914a83..669601ecb6 100644 --- a/tests/brevitas/test_brevitas_avg_pool_export.py +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -34,19 +34,20 @@ from brevitas.export.onnx.generic.manager import BrevitasONNXManager from brevitas.nn import QuantAvgPool2d from brevitas.quant_tensor import QuantTensor +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN -from finn.util.basic import gen_finn_dt_tensor base_export_onnx_path = "test_brevitas_avg_pool_export.onnx" +@pytest.mark.brevitas_export @pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("kernel_size", [2, 3]) @pytest.mark.parametrize("stride", [1, 2]) diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py index 78ca361366..62aab2e3c2 100644 --- a/tests/brevitas/test_brevitas_cnv.py +++ b/tests/brevitas/test_brevitas_cnv.py @@ -35,19 +35,20 @@ import os import torch from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs +from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs -from finn.transformation.infer_shapes import 
InferShapes from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.test import get_test_model_trained export_onnx_path = "test_brevitas_cnv.onnx" +@pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [1, 2]) @pytest.mark.parametrize("wbits", [1, 2]) @pytest.mark.parametrize("QONNX_export", [False, True]) diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index e42b93babe..181d610fff 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -36,17 +36,18 @@ import torch from brevitas.export.onnx.generic.manager import BrevitasONNXManager from pkgutil import get_data +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import RemoveStaticGraphInputs +from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import RemoveStaticGraphInputs -from finn.transformation.infer_shapes import InferShapes from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.test import get_test_model_trained +@pytest.mark.brevitas_export @pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("QONNX_FINN_conversion", [False, True]) def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): @@ -62,7 +63,7 @@ def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): model = ModelWrapper(finn_onnx) dbg_nodes = model.get_nodes_by_op_type("DebugMarker") for dbg_node in dbg_nodes: - dbg_node.domain = "finn.custom_op.general" + dbg_node.domain = "qonnx.custom_op.general" model.save(finn_onnx) qonnx_cleanup(finn_onnx, out_file=finn_onnx) if QONNX_FINN_conversion: @@ -78,7 +79,7 @@ def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): # domain conversion for us? 
     dbg_nodes = model.get_nodes_by_op_type("DebugMarker")
     for dbg_node in dbg_nodes:
-        dbg_node.domain = "finn.custom_op.general"
+        dbg_node.domain = "qonnx.custom_op.general"
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(RemoveStaticGraphInputs())
@@ -87,7 +88,7 @@ def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion):
     assert len(model.graph.input) == 1
     assert len(model.graph.output) == 1
     # load one of the test vectors
-    raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
+    raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
     input_dict = {model.graph.input[0].name: nph.to_array(input_tensor)}
diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py
index 8e1e3de8d0..211fdb629b 100644
--- a/tests/brevitas/test_brevitas_fc.py
+++ b/tests/brevitas/test_brevitas_fc.py
@@ -35,13 +35,13 @@
 import torch
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from pkgutil import get_data
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import RemoveStaticGraphInputs
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import RemoveStaticGraphInputs
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
@@ -49,6 +49,7 @@
 export_onnx_path = make_build_dir("test_brevitas_fc_")


+@pytest.mark.brevitas_export
 # act bits
 @pytest.mark.parametrize("abits", [1, 2])
 # weight bits
 @pytest.mark.parametrize("wbits", [1, 2])
@@ -81,7 +82,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits, QONNX_export):
     assert len(model.graph.input) == 1
     assert len(model.graph.output) == 1
     # load one of the test vectors
-    raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
+    raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
     input_dict = {model.graph.input[0].name: nph.to_array(input_tensor)}
diff --git a/tests/brevitas/test_brevitas_mobilenet.py b/tests/brevitas/test_brevitas_mobilenet.py
index 108c97c2e8..b1475b6f4e 100644
--- a/tests/brevitas/test_brevitas_mobilenet.py
+++ b/tests/brevitas/test_brevitas_mobilenet.py
@@ -32,31 +32,32 @@
 import numpy as np
 import torch
 from PIL import Image
-
-import finn.core.onnx_exec as oxe
-import finn.transformation.streamline.absorb as absorb
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import (
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
 )
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.merge_onnx_models import MergeONNXModels
-from finn.util.basic import make_build_dir
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.transformation.merge_onnx_models import MergeONNXModels
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
+from finn.util.basic import get_finn_root, make_build_dir
 from finn.util.pytorch import NormalizePreProc
 from finn.util.test import crop_center, get_test_model_trained, resize_smaller_side


+@pytest.mark.brevitas_export
 @pytest.mark.xfail
 def test_brevitas_mobilenet():
     # get single image as input and prepare image
-    img = Image.open("/workspace/finn/tests/brevitas/king_charles.jpg")
+    img = Image.open(get_finn_root() + "/tests/brevitas/king_charles.jpg")
     # resize smallest side of the image to 256 pixels and resize larger side
     # with same ratio
     img = resize_smaller_side(256, img)
diff --git a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
index b530b4bd84..5d70acb102 100644
--- a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
+++ b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
@@ -38,16 +38,17 @@
 from brevitas.core.scaling import ScalingImplType
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantHardTanh
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

 export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx"


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
 @pytest.mark.parametrize("narrow_range", [False, True])
 @pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7)])
diff --git a/tests/brevitas/test_brevitas_qconv2d.py b/tests/brevitas/test_brevitas_qconv2d.py
index beaea4e51e..214c55e5fd 100644
--- a/tests/brevitas/test_brevitas_qconv2d.py
+++ b/tests/brevitas/test_brevitas_qconv2d.py
@@ -38,18 +38,19 @@
 from brevitas.core.stats import StatsOp
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantConv2d
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
-from finn.util.basic import gen_finn_dt_tensor

 export_onnx_path = "test_brevitas_conv.onnx"


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("dw", [False, True])
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("in_channels", [32])
diff --git a/tests/brevitas/test_brevitas_qlinear.py b/tests/brevitas/test_brevitas_qlinear.py
index 1099d3ec83..bcd75a5455 100644
--- a/tests/brevitas/test_brevitas_qlinear.py
+++ b/tests/brevitas/test_brevitas_qlinear.py
@@ -35,18 +35,19 @@
 from brevitas.core.quant import QuantType
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantLinear
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
-from finn.util.basic import gen_finn_dt_tensor

 export_onnx_path = "test_brevitas_qlinear.onnx"


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("bias", [False, True])
 @pytest.mark.parametrize("out_features", [4])
 @pytest.mark.parametrize("in_features", [3])
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index 57ead3b6c0..b0c3d6088c 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -38,16 +38,17 @@
 from brevitas.core.scaling import ScalingImplType
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantReLU
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

 export_onnx_path = "test_brevitas_relu_act_export.onnx"


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("abits", [2, 4, 8])
 @pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
 @pytest.mark.parametrize(
@@ -111,6 +112,7 @@ def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_expor
     os.remove(export_onnx_path)


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("abits", [2, 4, 8])
 @pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
 @pytest.mark.parametrize("scaling_per_channel", [True, False])
diff --git a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
index c6da2e2e97..403d406105 100644
--- a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
+++ b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
@@ -38,16 +38,17 @@
 from brevitas.core.scaling import ScalingImplType
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantHardTanh
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

 export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx"


+@pytest.mark.brevitas_export
 @pytest.mark.parametrize("abits", [2, 4, 8])
 @pytest.mark.parametrize("narrow_range", [False, True])
 @pytest.mark.parametrize("min_val", [-1.0, -(1 - 2 ** (-7)), -2])
diff --git a/tests/brevitas/test_brevitas_validate_mobilenet.py b/tests/brevitas/test_brevitas_validate_mobilenet.py
index 12e7e7aff2..55915838e8 100644
--- a/tests/brevitas/test_brevitas_validate_mobilenet.py
+++ b/tests/brevitas/test_brevitas_validate_mobilenet.py
@@ -35,23 +35,23 @@
 import torch
 import torchvision.datasets as datasets
 import torchvision.transforms as transforms
-
-import finn.core.onnx_exec as oxe
-import finn.transformation.streamline.absorb as absorb
-import finn.util.imagenet as imagenet_util
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import (
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
     RemoveStaticGraphInputs,
 )
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.merge_onnx_models import MergeONNXModels
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.transformation.merge_onnx_models import MergeONNXModels
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
+import finn.util.imagenet as imagenet_util
 from finn.util.basic import make_build_dir
 from finn.util.pytorch import NormalizePreProc
 from finn.util.test import get_test_model_trained
@@ -62,6 +62,7 @@
 ch = 3


+@pytest.mark.brevitas_export
 def test_brevitas_mobilenet_preproc():
     if "IMAGENET_VAL_PATH" not in os.environ.keys():
         pytest.skip("Can't do validation without IMAGENET_VAL_PATH")
@@ -98,6 +99,7 @@ def test_brevitas_mobilenet_preproc():
     assert (finn_img == pyt_img).all()


+@pytest.mark.brevitas_export
 @pytest.mark.slow
 # marked as XFAIL until Brevitas export issues are resolved:
 # https://github.com/Xilinx/brevitas/issues/173
diff --git a/tests/end2end/test_end2end_access_board.py b/tests/end2end/test_end2end_access_board.py
index ee15980ffb..ba3c49195b 100644
--- a/tests/end2end/test_end2end_access_board.py
+++ b/tests/end2end/test_end2end_access_board.py
@@ -34,6 +34,7 @@


 @pytest.mark.board
+@pytest.mark.end2end
 def test_end2end_access_board():
     build_env = get_build_env("zynq", 5)
     if build_env["ip"] == "":
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 1d7d5e3e9a..103f18b514 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -42,19 +42,31 @@
 from collections import OrderedDict
 from dataset_loading import cifar, mnist
 from datetime import datetime
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveStaticGraphInputs,
+    RemoveUnusedTensors,
+)
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.transformation.merge_onnx_models import MergeONNXModels
 from qonnx.util.cleanup import cleanup as qonnx_cleanup
 from scipy.stats import linregress

 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
 from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
-from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -71,18 +83,6 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
-from finn.transformation.general import (
-    GiveReadableTensorNames,
-    GiveUniqueNodeNames,
-    RemoveStaticGraphInputs,
-    RemoveUnusedTensors,
-)
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.merge_onnx_models import MergeONNXModels
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.transformation.streamline import Streamline
@@ -90,6 +90,7 @@
     MakeMaxPoolNHWC,
     MoveScalarLinearPastInvariants,
 )
+from finn.util.basic import get_finn_root
 from finn.util.gdrive import upload_to_end2end_dashboard
 from finn.util.pytorch import ToTensor
 from finn.util.test import (
@@ -136,7 +137,7 @@ def update_dashboard_data(topology, wbits, abits, key, val):


 def fold_tfc(model):
-    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # (PE, SIMD, ramstyle) for each layer
     config = [(16, 49, "block"), (8, 8, "auto"), (8, 8, "auto"), (10, 8, "distributed")]
     for fcl, (pe, simd, ramstyle) in zip(fc_layers, config):
@@ -154,7 +155,7 @@


 def fold_lfc(model):
-    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # (PE, SIMD, ramstyle) for each layer
     config = [
         (32, 49, "block"),
@@ -176,7 +177,7 @@


 def fold_cnv_large(model):
-    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # each tuple is (PE, SIMD) for a layer
     folding = [
         (16, 3),
@@ -203,11 +204,11 @@


 def fold_cnv_small(model):
-    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # each tuple is (PE, SIMD) for a layer
     folding = [
-        (8, 3, "auto"),
-        (16, 16, "auto"),
+        (8, 3, "distributed"),
+        (16, 16, "distributed"),
         (8, 16, "auto"),
         (8, 16, "block"),
         (4, 8, "auto"),
@@ -259,11 +260,11 @@ def get_golden_io_pair(topology, wbits, abits, preproc=ToTensor(), return_topk=N


 def measure_top1_accuracy(model_chkpt, dataset, parent_chkpt=None):
     if dataset == "cifar10":
         trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
-            "/workspace/finn/dataset", download=True, one_hot=False
+            get_finn_root() + "/dataset", download=True, one_hot=False
         )
     elif dataset == "mnist":
         trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
-            "/workspace/finn/dataset", download=True, one_hot=False
+            get_finn_root() + "/dataset", download=True, one_hot=False
         )
     else:
         raise Exception("Unrecognized dataset")
@@ -313,6 +314,7 @@ def topology2dataset(topology):
 @pytest.mark.parametrize("abits", [1, 2])
 @pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"])
 @pytest.mark.parametrize("QONNX_export", [False, True])
+@pytest.mark.end2end
 class TestEnd2End:
     def test_export(self, topology, wbits, abits, QONNX_export):
         if wbits > abits:
@@ -334,7 +336,7 @@ def test_export(self, topology, wbits, abits, QONNX_export):
         dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         update_dashboard_data(topology, wbits, abits, "datetime", dtstr)
         finn_commit = subprocess.check_output(
-            ["git", "rev-parse", "HEAD"], cwd="/workspace/finn"
+            ["git", "rev-parse", "HEAD"], cwd=get_finn_root()
         )
         finn_commit = finn_commit.decode("utf-8").strip()
         update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit)
@@ -425,9 +427,9 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export):
         # use standalone thresholds for tfc-w1a1 to also exercise that option
         model = model.transform(to_hls.InferThresholdingLayer())
         # needed for bipolar MatMul layers
-        model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
+        model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
         # needed for non-bipolar MatMul layers
-        model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+        model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
         # TopK to LabelSelect
         model = model.transform(to_hls.InferLabelSelectLayer())
         # input quantization (if any) to standalone thresholding
@@ -450,26 +452,26 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export):
         "tfc": [
             ("Reshape", 1),
             ("Thresholding_Batch", 1),
-            ("StreamingFCLayer_Batch", 4),
+            ("MatrixVectorActivation", 4),
             ("LabelSelect_Batch", 1),
         ],
         "tfc-1-1": [
             ("Reshape", 1),
             ("Thresholding_Batch", 4),
-            ("StreamingFCLayer_Batch", 4),
+            ("MatrixVectorActivation", 4),
             ("LabelSelect_Batch", 1),
         ],
         "lfc": [
             ("Reshape", 1),
             ("Thresholding_Batch", 1),
-            ("StreamingFCLayer_Batch", 4),
+            ("MatrixVectorActivation", 4),
             ("LabelSelect_Batch", 1),
         ],
         "cnv": [
             ("Transpose", 1),
             ("Thresholding_Batch", 1),
             ("ConvolutionInputGenerator", 6),
-            ("StreamingFCLayer_Batch", 9),
+            ("MatrixVectorActivation", 9),
             ("StreamingMaxPool_Batch", 2),
             ("LabelSelect_Batch", 1),
         ],
@@ -786,7 +788,7 @@ def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind):

         ret_str += "\n" + "Raw data:"
         ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
-            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]"
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]"
         )
         for k in bsize_range:
             v = ret[k]
@@ -795,8 +797,8 @@
                 np.round(v["runtime[ms]"], 4),
                 v["fclk[mhz]"],
                 np.round(v["throughput[images/s]"], 2),
-                np.round(v["DRAM_in_bandwidth[Mb/s]"], 2),
-                np.round(v["DRAM_out_bandwidth[Mb/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[MB/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[MB/s]"], 2),
             )
         ret_str += "\n" + "-----------------------------"
         warnings.warn(ret_str)
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index e24d87ca6a..b6482dc96c 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -43,12 +43,12 @@
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantIdentity, QuantLinear, QuantReLU
 from brevitas.quant_tensor import QuantTensor
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.util.cleanup import cleanup as qonnx_cleanup

 import finn.builder.build_dataflow as build
 import finn.builder.build_dataflow_config as build_cfg
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import make_build_dir
 from finn.util.test import get_build_env, load_test_checkpoint_or_skip
@@ -86,6 +86,7 @@ def forward(self, x):


 @pytest.mark.parametrize("QONNX_export", [False, True])
+@pytest.mark.end2end
 def test_end2end_cybsec_mlp_export(QONNX_export):
     assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
     # load up trained net in Brevitas
@@ -180,6 +181,7 @@ def test_end2end_cybsec_mlp_export(QONNX_export):

 @pytest.mark.slow
 @pytest.mark.vivado
+@pytest.mark.end2end
 @pytest.mark.parametrize("QONNX_export", [False, True])
 def test_end2end_cybsec_mlp_build(QONNX_export):
     model_file = get_checkpoint_name("export", QONNX_export)
@@ -217,8 +219,8 @@ def test_end2end_cybsec_mlp_build(QONNX_export):
     # examine the report contents
     with open(est_cycles_report, "r") as f:
         est_cycles_dict = json.load(f)
-        assert est_cycles_dict["StreamingFCLayer_Batch_0"] == 80
-        assert est_cycles_dict["StreamingFCLayer_Batch_1"] == 64
+        assert est_cycles_dict["MatrixVectorActivation_0"] == 80
+        assert est_cycles_dict["MatrixVectorActivation_1"] == 64
     with open(est_res_report, "r") as f:
         est_res_dict = json.load(f)
         assert est_res_dict["total"]["LUT"] == 11360.0
@@ -226,6 +228,7 @@ def test_end2end_cybsec_mlp_build(QONNX_export):
     shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build", QONNX_export))


+@pytest.mark.end2end
 @pytest.mark.parametrize("QONNX_export", [False, True])
 def test_end2end_cybsec_mlp_run_on_hw(QONNX_export):
     build_env = get_build_env(build_kind, target_clk_ns)
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index e459bfbc3e..2f4df956ac 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -33,40 +33,40 @@
 import time
 import torch
 from PIL import Image
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
+from qonnx.transformation.double_to_single_float import DoubleToSingleFloat
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+    RemoveUnusedTensors,
+)
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.transformation.merge_onnx_models import MergeONNXModels
+from qonnx.transformation.remove import RemoveIdentityOps

 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 import finn.transformation.streamline.reorder as reorder
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
-from finn.transformation.double_to_single_float import DoubleToSingleFloat
-from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import (
-    GiveReadableTensorNames,
-    GiveUniqueNodeNames,
-    GiveUniqueParameterTensors,
-    RemoveUnusedTensors,
-)
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.merge_onnx_models import MergeONNXModels
-from finn.transformation.remove import RemoveIdentityOps
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
-from finn.util.basic import alveo_default_platform, alveo_part_map
+from finn.util.basic import alveo_default_platform, alveo_part_map, get_finn_root
 from finn.util.pytorch import NormalizePreProc
 from finn.util.test import (
     crop_center,
@@ -87,6 +87,7 @@
 first_layer_res_type = "dsp"


+@pytest.mark.end2end
 def test_end2end_mobilenet_export():
     # export preprocessing
     preproc_onnx = build_dir + "/end2end_mobilenet_preproc.onnx"
@@ -114,7 +115,7 @@ def test_end2end_mobilenet_export():
     # calculate golden output with pytorch/brevitas and save as .npy
     # get single image as input and prepare image
-    img = Image.open("/workspace/finn/tests/brevitas/king_charles.jpg")
+    img = Image.open(get_finn_root() + "/tests/brevitas/king_charles.jpg")
     # resize smallest side of the image to 256 pixels and resize larger side
     # with same ratio
     img = resize_smaller_side(256, img)
@@ -142,6 +143,7 @@ def test_end2end_mobilenet_export():
     assert os.path.isfile(build_dir + "/end2end_mobilenet_preproc.onnx")


+@pytest.mark.end2end
 def test_end2end_mobilenet_tidy_and_merge_with_preproc():
     preproc_model = load_test_checkpoint_or_skip(
         build_dir + "/end2end_mobilenet_preproc.onnx"
@@ -164,6 +166,7 @@ def test_end2end_mobilenet_tidy_and_merge_with_preproc():
     model.save(build_dir + "/end2end_mobilenet_tidy.onnx")


+@pytest.mark.end2end
 def test_end2end_mobilenet_streamline():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_tidy.onnx")
     model = model.transform(Streamline())
@@ -194,6 +197,7 @@ def test_end2end_mobilenet_streamline():
     assert len(model.get_nodes_by_op_type("Mul")) == 0  # no Mul ops remain


+@pytest.mark.end2end
 def test_end2end_mobilenet_lowering():
     model = load_test_checkpoint_or_skip(
         build_dir + "/end2end_mobilenet_streamlined.onnx"
@@ -208,12 +212,13 @@ def test_end2end_mobilenet_lowering():
     model.save(build_dir + "/end2end_mobilenet_lowered.onnx")


+@pytest.mark.end2end
 def test_end2end_mobilenet_convert_to_hls_layers():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx")
     model = model.transform(to_hls.InferPool_Batch())
     model = model.transform(to_hls.InferConvInpGen())
-    model = model.transform(to_hls.InferVVAU())
-    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferVectorVectorActivation())
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
     model = model.transform(to_hls.InferChannelwiseLinearLayer())
     model = model.transform(to_hls.InferLabelSelectLayer())
     model = model.transform(InferShapes())
@@ -222,6 +227,7 @@ def test_end2end_mobilenet_convert_to_hls_layers():
     model.save(build_dir + "/end2end_mobilenet_hls_layers.onnx")


+@pytest.mark.end2end
 def test_end2end_mobilenet_folding():
     model = load_test_checkpoint_or_skip(
         build_dir + "/end2end_mobilenet_hls_layers.onnx"
@@ -231,7 +237,7 @@ def test_end2end_mobilenet_folding():
     assert extra_fold in [1, 2, 4]
     # set up folding for the depthwise conv layers impl'd by VVAUs
     # each value is PE for a layer
-    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # each tuple is (PE, SIMD, ram_style) for a layer
     folding = [
         (32, 3, "block"),
@@ -260,7 +266,7 @@ def test_end2end_mobilenet_folding():
     getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type)
     # set up folding for the depthwise conv layers impl'd by VVAUs
     # each value is PE for a layer
-    vvau_layers = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")
+    vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation")
     folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8]
     for vvau, pe in zip(vvau_layers, folding):
         vvau_inst = getCustomOp(vvau)
@@ -285,6 +291,7 @@ def test_end2end_mobilenet_folding():
     model.save(build_dir + "/end2end_mobilenet_folded.onnx")


+@pytest.mark.end2end
 def test_end2end_mobilenet_create_dataflow_partition():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
     parent_model = model.transform(CreateDataflowPartition())
@@ -299,6 +306,7 @@

 @pytest.mark.slow
 @pytest.mark.vivado
+@pytest.mark.end2end
 @pytest.mark.xfail
 def test_end2end_mobilenet_cppsim():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
index 550dab4d03..9483ccf0b2 100644
--- a/tests/end2end/test_ext_weights.py
+++ b/tests/end2end/test_ext_weights.py
@@ -68,6 +68,7 @@ def get_checkpoint_name(step):
     return build_dir + "/end2end_ext_weights_%s.onnx" % (step)


+@pytest.mark.end2end
 def test_end2end_ext_weights_download():
     if not os.path.isfile(onnx_zip_local):
         wget.download(onnx_zip_url, out=onnx_zip_local)
@@ -78,6 +79,7 @@

 @pytest.mark.slow
 @pytest.mark.vivado
+@pytest.mark.end2end
 def test_end2end_ext_weights_build():
     model_file = get_checkpoint_name("download")
     load_test_checkpoint_or_skip(model_file)
@@ -110,6 +112,7 @@ def test_end2end_ext_weights_build():


 @pytest.mark.board
+@pytest.mark.end2end
 def test_end2end_ext_weights_dataset():
     # make sure we have local copies of mnist dataset files
     subprocess.check_output(["mkdir", "-p", mnist_local])
@@ -125,6 +128,7 @@ def test_end2end_ext_weights_dataset():
     subprocess.check_output(rsync_dataset_cmd)


+@pytest.mark.end2end
 def test_end2end_ext_weights_run_on_hw():
     build_env = get_build_env(build_kind, target_clk_ns)
     deploy_dir = get_checkpoint_name("build")
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 5ddff3d36f..49ee32c71e 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -30,13 +30,14 @@
 import os
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name

-import finn.util.basic as util
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim


+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_code_gen_trafo():
     idt = wdt = odt = DataType["BIPOLAR"]
@@ -49,7 +50,7 @@ def test_code_gen_trafo():
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh])
     node_inp_list = ["inp", "weights", "thresh"]
     FCLayer_node = helper.make_node(
-        "StreamingFCLayer_Batch",
+        "MatrixVectorActivation",
         node_inp_list,
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -75,12 +76,12 @@
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", odt)
     model.set_tensor_datatype("weights", wdt)
-    W = util.gen_finn_dt_tensor(wdt, (mw, mh))
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
     model.set_initializer("weights", W)

     model = model.transform(PrepareCppSim())
     for node in model.graph.node:
-        code_gen_attribute = util.get_by_name(node.attribute, "code_gen_dir_cppsim")
+        code_gen_attribute = get_by_name(node.attribute, "code_gen_dir_cppsim")
         tmp_dir = code_gen_attribute.s.decode("UTF-8")
         assert os.path.isdir(
             tmp_dir
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 81e2ff9a7c..9bafb101ce 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -30,14 +30,15 @@
 import os
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name

-import finn.util.basic as util
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim


+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_compilation_trafo():
     idt = wdt = odt = DataType["BIPOLAR"]
@@ -50,7 +51,7 @@ def test_compilation_trafo():
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh])
     node_inp_list = ["inp", "weights", "thresh"]
     FCLayer_node = helper.make_node(
-        "StreamingFCLayer_Batch",
+        "MatrixVectorActivation",
         node_inp_list,
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -76,13 +77,13 @@
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", odt)
     model.set_tensor_datatype("weights", wdt)
-    W = util.gen_finn_dt_tensor(wdt, (mw, mh))
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
     model.set_initializer("weights", W)

     model = model.transform(PrepareCppSim())
     model = model.transform(CompileCppSim())
     for node in model.graph.node:
-        compilation_attribute = util.get_by_name(node.attribute, "executable_path")
+        compilation_attribute = get_by_name(node.attribute, "executable_path")
         executable = compilation_attribute.s.decode("UTF-8")
         print(executable)
         assert os.path.isfile(
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index 5cc5f8fa6c..5bbaefac2d 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -30,25 +30,25 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.general.im2col import compute_conv_output_dim
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.util.basic import gen_finn_dt_tensor


 # conv_config:
@@ -67,6 +67,7 @@
 )
 @pytest.mark.parametrize("depthwise", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
@@ -140,10 +141,10 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     new_model = model.transform(LowerConvsToMatMul())
     new_model = new_model.transform(to_hls.InferConvInpGen())
     if depthwise is True:
-        new_model = new_model.transform(to_hls.InferVVAU())
+        new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
-        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
-        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+        new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation())
+        fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0]
         fc_inst = getCustomOp(fc_node)
         mw = fc_inst.get_nodeattr("MW")
         mh = fc_inst.get_nodeattr("MH")
@@ -179,7 +180,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
         assert padding_inst.get_nodeattr("SIMD") == in_chn

     if depthwise is True and exec_mode == "rtlsim":
-        node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
+        node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
index bf690d1d68..0f19b6d79a 100644
--- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -30,21 +30,21 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_shapes import InferShapes
-from finn.util.basic import gen_finn_dt_tensor


 def prepare_inputs(input_tensor):
@@ -89,6 +89,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
 @pytest.mark.parametrize("scalar_param", [True, False])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_convert_to_hls_channelwise_layer(
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
index 9b0f3d68ae..0760ff9b37 100755
--- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -29,27 +29,27 @@
 import pytest

 import numpy as np
+import qonnx.core.data_layout as DataLayout
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor

-import finn.core.data_layout as DataLayout
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
-from finn.util.basic import gen_finn_dt_tensor


 def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
@@ -75,6 +75,7 @@
 )
 @pytest.mark.parametrize("depthwise", [False, True])
 @pytest.mark.parametrize("use_reshape", [False, True])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
@@ -162,7 +163,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
                 "MultiThreshold",
                 ["conv_out", "thres1_param"],
                 ["thres1_out"],
-                domain="finn.custom_op.general",
+                domain="qonnx.custom_op.general",
                 out_dtype="UINT4",
             ),
             flatten_node,
@@ -173,7 +174,7 @@
                 "MultiThreshold",
                 ["matmul_out", "thres2_param"],
                 ["global_out"],
-                domain="finn.custom_op.general",
+                domain="qonnx.custom_op.general",
                 out_dtype="UINT4",
             ),
         ],
@@ -201,7 +202,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
     model.set_initializer(
         "matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape)
     )
-    model.set_initializer("reshape_shape", np.array([1, -1]))
+    model.set_initializer("reshape_shape", np.array([1, -1], dtype=np.int64))

     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
@@ -218,8 +219,8 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
     # convert_to_hls
     if depthwise is True:
-        new_model = new_model.transform(to_hls.InferVVAU())
-    new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+        new_model = new_model.transform(to_hls.InferVectorVectorActivation())
+    new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation())
     new_model = new_model.transform(to_hls.InferThresholdingLayer())
     new_model = new_model.transform(to_hls.InferConvInpGen())
     new_model = new_model.transform(to_hls.InferStreamingMaxPool())
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index d96bc98756..55dc77cafb 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -30,25 +30,25 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.general.im2col import compute_conv_output_dim
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.util.basic import gen_finn_dt_tensor


 # conv_config kernel_size,stride, pad
@@ -58,6 +58,7 @@
 )
 @pytest.mark.parametrize("depthwise", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
@@ -123,10 +124,10 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     new_model = model.transform(LowerConvsToMatMul())
     new_model = new_model.transform(to_hls.InferConvInpGen())
     if depthwise is True:
-        new_model = new_model.transform(to_hls.InferVVAU())
+        new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
-        new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
-        fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+        new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation())
+        fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0]
         fc_inst = getCustomOp(fc_node)
         mw = fc_inst.get_nodeattr("MW")
         mh = fc_inst.get_nodeattr("MH")
@@ -172,7 +173,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
         assert padding_inst.get_nodeattr("SIMD") == in_chn

     if depthwise is True and exec_mode == "rtlsim":
-        node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
+        node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 3357ee6d6c..9997f28438 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -33,21 +33,21 @@
 import brevitas.onnx as bo
 import numpy as np
 import os
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
-from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.util.test import get_test_model_trained
@@ -55,6 +55,7 @@
 export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx"


+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 # Standalone or fused thresholding-based activation
 @pytest.mark.parametrize("fused_activation", [True, False])
@@ -89,10 +90,10 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
     # subsequently, the FC inference will generate passthrough MVAUs
     if not fused_activation:
         model = model.transform(to_hls.InferThresholdingLayer())
-    model = model.transform(to_hls.InferBinaryStreamingFCLayer())
-    model = model.transform(to_hls.InferQuantizedStreamingFCLayer())
+    model = model.transform(to_hls.InferBinaryMatrixVectorActivation())
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation())
     for node in model.graph.node:
-        if node.op_type == "StreamingFCLayer_Batch":
+        if node.op_type == "MatrixVectorActivation":
             inst = getCustomOp(node)
             inst.set_nodeattr("mem_mode", "decoupled")
             mw = inst.get_nodeattr("MW")
@@ -121,7 +122,7 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
     assert len(non_finn_nodes) == 5
     exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"]
     assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes
-    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation")
     assert len(fc_nodes) == 9
     swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
     assert len(swg_nodes) == 6
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index a1dc11e0ee..fd4e3679d7 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -35,19 +35,19 @@
 import os
 import torch
 from pkgutil import get_data
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
-from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
-from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.test import get_test_model_trained
@@ -55,6 +55,7 @@
 export_onnx_path = "test_convert_to_hls_layers_fc.onnx"


+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_convert_to_hls_layers_tfc_w1a1():
     tfc = get_test_model_trained("TFC", 1, 1)
@@ -69,24 +70,24 @@ def test_convert_to_hls_layers_tfc_w1a1():
     model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
     model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
     model = model.transform(RoundAndClipThresholds())
-    model = model.transform(to_hls.InferBinaryStreamingFCLayer())
+    model = model.transform(to_hls.InferBinaryMatrixVectorActivation())
     fc0 = model.graph.node[2]
-    assert fc0.op_type == "StreamingFCLayer_Batch"
+    assert fc0.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc0.input[0]) == [1, 784]
     assert model.get_tensor_shape(fc0.input[1]) == [784, 64]
     assert model.get_tensor_shape(fc0.input[2]) == [64, 1]
     fc1 = model.graph.node[3]
-    assert fc1.op_type == "StreamingFCLayer_Batch"
+    assert fc1.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc1.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc1.input[1]) == [64, 64]
     assert model.get_tensor_shape(fc1.input[2]) == [64, 1]
     fc2 = model.graph.node[4]
-    assert fc2.op_type == "StreamingFCLayer_Batch"
+    assert fc2.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc2.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc2.input[1]) == [64, 64]
     assert model.get_tensor_shape(fc2.input[2]) == [64, 1]
     fc3 = model.graph.node[5]
-    assert fc3.op_type == "StreamingFCLayer_Batch"
+    assert fc3.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc3.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc3.input[1]) == [64, 10]
@@ -110,7 +111,7 @@ def test_convert_to_hls_layers_tfc_w1a1():
     model = model.transform(CompileCppSim())
     model = model.transform(SetExecMode("cppsim"))

-    raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
+    raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
     input_dict = {"global_in": nph.to_array(input_tensor)}
@@ -125,6 +126,7 @@ def test_convert_to_hls_layers_tfc_w1a1():
     os.remove(export_onnx_path)


+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_convert_to_hls_layers_tfc_w1a2():
     tfc = get_test_model_trained("TFC", 1, 2)
@@ -136,28 +138,28 @@ def test_convert_to_hls_layers_tfc_w1a2():
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(Streamline())
     from finn.transformation.fpgadataflow.convert_to_hls_layers import (
-        InferQuantizedStreamingFCLayer,
+        InferQuantizedMatrixVectorActivation,
     )

-    model = model.transform(InferQuantizedStreamingFCLayer())
+    model = model.transform(InferQuantizedMatrixVectorActivation())
     fc0 = model.graph.node[2]
-    assert fc0.op_type == "StreamingFCLayer_Batch"
+    assert fc0.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc0.input[0]) == [1, 784]
     assert model.get_tensor_shape(fc0.input[1]) == [784, 64]
     assert model.get_tensor_shape(fc0.input[2]) == [64, 2]
     fc1 = model.graph.node[3]
-    assert fc1.op_type == "StreamingFCLayer_Batch"
+    assert fc1.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc1.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc1.input[1]) == [64, 64]
     assert model.get_tensor_shape(fc1.input[2]) == [64, 2]
     fc2 = model.graph.node[4]
-    assert fc2.op_type == "StreamingFCLayer_Batch"
+    assert fc2.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc2.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc2.input[1]) == [64, 64]
     assert model.get_tensor_shape(fc2.input[2]) == [64, 2]
     fc3 = model.graph.node[5]
-    assert fc3.op_type == "StreamingFCLayer_Batch"
+    assert fc3.op_type == "MatrixVectorActivation"
     assert model.get_tensor_shape(fc3.input[0]) == [1, 64]
     assert model.get_tensor_shape(fc3.input[1]) == [64, 10]
     fc0w = getCustomOp(fc0)
@@ -175,7 +177,7 @@ def test_convert_to_hls_layers_tfc_w1a2():
     model = model.transform(PrepareCppSim())
     model = model.transform(CompileCppSim())
     model = model.transform(SetExecMode("cppsim"))
-    raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
+    raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
     input_dict = {"global_in": nph.to_array(input_tensor)}
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 6089901566..79a48793e0 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -31,24 +31,25 @@
 import numpy as np
 import os
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.fold_constants import FoldConstants
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    SortGraph,
+)
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import (
-    GiveReadableTensorNames,
-    GiveUniqueNodeNames,
-    SortGraph,
-)
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.insert_topk import InsertTopK
 from finn.transformation.streamline.absorb import (
     AbsorbConsecutiveTransposes,
     AbsorbScalarMulAddIntoTopK,
@@ -61,7 +62,6 @@
     MoveAddPastMul,
     MoveScalarLinearPastInvariants,
 )
-from finn.util.basic import gen_finn_dt_tensor
 from finn.util.test import soft_verify_topk

 export_onnx_path = "test_output_synthetic.onnx"
@@ -127,12 +127,12 @@ def make_model(ch, ifmdim):
     model = ModelWrapper(model)

     # set initializers for scalar add/mul nodes
-    model.set_initializer(add0_node.input[1], np.array([0.0]))
-    model.set_initializer(add1_node.input[1], np.array([7.0]))
-    model.set_initializer(add2_node.input[1], np.array([8.0]))
-    model.set_initializer(mul1_node.input[1], np.array([2.0]))
-    model.set_initializer(mul2_node.input[1], np.array([2.0]))
-    model.set_initializer(reshape_node.input[1], np.array([1, -1]))
+    model.set_initializer(add0_node.input[1], np.array([0.0], dtype=np.float32))
+    model.set_initializer(add1_node.input[1], np.array([7.0], dtype=np.float32))
+    model.set_initializer(add2_node.input[1], np.array([8.0], dtype=np.float32))
+    model.set_initializer(mul1_node.input[1], np.array([2.0], dtype=np.float32))
+    model.set_initializer(mul2_node.input[1], np.array([2.0], dtype=np.float32))
+    model.set_initializer(reshape_node.input[1], np.array([1, -1], dtype=np.int64))

     return model

@@ -143,12 +143,13 @@ def make_model(ch, ifmdim):
 @pytest.mark.parametrize("ch", [16])
 # ifmdim
 @pytest.mark.parametrize("ifmdim", [5])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
     model = make_model(ch, ifmdim)
     model.save(export_onnx_path)
-    model = ModelWrapper(export_onnx_path)
+    model = ModelWrapper(export_onnx_path, fix_float64=True)
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
     model = model.transform(GiveUniqueNodeNames())
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index 3efafc040d..0a070e9a1e 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -30,22 +30,22 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.infer_shapes import InferShapes
-from finn.util.basic import gen_finn_dt_tensor


 def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
@@ -91,7 +91,7 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id
         "QuantAvgPool2d",
         ["inp"],
         ["outp"],
-        domain="finn.custom_op.general",
+        domain="qonnx.custom_op.general",
         stride=stride,
         kernel=k,
         ibits=idt.bitwidth(),
@@ -131,6 +131,7 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_convert_to_hls_pool_batch(
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index 633db668d3..5228ade3d0 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -31,25 +31,25 @@
 import numpy as np
 import onnx.helper as oh
 from onnx import TensorProto
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.general.im2col import compute_conv_output_dim
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.convert_to_hls_layers import (
     InferConvInpGen,
-    InferVVAU,
+    InferVectorVectorActivation,
 )
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.infer_shapes import InferShapes
-from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor


 def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
@@ -70,7 +70,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
         tdt = DataType["INT32"]
         thresh_node = oh.make_node(
             "MultiThreshold",
-            domain="finn.custom_op.general",
+            domain="qonnx.custom_op.general",
             inputs=["outp", "T"],
             outputs=["out_act"],
             data_layout="NHWC",
@@ -93,7 +93,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):

     im2col_node = oh.make_node(
         "Im2Col",
-        domain="finn.custom_op.general",
+        domain="qonnx.custom_op.general",
         inputs=["inp"],
         outputs=["im2col_out"],
         kernel_size=[k, k],
@@ -133,7 +133,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
     w_tensor = gen_finn_dt_tensor(wdt, [ofm_ch, 1, k, k])

     # create sparse matrix
-    W_matrix = np.zeros((ofm_ch, ifm_ch, k, k))
+    W_matrix = np.zeros((ofm_ch, ifm_ch, k, k), dtype=np.float32)
     for ch in range(ifm_ch):
         W_matrix[ch][ch] = w_tensor[ch][0]
     W_matrix = W_matrix.astype(np.float32)
@@ -168,6 +168,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
 @pytest.mark.parametrize("stride", [1, 2])
 # padding
 @pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
@@ -182,7 +183,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
     input_dict = {"inp": input_tensor}

     new_model = model.transform(InferConvInpGen())
-    new_model = new_model.transform(InferVVAU())
+    new_model = new_model.transform(InferVectorVectorActivation())

     # set SIMD in ConvInputGen node and PE in VVAU node
@@ -190,7 +191,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
         if n.op_type == "ConvolutionInputGenerator":
             convinputgen_node = getCustomOp(n)
             convinputgen_node.set_nodeattr("SIMD", pe)
-        elif n.op_type == "Vector_Vector_Activate_Batch":
+        elif n.op_type == "VectorVectorActivation":
             vvau_node = getCustomOp(n)
             vvau_node.set_nodeattr("PE", pe)
     new_model = new_model.transform(SetExecMode("cppsim"))
@@ -210,6 +211,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
 @pytest.mark.parametrize("stride", [1, 2])
 # padding
 @pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
@@ -224,7 +226,7 @@ def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
     input_dict = {"inp": input_tensor}

     new_model = model.transform(InferConvInpGen())
-    new_model = new_model.transform(InferVVAU())
+    new_model = new_model.transform(InferVectorVectorActivation())

     # set SIMD in ConvInputGen node and PE in VVAU node
@@ -232,7 +234,7 @@ def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
         if n.op_type == "ConvolutionInputGenerator":
             convinputgen_node = getCustomOp(n)
             convinputgen_node.set_nodeattr("SIMD", pe)
-        elif n.op_type == "Vector_Vector_Activate_Batch":
+        elif n.op_type == "VectorVectorActivation":
             vvau_node = getCustomOp(n)
             vvau_node.set_nodeattr("PE", pe)
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index 8cbf54ec18..6d881f45b6 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -30,20 +30,20 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import gen_finn_dt_tensor


 def make_addstreams_modelwrapper(ch, pe, idt):
@@ -89,6 +89,7 @@ def prepare_inputs(input1, input2):
 @pytest.mark.parametrize("fold", [-1, 2, 1])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode):
     if fold == -1:
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index 949046d4ae..ceafda90e5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -30,21 +30,21 @@
 import numpy as np
 from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor

 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import gen_finn_dt_tensor


 def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
@@ -100,6 +100,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
 @pytest.mark.parametrize("func", ["add", "mul"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_mode):
diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py
new file mode 100644
index 0000000000..5e79ea2dad
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2022, Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_read, axilite_write +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.insert_hook import InsertHook +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +def create_two_fc_model(): + # create a model with two MatrixVectorActivation instances + wdt = DataType["INT2"] + idt = DataType["INT32"] + odt = DataType["INT32"] + m = 4 + actval = 0 + no_act = 1 + binary_xnor_mode = 0 + pe = 2 + simd = 2 + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m]) + mid = helper.make_tensor_value_info("mid", TensorProto.FLOAT, [1, m]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) + + fc0 = helper.make_node( + "MatrixVectorActivation", + ["inp", "w0"], + ["mid"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=m, + MH=m, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=binary_xnor_mode, + noActivation=no_act, + mem_mode="decoupled", + ) + + fc1 = helper.make_node( + "MatrixVectorActivation", + ["mid", "w1"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=m, + MH=m, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=binary_xnor_mode, + noActivation=no_act, + mem_mode="decoupled", + ) + + graph = helper.make_graph( + nodes=[fc0, fc1], + name="fclayer_graph", + inputs=[inp], + outputs=[outp], + value_info=[mid], + ) + + model = helper.make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("mid", idt) + model.set_tensor_datatype("outp", 
odt) + model.set_tensor_datatype("w0", wdt) + model.set_tensor_datatype("w1", wdt) + + # generate weights + w0 = np.eye(m, dtype=np.float32) + w1 = np.eye(m, dtype=np.float32) + model.set_initializer("w0", w0) + model.set_initializer("w1", w1) + + return model + + +@pytest.mark.fpgadataflow +def test_fpgadataflow_checksum(): + # use a graph consisting of two fc layers to test + # checksum node insertion + model = create_two_fc_model() + + # set checksum output hook + for n in model.graph.node: + n0 = getCustomOp(n) + n0.set_nodeattr("output_hook", "checksum") + + model = model.transform(InsertHook()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferShapes()) + + assert ( + len(model.get_nodes_by_op_type("CheckSum")) == 2 + ), """Insertion of + checksum layers was unsuccessful""" + + # to verify the functionality of the checksum layer + # cppsim and rtlsim will be compared + + x = gen_finn_dt_tensor(DataType["INT32"], (1, 4)) + + # cppsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + inp = {"global_in": x} + y_cppsim = oxe.execute_onnx(model, inp, return_full_exec_context=True) + checksum0_cppsim = y_cppsim["CheckSum_0_out1"] + checksum1_cppsim = y_cppsim["CheckSum_1_out1"] + + # in this test case scenario the checksums are equal + assert checksum0_cppsim == checksum1_cppsim, "CheckSums are not equal" + + # rtlsim + model = model.transform(InsertFIFO(True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model.set_metadata_prop("exec_mode", "rtlsim") + + # define function to read out the checksums from axilite + checksums = [] + drain = [] + + def read_checksum_and_drain(sim): + chk_addr = 16 + drain_addr = 32 + for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + axi_name = "s_axi_checksum_{}_".format(i) + checksums.append(axilite_read(sim, chk_addr, basename=axi_name)) + drain.append(axilite_read(sim, drain_addr, basename=axi_name)) + + drain_value = False + + def write_drain(sim): + addr = 32 + for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + axi_name = "s_axi_checksum_{}_".format(i) + axilite_write(sim, addr, drain_value, basename=axi_name) + + rtlsim_exec(model, inp, pre_hook=write_drain, post_hook=read_checksum_and_drain) + checksum0_rtlsim = int(checksums[0]) + checksum1_rtlsim = int(checksums[1]) + checksum0_drain = int(drain[0]) + checksum1_drain = int(drain[1]) + + assert ( + checksum0_rtlsim == checksum0_cppsim + ), """The first checksums do not + match in cppsim vs. rtlsim""" + assert ( + checksum1_rtlsim == checksum1_cppsim + ), """The second checksums do not + match in cppsim vs. rtlsim""" + + assert ( + checksum0_drain == 0 + ), "Drain read doesn't match drain write for first checksum" + assert ( + checksum1_drain == 0 + ), "Drain read doesn't match drain write for second checksum" + + # TODO: test for drain set to true diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py new file mode 100644 index 0000000000..dddc470ec2 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021, Xilinx +# All rights reserved. 
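# ---------------------------------------------------------------------------
# NOTE (editor's illustrative sketch, not part of this diff): the checksum
# test above drives the stitched IP through rtlsim_exec(), whose optional
# pre_hook/post_hook callables receive the simulation object before and after
# data is streamed through, so AXI-lite registers can be written and read
# around the run. A minimal standalone version of that pattern, assuming the
# same register map as above (checksum at offset 16, drain control at offset
# 32) and a single CheckSum core named s_axi_checksum_0_:
from pyverilator.util.axi_utils import axilite_read, axilite_write

def make_checksum_hooks(results):
    def pre_hook(sim):
        # clear the drain control register before any data is streamed
        axilite_write(sim, 32, 0, basename="s_axi_checksum_0_")

    def post_hook(sim):
        # read back the accumulated checksum once streaming has finished
        results.append(axilite_read(sim, 16, basename="s_axi_checksum_0_"))

    return pre_hook, post_hook

# usage: pre, post = make_checksum_hooks(results)
#        rtlsim_exec(model, inp, pre_hook=pre, post_hook=post)
# ---------------------------------------------------------------------------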
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx +import torch +from io import BytesIO +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor +from torch import nn + +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.convert_to_hls_layers import InferConcatLayer +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +def make_concat_model(i_shapes, idt): + class ConcatModel(nn.Module): + def forward(self, *args): + return torch.cat(args, -1) + + torch_model = ConcatModel() + torch_model.eval() + input_t = [] + for i_shape in i_shapes: + input_t.append(torch.zeros(i_shape, dtype=torch.float32)) + input_t = tuple(input_t) + model_bytes = BytesIO() + torch.onnx.export(torch_model, input_t, model_bytes, opset_version=11) + model = onnx.ModelProto.FromString(model_bytes.getvalue()) + model = ModelWrapper(model) + for inp in model.graph.input: + model.set_tensor_datatype(inp.name, idt) + return model + + +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_concat(exec_mode, idt): + i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)] + i_data = [gen_finn_dt_tensor(idt, x) for x in i_shapes] + model = make_concat_model(i_shapes, idt) + assert len(i_shapes) == len(model.graph.input) + assert len(model.graph.output) == 1 + 
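# ---------------------------------------------------------------------------
# NOTE (editor's illustrative sketch, not part of this diff): make_concat_model
# above uses an in-memory ONNX export. torch.onnx.export accepts any file-like
# object, so the protobuf can be round-tripped through BytesIO and handed to
# qonnx's ModelWrapper without touching disk. The helper name below is
# hypothetical; the calls mirror the ones used in the test:
import onnx
import torch
from io import BytesIO
from qonnx.core.modelwrapper import ModelWrapper
from torch import nn

def export_to_modelwrapper(module: nn.Module, example_inputs: tuple):
    buf = BytesIO()
    # serialize the traced module straight into the in-memory buffer
    torch.onnx.export(module, example_inputs, buf, opset_version=11)
    return ModelWrapper(onnx.ModelProto.FromString(buf.getvalue()))
# ---------------------------------------------------------------------------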
exp_oshape = list(i_shapes[0][:-1]) + [sum(x[-1] for x in i_shapes)] + oname = model.graph.output[0].name + assert model.get_tensor_shape(oname) == exp_oshape + exp_out = np.concatenate(i_data, axis=-1) + inp_dict = {} + for i in range(len(i_shapes)): + inp_dict[model.graph.input[i].name] = i_data[i] + ret = execute_onnx(model, inp_dict) + assert (ret[oname] == exp_out).all() + # call transformation to convert to HLS and verify conversion + model = model.transform(InferConcatLayer()) + assert model.graph.node[0].op_type == "StreamingConcat" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 10)) + model = model.transform(HLSSynthIP()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + ret_sim = execute_onnx(model, inp_dict) + assert (exp_out == ret_sim[oname]).all() + + +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_concat_stitchedip(): + idt = DataType["INT4"] + fpga_part = "xc7z020clg400-1" + clk_ns = 10 + i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)] + i_data = [gen_finn_dt_tensor(idt, x) for x in i_shapes] + model = make_concat_model(i_shapes, idt) + assert len(i_shapes) == len(model.graph.input) + assert len(model.graph.output) == 1 + exp_oshape = list(i_shapes[0][:-1]) + [sum(x[-1] for x in i_shapes)] + oname = model.graph.output[0].name + assert model.get_tensor_shape(oname) == exp_oshape + exp_out = np.concatenate(i_data, axis=-1) + inp_dict = {} + for i in range(len(i_shapes)): + inp_dict[model.graph.input[i].name] = i_data[i] + ret = execute_onnx(model, inp_dict) + assert (ret[oname] == exp_out).all() + # call transformation to convert to HLS and verify conversion + model = model.transform(InferConcatLayer()) + assert model.graph.node[0].op_type == "StreamingConcat" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform( + CreateStitchedIP( + fpga_part, + clk_ns, + vitis=False, + ) + ) + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", "trace.vcd") + model.save("dbg.onnx") + ret_sim = execute_onnx(model, inp_dict) + assert (exp_out == ret_sim[oname]).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 47cd7e7ba1..a196ecbb61 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -30,20 +30,20 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from 
finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor def make_single_im2col_modelwrapper( @@ -61,7 +61,7 @@ def make_single_im2col_modelwrapper( "Im2Col", ["inp"], ["outp"], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", stride=[stride, stride], kernel_size=[k, k], input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)), @@ -149,6 +149,7 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("simd", [1, 2]) # depthwise @pytest.mark.parametrize("dw", [0, 1]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow( diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py index 8440ac1fe4..0fc3ca82cf 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py @@ -30,21 +30,23 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.im2col import compute_conv_output_dim -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor + +fpga_part = "xczu3eg-sbva484-1-e" def make_single_im2col_modelwrapper( @@ -68,7 +70,7 @@ def make_single_im2col_modelwrapper( "Im2Col", ["inp"], ["outp"], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", stride=[stride_h, stride_w], kernel_size=[k_h, k_w], input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), @@ -90,7 +92,7 @@ def make_single_im2col_modelwrapper( def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, parallel_window, dw=0 ): k_h, k_w = k ifm_dim_h, ifm_dim_w = ifm_dim @@ -122,6 +124,7 @@ def make_single_slidingwindow_modelwrapper( inputDataType=idt.name, outputDataType=odt.name, depthwise=dw, + parallel_window=parallel_window, ) graph = helper.make_graph( nodes=[SlidingWindow_node], @@ -155,8 +158,7 @@ def prepare_inputs(input_tensor): # Stride 
@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) # Dilation -# @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) -@pytest.mark.parametrize("dilation", [[1, 1]]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") @@ -165,10 +167,23 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("dw", [0, 1]) # Flip dimensions @pytest.mark.parametrize("flip", [False, True]) +# Use parallel window output variant +@pytest.mark.parametrize("parallel_window", [False, True]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow_1d( - idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + exec_mode, + simd, + dw, + flip, + parallel_window, ): if flip: k = k[::-1] @@ -186,6 +201,11 @@ def test_fpgadataflow_slidingwindow_1d( """Dilation value greater than 1 and stride greater than 1 currently not supported for 1D convolutions""" ) + if (dilation_h > 1 or dilation_w > 1) and dw == 0: + pytest.skip( + """Dilation value greater than 1 currently not supported + for non-dws 1D convolutions""" + ) if simd > ifm_ch: pytest.skip("SIMD cannot be larger than number of input channels") @@ -203,6 +223,7 @@ def test_fpgadataflow_slidingwindow_1d( stride=stride, dilation=dilation, idt=idt, + parallel_window=parallel_window, dw=dw, ) @@ -213,7 +234,7 @@ def test_fpgadataflow_slidingwindow_1d( elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP(fpga_part, 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) else: diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 73bf1165af..6d37cf9d94 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -30,22 +30,22 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import gen_finn_dt_tensor def make_dupstreams_modelwrapper(ch, pe, idim, 
idt): @@ -94,6 +94,7 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode): if fold == -1: diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 248b591eb4..bcf2a1fe3d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -29,16 +29,16 @@ import pytest from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype): @@ -83,6 +83,7 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize("OUTWidth", [2, 4]) # finn_dtype @pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype): diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 4d3074fe14..b9c74185d9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -30,16 +30,16 @@ import os from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor build_dir = os.environ["FINN_BUILD_DIR"] test_fpga_part = "xc7z020clg400-1" @@ -87,6 +87,7 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize("depth", [16]) # finn_dtype @pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"]]) # , DataType["INT2"]]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index b564273c09..2e2da0da7a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -31,21 +31,22 @@ import numpy as np import os from onnx import TensorProto, helper +from qonnx.core.datatype 
import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import gen_finn_dt_tensor, pynq_part_map +from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") test_fpga_part = pynq_part_map[test_pynq_board] @@ -111,6 +112,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]]) # execution mode @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index 2299cc6e8f..a37e6e3271 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -30,20 +30,20 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor def make_accpool_modelwrapper(ch, pe, idim, idt): @@ -87,6 +87,7 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): if fold == -1: diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py 
index a4e75f5254..80f2d724ad 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -31,11 +31,14 @@ import numpy as np import os from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.util.basic import gen_finn_dt_tensor -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) @@ -48,14 +51,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild -from finn.transformation.general import GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.util.basic import ( - alveo_default_platform, - alveo_part_map, - gen_finn_dt_tensor, - pynq_part_map, -) +from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map from finn.util.pyverilator import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip @@ -66,7 +62,7 @@ def create_one_fc_model(mem_mode="const"): - # create a model with a StreamingFCLayer instance with no activation + # create a model with a MatrixVectorActivation instance with no activation # the wider range of the full accumulator makes debugging a bit easier wdt = DataType["INT2"] idt = DataType["INT32"] @@ -82,7 +78,7 @@ def create_one_fc_model(mem_mode="const"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", ["inp", "w0"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -120,7 +116,7 @@ def create_one_fc_model(mem_mode="const"): def create_two_fc_model(mem_mode="decoupled"): - # create a model with two StreamingFCLayer instances + # create a model with two MatrixVectorActivation instances wdt = DataType["INT2"] idt = DataType["INT32"] odt = DataType["INT32"] @@ -136,7 +132,7 @@ def create_two_fc_model(mem_mode="decoupled"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", ["inp", "w0"], ["mid"], domain="finn.custom_op.fpgadataflow", @@ -155,7 +151,7 @@ def create_two_fc_model(mem_mode="decoupled"): ) fc1 = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", ["mid", "w1"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -201,6 +197,7 @@ def create_two_fc_model(mem_mode="decoupled"): @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = create_one_fc_model(mem_mode) @@ -214,7 +211,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - assert model.graph.node[0].op_type == "StreamingFCLayer_Batch" + assert model.graph.node[0].op_type == "MatrixVectorActivation" assert model.graph.node[-1].op_type == 
"TLastMarker" model.save( ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode @@ -222,6 +219,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_do_stitch(mem_mode): model = load_test_checkpoint_or_skip( @@ -239,6 +237,7 @@ def test_fpgadataflow_ipstitch_do_stitch(mem_mode): @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_rtlsim(mem_mode): model = load_test_checkpoint_or_skip( @@ -287,6 +286,7 @@ def test_fpgadataflow_ipstitch_rtlsim(mem_mode): @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_ipstitch_synth_ooc(mem_mode): @@ -308,6 +308,7 @@ def test_fpgadataflow_ipstitch_synth_ooc(mem_mode): assert ret["fmax_mhz"] > 100 +@pytest.mark.fpgadataflow def test_fpgadataflow_ipstitch_iodma_floorplan(): model = create_one_fc_model() if model.graph.node[0].op_type == "StreamingDataflowPartition": @@ -330,10 +331,11 @@ def test_fpgadataflow_ipstitch_iodma_floorplan(): @pytest.mark.parametrize("period_ns", [5]) # override mem_mode to external @pytest.mark.parametrize("extw", [True, False]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis -def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw): +def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): if "VITIS_PATH" not in os.environ: pytest.skip("VITIS_PATH not set") platform = alveo_default_platform[board] @@ -344,6 +346,8 @@ def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw): assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" assert os.path.isfile(sdp_node.get_nodeattr("model")) model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, period_ns)) model = model.transform(VitisBuild(fpga_part, period_ns, platform)) model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") assert model.get_metadata_prop("platform") == "alveo" @@ -353,9 +357,10 @@ def test_fpgadataflow_ipstitch_vitis(board, period_ns, extw): # board @pytest.mark.parametrize("board", ["Pynq-Z1"]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_ipstitch_zynqbuild(board): +def test_fpgadataflow_ipstitch_zynqbuild_end2end(board): model = create_two_fc_model() if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 8ed06c8bdf..a9b98ecaf8 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -30,18 +30,18 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from 
finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor from finn.util.test import soft_verify_topk @@ -92,6 +92,7 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("k", [1, 5]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): np.random.seed(0) diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index 45678bbdf2..da4204c81a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -31,22 +31,24 @@ import numpy as np import torch from brevitas.export import FINNManager +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor from torch import nn -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.convert_to_hls_layers import InferLookupLayer +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.util.basic import gen_finn_dt_tensor def make_lookup_model(embeddings, ishape, idt, edt): @@ -87,6 +89,7 @@ def forward(self, x): ) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): @@ -124,9 +127,57 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 10)) + model = model.transform(PrepareIP("xczu3eg-sbva484-1-e", 10)) model = model.transform(HLSSynthIP()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(PrepareRTLSim()) ret_sim = execute_onnx(model, {iname: itensor}) assert (exp_out == ret_sim[oname]).all() + + +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_lookup_external(): + fpga_part = "xczu3eg-sbva484-1-e" + edt = DataType["INT8"] + embedding_cfg = (200000, DataType["UINT32"], 300) + ishape = (1, 600) + num_embeddings, idt, 
embedding_dim = embedding_cfg + eshape = (num_embeddings, embedding_dim) + exp_oshape = tuple(list(ishape) + [embedding_dim]) + embeddings = gen_finn_dt_tensor(edt, eshape) + model = make_lookup_model(embeddings, ishape, idt, edt) + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "Gather" + iname = model.graph.input[0].name + ename = model.graph.node[0].input[0] + oname = model.graph.output[0].name + assert model.get_tensor_datatype(iname) == idt + assert model.get_tensor_datatype(ename) == edt + assert model.get_tensor_datatype(oname) == edt + assert tuple(model.get_tensor_shape(ename)) == eshape + assert tuple(model.get_tensor_shape(oname)) == exp_oshape + assert (model.get_initializer(ename) == embeddings).all() + # itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64) + # itensor = np.clip(itensor, 0, num_embeddings - 1) + # ret = execute_onnx(model, {iname: itensor}) + # exp_out = np.take(embeddings, itensor, axis=0) + # assert (exp_out == ret[oname]).all() + # call transformation to convert to HLS and verify conversion + model = model.transform(InferLookupLayer()) + assert model.graph.node[0].op_type == "Lookup" + assert model.graph.node[0].input[0] == iname + assert model.graph.node[0].input[1] == ename + assert model.graph.node[0].output[0] == oname + getCustomOp(model.graph.node[0]).set_nodeattr("mem_mode", "external") + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, 10)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(fpga_part, 10.0)) + ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames")) + # check some generated files/interfaces for the generated stitched IP + assert ifnames["aximm"] == [["m_axi_gmem0", 32]] + assert ifnames["s_axis"] == [["s_axis_0", 32]] + assert ifnames["m_axis"] == [["m_axis_0", 32]] + assert ifnames["axilite"] == ["s_axi_control_0"] diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py similarity index 95% rename from tests/fpgadataflow/test_fpgadataflow_fclayer.py rename to tests/fpgadataflow/test_fpgadataflow_mvau.py index 02c3a3dc95..d1895a1267 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -29,24 +29,24 @@ import pytest import numpy as np +import qonnx.custom_op.general.xnorpopcount as xp from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor import finn.core.onnx_exec as oxe -import finn.custom_op.general.xnorpopcount as xp from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.multithreshold import multithreshold -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from 
finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -56,7 +56,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non assert mw % simd == 0 # there are two ways to implement bipolar weights and inputs for - # StreamingFC: + # MatrixVectorActivation: # - specify their datatypes as such # - specify their datatypes as BINARY and use binaryXnorMode if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: @@ -85,7 +85,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -146,6 +146,7 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @@ -233,6 +234,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @@ -305,9 +307,9 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "StreamingFCLayer_Batch_0" in hls_synt_res_est + assert "MatrixVectorActivation_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -332,6 +334,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh ) @@ -405,9 +408,9 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "StreamingFCLayer_Batch_0" in hls_synt_res_est + assert "MatrixVectorActivation_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index fe52a73fc0..e3c79fa44f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++
b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -26,15 +26,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames from finn.analysis.fpgadataflow.res_estimation import ( res_estimation, res_estimation_complete, ) -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveUniqueNodeNames def check_two_dict_for_equality(dict1, dict2): @@ -50,6 +52,7 @@ def check_two_dict_for_equality(dict1, dict2): return True +@pytest.mark.fpgadataflow def test_res_estimate(): mw = mh = 4 simd = 1 @@ -64,7 +67,7 @@ def test_res_estimate(): node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -94,7 +97,7 @@ def test_res_estimate(): model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { - "StreamingFCLayer_Batch_0": { + "MatrixVectorActivation_0": { "BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 357, @@ -111,7 +114,7 @@ def test_res_estimate(): prod_resource_estimation = model.analysis(res_estimation_complete) expect_resource_estimation = { - "StreamingFCLayer_Batch_0": [ + "MatrixVectorActivation_0": [ { "BRAM_18K": 0, "BRAM_efficiency": 1, diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py similarity index 68% rename from tests/fpgadataflow/test_layer_streaming_maxpool_batch.py rename to tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 236eb2a034..a3968cf797 100644 --- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -28,25 +28,27 @@ import pytest -import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.convert_to_hls_layers import InferStreamingMaxPool from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor -def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt): +def 
make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode): k_h, k_w = k ifm_dim_h, ifm_dim_w = ifm_dim ofm_dim_h, ofm_dim_w = ofm_dim @@ -62,9 +64,10 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt): "MaxPoolNHWC", ["inp"], ["outp"], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", kernel_shape=[k_h, k_w], strides=[k_h, k_w], + ceil_mode=ceil_mode, pads=[0, 0, 0, 0], ) graph = helper.make_graph( @@ -80,42 +83,6 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt): return model -def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - ofm_dim_h, ofm_dim_w = ofm_dim - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] - ) - - smp_node = helper.make_node( - "StreamingMaxPool_Batch", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - PoolDim=[k_h, k_w], - NumChannels=ifm_ch, - ImgDim=[ifm_dim_h, ifm_dim_w], - dataType=idt.name, - ) - graph = helper.make_graph( - nodes=[smp_node], name="smp_graph", inputs=[inp], outputs=[outp] - ) - - model = helper.make_model(graph, producer_name="smp-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - def prepare_inputs(input_tensor): return {"inp": input_tensor} @@ -127,14 +94,21 @@ def prepare_inputs(input_tensor): # kernel size @pytest.mark.parametrize("k", [2, 4]) # input dimension -@pytest.mark.parametrize("ifm_dim", [4, 8]) +@pytest.mark.parametrize("ifm_dim", [4, 10]) # input channels @pytest.mark.parametrize("ifm_ch", [1, 3]) # 1,3 +# pe +@pytest.mark.parametrize("pe", [1, 3]) +# ceil mode +@pytest.mark.parametrize("ceil_mode", [1]) # execution mode @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mode): +def test_fpgadataflow_streamingmaxpool( + idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode +): ifm_dim_h = ifm_dim k_h = k if dim_1d: @@ -148,22 +122,35 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod stride_h = k_h stride_w = k_w - ofm_dim_h = int(((ifm_dim_h - k_h) / stride_h) + 1) - ofm_dim_w = int(((ifm_dim_w - k_w) / stride_w) + 1) + ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, stride_h, 0, ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, stride_w, 0, ceil_mode) ofm_dim = (ofm_dim_h, ofm_dim_w) if idt == DataType["BIPOLAR"] and dim_1d: pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)") - if ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0: - pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0") + if (ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0) and (not dim_1d): + pytest.skip("StreamingMaxPool_2d test w/ ImgDim % PoolDim != 0 not implemented") + if pe > ifm_ch: + pytest.skip("PE cannot be larger than number of input channels") + if pe > 1 and (not dim_1d): + pytest.skip("PE>1 only supported for StreamingMaxPool_1d") x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) # prepare input data input_dict = prepare_inputs(x) - golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt) + golden = 
make_single_maxpoolnhwc_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode + ) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - model = make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt) + model = golden.transform(InferStreamingMaxPool()) + model = model.transform(InferShapes()) + + assert model.graph.node[0].op_type == "StreamingMaxPool_Batch" + + # Ensure PE value is set + streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + getCustomOp(streamingmaxpool_node).set_nodeattr("PE", pe) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -172,11 +159,11 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu3eg-sbva484-1-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) else: - raise Exception("Unknown exec_mode in test_layer_streaming_maxpool_batch") + raise Exception("Unknown exec_mode in test_fpgadataflow_streamingmaxpool") # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -184,9 +171,12 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod if exec_mode == "rtlsim": node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + # inst = getCustomOp(node) + # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) + # FIXME: maxpool cycles prediction needs a fix + # most likely due to inaccurate cycle prediction of + # nested for-loops + # assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 341bd3f370..706679b680 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -31,15 +31,18 @@ import numpy as np import os from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_read, axilite_write +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.core.rtlsim_exec import rtlsim_exec -from finn.custom_op.general.multithreshold import multithreshold -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -48,19 +51,22 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from 
finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor -from finn.util.pyverilator import axilite_read, axilite_write -test_fpga_part = "xc7z020clg400-1" +test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 -def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode): +def make_single_thresholding_modelwrapper( + T, pe, idt, odt, actval, mem_mode, n_inp_vecs +): NumChannels = T.shape[0] - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, NumChannels]) + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] + ) node_inp_list = ["inp", "thresh"] @@ -78,6 +84,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode): outputDataType=odt.name, ActVal=actval, mem_mode=mem_mode, + numInputVectors=n_inp_vecs, ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -109,16 +116,18 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode): @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # memory mode @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): if nf == -1: nf = ich pe = ich // nf + n_inp_vecs = [1, 2, 2] assert ich % pe == 0 # generate input data - x = gen_finn_dt_tensor(idt, (1, ich)) + x = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) odt = act n_steps = act.get_num_possible_values() - 1 @@ -135,7 +144,9 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode) + model = make_single_thresholding_modelwrapper( + T, pe, idt, odt, actval, mem_mode, n_inp_vecs + ) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -153,7 +164,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): # package input data as dictionary input_dict = {"inp": x} - y = multithreshold(x, T) + # multithreshold util fxn wants NCHW input, not NHWC + y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) + # convert back to NHWC for comparison to hw outputs + y = np.transpose(y, (0, 2, 3, 1)) if act == DataType["BIPOLAR"]: # binary to bipolar y = 2 * y - 1 @@ -183,8 +197,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): assert exp_cycles != 0 +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_runtime_thresholds_single_layer(): + n_inp_vecs = [1, 2, 2] mem_mode = "decoupled" act = DataType["INT4"] idt = DataType["INT16"] @@ -194,7 +210,7 @@ def test_runtime_thresholds_single_layer(): assert ich % pe == 0 # generate input data - in_tensor = gen_finn_dt_tensor(idt, (1, ich)) + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) odt = act n_steps = act.get_num_possible_values() - 1 @@ -207,7 +223,9 @@ def test_runtime_thresholds_single_layer(): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode) + model = make_single_thresholding_modelwrapper( + T, pe, idt, odt, actval, mem_mode, n_inp_vecs + ) 
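# --- Editor's note (not part of the patch): a minimal sketch of the NHWC <-> NCHW
# round trip these thresholding tests now perform. The qonnx multithreshold
# reference function expects NCHW input, while the hardware ops under test
# produce NHWC. The per-channel comparison below is a stand-in for the real
# multithreshold(x, T) call, just to show the layout math; shapes mirror
# n_inp_vecs=[1, 2, 2] with ich=8 from the test.
import numpy as np

x_nhwc = np.random.rand(1, 2, 2, 8).astype(np.float32)
thresh = np.float32(0.5)
x_nchw = np.transpose(x_nhwc, (0, 3, 1, 2))       # NHWC -> NCHW for the reference fxn
y_nchw = (x_nchw >= thresh).astype(np.float32)    # stand-in for multithreshold(x, T)
y_nhwc = np.transpose(y_nchw, (0, 2, 3, 1))       # back to NHWC to compare with hw output
assert np.array_equal(y_nhwc, (x_nhwc >= thresh).astype(np.float32))
# --- end editor's note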
op_inst = getCustomOp(model.graph.node[0]) op_inst.set_nodeattr("runtime_writeable_weights", 1) op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") @@ -227,7 +245,7 @@ def test_runtime_thresholds_single_layer(): # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) - in_tensor = np.tile(in_tensor, (2, 1)) + in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) exec_ctx = {"inp": in_tensor} extracted_weight_stream = [] @@ -244,7 +262,13 @@ def read_weights(sim): # only use second batch element in output; first will be invalid due to # old weights (see above) y = exec_ctx["outp"][1] - expected = multithreshold(in_tensor, T)[1] + + # multithreshold util fxn wants NCHW input, not NHWC + expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T) + # convert back to NHWC for comparison to hw outputs + expected = np.transpose(expected, (0, 2, 3, 1))[1] + + # expected = multithreshold(in_tensor, T)[1] if act == DataType["BIPOLAR"]: # binary to bipolar expected = 2 * expected - 1 @@ -273,7 +297,10 @@ def write_weights(sim): rtlsim_exec(model, exec_ctx, pre_hook=write_weights) y = exec_ctx["outp"][1] - expected = multithreshold(in_tensor, new_weights)[1] + # multithreshold util fxn wants NCHW input, not NHWC + expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), new_weights) + # convert back to NHWC for comparison to hw outputs + expected = np.transpose(expected, (0, 2, 3, 1))[1] if act == DataType["BIPOLAR"]: # binary to bipolar expected = 2 * expected - 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index 1709cfe329..d1ef0b890a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -32,13 +32,18 @@ import os import torch from brevitas.export import FINNManager +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.make_input_chanlast import MakeInputChannelsLast from torch import nn import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.base import Transformation from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -46,11 +51,6 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.make_input_chanlast import MakeInputChannelsLast tmpdir = 
os.environ["FINN_BUILD_DIR"] @@ -125,6 +125,7 @@ def forward(self, x): @pytest.mark.parametrize("NumChannels", [4]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode): diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 9eb3a7f451..c48448787d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -30,25 +30,25 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.multithreshold import multithreshold -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): - W_sparse = np.zeros((channels, channels, k_h, k_w)) + W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) for ch in range(channels): W_sparse[ch][ch] = W_conv[ch][0] W_conv = W_sparse.astype(np.float32) @@ -98,7 +98,7 @@ def _make_single_vvau_modelwrapper( actval = 0 VVAU_node = helper.make_node( - "Vector_Vector_Activate_Batch", + "VectorVectorActivation", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -158,6 +158,7 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("channels", [3, 4]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_vvau( @@ -232,7 +233,7 @@ def test_fpgadataflow_vvau( assert (y_produced == y_expected).all(), "cppsim failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0] + node = model.get_nodes_by_op_type("VectorVectorActivation")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 0196a78d5c..16fed5c3cb 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -30,23 +30,24 @@ import numpy as np import os +from pyverilator.util.axi_utils import axilite_read, axilite_write +from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor 
-from finn.core.datatype import DataType from finn.core.rtlsim_exec import rtlsim_exec -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.general import GiveUniqueNodeNames -from finn.util.basic import gen_finn_dt_tensor from finn.util.create import hls_random_mlp_maker -from finn.util.pyverilator import axilite_read, axilite_write -test_fpga_part = "xc7z020clg400-1" +test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 +@pytest.mark.fpgadataflow @pytest.mark.vivado def test_runtime_weights_single_layer(): idt = DataType["UINT32"] @@ -67,7 +68,7 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - fcl = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + fcl = model.get_nodes_by_op_type("MatrixVectorActivation")[0] op_inst = getCustomOp(fcl) op_inst.set_nodeattr("mem_mode", "decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index 492f208671..8ea0e18f2c 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -30,16 +30,16 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) from finn.transformation.fpgadataflow.set_folding import SetFolding -from finn.transformation.general import GiveUniqueNodeNames from finn.util.test import load_test_checkpoint_or_skip @@ -66,7 +66,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): simd = 1 FCLayer_nodes += [ helper.make_node( - "StreamingFCLayer_Batch", + "MatrixVectorActivation", [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)], [tensors[i + 1].name], domain="finn.custom_op.fpgadataflow", @@ -112,6 +112,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): @pytest.mark.parametrize("target_fps", [30, 10**5, 10**7]) # target chip or board @pytest.mark.parametrize("platform", ["Pynq-Z1", "Ultra96", "U200"]) +@pytest.mark.fpgadataflow def test_set_folding(target_fps, platform): model = make_multi_fclayer_model( diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py index bc9a31d49c..a6dff788dc 100644 --- a/tests/transformation/streamline/test_absorb_mul_into_topk.py +++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py @@ -29,16 +29,17 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from 
qonnx.transformation.insert_topk import InsertTopK import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.insert_topk import InsertTopK from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK +@pytest.mark.streamline # parameter to indicate if mul parameter is negative or positive @pytest.mark.parametrize("mul_positive", [True, False]) # parameter to indicate if mul parameter is scalar or not diff --git a/tests/transformation/streamline/test_absorb_opposite_transposes.py b/tests/transformation/streamline/test_absorb_opposite_transposes.py index 859e691277..51ea5edfc4 100644 --- a/tests/transformation/streamline/test_absorb_opposite_transposes.py +++ b/tests/transformation/streamline/test_absorb_opposite_transposes.py @@ -26,16 +26,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import numpy as np import onnx.helper as oh from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.absorb import AbsorbConsecutiveTransposes +@pytest.mark.streamline def test_absorb_opposite_transposes(): np.random.seed(0) input_shape = [1, 3, 4, 2] diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py index 1e5d5fe580..1358d468c0 100644 --- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py +++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py @@ -1,18 +1,19 @@ import pytest import numpy as np +import qonnx.core.data_layout as DataLayout from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes -import finn.core.data_layout as DataLayout import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten +@pytest.mark.streamline # permutation of transpose node @pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]]) # reshape or flatten diff --git a/tests/transformation/streamline/test_collapse_repeated_op.py b/tests/transformation/streamline/test_collapse_repeated_op.py index 1741ab6b8f..268e0ffc5c 100644 --- a/tests/transformation/streamline/test_collapse_repeated_op.py +++ b/tests/transformation/streamline/test_collapse_repeated_op.py @@ -31,13 +31,14 @@ import numpy as np import onnx.helper as oh from onnx import TensorProto +from 
qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul +@pytest.mark.streamline def test_collapse_repeated_op(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2]) add_param_0 = oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [2]) @@ -74,6 +75,7 @@ def test_collapse_repeated_op(): assert new_model.graph.node[1].op_type == "Mul" +@pytest.mark.streamline @pytest.mark.parametrize( "test_args", [("Add", CollapseRepeatedAdd()), ("Mul", CollapseRepeatedMul())], diff --git a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py index fca073f5a0..04ab9bf0b9 100644 --- a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py +++ b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py @@ -26,16 +26,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import numpy as np import onnx.helper as oh from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import FactorOutMulSignMagnitude +@pytest.mark.streamline def test_factor_out_mul_sign_magnitude(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2]) mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 2]) diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py index 098b3f9d4f..12633d750b 100644 --- a/tests/transformation/streamline/test_linear_past_eltwise.py +++ b/tests/transformation/streamline/test_linear_past_eltwise.py @@ -31,15 +31,16 @@ import numpy as np import os from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd export_onnx_path = "test_linear_past_eltwise.onnx" +np_default_dtype = np.float32 # construct a synthetic graph to test: # topk insertion, topk conversion to hls, add conversion to hls @@ -81,14 +82,15 @@ def make_model(shape): model = ModelWrapper(model) # set initializers for scalar add/mul nodes - model.set_initializer(add1_node.input[1], np.array([7.0])) - model.set_initializer(add2_node.input[1], np.array([8.0])) - model.set_initializer(mul1_node.input[1], np.array([3.0])) - model.set_initializer(mul2_node.input[1], np.array([3.0])) + model.set_initializer(add1_node.input[1], np.array([7.0], dtype=np_default_dtype)) + 
model.set_initializer(add2_node.input[1], np.array([8.0], dtype=np_default_dtype)) + model.set_initializer(mul1_node.input[1], np.array([3.0], dtype=np_default_dtype)) + model.set_initializer(mul2_node.input[1], np.array([3.0], dtype=np_default_dtype)) return model +@pytest.mark.streamline # channels @pytest.mark.parametrize("ch", [64]) # ifmdim @@ -133,6 +135,7 @@ def test_linear_past_eltwise_add(ch, ifmdim): os.remove(export_onnx_path) +@pytest.mark.streamline @pytest.mark.parametrize("ch", [64, 1]) # ifmdim @pytest.mark.parametrize("ifmdim", [-1, 7]) diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py new file mode 100644 index 0000000000..aa77b5cf1a --- /dev/null +++ b/tests/transformation/streamline/test_maxpool_nhwc.py @@ -0,0 +1,109 @@ +import pytest + +import onnx +import onnx.helper as oh +from onnx import TensorProto +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.streamline.reorder import MakeMaxPoolNHWC + + +def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): + ofm_dim_h = compute_pool_output_dim( + ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode + ) + ofm_dim_w = compute_pool_output_dim( + ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode + ) + inp = oh.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] + ) + outp_mp = oh.make_tensor_value_info( + "outp_mp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] + ) + outp = oh.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] + ) + + maxpool_node = oh.make_node( + "MaxPool", + inputs=["inp"], + outputs=["out_mp"], + ceil_mode=ceil_mode, + kernel_shape=kernel_shape, + pads=pads, + strides=strides, + ) + + transpose_node = onnx.helper.make_node( + "Transpose", + inputs=["out_mp"], + outputs=["outp"], + name="Transpose1", + perm=[0, 2, 3, 1], + ) + + graph = oh.make_graph( + nodes=[maxpool_node, transpose_node], + name="maxpool_graph", + inputs=[inp], + outputs=[outp], + value_info=[outp_mp], + ) + + model = oh.make_model(graph, producer_name="maxpool_model") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + model = model.transform(InferShapes()) + + return model + + +@pytest.mark.streamline +# input dimension +@pytest.mark.parametrize("ifm_dim", [[8, 8], [9, 9]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [3]) +# kernel shape +@pytest.mark.parametrize("kernel_shape", [[2, 2]]) +# padding +@pytest.mark.parametrize("pads", [[0, 0, 0, 0], [1, 1, 1, 1]]) +# strides +@pytest.mark.parametrize("strides", [[2, 2]]) +# ceil_mode +@pytest.mark.parametrize("ceil_mode", [0, 1]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): + # create MaxPool node + maxpool_model = create_maxpool( + ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt + ) + + # generate input tensor for testing + input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) + input_dict = {"inp": input_tensor} + + # execute first model + output_dict = oxe.execute_onnx(maxpool_model, input_dict) + 
expected = output_dict["outp"] + + # transform MaxPool into MaxPoolNHWC + maxpool_model = maxpool_model.transform(MakeMaxPoolNHWC()) + + # execute transformed model + output_node_name = maxpool_model.graph.output[0].name + output_dict = oxe.execute_onnx( + maxpool_model, input_dict, return_full_exec_context=False + ) + output = output_dict[output_node_name] + + # compare outputs + assert (expected == output).all() diff --git a/tests/transformation/streamline/test_move_add_past_mul.py b/tests/transformation/streamline/test_move_add_past_mul.py index 163b9d310a..0fb4dd9f7a 100644 --- a/tests/transformation/streamline/test_move_add_past_mul.py +++ b/tests/transformation/streamline/test_move_add_past_mul.py @@ -26,16 +26,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import numpy as np import onnx.helper as oh from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import MoveAddPastMul +@pytest.mark.streamline def test_move_add_past_mul_single(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2]) add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [2]) @@ -65,6 +68,7 @@ def test_move_add_past_mul_single(): assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0] +@pytest.mark.streamline def test_move_add_past_mul_multi(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2]) add_param_0 = oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [2]) @@ -103,6 +107,7 @@ def test_move_add_past_mul_multi(): assert new_model.graph.node[i].output[0] == new_model.graph.node[i + 1].input[0] +@pytest.mark.streamline def test_move_add_past_mul_only_if_linear(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2]) top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2]) diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py index e4be8fc383..7eb7f9f1af 100644 --- a/tests/transformation/streamline/test_move_chw_add_past_conv.py +++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py @@ -30,14 +30,15 @@ import numpy as np from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.im2col import compute_conv_output_dim -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveAddPastConv +@pytest.mark.streamline # input dimension @pytest.mark.parametrize("idim", [4, 7]) # kernel size diff --git a/tests/transformation/streamline/test_move_flatten_past_affine.py b/tests/transformation/streamline/test_move_flatten_past_affine.py index ef01436dc9..8c3f71d1f3 100644 --- a/tests/transformation/streamline/test_move_flatten_past_affine.py +++ b/tests/transformation/streamline/test_move_flatten_past_affine.py @@ -28,20 +28,21 @@ import pytest import numpy as np +import qonnx.core.data_layout as DataLayout from onnx import 
TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor -import finn.core.data_layout as DataLayout import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveFlattenPastAffine -from finn.util.basic import gen_finn_dt_tensor +@pytest.mark.streamline # data layout @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW]) # batch size diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py index 6086f7804e..83d7a28c05 100644 --- a/tests/transformation/streamline/test_move_flatten_past_topk.py +++ b/tests/transformation/streamline/test_move_flatten_past_topk.py @@ -27,21 +27,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest +import qonnx.core.data_layout as DataLayout from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.util.basic import gen_finn_dt_tensor -import finn.core.data_layout as DataLayout import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.insert_topk import InsertTopK from finn.transformation.streamline.reorder import MoveFlattenPastTopK -from finn.util.basic import gen_finn_dt_tensor +@pytest.mark.streamline # data layout @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW]) # batch size diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py index 60e76b8b07..4986363ff4 100644 --- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py +++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py @@ -1,12 +1,39 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest from onnx import TensorProto from onnx import helper as oh +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd -from finn.util.basic import gen_finn_dt_tensor def create_model(perm): @@ -60,6 +87,7 @@ def create_model(perm): return model +@pytest.mark.streamline # Permutation of transpose node @pytest.mark.parametrize("perm", [[0, 3, 1, 2], [0, 2, 3, 1]]) def test_move_identical_op_past_join_op(perm): diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py index fca05afa5b..bf25eee9e6 100644 --- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py +++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py @@ -1,10 +1,39 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import numpy as np from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold @@ -18,6 +47,7 @@ def get_multithreshold_rand_params(channels, num_of_thres, seed=None): return thres +@pytest.mark.streamline def test_move_maxpool_past_multithreshold(): # generate test vectors of correct shape ch = 64 @@ -52,7 +82,7 @@ def test_move_maxpool_past_multithreshold(): "MultiThreshold", ["t1", "thres1"], ["t2"], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", out_dtype="BIPOLAR", out_bias=-1.0, out_scale=1.0, @@ -64,7 +94,7 @@ def test_move_maxpool_past_multithreshold(): "MultiThreshold", ["t3", "thres2"], ["top_out"], - domain="finn.custom_op.general", + domain="qonnx.custom_op.general", out_dtype="UINT4", ) ] @@ -82,7 +112,7 @@ def test_move_maxpool_past_multithreshold(): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) - model.set_initializer("thres1", np.array([[0]])) + model.set_initializer("thres1", np.array([[0]], dtype=np.float32)) model.set_initializer( "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0) ) diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py index e9e956d845..401631a728 100644 --- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py +++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py @@ -1,17 +1,45 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.im2col import compute_conv_output_dim -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveMulPastDWConv -from finn.util.basic import gen_finn_dt_tensor +@pytest.mark.streamline # input dimension @pytest.mark.parametrize("ifm_dim", [4, 7]) # input channels diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py index 2c51aaf36a..fcc1b65132 100755 --- a/tests/transformation/streamline/test_move_mul_past_maxpool.py +++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py @@ -1,18 +1,46 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
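# --- Editor's note (not part of the patch): a hedged sketch of the standard
# output-size arithmetic behind the qonnx compute_conv_output_dim /
# compute_pool_output_dim helpers these tests now import; the exact qonnx
# signatures may differ, this only illustrates the formula.
import math

def output_dim(ifm, k, stride, pad, ceil_mode=0):
    span = ifm + 2 * pad - k
    return math.ceil(span / stride) + 1 if ceil_mode else span // stride + 1

assert output_dim(8, 2, 2, 0) == 4                # window divides evenly
assert output_dim(9, 2, 2, 0, ceil_mode=1) == 5   # ceil_mode keeps the partial last window
assert output_dim(9, 2, 2, 0, ceil_mode=0) == 4   # floor mode drops it
# --- end editor's note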
import pytest import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveMulPastMaxPool -from finn.util.basic import gen_finn_dt_tensor +@pytest.mark.streamline # input dimension @pytest.mark.parametrize("ifm_dim", [4, 7]) # input channels diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py index 364590f933..5064fa3fca 100644 --- a/tests/transformation/streamline/test_move_past_fork.py +++ b/tests/transformation/streamline/test_move_past_fork.py @@ -1,14 +1,42 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
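# --- Editor's note (not part of the patch): an illustrative sketch, under the
# assumption that MoveLinearPastFork works by duplicating the linear op onto
# each branch of the fork. The numpy check below shows why that rewrite is
# value-preserving for a scalar Add feeding two consumers.
import numpy as np

x = np.random.rand(1, 4).astype(np.float32)
a = np.float32(3.0)
forked = x + a                                        # one Add before the fork
branch0, branch1 = forked * 2, forked - 1
branch0_moved, branch1_moved = (x + a) * 2, (x + a) - 1   # Add duplicated per branch
assert np.allclose(branch0, branch0_moved) and np.allclose(branch1, branch1_moved)
# --- end editor's note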
import pytest import numpy as np from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveLinearPastFork +@pytest.mark.streamline @pytest.mark.parametrize("ch", [64, 1]) # ifmdim @pytest.mark.parametrize("ifmdim", [-1, 7]) diff --git a/tests/transformation/streamline/test_move_scalar_past_conv.py b/tests/transformation/streamline/test_move_scalar_past_conv.py index 5e2ded0174..59b8b8f8b2 100644 --- a/tests/transformation/streamline/test_move_scalar_past_conv.py +++ b/tests/transformation/streamline/test_move_scalar_past_conv.py @@ -1,15 +1,43 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
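# --- Editor's note (not part of the patch): a small sketch of the linearity
# argument behind MoveScalarMulPastConv: convolution commutes with scalar
# multiplication, so Mul -> Conv can be reordered to Conv -> Mul. (Moving an
# Add past a Conv is only exact under the padding conditions the transform
# itself checks; this sketch covers the Mul case only.)
import numpy as np

x = np.random.rand(16).astype(np.float32)
w = np.random.rand(3).astype(np.float32)
s = np.float32(0.5)
lhs = np.convolve(s * x, w, mode="valid")   # Mul before Conv
rhs = s * np.convolve(x, w, mode="valid")   # Conv before Mul
assert np.allclose(lhs, rhs)
# --- end editor's note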
import pytest import numpy as np import onnx.helper as oh from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import MoveAddPastConv, MoveScalarMulPastConv +@pytest.mark.streamline @pytest.mark.parametrize("padding", [False, True]) @pytest.mark.parametrize( "test_args", @@ -90,6 +118,7 @@ def test_move_scalar_past_conv(test_args, padding): assert new_model.graph.node[2].op_type == scalar_op +@pytest.mark.streamline @pytest.mark.parametrize( "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())], diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py index b15f84303b..6fdaaadfae 100644 --- a/tests/transformation/streamline/test_move_scalar_past_matmul.py +++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py @@ -31,16 +31,17 @@ import numpy as np import onnx.helper as oh from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as ox -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import ( MoveScalarAddPastMatMul, MoveScalarMulPastMatMul, ) +@pytest.mark.streamline def test_move_scalar_mul_past_matmul(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2]) mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 1]) @@ -72,6 +73,7 @@ def test_move_scalar_mul_past_matmul(): assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0] +@pytest.mark.streamline def test_move_scalar_add_past_matmul(): top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2]) add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [1, 1]) @@ -103,6 +105,7 @@ def test_move_scalar_add_past_matmul(): assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0] +@pytest.mark.streamline @pytest.mark.parametrize( "test_args", [("Add", MoveScalarAddPastMatMul()), ("Mul", MoveScalarMulPastMatMul())], diff --git a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py index 9110ede98d..9662ba8a90 100644 --- a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py +++ b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py @@ -1,18 +1,47 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import pytest import numpy as np +import qonnx.core.data_layout as DataLayout from onnx import TensorProto, helper +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes -import finn.core.data_layout as DataLayout import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.reorder import MoveTransposePastScalarMul +@pytest.mark.streamline # permutation of transpose node @pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]]) # scalar mul diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 2e57f1c85f..1ec5f02e87 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -26,21 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
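# --- Editor's note (not part of the patch): a hedged sketch of the property
# that RoundAndClipThresholds appears to rely on: for an integer-typed input x,
# the comparison x >= t gives the same result as x >= ceil(t), so fractional
# thresholds can be rounded without changing MultiThreshold outputs.
import numpy as np

x = np.arange(-4, 5)                 # integer-valued inputs
t = np.float32(1.3)
assert ((x >= t) == (x >= np.ceil(t))).all()
# --- end editor's note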
+import pytest + import numpy as np from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper import finn.core.onnx_exec as oxe -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper from finn.transformation.streamline import RoundAndClipThresholds +@pytest.mark.streamline def test_round_thresholds(): v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4]) thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1]) out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4]) node_def = helper.make_node( - "MultiThreshold", ["v", "thresholds"], ["out"], domain="finn.custom_op.general" + "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" ) graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out]) model_def = helper.make_model(graph_def) diff --git a/tests/transformation/streamline/test_sign_to_thres.py b/tests/transformation/streamline/test_sign_to_thres.py index 2ffb5713c0..839680bd7a 100644 --- a/tests/transformation/streamline/test_sign_to_thres.py +++ b/tests/transformation/streamline/test_sign_to_thres.py @@ -26,22 +26,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import brevitas.onnx as bo import onnx import onnx.numpy_helper as nph import os from pkgutil import get_data +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import ConvertSignToThres from finn.util.test import get_test_model_trained export_onnx_path = "test_sign_to_thres.onnx" +@pytest.mark.streamline def test_sign_to_thres(): lfc = get_test_model_trained("LFC", 1, 1) bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) @@ -51,7 +54,7 @@ def test_sign_to_thres(): new_model = model.transform(ConvertSignToThres()) assert new_model.graph.node[3].op_type == "MultiThreshold" # load one of the test vectors - raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") + raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) input_dict = {"0": nph.to_array(input_tensor)} assert oxe.compare_execution(model, new_model, input_dict) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index ed25953303..6a82925012 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -32,17 +32,17 @@ import brevitas.onnx as bo import numpy as np - -import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import ( +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs, RemoveUnusedTensors, ) -from finn.transformation.infer_shapes import InferShapes 
+from qonnx.transformation.infer_shapes import InferShapes + +import finn.core.onnx_exec as oxe from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -50,6 +50,7 @@ export_onnx_path = make_build_dir("test_streamline_cnv_") +@pytest.mark.streamline # act bits @pytest.mark.parametrize("abits", [1, 2]) # weight bits diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index 3563b87c45..9000821435 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -33,17 +33,17 @@ import onnx import onnx.numpy_helper as nph from pkgutil import get_data - -import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import ( +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs, RemoveUnusedTensors, ) -from finn.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_shapes import InferShapes + +import finn.core.onnx_exec as oxe from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -51,6 +51,7 @@ export_onnx_path = make_build_dir("test_streamline_fc_") +@pytest.mark.streamline # act bits @pytest.mark.parametrize("abits", [1, 2]) # weight bits @@ -73,7 +74,7 @@ def test_streamline_fc(size, wbits, abits): model = model.transform(GiveReadableTensorNames()) model = model.transform(RemoveStaticGraphInputs()) # load one of the test vectors - raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") + raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) # run using FINN-based execution input_dict = {"global_in": nph.to_array(input_tensor)} diff --git a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py index 300ef85faa..fd4e37807c 100644 --- a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py +++ b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py @@ -28,23 +28,26 @@ import pkg_resources as pk +import pytest + import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph import os from pkgutil import get_data +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.infer_shapes import InferShapes import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.batchnorm_to_affine import BatchNormToAffine -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.infer_shapes import InferShapes from finn.util.test import get_test_model_trained export_onnx_path = "test_output_bn2affine.onnx" +@pytest.mark.transform def test_batchnorm_to_affine_cnv_w1a1(): lfc = get_test_model_trained("CNV", 1, 1) bo.export_finn_onnx(lfc, (1, 3, 32, 32), export_onnx_path) @@ -69,6 +72,7 @@ def test_batchnorm_to_affine_cnv_w1a1(): os.remove(export_onnx_path) 
+@pytest.mark.transform def test_batchnorm_to_affine_lfc_w1a1(): lfc = get_test_model_trained("LFC", 1, 1) bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) @@ -77,7 +81,7 @@ def test_batchnorm_to_affine_lfc_w1a1(): model = model.transform(FoldConstants()) new_model = model.transform(BatchNormToAffine()) # load one of the test vectors - raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") + raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) input_dict = {"0": nph.to_array(input_tensor)} assert oxe.compare_execution(model, new_model, input_dict) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 10bc687d13..952ce306a4 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -26,19 +26,21 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import brevitas.onnx as bo import os +import qonnx.core.data_layout as DataLayout +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -import finn.core.data_layout as DataLayout import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts -from finn.transformation.infer_shapes import InferShapes -from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC from finn.util.test import get_test_model_trained @@ -46,6 +48,7 @@ export_onnx_path_cnv = "test_infer_data_layouts.onnx" +@pytest.mark.transform def test_infer_data_layouts_cnv(): cnv = get_test_model_trained("CNV", 1, 1) bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) @@ -87,8 +90,8 @@ def test_infer_data_layouts_cnv(): model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) - model = model.transform(to_hls.InferBinaryStreamingFCLayer()) - model = model.transform(to_hls.InferQuantizedStreamingFCLayer()) + model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hls.InferConvInpGen()) model = model.transform(to_hls.InferStreamingMaxPool()) model = model.transform(GiveUniqueNodeNames()) @@ -103,9 +106,9 @@ def test_infer_data_layouts_cnv(): assert ( model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC ) - assert 
model.get_tensor_layout("StreamingFCLayer_Batch_3_out0") == DataLayout.NHWC + assert model.get_tensor_layout("MatrixVectorActivation_3_out0") == DataLayout.NHWC assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC - assert model.get_tensor_layout("StreamingFCLayer_Batch_6_out0") == DataLayout.NC + assert model.get_tensor_layout("MatrixVectorActivation_6_out0") == DataLayout.NC assert model.get_tensor_layout("global_out") == DataLayout.NC os.remove(export_onnx_path_cnv) diff --git a/tests/transformation/test_infer_datatypes_lfc.py b/tests/transformation/test_infer_datatypes_lfc.py index 8883dac7a5..9798005349 100644 --- a/tests/transformation/test_infer_datatypes_lfc.py +++ b/tests/transformation/test_infer_datatypes_lfc.py @@ -26,20 +26,23 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + import brevitas.onnx as bo import os +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes -from finn.core.datatype import DataType -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.infer_shapes import InferShapes from finn.util.test import get_test_model_trained export_onnx_path = "test_infer_datatypes.onnx" +@pytest.mark.transform def test_infer_datatypes_lfc(): lfc = get_test_model_trained("LFC", 1, 1) bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py index df7d63e3d2..43055f6704 100644 --- a/tests/transformation/test_qonnx_to_finn.py +++ b/tests/transformation/test_qonnx_to_finn.py @@ -38,14 +38,14 @@ import onnx.numpy_helper as nph import torch from pkgutil import get_data +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs +from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup from tempfile import TemporaryDirectory import finn.core.onnx_exec as oxe -from finn.core.modelwrapper import ModelWrapper -from finn.transformation.fold_constants import FoldConstants -from finn.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs -from finn.transformation.infer_shapes import InferShapes from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.test import get_test_model_trained @@ -53,7 +53,7 @@ def get_brev_model_and_sample_inputs(model_name, wbits, abits): if "FC" in model_name: in_shape = (1, 1, 28, 28) - raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") + raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) input_tensor = nph.to_array(input_tensor) brev_model = get_test_model_trained(model_name, wbits, abits) @@ -88,6 +88,7 @@ def analysis_testing_for_no_quant_nodes(model): return 
dict() +@pytest.mark.transform # This test currently takes about 4 min and 20 seconds @pytest.mark.parametrize("abits", [1, 2]) @pytest.mark.parametrize("wbits", [1, 2]) @@ -133,7 +134,7 @@ def test_QONNX_to_FINN(model_name, wbits, abits): ).all(), "The output of the Brevitas model and the FINN model should match." # Get the equivalent QONNX model - b_onnx.function.DOMAIN_STRING = "finn.custom_op.general" + b_onnx.function.DOMAIN_STRING = "qonnx.custom_op.general" _ = b_onnx.manager.BrevitasONNXManager.export( brev_model, in_shape, qonnx_base_path.format("raw") ) diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index de1b3abcc3..cdf69aebdd 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -39,7 +39,8 @@ @pytest.mark.slow @pytest.mark.vivado -def test_build_dataflow_directory(): +@pytest.mark.end2end +def test_end2end_build_dataflow_directory(): test_dir = make_build_dir("test_build_dataflow_directory_") target_dir = test_dir + "/build_dataflow" example_data_dir = pk.resource_filename("finn.qnn-data", "build_dataflow/") diff --git a/tests/util/test_create.py b/tests/util/test_create.py index c11e60175e..dc44e4bd45 100644 --- a/tests/util/test_create.py +++ b/tests/util/test_create.py @@ -28,10 +28,12 @@ import pytest +from qonnx.core.datatype import DataType + import finn.util.create as create -from finn.core.datatype import DataType +@pytest.mark.util @pytest.mark.parametrize( "bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]] ) diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing_hls.py index 7113a3051b..859b926543 100644 --- a/tests/util/test_data_packing_hls.py +++ b/tests/util/test_data_packing_hls.py @@ -32,12 +32,14 @@ import os import shutil import subprocess +from qonnx.core.datatype import DataType +from qonnx.util.basic import gen_finn_dt_tensor -import finn.util.basic as cutil -from finn.core.datatype import DataType +from finn.util.basic import make_build_dir from finn.util.data_packing import numpy_to_hls_code +@pytest.mark.util @pytest.mark.parametrize( "dtype", [ @@ -51,8 +53,8 @@ @pytest.mark.parametrize("test_shape", [(1, 2, 4), (1, 1, 64), (2, 64)]) @pytest.mark.vivado def test_npy2apintstream(test_shape, dtype): - ndarray = cutil.gen_finn_dt_tensor(dtype, test_shape) - test_dir = cutil.make_build_dir(prefix="test_npy2apintstream_") + ndarray = gen_finn_dt_tensor(dtype, test_shape) + test_dir = make_build_dir(prefix="test_npy2apintstream_") shape = ndarray.shape elem_bits = dtype.bitwidth() packed_bits = shape[-1] * elem_bits @@ -96,10 +98,10 @@ def test_npy2apintstream(test_shape, dtype): with open(test_dir + "/test.cpp", "w") as f: f.write("\n".join(test_app_string)) cmd_compile = """ -g++ -o test_npy2apintstream test.cpp /workspace/cnpy/cnpy.cpp \ --I/workspace/cnpy/ -I{}/include -I/workspace/finn/src/finn/qnn-data/cpp \ +g++ -o test_npy2apintstream test.cpp $FINN_ROOT/deps/cnpy/cnpy.cpp \ +-I$FINN_ROOT/deps/cnpy/ -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ --std=c++11 -lz""".format( - os.environ["VIVADO_PATH"] + os.environ["HLS_PATH"] ) with open(test_dir + "/compile.sh", "w") as f: f.write(cmd_compile) @@ -123,6 +125,7 @@ def test_npy2apintstream(test_shape, dtype): assert success +@pytest.mark.util def test_numpy_to_hls_code(): def remove_all_whitespace(s): return "".join(s.split()) diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md new file mode 100644 index 0000000000..63ca6ac832 --- /dev/null +++ 
b/tutorials/fpga_flow/README.md
@@ -0,0 +1,119 @@
+# FINN Example FPGA Flow Using MNIST Numerals
+
+This example demonstrates how to bring a FINN compiled model into the Vivado FPGA design environment for integration into a larger FPGA application. It extends the command-line [build_dataflow](https://github.com/Xilinx/finn/tree/master/src/finn/qnn-data/build_dataflow) flow using a model that was quantized with [Brevitas](https://github.com/Xilinx/brevitas) down to single-bit weight/activation precision to classify hand-written numerals from the MNIST data set.
+
+If you are new to the command-line flow, more information can be found [here](https://finn.readthedocs.io/en/latest/command_line.html).
+
+This demo was created using Vivado 2020.1.
+
+## Compiling the Model in FINN
+
+### Configuration
+
+`build.py` assembles the needed files and configures how the model is compiled when generating the "stitched IP". The following items will need to be set appropriately for specific use cases (a trimmed excerpt of the resulting configuration is shown after this list):
+- `output_dir`: defines the directory to be created for FINN compiler output.
+- `target_fps`: the desired throughput target for the FINN compiler to achieve.
+- `mvau_wwidth_max`: _an optional parameter_ ([described here](https://finn.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig.mvau_wwidth_max)) shown only to illustrate passing additional configuration items to the compiler.
+- `folding_config_file`: an optional parameter to pass a json file defining the layer optimizations (PE, SIMD, ram_style, etc.) to the compiler.
+- `synth_clk_period_ns`: sets the desired clock period in ns.
+- `fpga_part`: configures the IP for the target device that the stitched IP will be implemented in. It should be the full part string recognized by Vivado, e.g. `xczu3eg-sbva484-1-e`.
+- `generate_outputs`: for integration purposes, the only output needed is `STITCHED_IP`. You might also find the `ESTIMATE_REPORTS` interesting. Other options are documented [here](https://finn.readthedocs.io/en/latest/command_line.html#generated-outputs) and some of them (namely OOC_SYNTH, BITFILE) add substantial runtime and are not needed for this flow.
+- `stitched_ip_gen_dcp`: will generate an IP block with a synthesized design checkpoint (.dcp), which makes the design more portable across different machines, but will add some runtime.
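+
+These items map onto fields of the `DataflowBuildConfig` constructed in `build.py` (the full file appears later in this tutorial); a trimmed excerpt:
+
+```
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+
+cfg = build.DataflowBuildConfig(
+    output_dir="output_tfc_w1a1_fpga",
+    synth_clk_period_ns=10.0,
+    folding_config_file="folding_config.json",
+    fpga_part="xczu3eg-sbva484-1-e",
+    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+    stitched_ip_gen_dcp=False,
+    generate_outputs=[build_cfg.DataflowOutputType.STITCHED_IP],
+)
+```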
+
+
+### Running FINN Compiler
+
+Prior to running, ensure the following prerequisites have been met:
+- Install FINN and prerequisites. The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this.
+- Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install. For example:
+> export FINN_XILINX_PATH=/opt/Xilinx
+> export FINN_XILINX_VERSION=2020.1
+- Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo):
+> export FINN_ROOT=/home/foo/finn
+
+Then, change to the `finn` install directory and invoke the build as follows:
+> cd ${FINN_ROOT}
+> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/
+
+Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container:
+> cd ${FINN_ROOT}
+> ./run-docker.sh
+> cd tutorials/fpga_flow
+> python build.py
+
+The build should finish in about 10 minutes, and the FINN Docker container will close on success.
+
+```
+ ...
+    Running step: step_create_stitched_ip [12/18]
+    Running step: step_measure_rtlsim_performance [13/18]
+    Running step: step_out_of_context_synthesis [14/18]
+    Running step: step_synthesize_bitfile [15/18]
+    Running step: step_make_pynq_driver [16/18]
+    Running step: step_deployment_package [17/18]
+    Running step: custom_step_gen_tb_and_io [18/18]
+    Completed successfully
+    The program finished and will be restarted
+```
+
+
+### Examine the Stitched IP
+
+Navigate to the stitched IP project directory:
+
+> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w1a1_fpga/stitched_ip
+
+Then open the project:
+
+> vivado finn_vivado_stitch_proj.xpr
+
+Explore the IPI board design and note the interfaces.
+
+
+### Simulating the Stitched IP with a Verilog Test Bench
+
+You may have noticed that the final build step invoked by FINN is `custom_step_gen_tb_and_io`.
+This custom step generates the files we'll need to simulate the FINN design in Vivado, and places
+them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w1a1_fpga/sim`. Let's examine these files.
+
+* `input.dat` and `expected_output.dat`: text files containing hex data for the sample input and its expected
+  output. These are generated from the `input.npy` and `expected_output.npy` files by the FINN compiler.
+  Notice how the structure of the .dat files reflects the parallelization parameters of the first (for input)
+  and last (for output) layers of the hardware. The input is fed 49 bytes at a time, over 16 cycles, to finish
+  a sample of 28x28=784 bytes from the MNIST dataset. Note how this matches PE=49 as selected for the first layer in `folding_config.json`. Additionally, note the reversal along each line in the .dat file, which aligns the
+  byte order with what the FINN-generated hardware expects (see the sketch after this list).
+
+* `finn_testbench.sv`: created by the FINN compiler by filling in a testbench template (under `templates/finn_testbench.template.sv`) with
+  the sizes of the input/output streams, the folding factors, and the number of samples in the generated .dat files.
+
+* `make_sim_proj.tcl`: created by the FINN compiler by filling in a TCL script template (under `templates/make_sim_proj.template.tcl`).
+  Used for launching the testbench simulation.
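+
+To see how one of these hex lines comes about, here is a small Python sketch (illustrative only; it assumes an 8-bit input datatype, matching the byte counts above) that uses the same `pack_innermost_dim_as_hex_string` helper that `build.py` calls:
+
+```
+import numpy as np
+from qonnx.core.datatype import DataType
+
+import finn.util.data_packing as dpk
+
+# one MNIST sample: 784 pixels folded into 16 stream beats of 49 elements each
+x = (np.arange(784) % 256).astype(np.float32).reshape(1, 16, 49)
+# pack each 49-element beat into a single hex line of 49*8 = 392 bits;
+# reverse_inner=True reverses the elements within each line, which is the
+# byte order the FINN-generated stream interface expects
+beats = dpk.pack_innermost_dim_as_hex_string(
+    x, DataType["UINT8"], 49 * 8, prefix="", reverse_inner=True
+)
+print(beats.shape)  # (1, 16): 16 hex lines per sample, as in input.dat
+```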
+You can now launch the simulation as follows:
+
+> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w1a1_fpga/sim
+> vivado -mode gui -source make_sim_proj.tcl
+
+The simulation should complete with:
+
+```
+ # run all
+CHK: Data match 02 == 02 --> 0
+
+************************************************************
+  SIM COMPLETE
+  Validated 1 data points
+  Total error count: ====> 0 <====
+```
+
+You can also use the provided testbench skeleton and the custom step in `build.py` to build your own
+testbench generators.
+
+
+#### Instantiation in Mission Design
+
+There are any number of ways to bring the stitched IP into a larger design.
+
+FINN already packages the stitched IP block design as a standalone IP-XACT component, which you can find under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w1a1_fpga/stitched_ip/ip`. You can add this to your list of IP repos and use it in your own Vivado designs. A good reference for this is [UG1119](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2020_1/ug1119-vivado-creating-packaging-ip-tutorial.pdf).
+
+Keep in mind that all of the user IP repos included in the stitched IP project (from `$FINN_HOST_BUILD_DIR`, which is normally located under `/tmp/finn_dev_`) also need to be brought in as IP repos to any project using the stitched IP. It would be prudent to copy those IP repos to an appropriate archive location. You should also set the
+`FINN_ROOT` environment variable to point to the compiler installation directory, as some of the build scripts will
+use this to access various components. Alternatively, if you don't want to copy all of the dependencies, you can ask FINN to generate the IP-XACT component with a synthesized .dcp checkpoint by passing the [stitched_ip_gen_dcp=True](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.DataflowBuildConfig.stitched_ip_gen_dcp) option as part of the build configuration.
diff --git a/tutorials/fpga_flow/build.py b/tutorials/fpga_flow/build.py
new file mode 100644
index 0000000000..8b50a31144
--- /dev/null
+++ b/tutorials/fpga_flow/build.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+# This file is intended to serve as an example showing how to set up custom builds
+# using FINN.
The custom build can be launched like this: +# ./run-docker.sh build_custom /path/to/folder + + +import numpy as np +import os +from qonnx.custom_op.registry import getCustomOp + +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +import finn.util.data_packing as dpk + +model_name = "tfc_w1a1" +platform_name = "fpga" + + +def custom_step_gen_tb_and_io(model, cfg): + sim_output_dir = cfg.output_dir + "/sim" + os.makedirs(sim_output_dir, exist_ok=True) + # load the provided input data + inp_data = np.load("input.npy") + batchsize = inp_data.shape[0] + # permute input image from NCHW -> NHWC format (needed by FINN) + # this example (MNIST) only has 1 channel, which means this doesn't + # really do anything in terms of data layout changes, but provided for + # completeness + inp_data = np.transpose(inp_data, (0, 2, 3, 1)) + # this network is an MLP and takes in flattened input + inp_data = inp_data.reshape(batchsize, -1) + # query the parallelism-dependent folded input shape from the + # node consuming the graph input + inp_name = model.graph.input[0].name + inp_node = getCustomOp(model.find_consumer(inp_name)) + inp_shape_folded = list(inp_node.get_folded_input_shape()) + inp_stream_width = inp_node.get_instream_width_padded() + # fix first dimension (N: batch size) to correspond to input data + # since FINN model itself always uses N=1 + inp_shape_folded[0] = batchsize + inp_shape_folded = tuple(inp_shape_folded) + inp_dtype = model.get_tensor_datatype(inp_name) + # now re-shape input data into the folded shape and do hex packing + inp_data = inp_data.reshape(inp_shape_folded) + inp_data_packed = dpk.pack_innermost_dim_as_hex_string( + inp_data, inp_dtype, inp_stream_width, prefix="", reverse_inner=True + ) + np.savetxt(sim_output_dir + "/input.dat", inp_data_packed, fmt="%s", delimiter="\n") + # load expected output and calculate folded shape + exp_out = np.load("expected_output.npy") + out_name = model.graph.output[0].name + out_node = getCustomOp(model.find_producer(out_name)) + out_shape_folded = list(out_node.get_folded_output_shape()) + out_stream_width = out_node.get_outstream_width_padded() + out_shape_folded[0] = batchsize + out_shape_folded = tuple(out_shape_folded) + out_dtype = model.get_tensor_datatype(out_name) + exp_out = exp_out.reshape(out_shape_folded) + out_data_packed = dpk.pack_innermost_dim_as_hex_string( + exp_out, out_dtype, out_stream_width, prefix="", reverse_inner=True + ) + np.savetxt( + sim_output_dir + "/expected_output.dat", + out_data_packed, + fmt="%s", + delimiter="\n", + ) + # fill in testbench template + with open("templates/finn_testbench.template.sv", "r") as f: + testbench_sv = f.read() + testbench_sv = testbench_sv.replace("@N_SAMPLES@", str(batchsize)) + testbench_sv = testbench_sv.replace("@IN_STREAM_BITWIDTH@", str(inp_stream_width)) + testbench_sv = testbench_sv.replace("@OUT_STREAM_BITWIDTH@", str(out_stream_width)) + testbench_sv = testbench_sv.replace( + "@IN_BEATS_PER_SAMPLE@", str(np.prod(inp_shape_folded[:-1])) + ) + testbench_sv = testbench_sv.replace( + "@OUT_BEATS_PER_SAMPLE@", str(np.prod(out_shape_folded[:-1])) + ) + testbench_sv = testbench_sv.replace("@TIMEOUT_CYCLES@", "1000") + with open(sim_output_dir + "/finn_testbench.sv", "w") as f: + f.write(testbench_sv) + # fill in testbench project creator template + with open("templates/make_sim_proj.template.tcl", "r") as f: + testbench_tcl = f.read() + testbench_tcl = testbench_tcl.replace("@STITCHED_IP_ROOT@", "../stitched_ip") + with 
open(sim_output_dir + "/make_sim_proj.tcl", "w") as f: + f.write(testbench_tcl) + + return model + + +build_steps = build_cfg.default_build_dataflow_steps + [custom_step_gen_tb_and_io] + + +cfg = build.DataflowBuildConfig( + steps=build_steps, + board=platform_name, + output_dir="output_%s_%s" % (model_name, platform_name), + synth_clk_period_ns=10.0, + folding_config_file="folding_config.json", + fpga_part="xczu3eg-sbva484-1-e", + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, + stitched_ip_gen_dcp=False, + generate_outputs=[ + build_cfg.DataflowOutputType.STITCHED_IP, + ], + verify_steps=[ + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + build_cfg.VerificationStepType.STITCHED_IP_RTLSIM, + ], + save_intermediate_models=True, +) +model_file = "model.onnx" +build.build_dataflow_cfg(model_file, cfg) diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json new file mode 100644 index 0000000000..642200d02b --- /dev/null +++ b/tutorials/fpga_flow/folding_config.json @@ -0,0 +1,30 @@ +{ + "Defaults": {}, + "Thresholding_Batch_0": { + "PE": 49, + "ram_style": "block" + }, + "MatrixVectorActivation_0": { + "PE": 16, + "SIMD": 49, + "ram_style": "block" + }, + "MatrixVectorActivation_1": { + "PE": 8, + "SIMD": 8, + "ram_style": "auto" + }, + "MatrixVectorActivation_2": { + "PE": 8, + "SIMD": 8, + "ram_style": "auto" + }, + "MatrixVectorActivation_3": { + "PE": 10, + "SIMD": 8, + "ram_style": "distributed" + }, + "LabelSelect_Batch_0": { + "PE": 1 + } +} diff --git a/tutorials/fpga_flow/templates/finn_testbench.template.sv b/tutorials/fpga_flow/templates/finn_testbench.template.sv new file mode 100644 index 0000000000..0d8c08efd7 --- /dev/null +++ b/tutorials/fpga_flow/templates/finn_testbench.template.sv @@ -0,0 +1,173 @@ +// Copyright (c) 2022 Xilinx, Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of Xilinx nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
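+
+// NOTE: the @...@ placeholders below are filled in by the custom_step_gen_tb_and_io
+// step in build.py, which also generates the input.dat / expected_output.dat
+// hexfiles read by this testbench.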
+
+// `timescale 1 ns / 1 ps
+`define INPUT_HEXFILE "input.dat"
+`define EXPECTED_OUTPUT_HEXFILE "expected_output.dat"
+
+// general FINN testbench parameters
+parameter N_SAMPLES = @N_SAMPLES@;
+parameter IN_STREAM_BITWIDTH = @IN_STREAM_BITWIDTH@;
+parameter OUT_STREAM_BITWIDTH = @OUT_STREAM_BITWIDTH@;
+parameter IN_BEATS_PER_SAMPLE = @IN_BEATS_PER_SAMPLE@;
+parameter OUT_BEATS_PER_SAMPLE = @OUT_BEATS_PER_SAMPLE@;
+parameter TIMEOUT_CYCLES = @TIMEOUT_CYCLES@;
+
+parameter IN_SAMPLE_BITWIDTH = IN_STREAM_BITWIDTH * IN_BEATS_PER_SAMPLE;
+parameter OUT_SAMPLE_BITWIDTH = OUT_STREAM_BITWIDTH * OUT_BEATS_PER_SAMPLE;
+
+module tb ();
+
+logic [IN_STREAM_BITWIDTH-1:0]  input_data [N_SAMPLES*IN_BEATS_PER_SAMPLE];
+logic [OUT_STREAM_BITWIDTH-1:0] exp_output_data [N_SAMPLES*OUT_BEATS_PER_SAMPLE];
+logic [IN_STREAM_BITWIDTH-1:0]  current_input [IN_BEATS_PER_SAMPLE];
+logic [$clog2(N_SAMPLES*OUT_BEATS_PER_SAMPLE):0] rd_ptr=0;
+logic [$clog2(N_SAMPLES*OUT_BEATS_PER_SAMPLE):0] wr_ptr=0;
+int err_count=0;
+int data_count=0;
+int i,j;
+logic [31:0] input_file_lines;
+logic [31:0] exp_output_file_lines;
+
+logic ap_clk = 0;
+logic ap_rst_n = 0;
+
+logic [OUT_STREAM_BITWIDTH-1:0] dout_tdata;
+logic dout_tlast;
+logic dout_tready;
+logic dout_tvalid;
+
+logic [IN_STREAM_BITWIDTH-1:0] din_tdata;
+logic din_tready;
+logic din_tvalid;
+
+finn_design_wrapper finn_design_wrapper (
+  .ap_clk          (ap_clk      ),
+  .ap_rst_n        (ap_rst_n    ),
+  // output stream
+  .m_axis_0_tdata  (dout_tdata  ),
+  .m_axis_0_tready (dout_tready ),
+  .m_axis_0_tvalid (dout_tvalid ),
+  // input stream
+  .s_axis_0_tdata  (din_tdata   ),
+  .s_axis_0_tready (din_tready  ),
+  .s_axis_0_tvalid (din_tvalid  )
+);
+
+always #5ns ap_clk = !ap_clk;
+
+// Stimulus process (a minimal sketch): count the hexfile lines, release
+// reset, stream the input beats into the DUT, then wait for all expected
+// outputs (or a timeout) before reporting the summary.
+initial begin
+  // read input hexfile
+  $readmemh(`INPUT_HEXFILE, input_data);
+  for (i=0; i<N_SAMPLES*IN_BEATS_PER_SAMPLE; i++)
+    if (input_data[i][0] !== 1'bx) input_file_lines = i+1;
+  // read expected output hexfile
+  $readmemh(`EXPECTED_OUTPUT_HEXFILE, exp_output_data);
+  for (i=0; i<N_SAMPLES*OUT_BEATS_PER_SAMPLE; i++)
+    if (exp_output_data[i][0] !== 1'bx) exp_output_file_lines = i+1;
+  if (input_file_lines === 'x || exp_output_file_lines === 'x) begin
+    $display("ERR: Could not read data files");
+    $finish;
+  end
+
+  // hold reset for a few cycles, then release it
+  din_tvalid  = 0;
+  dout_tready = 1;
+  repeat (5) @(posedge ap_clk);
+  ap_rst_n = 1;
+  repeat (5) @(posedge ap_clk);
+
+  // drive each input beat into the AXI stream, observing backpressure
+  for (i=0; i<N_SAMPLES*IN_BEATS_PER_SAMPLE; i++) begin
+    din_tdata  = input_data[i];
+    din_tvalid = 1;
+    do @(posedge ap_clk); while (!din_tready);
+  end
+  din_tvalid = 0;
+
+  // wait until every expected output beat has been checked, or time out
+  for (j=0; j<TIMEOUT_CYCLES; j++) begin
+    @(posedge ap_clk);
+    if (rd_ptr == N_SAMPLES*OUT_BEATS_PER_SAMPLE) break;
+  end
+
+  $display("\n************************************************************");
+  $display("  SIM COMPLETE");
+  $display("  Validated %0d data points", data_count);
+  $display("  Total error count: ====> %0d <====\n", err_count);
+  $finish;
+end
+
+// Check the result at each valid output from the model
+always @(posedge ap_clk) begin
+  if (dout_tvalid && ap_rst_n) begin
+    // TODO implement output folding - current code assumes OUT_BEATS_PER_SAMPLE=1
+    if (dout_tdata !== exp_output_data[rd_ptr]) begin
+      $display("ERR: Data mismatch %h != %h ", dout_tdata, exp_output_data[rd_ptr]);
+      err_count++;
+    end else begin
+      $display("CHK: Data match %h == %h --> %0d", dout_tdata, exp_output_data[rd_ptr], data_count);
+    end
+    rd_ptr++;
+    data_count++;
+  end
+end
+
+endmodule
diff --git a/tutorials/fpga_flow/templates/make_sim_proj.template.tcl b/tutorials/fpga_flow/templates/make_sim_proj.template.tcl
new file mode 100644
index 0000000000..9dae5a02a9
--- /dev/null
+++ b/tutorials/fpga_flow/templates/make_sim_proj.template.tcl
@@ -0,0 +1,39 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# open the stitched IP project to get all the sources
+open_project @STITCHED_IP_ROOT@/finn_vivado_stitch_proj.xpr
+
+read_verilog -quiet [glob -nocomplain *.v]
+read_verilog -quiet -sv [glob -nocomplain *.sv]
+
+save_project_as sim -force
+add_files -fileset sim_1 [glob *.dat]
+set_property top tb [get_filesets sim_1]
+launch_simulation -simset sim_1 -mode behavioral
+run all
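+
+# Note: launch_simulation opens a behavioral simulation session; the
+# "run all" above is executed in that session and runs until the
+# testbench calls $finish.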