OpenFPGA/openfpga_flow/benchmarks/vtr_benchmark/tpu.16x16.int8.v

5712 lines
219 KiB
Coq
Raw Normal View History

2021-03-17 16:24:26 -05:00
`timescale 1ns / 1ps
///////////////////////////////////
// Overview
///////////////////////////////////
//This design is based on the architecture from Google's TPU v1 [1]. At its heart,
//it uses a 16x16 matrix multiplication unit, instead of a 256x256 matrix multiplication
//unit used by the TPU. The design uses int8 precision. This systolic matrix multiplication
//unit is a output stationary unit, compared to weight stationary architecture used in the TPU.
//The activations are stored in RAM block A, whereas the weights are stored in RAM block B.
//Control and configuration are done through an APB interface, instead of a PCIe interface on
//the TPU. The normalization block applies the mean and variance values to the output of the
//matrix multiplication unit. Pooling unit supports 3 pooling windows - 1x1, 2x2 and 4x4.
//The activation unit supports two activation functions - rectified linear unit (ReLU) and
//the hyperbolic tangent (TanH). The activation unit is the last unit before the results
//are written back to RAM block A, from where they can be read again into the matrix
//multiplication unit for the next layer.
//
//[1] Jouppi et. al., In-Datacenter Performance Analysis of a Tensor Processing Unit, ISCA 2017
//////////////////////////////////////
// Module hierarchy
//////////////////////////////////////
// top (the top level design)
// |--- ram matrix_A (the RAM that stores matrix A (activations))
// |--- ram matrix_B (the RAM that stores matrix B (weights))
// |--- control u_control (the state machine that controls the operation)
// |--- cfg u_cfg (unit to configure/observe registers using an APB interface)
// |--- matmul_16x16_systolic u_matmul (systolic 16x16 matrix multiplication unit)
// | |--- output_logic (contains logic to shift out the outputs of matmul)
// | |--- systolic_data_setup (contains logic to shift in the inputs of the matmul)
// | |--- systolic_pe_matrix (16x16 matrix of processing elements)
// | |--- processing_element (one processing element)
// | |--- seq_mac (mac block inside each processing element)
// | |--- qmult (multiplier inside each mac)
// | |--- qadd (adder inside each mac)
// |--- norm u_norm (normalization block; applies mean and variance)
// |--- pool u_pool (block that performs pooling)
// |--- activation u_activation(block that applies activation - relu or tanh)
//////////////////////////////////////
// Tested architectures
//////////////////////////////////////
// This design has been tested with:
// 1. The VTR flagship 40nm architecture. Example: arch/timing/k6_frac_N10_frac_chain_mem32K_40nm.xml
// Properties of this design on this architecture:
// Critical path delay: 8.32 ns
// Clock frequency: 120.19 MHz
// Critical path: Includes the multiplier in the MAC in a PE and inter-CLB routing
// Logic area (used): 1.97532e+08 MWTAs
// Resource usage: 1556 LBs, 8 RAMs, 276 Multipliers
// Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 3200 sec
// 2. 22nm architectures generated from COFFE. Example: arch/COFFE_22nm/stratix10_arch.xml
// Properties of this design on this architecture:
// Critical path delay: 9.24 ns
// Clock frequency: 108.17 MHz
// Critical path: Includes the multiplier in the MAC in a PE and inter-CLB routing
// Logic area (used): 4.95598e+07 MWTAs
// Resource usage: 1477 LBs, 14 RAMs, 280 Multipliers
// Runtime (on Intel Xeon E5-2430 2.5GHz with single thread): 3400 sec
//////////////////////////////////////
// Parameters
//////////////////////////////////////
//The width of the data. This design uses int8 precision. So, DWIDTH is 8
//To change to a floating point 16 version, change this to 16 and also
//change the datapath components (like adder and multiplier) to be floating point.
`define DWIDTH 8
//This is the size of the matrix multiplier unit. In this design, we have a systolic
//matrix multiplication unit that can multiply 16x16 matrix with a 16x16 matrix.
`define DESIGN_SIZE 16
`define LOG2_DESIGN_SIZE 5
`define MAT_MUL_SIZE 16
`define MASK_WIDTH 16
`define LOG2_MAT_MUL_SIZE 5
//This it the size of the address bus, or the depth of the RAM. Each location of
//the RAM is DWIDTH * MAT_MUL_SIZE wide. So, in this design, we use a total of
//1024 * 16 bytes of memory (i.e. 16 KB).
`define AWIDTH 10
//This is the number of clock cycles spent in the mac block
`define NUM_CYCLES_IN_MAC 3
//This defines the latency of accessing data from a block ram
`define MEM_ACCESS_LATENCY 1
//Data width and address width of the APB interface for registers
`define REG_DATAWIDTH 32
`define REG_ADDRWIDTH 8
//Width of the stride for each column in the matrices (same as ram address width)
`define ADDR_STRIDE_WIDTH 16
//Number of bits to specify the pooling window. We support 3 sizes.
`define MAX_BITS_POOL 3
/////////////////////////////////////////////////
// Register specification
/////////////////////////////////////////////////
//---------------------------------------
//Addr 0 : Register with enables for various blocks.
//Includes mode of operation (convolution or fully_connected)
//---------------------------------------
`define REG_ENABLES_ADDR 32'h0
//Bit 0: enable_matmul
//Bit 1: enable_norm
//Bit 2: enable_pool
//Bit 3: enable_activation
//Bit 31: enable_conv_mode
//---------------------------------------
//Addr 4: Register that triggers the whole TPU
//---------------------------------------
`define REG_STDN_TPU_ADDR 32'h4
//Bit 0: start_tpu
//Bit 31: done_tpu
//---------------------------------------
//Addr 8: Register that stores the mean of the values
//---------------------------------------
`define REG_MEAN_ADDR 32'h8
//Bit 7:0: mean
//---------------------------------------
//Addr A: Register that stores the inverse variance of the values
//---------------------------------------
`define REG_INV_VAR_ADDR 32'hA
//Bit 7:0: inv_var
//---------------------------------------
//Addr E: Register that stores the starting address of matrix A in BRAM A.
//In fully-connected mode, this register should be programmed with the
//address of the matrix being currently multiplied. That is, the
//address of the matrix of the matmul. So, this register will be
//programmed every time the matmul is kicked off during accumulation stages.
//Use the STRIDE registers to tell the matmul to increment addresses.
//In convolution mode, this register should be programmed with the
//address of the input activation matrix. No need to configure
//this every time the matmul is kicked off for accmulation. Just program it
//once it the beginning. Address increments are handled automatically .
//---------------------------------------
`define REG_MATRIX_A_ADDR 32'he
//Bit `AWIDTH-1:0 address_mat_a
//---------------------------------------
//Addr 12: Register that stores the starting address of matrix B in BRAM B.
//See detailed note on the usage of this register in REG_MATRIX_A_ADDR.
//---------------------------------------
`define REG_MATRIX_B_ADDR 32'h12
//Bit `AWIDTH-1:0 address_mat_b
//---------------------------------------
//Addr 16: Register that stores the starting address of matrix C in BRAM C.
//See detailed note on the usage of this register in REG_MATRIX_A_ADDR.
//---------------------------------------
`define REG_MATRIX_C_ADDR 32'h16
//Bit `AWIDTH-1:0 address_mat_c
//---------------------------------------
//Addr 24: Register that controls the accumulation logic
//---------------------------------------
`define REG_ACCUM_ACTIONS_ADDR 32'h24
//Bit 0 save_output_to_accumulator
//Bit 1 add_accumulator_to_output
//---------------------------------------
//(Only applicable in fully-connected mode)
//Addr 28: Register that stores the stride that should be taken to address
//elements in matrix A, after every MAT_MUL_SIZE worth of data has been fetched.
//See the diagram in "Meeting-16" notes in the EE382V project Onenote notebook.
//This stride is applied when incrementing addresses for matrix A in the vertical
//direction.
//---------------------------------------
`define REG_MATRIX_A_STRIDE_ADDR 32'h28
//Bit `ADDR_STRIDE_WIDTH-1:0 address_stride_a
//---------------------------------------
//(Only applicable in fully-connected mode)
//Addr 32: Register that stores the stride that should be taken to address
//elements in matrix B, after every MAT_MUL_SIZE worth of data has been fetched.
//See the diagram in "Meeting-16" notes in the EE382V project Onenote notebook.
//This stride is applied when incrementing addresses for matrix B in the horizontal
//direction.
//---------------------------------------
`define REG_MATRIX_B_STRIDE_ADDR 32'h32
//Bit `ADDR_STRIDE_WIDTH-1:0 address_stride_b
//---------------------------------------
//(Only applicable in fully-connected mode)
//Addr 36: Register that stores the stride that should be taken to address
//elements in matrix C, after every MAT_MUL_SIZE worth of data has been fetched.
//See the diagram in "Meeting-16" notes in the EE382V project Onenote notebook.
//This stride is applied when incrementing addresses for matrix C in the vertical
//direction (this is generally same as address_stride_a).
//---------------------------------------
`define REG_MATRIX_C_STRIDE_ADDR 32'h36
//Bit `ADDR_STRIDE_WIDTH-1:0 address_stride_c
//---------------------------------------
//Addr 3A: Register that controls the activation block. Currently, the available
//settings are the selector of activation function that will be used. There are
//two options: ReLU and TanH. To use ReLU, clear the LSB of this register. To
//use TanH, set the LSB of this register.
//---------------------------------------
`define REG_ACTIVATION_CSR_ADDR 32'h3A
//---------------------------------------
//Addr 3E: Register defining pooling window size
//---------------------------------------
`define REG_POOL_WINDOW_ADDR 32'h3E
//Bit `MAX_BITS_POOL-1:0 pool window size
//---------------------------------------
//Addr 40: Register defining convolution parameters - 1
//----------------------------------------
`define REG_CONV_PARAMS_1_ADDR 32'h40
//Bits filter_height (R) 3:0
//Bits filter width (S) 7:4
//Bits stride_horizontal 11:8
//Bits stride_vertical 15:12
//Bits pad_left 19:16
//Bits pad_right 23:20
//Bits pad_top 27:24
//Bits pad_bottom 31:28
//---------------------------------------
//Addr 44: Register defining convolution parameters - 2
//----------------------------------------
`define REG_CONV_PARAMS_2_ADDR 32'h44
//Bits num_channels_input (C) 15:0
//Bits num_channels_output (K) 31:16
//---------------------------------------
//Addr 48: Register defining convolution parameters - 3
//----------------------------------------
`define REG_CONV_PARAMS_3_ADDR 32'h48
//Bits input_image_height (H) 15:0
//Bits input_image_width (W) 31:16
//---------------------------------------
//Addr 4C: Register defining convolution parameters - 4
//----------------------------------------
`define REG_CONV_PARAMS_4_ADDR 32'h4C
//Bits output_image_height (P) 15:0
//Bits output_image_width (Q) 31:16
//---------------------------------------
//Addr 50: Register defining batch size
//----------------------------------------
`define REG_BATCH_SIZE_ADDR 32'h50
//Bits 31:0 batch_size (number of images, N)
//---------------------------------------
//Addresses 54,58,5C: Registers that stores the mask of which parts of the matrices are valid.
//
//Some examples where this is useful:
//1. Input matrix is smaller than the matmul.
// Say we want to multiply a 6x6 using an 8x8 matmul.
// The matmul still operates on the whole 8x8 part, so we need
// to ensure that there are 0s in the BRAMs in the invalid parts.
// But the mask is used by the blocks other than matmul. For ex,
// norm block will use the mask to avoid applying mean and variance
// to invalid parts (so tha they stay 0).
//2. When we start with large matrices, the size of the matrices can
// reduce to something less than the matmul size because of pooling.
// In that case for the next layer, we need to tell blocks like norm,
// what is valid and what is not.
//
//Note: This masks is applied to both x and y directions and also
//applied to both input matrices - A and B.
//---------------------------------------
`define REG_VALID_MASK_A_ROWS_ADDR 32'h20
`define REG_VALID_MASK_A_COLS_ADDR 32'h54
`define REG_VALID_MASK_B_ROWS_ADDR 32'h5c
`define REG_VALID_MASK_B_COLS_ADDR 32'h58
//Bit `MASK_WIDTH-1:0 validity_mask
//This used to be a normal signal, but changing it to a `define.
//That's because it's not required to be a variable in this design.
//And ODIN doesn't seem to propagate constants properly.
`define final_mat_mul_size 16
/////////////////////////////////////
// Matrix multiplication unit
////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
// Company:
// Engineer:
//
// Create Date: 2020-09-27 21:12:45.762386
// Design Name:
// Module Name: matmul_16x16_systolic
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//////////////////////////////////////////////////////////////////////////////////
module matmul_16x16_systolic(
clk,
reset,
pe_reset,
start_mat_mul,
done_mat_mul,
address_mat_a,
address_mat_b,
address_mat_c,
address_stride_a,
address_stride_b,
address_stride_c,
a_data,
b_data,
a_data_in, //Data values coming in from previous matmul - systolic connections
b_data_in,
c_data_in, //Data values coming in from previous matmul - systolic shifting
c_data_out, //Data values going out to next matmul - systolic shifting
a_data_out,
b_data_out,
a_addr,
b_addr,
c_addr,
c_data_available,
validity_mask_a_rows,
validity_mask_a_cols,
validity_mask_b_rows,
validity_mask_b_cols,
final_mat_mul_size,
a_loc,
b_loc
);
input clk;
input reset;
input pe_reset;
input start_mat_mul;
output done_mat_mul;
input [`AWIDTH-1:0] address_mat_a;
input [`AWIDTH-1:0] address_mat_b;
input [`AWIDTH-1:0] address_mat_c;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_a;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_b;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_c;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] a_data;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] b_data;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] a_data_in;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] b_data_in;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] c_data_in;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] c_data_out;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] a_data_out;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] b_data_out;
output [`AWIDTH-1:0] a_addr;
output [`AWIDTH-1:0] b_addr;
output [`AWIDTH-1:0] c_addr;
output c_data_available;
input [`MASK_WIDTH-1:0] validity_mask_a_rows;
input [`MASK_WIDTH-1:0] validity_mask_a_cols;
input [`MASK_WIDTH-1:0] validity_mask_b_rows;
input [`MASK_WIDTH-1:0] validity_mask_b_cols;
//7:0 is okay here. We aren't going to make a matmul larger than 128x128
//In fact, these will get optimized out by the synthesis tool, because
//we hardcode them at the instantiation level.
input [7:0] final_mat_mul_size;
input [7:0] a_loc;
input [7:0] b_loc;
//////////////////////////////////////////////////////////////////////////
// Logic for clock counting and when to assert done
//////////////////////////////////////////////////////////////////////////
reg done_mat_mul;
//This is 7 bits because the expectation is that clock count will be pretty
//small. For large matmuls, this will need to increased to have more bits.
//In general, a systolic multiplier takes 4*N-2+P cycles, where N is the size
//of the matmul and P is the number of pipleine stages in the MAC block.
reg [7:0] clk_cnt;
//Finding out number of cycles to assert matmul done.
//When we have to save the outputs to accumulators, then we don't need to
//shift out data. So, we can assert done_mat_mul early.
//In the normal case, we have to include the time to shift out the results.
//Note: the count expression used to contain "4*final_mat_mul_size", but
//to avoid multiplication, we now use "final_mat_mul_size<<2"
wire [7:0] clk_cnt_for_done;
assign clk_cnt_for_done =
((final_mat_mul_size<<2) - 2 + `NUM_CYCLES_IN_MAC);
always @(posedge clk) begin
if (reset || ~start_mat_mul) begin
clk_cnt <= 0;
done_mat_mul <= 0;
end
else if (clk_cnt == clk_cnt_for_done) begin
done_mat_mul <= 1;
clk_cnt <= clk_cnt + 1;
end
else if (done_mat_mul == 0) begin
clk_cnt <= clk_cnt + 1;
end
else begin
done_mat_mul <= 0;
clk_cnt <= clk_cnt + 1;
end
end
wire [`DWIDTH-1:0] a0_data;
wire [`DWIDTH-1:0] a1_data;
wire [`DWIDTH-1:0] a2_data;
wire [`DWIDTH-1:0] a3_data;
wire [`DWIDTH-1:0] a4_data;
wire [`DWIDTH-1:0] a5_data;
wire [`DWIDTH-1:0] a6_data;
wire [`DWIDTH-1:0] a7_data;
wire [`DWIDTH-1:0] a8_data;
wire [`DWIDTH-1:0] a9_data;
wire [`DWIDTH-1:0] a10_data;
wire [`DWIDTH-1:0] a11_data;
wire [`DWIDTH-1:0] a12_data;
wire [`DWIDTH-1:0] a13_data;
wire [`DWIDTH-1:0] a14_data;
wire [`DWIDTH-1:0] a15_data;
wire [`DWIDTH-1:0] b0_data;
wire [`DWIDTH-1:0] b1_data;
wire [`DWIDTH-1:0] b2_data;
wire [`DWIDTH-1:0] b3_data;
wire [`DWIDTH-1:0] b4_data;
wire [`DWIDTH-1:0] b5_data;
wire [`DWIDTH-1:0] b6_data;
wire [`DWIDTH-1:0] b7_data;
wire [`DWIDTH-1:0] b8_data;
wire [`DWIDTH-1:0] b9_data;
wire [`DWIDTH-1:0] b10_data;
wire [`DWIDTH-1:0] b11_data;
wire [`DWIDTH-1:0] b12_data;
wire [`DWIDTH-1:0] b13_data;
wire [`DWIDTH-1:0] b14_data;
wire [`DWIDTH-1:0] b15_data;
wire [`DWIDTH-1:0] a1_data_delayed_1;
wire [`DWIDTH-1:0] a2_data_delayed_1;
wire [`DWIDTH-1:0] a2_data_delayed_2;
wire [`DWIDTH-1:0] a3_data_delayed_1;
wire [`DWIDTH-1:0] a3_data_delayed_2;
wire [`DWIDTH-1:0] a3_data_delayed_3;
wire [`DWIDTH-1:0] a4_data_delayed_1;
wire [`DWIDTH-1:0] a4_data_delayed_2;
wire [`DWIDTH-1:0] a4_data_delayed_3;
wire [`DWIDTH-1:0] a4_data_delayed_4;
wire [`DWIDTH-1:0] a5_data_delayed_1;
wire [`DWIDTH-1:0] a5_data_delayed_2;
wire [`DWIDTH-1:0] a5_data_delayed_3;
wire [`DWIDTH-1:0] a5_data_delayed_4;
wire [`DWIDTH-1:0] a5_data_delayed_5;
wire [`DWIDTH-1:0] a6_data_delayed_1;
wire [`DWIDTH-1:0] a6_data_delayed_2;
wire [`DWIDTH-1:0] a6_data_delayed_3;
wire [`DWIDTH-1:0] a6_data_delayed_4;
wire [`DWIDTH-1:0] a6_data_delayed_5;
wire [`DWIDTH-1:0] a6_data_delayed_6;
wire [`DWIDTH-1:0] a7_data_delayed_1;
wire [`DWIDTH-1:0] a7_data_delayed_2;
wire [`DWIDTH-1:0] a7_data_delayed_3;
wire [`DWIDTH-1:0] a7_data_delayed_4;
wire [`DWIDTH-1:0] a7_data_delayed_5;
wire [`DWIDTH-1:0] a7_data_delayed_6;
wire [`DWIDTH-1:0] a7_data_delayed_7;
wire [`DWIDTH-1:0] a8_data_delayed_1;
wire [`DWIDTH-1:0] a8_data_delayed_2;
wire [`DWIDTH-1:0] a8_data_delayed_3;
wire [`DWIDTH-1:0] a8_data_delayed_4;
wire [`DWIDTH-1:0] a8_data_delayed_5;
wire [`DWIDTH-1:0] a8_data_delayed_6;
wire [`DWIDTH-1:0] a8_data_delayed_7;
wire [`DWIDTH-1:0] a8_data_delayed_8;
wire [`DWIDTH-1:0] a9_data_delayed_1;
wire [`DWIDTH-1:0] a9_data_delayed_2;
wire [`DWIDTH-1:0] a9_data_delayed_3;
wire [`DWIDTH-1:0] a9_data_delayed_4;
wire [`DWIDTH-1:0] a9_data_delayed_5;
wire [`DWIDTH-1:0] a9_data_delayed_6;
wire [`DWIDTH-1:0] a9_data_delayed_7;
wire [`DWIDTH-1:0] a9_data_delayed_8;
wire [`DWIDTH-1:0] a9_data_delayed_9;
wire [`DWIDTH-1:0] a10_data_delayed_1;
wire [`DWIDTH-1:0] a10_data_delayed_2;
wire [`DWIDTH-1:0] a10_data_delayed_3;
wire [`DWIDTH-1:0] a10_data_delayed_4;
wire [`DWIDTH-1:0] a10_data_delayed_5;
wire [`DWIDTH-1:0] a10_data_delayed_6;
wire [`DWIDTH-1:0] a10_data_delayed_7;
wire [`DWIDTH-1:0] a10_data_delayed_8;
wire [`DWIDTH-1:0] a10_data_delayed_9;
wire [`DWIDTH-1:0] a10_data_delayed_10;
wire [`DWIDTH-1:0] a11_data_delayed_1;
wire [`DWIDTH-1:0] a11_data_delayed_2;
wire [`DWIDTH-1:0] a11_data_delayed_3;
wire [`DWIDTH-1:0] a11_data_delayed_4;
wire [`DWIDTH-1:0] a11_data_delayed_5;
wire [`DWIDTH-1:0] a11_data_delayed_6;
wire [`DWIDTH-1:0] a11_data_delayed_7;
wire [`DWIDTH-1:0] a11_data_delayed_8;
wire [`DWIDTH-1:0] a11_data_delayed_9;
wire [`DWIDTH-1:0] a11_data_delayed_10;
wire [`DWIDTH-1:0] a11_data_delayed_11;
wire [`DWIDTH-1:0] a12_data_delayed_1;
wire [`DWIDTH-1:0] a12_data_delayed_2;
wire [`DWIDTH-1:0] a12_data_delayed_3;
wire [`DWIDTH-1:0] a12_data_delayed_4;
wire [`DWIDTH-1:0] a12_data_delayed_5;
wire [`DWIDTH-1:0] a12_data_delayed_6;
wire [`DWIDTH-1:0] a12_data_delayed_7;
wire [`DWIDTH-1:0] a12_data_delayed_8;
wire [`DWIDTH-1:0] a12_data_delayed_9;
wire [`DWIDTH-1:0] a12_data_delayed_10;
wire [`DWIDTH-1:0] a12_data_delayed_11;
wire [`DWIDTH-1:0] a12_data_delayed_12;
wire [`DWIDTH-1:0] a13_data_delayed_1;
wire [`DWIDTH-1:0] a13_data_delayed_2;
wire [`DWIDTH-1:0] a13_data_delayed_3;
wire [`DWIDTH-1:0] a13_data_delayed_4;
wire [`DWIDTH-1:0] a13_data_delayed_5;
wire [`DWIDTH-1:0] a13_data_delayed_6;
wire [`DWIDTH-1:0] a13_data_delayed_7;
wire [`DWIDTH-1:0] a13_data_delayed_8;
wire [`DWIDTH-1:0] a13_data_delayed_9;
wire [`DWIDTH-1:0] a13_data_delayed_10;
wire [`DWIDTH-1:0] a13_data_delayed_11;
wire [`DWIDTH-1:0] a13_data_delayed_12;
wire [`DWIDTH-1:0] a13_data_delayed_13;
wire [`DWIDTH-1:0] a14_data_delayed_1;
wire [`DWIDTH-1:0] a14_data_delayed_2;
wire [`DWIDTH-1:0] a14_data_delayed_3;
wire [`DWIDTH-1:0] a14_data_delayed_4;
wire [`DWIDTH-1:0] a14_data_delayed_5;
wire [`DWIDTH-1:0] a14_data_delayed_6;
wire [`DWIDTH-1:0] a14_data_delayed_7;
wire [`DWIDTH-1:0] a14_data_delayed_8;
wire [`DWIDTH-1:0] a14_data_delayed_9;
wire [`DWIDTH-1:0] a14_data_delayed_10;
wire [`DWIDTH-1:0] a14_data_delayed_11;
wire [`DWIDTH-1:0] a14_data_delayed_12;
wire [`DWIDTH-1:0] a14_data_delayed_13;
wire [`DWIDTH-1:0] a14_data_delayed_14;
wire [`DWIDTH-1:0] a15_data_delayed_1;
wire [`DWIDTH-1:0] a15_data_delayed_2;
wire [`DWIDTH-1:0] a15_data_delayed_3;
wire [`DWIDTH-1:0] a15_data_delayed_4;
wire [`DWIDTH-1:0] a15_data_delayed_5;
wire [`DWIDTH-1:0] a15_data_delayed_6;
wire [`DWIDTH-1:0] a15_data_delayed_7;
wire [`DWIDTH-1:0] a15_data_delayed_8;
wire [`DWIDTH-1:0] a15_data_delayed_9;
wire [`DWIDTH-1:0] a15_data_delayed_10;
wire [`DWIDTH-1:0] a15_data_delayed_11;
wire [`DWIDTH-1:0] a15_data_delayed_12;
wire [`DWIDTH-1:0] a15_data_delayed_13;
wire [`DWIDTH-1:0] a15_data_delayed_14;
wire [`DWIDTH-1:0] a15_data_delayed_15;
wire [`DWIDTH-1:0] b1_data_delayed_1;
wire [`DWIDTH-1:0] b2_data_delayed_1;
wire [`DWIDTH-1:0] b2_data_delayed_2;
wire [`DWIDTH-1:0] b3_data_delayed_1;
wire [`DWIDTH-1:0] b3_data_delayed_2;
wire [`DWIDTH-1:0] b3_data_delayed_3;
wire [`DWIDTH-1:0] b4_data_delayed_1;
wire [`DWIDTH-1:0] b4_data_delayed_2;
wire [`DWIDTH-1:0] b4_data_delayed_3;
wire [`DWIDTH-1:0] b4_data_delayed_4;
wire [`DWIDTH-1:0] b5_data_delayed_1;
wire [`DWIDTH-1:0] b5_data_delayed_2;
wire [`DWIDTH-1:0] b5_data_delayed_3;
wire [`DWIDTH-1:0] b5_data_delayed_4;
wire [`DWIDTH-1:0] b5_data_delayed_5;
wire [`DWIDTH-1:0] b6_data_delayed_1;
wire [`DWIDTH-1:0] b6_data_delayed_2;
wire [`DWIDTH-1:0] b6_data_delayed_3;
wire [`DWIDTH-1:0] b6_data_delayed_4;
wire [`DWIDTH-1:0] b6_data_delayed_5;
wire [`DWIDTH-1:0] b6_data_delayed_6;
wire [`DWIDTH-1:0] b7_data_delayed_1;
wire [`DWIDTH-1:0] b7_data_delayed_2;
wire [`DWIDTH-1:0] b7_data_delayed_3;
wire [`DWIDTH-1:0] b7_data_delayed_4;
wire [`DWIDTH-1:0] b7_data_delayed_5;
wire [`DWIDTH-1:0] b7_data_delayed_6;
wire [`DWIDTH-1:0] b7_data_delayed_7;
wire [`DWIDTH-1:0] b8_data_delayed_1;
wire [`DWIDTH-1:0] b8_data_delayed_2;
wire [`DWIDTH-1:0] b8_data_delayed_3;
wire [`DWIDTH-1:0] b8_data_delayed_4;
wire [`DWIDTH-1:0] b8_data_delayed_5;
wire [`DWIDTH-1:0] b8_data_delayed_6;
wire [`DWIDTH-1:0] b8_data_delayed_7;
wire [`DWIDTH-1:0] b8_data_delayed_8;
wire [`DWIDTH-1:0] b9_data_delayed_1;
wire [`DWIDTH-1:0] b9_data_delayed_2;
wire [`DWIDTH-1:0] b9_data_delayed_3;
wire [`DWIDTH-1:0] b9_data_delayed_4;
wire [`DWIDTH-1:0] b9_data_delayed_5;
wire [`DWIDTH-1:0] b9_data_delayed_6;
wire [`DWIDTH-1:0] b9_data_delayed_7;
wire [`DWIDTH-1:0] b9_data_delayed_8;
wire [`DWIDTH-1:0] b9_data_delayed_9;
wire [`DWIDTH-1:0] b10_data_delayed_1;
wire [`DWIDTH-1:0] b10_data_delayed_2;
wire [`DWIDTH-1:0] b10_data_delayed_3;
wire [`DWIDTH-1:0] b10_data_delayed_4;
wire [`DWIDTH-1:0] b10_data_delayed_5;
wire [`DWIDTH-1:0] b10_data_delayed_6;
wire [`DWIDTH-1:0] b10_data_delayed_7;
wire [`DWIDTH-1:0] b10_data_delayed_8;
wire [`DWIDTH-1:0] b10_data_delayed_9;
wire [`DWIDTH-1:0] b10_data_delayed_10;
wire [`DWIDTH-1:0] b11_data_delayed_1;
wire [`DWIDTH-1:0] b11_data_delayed_2;
wire [`DWIDTH-1:0] b11_data_delayed_3;
wire [`DWIDTH-1:0] b11_data_delayed_4;
wire [`DWIDTH-1:0] b11_data_delayed_5;
wire [`DWIDTH-1:0] b11_data_delayed_6;
wire [`DWIDTH-1:0] b11_data_delayed_7;
wire [`DWIDTH-1:0] b11_data_delayed_8;
wire [`DWIDTH-1:0] b11_data_delayed_9;
wire [`DWIDTH-1:0] b11_data_delayed_10;
wire [`DWIDTH-1:0] b11_data_delayed_11;
wire [`DWIDTH-1:0] b12_data_delayed_1;
wire [`DWIDTH-1:0] b12_data_delayed_2;
wire [`DWIDTH-1:0] b12_data_delayed_3;
wire [`DWIDTH-1:0] b12_data_delayed_4;
wire [`DWIDTH-1:0] b12_data_delayed_5;
wire [`DWIDTH-1:0] b12_data_delayed_6;
wire [`DWIDTH-1:0] b12_data_delayed_7;
wire [`DWIDTH-1:0] b12_data_delayed_8;
wire [`DWIDTH-1:0] b12_data_delayed_9;
wire [`DWIDTH-1:0] b12_data_delayed_10;
wire [`DWIDTH-1:0] b12_data_delayed_11;
wire [`DWIDTH-1:0] b12_data_delayed_12;
wire [`DWIDTH-1:0] b13_data_delayed_1;
wire [`DWIDTH-1:0] b13_data_delayed_2;
wire [`DWIDTH-1:0] b13_data_delayed_3;
wire [`DWIDTH-1:0] b13_data_delayed_4;
wire [`DWIDTH-1:0] b13_data_delayed_5;
wire [`DWIDTH-1:0] b13_data_delayed_6;
wire [`DWIDTH-1:0] b13_data_delayed_7;
wire [`DWIDTH-1:0] b13_data_delayed_8;
wire [`DWIDTH-1:0] b13_data_delayed_9;
wire [`DWIDTH-1:0] b13_data_delayed_10;
wire [`DWIDTH-1:0] b13_data_delayed_11;
wire [`DWIDTH-1:0] b13_data_delayed_12;
wire [`DWIDTH-1:0] b13_data_delayed_13;
wire [`DWIDTH-1:0] b14_data_delayed_1;
wire [`DWIDTH-1:0] b14_data_delayed_2;
wire [`DWIDTH-1:0] b14_data_delayed_3;
wire [`DWIDTH-1:0] b14_data_delayed_4;
wire [`DWIDTH-1:0] b14_data_delayed_5;
wire [`DWIDTH-1:0] b14_data_delayed_6;
wire [`DWIDTH-1:0] b14_data_delayed_7;
wire [`DWIDTH-1:0] b14_data_delayed_8;
wire [`DWIDTH-1:0] b14_data_delayed_9;
wire [`DWIDTH-1:0] b14_data_delayed_10;
wire [`DWIDTH-1:0] b14_data_delayed_11;
wire [`DWIDTH-1:0] b14_data_delayed_12;
wire [`DWIDTH-1:0] b14_data_delayed_13;
wire [`DWIDTH-1:0] b14_data_delayed_14;
wire [`DWIDTH-1:0] b15_data_delayed_1;
wire [`DWIDTH-1:0] b15_data_delayed_2;
wire [`DWIDTH-1:0] b15_data_delayed_3;
wire [`DWIDTH-1:0] b15_data_delayed_4;
wire [`DWIDTH-1:0] b15_data_delayed_5;
wire [`DWIDTH-1:0] b15_data_delayed_6;
wire [`DWIDTH-1:0] b15_data_delayed_7;
wire [`DWIDTH-1:0] b15_data_delayed_8;
wire [`DWIDTH-1:0] b15_data_delayed_9;
wire [`DWIDTH-1:0] b15_data_delayed_10;
wire [`DWIDTH-1:0] b15_data_delayed_11;
wire [`DWIDTH-1:0] b15_data_delayed_12;
wire [`DWIDTH-1:0] b15_data_delayed_13;
wire [`DWIDTH-1:0] b15_data_delayed_14;
wire [`DWIDTH-1:0] b15_data_delayed_15;
//////////////////////////////////////////////////////////////////////////
// Instantiation of systolic data setup
//////////////////////////////////////////////////////////////////////////
systolic_data_setup u_systolic_data_setup(
.clk(clk),
.reset(reset),
.start_mat_mul(start_mat_mul),
.a_addr(a_addr),
.b_addr(b_addr),
.address_mat_a(address_mat_a),
.address_mat_b(address_mat_b),
.address_stride_a(address_stride_a),
.address_stride_b(address_stride_b),
.a_data(a_data),
.b_data(b_data),
.clk_cnt(clk_cnt),
.a0_data(a0_data),
.b0_data(b0_data),
.a1_data_delayed_1(a1_data_delayed_1),
.b1_data_delayed_1(b1_data_delayed_1),
.a2_data_delayed_2(a2_data_delayed_2),
.b2_data_delayed_2(b2_data_delayed_2),
.a3_data_delayed_3(a3_data_delayed_3),
.b3_data_delayed_3(b3_data_delayed_3),
.a4_data_delayed_4(a4_data_delayed_4),
.b4_data_delayed_4(b4_data_delayed_4),
.a5_data_delayed_5(a5_data_delayed_5),
.b5_data_delayed_5(b5_data_delayed_5),
.a6_data_delayed_6(a6_data_delayed_6),
.b6_data_delayed_6(b6_data_delayed_6),
.a7_data_delayed_7(a7_data_delayed_7),
.b7_data_delayed_7(b7_data_delayed_7),
.a8_data_delayed_8(a8_data_delayed_8),
.b8_data_delayed_8(b8_data_delayed_8),
.a9_data_delayed_9(a9_data_delayed_9),
.b9_data_delayed_9(b9_data_delayed_9),
.a10_data_delayed_10(a10_data_delayed_10),
.b10_data_delayed_10(b10_data_delayed_10),
.a11_data_delayed_11(a11_data_delayed_11),
.b11_data_delayed_11(b11_data_delayed_11),
.a12_data_delayed_12(a12_data_delayed_12),
.b12_data_delayed_12(b12_data_delayed_12),
.a13_data_delayed_13(a13_data_delayed_13),
.b13_data_delayed_13(b13_data_delayed_13),
.a14_data_delayed_14(a14_data_delayed_14),
.b14_data_delayed_14(b14_data_delayed_14),
.a15_data_delayed_15(a15_data_delayed_15),
.b15_data_delayed_15(b15_data_delayed_15),
.validity_mask_a_rows(validity_mask_a_rows),
.validity_mask_a_cols(validity_mask_a_cols),
.validity_mask_b_rows(validity_mask_b_rows),
.validity_mask_b_cols(validity_mask_b_cols),
.final_mat_mul_size(final_mat_mul_size),
.a_loc(a_loc),
.b_loc(b_loc)
);
//////////////////////////////////////////////////////////////////////////
// Logic to mux data_in coming from neighboring matmuls
//////////////////////////////////////////////////////////////////////////
wire [`DWIDTH-1:0] a0;
wire [`DWIDTH-1:0] a1;
wire [`DWIDTH-1:0] a2;
wire [`DWIDTH-1:0] a3;
wire [`DWIDTH-1:0] a4;
wire [`DWIDTH-1:0] a5;
wire [`DWIDTH-1:0] a6;
wire [`DWIDTH-1:0] a7;
wire [`DWIDTH-1:0] a8;
wire [`DWIDTH-1:0] a9;
wire [`DWIDTH-1:0] a10;
wire [`DWIDTH-1:0] a11;
wire [`DWIDTH-1:0] a12;
wire [`DWIDTH-1:0] a13;
wire [`DWIDTH-1:0] a14;
wire [`DWIDTH-1:0] a15;
wire [`DWIDTH-1:0] b0;
wire [`DWIDTH-1:0] b1;
wire [`DWIDTH-1:0] b2;
wire [`DWIDTH-1:0] b3;
wire [`DWIDTH-1:0] b4;
wire [`DWIDTH-1:0] b5;
wire [`DWIDTH-1:0] b6;
wire [`DWIDTH-1:0] b7;
wire [`DWIDTH-1:0] b8;
wire [`DWIDTH-1:0] b9;
wire [`DWIDTH-1:0] b10;
wire [`DWIDTH-1:0] b11;
wire [`DWIDTH-1:0] b12;
wire [`DWIDTH-1:0] b13;
wire [`DWIDTH-1:0] b14;
wire [`DWIDTH-1:0] b15;
wire [`DWIDTH-1:0] a0_data_in;
wire [`DWIDTH-1:0] a1_data_in;
wire [`DWIDTH-1:0] a2_data_in;
wire [`DWIDTH-1:0] a3_data_in;
wire [`DWIDTH-1:0] a4_data_in;
wire [`DWIDTH-1:0] a5_data_in;
wire [`DWIDTH-1:0] a6_data_in;
wire [`DWIDTH-1:0] a7_data_in;
wire [`DWIDTH-1:0] a8_data_in;
wire [`DWIDTH-1:0] a9_data_in;
wire [`DWIDTH-1:0] a10_data_in;
wire [`DWIDTH-1:0] a11_data_in;
wire [`DWIDTH-1:0] a12_data_in;
wire [`DWIDTH-1:0] a13_data_in;
wire [`DWIDTH-1:0] a14_data_in;
wire [`DWIDTH-1:0] a15_data_in;
assign a0_data_in = a_data_in[1*`DWIDTH-1:0*`DWIDTH];
assign a1_data_in = a_data_in[2*`DWIDTH-1:1*`DWIDTH];
assign a2_data_in = a_data_in[3*`DWIDTH-1:2*`DWIDTH];
assign a3_data_in = a_data_in[4*`DWIDTH-1:3*`DWIDTH];
assign a4_data_in = a_data_in[5*`DWIDTH-1:4*`DWIDTH];
assign a5_data_in = a_data_in[6*`DWIDTH-1:5*`DWIDTH];
assign a6_data_in = a_data_in[7*`DWIDTH-1:6*`DWIDTH];
assign a7_data_in = a_data_in[8*`DWIDTH-1:7*`DWIDTH];
assign a8_data_in = a_data_in[9*`DWIDTH-1:8*`DWIDTH];
assign a9_data_in = a_data_in[10*`DWIDTH-1:9*`DWIDTH];
assign a10_data_in = a_data_in[11*`DWIDTH-1:10*`DWIDTH];
assign a11_data_in = a_data_in[12*`DWIDTH-1:11*`DWIDTH];
assign a12_data_in = a_data_in[13*`DWIDTH-1:12*`DWIDTH];
assign a13_data_in = a_data_in[14*`DWIDTH-1:13*`DWIDTH];
assign a14_data_in = a_data_in[15*`DWIDTH-1:14*`DWIDTH];
assign a15_data_in = a_data_in[16*`DWIDTH-1:15*`DWIDTH];
wire [`DWIDTH-1:0] b0_data_in;
wire [`DWIDTH-1:0] b1_data_in;
wire [`DWIDTH-1:0] b2_data_in;
wire [`DWIDTH-1:0] b3_data_in;
wire [`DWIDTH-1:0] b4_data_in;
wire [`DWIDTH-1:0] b5_data_in;
wire [`DWIDTH-1:0] b6_data_in;
wire [`DWIDTH-1:0] b7_data_in;
wire [`DWIDTH-1:0] b8_data_in;
wire [`DWIDTH-1:0] b9_data_in;
wire [`DWIDTH-1:0] b10_data_in;
wire [`DWIDTH-1:0] b11_data_in;
wire [`DWIDTH-1:0] b12_data_in;
wire [`DWIDTH-1:0] b13_data_in;
wire [`DWIDTH-1:0] b14_data_in;
wire [`DWIDTH-1:0] b15_data_in;
assign b0_data_in = b_data_in[1*`DWIDTH-1:0*`DWIDTH];
assign b1_data_in = b_data_in[2*`DWIDTH-1:1*`DWIDTH];
assign b2_data_in = b_data_in[3*`DWIDTH-1:2*`DWIDTH];
assign b3_data_in = b_data_in[4*`DWIDTH-1:3*`DWIDTH];
assign b4_data_in = b_data_in[5*`DWIDTH-1:4*`DWIDTH];
assign b5_data_in = b_data_in[6*`DWIDTH-1:5*`DWIDTH];
assign b6_data_in = b_data_in[7*`DWIDTH-1:6*`DWIDTH];
assign b7_data_in = b_data_in[8*`DWIDTH-1:7*`DWIDTH];
assign b8_data_in = b_data_in[9*`DWIDTH-1:8*`DWIDTH];
assign b9_data_in = b_data_in[10*`DWIDTH-1:9*`DWIDTH];
assign b10_data_in = b_data_in[11*`DWIDTH-1:10*`DWIDTH];
assign b11_data_in = b_data_in[12*`DWIDTH-1:11*`DWIDTH];
assign b12_data_in = b_data_in[13*`DWIDTH-1:12*`DWIDTH];
assign b13_data_in = b_data_in[14*`DWIDTH-1:13*`DWIDTH];
assign b14_data_in = b_data_in[15*`DWIDTH-1:14*`DWIDTH];
assign b15_data_in = b_data_in[16*`DWIDTH-1:15*`DWIDTH];
assign a0 = (b_loc==0) ? a0_data : a0_data_in;
assign a1 = (b_loc==0) ? a1_data_delayed_1 : a1_data_in;
assign a2 = (b_loc==0) ? a2_data_delayed_2 : a2_data_in;
assign a3 = (b_loc==0) ? a3_data_delayed_3 : a3_data_in;
assign a4 = (b_loc==0) ? a4_data_delayed_4 : a4_data_in;
assign a5 = (b_loc==0) ? a5_data_delayed_5 : a5_data_in;
assign a6 = (b_loc==0) ? a6_data_delayed_6 : a6_data_in;
assign a7 = (b_loc==0) ? a7_data_delayed_7 : a7_data_in;
assign a8 = (b_loc==0) ? a8_data_delayed_8 : a8_data_in;
assign a9 = (b_loc==0) ? a9_data_delayed_9 : a9_data_in;
assign a10 = (b_loc==0) ? a10_data_delayed_10 : a10_data_in;
assign a11 = (b_loc==0) ? a11_data_delayed_11 : a11_data_in;
assign a12 = (b_loc==0) ? a12_data_delayed_12 : a12_data_in;
assign a13 = (b_loc==0) ? a13_data_delayed_13 : a13_data_in;
assign a14 = (b_loc==0) ? a14_data_delayed_14 : a14_data_in;
assign a15 = (b_loc==0) ? a15_data_delayed_15 : a15_data_in;
assign b0 = (a_loc==0) ? b0_data : b0_data_in;
assign b1 = (a_loc==0) ? b1_data_delayed_1 : b1_data_in;
assign b2 = (a_loc==0) ? b2_data_delayed_2 : b2_data_in;
assign b3 = (a_loc==0) ? b3_data_delayed_3 : b3_data_in;
assign b4 = (a_loc==0) ? b4_data_delayed_4 : b4_data_in;
assign b5 = (a_loc==0) ? b5_data_delayed_5 : b5_data_in;
assign b6 = (a_loc==0) ? b6_data_delayed_6 : b6_data_in;
assign b7 = (a_loc==0) ? b7_data_delayed_7 : b7_data_in;
assign b8 = (a_loc==0) ? b8_data_delayed_8 : b8_data_in;
assign b9 = (a_loc==0) ? b9_data_delayed_9 : b9_data_in;
assign b10 = (a_loc==0) ? b10_data_delayed_10 : b10_data_in;
assign b11 = (a_loc==0) ? b11_data_delayed_11 : b11_data_in;
assign b12 = (a_loc==0) ? b12_data_delayed_12 : b12_data_in;
assign b13 = (a_loc==0) ? b13_data_delayed_13 : b13_data_in;
assign b14 = (a_loc==0) ? b14_data_delayed_14 : b14_data_in;
assign b15 = (a_loc==0) ? b15_data_delayed_15 : b15_data_in;
wire [`DWIDTH-1:0] matrixC0_0;
wire [`DWIDTH-1:0] matrixC0_1;
wire [`DWIDTH-1:0] matrixC0_2;
wire [`DWIDTH-1:0] matrixC0_3;
wire [`DWIDTH-1:0] matrixC0_4;
wire [`DWIDTH-1:0] matrixC0_5;
wire [`DWIDTH-1:0] matrixC0_6;
wire [`DWIDTH-1:0] matrixC0_7;
wire [`DWIDTH-1:0] matrixC0_8;
wire [`DWIDTH-1:0] matrixC0_9;
wire [`DWIDTH-1:0] matrixC0_10;
wire [`DWIDTH-1:0] matrixC0_11;
wire [`DWIDTH-1:0] matrixC0_12;
wire [`DWIDTH-1:0] matrixC0_13;
wire [`DWIDTH-1:0] matrixC0_14;
wire [`DWIDTH-1:0] matrixC0_15;
wire [`DWIDTH-1:0] matrixC1_0;
wire [`DWIDTH-1:0] matrixC1_1;
wire [`DWIDTH-1:0] matrixC1_2;
wire [`DWIDTH-1:0] matrixC1_3;
wire [`DWIDTH-1:0] matrixC1_4;
wire [`DWIDTH-1:0] matrixC1_5;
wire [`DWIDTH-1:0] matrixC1_6;
wire [`DWIDTH-1:0] matrixC1_7;
wire [`DWIDTH-1:0] matrixC1_8;
wire [`DWIDTH-1:0] matrixC1_9;
wire [`DWIDTH-1:0] matrixC1_10;
wire [`DWIDTH-1:0] matrixC1_11;
wire [`DWIDTH-1:0] matrixC1_12;
wire [`DWIDTH-1:0] matrixC1_13;
wire [`DWIDTH-1:0] matrixC1_14;
wire [`DWIDTH-1:0] matrixC1_15;
wire [`DWIDTH-1:0] matrixC2_0;
wire [`DWIDTH-1:0] matrixC2_1;
wire [`DWIDTH-1:0] matrixC2_2;
wire [`DWIDTH-1:0] matrixC2_3;
wire [`DWIDTH-1:0] matrixC2_4;
wire [`DWIDTH-1:0] matrixC2_5;
wire [`DWIDTH-1:0] matrixC2_6;
wire [`DWIDTH-1:0] matrixC2_7;
wire [`DWIDTH-1:0] matrixC2_8;
wire [`DWIDTH-1:0] matrixC2_9;
wire [`DWIDTH-1:0] matrixC2_10;
wire [`DWIDTH-1:0] matrixC2_11;
wire [`DWIDTH-1:0] matrixC2_12;
wire [`DWIDTH-1:0] matrixC2_13;
wire [`DWIDTH-1:0] matrixC2_14;
wire [`DWIDTH-1:0] matrixC2_15;
wire [`DWIDTH-1:0] matrixC3_0;
wire [`DWIDTH-1:0] matrixC3_1;
wire [`DWIDTH-1:0] matrixC3_2;
wire [`DWIDTH-1:0] matrixC3_3;
wire [`DWIDTH-1:0] matrixC3_4;
wire [`DWIDTH-1:0] matrixC3_5;
wire [`DWIDTH-1:0] matrixC3_6;
wire [`DWIDTH-1:0] matrixC3_7;
wire [`DWIDTH-1:0] matrixC3_8;
wire [`DWIDTH-1:0] matrixC3_9;
wire [`DWIDTH-1:0] matrixC3_10;
wire [`DWIDTH-1:0] matrixC3_11;
wire [`DWIDTH-1:0] matrixC3_12;
wire [`DWIDTH-1:0] matrixC3_13;
wire [`DWIDTH-1:0] matrixC3_14;
wire [`DWIDTH-1:0] matrixC3_15;
wire [`DWIDTH-1:0] matrixC4_0;
wire [`DWIDTH-1:0] matrixC4_1;
wire [`DWIDTH-1:0] matrixC4_2;
wire [`DWIDTH-1:0] matrixC4_3;
wire [`DWIDTH-1:0] matrixC4_4;
wire [`DWIDTH-1:0] matrixC4_5;
wire [`DWIDTH-1:0] matrixC4_6;
wire [`DWIDTH-1:0] matrixC4_7;
wire [`DWIDTH-1:0] matrixC4_8;
wire [`DWIDTH-1:0] matrixC4_9;
wire [`DWIDTH-1:0] matrixC4_10;
wire [`DWIDTH-1:0] matrixC4_11;
wire [`DWIDTH-1:0] matrixC4_12;
wire [`DWIDTH-1:0] matrixC4_13;
wire [`DWIDTH-1:0] matrixC4_14;
wire [`DWIDTH-1:0] matrixC4_15;
wire [`DWIDTH-1:0] matrixC5_0;
wire [`DWIDTH-1:0] matrixC5_1;
wire [`DWIDTH-1:0] matrixC5_2;
wire [`DWIDTH-1:0] matrixC5_3;
wire [`DWIDTH-1:0] matrixC5_4;
wire [`DWIDTH-1:0] matrixC5_5;
wire [`DWIDTH-1:0] matrixC5_6;
wire [`DWIDTH-1:0] matrixC5_7;
wire [`DWIDTH-1:0] matrixC5_8;
wire [`DWIDTH-1:0] matrixC5_9;
wire [`DWIDTH-1:0] matrixC5_10;
wire [`DWIDTH-1:0] matrixC5_11;
wire [`DWIDTH-1:0] matrixC5_12;
wire [`DWIDTH-1:0] matrixC5_13;
wire [`DWIDTH-1:0] matrixC5_14;
wire [`DWIDTH-1:0] matrixC5_15;
wire [`DWIDTH-1:0] matrixC6_0;
wire [`DWIDTH-1:0] matrixC6_1;
wire [`DWIDTH-1:0] matrixC6_2;
wire [`DWIDTH-1:0] matrixC6_3;
wire [`DWIDTH-1:0] matrixC6_4;
wire [`DWIDTH-1:0] matrixC6_5;
wire [`DWIDTH-1:0] matrixC6_6;
wire [`DWIDTH-1:0] matrixC6_7;
wire [`DWIDTH-1:0] matrixC6_8;
wire [`DWIDTH-1:0] matrixC6_9;
wire [`DWIDTH-1:0] matrixC6_10;
wire [`DWIDTH-1:0] matrixC6_11;
wire [`DWIDTH-1:0] matrixC6_12;
wire [`DWIDTH-1:0] matrixC6_13;
wire [`DWIDTH-1:0] matrixC6_14;
wire [`DWIDTH-1:0] matrixC6_15;
wire [`DWIDTH-1:0] matrixC7_0;
wire [`DWIDTH-1:0] matrixC7_1;
wire [`DWIDTH-1:0] matrixC7_2;
wire [`DWIDTH-1:0] matrixC7_3;
wire [`DWIDTH-1:0] matrixC7_4;
wire [`DWIDTH-1:0] matrixC7_5;
wire [`DWIDTH-1:0] matrixC7_6;
wire [`DWIDTH-1:0] matrixC7_7;
wire [`DWIDTH-1:0] matrixC7_8;
wire [`DWIDTH-1:0] matrixC7_9;
wire [`DWIDTH-1:0] matrixC7_10;
wire [`DWIDTH-1:0] matrixC7_11;
wire [`DWIDTH-1:0] matrixC7_12;
wire [`DWIDTH-1:0] matrixC7_13;
wire [`DWIDTH-1:0] matrixC7_14;
wire [`DWIDTH-1:0] matrixC7_15;
wire [`DWIDTH-1:0] matrixC8_0;
wire [`DWIDTH-1:0] matrixC8_1;
wire [`DWIDTH-1:0] matrixC8_2;
wire [`DWIDTH-1:0] matrixC8_3;
wire [`DWIDTH-1:0] matrixC8_4;
wire [`DWIDTH-1:0] matrixC8_5;
wire [`DWIDTH-1:0] matrixC8_6;
wire [`DWIDTH-1:0] matrixC8_7;
wire [`DWIDTH-1:0] matrixC8_8;
wire [`DWIDTH-1:0] matrixC8_9;
wire [`DWIDTH-1:0] matrixC8_10;
wire [`DWIDTH-1:0] matrixC8_11;
wire [`DWIDTH-1:0] matrixC8_12;
wire [`DWIDTH-1:0] matrixC8_13;
wire [`DWIDTH-1:0] matrixC8_14;
wire [`DWIDTH-1:0] matrixC8_15;
wire [`DWIDTH-1:0] matrixC9_0;
wire [`DWIDTH-1:0] matrixC9_1;
wire [`DWIDTH-1:0] matrixC9_2;
wire [`DWIDTH-1:0] matrixC9_3;
wire [`DWIDTH-1:0] matrixC9_4;
wire [`DWIDTH-1:0] matrixC9_5;
wire [`DWIDTH-1:0] matrixC9_6;
wire [`DWIDTH-1:0] matrixC9_7;
wire [`DWIDTH-1:0] matrixC9_8;
wire [`DWIDTH-1:0] matrixC9_9;
wire [`DWIDTH-1:0] matrixC9_10;
wire [`DWIDTH-1:0] matrixC9_11;
wire [`DWIDTH-1:0] matrixC9_12;
wire [`DWIDTH-1:0] matrixC9_13;
wire [`DWIDTH-1:0] matrixC9_14;
wire [`DWIDTH-1:0] matrixC9_15;
wire [`DWIDTH-1:0] matrixC10_0;
wire [`DWIDTH-1:0] matrixC10_1;
wire [`DWIDTH-1:0] matrixC10_2;
wire [`DWIDTH-1:0] matrixC10_3;
wire [`DWIDTH-1:0] matrixC10_4;
wire [`DWIDTH-1:0] matrixC10_5;
wire [`DWIDTH-1:0] matrixC10_6;
wire [`DWIDTH-1:0] matrixC10_7;
wire [`DWIDTH-1:0] matrixC10_8;
wire [`DWIDTH-1:0] matrixC10_9;
wire [`DWIDTH-1:0] matrixC10_10;
wire [`DWIDTH-1:0] matrixC10_11;
wire [`DWIDTH-1:0] matrixC10_12;
wire [`DWIDTH-1:0] matrixC10_13;
wire [`DWIDTH-1:0] matrixC10_14;
wire [`DWIDTH-1:0] matrixC10_15;
wire [`DWIDTH-1:0] matrixC11_0;
wire [`DWIDTH-1:0] matrixC11_1;
wire [`DWIDTH-1:0] matrixC11_2;
wire [`DWIDTH-1:0] matrixC11_3;
wire [`DWIDTH-1:0] matrixC11_4;
wire [`DWIDTH-1:0] matrixC11_5;
wire [`DWIDTH-1:0] matrixC11_6;
wire [`DWIDTH-1:0] matrixC11_7;
wire [`DWIDTH-1:0] matrixC11_8;
wire [`DWIDTH-1:0] matrixC11_9;
wire [`DWIDTH-1:0] matrixC11_10;
wire [`DWIDTH-1:0] matrixC11_11;
wire [`DWIDTH-1:0] matrixC11_12;
wire [`DWIDTH-1:0] matrixC11_13;
wire [`DWIDTH-1:0] matrixC11_14;
wire [`DWIDTH-1:0] matrixC11_15;
wire [`DWIDTH-1:0] matrixC12_0;
wire [`DWIDTH-1:0] matrixC12_1;
wire [`DWIDTH-1:0] matrixC12_2;
wire [`DWIDTH-1:0] matrixC12_3;
wire [`DWIDTH-1:0] matrixC12_4;
wire [`DWIDTH-1:0] matrixC12_5;
wire [`DWIDTH-1:0] matrixC12_6;
wire [`DWIDTH-1:0] matrixC12_7;
wire [`DWIDTH-1:0] matrixC12_8;
wire [`DWIDTH-1:0] matrixC12_9;
wire [`DWIDTH-1:0] matrixC12_10;
wire [`DWIDTH-1:0] matrixC12_11;
wire [`DWIDTH-1:0] matrixC12_12;
wire [`DWIDTH-1:0] matrixC12_13;
wire [`DWIDTH-1:0] matrixC12_14;
wire [`DWIDTH-1:0] matrixC12_15;
wire [`DWIDTH-1:0] matrixC13_0;
wire [`DWIDTH-1:0] matrixC13_1;
wire [`DWIDTH-1:0] matrixC13_2;
wire [`DWIDTH-1:0] matrixC13_3;
wire [`DWIDTH-1:0] matrixC13_4;
wire [`DWIDTH-1:0] matrixC13_5;
wire [`DWIDTH-1:0] matrixC13_6;
wire [`DWIDTH-1:0] matrixC13_7;
wire [`DWIDTH-1:0] matrixC13_8;
wire [`DWIDTH-1:0] matrixC13_9;
wire [`DWIDTH-1:0] matrixC13_10;
wire [`DWIDTH-1:0] matrixC13_11;
wire [`DWIDTH-1:0] matrixC13_12;
wire [`DWIDTH-1:0] matrixC13_13;
wire [`DWIDTH-1:0] matrixC13_14;
wire [`DWIDTH-1:0] matrixC13_15;
wire [`DWIDTH-1:0] matrixC14_0;
wire [`DWIDTH-1:0] matrixC14_1;
wire [`DWIDTH-1:0] matrixC14_2;
wire [`DWIDTH-1:0] matrixC14_3;
wire [`DWIDTH-1:0] matrixC14_4;
wire [`DWIDTH-1:0] matrixC14_5;
wire [`DWIDTH-1:0] matrixC14_6;
wire [`DWIDTH-1:0] matrixC14_7;
wire [`DWIDTH-1:0] matrixC14_8;
wire [`DWIDTH-1:0] matrixC14_9;
wire [`DWIDTH-1:0] matrixC14_10;
wire [`DWIDTH-1:0] matrixC14_11;
wire [`DWIDTH-1:0] matrixC14_12;
wire [`DWIDTH-1:0] matrixC14_13;
wire [`DWIDTH-1:0] matrixC14_14;
wire [`DWIDTH-1:0] matrixC14_15;
wire [`DWIDTH-1:0] matrixC15_0;
wire [`DWIDTH-1:0] matrixC15_1;
wire [`DWIDTH-1:0] matrixC15_2;
wire [`DWIDTH-1:0] matrixC15_3;
wire [`DWIDTH-1:0] matrixC15_4;
wire [`DWIDTH-1:0] matrixC15_5;
wire [`DWIDTH-1:0] matrixC15_6;
wire [`DWIDTH-1:0] matrixC15_7;
wire [`DWIDTH-1:0] matrixC15_8;
wire [`DWIDTH-1:0] matrixC15_9;
wire [`DWIDTH-1:0] matrixC15_10;
wire [`DWIDTH-1:0] matrixC15_11;
wire [`DWIDTH-1:0] matrixC15_12;
wire [`DWIDTH-1:0] matrixC15_13;
wire [`DWIDTH-1:0] matrixC15_14;
wire [`DWIDTH-1:0] matrixC15_15;
wire row_latch_en;
//////////////////////////////////////////////////////////////////////////
// Instantiation of the output logic
//////////////////////////////////////////////////////////////////////////
output_logic u_output_logic(
.start_mat_mul(start_mat_mul),
.done_mat_mul(done_mat_mul),
.address_mat_c(address_mat_c),
.address_stride_c(address_stride_c),
.c_data_out(c_data_out),
.c_data_in(c_data_in),
.c_addr(c_addr),
.c_data_available(c_data_available),
.clk_cnt(clk_cnt),
.row_latch_en(row_latch_en),
.final_mat_mul_size(final_mat_mul_size),
.matrixC0_0(matrixC0_0),
.matrixC0_1(matrixC0_1),
.matrixC0_2(matrixC0_2),
.matrixC0_3(matrixC0_3),
.matrixC0_4(matrixC0_4),
.matrixC0_5(matrixC0_5),
.matrixC0_6(matrixC0_6),
.matrixC0_7(matrixC0_7),
.matrixC0_8(matrixC0_8),
.matrixC0_9(matrixC0_9),
.matrixC0_10(matrixC0_10),
.matrixC0_11(matrixC0_11),
.matrixC0_12(matrixC0_12),
.matrixC0_13(matrixC0_13),
.matrixC0_14(matrixC0_14),
.matrixC0_15(matrixC0_15),
.matrixC1_0(matrixC1_0),
.matrixC1_1(matrixC1_1),
.matrixC1_2(matrixC1_2),
.matrixC1_3(matrixC1_3),
.matrixC1_4(matrixC1_4),
.matrixC1_5(matrixC1_5),
.matrixC1_6(matrixC1_6),
.matrixC1_7(matrixC1_7),
.matrixC1_8(matrixC1_8),
.matrixC1_9(matrixC1_9),
.matrixC1_10(matrixC1_10),
.matrixC1_11(matrixC1_11),
.matrixC1_12(matrixC1_12),
.matrixC1_13(matrixC1_13),
.matrixC1_14(matrixC1_14),
.matrixC1_15(matrixC1_15),
.matrixC2_0(matrixC2_0),
.matrixC2_1(matrixC2_1),
.matrixC2_2(matrixC2_2),
.matrixC2_3(matrixC2_3),
.matrixC2_4(matrixC2_4),
.matrixC2_5(matrixC2_5),
.matrixC2_6(matrixC2_6),
.matrixC2_7(matrixC2_7),
.matrixC2_8(matrixC2_8),
.matrixC2_9(matrixC2_9),
.matrixC2_10(matrixC2_10),
.matrixC2_11(matrixC2_11),
.matrixC2_12(matrixC2_12),
.matrixC2_13(matrixC2_13),
.matrixC2_14(matrixC2_14),
.matrixC2_15(matrixC2_15),
.matrixC3_0(matrixC3_0),
.matrixC3_1(matrixC3_1),
.matrixC3_2(matrixC3_2),
.matrixC3_3(matrixC3_3),
.matrixC3_4(matrixC3_4),
.matrixC3_5(matrixC3_5),
.matrixC3_6(matrixC3_6),
.matrixC3_7(matrixC3_7),
.matrixC3_8(matrixC3_8),
.matrixC3_9(matrixC3_9),
.matrixC3_10(matrixC3_10),
.matrixC3_11(matrixC3_11),
.matrixC3_12(matrixC3_12),
.matrixC3_13(matrixC3_13),
.matrixC3_14(matrixC3_14),
.matrixC3_15(matrixC3_15),
.matrixC4_0(matrixC4_0),
.matrixC4_1(matrixC4_1),
.matrixC4_2(matrixC4_2),
.matrixC4_3(matrixC4_3),
.matrixC4_4(matrixC4_4),
.matrixC4_5(matrixC4_5),
.matrixC4_6(matrixC4_6),
.matrixC4_7(matrixC4_7),
.matrixC4_8(matrixC4_8),
.matrixC4_9(matrixC4_9),
.matrixC4_10(matrixC4_10),
.matrixC4_11(matrixC4_11),
.matrixC4_12(matrixC4_12),
.matrixC4_13(matrixC4_13),
.matrixC4_14(matrixC4_14),
.matrixC4_15(matrixC4_15),
.matrixC5_0(matrixC5_0),
.matrixC5_1(matrixC5_1),
.matrixC5_2(matrixC5_2),
.matrixC5_3(matrixC5_3),
.matrixC5_4(matrixC5_4),
.matrixC5_5(matrixC5_5),
.matrixC5_6(matrixC5_6),
.matrixC5_7(matrixC5_7),
.matrixC5_8(matrixC5_8),
.matrixC5_9(matrixC5_9),
.matrixC5_10(matrixC5_10),
.matrixC5_11(matrixC5_11),
.matrixC5_12(matrixC5_12),
.matrixC5_13(matrixC5_13),
.matrixC5_14(matrixC5_14),
.matrixC5_15(matrixC5_15),
.matrixC6_0(matrixC6_0),
.matrixC6_1(matrixC6_1),
.matrixC6_2(matrixC6_2),
.matrixC6_3(matrixC6_3),
.matrixC6_4(matrixC6_4),
.matrixC6_5(matrixC6_5),
.matrixC6_6(matrixC6_6),
.matrixC6_7(matrixC6_7),
.matrixC6_8(matrixC6_8),
.matrixC6_9(matrixC6_9),
.matrixC6_10(matrixC6_10),
.matrixC6_11(matrixC6_11),
.matrixC6_12(matrixC6_12),
.matrixC6_13(matrixC6_13),
.matrixC6_14(matrixC6_14),
.matrixC6_15(matrixC6_15),
.matrixC7_0(matrixC7_0),
.matrixC7_1(matrixC7_1),
.matrixC7_2(matrixC7_2),
.matrixC7_3(matrixC7_3),
.matrixC7_4(matrixC7_4),
.matrixC7_5(matrixC7_5),
.matrixC7_6(matrixC7_6),
.matrixC7_7(matrixC7_7),
.matrixC7_8(matrixC7_8),
.matrixC7_9(matrixC7_9),
.matrixC7_10(matrixC7_10),
.matrixC7_11(matrixC7_11),
.matrixC7_12(matrixC7_12),
.matrixC7_13(matrixC7_13),
.matrixC7_14(matrixC7_14),
.matrixC7_15(matrixC7_15),
.matrixC8_0(matrixC8_0),
.matrixC8_1(matrixC8_1),
.matrixC8_2(matrixC8_2),
.matrixC8_3(matrixC8_3),
.matrixC8_4(matrixC8_4),
.matrixC8_5(matrixC8_5),
.matrixC8_6(matrixC8_6),
.matrixC8_7(matrixC8_7),
.matrixC8_8(matrixC8_8),
.matrixC8_9(matrixC8_9),
.matrixC8_10(matrixC8_10),
.matrixC8_11(matrixC8_11),
.matrixC8_12(matrixC8_12),
.matrixC8_13(matrixC8_13),
.matrixC8_14(matrixC8_14),
.matrixC8_15(matrixC8_15),
.matrixC9_0(matrixC9_0),
.matrixC9_1(matrixC9_1),
.matrixC9_2(matrixC9_2),
.matrixC9_3(matrixC9_3),
.matrixC9_4(matrixC9_4),
.matrixC9_5(matrixC9_5),
.matrixC9_6(matrixC9_6),
.matrixC9_7(matrixC9_7),
.matrixC9_8(matrixC9_8),
.matrixC9_9(matrixC9_9),
.matrixC9_10(matrixC9_10),
.matrixC9_11(matrixC9_11),
.matrixC9_12(matrixC9_12),
.matrixC9_13(matrixC9_13),
.matrixC9_14(matrixC9_14),
.matrixC9_15(matrixC9_15),
.matrixC10_0(matrixC10_0),
.matrixC10_1(matrixC10_1),
.matrixC10_2(matrixC10_2),
.matrixC10_3(matrixC10_3),
.matrixC10_4(matrixC10_4),
.matrixC10_5(matrixC10_5),
.matrixC10_6(matrixC10_6),
.matrixC10_7(matrixC10_7),
.matrixC10_8(matrixC10_8),
.matrixC10_9(matrixC10_9),
.matrixC10_10(matrixC10_10),
.matrixC10_11(matrixC10_11),
.matrixC10_12(matrixC10_12),
.matrixC10_13(matrixC10_13),
.matrixC10_14(matrixC10_14),
.matrixC10_15(matrixC10_15),
.matrixC11_0(matrixC11_0),
.matrixC11_1(matrixC11_1),
.matrixC11_2(matrixC11_2),
.matrixC11_3(matrixC11_3),
.matrixC11_4(matrixC11_4),
.matrixC11_5(matrixC11_5),
.matrixC11_6(matrixC11_6),
.matrixC11_7(matrixC11_7),
.matrixC11_8(matrixC11_8),
.matrixC11_9(matrixC11_9),
.matrixC11_10(matrixC11_10),
.matrixC11_11(matrixC11_11),
.matrixC11_12(matrixC11_12),
.matrixC11_13(matrixC11_13),
.matrixC11_14(matrixC11_14),
.matrixC11_15(matrixC11_15),
.matrixC12_0(matrixC12_0),
.matrixC12_1(matrixC12_1),
.matrixC12_2(matrixC12_2),
.matrixC12_3(matrixC12_3),
.matrixC12_4(matrixC12_4),
.matrixC12_5(matrixC12_5),
.matrixC12_6(matrixC12_6),
.matrixC12_7(matrixC12_7),
.matrixC12_8(matrixC12_8),
.matrixC12_9(matrixC12_9),
.matrixC12_10(matrixC12_10),
.matrixC12_11(matrixC12_11),
.matrixC12_12(matrixC12_12),
.matrixC12_13(matrixC12_13),
.matrixC12_14(matrixC12_14),
.matrixC12_15(matrixC12_15),
.matrixC13_0(matrixC13_0),
.matrixC13_1(matrixC13_1),
.matrixC13_2(matrixC13_2),
.matrixC13_3(matrixC13_3),
.matrixC13_4(matrixC13_4),
.matrixC13_5(matrixC13_5),
.matrixC13_6(matrixC13_6),
.matrixC13_7(matrixC13_7),
.matrixC13_8(matrixC13_8),
.matrixC13_9(matrixC13_9),
.matrixC13_10(matrixC13_10),
.matrixC13_11(matrixC13_11),
.matrixC13_12(matrixC13_12),
.matrixC13_13(matrixC13_13),
.matrixC13_14(matrixC13_14),
.matrixC13_15(matrixC13_15),
.matrixC14_0(matrixC14_0),
.matrixC14_1(matrixC14_1),
.matrixC14_2(matrixC14_2),
.matrixC14_3(matrixC14_3),
.matrixC14_4(matrixC14_4),
.matrixC14_5(matrixC14_5),
.matrixC14_6(matrixC14_6),
.matrixC14_7(matrixC14_7),
.matrixC14_8(matrixC14_8),
.matrixC14_9(matrixC14_9),
.matrixC14_10(matrixC14_10),
.matrixC14_11(matrixC14_11),
.matrixC14_12(matrixC14_12),
.matrixC14_13(matrixC14_13),
.matrixC14_14(matrixC14_14),
.matrixC14_15(matrixC14_15),
.matrixC15_0(matrixC15_0),
.matrixC15_1(matrixC15_1),
.matrixC15_2(matrixC15_2),
.matrixC15_3(matrixC15_3),
.matrixC15_4(matrixC15_4),
.matrixC15_5(matrixC15_5),
.matrixC15_6(matrixC15_6),
.matrixC15_7(matrixC15_7),
.matrixC15_8(matrixC15_8),
.matrixC15_9(matrixC15_9),
.matrixC15_10(matrixC15_10),
.matrixC15_11(matrixC15_11),
.matrixC15_12(matrixC15_12),
.matrixC15_13(matrixC15_13),
.matrixC15_14(matrixC15_14),
.matrixC15_15(matrixC15_15),
.clk(clk),
.reset(reset)
);
//////////////////////////////////////////////////////////////////////////
// Instantiations of the actual PEs
//////////////////////////////////////////////////////////////////////////
systolic_pe_matrix u_systolic_pe_matrix(
.clk(clk),
.reset(reset),
.pe_reset(pe_reset),
.a0(a0),
.a1(a1),
.a2(a2),
.a3(a3),
.a4(a4),
.a5(a5),
.a6(a6),
.a7(a7),
.a8(a8),
.a9(a9),
.a10(a10),
.a11(a11),
.a12(a12),
.a13(a13),
.a14(a14),
.a15(a15),
.b0(b0),
.b1(b1),
.b2(b2),
.b3(b3),
.b4(b4),
.b5(b5),
.b6(b6),
.b7(b7),
.b8(b8),
.b9(b9),
.b10(b10),
.b11(b11),
.b12(b12),
.b13(b13),
.b14(b14),
.b15(b15),
.matrixC0_0(matrixC0_0),
.matrixC0_1(matrixC0_1),
.matrixC0_2(matrixC0_2),
.matrixC0_3(matrixC0_3),
.matrixC0_4(matrixC0_4),
.matrixC0_5(matrixC0_5),
.matrixC0_6(matrixC0_6),
.matrixC0_7(matrixC0_7),
.matrixC0_8(matrixC0_8),
.matrixC0_9(matrixC0_9),
.matrixC0_10(matrixC0_10),
.matrixC0_11(matrixC0_11),
.matrixC0_12(matrixC0_12),
.matrixC0_13(matrixC0_13),
.matrixC0_14(matrixC0_14),
.matrixC0_15(matrixC0_15),
.matrixC1_0(matrixC1_0),
.matrixC1_1(matrixC1_1),
.matrixC1_2(matrixC1_2),
.matrixC1_3(matrixC1_3),
.matrixC1_4(matrixC1_4),
.matrixC1_5(matrixC1_5),
.matrixC1_6(matrixC1_6),
.matrixC1_7(matrixC1_7),
.matrixC1_8(matrixC1_8),
.matrixC1_9(matrixC1_9),
.matrixC1_10(matrixC1_10),
.matrixC1_11(matrixC1_11),
.matrixC1_12(matrixC1_12),
.matrixC1_13(matrixC1_13),
.matrixC1_14(matrixC1_14),
.matrixC1_15(matrixC1_15),
.matrixC2_0(matrixC2_0),
.matrixC2_1(matrixC2_1),
.matrixC2_2(matrixC2_2),
.matrixC2_3(matrixC2_3),
.matrixC2_4(matrixC2_4),
.matrixC2_5(matrixC2_5),
.matrixC2_6(matrixC2_6),
.matrixC2_7(matrixC2_7),
.matrixC2_8(matrixC2_8),
.matrixC2_9(matrixC2_9),
.matrixC2_10(matrixC2_10),
.matrixC2_11(matrixC2_11),
.matrixC2_12(matrixC2_12),
.matrixC2_13(matrixC2_13),
.matrixC2_14(matrixC2_14),
.matrixC2_15(matrixC2_15),
.matrixC3_0(matrixC3_0),
.matrixC3_1(matrixC3_1),
.matrixC3_2(matrixC3_2),
.matrixC3_3(matrixC3_3),
.matrixC3_4(matrixC3_4),
.matrixC3_5(matrixC3_5),
.matrixC3_6(matrixC3_6),
.matrixC3_7(matrixC3_7),
.matrixC3_8(matrixC3_8),
.matrixC3_9(matrixC3_9),
.matrixC3_10(matrixC3_10),
.matrixC3_11(matrixC3_11),
.matrixC3_12(matrixC3_12),
.matrixC3_13(matrixC3_13),
.matrixC3_14(matrixC3_14),
.matrixC3_15(matrixC3_15),
.matrixC4_0(matrixC4_0),
.matrixC4_1(matrixC4_1),
.matrixC4_2(matrixC4_2),
.matrixC4_3(matrixC4_3),
.matrixC4_4(matrixC4_4),
.matrixC4_5(matrixC4_5),
.matrixC4_6(matrixC4_6),
.matrixC4_7(matrixC4_7),
.matrixC4_8(matrixC4_8),
.matrixC4_9(matrixC4_9),
.matrixC4_10(matrixC4_10),
.matrixC4_11(matrixC4_11),
.matrixC4_12(matrixC4_12),
.matrixC4_13(matrixC4_13),
.matrixC4_14(matrixC4_14),
.matrixC4_15(matrixC4_15),
.matrixC5_0(matrixC5_0),
.matrixC5_1(matrixC5_1),
.matrixC5_2(matrixC5_2),
.matrixC5_3(matrixC5_3),
.matrixC5_4(matrixC5_4),
.matrixC5_5(matrixC5_5),
.matrixC5_6(matrixC5_6),
.matrixC5_7(matrixC5_7),
.matrixC5_8(matrixC5_8),
.matrixC5_9(matrixC5_9),
.matrixC5_10(matrixC5_10),
.matrixC5_11(matrixC5_11),
.matrixC5_12(matrixC5_12),
.matrixC5_13(matrixC5_13),
.matrixC5_14(matrixC5_14),
.matrixC5_15(matrixC5_15),
.matrixC6_0(matrixC6_0),
.matrixC6_1(matrixC6_1),
.matrixC6_2(matrixC6_2),
.matrixC6_3(matrixC6_3),
.matrixC6_4(matrixC6_4),
.matrixC6_5(matrixC6_5),
.matrixC6_6(matrixC6_6),
.matrixC6_7(matrixC6_7),
.matrixC6_8(matrixC6_8),
.matrixC6_9(matrixC6_9),
.matrixC6_10(matrixC6_10),
.matrixC6_11(matrixC6_11),
.matrixC6_12(matrixC6_12),
.matrixC6_13(matrixC6_13),
.matrixC6_14(matrixC6_14),
.matrixC6_15(matrixC6_15),
.matrixC7_0(matrixC7_0),
.matrixC7_1(matrixC7_1),
.matrixC7_2(matrixC7_2),
.matrixC7_3(matrixC7_3),
.matrixC7_4(matrixC7_4),
.matrixC7_5(matrixC7_5),
.matrixC7_6(matrixC7_6),
.matrixC7_7(matrixC7_7),
.matrixC7_8(matrixC7_8),
.matrixC7_9(matrixC7_9),
.matrixC7_10(matrixC7_10),
.matrixC7_11(matrixC7_11),
.matrixC7_12(matrixC7_12),
.matrixC7_13(matrixC7_13),
.matrixC7_14(matrixC7_14),
.matrixC7_15(matrixC7_15),
.matrixC8_0(matrixC8_0),
.matrixC8_1(matrixC8_1),
.matrixC8_2(matrixC8_2),
.matrixC8_3(matrixC8_3),
.matrixC8_4(matrixC8_4),
.matrixC8_5(matrixC8_5),
.matrixC8_6(matrixC8_6),
.matrixC8_7(matrixC8_7),
.matrixC8_8(matrixC8_8),
.matrixC8_9(matrixC8_9),
.matrixC8_10(matrixC8_10),
.matrixC8_11(matrixC8_11),
.matrixC8_12(matrixC8_12),
.matrixC8_13(matrixC8_13),
.matrixC8_14(matrixC8_14),
.matrixC8_15(matrixC8_15),
.matrixC9_0(matrixC9_0),
.matrixC9_1(matrixC9_1),
.matrixC9_2(matrixC9_2),
.matrixC9_3(matrixC9_3),
.matrixC9_4(matrixC9_4),
.matrixC9_5(matrixC9_5),
.matrixC9_6(matrixC9_6),
.matrixC9_7(matrixC9_7),
.matrixC9_8(matrixC9_8),
.matrixC9_9(matrixC9_9),
.matrixC9_10(matrixC9_10),
.matrixC9_11(matrixC9_11),
.matrixC9_12(matrixC9_12),
.matrixC9_13(matrixC9_13),
.matrixC9_14(matrixC9_14),
.matrixC9_15(matrixC9_15),
.matrixC10_0(matrixC10_0),
.matrixC10_1(matrixC10_1),
.matrixC10_2(matrixC10_2),
.matrixC10_3(matrixC10_3),
.matrixC10_4(matrixC10_4),
.matrixC10_5(matrixC10_5),
.matrixC10_6(matrixC10_6),
.matrixC10_7(matrixC10_7),
.matrixC10_8(matrixC10_8),
.matrixC10_9(matrixC10_9),
.matrixC10_10(matrixC10_10),
.matrixC10_11(matrixC10_11),
.matrixC10_12(matrixC10_12),
.matrixC10_13(matrixC10_13),
.matrixC10_14(matrixC10_14),
.matrixC10_15(matrixC10_15),
.matrixC11_0(matrixC11_0),
.matrixC11_1(matrixC11_1),
.matrixC11_2(matrixC11_2),
.matrixC11_3(matrixC11_3),
.matrixC11_4(matrixC11_4),
.matrixC11_5(matrixC11_5),
.matrixC11_6(matrixC11_6),
.matrixC11_7(matrixC11_7),
.matrixC11_8(matrixC11_8),
.matrixC11_9(matrixC11_9),
.matrixC11_10(matrixC11_10),
.matrixC11_11(matrixC11_11),
.matrixC11_12(matrixC11_12),
.matrixC11_13(matrixC11_13),
.matrixC11_14(matrixC11_14),
.matrixC11_15(matrixC11_15),
.matrixC12_0(matrixC12_0),
.matrixC12_1(matrixC12_1),
.matrixC12_2(matrixC12_2),
.matrixC12_3(matrixC12_3),
.matrixC12_4(matrixC12_4),
.matrixC12_5(matrixC12_5),
.matrixC12_6(matrixC12_6),
.matrixC12_7(matrixC12_7),
.matrixC12_8(matrixC12_8),
.matrixC12_9(matrixC12_9),
.matrixC12_10(matrixC12_10),
.matrixC12_11(matrixC12_11),
.matrixC12_12(matrixC12_12),
.matrixC12_13(matrixC12_13),
.matrixC12_14(matrixC12_14),
.matrixC12_15(matrixC12_15),
.matrixC13_0(matrixC13_0),
.matrixC13_1(matrixC13_1),
.matrixC13_2(matrixC13_2),
.matrixC13_3(matrixC13_3),
.matrixC13_4(matrixC13_4),
.matrixC13_5(matrixC13_5),
.matrixC13_6(matrixC13_6),
.matrixC13_7(matrixC13_7),
.matrixC13_8(matrixC13_8),
.matrixC13_9(matrixC13_9),
.matrixC13_10(matrixC13_10),
.matrixC13_11(matrixC13_11),
.matrixC13_12(matrixC13_12),
.matrixC13_13(matrixC13_13),
.matrixC13_14(matrixC13_14),
.matrixC13_15(matrixC13_15),
.matrixC14_0(matrixC14_0),
.matrixC14_1(matrixC14_1),
.matrixC14_2(matrixC14_2),
.matrixC14_3(matrixC14_3),
.matrixC14_4(matrixC14_4),
.matrixC14_5(matrixC14_5),
.matrixC14_6(matrixC14_6),
.matrixC14_7(matrixC14_7),
.matrixC14_8(matrixC14_8),
.matrixC14_9(matrixC14_9),
.matrixC14_10(matrixC14_10),
.matrixC14_11(matrixC14_11),
.matrixC14_12(matrixC14_12),
.matrixC14_13(matrixC14_13),
.matrixC14_14(matrixC14_14),
.matrixC14_15(matrixC14_15),
.matrixC15_0(matrixC15_0),
.matrixC15_1(matrixC15_1),
.matrixC15_2(matrixC15_2),
.matrixC15_3(matrixC15_3),
.matrixC15_4(matrixC15_4),
.matrixC15_5(matrixC15_5),
.matrixC15_6(matrixC15_6),
.matrixC15_7(matrixC15_7),
.matrixC15_8(matrixC15_8),
.matrixC15_9(matrixC15_9),
.matrixC15_10(matrixC15_10),
.matrixC15_11(matrixC15_11),
.matrixC15_12(matrixC15_12),
.matrixC15_13(matrixC15_13),
.matrixC15_14(matrixC15_14),
.matrixC15_15(matrixC15_15),
.a_data_out(a_data_out),
.b_data_out(b_data_out)
);
endmodule
//////////////////////////////////////////////////////////////////////////
// Output logic
//////////////////////////////////////////////////////////////////////////
module output_logic(
start_mat_mul,
done_mat_mul,
address_mat_c,
address_stride_c,
c_data_in,
c_data_out, //Data values going out to next matmul - systolic shifting
c_addr,
c_data_available,
clk_cnt,
row_latch_en,
final_mat_mul_size,
matrixC0_0,
matrixC0_1,
matrixC0_2,
matrixC0_3,
matrixC0_4,
matrixC0_5,
matrixC0_6,
matrixC0_7,
matrixC0_8,
matrixC0_9,
matrixC0_10,
matrixC0_11,
matrixC0_12,
matrixC0_13,
matrixC0_14,
matrixC0_15,
matrixC1_0,
matrixC1_1,
matrixC1_2,
matrixC1_3,
matrixC1_4,
matrixC1_5,
matrixC1_6,
matrixC1_7,
matrixC1_8,
matrixC1_9,
matrixC1_10,
matrixC1_11,
matrixC1_12,
matrixC1_13,
matrixC1_14,
matrixC1_15,
matrixC2_0,
matrixC2_1,
matrixC2_2,
matrixC2_3,
matrixC2_4,
matrixC2_5,
matrixC2_6,
matrixC2_7,
matrixC2_8,
matrixC2_9,
matrixC2_10,
matrixC2_11,
matrixC2_12,
matrixC2_13,
matrixC2_14,
matrixC2_15,
matrixC3_0,
matrixC3_1,
matrixC3_2,
matrixC3_3,
matrixC3_4,
matrixC3_5,
matrixC3_6,
matrixC3_7,
matrixC3_8,
matrixC3_9,
matrixC3_10,
matrixC3_11,
matrixC3_12,
matrixC3_13,
matrixC3_14,
matrixC3_15,
matrixC4_0,
matrixC4_1,
matrixC4_2,
matrixC4_3,
matrixC4_4,
matrixC4_5,
matrixC4_6,
matrixC4_7,
matrixC4_8,
matrixC4_9,
matrixC4_10,
matrixC4_11,
matrixC4_12,
matrixC4_13,
matrixC4_14,
matrixC4_15,
matrixC5_0,
matrixC5_1,
matrixC5_2,
matrixC5_3,
matrixC5_4,
matrixC5_5,
matrixC5_6,
matrixC5_7,
matrixC5_8,
matrixC5_9,
matrixC5_10,
matrixC5_11,
matrixC5_12,
matrixC5_13,
matrixC5_14,
matrixC5_15,
matrixC6_0,
matrixC6_1,
matrixC6_2,
matrixC6_3,
matrixC6_4,
matrixC6_5,
matrixC6_6,
matrixC6_7,
matrixC6_8,
matrixC6_9,
matrixC6_10,
matrixC6_11,
matrixC6_12,
matrixC6_13,
matrixC6_14,
matrixC6_15,
matrixC7_0,
matrixC7_1,
matrixC7_2,
matrixC7_3,
matrixC7_4,
matrixC7_5,
matrixC7_6,
matrixC7_7,
matrixC7_8,
matrixC7_9,
matrixC7_10,
matrixC7_11,
matrixC7_12,
matrixC7_13,
matrixC7_14,
matrixC7_15,
matrixC8_0,
matrixC8_1,
matrixC8_2,
matrixC8_3,
matrixC8_4,
matrixC8_5,
matrixC8_6,
matrixC8_7,
matrixC8_8,
matrixC8_9,
matrixC8_10,
matrixC8_11,
matrixC8_12,
matrixC8_13,
matrixC8_14,
matrixC8_15,
matrixC9_0,
matrixC9_1,
matrixC9_2,
matrixC9_3,
matrixC9_4,
matrixC9_5,
matrixC9_6,
matrixC9_7,
matrixC9_8,
matrixC9_9,
matrixC9_10,
matrixC9_11,
matrixC9_12,
matrixC9_13,
matrixC9_14,
matrixC9_15,
matrixC10_0,
matrixC10_1,
matrixC10_2,
matrixC10_3,
matrixC10_4,
matrixC10_5,
matrixC10_6,
matrixC10_7,
matrixC10_8,
matrixC10_9,
matrixC10_10,
matrixC10_11,
matrixC10_12,
matrixC10_13,
matrixC10_14,
matrixC10_15,
matrixC11_0,
matrixC11_1,
matrixC11_2,
matrixC11_3,
matrixC11_4,
matrixC11_5,
matrixC11_6,
matrixC11_7,
matrixC11_8,
matrixC11_9,
matrixC11_10,
matrixC11_11,
matrixC11_12,
matrixC11_13,
matrixC11_14,
matrixC11_15,
matrixC12_0,
matrixC12_1,
matrixC12_2,
matrixC12_3,
matrixC12_4,
matrixC12_5,
matrixC12_6,
matrixC12_7,
matrixC12_8,
matrixC12_9,
matrixC12_10,
matrixC12_11,
matrixC12_12,
matrixC12_13,
matrixC12_14,
matrixC12_15,
matrixC13_0,
matrixC13_1,
matrixC13_2,
matrixC13_3,
matrixC13_4,
matrixC13_5,
matrixC13_6,
matrixC13_7,
matrixC13_8,
matrixC13_9,
matrixC13_10,
matrixC13_11,
matrixC13_12,
matrixC13_13,
matrixC13_14,
matrixC13_15,
matrixC14_0,
matrixC14_1,
matrixC14_2,
matrixC14_3,
matrixC14_4,
matrixC14_5,
matrixC14_6,
matrixC14_7,
matrixC14_8,
matrixC14_9,
matrixC14_10,
matrixC14_11,
matrixC14_12,
matrixC14_13,
matrixC14_14,
matrixC14_15,
matrixC15_0,
matrixC15_1,
matrixC15_2,
matrixC15_3,
matrixC15_4,
matrixC15_5,
matrixC15_6,
matrixC15_7,
matrixC15_8,
matrixC15_9,
matrixC15_10,
matrixC15_11,
matrixC15_12,
matrixC15_13,
matrixC15_14,
matrixC15_15,
clk,
reset
);
input clk;
input reset;
input start_mat_mul;
input done_mat_mul;
input [`AWIDTH-1:0] address_mat_c;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_c;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] c_data_in;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] c_data_out;
output [`AWIDTH-1:0] c_addr;
output c_data_available;
input [7:0] clk_cnt;
output row_latch_en;
input [7:0] final_mat_mul_size;
input [`DWIDTH-1:0] matrixC0_0;
input [`DWIDTH-1:0] matrixC0_1;
input [`DWIDTH-1:0] matrixC0_2;
input [`DWIDTH-1:0] matrixC0_3;
input [`DWIDTH-1:0] matrixC0_4;
input [`DWIDTH-1:0] matrixC0_5;
input [`DWIDTH-1:0] matrixC0_6;
input [`DWIDTH-1:0] matrixC0_7;
input [`DWIDTH-1:0] matrixC0_8;
input [`DWIDTH-1:0] matrixC0_9;
input [`DWIDTH-1:0] matrixC0_10;
input [`DWIDTH-1:0] matrixC0_11;
input [`DWIDTH-1:0] matrixC0_12;
input [`DWIDTH-1:0] matrixC0_13;
input [`DWIDTH-1:0] matrixC0_14;
input [`DWIDTH-1:0] matrixC0_15;
input [`DWIDTH-1:0] matrixC1_0;
input [`DWIDTH-1:0] matrixC1_1;
input [`DWIDTH-1:0] matrixC1_2;
input [`DWIDTH-1:0] matrixC1_3;
input [`DWIDTH-1:0] matrixC1_4;
input [`DWIDTH-1:0] matrixC1_5;
input [`DWIDTH-1:0] matrixC1_6;
input [`DWIDTH-1:0] matrixC1_7;
input [`DWIDTH-1:0] matrixC1_8;
input [`DWIDTH-1:0] matrixC1_9;
input [`DWIDTH-1:0] matrixC1_10;
input [`DWIDTH-1:0] matrixC1_11;
input [`DWIDTH-1:0] matrixC1_12;
input [`DWIDTH-1:0] matrixC1_13;
input [`DWIDTH-1:0] matrixC1_14;
input [`DWIDTH-1:0] matrixC1_15;
input [`DWIDTH-1:0] matrixC2_0;
input [`DWIDTH-1:0] matrixC2_1;
input [`DWIDTH-1:0] matrixC2_2;
input [`DWIDTH-1:0] matrixC2_3;
input [`DWIDTH-1:0] matrixC2_4;
input [`DWIDTH-1:0] matrixC2_5;
input [`DWIDTH-1:0] matrixC2_6;
input [`DWIDTH-1:0] matrixC2_7;
input [`DWIDTH-1:0] matrixC2_8;
input [`DWIDTH-1:0] matrixC2_9;
input [`DWIDTH-1:0] matrixC2_10;
input [`DWIDTH-1:0] matrixC2_11;
input [`DWIDTH-1:0] matrixC2_12;
input [`DWIDTH-1:0] matrixC2_13;
input [`DWIDTH-1:0] matrixC2_14;
input [`DWIDTH-1:0] matrixC2_15;
input [`DWIDTH-1:0] matrixC3_0;
input [`DWIDTH-1:0] matrixC3_1;
input [`DWIDTH-1:0] matrixC3_2;
input [`DWIDTH-1:0] matrixC3_3;
input [`DWIDTH-1:0] matrixC3_4;
input [`DWIDTH-1:0] matrixC3_5;
input [`DWIDTH-1:0] matrixC3_6;
input [`DWIDTH-1:0] matrixC3_7;
input [`DWIDTH-1:0] matrixC3_8;
input [`DWIDTH-1:0] matrixC3_9;
input [`DWIDTH-1:0] matrixC3_10;
input [`DWIDTH-1:0] matrixC3_11;
input [`DWIDTH-1:0] matrixC3_12;
input [`DWIDTH-1:0] matrixC3_13;
input [`DWIDTH-1:0] matrixC3_14;
input [`DWIDTH-1:0] matrixC3_15;
input [`DWIDTH-1:0] matrixC4_0;
input [`DWIDTH-1:0] matrixC4_1;
input [`DWIDTH-1:0] matrixC4_2;
input [`DWIDTH-1:0] matrixC4_3;
input [`DWIDTH-1:0] matrixC4_4;
input [`DWIDTH-1:0] matrixC4_5;
input [`DWIDTH-1:0] matrixC4_6;
input [`DWIDTH-1:0] matrixC4_7;
input [`DWIDTH-1:0] matrixC4_8;
input [`DWIDTH-1:0] matrixC4_9;
input [`DWIDTH-1:0] matrixC4_10;
input [`DWIDTH-1:0] matrixC4_11;
input [`DWIDTH-1:0] matrixC4_12;
input [`DWIDTH-1:0] matrixC4_13;
input [`DWIDTH-1:0] matrixC4_14;
input [`DWIDTH-1:0] matrixC4_15;
input [`DWIDTH-1:0] matrixC5_0;
input [`DWIDTH-1:0] matrixC5_1;
input [`DWIDTH-1:0] matrixC5_2;
input [`DWIDTH-1:0] matrixC5_3;
input [`DWIDTH-1:0] matrixC5_4;
input [`DWIDTH-1:0] matrixC5_5;
input [`DWIDTH-1:0] matrixC5_6;
input [`DWIDTH-1:0] matrixC5_7;
input [`DWIDTH-1:0] matrixC5_8;
input [`DWIDTH-1:0] matrixC5_9;
input [`DWIDTH-1:0] matrixC5_10;
input [`DWIDTH-1:0] matrixC5_11;
input [`DWIDTH-1:0] matrixC5_12;
input [`DWIDTH-1:0] matrixC5_13;
input [`DWIDTH-1:0] matrixC5_14;
input [`DWIDTH-1:0] matrixC5_15;
input [`DWIDTH-1:0] matrixC6_0;
input [`DWIDTH-1:0] matrixC6_1;
input [`DWIDTH-1:0] matrixC6_2;
input [`DWIDTH-1:0] matrixC6_3;
input [`DWIDTH-1:0] matrixC6_4;
input [`DWIDTH-1:0] matrixC6_5;
input [`DWIDTH-1:0] matrixC6_6;
input [`DWIDTH-1:0] matrixC6_7;
input [`DWIDTH-1:0] matrixC6_8;
input [`DWIDTH-1:0] matrixC6_9;
input [`DWIDTH-1:0] matrixC6_10;
input [`DWIDTH-1:0] matrixC6_11;
input [`DWIDTH-1:0] matrixC6_12;
input [`DWIDTH-1:0] matrixC6_13;
input [`DWIDTH-1:0] matrixC6_14;
input [`DWIDTH-1:0] matrixC6_15;
input [`DWIDTH-1:0] matrixC7_0;
input [`DWIDTH-1:0] matrixC7_1;
input [`DWIDTH-1:0] matrixC7_2;
input [`DWIDTH-1:0] matrixC7_3;
input [`DWIDTH-1:0] matrixC7_4;
input [`DWIDTH-1:0] matrixC7_5;
input [`DWIDTH-1:0] matrixC7_6;
input [`DWIDTH-1:0] matrixC7_7;
input [`DWIDTH-1:0] matrixC7_8;
input [`DWIDTH-1:0] matrixC7_9;
input [`DWIDTH-1:0] matrixC7_10;
input [`DWIDTH-1:0] matrixC7_11;
input [`DWIDTH-1:0] matrixC7_12;
input [`DWIDTH-1:0] matrixC7_13;
input [`DWIDTH-1:0] matrixC7_14;
input [`DWIDTH-1:0] matrixC7_15;
input [`DWIDTH-1:0] matrixC8_0;
input [`DWIDTH-1:0] matrixC8_1;
input [`DWIDTH-1:0] matrixC8_2;
input [`DWIDTH-1:0] matrixC8_3;
input [`DWIDTH-1:0] matrixC8_4;
input [`DWIDTH-1:0] matrixC8_5;
input [`DWIDTH-1:0] matrixC8_6;
input [`DWIDTH-1:0] matrixC8_7;
input [`DWIDTH-1:0] matrixC8_8;
input [`DWIDTH-1:0] matrixC8_9;
input [`DWIDTH-1:0] matrixC8_10;
input [`DWIDTH-1:0] matrixC8_11;
input [`DWIDTH-1:0] matrixC8_12;
input [`DWIDTH-1:0] matrixC8_13;
input [`DWIDTH-1:0] matrixC8_14;
input [`DWIDTH-1:0] matrixC8_15;
input [`DWIDTH-1:0] matrixC9_0;
input [`DWIDTH-1:0] matrixC9_1;
input [`DWIDTH-1:0] matrixC9_2;
input [`DWIDTH-1:0] matrixC9_3;
input [`DWIDTH-1:0] matrixC9_4;
input [`DWIDTH-1:0] matrixC9_5;
input [`DWIDTH-1:0] matrixC9_6;
input [`DWIDTH-1:0] matrixC9_7;
input [`DWIDTH-1:0] matrixC9_8;
input [`DWIDTH-1:0] matrixC9_9;
input [`DWIDTH-1:0] matrixC9_10;
input [`DWIDTH-1:0] matrixC9_11;
input [`DWIDTH-1:0] matrixC9_12;
input [`DWIDTH-1:0] matrixC9_13;
input [`DWIDTH-1:0] matrixC9_14;
input [`DWIDTH-1:0] matrixC9_15;
input [`DWIDTH-1:0] matrixC10_0;
input [`DWIDTH-1:0] matrixC10_1;
input [`DWIDTH-1:0] matrixC10_2;
input [`DWIDTH-1:0] matrixC10_3;
input [`DWIDTH-1:0] matrixC10_4;
input [`DWIDTH-1:0] matrixC10_5;
input [`DWIDTH-1:0] matrixC10_6;
input [`DWIDTH-1:0] matrixC10_7;
input [`DWIDTH-1:0] matrixC10_8;
input [`DWIDTH-1:0] matrixC10_9;
input [`DWIDTH-1:0] matrixC10_10;
input [`DWIDTH-1:0] matrixC10_11;
input [`DWIDTH-1:0] matrixC10_12;
input [`DWIDTH-1:0] matrixC10_13;
input [`DWIDTH-1:0] matrixC10_14;
input [`DWIDTH-1:0] matrixC10_15;
input [`DWIDTH-1:0] matrixC11_0;
input [`DWIDTH-1:0] matrixC11_1;
input [`DWIDTH-1:0] matrixC11_2;
input [`DWIDTH-1:0] matrixC11_3;
input [`DWIDTH-1:0] matrixC11_4;
input [`DWIDTH-1:0] matrixC11_5;
input [`DWIDTH-1:0] matrixC11_6;
input [`DWIDTH-1:0] matrixC11_7;
input [`DWIDTH-1:0] matrixC11_8;
input [`DWIDTH-1:0] matrixC11_9;
input [`DWIDTH-1:0] matrixC11_10;
input [`DWIDTH-1:0] matrixC11_11;
input [`DWIDTH-1:0] matrixC11_12;
input [`DWIDTH-1:0] matrixC11_13;
input [`DWIDTH-1:0] matrixC11_14;
input [`DWIDTH-1:0] matrixC11_15;
input [`DWIDTH-1:0] matrixC12_0;
input [`DWIDTH-1:0] matrixC12_1;
input [`DWIDTH-1:0] matrixC12_2;
input [`DWIDTH-1:0] matrixC12_3;
input [`DWIDTH-1:0] matrixC12_4;
input [`DWIDTH-1:0] matrixC12_5;
input [`DWIDTH-1:0] matrixC12_6;
input [`DWIDTH-1:0] matrixC12_7;
input [`DWIDTH-1:0] matrixC12_8;
input [`DWIDTH-1:0] matrixC12_9;
input [`DWIDTH-1:0] matrixC12_10;
input [`DWIDTH-1:0] matrixC12_11;
input [`DWIDTH-1:0] matrixC12_12;
input [`DWIDTH-1:0] matrixC12_13;
input [`DWIDTH-1:0] matrixC12_14;
input [`DWIDTH-1:0] matrixC12_15;
input [`DWIDTH-1:0] matrixC13_0;
input [`DWIDTH-1:0] matrixC13_1;
input [`DWIDTH-1:0] matrixC13_2;
input [`DWIDTH-1:0] matrixC13_3;
input [`DWIDTH-1:0] matrixC13_4;
input [`DWIDTH-1:0] matrixC13_5;
input [`DWIDTH-1:0] matrixC13_6;
input [`DWIDTH-1:0] matrixC13_7;
input [`DWIDTH-1:0] matrixC13_8;
input [`DWIDTH-1:0] matrixC13_9;
input [`DWIDTH-1:0] matrixC13_10;
input [`DWIDTH-1:0] matrixC13_11;
input [`DWIDTH-1:0] matrixC13_12;
input [`DWIDTH-1:0] matrixC13_13;
input [`DWIDTH-1:0] matrixC13_14;
input [`DWIDTH-1:0] matrixC13_15;
input [`DWIDTH-1:0] matrixC14_0;
input [`DWIDTH-1:0] matrixC14_1;
input [`DWIDTH-1:0] matrixC14_2;
input [`DWIDTH-1:0] matrixC14_3;
input [`DWIDTH-1:0] matrixC14_4;
input [`DWIDTH-1:0] matrixC14_5;
input [`DWIDTH-1:0] matrixC14_6;
input [`DWIDTH-1:0] matrixC14_7;
input [`DWIDTH-1:0] matrixC14_8;
input [`DWIDTH-1:0] matrixC14_9;
input [`DWIDTH-1:0] matrixC14_10;
input [`DWIDTH-1:0] matrixC14_11;
input [`DWIDTH-1:0] matrixC14_12;
input [`DWIDTH-1:0] matrixC14_13;
input [`DWIDTH-1:0] matrixC14_14;
input [`DWIDTH-1:0] matrixC14_15;
input [`DWIDTH-1:0] matrixC15_0;
input [`DWIDTH-1:0] matrixC15_1;
input [`DWIDTH-1:0] matrixC15_2;
input [`DWIDTH-1:0] matrixC15_3;
input [`DWIDTH-1:0] matrixC15_4;
input [`DWIDTH-1:0] matrixC15_5;
input [`DWIDTH-1:0] matrixC15_6;
input [`DWIDTH-1:0] matrixC15_7;
input [`DWIDTH-1:0] matrixC15_8;
input [`DWIDTH-1:0] matrixC15_9;
input [`DWIDTH-1:0] matrixC15_10;
input [`DWIDTH-1:0] matrixC15_11;
input [`DWIDTH-1:0] matrixC15_12;
input [`DWIDTH-1:0] matrixC15_13;
input [`DWIDTH-1:0] matrixC15_14;
input [`DWIDTH-1:0] matrixC15_15;
wire row_latch_en;
//////////////////////////////////////////////////////////////////////////
// Logic to capture matrix C data from the PEs and shift it out
//////////////////////////////////////////////////////////////////////////
//assign row_latch_en = (clk_cnt==(`MAT_MUL_SIZE + (a_loc+b_loc) * `BB_MAT_MUL_SIZE + 10 + `NUM_CYCLES_IN_MAC - 1));
//Writing the line above to avoid multiplication:
//assign row_latch_en = (clk_cnt==(`MAT_MUL_SIZE + ((a_loc+b_loc) << `LOG2_MAT_MUL_SIZE) + 10 + `NUM_CYCLES_IN_MAC - 1));
assign row_latch_en =
((clk_cnt == ((final_mat_mul_size<<2) - final_mat_mul_size - 1 +`NUM_CYCLES_IN_MAC)));
reg c_data_available;
reg [`AWIDTH-1:0] c_addr;
reg start_capturing_c_data;
integer counter;
reg [16*`DWIDTH-1:0] c_data_out;
reg [16*`DWIDTH-1:0] c_data_out_1;
reg [16*`DWIDTH-1:0] c_data_out_2;
reg [16*`DWIDTH-1:0] c_data_out_3;
reg [16*`DWIDTH-1:0] c_data_out_4;
reg [16*`DWIDTH-1:0] c_data_out_5;
reg [16*`DWIDTH-1:0] c_data_out_6;
reg [16*`DWIDTH-1:0] c_data_out_7;
reg [16*`DWIDTH-1:0] c_data_out_8;
reg [16*`DWIDTH-1:0] c_data_out_9;
reg [16*`DWIDTH-1:0] c_data_out_10;
reg [16*`DWIDTH-1:0] c_data_out_11;
reg [16*`DWIDTH-1:0] c_data_out_12;
reg [16*`DWIDTH-1:0] c_data_out_13;
reg [16*`DWIDTH-1:0] c_data_out_14;
reg [16*`DWIDTH-1:0] c_data_out_15;
wire condition_to_start_shifting_output;
assign condition_to_start_shifting_output =
row_latch_en ;
//For larger matmuls, this logic will have more entries in the case statement
always @(posedge clk) begin
if (reset | ~start_mat_mul) begin
start_capturing_c_data <= 1'b0;
c_data_available <= 1'b0;
c_addr <= address_mat_c + address_stride_c;
c_data_out <= 0;
counter <= 0;
c_data_out_1 <= 0;
c_data_out_2 <= 0;
c_data_out_3 <= 0;
c_data_out_4 <= 0;
c_data_out_5 <= 0;
c_data_out_6 <= 0;
c_data_out_7 <= 0;
c_data_out_8 <= 0;
c_data_out_9 <= 0;
c_data_out_10 <= 0;
c_data_out_11 <= 0;
c_data_out_12 <= 0;
c_data_out_13 <= 0;
c_data_out_14 <= 0;
c_data_out_15 <= 0;
end else if (condition_to_start_shifting_output) begin
start_capturing_c_data <= 1'b1;
c_data_available <= 1'b1;
c_addr <= c_addr - address_stride_c;
c_data_out <= {matrixC15_15, matrixC14_15, matrixC13_15, matrixC12_15, matrixC11_15, matrixC10_15, matrixC9_15, matrixC8_15, matrixC7_15, matrixC6_15, matrixC5_15, matrixC4_15, matrixC3_15, matrixC2_15, matrixC1_15, matrixC0_15};
c_data_out_1 <= {matrixC15_14, matrixC14_14, matrixC13_14, matrixC12_14, matrixC11_14, matrixC10_14, matrixC9_14, matrixC8_14, matrixC7_14, matrixC6_14, matrixC5_14, matrixC4_14, matrixC3_14, matrixC2_14, matrixC1_14, matrixC0_14};
c_data_out_2 <= {matrixC15_13, matrixC14_13, matrixC13_13, matrixC12_13, matrixC11_13, matrixC10_13, matrixC9_13, matrixC8_13, matrixC7_13, matrixC6_13, matrixC5_13, matrixC4_13, matrixC3_13, matrixC2_13, matrixC1_13, matrixC0_13};
c_data_out_3 <= {matrixC15_12, matrixC14_12, matrixC13_12, matrixC12_12, matrixC11_12, matrixC10_12, matrixC9_12, matrixC8_12, matrixC7_12, matrixC6_12, matrixC5_12, matrixC4_12, matrixC3_12, matrixC2_12, matrixC1_12, matrixC0_12};
c_data_out_4 <= {matrixC15_11, matrixC14_11, matrixC13_11, matrixC12_11, matrixC11_11, matrixC10_11, matrixC9_11, matrixC8_11, matrixC7_11, matrixC6_11, matrixC5_11, matrixC4_11, matrixC3_11, matrixC2_11, matrixC1_11, matrixC0_11};
c_data_out_5 <= {matrixC15_10, matrixC14_10, matrixC13_10, matrixC12_10, matrixC11_10, matrixC10_10, matrixC9_10, matrixC8_10, matrixC7_10, matrixC6_10, matrixC5_10, matrixC4_10, matrixC3_10, matrixC2_10, matrixC1_10, matrixC0_10};
c_data_out_6 <= {matrixC15_9, matrixC14_9, matrixC13_9, matrixC12_9, matrixC11_9, matrixC10_9, matrixC9_9, matrixC8_9, matrixC7_9, matrixC6_9, matrixC5_9, matrixC4_9, matrixC3_9, matrixC2_9, matrixC1_9, matrixC0_9};
c_data_out_7 <= {matrixC15_8, matrixC14_8, matrixC13_8, matrixC12_8, matrixC11_8, matrixC10_8, matrixC9_8, matrixC8_8, matrixC7_8, matrixC6_8, matrixC5_8, matrixC4_8, matrixC3_8, matrixC2_8, matrixC1_8, matrixC0_8};
c_data_out_8 <= {matrixC15_7, matrixC14_7, matrixC13_7, matrixC12_7, matrixC11_7, matrixC10_7, matrixC9_7, matrixC8_7, matrixC7_7, matrixC6_7, matrixC5_7, matrixC4_7, matrixC3_7, matrixC2_7, matrixC1_7, matrixC0_7};
c_data_out_9 <= {matrixC15_6, matrixC14_6, matrixC13_6, matrixC12_6, matrixC11_6, matrixC10_6, matrixC9_6, matrixC8_6, matrixC7_6, matrixC6_6, matrixC5_6, matrixC4_6, matrixC3_6, matrixC2_6, matrixC1_6, matrixC0_6};
c_data_out_10 <= {matrixC15_5, matrixC14_5, matrixC13_5, matrixC12_5, matrixC11_5, matrixC10_5, matrixC9_5, matrixC8_5, matrixC7_5, matrixC6_5, matrixC5_5, matrixC4_5, matrixC3_5, matrixC2_5, matrixC1_5, matrixC0_5};
c_data_out_11 <= {matrixC15_4, matrixC14_4, matrixC13_4, matrixC12_4, matrixC11_4, matrixC10_4, matrixC9_4, matrixC8_4, matrixC7_4, matrixC6_4, matrixC5_4, matrixC4_4, matrixC3_4, matrixC2_4, matrixC1_4, matrixC0_4};
c_data_out_12 <= {matrixC15_3, matrixC14_3, matrixC13_3, matrixC12_3, matrixC11_3, matrixC10_3, matrixC9_3, matrixC8_3, matrixC7_3, matrixC6_3, matrixC5_3, matrixC4_3, matrixC3_3, matrixC2_3, matrixC1_3, matrixC0_3};
c_data_out_13 <= {matrixC15_2, matrixC14_2, matrixC13_2, matrixC12_2, matrixC11_2, matrixC10_2, matrixC9_2, matrixC8_2, matrixC7_2, matrixC6_2, matrixC5_2, matrixC4_2, matrixC3_2, matrixC2_2, matrixC1_2, matrixC0_2};
c_data_out_14 <= {matrixC15_1, matrixC14_1, matrixC13_1, matrixC12_1, matrixC11_1, matrixC10_1, matrixC9_1, matrixC8_1, matrixC7_1, matrixC6_1, matrixC5_1, matrixC4_1, matrixC3_1, matrixC2_1, matrixC1_1, matrixC0_1};
c_data_out_15 <= {matrixC15_0, matrixC14_0, matrixC13_0, matrixC12_0, matrixC11_0, matrixC10_0, matrixC9_0, matrixC8_0, matrixC7_0, matrixC6_0, matrixC5_0, matrixC4_0, matrixC3_0, matrixC2_0, matrixC1_0, matrixC0_0};
counter <= counter + 1;
end else if (done_mat_mul) begin
start_capturing_c_data <= 1'b0;
c_data_available <= 1'b0;
c_addr <= address_mat_c + address_stride_c;
c_data_out <= 0;
c_data_out_1 <= 0;
c_data_out_2 <= 0;
c_data_out_3 <= 0;
c_data_out_4 <= 0;
c_data_out_5 <= 0;
c_data_out_6 <= 0;
c_data_out_7 <= 0;
c_data_out_8 <= 0;
c_data_out_9 <= 0;
c_data_out_10 <= 0;
c_data_out_11 <= 0;
c_data_out_12 <= 0;
c_data_out_13 <= 0;
c_data_out_14 <= 0;
c_data_out_15 <= 0;
end
else if (counter >= `MAT_MUL_SIZE) begin
c_data_out <= c_data_out_1;
c_addr <= c_addr - address_stride_c;
c_data_out_1 <= c_data_out_2;
c_data_out_2 <= c_data_out_3;
c_data_out_3 <= c_data_out_4;
c_data_out_4 <= c_data_out_5;
c_data_out_5 <= c_data_out_6;
c_data_out_6 <= c_data_out_7;
c_data_out_7 <= c_data_out_8;
c_data_out_8 <= c_data_out_9;
c_data_out_9 <= c_data_out_10;
c_data_out_10 <= c_data_out_11;
c_data_out_11 <= c_data_out_12;
c_data_out_12 <= c_data_out_13;
c_data_out_13 <= c_data_out_14;
c_data_out_14 <= c_data_out_15;
c_data_out_15 <= c_data_in;
end
else if (start_capturing_c_data) begin
c_data_available <= 1'b1;
c_addr <= c_addr - address_stride_c;
counter <= counter + 1;
c_data_out <= c_data_out_1;
c_data_out_1 <= c_data_out_2;
c_data_out_2 <= c_data_out_3;
c_data_out_3 <= c_data_out_4;
c_data_out_4 <= c_data_out_5;
c_data_out_5 <= c_data_out_6;
c_data_out_6 <= c_data_out_7;
c_data_out_7 <= c_data_out_8;
c_data_out_8 <= c_data_out_9;
c_data_out_9 <= c_data_out_10;
c_data_out_10 <= c_data_out_11;
c_data_out_11 <= c_data_out_12;
c_data_out_12 <= c_data_out_13;
c_data_out_13 <= c_data_out_14;
c_data_out_14 <= c_data_out_15;
c_data_out_15 <= c_data_in;
end
end
endmodule
//////////////////////////////////////////////////////////////////////////
// Systolic data setup
//////////////////////////////////////////////////////////////////////////
module systolic_data_setup(
clk,
reset,
start_mat_mul,
a_addr,
b_addr,
address_mat_a,
address_mat_b,
address_stride_a,
address_stride_b,
a_data,
b_data,
clk_cnt,
a0_data,
b0_data,
a1_data_delayed_1,
b1_data_delayed_1,
a2_data_delayed_2,
b2_data_delayed_2,
a3_data_delayed_3,
b3_data_delayed_3,
a4_data_delayed_4,
b4_data_delayed_4,
a5_data_delayed_5,
b5_data_delayed_5,
a6_data_delayed_6,
b6_data_delayed_6,
a7_data_delayed_7,
b7_data_delayed_7,
a8_data_delayed_8,
b8_data_delayed_8,
a9_data_delayed_9,
b9_data_delayed_9,
a10_data_delayed_10,
b10_data_delayed_10,
a11_data_delayed_11,
b11_data_delayed_11,
a12_data_delayed_12,
b12_data_delayed_12,
a13_data_delayed_13,
b13_data_delayed_13,
a14_data_delayed_14,
b14_data_delayed_14,
a15_data_delayed_15,
b15_data_delayed_15,
validity_mask_a_rows,
validity_mask_a_cols,
validity_mask_b_rows,
validity_mask_b_cols,
final_mat_mul_size,
a_loc,
b_loc
);
input clk;
input reset;
input start_mat_mul;
output [`AWIDTH-1:0] a_addr;
output [`AWIDTH-1:0] b_addr;
input [`AWIDTH-1:0] address_mat_a;
input [`AWIDTH-1:0] address_mat_b;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_a;
input [`ADDR_STRIDE_WIDTH-1:0] address_stride_b;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] a_data;
input [`MAT_MUL_SIZE*`DWIDTH-1:0] b_data;
input [7:0] clk_cnt;
output [`DWIDTH-1:0] a0_data;
output [`DWIDTH-1:0] b0_data;
output [`DWIDTH-1:0] a1_data_delayed_1;
output [`DWIDTH-1:0] b1_data_delayed_1;
output [`DWIDTH-1:0] a2_data_delayed_2;
output [`DWIDTH-1:0] b2_data_delayed_2;
output [`DWIDTH-1:0] a3_data_delayed_3;
output [`DWIDTH-1:0] b3_data_delayed_3;
output [`DWIDTH-1:0] a4_data_delayed_4;
output [`DWIDTH-1:0] b4_data_delayed_4;
output [`DWIDTH-1:0] a5_data_delayed_5;
output [`DWIDTH-1:0] b5_data_delayed_5;
output [`DWIDTH-1:0] a6_data_delayed_6;
output [`DWIDTH-1:0] b6_data_delayed_6;
output [`DWIDTH-1:0] a7_data_delayed_7;
output [`DWIDTH-1:0] b7_data_delayed_7;
output [`DWIDTH-1:0] a8_data_delayed_8;
output [`DWIDTH-1:0] b8_data_delayed_8;
output [`DWIDTH-1:0] a9_data_delayed_9;
output [`DWIDTH-1:0] b9_data_delayed_9;
output [`DWIDTH-1:0] a10_data_delayed_10;
output [`DWIDTH-1:0] b10_data_delayed_10;
output [`DWIDTH-1:0] a11_data_delayed_11;
output [`DWIDTH-1:0] b11_data_delayed_11;
output [`DWIDTH-1:0] a12_data_delayed_12;
output [`DWIDTH-1:0] b12_data_delayed_12;
output [`DWIDTH-1:0] a13_data_delayed_13;
output [`DWIDTH-1:0] b13_data_delayed_13;
output [`DWIDTH-1:0] a14_data_delayed_14;
output [`DWIDTH-1:0] b14_data_delayed_14;
output [`DWIDTH-1:0] a15_data_delayed_15;
output [`DWIDTH-1:0] b15_data_delayed_15;
input [`MASK_WIDTH-1:0] validity_mask_a_rows;
input [`MASK_WIDTH-1:0] validity_mask_a_cols;
input [`MASK_WIDTH-1:0] validity_mask_b_rows;
input [`MASK_WIDTH-1:0] validity_mask_b_cols;
input [7:0] final_mat_mul_size;
input [7:0] a_loc;
input [7:0] b_loc;
wire [`DWIDTH-1:0] a0_data;
wire [`DWIDTH-1:0] a1_data;
wire [`DWIDTH-1:0] a2_data;
wire [`DWIDTH-1:0] a3_data;
wire [`DWIDTH-1:0] a4_data;
wire [`DWIDTH-1:0] a5_data;
wire [`DWIDTH-1:0] a6_data;
wire [`DWIDTH-1:0] a7_data;
wire [`DWIDTH-1:0] a8_data;
wire [`DWIDTH-1:0] a9_data;
wire [`DWIDTH-1:0] a10_data;
wire [`DWIDTH-1:0] a11_data;
wire [`DWIDTH-1:0] a12_data;
wire [`DWIDTH-1:0] a13_data;
wire [`DWIDTH-1:0] a14_data;
wire [`DWIDTH-1:0] a15_data;
wire [`DWIDTH-1:0] b0_data;
wire [`DWIDTH-1:0] b1_data;
wire [`DWIDTH-1:0] b2_data;
wire [`DWIDTH-1:0] b3_data;
wire [`DWIDTH-1:0] b4_data;
wire [`DWIDTH-1:0] b5_data;
wire [`DWIDTH-1:0] b6_data;
wire [`DWIDTH-1:0] b7_data;
wire [`DWIDTH-1:0] b8_data;
wire [`DWIDTH-1:0] b9_data;
wire [`DWIDTH-1:0] b10_data;
wire [`DWIDTH-1:0] b11_data;
wire [`DWIDTH-1:0] b12_data;
wire [`DWIDTH-1:0] b13_data;
wire [`DWIDTH-1:0] b14_data;
wire [`DWIDTH-1:0] b15_data;
//////////////////////////////////////////////////////////////////////////
// Logic to generate addresses to BRAM A
//////////////////////////////////////////////////////////////////////////
reg [`AWIDTH-1:0] a_addr;
reg a_mem_access; //flag that tells whether the matmul is trying to access memory or not
always @(posedge clk) begin
//(clk_cnt >= a_loc*`MAT_MUL_SIZE+final_mat_mul_size) begin
//Writing the line above to avoid multiplication:
if (reset || ~start_mat_mul || (clk_cnt >= (a_loc<<`LOG2_MAT_MUL_SIZE)+final_mat_mul_size)) begin
a_addr <= address_mat_a-address_stride_a;
a_mem_access <= 0;
end
//else if ((clk_cnt >= a_loc*`MAT_MUL_SIZE) && (clk_cnt < a_loc*`MAT_MUL_SIZE+final_mat_mul_size)) begin
//Writing the line above to avoid multiplication:
else if ((clk_cnt >= (a_loc<<`LOG2_MAT_MUL_SIZE)) && (clk_cnt < (a_loc<<`LOG2_MAT_MUL_SIZE)+final_mat_mul_size)) begin
a_addr <= a_addr + address_stride_a;
a_mem_access <= 1;
end
end
//////////////////////////////////////////////////////////////////////////
// Logic to generate valid signals for data coming from BRAM A
//////////////////////////////////////////////////////////////////////////
reg [7:0] a_mem_access_counter;
always @(posedge clk) begin
if (reset || ~start_mat_mul) begin
a_mem_access_counter <= 0;
end
else if (a_mem_access == 1) begin
a_mem_access_counter <= a_mem_access_counter + 1;
end
else begin
a_mem_access_counter <= 0;
end
end
wire a_data_valid; //flag that tells whether the data from memory is valid
assign a_data_valid =
((validity_mask_a_cols[0]==1'b0 && a_mem_access_counter==1) ||
(validity_mask_a_cols[1]==1'b0 && a_mem_access_counter==2) ||
(validity_mask_a_cols[2]==1'b0 && a_mem_access_counter==3) ||
(validity_mask_a_cols[3]==1'b0 && a_mem_access_counter==4) ||
(validity_mask_a_cols[4]==1'b0 && a_mem_access_counter==5) ||
(validity_mask_a_cols[5]==1'b0 && a_mem_access_counter==6) ||
(validity_mask_a_cols[6]==1'b0 && a_mem_access_counter==7) ||
(validity_mask_a_cols[7]==1'b0 && a_mem_access_counter==8) ||
(validity_mask_a_cols[8]==1'b0 && a_mem_access_counter==9) ||
(validity_mask_a_cols[9]==1'b0 && a_mem_access_counter==10) ||
(validity_mask_a_cols[10]==1'b0 && a_mem_access_counter==11) ||
(validity_mask_a_cols[11]==1'b0 && a_mem_access_counter==12) ||
(validity_mask_a_cols[12]==1'b0 && a_mem_access_counter==13) ||
(validity_mask_a_cols[13]==1'b0 && a_mem_access_counter==14) ||
(validity_mask_a_cols[14]==1'b0 && a_mem_access_counter==15) ||
(validity_mask_a_cols[15]==1'b0 && a_mem_access_counter==16)) ?
1'b0 : (a_mem_access_counter >= `MEM_ACCESS_LATENCY);
//////////////////////////////////////////////////////////////////////////
// Logic to delay certain parts of the data received from BRAM A (systolic data setup)
//////////////////////////////////////////////////////////////////////////
assign a0_data = a_data[1*`DWIDTH-1:0*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[0]}};
assign a1_data = a_data[2*`DWIDTH-1:1*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[1]}};
assign a2_data = a_data[3*`DWIDTH-1:2*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[2]}};
assign a3_data = a_data[4*`DWIDTH-1:3*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[3]}};
assign a4_data = a_data[5*`DWIDTH-1:4*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[4]}};
assign a5_data = a_data[6*`DWIDTH-1:5*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[5]}};
assign a6_data = a_data[7*`DWIDTH-1:6*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[6]}};
assign a7_data = a_data[8*`DWIDTH-1:7*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[7]}};
assign a8_data = a_data[9*`DWIDTH-1:8*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[8]}};
assign a9_data = a_data[10*`DWIDTH-1:9*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[9]}};
assign a10_data = a_data[11*`DWIDTH-1:10*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[10]}};
assign a11_data = a_data[12*`DWIDTH-1:11*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[11]}};
assign a12_data = a_data[13*`DWIDTH-1:12*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[12]}};
assign a13_data = a_data[14*`DWIDTH-1:13*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[13]}};
assign a14_data = a_data[15*`DWIDTH-1:14*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[14]}};
assign a15_data = a_data[16*`DWIDTH-1:15*`DWIDTH] & {`DWIDTH{a_data_valid}} & {`DWIDTH{validity_mask_a_rows[15]}};
reg [`DWIDTH-1:0] a1_data_delayed_1;
reg [`DWIDTH-1:0] a2_data_delayed_1;
reg [`DWIDTH-1:0] a2_data_delayed_2;
reg [`DWIDTH-1:0] a3_data_delayed_1;
reg [`DWIDTH-1:0] a3_data_delayed_2;
reg [`DWIDTH-1:0] a3_data_delayed_3;
reg [`DWIDTH-1:0] a4_data_delayed_1;
reg [`DWIDTH-1:0] a4_data_delayed_2;
reg [`DWIDTH-1:0] a4_data_delayed_3;
reg [`DWIDTH-1:0] a4_data_delayed_4;
reg [`DWIDTH-1:0] a5_data_delayed_1;
reg [`DWIDTH-1:0] a5_data_delayed_2;
reg [`DWIDTH-1:0] a5_data_delayed_3;
reg [`DWIDTH-1:0] a5_data_delayed_4;
reg [`DWIDTH-1:0] a5_data_delayed_5;
reg [`DWIDTH-1:0] a6_data_delayed_1;
reg [`DWIDTH-1:0] a6_data_delayed_2;
reg [`DWIDTH-1:0] a6_data_delayed_3;
reg [`DWIDTH-1:0] a6_data_delayed_4;
reg [`DWIDTH-1:0] a6_data_delayed_5;
reg [`DWIDTH-1:0] a6_data_delayed_6;
reg [`DWIDTH-1:0] a7_data_delayed_1;
reg [`DWIDTH-1:0] a7_data_delayed_2;
reg [`DWIDTH-1:0] a7_data_delayed_3;
reg [`DWIDTH-1:0] a7_data_delayed_4;
reg [`DWIDTH-1:0] a7_data_delayed_5;
reg [`DWIDTH-1:0] a7_data_delayed_6;
reg [`DWIDTH-1:0] a7_data_delayed_7;
reg [`DWIDTH-1:0] a8_data_delayed_1;
reg [`DWIDTH-1:0] a8_data_delayed_2;
reg [`DWIDTH-1:0] a8_data_delayed_3;
reg [`DWIDTH-1:0] a8_data_delayed_4;
reg [`DWIDTH-1:0] a8_data_delayed_5;
reg [`DWIDTH-1:0] a8_data_delayed_6;
reg [`DWIDTH-1:0] a8_data_delayed_7;
reg [`DWIDTH-1:0] a8_data_delayed_8;
reg [`DWIDTH-1:0] a9_data_delayed_1;
reg [`DWIDTH-1:0] a9_data_delayed_2;
reg [`DWIDTH-1:0] a9_data_delayed_3;
reg [`DWIDTH-1:0] a9_data_delayed_4;
reg [`DWIDTH-1:0] a9_data_delayed_5;
reg [`DWIDTH-1:0] a9_data_delayed_6;
reg [`DWIDTH-1:0] a9_data_delayed_7;
reg [`DWIDTH-1:0] a9_data_delayed_8;
reg [`DWIDTH-1:0] a9_data_delayed_9;
reg [`DWIDTH-1:0] a10_data_delayed_1;
reg [`DWIDTH-1:0] a10_data_delayed_2;
reg [`DWIDTH-1:0] a10_data_delayed_3;
reg [`DWIDTH-1:0] a10_data_delayed_4;
reg [`DWIDTH-1:0] a10_data_delayed_5;
reg [`DWIDTH-1:0] a10_data_delayed_6;
reg [`DWIDTH-1:0] a10_data_delayed_7;
reg [`DWIDTH-1:0] a10_data_delayed_8;
reg [`DWIDTH-1:0] a10_data_delayed_9;
reg [`DWIDTH-1:0] a10_data_delayed_10;
reg [`DWIDTH-1:0] a11_data_delayed_1;
reg [`DWIDTH-1:0] a11_data_delayed_2;
reg [`DWIDTH-1:0] a11_data_delayed_3;
reg [`DWIDTH-1:0] a11_data_delayed_4;
reg [`DWIDTH-1:0] a11_data_delayed_5;
reg [`DWIDTH-1:0] a11_data_delayed_6;
reg [`DWIDTH-1:0] a11_data_delayed_7;
reg [`DWIDTH-1:0] a11_data_delayed_8;
reg [`DWIDTH-1:0] a11_data_delayed_9;
reg [`DWIDTH-1:0] a11_data_delayed_10;
reg [`DWIDTH-1:0] a11_data_delayed_11;
reg [`DWIDTH-1:0] a12_data_delayed_1;
reg [`DWIDTH-1:0] a12_data_delayed_2;
reg [`DWIDTH-1:0] a12_data_delayed_3;
reg [`DWIDTH-1:0] a12_data_delayed_4;
reg [`DWIDTH-1:0] a12_data_delayed_5;
reg [`DWIDTH-1:0] a12_data_delayed_6;
reg [`DWIDTH-1:0] a12_data_delayed_7;
reg [`DWIDTH-1:0] a12_data_delayed_8;
reg [`DWIDTH-1:0] a12_data_delayed_9;
reg [`DWIDTH-1:0] a12_data_delayed_10;
reg [`DWIDTH-1:0] a12_data_delayed_11;
reg [`DWIDTH-1:0] a12_data_delayed_12;
reg [`DWIDTH-1:0] a13_data_delayed_1;
reg [`DWIDTH-1:0] a13_data_delayed_2;
reg [`DWIDTH-1:0] a13_data_delayed_3;
reg [`DWIDTH-1:0] a13_data_delayed_4;
reg [`DWIDTH-1:0] a13_data_delayed_5;
reg [`DWIDTH-1:0] a13_data_delayed_6;
reg [`DWIDTH-1:0] a13_data_delayed_7;
reg [`DWIDTH-1:0] a13_data_delayed_8;
reg [`DWIDTH-1:0] a13_data_delayed_9;
reg [`DWIDTH-1:0] a13_data_delayed_10;
reg [`DWIDTH-1:0] a13_data_delayed_11;
reg [`DWIDTH-1:0] a13_data_delayed_12;
reg [`DWIDTH-1:0] a13_data_delayed_13;
reg [`DWIDTH-1:0] a14_data_delayed_1;
reg [`DWIDTH-1:0] a14_data_delayed_2;
reg [`DWIDTH-1:0] a14_data_delayed_3;
reg [`DWIDTH-1:0] a14_data_delayed_4;
reg [`DWIDTH-1:0] a14_data_delayed_5;
reg [`DWIDTH-1:0] a14_data_delayed_6;
reg [`DWIDTH-1:0] a14_data_delayed_7;
reg [`DWIDTH-1:0] a14_data_delayed_8;
reg [`DWIDTH-1:0] a14_data_delayed_9;
reg [`DWIDTH-1:0] a14_data_delayed_10;
reg [`DWIDTH-1:0] a14_data_delayed_11;
reg [`DWIDTH-1:0] a14_data_delayed_12;
reg [`DWIDTH-1:0] a14_data_delayed_13;
reg [`DWIDTH-1:0] a14_data_delayed_14;
reg [`DWIDTH-1:0] a15_data_delayed_1;
reg [`DWIDTH-1:0] a15_data_delayed_2;
reg [`DWIDTH-1:0] a15_data_delayed_3;
reg [`DWIDTH-1:0] a15_data_delayed_4;
reg [`DWIDTH-1:0] a15_data_delayed_5;
reg [`DWIDTH-1:0] a15_data_delayed_6;
reg [`DWIDTH-1:0] a15_data_delayed_7;
reg [`DWIDTH-1:0] a15_data_delayed_8;
reg [`DWIDTH-1:0] a15_data_delayed_9;
reg [`DWIDTH-1:0] a15_data_delayed_10;
reg [`DWIDTH-1:0] a15_data_delayed_11;
reg [`DWIDTH-1:0] a15_data_delayed_12;
reg [`DWIDTH-1:0] a15_data_delayed_13;
reg [`DWIDTH-1:0] a15_data_delayed_14;
reg [`DWIDTH-1:0] a15_data_delayed_15;
always @(posedge clk) begin
if (reset || ~start_mat_mul || clk_cnt==0) begin
a1_data_delayed_1 <= 0;
a2_data_delayed_1 <= 0;
a2_data_delayed_2 <= 0;
a3_data_delayed_1 <= 0;
a3_data_delayed_2 <= 0;
a3_data_delayed_3 <= 0;
a4_data_delayed_1 <= 0;
a4_data_delayed_2 <= 0;
a4_data_delayed_3 <= 0;
a4_data_delayed_4 <= 0;
a5_data_delayed_1 <= 0;
a5_data_delayed_2 <= 0;
a5_data_delayed_3 <= 0;
a5_data_delayed_4 <= 0;
a5_data_delayed_5 <= 0;
a6_data_delayed_1 <= 0;
a6_data_delayed_2 <= 0;
a6_data_delayed_3 <= 0;
a6_data_delayed_4 <= 0;
a6_data_delayed_5 <= 0;
a6_data_delayed_6 <= 0;
a7_data_delayed_1 <= 0;
a7_data_delayed_2 <= 0;
a7_data_delayed_3 <= 0;
a7_data_delayed_4 <= 0;
a7_data_delayed_5 <= 0;
a7_data_delayed_6 <= 0;
a7_data_delayed_7 <= 0;
a8_data_delayed_1 <= 0;
a8_data_delayed_2 <= 0;
a8_data_delayed_3 <= 0;
a8_data_delayed_4 <= 0;
a8_data_delayed_5 <= 0;
a8_data_delayed_6 <= 0;
a8_data_delayed_7 <= 0;
a8_data_delayed_8 <= 0;
a9_data_delayed_1 <= 0;
a9_data_delayed_2 <= 0;
a9_data_delayed_3 <= 0;
a9_data_delayed_4 <= 0;
a9_data_delayed_5 <= 0;
a9_data_delayed_6 <= 0;
a9_data_delayed_7 <= 0;
a9_data_delayed_8 <= 0;
a9_data_delayed_9 <= 0;
a10_data_delayed_1 <= 0;
a10_data_delayed_2 <= 0;
a10_data_delayed_3 <= 0;
a10_data_delayed_4 <= 0;
a10_data_delayed_5 <= 0;
a10_data_delayed_6 <= 0;
a10_data_delayed_7 <= 0;
a10_data_delayed_8 <= 0;
a10_data_delayed_9 <= 0;
a10_data_delayed_10 <= 0;
a11_data_delayed_1 <= 0;
a11_data_delayed_2 <= 0;
a11_data_delayed_3 <= 0;
a11_data_delayed_4 <= 0;
a11_data_delayed_5 <= 0;
a11_data_delayed_6 <= 0;
a11_data_delayed_7 <= 0;
a11_data_delayed_8 <= 0;
a11_data_delayed_9 <= 0;
a11_data_delayed_10 <= 0;
a11_data_delayed_11 <= 0;
a12_data_delayed_1 <= 0;
a12_data_delayed_2 <= 0;
a12_data_delayed_3 <= 0;
a12_data_delayed_4 <= 0;
a12_data_delayed_5 <= 0;
a12_data_delayed_6 <= 0;
a12_data_delayed_7 <= 0;
a12_data_delayed_8 <= 0;
a12_data_delayed_9 <= 0;
a12_data_delayed_10 <= 0;
a12_data_delayed_11 <= 0;
a12_data_delayed_12 <= 0;
a13_data_delayed_1 <= 0;
a13_data_delayed_2 <= 0;
a13_data_delayed_3 <= 0;
a13_data_delayed_4 <= 0;
a13_data_delayed_5 <= 0;
a13_data_delayed_6 <= 0;
a13_data_delayed_7 <= 0;
a13_data_delayed_8 <= 0;
a13_data_delayed_9 <= 0;
a13_data_delayed_10 <= 0;
a13_data_delayed_11 <= 0;
a13_data_delayed_12 <= 0;
a13_data_delayed_13 <= 0;
a14_data_delayed_1 <= 0;
a14_data_delayed_2 <= 0;
a14_data_delayed_3 <= 0;
a14_data_delayed_4 <= 0;
a14_data_delayed_5 <= 0;
a14_data_delayed_6 <= 0;
a14_data_delayed_7 <= 0;
a14_data_delayed_8 <= 0;
a14_data_delayed_9 <= 0;
a14_data_delayed_10 <= 0;
a14_data_delayed_11 <= 0;
a14_data_delayed_12 <= 0;
a14_data_delayed_13 <= 0;
a14_data_delayed_14 <= 0;
a15_data_delayed_1 <= 0;
a15_data_delayed_2 <= 0;
a15_data_delayed_3 <= 0;
a15_data_delayed_4 <= 0;
a15_data_delayed_5 <= 0;
a15_data_delayed_6 <= 0;
a15_data_delayed_7 <= 0;
a15_data_delayed_8 <= 0;
a15_data_delayed_9 <= 0;
a15_data_delayed_10 <= 0;
a15_data_delayed_11 <= 0;
a15_data_delayed_12 <= 0;
a15_data_delayed_13 <= 0;
a15_data_delayed_14 <= 0;
a15_data_delayed_15 <= 0;
end
else begin
a1_data_delayed_1 <= a1_data;
a2_data_delayed_1 <= a2_data;
a3_data_delayed_1 <= a3_data;
a4_data_delayed_1 <= a4_data;
a5_data_delayed_1 <= a5_data;
a6_data_delayed_1 <= a6_data;
a7_data_delayed_1 <= a7_data;
a8_data_delayed_1 <= a8_data;
a9_data_delayed_1 <= a9_data;
a10_data_delayed_1 <= a10_data;
a11_data_delayed_1 <= a11_data;
a12_data_delayed_1 <= a12_data;
a13_data_delayed_1 <= a13_data;
a14_data_delayed_1 <= a14_data;
a15_data_delayed_1 <= a15_data;
a2_data_delayed_2 <= a2_data_delayed_1;
a3_data_delayed_2 <= a3_data_delayed_1;
a3_data_delayed_3 <= a3_data_delayed_2;
a4_data_delayed_2 <= a4_data_delayed_1;
a4_data_delayed_3 <= a4_data_delayed_2;
a4_data_delayed_4 <= a4_data_delayed_3;
a5_data_delayed_2 <= a5_data_delayed_1;
a5_data_delayed_3 <= a5_data_delayed_2;
a5_data_delayed_4 <= a5_data_delayed_3;
a5_data_delayed_5 <= a5_data_delayed_4;
a6_data_delayed_2 <= a6_data_delayed_1;
a6_data_delayed_3 <= a6_data_delayed_2;
a6_data_delayed_4 <= a6_data_delayed_3;
a6_data_delayed_5 <= a6_data_delayed_4;
a6_data_delayed_6 <= a6_data_delayed_5;
a7_data_delayed_2 <= a7_data_delayed_1;
a7_data_delayed_3 <= a7_data_delayed_2;
a7_data_delayed_4 <= a7_data_delayed_3;
a7_data_delayed_5 <= a7_data_delayed_4;
a7_data_delayed_6 <= a7_data_delayed_5;
a7_data_delayed_7 <= a7_data_delayed_6;
a8_data_delayed_2 <= a8_data_delayed_1;
a8_data_delayed_3 <= a8_data_delayed_2;
a8_data_delayed_4 <= a8_data_delayed_3;
a8_data_delayed_5 <= a8_data_delayed_4;
a8_data_delayed_6 <= a8_data_delayed_5;
a8_data_delayed_7 <= a8_data_delayed_6;
a8_data_delayed_8 <= a8_data_delayed_7;
a9_data_delayed_2 <= a9_data_delayed_1;
a9_data_delayed_3 <= a9_data_delayed_2;
a9_data_delayed_4 <= a9_data_delayed_3;
a9_data_delayed_5 <= a9_data_delayed_4;
a9_data_delayed_6 <= a9_data_delayed_5;
a9_data_delayed_7 <= a9_data_delayed_6;
a9_data_delayed_8 <= a9_data_delayed_7;
a9_data_delayed_9 <= a9_data_delayed_8;
a10_data_delayed_2 <= a10_data_delayed_1;
a10_data_delayed_3 <= a10_data_delayed_2;
a10_data_delayed_4 <= a10_data_delayed_3;
a10_data_delayed_5 <= a10_data_delayed_4;
a10_data_delayed_6 <= a10_data_delayed_5;
a10_data_delayed_7 <= a10_data_delayed_6;
a10_data_delayed_8 <= a10_data_delayed_7;
a10_data_delayed_9 <= a10_data_delayed_8;
a10_data_delayed_10 <= a10_data_delayed_9;
a11_data_delayed_2 <= a11_data_delayed_1;
a11_data_delayed_3 <= a11_data_delayed_2;
a11_data_delayed_4 <= a11_data_delayed_3;
a11_data_delayed_5 <= a11_data_delayed_4;
a11_data_delayed_6 <= a11_data_delayed_5;
a11_data_delayed_7 <= a11_data_delayed_6;
a11_data_delayed_8 <= a11_data_delayed_7;
a11_data_delayed_9 <= a11_data_delayed_8;
a11_data_delayed_10 <= a11_data_delayed_9;
a11_data_delayed_11 <= a11_data_delayed_10;
a12_data_delayed_2 <= a12_data_delayed_1;
a12_data_delayed_3 <= a12_data_delayed_2;
a12_data_delayed_4 <= a12_data_delayed_3;
a12_data_delayed_5 <= a12_data_delayed_4;
a12_data_delayed_6 <= a12_data_delayed_5;
a12_data_delayed_7 <= a12_data_delayed_6;
a12_data_delayed_8 <= a12_data_delayed_7;
a12_data_delayed_9 <= a12_data_delayed_8;
a12_data_delayed_10 <= a12_data_delayed_9;
a12_data_delayed_11 <= a12_data_delayed_10;
a12_data_delayed_12 <= a12_data_delayed_11;
a13_data_delayed_2 <= a13_data_delayed_1;
a13_data_delayed_3 <= a13_data_delayed_2;
a13_data_delayed_4 <= a13_data_delayed_3;
a13_data_delayed_5 <= a13_data_delayed_4;
a13_data_delayed_6 <= a13_data_delayed_5;
a13_data_delayed_7 <= a13_data_delayed_6;
a13_data_delayed_8 <= a13_data_delayed_7;
a13_data_delayed_9 <= a13_data_delayed_8;
a13_data_delayed_10 <= a13_data_delayed_9;
a13_data_delayed_11 <= a13_data_delayed_10;
a13_data_delayed_12 <= a13_data_delayed_11;
a13_data_delayed_13 <= a13_data_delayed_12;
a14_data_delayed_2 <= a14_data_delayed_1;
a14_data_delayed_3 <= a14_data_delayed_2;
a14_data_delayed_4 <= a14_data_delayed_3;
a14_data_delayed_5 <= a14_data_delayed_4;
a14_data_delayed_6 <= a14_data_delayed_5;
a14_data_delayed_7 <= a14_data_delayed_6;
a14_data_delayed_8 <= a14_data_delayed_7;
a14_data_delayed_9 <= a14_data_delayed_8;
a14_data_delayed_10 <= a14_data_delayed_9;
a14_data_delayed_11 <= a14_data_delayed_10;
a14_data_delayed_12 <= a14_data_delayed_11;
a14_data_delayed_13 <= a14_data_delayed_12;
a14_data_delayed_14 <= a14_data_delayed_13;
a15_data_delayed_2 <= a15_data_delayed_1;
a15_data_delayed_3 <= a15_data_delayed_2;
a15_data_delayed_4 <= a15_data_delayed_3;
a15_data_delayed_5 <= a15_data_delayed_4;
a15_data_delayed_6 <= a15_data_delayed_5;
a15_data_delayed_7 <= a15_data_delayed_6;
a15_data_delayed_8 <= a15_data_delayed_7;
a15_data_delayed_9 <= a15_data_delayed_8;
a15_data_delayed_10 <= a15_data_delayed_9;
a15_data_delayed_11 <= a15_data_delayed_10;
a15_data_delayed_12 <= a15_data_delayed_11;
a15_data_delayed_13 <= a15_data_delayed_12;
a15_data_delayed_14 <= a15_data_delayed_13;
a15_data_delayed_15 <= a15_data_delayed_14;
end
end
//////////////////////////////////////////////////////////////////////////
// Logic to generate addresses to BRAM B
//////////////////////////////////////////////////////////////////////////
reg [`AWIDTH-1:0] b_addr;
reg b_mem_access; //flag that tells whether the matmul is trying to access memory or not
always @(posedge clk) begin
//else if (clk_cnt >= b_loc*`MAT_MUL_SIZE+final_mat_mul_size) begin
//Writing the line above to avoid multiplication:
if ((reset || ~start_mat_mul) || (clk_cnt >= (b_loc<<`LOG2_MAT_MUL_SIZE)+final_mat_mul_size)) begin
b_addr <= address_mat_b - address_stride_b;
b_mem_access <= 0;
end
//else if ((clk_cnt >= b_loc*`MAT_MUL_SIZE) && (clk_cnt < b_loc*`MAT_MUL_SIZE+final_mat_mul_size)) begin
//Writing the line above to avoid multiplication:
else if ((clk_cnt >= (b_loc<<`LOG2_MAT_MUL_SIZE)) && (clk_cnt < (b_loc<<`LOG2_MAT_MUL_SIZE)+final_mat_mul_size)) begin
b_addr <= b_addr + address_stride_b;
b_mem_access <= 1;
end
end
//////////////////////////////////////////////////////////////////////////
// Logic to generate valid signals for data coming from BRAM B
//////////////////////////////////////////////////////////////////////////
reg [7:0] b_mem_access_counter;
always @(posedge clk) begin
if (reset || ~start_mat_mul) begin
b_mem_access_counter <= 0;
end
else if (b_mem_access == 1) begin
b_mem_access_counter <= b_mem_access_counter + 1;
end
else begin
b_mem_access_counter <= 0;
end
end
wire b_data_valid; //flag that tells whether the data from memory is valid
assign b_data_valid =
((validity_mask_b_rows[0]==1'b0 && b_mem_access_counter==1) ||
(validity_mask_b_rows[1]==1'b0 && b_mem_access_counter==2) ||
(validity_mask_b_rows[2]==1'b0 && b_mem_access_counter==3) ||
(validity_mask_b_rows[3]==1'b0 && b_mem_access_counter==4) ||
(validity_mask_b_rows[4]==1'b0 && b_mem_access_counter==5) ||
(validity_mask_b_rows[5]==1'b0 && b_mem_access_counter==6) ||
(validity_mask_b_rows[6]==1'b0 && b_mem_access_counter==7) ||
(validity_mask_b_rows[7]==1'b0 && b_mem_access_counter==8) ||
(validity_mask_b_rows[8]==1'b0 && b_mem_access_counter==9) ||
(validity_mask_b_rows[9]==1'b0 && b_mem_access_counter==10) ||
(validity_mask_b_rows[10]==1'b0 && b_mem_access_counter==11) ||
(validity_mask_b_rows[11]==1'b0 && b_mem_access_counter==12) ||
(validity_mask_b_rows[12]==1'b0 && b_mem_access_counter==13) ||
(validity_mask_b_rows[13]==1'b0 && b_mem_access_counter==14) ||
(validity_mask_b_rows[14]==1'b0 && b_mem_access_counter==15) ||
(validity_mask_b_rows[15]==1'b0 && b_mem_access_counter==16)) ?
1'b0 : (b_mem_access_counter >= `MEM_ACCESS_LATENCY);
//////////////////////////////////////////////////////////////////////////
// Logic to delay certain parts of the data received from BRAM B (systolic data setup)
//////////////////////////////////////////////////////////////////////////
assign b0_data = b_data[1*`DWIDTH-1:0*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[0]}};
assign b1_data = b_data[2*`DWIDTH-1:1*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[1]}};
assign b2_data = b_data[3*`DWIDTH-1:2*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[2]}};
assign b3_data = b_data[4*`DWIDTH-1:3*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[3]}};
assign b4_data = b_data[5*`DWIDTH-1:4*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[4]}};
assign b5_data = b_data[6*`DWIDTH-1:5*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[5]}};
assign b6_data = b_data[7*`DWIDTH-1:6*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[6]}};
assign b7_data = b_data[8*`DWIDTH-1:7*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[7]}};
assign b8_data = b_data[9*`DWIDTH-1:8*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[8]}};
assign b9_data = b_data[10*`DWIDTH-1:9*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[9]}};
assign b10_data = b_data[11*`DWIDTH-1:10*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[10]}};
assign b11_data = b_data[12*`DWIDTH-1:11*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[11]}};
assign b12_data = b_data[13*`DWIDTH-1:12*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[12]}};
assign b13_data = b_data[14*`DWIDTH-1:13*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[13]}};
assign b14_data = b_data[15*`DWIDTH-1:14*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[14]}};
assign b15_data = b_data[16*`DWIDTH-1:15*`DWIDTH] & {`DWIDTH{b_data_valid}} & {`DWIDTH{validity_mask_b_cols[15]}};
reg [`DWIDTH-1:0] b1_data_delayed_1;
reg [`DWIDTH-1:0] b2_data_delayed_1;
reg [`DWIDTH-1:0] b2_data_delayed_2;
reg [`DWIDTH-1:0] b3_data_delayed_1;
reg [`DWIDTH-1:0] b3_data_delayed_2;
reg [`DWIDTH-1:0] b3_data_delayed_3;
reg [`DWIDTH-1:0] b4_data_delayed_1;
reg [`DWIDTH-1:0] b4_data_delayed_2;
reg [`DWIDTH-1:0] b4_data_delayed_3;
reg [`DWIDTH-1:0] b4_data_delayed_4;
reg [`DWIDTH-1:0] b5_data_delayed_1;
reg [`DWIDTH-1:0] b5_data_delayed_2;
reg [`DWIDTH-1:0] b5_data_delayed_3;
reg [`DWIDTH-1:0] b5_data_delayed_4;
reg [`DWIDTH-1:0] b5_data_delayed_5;
reg [`DWIDTH-1:0] b6_data_delayed_1;
reg [`DWIDTH-1:0] b6_data_delayed_2;
reg [`DWIDTH-1:0] b6_data_delayed_3;
reg [`DWIDTH-1:0] b6_data_delayed_4;
reg [`DWIDTH-1:0] b6_data_delayed_5;
reg [`DWIDTH-1:0] b6_data_delayed_6;
reg [`DWIDTH-1:0] b7_data_delayed_1;
reg [`DWIDTH-1:0] b7_data_delayed_2;
reg [`DWIDTH-1:0] b7_data_delayed_3;
reg [`DWIDTH-1:0] b7_data_delayed_4;
reg [`DWIDTH-1:0] b7_data_delayed_5;
reg [`DWIDTH-1:0] b7_data_delayed_6;
reg [`DWIDTH-1:0] b7_data_delayed_7;
reg [`DWIDTH-1:0] b8_data_delayed_1;
reg [`DWIDTH-1:0] b8_data_delayed_2;
reg [`DWIDTH-1:0] b8_data_delayed_3;
reg [`DWIDTH-1:0] b8_data_delayed_4;
reg [`DWIDTH-1:0] b8_data_delayed_5;
reg [`DWIDTH-1:0] b8_data_delayed_6;
reg [`DWIDTH-1:0] b8_data_delayed_7;
reg [`DWIDTH-1:0] b8_data_delayed_8;
reg [`DWIDTH-1:0] b9_data_delayed_1;
reg [`DWIDTH-1:0] b9_data_delayed_2;
reg [`DWIDTH-1:0] b9_data_delayed_3;
reg [`DWIDTH-1:0] b9_data_delayed_4;
reg [`DWIDTH-1:0] b9_data_delayed_5;
reg [`DWIDTH-1:0] b9_data_delayed_6;
reg [`DWIDTH-1:0] b9_data_delayed_7;
reg [`DWIDTH-1:0] b9_data_delayed_8;
reg [`DWIDTH-1:0] b9_data_delayed_9;
reg [`DWIDTH-1:0] b10_data_delayed_1;
reg [`DWIDTH-1:0] b10_data_delayed_2;
reg [`DWIDTH-1:0] b10_data_delayed_3;
reg [`DWIDTH-1:0] b10_data_delayed_4;
reg [`DWIDTH-1:0] b10_data_delayed_5;
reg [`DWIDTH-1:0] b10_data_delayed_6;
reg [`DWIDTH-1:0] b10_data_delayed_7;
reg [`DWIDTH-1:0] b10_data_delayed_8;
reg [`DWIDTH-1:0] b10_data_delayed_9;
reg [`DWIDTH-1:0] b10_data_delayed_10;
reg [`DWIDTH-1:0] b11_data_delayed_1;
reg [`DWIDTH-1:0] b11_data_delayed_2;
reg [`DWIDTH-1:0] b11_data_delayed_3;
reg [`DWIDTH-1:0] b11_data_delayed_4;
reg [`DWIDTH-1:0] b11_data_delayed_5;
reg [`DWIDTH-1:0] b11_data_delayed_6;
reg [`DWIDTH-1:0] b11_data_delayed_7;
reg [`DWIDTH-1:0] b11_data_delayed_8;
reg [`DWIDTH-1:0] b11_data_delayed_9;
reg [`DWIDTH-1:0] b11_data_delayed_10;
reg [`DWIDTH-1:0] b11_data_delayed_11;
reg [`DWIDTH-1:0] b12_data_delayed_1;
reg [`DWIDTH-1:0] b12_data_delayed_2;
reg [`DWIDTH-1:0] b12_data_delayed_3;
reg [`DWIDTH-1:0] b12_data_delayed_4;
reg [`DWIDTH-1:0] b12_data_delayed_5;
reg [`DWIDTH-1:0] b12_data_delayed_6;
reg [`DWIDTH-1:0] b12_data_delayed_7;
reg [`DWIDTH-1:0] b12_data_delayed_8;
reg [`DWIDTH-1:0] b12_data_delayed_9;
reg [`DWIDTH-1:0] b12_data_delayed_10;
reg [`DWIDTH-1:0] b12_data_delayed_11;
reg [`DWIDTH-1:0] b12_data_delayed_12;
reg [`DWIDTH-1:0] b13_data_delayed_1;
reg [`DWIDTH-1:0] b13_data_delayed_2;
reg [`DWIDTH-1:0] b13_data_delayed_3;
reg [`DWIDTH-1:0] b13_data_delayed_4;
reg [`DWIDTH-1:0] b13_data_delayed_5;
reg [`DWIDTH-1:0] b13_data_delayed_6;
reg [`DWIDTH-1:0] b13_data_delayed_7;
reg [`DWIDTH-1:0] b13_data_delayed_8;
reg [`DWIDTH-1:0] b13_data_delayed_9;
reg [`DWIDTH-1:0] b13_data_delayed_10;
reg [`DWIDTH-1:0] b13_data_delayed_11;
reg [`DWIDTH-1:0] b13_data_delayed_12;
reg [`DWIDTH-1:0] b13_data_delayed_13;
reg [`DWIDTH-1:0] b14_data_delayed_1;
reg [`DWIDTH-1:0] b14_data_delayed_2;
reg [`DWIDTH-1:0] b14_data_delayed_3;
reg [`DWIDTH-1:0] b14_data_delayed_4;
reg [`DWIDTH-1:0] b14_data_delayed_5;
reg [`DWIDTH-1:0] b14_data_delayed_6;
reg [`DWIDTH-1:0] b14_data_delayed_7;
reg [`DWIDTH-1:0] b14_data_delayed_8;
reg [`DWIDTH-1:0] b14_data_delayed_9;
reg [`DWIDTH-1:0] b14_data_delayed_10;
reg [`DWIDTH-1:0] b14_data_delayed_11;
reg [`DWIDTH-1:0] b14_data_delayed_12;
reg [`DWIDTH-1:0] b14_data_delayed_13;
reg [`DWIDTH-1:0] b14_data_delayed_14;
reg [`DWIDTH-1:0] b15_data_delayed_1;
reg [`DWIDTH-1:0] b15_data_delayed_2;
reg [`DWIDTH-1:0] b15_data_delayed_3;
reg [`DWIDTH-1:0] b15_data_delayed_4;
reg [`DWIDTH-1:0] b15_data_delayed_5;
reg [`DWIDTH-1:0] b15_data_delayed_6;
reg [`DWIDTH-1:0] b15_data_delayed_7;
reg [`DWIDTH-1:0] b15_data_delayed_8;
reg [`DWIDTH-1:0] b15_data_delayed_9;
reg [`DWIDTH-1:0] b15_data_delayed_10;
reg [`DWIDTH-1:0] b15_data_delayed_11;
reg [`DWIDTH-1:0] b15_data_delayed_12;
reg [`DWIDTH-1:0] b15_data_delayed_13;
reg [`DWIDTH-1:0] b15_data_delayed_14;
reg [`DWIDTH-1:0] b15_data_delayed_15;
always @(posedge clk) begin
if (reset || ~start_mat_mul || clk_cnt==0) begin
b1_data_delayed_1 <= 0;
b2_data_delayed_1 <= 0;
b2_data_delayed_2 <= 0;
b3_data_delayed_1 <= 0;
b3_data_delayed_2 <= 0;
b3_data_delayed_3 <= 0;
b4_data_delayed_1 <= 0;
b4_data_delayed_2 <= 0;
b4_data_delayed_3 <= 0;
b4_data_delayed_4 <= 0;
b5_data_delayed_1 <= 0;
b5_data_delayed_2 <= 0;
b5_data_delayed_3 <= 0;
b5_data_delayed_4 <= 0;
b5_data_delayed_5 <= 0;
b6_data_delayed_1 <= 0;
b6_data_delayed_2 <= 0;
b6_data_delayed_3 <= 0;
b6_data_delayed_4 <= 0;
b6_data_delayed_5 <= 0;
b6_data_delayed_6 <= 0;
b7_data_delayed_1 <= 0;
b7_data_delayed_2 <= 0;
b7_data_delayed_3 <= 0;
b7_data_delayed_4 <= 0;
b7_data_delayed_5 <= 0;
b7_data_delayed_6 <= 0;
b7_data_delayed_7 <= 0;
b8_data_delayed_1 <= 0;
b8_data_delayed_2 <= 0;
b8_data_delayed_3 <= 0;
b8_data_delayed_4 <= 0;
b8_data_delayed_5 <= 0;
b8_data_delayed_6 <= 0;
b8_data_delayed_7 <= 0;
b8_data_delayed_8 <= 0;
b9_data_delayed_1 <= 0;
b9_data_delayed_2 <= 0;
b9_data_delayed_3 <= 0;
b9_data_delayed_4 <= 0;
b9_data_delayed_5 <= 0;
b9_data_delayed_6 <= 0;
b9_data_delayed_7 <= 0;
b9_data_delayed_8 <= 0;
b9_data_delayed_9 <= 0;
b10_data_delayed_1 <= 0;
b10_data_delayed_2 <= 0;
b10_data_delayed_3 <= 0;
b10_data_delayed_4 <= 0;
b10_data_delayed_5 <= 0;
b10_data_delayed_6 <= 0;
b10_data_delayed_7 <= 0;
b10_data_delayed_8 <= 0;
b10_data_delayed_9 <= 0;
b10_data_delayed_10 <= 0;
b11_data_delayed_1 <= 0;
b11_data_delayed_2 <= 0;
b11_data_delayed_3 <= 0;
b11_data_delayed_4 <= 0;
b11_data_delayed_5 <= 0;
b11_data_delayed_6 <= 0;
b11_data_delayed_7 <= 0;
b11_data_delayed_8 <= 0;
b11_data_delayed_9 <= 0;
b11_data_delayed_10 <= 0;
b11_data_delayed_11 <= 0;
b12_data_delayed_1 <= 0;
b12_data_delayed_2 <= 0;
b12_data_delayed_3 <= 0;
b12_data_delayed_4 <= 0;
b12_data_delayed_5 <= 0;
b12_data_delayed_6 <= 0;
b12_data_delayed_7 <= 0;
b12_data_delayed_8 <= 0;
b12_data_delayed_9 <= 0;
b12_data_delayed_10 <= 0;
b12_data_delayed_11 <= 0;
b12_data_delayed_12 <= 0;
b13_data_delayed_1 <= 0;
b13_data_delayed_2 <= 0;
b13_data_delayed_3 <= 0;
b13_data_delayed_4 <= 0;
b13_data_delayed_5 <= 0;
b13_data_delayed_6 <= 0;
b13_data_delayed_7 <= 0;
b13_data_delayed_8 <= 0;
b13_data_delayed_9 <= 0;
b13_data_delayed_10 <= 0;
b13_data_delayed_11 <= 0;
b13_data_delayed_12 <= 0;
b13_data_delayed_13 <= 0;
b14_data_delayed_1 <= 0;
b14_data_delayed_2 <= 0;
b14_data_delayed_3 <= 0;
b14_data_delayed_4 <= 0;
b14_data_delayed_5 <= 0;
b14_data_delayed_6 <= 0;
b14_data_delayed_7 <= 0;
b14_data_delayed_8 <= 0;
b14_data_delayed_9 <= 0;
b14_data_delayed_10 <= 0;
b14_data_delayed_11 <= 0;
b14_data_delayed_12 <= 0;
b14_data_delayed_13 <= 0;
b14_data_delayed_14 <= 0;
b15_data_delayed_1 <= 0;
b15_data_delayed_2 <= 0;
b15_data_delayed_3 <= 0;
b15_data_delayed_4 <= 0;
b15_data_delayed_5 <= 0;
b15_data_delayed_6 <= 0;
b15_data_delayed_7 <= 0;
b15_data_delayed_8 <= 0;
b15_data_delayed_9 <= 0;
b15_data_delayed_10 <= 0;
b15_data_delayed_11 <= 0;
b15_data_delayed_12 <= 0;
b15_data_delayed_13 <= 0;
b15_data_delayed_14 <= 0;
b15_data_delayed_15 <= 0;
end
else begin
b1_data_delayed_1 <= b1_data;
b2_data_delayed_1 <= b2_data;
b3_data_delayed_1 <= b3_data;
b4_data_delayed_1 <= b4_data;
b5_data_delayed_1 <= b5_data;
b6_data_delayed_1 <= b6_data;
b7_data_delayed_1 <= b7_data;
b8_data_delayed_1 <= b8_data;
b9_data_delayed_1 <= b9_data;
b10_data_delayed_1 <= b10_data;
b11_data_delayed_1 <= b11_data;
b12_data_delayed_1 <= b12_data;
b13_data_delayed_1 <= b13_data;
b14_data_delayed_1 <= b14_data;
b15_data_delayed_1 <= b15_data;
b2_data_delayed_2 <= b2_data_delayed_1;
b3_data_delayed_2 <= b3_data_delayed_1;
b3_data_delayed_3 <= b3_data_delayed_2;
b4_data_delayed_2 <= b4_data_delayed_1;
b4_data_delayed_3 <= b4_data_delayed_2;
b4_data_delayed_4 <= b4_data_delayed_3;
b5_data_delayed_2 <= b5_data_delayed_1;
b5_data_delayed_3 <= b5_data_delayed_2;
b5_data_delayed_4 <= b5_data_delayed_3;
b5_data_delayed_5 <= b5_data_delayed_4;
b6_data_delayed_2 <= b6_data_delayed_1;
b6_data_delayed_3 <= b6_data_delayed_2;
b6_data_delayed_4 <= b6_data_delayed_3;
b6_data_delayed_5 <= b6_data_delayed_4;
b6_data_delayed_6 <= b6_data_delayed_5;
b7_data_delayed_2 <= b7_data_delayed_1;
b7_data_delayed_3 <= b7_data_delayed_2;
b7_data_delayed_4 <= b7_data_delayed_3;
b7_data_delayed_5 <= b7_data_delayed_4;
b7_data_delayed_6 <= b7_data_delayed_5;
b7_data_delayed_7 <= b7_data_delayed_6;
b8_data_delayed_2 <= b8_data_delayed_1;
b8_data_delayed_3 <= b8_data_delayed_2;
b8_data_delayed_4 <= b8_data_delayed_3;
b8_data_delayed_5 <= b8_data_delayed_4;
b8_data_delayed_6 <= b8_data_delayed_5;
b8_data_delayed_7 <= b8_data_delayed_6;
b8_data_delayed_8 <= b8_data_delayed_7;
b9_data_delayed_2 <= b9_data_delayed_1;
b9_data_delayed_3 <= b9_data_delayed_2;
b9_data_delayed_4 <= b9_data_delayed_3;
b9_data_delayed_5 <= b9_data_delayed_4;
b9_data_delayed_6 <= b9_data_delayed_5;
b9_data_delayed_7 <= b9_data_delayed_6;
b9_data_delayed_8 <= b9_data_delayed_7;
b9_data_delayed_9 <= b9_data_delayed_8;
b10_data_delayed_2 <= b10_data_delayed_1;
b10_data_delayed_3 <= b10_data_delayed_2;
b10_data_delayed_4 <= b10_data_delayed_3;
b10_data_delayed_5 <= b10_data_delayed_4;
b10_data_delayed_6 <= b10_data_delayed_5;
b10_data_delayed_7 <= b10_data_delayed_6;
b10_data_delayed_8 <= b10_data_delayed_7;
b10_data_delayed_9 <= b10_data_delayed_8;
b10_data_delayed_10 <= b10_data_delayed_9;
b11_data_delayed_2 <= b11_data_delayed_1;
b11_data_delayed_3 <= b11_data_delayed_2;
b11_data_delayed_4 <= b11_data_delayed_3;
b11_data_delayed_5 <= b11_data_delayed_4;
b11_data_delayed_6 <= b11_data_delayed_5;
b11_data_delayed_7 <= b11_data_delayed_6;
b11_data_delayed_8 <= b11_data_delayed_7;
b11_data_delayed_9 <= b11_data_delayed_8;
b11_data_delayed_10 <= b11_data_delayed_9;
b11_data_delayed_11 <= b11_data_delayed_10;
b12_data_delayed_2 <= b12_data_delayed_1;
b12_data_delayed_3 <= b12_data_delayed_2;
b12_data_delayed_4 <= b12_data_delayed_3;
b12_data_delayed_5 <= b12_data_delayed_4;
b12_data_delayed_6 <= b12_data_delayed_5;
b12_data_delayed_7 <= b12_data_delayed_6;
b12_data_delayed_8 <= b12_data_delayed_7;
b12_data_delayed_9 <= b12_data_delayed_8;
b12_data_delayed_10 <= b12_data_delayed_9;
b12_data_delayed_11 <= b12_data_delayed_10;
b12_data_delayed_12 <= b12_data_delayed_11;
b13_data_delayed_2 <= b13_data_delayed_1;
b13_data_delayed_3 <= b13_data_delayed_2;
b13_data_delayed_4 <= b13_data_delayed_3;
b13_data_delayed_5 <= b13_data_delayed_4;
b13_data_delayed_6 <= b13_data_delayed_5;
b13_data_delayed_7 <= b13_data_delayed_6;
b13_data_delayed_8 <= b13_data_delayed_7;
b13_data_delayed_9 <= b13_data_delayed_8;
b13_data_delayed_10 <= b13_data_delayed_9;
b13_data_delayed_11 <= b13_data_delayed_10;
b13_data_delayed_12 <= b13_data_delayed_11;
b13_data_delayed_13 <= b13_data_delayed_12;
b14_data_delayed_2 <= b14_data_delayed_1;
b14_data_delayed_3 <= b14_data_delayed_2;
b14_data_delayed_4 <= b14_data_delayed_3;
b14_data_delayed_5 <= b14_data_delayed_4;
b14_data_delayed_6 <= b14_data_delayed_5;
b14_data_delayed_7 <= b14_data_delayed_6;
b14_data_delayed_8 <= b14_data_delayed_7;
b14_data_delayed_9 <= b14_data_delayed_8;
b14_data_delayed_10 <= b14_data_delayed_9;
b14_data_delayed_11 <= b14_data_delayed_10;
b14_data_delayed_12 <= b14_data_delayed_11;
b14_data_delayed_13 <= b14_data_delayed_12;
b14_data_delayed_14 <= b14_data_delayed_13;
b15_data_delayed_2 <= b15_data_delayed_1;
b15_data_delayed_3 <= b15_data_delayed_2;
b15_data_delayed_4 <= b15_data_delayed_3;
b15_data_delayed_5 <= b15_data_delayed_4;
b15_data_delayed_6 <= b15_data_delayed_5;
b15_data_delayed_7 <= b15_data_delayed_6;
b15_data_delayed_8 <= b15_data_delayed_7;
b15_data_delayed_9 <= b15_data_delayed_8;
b15_data_delayed_10 <= b15_data_delayed_9;
b15_data_delayed_11 <= b15_data_delayed_10;
b15_data_delayed_12 <= b15_data_delayed_11;
b15_data_delayed_13 <= b15_data_delayed_12;
b15_data_delayed_14 <= b15_data_delayed_13;
b15_data_delayed_15 <= b15_data_delayed_14;
end
end
endmodule
//////////////////////////////////////////////////////////////////////////
// Systolically connected PEs
//////////////////////////////////////////////////////////////////////////
module systolic_pe_matrix(
clk,
reset,
pe_reset,
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11,
a12,
a13,
a14,
a15,
b0,
b1,
b2,
b3,
b4,
b5,
b6,
b7,
b8,
b9,
b10,
b11,
b12,
b13,
b14,
b15,
matrixC0_0,
matrixC0_1,
matrixC0_2,
matrixC0_3,
matrixC0_4,
matrixC0_5,
matrixC0_6,
matrixC0_7,
matrixC0_8,
matrixC0_9,
matrixC0_10,
matrixC0_11,
matrixC0_12,
matrixC0_13,
matrixC0_14,
matrixC0_15,
matrixC1_0,
matrixC1_1,
matrixC1_2,
matrixC1_3,
matrixC1_4,
matrixC1_5,
matrixC1_6,
matrixC1_7,
matrixC1_8,
matrixC1_9,
matrixC1_10,
matrixC1_11,
matrixC1_12,
matrixC1_13,
matrixC1_14,
matrixC1_15,
matrixC2_0,
matrixC2_1,
matrixC2_2,
matrixC2_3,
matrixC2_4,
matrixC2_5,
matrixC2_6,
matrixC2_7,
matrixC2_8,
matrixC2_9,
matrixC2_10,
matrixC2_11,
matrixC2_12,
matrixC2_13,
matrixC2_14,
matrixC2_15,
matrixC3_0,
matrixC3_1,
matrixC3_2,
matrixC3_3,
matrixC3_4,
matrixC3_5,
matrixC3_6,
matrixC3_7,
matrixC3_8,
matrixC3_9,
matrixC3_10,
matrixC3_11,
matrixC3_12,
matrixC3_13,
matrixC3_14,
matrixC3_15,
matrixC4_0,
matrixC4_1,
matrixC4_2,
matrixC4_3,
matrixC4_4,
matrixC4_5,
matrixC4_6,
matrixC4_7,
matrixC4_8,
matrixC4_9,
matrixC4_10,
matrixC4_11,
matrixC4_12,
matrixC4_13,
matrixC4_14,
matrixC4_15,
matrixC5_0,
matrixC5_1,
matrixC5_2,
matrixC5_3,
matrixC5_4,
matrixC5_5,
matrixC5_6,
matrixC5_7,
matrixC5_8,
matrixC5_9,
matrixC5_10,
matrixC5_11,
matrixC5_12,
matrixC5_13,
matrixC5_14,
matrixC5_15,
matrixC6_0,
matrixC6_1,
matrixC6_2,
matrixC6_3,
matrixC6_4,
matrixC6_5,
matrixC6_6,
matrixC6_7,
matrixC6_8,
matrixC6_9,
matrixC6_10,
matrixC6_11,
matrixC6_12,
matrixC6_13,
matrixC6_14,
matrixC6_15,
matrixC7_0,
matrixC7_1,
matrixC7_2,
matrixC7_3,
matrixC7_4,
matrixC7_5,
matrixC7_6,
matrixC7_7,
matrixC7_8,
matrixC7_9,
matrixC7_10,
matrixC7_11,
matrixC7_12,
matrixC7_13,
matrixC7_14,
matrixC7_15,
matrixC8_0,
matrixC8_1,
matrixC8_2,
matrixC8_3,
matrixC8_4,
matrixC8_5,
matrixC8_6,
matrixC8_7,
matrixC8_8,
matrixC8_9,
matrixC8_10,
matrixC8_11,
matrixC8_12,
matrixC8_13,
matrixC8_14,
matrixC8_15,
matrixC9_0,
matrixC9_1,
matrixC9_2,
matrixC9_3,
matrixC9_4,
matrixC9_5,
matrixC9_6,
matrixC9_7,
matrixC9_8,
matrixC9_9,
matrixC9_10,
matrixC9_11,
matrixC9_12,
matrixC9_13,
matrixC9_14,
matrixC9_15,
matrixC10_0,
matrixC10_1,
matrixC10_2,
matrixC10_3,
matrixC10_4,
matrixC10_5,
matrixC10_6,
matrixC10_7,
matrixC10_8,
matrixC10_9,
matrixC10_10,
matrixC10_11,
matrixC10_12,
matrixC10_13,
matrixC10_14,
matrixC10_15,
matrixC11_0,
matrixC11_1,
matrixC11_2,
matrixC11_3,
matrixC11_4,
matrixC11_5,
matrixC11_6,
matrixC11_7,
matrixC11_8,
matrixC11_9,
matrixC11_10,
matrixC11_11,
matrixC11_12,
matrixC11_13,
matrixC11_14,
matrixC11_15,
matrixC12_0,
matrixC12_1,
matrixC12_2,
matrixC12_3,
matrixC12_4,
matrixC12_5,
matrixC12_6,
matrixC12_7,
matrixC12_8,
matrixC12_9,
matrixC12_10,
matrixC12_11,
matrixC12_12,
matrixC12_13,
matrixC12_14,
matrixC12_15,
matrixC13_0,
matrixC13_1,
matrixC13_2,
matrixC13_3,
matrixC13_4,
matrixC13_5,
matrixC13_6,
matrixC13_7,
matrixC13_8,
matrixC13_9,
matrixC13_10,
matrixC13_11,
matrixC13_12,
matrixC13_13,
matrixC13_14,
matrixC13_15,
matrixC14_0,
matrixC14_1,
matrixC14_2,
matrixC14_3,
matrixC14_4,
matrixC14_5,
matrixC14_6,
matrixC14_7,
matrixC14_8,
matrixC14_9,
matrixC14_10,
matrixC14_11,
matrixC14_12,
matrixC14_13,
matrixC14_14,
matrixC14_15,
matrixC15_0,
matrixC15_1,
matrixC15_2,
matrixC15_3,
matrixC15_4,
matrixC15_5,
matrixC15_6,
matrixC15_7,
matrixC15_8,
matrixC15_9,
matrixC15_10,
matrixC15_11,
matrixC15_12,
matrixC15_13,
matrixC15_14,
matrixC15_15,
a_data_out,
b_data_out
);
input clk;
input reset;
input pe_reset;
input [`DWIDTH-1:0] a0;
input [`DWIDTH-1:0] a1;
input [`DWIDTH-1:0] a2;
input [`DWIDTH-1:0] a3;
input [`DWIDTH-1:0] a4;
input [`DWIDTH-1:0] a5;
input [`DWIDTH-1:0] a6;
input [`DWIDTH-1:0] a7;
input [`DWIDTH-1:0] a8;
input [`DWIDTH-1:0] a9;
input [`DWIDTH-1:0] a10;
input [`DWIDTH-1:0] a11;
input [`DWIDTH-1:0] a12;
input [`DWIDTH-1:0] a13;
input [`DWIDTH-1:0] a14;
input [`DWIDTH-1:0] a15;
input [`DWIDTH-1:0] b0;
input [`DWIDTH-1:0] b1;
input [`DWIDTH-1:0] b2;
input [`DWIDTH-1:0] b3;
input [`DWIDTH-1:0] b4;
input [`DWIDTH-1:0] b5;
input [`DWIDTH-1:0] b6;
input [`DWIDTH-1:0] b7;
input [`DWIDTH-1:0] b8;
input [`DWIDTH-1:0] b9;
input [`DWIDTH-1:0] b10;
input [`DWIDTH-1:0] b11;
input [`DWIDTH-1:0] b12;
input [`DWIDTH-1:0] b13;
input [`DWIDTH-1:0] b14;
input [`DWIDTH-1:0] b15;
output [`DWIDTH-1:0] matrixC0_0;
output [`DWIDTH-1:0] matrixC0_1;
output [`DWIDTH-1:0] matrixC0_2;
output [`DWIDTH-1:0] matrixC0_3;
output [`DWIDTH-1:0] matrixC0_4;
output [`DWIDTH-1:0] matrixC0_5;
output [`DWIDTH-1:0] matrixC0_6;
output [`DWIDTH-1:0] matrixC0_7;
output [`DWIDTH-1:0] matrixC0_8;
output [`DWIDTH-1:0] matrixC0_9;
output [`DWIDTH-1:0] matrixC0_10;
output [`DWIDTH-1:0] matrixC0_11;
output [`DWIDTH-1:0] matrixC0_12;
output [`DWIDTH-1:0] matrixC0_13;
output [`DWIDTH-1:0] matrixC0_14;
output [`DWIDTH-1:0] matrixC0_15;
output [`DWIDTH-1:0] matrixC1_0;
output [`DWIDTH-1:0] matrixC1_1;
output [`DWIDTH-1:0] matrixC1_2;
output [`DWIDTH-1:0] matrixC1_3;
output [`DWIDTH-1:0] matrixC1_4;
output [`DWIDTH-1:0] matrixC1_5;
output [`DWIDTH-1:0] matrixC1_6;
output [`DWIDTH-1:0] matrixC1_7;
output [`DWIDTH-1:0] matrixC1_8;
output [`DWIDTH-1:0] matrixC1_9;
output [`DWIDTH-1:0] matrixC1_10;
output [`DWIDTH-1:0] matrixC1_11;
output [`DWIDTH-1:0] matrixC1_12;
output [`DWIDTH-1:0] matrixC1_13;
output [`DWIDTH-1:0] matrixC1_14;
output [`DWIDTH-1:0] matrixC1_15;
output [`DWIDTH-1:0] matrixC2_0;
output [`DWIDTH-1:0] matrixC2_1;
output [`DWIDTH-1:0] matrixC2_2;
output [`DWIDTH-1:0] matrixC2_3;
output [`DWIDTH-1:0] matrixC2_4;
output [`DWIDTH-1:0] matrixC2_5;
output [`DWIDTH-1:0] matrixC2_6;
output [`DWIDTH-1:0] matrixC2_7;
output [`DWIDTH-1:0] matrixC2_8;
output [`DWIDTH-1:0] matrixC2_9;
output [`DWIDTH-1:0] matrixC2_10;
output [`DWIDTH-1:0] matrixC2_11;
output [`DWIDTH-1:0] matrixC2_12;
output [`DWIDTH-1:0] matrixC2_13;
output [`DWIDTH-1:0] matrixC2_14;
output [`DWIDTH-1:0] matrixC2_15;
output [`DWIDTH-1:0] matrixC3_0;
output [`DWIDTH-1:0] matrixC3_1;
output [`DWIDTH-1:0] matrixC3_2;
output [`DWIDTH-1:0] matrixC3_3;
output [`DWIDTH-1:0] matrixC3_4;
output [`DWIDTH-1:0] matrixC3_5;
output [`DWIDTH-1:0] matrixC3_6;
output [`DWIDTH-1:0] matrixC3_7;
output [`DWIDTH-1:0] matrixC3_8;
output [`DWIDTH-1:0] matrixC3_9;
output [`DWIDTH-1:0] matrixC3_10;
output [`DWIDTH-1:0] matrixC3_11;
output [`DWIDTH-1:0] matrixC3_12;
output [`DWIDTH-1:0] matrixC3_13;
output [`DWIDTH-1:0] matrixC3_14;
output [`DWIDTH-1:0] matrixC3_15;
output [`DWIDTH-1:0] matrixC4_0;
output [`DWIDTH-1:0] matrixC4_1;
output [`DWIDTH-1:0] matrixC4_2;
output [`DWIDTH-1:0] matrixC4_3;
output [`DWIDTH-1:0] matrixC4_4;
output [`DWIDTH-1:0] matrixC4_5;
output [`DWIDTH-1:0] matrixC4_6;
output [`DWIDTH-1:0] matrixC4_7;
output [`DWIDTH-1:0] matrixC4_8;
output [`DWIDTH-1:0] matrixC4_9;
output [`DWIDTH-1:0] matrixC4_10;
output [`DWIDTH-1:0] matrixC4_11;
output [`DWIDTH-1:0] matrixC4_12;
output [`DWIDTH-1:0] matrixC4_13;
output [`DWIDTH-1:0] matrixC4_14;
output [`DWIDTH-1:0] matrixC4_15;
output [`DWIDTH-1:0] matrixC5_0;
output [`DWIDTH-1:0] matrixC5_1;
output [`DWIDTH-1:0] matrixC5_2;
output [`DWIDTH-1:0] matrixC5_3;
output [`DWIDTH-1:0] matrixC5_4;
output [`DWIDTH-1:0] matrixC5_5;
output [`DWIDTH-1:0] matrixC5_6;
output [`DWIDTH-1:0] matrixC5_7;
output [`DWIDTH-1:0] matrixC5_8;
output [`DWIDTH-1:0] matrixC5_9;
output [`DWIDTH-1:0] matrixC5_10;
output [`DWIDTH-1:0] matrixC5_11;
output [`DWIDTH-1:0] matrixC5_12;
output [`DWIDTH-1:0] matrixC5_13;
output [`DWIDTH-1:0] matrixC5_14;
output [`DWIDTH-1:0] matrixC5_15;
output [`DWIDTH-1:0] matrixC6_0;
output [`DWIDTH-1:0] matrixC6_1;
output [`DWIDTH-1:0] matrixC6_2;
output [`DWIDTH-1:0] matrixC6_3;
output [`DWIDTH-1:0] matrixC6_4;
output [`DWIDTH-1:0] matrixC6_5;
output [`DWIDTH-1:0] matrixC6_6;
output [`DWIDTH-1:0] matrixC6_7;
output [`DWIDTH-1:0] matrixC6_8;
output [`DWIDTH-1:0] matrixC6_9;
output [`DWIDTH-1:0] matrixC6_10;
output [`DWIDTH-1:0] matrixC6_11;
output [`DWIDTH-1:0] matrixC6_12;
output [`DWIDTH-1:0] matrixC6_13;
output [`DWIDTH-1:0] matrixC6_14;
output [`DWIDTH-1:0] matrixC6_15;
output [`DWIDTH-1:0] matrixC7_0;
output [`DWIDTH-1:0] matrixC7_1;
output [`DWIDTH-1:0] matrixC7_2;
output [`DWIDTH-1:0] matrixC7_3;
output [`DWIDTH-1:0] matrixC7_4;
output [`DWIDTH-1:0] matrixC7_5;
output [`DWIDTH-1:0] matrixC7_6;
output [`DWIDTH-1:0] matrixC7_7;
output [`DWIDTH-1:0] matrixC7_8;
output [`DWIDTH-1:0] matrixC7_9;
output [`DWIDTH-1:0] matrixC7_10;
output [`DWIDTH-1:0] matrixC7_11;
output [`DWIDTH-1:0] matrixC7_12;
output [`DWIDTH-1:0] matrixC7_13;
output [`DWIDTH-1:0] matrixC7_14;
output [`DWIDTH-1:0] matrixC7_15;
output [`DWIDTH-1:0] matrixC8_0;
output [`DWIDTH-1:0] matrixC8_1;
output [`DWIDTH-1:0] matrixC8_2;
output [`DWIDTH-1:0] matrixC8_3;
output [`DWIDTH-1:0] matrixC8_4;
output [`DWIDTH-1:0] matrixC8_5;
output [`DWIDTH-1:0] matrixC8_6;
output [`DWIDTH-1:0] matrixC8_7;
output [`DWIDTH-1:0] matrixC8_8;
output [`DWIDTH-1:0] matrixC8_9;
output [`DWIDTH-1:0] matrixC8_10;
output [`DWIDTH-1:0] matrixC8_11;
output [`DWIDTH-1:0] matrixC8_12;
output [`DWIDTH-1:0] matrixC8_13;
output [`DWIDTH-1:0] matrixC8_14;
output [`DWIDTH-1:0] matrixC8_15;
output [`DWIDTH-1:0] matrixC9_0;
output [`DWIDTH-1:0] matrixC9_1;
output [`DWIDTH-1:0] matrixC9_2;
output [`DWIDTH-1:0] matrixC9_3;
output [`DWIDTH-1:0] matrixC9_4;
output [`DWIDTH-1:0] matrixC9_5;
output [`DWIDTH-1:0] matrixC9_6;
output [`DWIDTH-1:0] matrixC9_7;
output [`DWIDTH-1:0] matrixC9_8;
output [`DWIDTH-1:0] matrixC9_9;
output [`DWIDTH-1:0] matrixC9_10;
output [`DWIDTH-1:0] matrixC9_11;
output [`DWIDTH-1:0] matrixC9_12;
output [`DWIDTH-1:0] matrixC9_13;
output [`DWIDTH-1:0] matrixC9_14;
output [`DWIDTH-1:0] matrixC9_15;
output [`DWIDTH-1:0] matrixC10_0;
output [`DWIDTH-1:0] matrixC10_1;
output [`DWIDTH-1:0] matrixC10_2;
output [`DWIDTH-1:0] matrixC10_3;
output [`DWIDTH-1:0] matrixC10_4;
output [`DWIDTH-1:0] matrixC10_5;
output [`DWIDTH-1:0] matrixC10_6;
output [`DWIDTH-1:0] matrixC10_7;
output [`DWIDTH-1:0] matrixC10_8;
output [`DWIDTH-1:0] matrixC10_9;
output [`DWIDTH-1:0] matrixC10_10;
output [`DWIDTH-1:0] matrixC10_11;
output [`DWIDTH-1:0] matrixC10_12;
output [`DWIDTH-1:0] matrixC10_13;
output [`DWIDTH-1:0] matrixC10_14;
output [`DWIDTH-1:0] matrixC10_15;
output [`DWIDTH-1:0] matrixC11_0;
output [`DWIDTH-1:0] matrixC11_1;
output [`DWIDTH-1:0] matrixC11_2;
output [`DWIDTH-1:0] matrixC11_3;
output [`DWIDTH-1:0] matrixC11_4;
output [`DWIDTH-1:0] matrixC11_5;
output [`DWIDTH-1:0] matrixC11_6;
output [`DWIDTH-1:0] matrixC11_7;
output [`DWIDTH-1:0] matrixC11_8;
output [`DWIDTH-1:0] matrixC11_9;
output [`DWIDTH-1:0] matrixC11_10;
output [`DWIDTH-1:0] matrixC11_11;
output [`DWIDTH-1:0] matrixC11_12;
output [`DWIDTH-1:0] matrixC11_13;
output [`DWIDTH-1:0] matrixC11_14;
output [`DWIDTH-1:0] matrixC11_15;
output [`DWIDTH-1:0] matrixC12_0;
output [`DWIDTH-1:0] matrixC12_1;
output [`DWIDTH-1:0] matrixC12_2;
output [`DWIDTH-1:0] matrixC12_3;
output [`DWIDTH-1:0] matrixC12_4;
output [`DWIDTH-1:0] matrixC12_5;
output [`DWIDTH-1:0] matrixC12_6;
output [`DWIDTH-1:0] matrixC12_7;
output [`DWIDTH-1:0] matrixC12_8;
output [`DWIDTH-1:0] matrixC12_9;
output [`DWIDTH-1:0] matrixC12_10;
output [`DWIDTH-1:0] matrixC12_11;
output [`DWIDTH-1:0] matrixC12_12;
output [`DWIDTH-1:0] matrixC12_13;
output [`DWIDTH-1:0] matrixC12_14;
output [`DWIDTH-1:0] matrixC12_15;
output [`DWIDTH-1:0] matrixC13_0;
output [`DWIDTH-1:0] matrixC13_1;
output [`DWIDTH-1:0] matrixC13_2;
output [`DWIDTH-1:0] matrixC13_3;
output [`DWIDTH-1:0] matrixC13_4;
output [`DWIDTH-1:0] matrixC13_5;
output [`DWIDTH-1:0] matrixC13_6;
output [`DWIDTH-1:0] matrixC13_7;
output [`DWIDTH-1:0] matrixC13_8;
output [`DWIDTH-1:0] matrixC13_9;
output [`DWIDTH-1:0] matrixC13_10;
output [`DWIDTH-1:0] matrixC13_11;
output [`DWIDTH-1:0] matrixC13_12;
output [`DWIDTH-1:0] matrixC13_13;
output [`DWIDTH-1:0] matrixC13_14;
output [`DWIDTH-1:0] matrixC13_15;
output [`DWIDTH-1:0] matrixC14_0;
output [`DWIDTH-1:0] matrixC14_1;
output [`DWIDTH-1:0] matrixC14_2;
output [`DWIDTH-1:0] matrixC14_3;
output [`DWIDTH-1:0] matrixC14_4;
output [`DWIDTH-1:0] matrixC14_5;
output [`DWIDTH-1:0] matrixC14_6;
output [`DWIDTH-1:0] matrixC14_7;
output [`DWIDTH-1:0] matrixC14_8;
output [`DWIDTH-1:0] matrixC14_9;
output [`DWIDTH-1:0] matrixC14_10;
output [`DWIDTH-1:0] matrixC14_11;
output [`DWIDTH-1:0] matrixC14_12;
output [`DWIDTH-1:0] matrixC14_13;
output [`DWIDTH-1:0] matrixC14_14;
output [`DWIDTH-1:0] matrixC14_15;
output [`DWIDTH-1:0] matrixC15_0;
output [`DWIDTH-1:0] matrixC15_1;
output [`DWIDTH-1:0] matrixC15_2;
output [`DWIDTH-1:0] matrixC15_3;
output [`DWIDTH-1:0] matrixC15_4;
output [`DWIDTH-1:0] matrixC15_5;
output [`DWIDTH-1:0] matrixC15_6;
output [`DWIDTH-1:0] matrixC15_7;
output [`DWIDTH-1:0] matrixC15_8;
output [`DWIDTH-1:0] matrixC15_9;
output [`DWIDTH-1:0] matrixC15_10;
output [`DWIDTH-1:0] matrixC15_11;
output [`DWIDTH-1:0] matrixC15_12;
output [`DWIDTH-1:0] matrixC15_13;
output [`DWIDTH-1:0] matrixC15_14;
output [`DWIDTH-1:0] matrixC15_15;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] a_data_out;
output [`MAT_MUL_SIZE*`DWIDTH-1:0] b_data_out;
wire [`DWIDTH-1:0] a0_0to0_1, a0_1to0_2, a0_2to0_3, a0_3to0_4, a0_4to0_5, a0_5to0_6, a0_6to0_7, a0_7to0_8, a0_8to0_9, a0_9to0_10, a0_10to0_11, a0_11to0_12, a0_12to0_13, a0_13to0_14, a0_14to0_15, a0_15to0_16;
wire [`DWIDTH-1:0] a1_0to1_1, a1_1to1_2, a1_2to1_3, a1_3to1_4, a1_4to1_5, a1_5to1_6, a1_6to1_7, a1_7to1_8, a1_8to1_9, a1_9to1_10, a1_10to1_11, a1_11to1_12, a1_12to1_13, a1_13to1_14, a1_14to1_15, a1_15to1_16;
wire [`DWIDTH-1:0] a2_0to2_1, a2_1to2_2, a2_2to2_3, a2_3to2_4, a2_4to2_5, a2_5to2_6, a2_6to2_7, a2_7to2_8, a2_8to2_9, a2_9to2_10, a2_10to2_11, a2_11to2_12, a2_12to2_13, a2_13to2_14, a2_14to2_15, a2_15to2_16;
wire [`DWIDTH-1:0] a3_0to3_1, a3_1to3_2, a3_2to3_3, a3_3to3_4, a3_4to3_5, a3_5to3_6, a3_6to3_7, a3_7to3_8, a3_8to3_9, a3_9to3_10, a3_10to3_11, a3_11to3_12, a3_12to3_13, a3_13to3_14, a3_14to3_15, a3_15to3_16;
wire [`DWIDTH-1:0] a4_0to4_1, a4_1to4_2, a4_2to4_3, a4_3to4_4, a4_4to4_5, a4_5to4_6, a4_6to4_7, a4_7to4_8, a4_8to4_9, a4_9to4_10, a4_10to4_11, a4_11to4_12, a4_12to4_13, a4_13to4_14, a4_14to4_15, a4_15to4_16;
wire [`DWIDTH-1:0] a5_0to5_1, a5_1to5_2, a5_2to5_3, a5_3to5_4, a5_4to5_5, a5_5to5_6, a5_6to5_7, a5_7to5_8, a5_8to5_9, a5_9to5_10, a5_10to5_11, a5_11to5_12, a5_12to5_13, a5_13to5_14, a5_14to5_15, a5_15to5_16;
wire [`DWIDTH-1:0] a6_0to6_1, a6_1to6_2, a6_2to6_3, a6_3to6_4, a6_4to6_5, a6_5to6_6, a6_6to6_7, a6_7to6_8, a6_8to6_9, a6_9to6_10, a6_10to6_11, a6_11to6_12, a6_12to6_13, a6_13to6_14, a6_14to6_15, a6_15to6_16;
wire [`DWIDTH-1:0] a7_0to7_1, a7_1to7_2, a7_2to7_3, a7_3to7_4, a7_4to7_5, a7_5to7_6, a7_6to7_7, a7_7to7_8, a7_8to7_9, a7_9to7_10, a7_10to7_11, a7_11to7_12, a7_12to7_13, a7_13to7_14, a7_14to7_15, a7_15to7_16;
wire [`DWIDTH-1:0] a8_0to8_1, a8_1to8_2, a8_2to8_3, a8_3to8_4, a8_4to8_5, a8_5to8_6, a8_6to8_7, a8_7to8_8, a8_8to8_9, a8_9to8_10, a8_10to8_11, a8_11to8_12, a8_12to8_13, a8_13to8_14, a8_14to8_15, a8_15to8_16;
wire [`DWIDTH-1:0] a9_0to9_1, a9_1to9_2, a9_2to9_3, a9_3to9_4, a9_4to9_5, a9_5to9_6, a9_6to9_7, a9_7to9_8, a9_8to9_9, a9_9to9_10, a9_10to9_11, a9_11to9_12, a9_12to9_13, a9_13to9_14, a9_14to9_15, a9_15to9_16;
wire [`DWIDTH-1:0] a10_0to10_1, a10_1to10_2, a10_2to10_3, a10_3to10_4, a10_4to10_5, a10_5to10_6, a10_6to10_7, a10_7to10_8, a10_8to10_9, a10_9to10_10, a10_10to10_11, a10_11to10_12, a10_12to10_13, a10_13to10_14, a10_14to10_15, a10_15to10_16;
wire [`DWIDTH-1:0] a11_0to11_1, a11_1to11_2, a11_2to11_3, a11_3to11_4, a11_4to11_5, a11_5to11_6, a11_6to11_7, a11_7to11_8, a11_8to11_9, a11_9to11_10, a11_10to11_11, a11_11to11_12, a11_12to11_13, a11_13to11_14, a11_14to11_15, a11_15to11_16;
wire [`DWIDTH-1:0] a12_0to12_1, a12_1to12_2, a12_2to12_3, a12_3to12_4, a12_4to12_5, a12_5to12_6, a12_6to12_7, a12_7to12_8, a12_8to12_9, a12_9to12_10, a12_10to12_11, a12_11to12_12, a12_12to12_13, a12_13to12_14, a12_14to12_15, a12_15to12_16;
wire [`DWIDTH-1:0] a13_0to13_1, a13_1to13_2, a13_2to13_3, a13_3to13_4, a13_4to13_5, a13_5to13_6, a13_6to13_7, a13_7to13_8, a13_8to13_9, a13_9to13_10, a13_10to13_11, a13_11to13_12, a13_12to13_13, a13_13to13_14, a13_14to13_15, a13_15to13_16;
wire [`DWIDTH-1:0] a14_0to14_1, a14_1to14_2, a14_2to14_3, a14_3to14_4, a14_4to14_5, a14_5to14_6, a14_6to14_7, a14_7to14_8, a14_8to14_9, a14_9to14_10, a14_10to14_11, a14_11to14_12, a14_12to14_13, a14_13to14_14, a14_14to14_15, a14_15to14_16;
wire [`DWIDTH-1:0] a15_0to15_1, a15_1to15_2, a15_2to15_3, a15_3to15_4, a15_4to15_5, a15_5to15_6, a15_6to15_7, a15_7to15_8, a15_8to15_9, a15_9to15_10, a15_10to15_11, a15_11to15_12, a15_12to15_13, a15_13to15_14, a15_14to15_15, a15_15to15_16;
wire [`DWIDTH-1:0] b0_0to1_0, b1_0to2_0, b2_0to3_0, b3_0to4_0, b4_0to5_0, b5_0to6_0, b6_0to7_0, b7_0to8_0, b8_0to9_0, b9_0to10_0, b10_0to11_0, b11_0to12_0, b12_0to13_0, b13_0to14_0, b14_0to15_0, b15_0to16_0;
wire [`DWIDTH-1:0] b0_1to1_1, b1_1to2_1, b2_1to3_1, b3_1to4_1, b4_1to5_1, b5_1to6_1, b6_1to7_1, b7_1to8_1, b8_1to9_1, b9_1to10_1, b10_1to11_1, b11_1to12_1, b12_1to13_1, b13_1to14_1, b14_1to15_1, b15_1to16_1;
wire [`DWIDTH-1:0] b0_2to1_2, b1_2to2_2, b2_2to3_2, b3_2to4_2, b4_2to5_2, b5_2to6_2, b6_2to7_2, b7_2to8_2, b8_2to9_2, b9_2to10_2, b10_2to11_2, b11_2to12_2, b12_2to13_2, b13_2to14_2, b14_2to15_2, b15_2to16_2;
wire [`DWIDTH-1:0] b0_3to1_3, b1_3to2_3, b2_3to3_3, b3_3to4_3, b4_3to5_3, b5_3to6_3, b6_3to7_3, b7_3to8_3, b8_3to9_3, b9_3to10_3, b10_3to11_3, b11_3to12_3, b12_3to13_3, b13_3to14_3, b14_3to15_3, b15_3to16_3;
wire [`DWIDTH-1:0] b0_4to1_4, b1_4to2_4, b2_4to3_4, b3_4to4_4, b4_4to5_4, b5_4to6_4, b6_4to7_4, b7_4to8_4, b8_4to9_4, b9_4to10_4, b10_4to11_4, b11_4to12_4, b12_4to13_4, b13_4to14_4, b14_4to15_4, b15_4to16_4;
wire [`DWIDTH-1:0] b0_5to1_5, b1_5to2_5, b2_5to3_5, b3_5to4_5, b4_5to5_5, b5_5to6_5, b6_5to7_5, b7_5to8_5, b8_5to9_5, b9_5to10_5, b10_5to11_5, b11_5to12_5, b12_5to13_5, b13_5to14_5, b14_5to15_5, b15_5to16_5;
wire [`DWIDTH-1:0] b0_6to1_6, b1_6to2_6, b2_6to3_6, b3_6to4_6, b4_6to5_6, b5_6to6_6, b6_6to7_6, b7_6to8_6, b8_6to9_6, b9_6to10_6, b10_6to11_6, b11_6to12_6, b12_6to13_6, b13_6to14_6, b14_6to15_6, b15_6to16_6;
wire [`DWIDTH-1:0] b0_7to1_7, b1_7to2_7, b2_7to3_7, b3_7to4_7, b4_7to5_7, b5_7to6_7, b6_7to7_7, b7_7to8_7, b8_7to9_7, b9_7to10_7, b10_7to11_7, b11_7to12_7, b12_7to13_7, b13_7to14_7, b14_7to15_7, b15_7to16_7;
wire [`DWIDTH-1:0] b0_8to1_8, b1_8to2_8, b2_8to3_8, b3_8to4_8, b4_8to5_8, b5_8to6_8, b6_8to7_8, b7_8to8_8, b8_8to9_8, b9_8to10_8, b10_8to11_8, b11_8to12_8, b12_8to13_8, b13_8to14_8, b14_8to15_8, b15_8to16_8;
wire [`DWIDTH-1:0] b0_9to1_9, b1_9to2_9, b2_9to3_9, b3_9to4_9, b4_9to5_9, b5_9to6_9, b6_9to7_9, b7_9to8_9, b8_9to9_9, b9_9to10_9, b10_9to11_9, b11_9to12_9, b12_9to13_9, b13_9to14_9, b14_9to15_9, b15_9to16_9;
wire [`DWIDTH-1:0] b0_10to1_10, b1_10to2_10, b2_10to3_10, b3_10to4_10, b4_10to5_10, b5_10to6_10, b6_10to7_10, b7_10to8_10, b8_10to9_10, b9_10to10_10, b10_10to11_10, b11_10to12_10, b12_10to13_10, b13_10to14_10, b14_10to15_10, b15_10to16_10;
wire [`DWIDTH-1:0] b0_11to1_11, b1_11to2_11, b2_11to3_11, b3_11to4_11, b4_11to5_11, b5_11to6_11, b6_11to7_11, b7_11to8_11, b8_11to9_11, b9_11to10_11, b10_11to11_11, b11_11to12_11, b12_11to13_11, b13_11to14_11, b14_11to15_11, b15_11to16_11;
wire [`DWIDTH-1:0] b0_12to1_12, b1_12to2_12, b2_12to3_12, b3_12to4_12, b4_12to5_12, b5_12to6_12, b6_12to7_12, b7_12to8_12, b8_12to9_12, b9_12to10_12, b10_12to11_12, b11_12to12_12, b12_12to13_12, b13_12to14_12, b14_12to15_12, b15_12to16_12;
wire [`DWIDTH-1:0] b0_13to1_13, b1_13to2_13, b2_13to3_13, b3_13to4_13, b4_13to5_13, b5_13to6_13, b6_13to7_13, b7_13to8_13, b8_13to9_13, b9_13to10_13, b10_13to11_13, b11_13to12_13, b12_13to13_13, b13_13to14_13, b14_13to15_13, b15_13to16_13;
wire [`DWIDTH-1:0] b0_14to1_14, b1_14to2_14, b2_14to3_14, b3_14to4_14, b4_14to5_14, b5_14to6_14, b6_14to7_14, b7_14to8_14, b8_14to9_14, b9_14to10_14, b10_14to11_14, b11_14to12_14, b12_14to13_14, b13_14to14_14, b14_14to15_14, b15_14to16_14;
wire [`DWIDTH-1:0] b0_15to1_15, b1_15to2_15, b2_15to3_15, b3_15to4_15, b4_15to5_15, b5_15to6_15, b6_15to7_15, b7_15to8_15, b8_15to9_15, b9_15to10_15, b10_15to11_15, b11_15to12_15, b12_15to13_15, b13_15to14_15, b14_15to15_15, b15_15to16_15;
//////////////////////////////////////////////////////////////////////////
// Instantiations of the actual PEs
//////////////////////////////////////////////////////////////////////////
//For larger matmul, more PEs will be needed
wire effective_rst;
assign effective_rst = reset | pe_reset;
processing_element pe0_0(.reset(effective_rst), .clk(clk), .in_a(a0), .in_b(b0), .out_a(a0_0to0_1), .out_b(b0_0to1_0), .out_c(matrixC0_0));
processing_element pe0_1(.reset(effective_rst), .clk(clk), .in_a(a0_0to0_1), .in_b(b1), .out_a(a0_1to0_2), .out_b(b0_1to1_1), .out_c(matrixC0_1));
processing_element pe0_2(.reset(effective_rst), .clk(clk), .in_a(a0_1to0_2), .in_b(b2), .out_a(a0_2to0_3), .out_b(b0_2to1_2), .out_c(matrixC0_2));
processing_element pe0_3(.reset(effective_rst), .clk(clk), .in_a(a0_2to0_3), .in_b(b3), .out_a(a0_3to0_4), .out_b(b0_3to1_3), .out_c(matrixC0_3));
processing_element pe0_4(.reset(effective_rst), .clk(clk), .in_a(a0_3to0_4), .in_b(b4), .out_a(a0_4to0_5), .out_b(b0_4to1_4), .out_c(matrixC0_4));
processing_element pe0_5(.reset(effective_rst), .clk(clk), .in_a(a0_4to0_5), .in_b(b5), .out_a(a0_5to0_6), .out_b(b0_5to1_5), .out_c(matrixC0_5));
processing_element pe0_6(.reset(effective_rst), .clk(clk), .in_a(a0_5to0_6), .in_b(b6), .out_a(a0_6to0_7), .out_b(b0_6to1_6), .out_c(matrixC0_6));
processing_element pe0_7(.reset(effective_rst), .clk(clk), .in_a(a0_6to0_7), .in_b(b7), .out_a(a0_7to0_8), .out_b(b0_7to1_7), .out_c(matrixC0_7));
processing_element pe0_8(.reset(effective_rst), .clk(clk), .in_a(a0_7to0_8), .in_b(b8), .out_a(a0_8to0_9), .out_b(b0_8to1_8), .out_c(matrixC0_8));
processing_element pe0_9(.reset(effective_rst), .clk(clk), .in_a(a0_8to0_9), .in_b(b9), .out_a(a0_9to0_10), .out_b(b0_9to1_9), .out_c(matrixC0_9));
processing_element pe0_10(.reset(effective_rst), .clk(clk), .in_a(a0_9to0_10), .in_b(b10), .out_a(a0_10to0_11), .out_b(b0_10to1_10), .out_c(matrixC0_10));
processing_element pe0_11(.reset(effective_rst), .clk(clk), .in_a(a0_10to0_11), .in_b(b11), .out_a(a0_11to0_12), .out_b(b0_11to1_11), .out_c(matrixC0_11));
processing_element pe0_12(.reset(effective_rst), .clk(clk), .in_a(a0_11to0_12), .in_b(b12), .out_a(a0_12to0_13), .out_b(b0_12to1_12), .out_c(matrixC0_12));
processing_element pe0_13(.reset(effective_rst), .clk(clk), .in_a(a0_12to0_13), .in_b(b13), .out_a(a0_13to0_14), .out_b(b0_13to1_13), .out_c(matrixC0_13));
processing_element pe0_14(.reset(effective_rst), .clk(clk), .in_a(a0_13to0_14), .in_b(b14), .out_a(a0_14to0_15), .out_b(b0_14to1_14), .out_c(matrixC0_14));
processing_element pe0_15(.reset(effective_rst), .clk(clk), .in_a(a0_14to0_15), .in_b(b15), .out_a(a0_15to0_16), .out_b(b0_15to1_15), .out_c(matrixC0_15));
processing_element pe1_0(.reset(effective_rst), .clk(clk), .in_a(a1), .in_b(b0_0to1_0), .out_a(a1_0to1_1), .out_b(b1_0to2_0), .out_c(matrixC1_0));
processing_element pe2_0(.reset(effective_rst), .clk(clk), .in_a(a2), .in_b(b1_0to2_0), .out_a(a2_0to2_1), .out_b(b2_0to3_0), .out_c(matrixC2_0));
processing_element pe3_0(.reset(effective_rst), .clk(clk), .in_a(a3), .in_b(b2_0to3_0), .out_a(a3_0to3_1), .out_b(b3_0to4_0), .out_c(matrixC3_0));
processing_element pe4_0(.reset(effective_rst), .clk(clk), .in_a(a4), .in_b(b3_0to4_0), .out_a(a4_0to4_1), .out_b(b4_0to5_0), .out_c(matrixC4_0));
processing_element pe5_0(.reset(effective_rst), .clk(clk), .in_a(a5), .in_b(b4_0to5_0), .out_a(a5_0to5_1), .out_b(b5_0to6_0), .out_c(matrixC5_0));
processing_element pe6_0(.reset(effective_rst), .clk(clk), .in_a(a6), .in_b(b5_0to6_0), .out_a(a6_0to6_1), .out_b(b6_0to7_0), .out_c(matrixC6_0));
processing_element pe7_0(.reset(effective_rst), .clk(clk), .in_a(a7), .in_b(b6_0to7_0), .out_a(a7_0to7_1), .out_b(b7_0to8_0), .out_c(matrixC7_0));
processing_element pe8_0(.reset(effective_rst), .clk(clk), .in_a(a8), .in_b(b7_0to8_0), .out_a(a8_0to8_1), .out_b(b8_0to9_0), .out_c(matrixC8_0));
processing_element pe9_0(.reset(effective_rst), .clk(clk), .in_a(a9), .in_b(b8_0to9_0), .out_a(a9_0to9_1), .out_b(b9_0to10_0), .out_c(matrixC9_0));
processing_element pe10_0(.reset(effective_rst), .clk(clk), .in_a(a10), .in_b(b9_0to10_0), .out_a(a10_0to10_1), .out_b(b10_0to11_0), .out_c(matrixC10_0));
processing_element pe11_0(.reset(effective_rst), .clk(clk), .in_a(a11), .in_b(b10_0to11_0), .out_a(a11_0to11_1), .out_b(b11_0to12_0), .out_c(matrixC11_0));
processing_element pe12_0(.reset(effective_rst), .clk(clk), .in_a(a12), .in_b(b11_0to12_0), .out_a(a12_0to12_1), .out_b(b12_0to13_0), .out_c(matrixC12_0));
processing_element pe13_0(.reset(effective_rst), .clk(clk), .in_a(a13), .in_b(b12_0to13_0), .out_a(a13_0to13_1), .out_b(b13_0to14_0), .out_c(matrixC13_0));
processing_element pe14_0(.reset(effective_rst), .clk(clk), .in_a(a14), .in_b(b13_0to14_0), .out_a(a14_0to14_1), .out_b(b14_0to15_0), .out_c(matrixC14_0));
processing_element pe15_0(.reset(effective_rst), .clk(clk), .in_a(a15), .in_b(b14_0to15_0), .out_a(a15_0to15_1), .out_b(b15_0to16_0), .out_c(matrixC15_0));
processing_element pe1_1(.reset(effective_rst), .clk(clk), .in_a(a1_0to1_1), .in_b(b0_1to1_1), .out_a(a1_1to1_2), .out_b(b1_1to2_1), .out_c(matrixC1_1));
processing_element pe1_2(.reset(effective_rst), .clk(clk), .in_a(a1_1to1_2), .in_b(b0_2to1_2), .out_a(a1_2to1_3), .out_b(b1_2to2_2), .out_c(matrixC1_2));
processing_element pe1_3(.reset(effective_rst), .clk(clk), .in_a(a1_2to1_3), .in_b(b0_3to1_3), .out_a(a1_3to1_4), .out_b(b1_3to2_3), .out_c(matrixC1_3));
processing_element pe1_4(.reset(effective_rst), .clk(clk), .in_a(a1_3to1_4), .in_b(b0_4to1_4), .out_a(a1_4to1_5), .out_b(b1_4to2_4), .out_c(matrixC1_4));
processing_element pe1_5(.reset(effective_rst), .clk(clk), .in_a(a1_4to1_5), .in_b(b0_5to1_5), .out_a(a1_5to1_6), .out_b(b1_5to2_5), .out_c(matrixC1_5));
processing_element pe1_6(.reset(effective_rst), .clk(clk), .in_a(a1_5to1_6), .in_b(b0_6to1_6), .out_a(a1_6to1_7), .out_b(b1_6to2_6), .out_c(matrixC1_6));
processing_element pe1_7(.reset(effective_rst), .clk(clk), .in_a(a1_6to1_7), .in_b(b0_7to1_7), .out_a(a1_7to1_8), .out_b(b1_7to2_7), .out_c(matrixC1_7));
processing_element pe1_8(.reset(effective_rst), .clk(clk), .in_a(a1_7to1_8), .in_b(b0_8to1_8), .out_a(a1_8to1_9), .out_b(b1_8to2_8), .out_c(matrixC1_8));
processing_element pe1_9(.reset(effective_rst), .clk(clk), .in_a(a1_8to1_9), .in_b(b0_9to1_9), .out_a(a1_9to1_10), .out_b(b1_9to2_9), .out_c(matrixC1_9));
processing_element pe1_10(.reset(effective_rst), .clk(clk), .in_a(a1_9to1_10), .in_b(b0_10to1_10), .out_a(a1_10to1_11), .out_b(b1_10to2_10), .out_c(matrixC1_10));
processing_element pe1_11(.reset(effective_rst), .clk(clk), .in_a(a1_10to1_11), .in_b(b0_11to1_11), .out_a(a1_11to1_12), .out_b(b1_11to2_11), .out_c(matrixC1_11));
processing_element pe1_12(.reset(effective_rst), .clk(clk), .in_a(a1_11to1_12), .in_b(b0_12to1_12), .out_a(a1_12to1_13), .out_b(b1_12to2_12), .out_c(matrixC1_12));
processing_element pe1_13(.reset(effective_rst), .clk(clk), .in_a(a1_12to1_13), .in_b(b0_13to1_13), .out_a(a1_13to1_14), .out_b(b1_13to2_13), .out_c(matrixC1_13));
processing_element pe1_14(.reset(effective_rst), .clk(clk), .in_a(a1_13to1_14), .in_b(b0_14to1_14), .out_a(a1_14to1_15), .out_b(b1_14to2_14), .out_c(matrixC1_14));
processing_element pe1_15(.reset(effective_rst), .clk(clk), .in_a(a1_14to1_15), .in_b(b0_15to1_15), .out_a(a1_15to1_16), .out_b(b1_15to2_15), .out_c(matrixC1_15));
processing_element pe2_1(.reset(effective_rst), .clk(clk), .in_a(a2_0to2_1), .in_b(b1_1to2_1), .out_a(a2_1to2_2), .out_b(b2_1to3_1), .out_c(matrixC2_1));
processing_element pe2_2(.reset(effective_rst), .clk(clk), .in_a(a2_1to2_2), .in_b(b1_2to2_2), .out_a(a2_2to2_3), .out_b(b2_2to3_2), .out_c(matrixC2_2));
processing_element pe2_3(.reset(effective_rst), .clk(clk), .in_a(a2_2to2_3), .in_b(b1_3to2_3), .out_a(a2_3to2_4), .out_b(b2_3to3_3), .out_c(matrixC2_3));
processing_element pe2_4(.reset(effective_rst), .clk(clk), .in_a(a2_3to2_4), .in_b(b1_4to2_4), .out_a(a2_4to2_5), .out_b(b2_4to3_4), .out_c(matrixC2_4));
processing_element pe2_5(.reset(effective_rst), .clk(clk), .in_a(a2_4to2_5), .in_b(b1_5to2_5), .out_a(a2_5to2_6), .out_b(b2_5to3_5), .out_c(matrixC2_5));
processing_element pe2_6(.reset(effective_rst), .clk(clk), .in_a(a2_5to2_6), .in_b(b1_6to2_6), .out_a(a2_6to2_7), .out_b(b2_6to3_6), .out_c(matrixC2_6));
processing_element pe2_7(.reset(effective_rst), .clk(clk), .in_a(a2_6to2_7), .in_b(b1_7to2_7), .out_a(a2_7to2_8), .out_b(b2_7to3_7), .out_c(matrixC2_7));
processing_element pe2_8(.reset(effective_rst), .clk(clk), .in_a(a2_7to2_8), .in_b(b1_8to2_8), .out_a(a2_8to2_9), .out_b(b2_8to3_8), .out_c(matrixC2_8));
processing_element pe2_9(.reset(effective_rst), .clk(clk), .in_a(a2_8to2_9), .in_b(b1_9to2_9), .out_a(a2_9to2_10), .out_b(b2_9to3_9), .out_c(matrixC2_9));
processing_element pe2_10(.reset(effective_rst), .clk(clk), .in_a(a2_9to2_10), .in_b(b1_10to2_10), .out_a(a2_10to2_11), .out_b(b2_10to3_10), .out_c(matrixC2_10));
processing_element pe2_11(.reset(effective_rst), .clk(clk), .in_a(a2_10to2_11), .in_b(b1_11to2_11), .out_a(a2_11to2_12), .out_b(b2_11to3_11), .out_c(matrixC2_11));
processing_element pe2_12(.reset(effective_rst), .clk(clk), .in_a(a2_11to2_12), .in_b(b1_12to2_12), .out_a(a2_12to2_13), .out_b(b2_12to3_12), .out_c(matrixC2_12));
processing_element pe2_13(.reset(effective_rst), .clk(clk), .in_a(a2_12to2_13), .in_b(b1_13to2_13), .out_a(a2_13to2_14), .out_b(b2_13to3_13), .out_c(matrixC2_13));
processing_element pe2_14(.reset(effective_rst), .clk(clk), .in_a(a2_13to2_14), .in_b(b1_14to2_14), .out_a(a2_14to2_15), .out_b(b2_14to3_14), .out_c(matrixC2_14));
processing_element pe2_15(.reset(effective_rst), .clk(clk), .in_a(a2_14to2_15), .in_b(b1_15to2_15), .out_a(a2_15to2_16), .out_b(b2_15to3_15), .out_c(matrixC2_15));
processing_element pe3_1(.reset(effective_rst), .clk(clk), .in_a(a3_0to3_1), .in_b(b2_1to3_1), .out_a(a3_1to3_2), .out_b(b3_1to4_1), .out_c(matrixC3_1));
processing_element pe3_2(.reset(effective_rst), .clk(clk), .in_a(a3_1to3_2), .in_b(b2_2to3_2), .out_a(a3_2to3_3), .out_b(b3_2to4_2), .out_c(matrixC3_2));
processing_element pe3_3(.reset(effective_rst), .clk(clk), .in_a(a3_2to3_3), .in_b(b2_3to3_3), .out_a(a3_3to3_4), .out_b(b3_3to4_3), .out_c(matrixC3_3));
processing_element pe3_4(.reset(effective_rst), .clk(clk), .in_a(a3_3to3_4), .in_b(b2_4to3_4), .out_a(a3_4to3_5), .out_b(b3_4to4_4), .out_c(matrixC3_4));
processing_element pe3_5(.reset(effective_rst), .clk(clk), .in_a(a3_4to3_5), .in_b(b2_5to3_5), .out_a(a3_5to3_6), .out_b(b3_5to4_5), .out_c(matrixC3_5));
processing_element pe3_6(.reset(effective_rst), .clk(clk), .in_a(a3_5to3_6), .in_b(b2_6to3_6), .out_a(a3_6to3_7), .out_b(b3_6to4_6), .out_c(matrixC3_6));
processing_element pe3_7(.reset(effective_rst), .clk(clk), .in_a(a3_6to3_7), .in_b(b2_7to3_7), .out_a(a3_7to3_8), .out_b(b3_7to4_7), .out_c(matrixC3_7));
processing_element pe3_8(.reset(effective_rst), .clk(clk), .in_a(a3_7to3_8), .in_b(b2_8to3_8), .out_a(a3_8to3_9), .out_b(b3_8to4_8), .out_c(matrixC3_8));
processing_element pe3_9(.reset(effective_rst), .clk(clk), .in_a(a3_8to3_9), .in_b(b2_9to3_9), .out_a(a3_9to3_10), .out_b(b3_9to4_9), .out_c(matrixC3_9));
processing_element pe3_10(.reset(effective_rst), .clk(clk), .in_a(a3_9to3_10), .in_b(b2_10to3_10), .out_a(a3_10to3_11), .out_b(b3_10to4_10), .out_c(matrixC3_10));
processing_element pe3_11(.reset(effective_rst), .clk(clk), .in_a(a3_10to3_11), .in_b(b2_11to3_11), .out_a(a3_11to3_12), .out_b(b3_11to4_11), .out_c(matrixC3_11));
processing_element pe3_12(.reset(effective_rst), .clk(clk), .in_a(a3_11to3_12), .in_b(b2_12to3_12), .out_a(a3_12to3_13), .out_b(b3_12to4_12), .out_c(matrixC3_12));
processing_element pe3_13(.reset(effective_rst), .clk(clk), .in_a(a3_12to3_13), .in_b(b2_13to3_13), .out_a(a3_13to3_14), .out_b(b3_13to4_13), .out_c(matrixC3_13));
processing_element pe3_14(.reset(effective_rst), .clk(clk), .in_a(a3_13to3_14), .in_b(b2_14to3_14), .out_a(a3_14to3_15), .out_b(b3_14to4_14), .out_c(matrixC3_14));
processing_element pe3_15(.reset(effective_rst), .clk(clk), .in_a(a3_14to3_15), .in_b(b2_15to3_15), .out_a(a3_15to3_16), .out_b(b3_15to4_15), .out_c(matrixC3_15));
processing_element pe4_1(.reset(effective_rst), .clk(clk), .in_a(a4_0to4_1), .in_b(b3_1to4_1), .out_a(a4_1to4_2), .out_b(b4_1to5_1), .out_c(matrixC4_1));
processing_element pe4_2(.reset(effective_rst), .clk(clk), .in_a(a4_1to4_2), .in_b(b3_2to4_2), .out_a(a4_2to4_3), .out_b(b4_2to5_2), .out_c(matrixC4_2));
processing_element pe4_3(.reset(effective_rst), .clk(clk), .in_a(a4_2to4_3), .in_b(b3_3to4_3), .out_a(a4_3to4_4), .out_b(b4_3to5_3), .out_c(matrixC4_3));
processing_element pe4_4(.reset(effective_rst), .clk(clk), .in_a(a4_3to4_4), .in_b(b3_4to4_4), .out_a(a4_4to4_5), .out_b(b4_4to5_4), .out_c(matrixC4_4));
processing_element pe4_5(.reset(effective_rst), .clk(clk), .in_a(a4_4to4_5), .in_b(b3_5to4_5), .out_a(a4_5to4_6), .out_b(b4_5to5_5), .out_c(matrixC4_5));
processing_element pe4_6(.reset(effective_rst), .clk(clk), .in_a(a4_5to4_6), .in_b(b3_6to4_6), .out_a(a4_6to4_7), .out_b(b4_6to5_6), .out_c(matrixC4_6));
processing_element pe4_7(.reset(effective_rst), .clk(clk), .in_a(a4_6to4_7), .in_b(b3_7to4_7), .out_a(a4_7to4_8), .out_b(b4_7to5_7), .out_c(matrixC4_7));
processing_element pe4_8(.reset(effective_rst), .clk(clk), .in_a(a4_7to4_8), .in_b(b3_8to4_8), .out_a(a4_8to4_9), .out_b(b4_8to5_8), .out_c(matrixC4_8));
processing_element pe4_9(.reset(effective_rst), .clk(clk), .in_a(a4_8to4_9), .in_b(b3_9to4_9), .out_a(a4_9to4_10), .out_b(b4_9to5_9), .out_c(matrixC4_9));
processing_element pe4_10(.reset(effective_rst), .clk(clk), .in_a(a4_9to4_10), .in_b(b3_10to4_10), .out_a(a4_10to4_11), .out_b(b4_10to5_10), .out_c(matrixC4_10));
processing_element pe4_11(.reset(effective_rst), .clk(clk), .in_a(a4_10to4_11), .in_b(b3_11to4_11), .out_a(a4_11to4_12), .out_b(b4_11to5_11), .out_c(matrixC4_11));
processing_element pe4_12(.reset(effective_rst), .clk(clk), .in_a(a4_11to4_12), .in_b(b3_12to4_12), .out_a(a4_12to4_13), .out_b(b4_12to5_12), .out_c(matrixC4_12));
processing_element pe4_13(.reset(effective_rst), .clk(clk), .in_a(a4_12to4_13), .in_b(b3_13to4_13), .out_a(a4_13to4_14), .out_b(b4_13to5_13), .out_c(matrixC4_13));
processing_element pe4_14(.reset(effective_rst), .clk(clk), .in_a(a4_13to4_14), .in_b(b3_14to4_14), .out_a(a4_14to4_15), .out_b(b4_14to5_14), .out_c(matrixC4_14));
processing_element pe4_15(.reset(effective_rst), .clk(clk), .in_a(a4_14to4_15), .in_b(b3_15to4_15), .out_a(a4_15to4_16), .out_b(b4_15to5_15), .out_c(matrixC4_15));
processing_element pe5_1(.reset(effective_rst), .clk(clk), .in_a(a5_0to5_1), .in_b(b4_1to5_1), .out_a(a5_1to5_2), .out_b(b5_1to6_1), .out_c(matrixC5_1));
processing_element pe5_2(.reset(effective_rst), .clk(clk), .in_a(a5_1to5_2), .in_b(b4_2to5_2), .out_a(a5_2to5_3), .out_b(b5_2to6_2), .out_c(matrixC5_2));
processing_element pe5_3(.reset(effective_rst), .clk(clk), .in_a(a5_2to5_3), .in_b(b4_3to5_3), .out_a(a5_3to5_4), .out_b(b5_3to6_3), .out_c(matrixC5_3));
processing_element pe5_4(.reset(effective_rst), .clk(clk), .in_a(a5_3to5_4), .in_b(b4_4to5_4), .out_a(a5_4to5_5), .out_b(b5_4to6_4), .out_c(matrixC5_4));
processing_element pe5_5(.reset(effective_rst), .clk(clk), .in_a(a5_4to5_5), .in_b(b4_5to5_5), .out_a(a5_5to5_6), .out_b(b5_5to6_5), .out_c(matrixC5_5));
processing_element pe5_6(.reset(effective_rst), .clk(clk), .in_a(a5_5to5_6), .in_b(b4_6to5_6), .out_a(a5_6to5_7), .out_b(b5_6to6_6), .out_c(matrixC5_6));
processing_element pe5_7(.reset(effective_rst), .clk(clk), .in_a(a5_6to5_7), .in_b(b4_7to5_7), .out_a(a5_7to5_8), .out_b(b5_7to6_7), .out_c(matrixC5_7));
processing_element pe5_8(.reset(effective_rst), .clk(clk), .in_a(a5_7to5_8), .in_b(b4_8to5_8), .out_a(a5_8to5_9), .out_b(b5_8to6_8), .out_c(matrixC5_8));
processing_element pe5_9(.reset(effective_rst), .clk(clk), .in_a(a5_8to5_9), .in_b(b4_9to5_9), .out_a(a5_9to5_10), .out_b(b5_9to6_9), .out_c(matrixC5_9));
processing_element pe5_10(.reset(effective_rst), .clk(clk), .in_a(a5_9to5_10), .in_b(b4_10to5_10), .out_a(a5_10to5_11), .out_b(b5_10to6_10), .out_c(matrixC5_10));
processing_element pe5_11(.reset(effective_rst), .clk(clk), .in_a(a5_10to5_11), .in_b(b4_11to5_11), .out_a(a5_11to5_12), .out_b(b5_11to6_11), .out_c(matrixC5_11));
processing_element pe5_12(.reset(effective_rst), .clk(clk), .in_a(a5_11to5_12), .in_b(b4_12to5_12), .out_a(a5_12to5_13), .out_b(b5_12to6_12), .out_c(matrixC5_12));
processing_element pe5_13(.reset(effective_rst), .clk(clk), .in_a(a5_12to5_13), .in_b(b4_13to5_13), .out_a(a5_13to5_14), .out_b(b5_13to6_13), .out_c(matrixC5_13));
processing_element pe5_14(.reset(effective_rst), .clk(clk), .in_a(a5_13to5_14), .in_b(b4_14to5_14), .out_a(a5_14to5_15), .out_b(b5_14to6_14), .out_c(matrixC5_14));
processing_element pe5_15(.reset(effective_rst), .clk(clk), .in_a(a5_14to5_15), .in_b(b4_15to5_15), .out_a(a5_15to5_16), .out_b(b5_15to6_15), .out_c(matrixC5_15));
processing_element pe6_1(.reset(effective_rst), .clk(clk), .in_a(a6_0to6_1), .in_b(b5_1to6_1), .out_a(a6_1to6_2), .out_b(b6_1to7_1), .out_c(matrixC6_1));
processing_element pe6_2(.reset(effective_rst), .clk(clk), .in_a(a6_1to6_2), .in_b(b5_2to6_2), .out_a(a6_2to6_3), .out_b(b6_2to7_2), .out_c(matrixC6_2));
processing_element pe6_3(.reset(effective_rst), .clk(clk), .in_a(a6_2to6_3), .in_b(b5_3to6_3), .out_a(a6_3to6_4), .out_b(b6_3to7_3), .out_c(matrixC6_3));
processing_element pe6_4(.reset(effective_rst), .clk(clk), .in_a(a6_3to6_4), .in_b(b5_4to6_4), .out_a(a6_4to6_5), .out_b(b6_4to7_4), .out_c(matrixC6_4));
processing_element pe6_5(.reset(effective_rst), .clk(clk), .in_a(a6_4to6_5), .in_b(b5_5to6_5), .out_a(a6_5to6_6), .out_b(b6_5to7_5), .out_c(matrixC6_5));
processing_element pe6_6(.reset(effective_rst), .clk(clk), .in_a(a6_5to6_6), .in_b(b5_6to6_6), .out_a(a6_6to6_7), .out_b(b6_6to7_6), .out_c(matrixC6_6));
processing_element pe6_7(.reset(effective_rst), .clk(clk), .in_a(a6_6to6_7), .in_b(b5_7to6_7), .out_a(a6_7to6_8), .out_b(b6_7to7_7), .out_c(matrixC6_7));
processing_element pe6_8(.reset(effective_rst), .clk(clk), .in_a(a6_7to6_8), .in_b(b5_8to6_8), .out_a(a6_8to6_9), .out_b(b6_8to7_8), .out_c(matrixC6_8));
processing_element pe6_9(.reset(effective_rst), .clk(clk), .in_a(a6_8to6_9), .in_b(b5_9to6_9), .out_a(a6_9to6_10), .out_b(b6_9to7_9), .out_c(matrixC6_9));
processing_element pe6_10(.reset(effective_rst), .clk(clk), .in_a(a6_9to6_10), .in_b(b5_10to6_10), .out_a(a6_10to6_11), .out_b(b6_10to7_10), .out_c(matrixC6_10));
processing_element pe6_11(.reset(effective_rst), .clk(clk), .in_a(a6_10to6_11), .in_b(b5_11to6_11), .out_a(a6_11to6_12), .out_b(b6_11to7_11), .out_c(matrixC6_11));
processing_element pe6_12(.reset(effective_rst), .clk(clk), .in_a(a6_11to6_12), .in_b(b5_12to6_12), .out_a(a6_12to6_13), .out_b(b6_12to7_12), .out_c(matrixC6_12));
processing_element pe6_13(.reset(effective_rst), .clk(clk), .in_a(a6_12to6_13), .in_b(b5_13to6_13), .out_a(a6_13to6_14), .out_b(b6_13to7_13), .out_c(matrixC6_13));
processing_element pe6_14(.reset(effective_rst), .clk(clk), .in_a(a6_13to6_14), .in_b(b5_14to6_14), .out_a(a6_14to6_15), .out_b(b6_14to7_14), .out_c(matrixC6_14));
processing_element pe6_15(.reset(effective_rst), .clk(clk), .in_a(a6_14to6_15), .in_b(b5_15to6_15), .out_a(a6_15to6_16), .out_b(b6_15to7_15), .out_c(matrixC6_15));
processing_element pe7_1(.reset(effective_rst), .clk(clk), .in_a(a7_0to7_1), .in_b(b6_1to7_1), .out_a(a7_1to7_2), .out_b(b7_1to8_1), .out_c(matrixC7_1));
processing_element pe7_2(.reset(effective_rst), .clk(clk), .in_a(a7_1to7_2), .in_b(b6_2to7_2), .out_a(a7_2to7_3), .out_b(b7_2to8_2), .out_c(matrixC7_2));
processing_element pe7_3(.reset(effective_rst), .clk(clk), .in_a(a7_2to7_3), .in_b(b6_3to7_3), .out_a(a7_3to7_4), .out_b(b7_3to8_3), .out_c(matrixC7_3));
processing_element pe7_4(.reset(effective_rst), .clk(clk), .in_a(a7_3to7_4), .in_b(b6_4to7_4), .out_a(a7_4to7_5), .out_b(b7_4to8_4), .out_c(matrixC7_4));
processing_element pe7_5(.reset(effective_rst), .clk(clk), .in_a(a7_4to7_5), .in_b(b6_5to7_5), .out_a(a7_5to7_6), .out_b(b7_5to8_5), .out_c(matrixC7_5));
processing_element pe7_6(.reset(effective_rst), .clk(clk), .in_a(a7_5to7_6), .in_b(b6_6to7_6), .out_a(a7_6to7_7), .out_b(b7_6to8_6), .out_c(matrixC7_6));
processing_element pe7_7(.reset(effective_rst), .clk(clk), .in_a(a7_6to7_7), .in_b(b6_7to7_7), .out_a(a7_7to7_8), .out_b(b7_7to8_7), .out_c(matrixC7_7));
processing_element pe7_8(.reset(effective_rst), .clk(clk), .in_a(a7_7to7_8), .in_b(b6_8to7_8), .out_a(a7_8to7_9), .out_b(b7_8to8_8), .out_c(matrixC7_8));
processing_element pe7_9(.reset(effective_rst), .clk(clk), .in_a(a7_8to7_9), .in_b(b6_9to7_9), .out_a(a7_9to7_10), .out_b(b7_9to8_9), .out_c(matrixC7_9));
processing_element pe7_10(.reset(effective_rst), .clk(clk), .in_a(a7_9to7_10), .in_b(b6_10to7_10), .out_a(a7_10to7_11), .out_b(b7_10to8_10), .out_c(matrixC7_10));
processing_element pe7_11(.reset(effective_rst), .clk(clk), .in_a(a7_10to7_11), .in_b(b6_11to7_11), .out_a(a7_11to7_12), .out_b(b7_11to8_11), .out_c(matrixC7_11));
processing_element pe7_12(.reset(effective_rst), .clk(clk), .in_a(a7_11to7_12), .in_b(b6_12to7_12), .out_a(a7_12to7_13), .out_b(b7_12to8_12), .out_c(matrixC7_12));
processing_element pe7_13(.reset(effective_rst), .clk(clk), .in_a(a7_12to7_13), .in_b(b6_13to7_13), .out_a(a7_13to7_14), .out_b(b7_13to8_13), .out_c(matrixC7_13));
processing_element pe7_14(.reset(effective_rst), .clk(clk), .in_a(a7_13to7_14), .in_b(b6_14to7_14), .out_a(a7_14to7_15), .out_b(b7_14to8_14), .out_c(matrixC7_14));
processing_element pe7_15(.reset(effective_rst), .clk(clk), .in_a(a7_14to7_15), .in_b(b6_15to7_15), .out_a(a7_15to7_16), .out_b(b7_15to8_15), .out_c(matrixC7_15));
processing_element pe8_1(.reset(effective_rst), .clk(clk), .in_a(a8_0to8_1), .in_b(b7_1to8_1), .out_a(a8_1to8_2), .out_b(b8_1to9_1), .out_c(matrixC8_1));
processing_element pe8_2(.reset(effective_rst), .clk(clk), .in_a(a8_1to8_2), .in_b(b7_2to8_2), .out_a(a8_2to8_3), .out_b(b8_2to9_2), .out_c(matrixC8_2));
processing_element pe8_3(.reset(effective_rst), .clk(clk), .in_a(a8_2to8_3), .in_b(b7_3to8_3), .out_a(a8_3to8_4), .out_b(b8_3to9_3), .out_c(matrixC8_3));
processing_element pe8_4(.reset(effective_rst), .clk(clk), .in_a(a8_3to8_4), .in_b(b7_4to8_4), .out_a(a8_4to8_5), .out_b(b8_4to9_4), .out_c(matrixC8_4));
processing_element pe8_5(.reset(effective_rst), .clk(clk), .in_a(a8_4to8_5), .in_b(b7_5to8_5), .out_a(a8_5to8_6), .out_b(b8_5to9_5), .out_c(matrixC8_5));
processing_element pe8_6(.reset(effective_rst), .clk(clk), .in_a(a8_5to8_6), .in_b(b7_6to8_6), .out_a(a8_6to8_7), .out_b(b8_6to9_6), .out_c(matrixC8_6));
processing_element pe8_7(.reset(effective_rst), .clk(clk), .in_a(a8_6to8_7), .in_b(b7_7to8_7), .out_a(a8_7to8_8), .out_b(b8_7to9_7), .out_c(matrixC8_7));
processing_element pe8_8(.reset(effective_rst), .clk(clk), .in_a(a8_7to8_8), .in_b(b7_8to8_8), .out_a(a8_8to8_9), .out_b(b8_8to9_8), .out_c(matrixC8_8));
processing_element pe8_9(.reset(effective_rst), .clk(clk), .in_a(a8_8to8_9), .in_b(b7_9to8_9), .out_a(a8_9to8_10), .out_b(b8_9to9_9), .out_c(matrixC8_9));
processing_element pe8_10(.reset(effective_rst), .clk(clk), .in_a(a8_9to8_10), .in_b(b7_10to8_10), .out_a(a8_10to8_11), .out_b(b8_10to9_10), .out_c(matrixC8_10));
processing_element pe8_11(.reset(effective_rst), .clk(clk), .in_a(a8_10to8_11), .in_b(b7_11to8_11), .out_a(a8_11to8_12), .out_b(b8_11to9_11), .out_c(matrixC8_11));
processing_element pe8_12(.reset(effective_rst), .clk(clk), .in_a(a8_11to8_12), .in_b(b7_12to8_12), .out_a(a8_12to8_13), .out_b(b8_12to9_12), .out_c(matrixC8_12));
processing_element pe8_13(.reset(effective_rst), .clk(clk), .in_a(a8_12to8_13), .in_b(b7_13to8_13), .out_a(a8_13to8_14), .out_b(b8_13to9_13), .out_c(matrixC8_13));
processing_element pe8_14(.reset(effective_rst), .clk(clk), .in_a(a8_13to8_14), .in_b(b7_14to8_14), .out_a(a8_14to8_15), .out_b(b8_14to9_14), .out_c(matrixC8_14));
processing_element pe8_15(.reset(effective_rst), .clk(clk), .in_a(a8_14to8_15), .in_b(b7_15to8_15), .out_a(a8_15to8_16), .out_b(b8_15to9_15), .out_c(matrixC8_15));
processing_element pe9_1(.reset(effective_rst), .clk(clk), .in_a(a9_0to9_1), .in_b(b8_1to9_1), .out_a(a9_1to9_2), .out_b(b9_1to10_1), .out_c(matrixC9_1));
processing_element pe9_2(.reset(effective_rst), .clk(clk), .in_a(a9_1to9_2), .in_b(b8_2to9_2), .out_a(a9_2to9_3), .out_b(b9_2to10_2), .out_c(matrixC9_2));
processing_element pe9_3(.reset(effective_rst), .clk(clk), .in_a(a9_2to9_3), .in_b(b8_3to9_3), .out_a(a9_3to9_4), .out_b(b9_3to10_3), .out_c(matrixC9_3));
processing_element pe9_4(.reset(effective_rst), .clk(clk), .in_a(a9_3to9_4), .in_b(b8_4to9_4), .out_a(a9_4to9_5), .out_b(b9_4to10_4), .out_c(matrixC9_4));
processing_element pe9_5(.reset(effective_rst), .clk(clk), .in_a(a9_4to9_5), .in_b(b8_5to9_5), .out_a(a9_5to9_6), .out_b(b9_5to10_5), .out_c(matrixC9_5));
processing_element pe9_6(.reset(effective_rst), .clk(clk), .in_a(a9_5to9_6), .in_b(b8_6to9_6), .out_a(a9_6to9_7), .out_b(b9_6to10_6), .out_c(matrixC9_6));
processing_element pe9_7(.reset(effective_rst), .clk(clk), .in_a(a9_6to9_7), .in_b(b8_7to9_7), .out_a(a9_7to9_8), .out_b(b9_7to10_7), .out_c(matrixC9_7));
processing_element pe9_8(.reset(effective_rst), .clk(clk), .in_a(a9_7to9_8), .in_b(b8_8to9_8), .out_a(a9_8to9_9), .out_b(b9_8to10_8), .out_c(matrixC9_8));
processing_element pe9_9(.reset(effective_rst), .clk(clk), .in_a(a9_8to9_9), .in_b(b8_9to9_9), .out_a(a9_9to9_10), .out_b(b9_9to10_9), .out_c(matrixC9_9));
processing_element pe9_10(.reset(effective_rst), .clk(clk), .in_a(a9_9to9_10), .in_b(b8_10to9_10), .out_a(a9_10to9_11), .out_b(b9_10to10_10), .out_c(matrixC9_10));
processing_element pe9_11(.reset(effective_rst), .clk(clk), .in_a(a9_10to9_11), .in_b(b8_11to9_11), .out_a(a9_11to9_12), .out_b(b9_11to10_11), .out_c(matrixC9_11));
processing_element pe9_12(.reset(effective_rst), .clk(clk), .in_a(a9_11to9_12), .in_b(b8_12to9_12), .out_a(a9_12to9_13), .out_b(b9_12to10_12), .out_c(matrixC9_12));
processing_element pe9_13(.reset(effective_rst), .clk(clk), .in_a(a9_12to9_13), .in_b(b8_13to9_13), .out_a(a9_13to9_14), .out_b(b9_13to10_13), .out_c(matrixC9_13));
processing_element pe9_14(.reset(effective_rst), .clk(clk), .in_a(a9_13to9_14), .in_b(b8_14to9_14), .out_a(a9_14to9_15), .out_b(b9_14to10_14), .out_c(matrixC9_14));
processing_element pe9_15(.reset(effective_rst), .clk(clk), .in_a(a9_14to9_15), .in_b(b8_15to9_15), .out_a(a9_15to9_16), .out_b(b9_15to10_15), .out_c(matrixC9_15));
processing_element pe10_1(.reset(effective_rst), .clk(clk), .in_a(a10_0to10_1), .in_b(b9_1to10_1), .out_a(a10_1to10_2), .out_b(b10_1to11_1), .out_c(matrixC10_1));
processing_element pe10_2(.reset(effective_rst), .clk(clk), .in_a(a10_1to10_2), .in_b(b9_2to10_2), .out_a(a10_2to10_3), .out_b(b10_2to11_2), .out_c(matrixC10_2));
processing_element pe10_3(.reset(effective_rst), .clk(clk), .in_a(a10_2to10_3), .in_b(b9_3to10_3), .out_a(a10_3to10_4), .out_b(b10_3to11_3), .out_c(matrixC10_3));
processing_element pe10_4(.reset(effective_rst), .clk(clk), .in_a(a10_3to10_4), .in_b(b9_4to10_4), .out_a(a10_4to10_5), .out_b(b10_4to11_4), .out_c(matrixC10_4));
processing_element pe10_5(.reset(effective_rst), .clk(clk), .in_a(a10_4to10_5), .in_b(b9_5to10_5), .out_a(a10_5to10_6), .out_b(b10_5to11_5), .out_c(matrixC10_5));
processing_element pe10_6(.reset(effective_rst), .clk(clk), .in_a(a10_5to10_6), .in_b(b9_6to10_6), .out_a(a10_6to10_7), .out_b(b10_6to11_6), .out_c(matrixC10_6));
processing_element pe10_7(.reset(effective_rst), .clk(clk), .in_a(a10_6to10_7), .in_b(b9_7to10_7), .out_a(a10_7to10_8), .out_b(b10_7to11_7), .out_c(matrixC10_7));
processing_element pe10_8(.reset(effective_rst), .clk(clk), .in_a(a10_7to10_8), .in_b(b9_8to10_8), .out_a(a10_8to10_9), .out_b(b10_8to11_8), .out_c(matrixC10_8));
processing_element pe10_9(.reset(effective_rst), .clk(clk), .in_a(a10_8to10_9), .in_b(b9_9to10_9), .out_a(a10_9to10_10), .out_b(b10_9to11_9), .out_c(matrixC10_9));
processing_element pe10_10(.reset(effective_rst), .clk(clk), .in_a(a10_9to10_10), .in_b(b9_10to10_10), .out_a(a10_10to10_11), .out_b(b10_10to11_10), .out_c(matrixC10_10));
processing_element pe10_11(.reset(effective_rst), .clk(clk), .in_a(a10_10to10_11), .in_b(b9_11to10_11), .out_a(a10_11to10_12), .out_b(b10_11to11_11), .out_c(matrixC10_11));
processing_element pe10_12(.reset(effective_rst), .clk(clk), .in_a(a10_11to10_12), .in_b(b9_12to10_12), .out_a(a10_12to10_13), .out_b(b10_12to11_12), .out_c(matrixC10_12));
processing_element pe10_13(.reset(effective_rst), .clk(clk), .in_a(a10_12to10_13), .in_b(b9_13to10_13), .out_a(a10_13to10_14), .out_b(b10_13to11_13), .out_c(matrixC10_13));
processing_element pe10_14(.reset(effective_rst), .clk(clk), .in_a(a10_13to10_14), .in_b(b9_14to10_14), .out_a(a10_14to10_15), .out_b(b10_14to11_14), .out_c(matrixC10_14));
processing_element pe10_15(.reset(effective_rst), .clk(clk), .in_a(a10_14to10_15), .in_b(b9_15to10_15), .out_a(a10_15to10_16), .out_b(b10_15to11_15), .out_c(matrixC10_15));
processing_element pe11_1(.reset(effective_rst), .clk(clk), .in_a(a11_0to11_1), .in_b(b10_1to11_1), .out_a(a11_1to11_2), .out_b(b11_1to12_1), .out_c(matrixC11_1));
processing_element pe11_2(.reset(effective_rst), .clk(clk), .in_a(a11_1to11_2), .in_b(b10_2to11_2), .out_a(a11_2to11_3), .out_b(b11_2to12_2), .out_c(matrixC11_2));
processing_element pe11_3(.reset(effective_rst), .clk(clk), .in_a(a11_2to11_3), .in_b(b10_3to11_3), .out_a(a11_3to11_4), .out_b(b11_3to12_3), .out_c(matrixC11_3));
processing_element pe11_4(.reset(effective_rst), .clk(clk), .in_a(a11_3to11_4), .in_b(b10_4to11_4), .out_a(a11_4to11_5), .out_b(b11_4to12_4), .out_c(matrixC11_4));
processing_element pe11_5(.reset(effective_rst), .clk(clk), .in_a(a11_4to11_5), .in_b(b10_5to11_5), .out_a(a11_5to11_6), .out_b(b11_5to12_5), .out_c(matrixC11_5));
processing_element pe11_6(.reset(effective_rst), .clk(clk), .in_a(a11_5to11_6), .in_b(b10_6to11_6), .out_a(a11_6to11_7), .out_b(b11_6to12_6), .out_c(matrixC11_6));
processing_element pe11_7(.reset(effective_rst), .clk(clk), .in_a(a11_6to11_7), .in_b(b10_7to11_7), .out_a(a11_7to11_8), .out_b(b11_7to12_7), .out_c(matrixC11_7));
processing_element pe11_8(.reset(effective_rst), .clk(clk), .in_a(a11_7to11_8), .in_b(b10_8to11_8), .out_a(a11_8to11_9), .out_b(b11_8to12_8), .out_c(matrixC11_8));
processing_element pe11_9(.reset(effective_rst), .clk(clk), .in_a(a11_8to11_9), .in_b(b10_9to11_9), .out_a(a11_9to11_10), .out_b(b11_9to12_9), .out_c(matrixC11_9));
processing_element pe11_10(.reset(effective_rst), .clk(clk), .in_a(a11_9to11_10), .in_b(b10_10to11_10), .out_a(a11_10to11_11), .out_b(b11_10to12_10), .out_c(matrixC11_10));
processing_element pe11_11(.reset(effective_rst), .clk(clk), .in_a(a11_10to11_11), .in_b(b10_11to11_11), .out_a(a11_11to11_12), .out_b(b11_11to12_11), .out_c(matrixC11_11));
processing_element pe11_12(.reset(effective_rst), .clk(clk), .in_a(a11_11to11_12), .in_b(b10_12to11_12), .out_a(a11_12to11_13), .out_b(b11_12to12_12), .out_c(matrixC11_12));
processing_element pe11_13(.reset(effective_rst), .clk(clk), .in_a(a11_12to11_13), .in_b(b10_13to11_13), .out_a(a11_13to11_14), .out_b(b11_13to12_13), .out_c(matrixC11_13));
processing_element pe11_14(.reset(effective_rst), .clk(clk), .in_a(a11_13to11_14), .in_b(b10_14to11_14), .out_a(a11_14to11_15), .out_b(b11_14to12_14), .out_c(matrixC11_14));
processing_element pe11_15(.reset(effective_rst), .clk(clk), .in_a(a11_14to11_15), .in_b(b10_15to11_15), .out_a(a11_15to11_16), .out_b(b11_15to12_15), .out_c(matrixC11_15));
processing_element pe12_1(.reset(effective_rst), .clk(clk), .in_a(a12_0to12_1), .in_b(b11_1to12_1), .out_a(a12_1to12_2), .out_b(b12_1to13_1), .out_c(matrixC12_1));
processing_element pe12_2(.reset(effective_rst), .clk(clk), .in_a(a12_1to12_2), .in_b(b11_2to12_2), .out_a(a12_2to12_3), .out_b(b12_2to13_2), .out_c(matrixC12_2));
processing_element pe12_3(.reset(effective_rst), .clk(clk), .in_a(a12_2to12_3), .in_b(b11_3to12_3), .out_a(a12_3to12_4), .out_b(b12_3to13_3), .out_c(matrixC12_3));
processing_element pe12_4(.reset(effective_rst), .clk(clk), .in_a(a12_3to12_4), .in_b(b11_4to12_4), .out_a(a12_4to12_5), .out_b(b12_4to13_4), .out_c(matrixC12_4));
processing_element pe12_5(.reset(effective_rst), .clk(clk), .in_a(a12_4to12_5), .in_b(b11_5to12_5), .out_a(a12_5to12_6), .out_b(b12_5to13_5), .out_c(matrixC12_5));
processing_element pe12_6(.reset(effective_rst), .clk(clk), .in_a(a12_5to12_6), .in_b(b11_6to12_6), .out_a(a12_6to12_7), .out_b(b12_6to13_6), .out_c(matrixC12_6));
processing_element pe12_7(.reset(effective_rst), .clk(clk), .in_a(a12_6to12_7), .in_b(b11_7to12_7), .out_a(a12_7to12_8), .out_b(b12_7to13_7), .out_c(matrixC12_7));
processing_element pe12_8(.reset(effective_rst), .clk(clk), .in_a(a12_7to12_8), .in_b(b11_8to12_8), .out_a(a12_8to12_9), .out_b(b12_8to13_8), .out_c(matrixC12_8));
processing_element pe12_9(.reset(effective_rst), .clk(clk), .in_a(a12_8to12_9), .in_b(b11_9to12_9), .out_a(a12_9to12_10), .out_b(b12_9to13_9), .out_c(matrixC12_9));
processing_element pe12_10(.reset(effective_rst), .clk(clk), .in_a(a12_9to12_10), .in_b(b11_10to12_10), .out_a(a12_10to12_11), .out_b(b12_10to13_10), .out_c(matrixC12_10));
processing_element pe12_11(.reset(effective_rst), .clk(clk), .in_a(a12_10to12_11), .in_b(b11_11to12_11), .out_a(a12_11to12_12), .out_b(b12_11to13_11), .out_c(matrixC12_11));
processing_element pe12_12(.reset(effective_rst), .clk(clk), .in_a(a12_11to12_12), .in_b(b11_12to12_12), .out_a(a12_12to12_13), .out_b(b12_12to13_12), .out_c(matrixC12_12));
processing_element pe12_13(.reset(effective_rst), .clk(clk), .in_a(a12_12to12_13), .in_b(b11_13to12_13), .out_a(a12_13to12_14), .out_b(b12_13to13_13), .out_c(matrixC12_13));
processing_element pe12_14(.reset(effective_rst), .clk(clk), .in_a(a12_13to12_14), .in_b(b11_14to12_14), .out_a(a12_14to12_15), .out_b(b12_14to13_14), .out_c(matrixC12_14));
processing_element pe12_15(.reset(effective_rst), .clk(clk), .in_a(a12_14to12_15), .in_b(b11_15to12_15), .out_a(a12_15to12_16), .out_b(b12_15to13_15), .out_c(matrixC12_15));
processing_element pe13_1(.reset(effective_rst), .clk(clk), .in_a(a13_0to13_1), .in_b(b12_1to13_1), .out_a(a13_1to13_2), .out_b(b13_1to14_1), .out_c(matrixC13_1));
processing_element pe13_2(.reset(effective_rst), .clk(clk), .in_a(a13_1to13_2), .in_b(b12_2to13_2), .out_a(a13_2to13_3), .out_b(b13_2to14_2), .out_c(matrixC13_2));
processing_element pe13_3(.reset(effective_rst), .clk(clk), .in_a(a13_2to13_3), .in_b(b12_3to13_3), .out_a(a13_3to13_4), .out_b(b13_3to14_3), .out_c(matrixC13_3));
processing_element pe13_4(.reset(effective_rst), .clk(clk), .in_a(a13_3to13_4), .in_b(b12_4to13_4), .out_a(a13_4to13_5), .out_b(b13_4to14_4), .out_c(matrixC13_4));
processing_element pe13_5(.reset(effective_rst), .clk(clk), .in_a(a13_4to13_5), .in_b(b12_5to13_5), .out_a(a13_5to13_6), .out_b(b13_5to14_5), .out_c(matrixC13_5));
processing_element pe13_6(.reset(effective_rst), .clk(clk), .in_a(a13_5to13_6), .in_b(b12_6to13_6), .out_a(a13_6to13_7), .out_b(b13_6to14_6), .out_c(matrixC13_6));
processing_element pe13_7(.reset(effective_rst), .clk(clk), .in_a(a13_6to13_7), .in_b(b12_7to13_7), .out_a(a13_7to13_8), .out_b(b13_7to14_7), .out_c(matrixC13_7));
processing_element pe13_8(.reset(effective_rst), .clk(clk), .in_a(a13_7to13_8), .in_b(b12_8to13_8), .out_a(a13_8to13_9), .out_b(b13_8to14_8), .out_c(matrixC13_8));
processing_element pe13_9(.reset(effective_rst), .clk(clk), .in_a(a13_8to13_9), .in_b(b12_9to13_9), .out_a(a13_9to13_10), .out_b(b13_9to14_9), .out_c(matrixC13_9));
processing_element pe13_10(.reset(effective_rst), .clk(clk), .in_a(a13_9to13_10), .in_b(b12_10to13_10), .out_a(a13_10to13_11), .out_b(b13_10to14_10), .out_c(matrixC13_10));
processing_element pe13_11(.reset(effective_rst), .clk(clk), .in_a(a13_10to13_11), .in_b(b12_11to13_11), .out_a(a13_11to13_12), .out_b(b13_11to14_11), .out_c(matrixC13_11));
processing_element pe13_12(.reset(effective_rst), .clk(clk), .in_a(a13_11to13_12), .in_b(b12_12to13_12), .out_a(a13_12to13_13), .out_b(b13_12to14_12), .out_c(matrixC13_12));
processing_element pe13_13(.reset(effective_rst), .clk(clk), .in_a(a13_12to13_13), .in_b(b12_13to13_13), .out_a(a13_13to13_14), .out_b(b13_13to14_13), .out_c(matrixC13_13));
processing_element pe13_14(.reset(effective_rst), .clk(clk), .in_a(a13_13to13_14), .in_b(b12_14to13_14), .out_a(a13_14to13_15), .out_b(b13_14to14_14), .out_c(matrixC13_14));
processing_element pe13_15(.reset(effective_rst), .clk(clk), .in_a(a13_14to13_15), .in_b(b12_15to13_15), .out_a(a13_15to13_16), .out_b(b13_15to14_15), .out_c(matrixC13_15));
processing_element pe14_1(.reset(effective_rst), .clk(clk), .in_a(a14_0to14_1), .in_b(b13_1to14_1), .out_a(a14_1to14_2), .out_b(b14_1to15_1), .out_c(matrixC14_1));
processing_element pe14_2(.reset(effective_rst), .clk(clk), .in_a(a14_1to14_2), .in_b(b13_2to14_2), .out_a(a14_2to14_3), .out_b(b14_2to15_2), .out_c(matrixC14_2));
processing_element pe14_3(.reset(effective_rst), .clk(clk), .in_a(a14_2to14_3), .in_b(b13_3to14_3), .out_a(a14_3to14_4), .out_b(b14_3to15_3), .out_c(matrixC14_3));
processing_element pe14_4(.reset(effective_rst), .clk(clk), .in_a(a14_3to14_4), .in_b(b13_4to14_4), .out_a(a14_4to14_5), .out_b(b14_4to15_4), .out_c(matrixC14_4));
processing_element pe14_5(.reset(effective_rst), .clk(clk), .in_a(a14_4to14_5), .in_b(b13_5to14_5), .out_a(a14_5to14_6), .out_b(b14_5to15_5), .out_c(matrixC14_5));
processing_element pe14_6(.reset(effective_rst), .clk(clk), .in_a(a14_5to14_6), .in_b(b13_6to14_6), .out_a(a14_6to14_7), .out_b(b14_6to15_6), .out_c(matrixC14_6));
processing_element pe14_7(.reset(effective_rst), .clk(clk), .in_a(a14_6to14_7), .in_b(b13_7to14_7), .out_a(a14_7to14_8), .out_b(b14_7to15_7), .out_c(matrixC14_7));
processing_element pe14_8(.reset(effective_rst), .clk(clk), .in_a(a14_7to14_8), .in_b(b13_8to14_8), .out_a(a14_8to14_9), .out_b(b14_8to15_8), .out_c(matrixC14_8));
processing_element pe14_9(.reset(effective_rst), .clk(clk), .in_a(a14_8to14_9), .in_b(b13_9to14_9), .out_a(a14_9to14_10), .out_b(b14_9to15_9), .out_c(matrixC14_9));
processing_element pe14_10(.reset(effective_rst), .clk(clk), .in_a(a14_9to14_10), .in_b(b13_10to14_10), .out_a(a14_10to14_11), .out_b(b14_10to15_10), .out_c(matrixC14_10));
processing_element pe14_11(.reset(effective_rst), .clk(clk), .in_a(a14_10to14_11), .in_b(b13_11to14_11), .out_a(a14_11to14_12), .out_b(b14_11to15_11), .out_c(matrixC14_11));
processing_element pe14_12(.reset(effective_rst), .clk(clk), .in_a(a14_11to14_12), .in_b(b13_12to14_12), .out_a(a14_12to14_13), .out_b(b14_12to15_12), .out_c(matrixC14_12));
processing_element pe14_13(.reset(effective_rst), .clk(clk), .in_a(a14_12to14_13), .in_b(b13_13to14_13), .out_a(a14_13to14_14), .out_b(b14_13to15_13), .out_c(matrixC14_13));
processing_element pe14_14(.reset(effective_rst), .clk(clk), .in_a(a14_13to14_14), .in_b(b13_14to14_14), .out_a(a14_14to14_15), .out_b(b14_14to15_14), .out_c(matrixC14_14));
processing_element pe14_15(.reset(effective_rst), .clk(clk), .in_a(a14_14to14_15), .in_b(b13_15to14_15), .out_a(a14_15to14_16), .out_b(b14_15to15_15), .out_c(matrixC14_15));
processing_element pe15_1(.reset(effective_rst), .clk(clk), .in_a(a15_0to15_1), .in_b(b14_1to15_1), .out_a(a15_1to15_2), .out_b(b15_1to16_1), .out_c(matrixC15_1));
processing_element pe15_2(.reset(effective_rst), .clk(clk), .in_a(a15_1to15_2), .in_b(b14_2to15_2), .out_a(a15_2to15_3), .out_b(b15_2to16_2), .out_c(matrixC15_2));
processing_element pe15_3(.reset(effective_rst), .clk(clk), .in_a(a15_2to15_3), .in_b(b14_3to15_3), .out_a(a15_3to15_4), .out_b(b15_3to16_3), .out_c(matrixC15_3));
processing_element pe15_4(.reset(effective_rst), .clk(clk), .in_a(a15_3to15_4), .in_b(b14_4to15_4), .out_a(a15_4to15_5), .out_b(b15_4to16_4), .out_c(matrixC15_4));
processing_element pe15_5(.reset(effective_rst), .clk(clk), .in_a(a15_4to15_5), .in_b(b14_5to15_5), .out_a(a15_5to15_6), .out_b(b15_5to16_5), .out_c(matrixC15_5));
processing_element pe15_6(.reset(effective_rst), .clk(clk), .in_a(a15_5to15_6), .in_b(b14_6to15_6), .out_a(a15_6to15_7), .out_b(b15_6to16_6), .out_c(matrixC15_6));
processing_element pe15_7(.reset(effective_rst), .clk(clk), .in_a(a15_6to15_7), .in_b(b14_7to15_7), .out_a(a15_7to15_8), .out_b(b15_7to16_7), .out_c(matrixC15_7));
processing_element pe15_8(.reset(effective_rst), .clk(clk), .in_a(a15_7to15_8), .in_b(b14_8to15_8), .out_a(a15_8to15_9), .out_b(b15_8to16_8), .out_c(matrixC15_8));
processing_element pe15_9(.reset(effective_rst), .clk(clk), .in_a(a15_8to15_9), .in_b(b14_9to15_9), .out_a(a15_9to15_10), .out_b(b15_9to16_9), .out_c(matrixC15_9));
processing_element pe15_10(.reset(effective_rst), .clk(clk), .in_a(a15_9to15_10), .in_b(b14_10to15_10), .out_a(a15_10to15_11), .out_b(b15_10to16_10), .out_c(matrixC15_10));
processing_element pe15_11(.reset(effective_rst), .clk(clk), .in_a(a15_10to15_11), .in_b(b14_11to15_11), .out_a(a15_11to15_12), .out_b(b15_11to16_11), .out_c(matrixC15_11));
processing_element pe15_12(.reset(effective_rst), .clk(clk), .in_a(a15_11to15_12), .in_b(b14_12to15_12), .out_a(a15_12to15_13), .out_b(b15_12to16_12), .out_c(matrixC15_12));
processing_element pe15_13(.reset(effective_rst), .clk(clk), .in_a(a15_12to15_13), .in_b(b14_13to15_13), .out_a(a15_13to15_14), .out_b(b15_13to16_13), .out_c(matrixC15_13));
processing_element pe15_14(.reset(effective_rst), .clk(clk), .in_a(a15_13to15_14), .in_b(b14_14to15_14), .out_a(a15_14to15_15), .out_b(b15_14to16_14), .out_c(matrixC15_14));
processing_element pe15_15(.reset(effective_rst), .clk(clk), .in_a(a15_14to15_15), .in_b(b14_15to15_15), .out_a(a15_15to15_16), .out_b(b15_15to16_15), .out_c(matrixC15_15));
assign a_data_out = {a15_15to15_16,a14_15to14_16,a13_15to13_16,a12_15to12_16,a11_15to11_16,a10_15to10_16,a9_15to9_16,a8_15to8_16,a7_15to7_16,a6_15to6_16,a5_15to5_16,a4_15to4_16,a3_15to3_16,a2_15to2_16,a1_15to1_16,a0_15to0_16};
assign b_data_out = {b15_15to16_15,b15_14to16_14,b15_13to16_13,b15_12to16_12,b15_11to16_11,b15_10to16_10,b15_9to16_9,b15_8to16_8,b15_7to16_7,b15_6to16_6,b15_5to16_5,b15_4to16_4,b15_3to16_3,b15_2to16_2,b15_1to16_1,b15_0to16_0};
endmodule
module processing_element(
reset,
clk,
in_a,
in_b,
out_a,
out_b,
out_c
);
input reset;
input clk;
input [`DWIDTH-1:0] in_a;
input [`DWIDTH-1:0] in_b;
output [`DWIDTH-1:0] out_a;
output [`DWIDTH-1:0] out_b;
output [`DWIDTH-1:0] out_c; //reduced precision
reg [`DWIDTH-1:0] out_a;
reg [`DWIDTH-1:0] out_b;
wire [`DWIDTH-1:0] out_c;
wire [`DWIDTH-1:0] out_mac;
assign out_c = out_mac;
seq_mac u_mac(.a(in_a), .b(in_b), .out(out_mac), .reset(reset), .clk(clk));
always @(posedge clk)begin
if(reset) begin
out_a<=0;
out_b<=0;
end
else begin
out_a<=in_a;
out_b<=in_b;
end
end
endmodule
module seq_mac(a, b, out, reset, clk);
input [`DWIDTH-1:0] a;
input [`DWIDTH-1:0] b;
input reset;
input clk;
output [`DWIDTH-1:0] out;
reg [2*`DWIDTH-1:0] out_temp;
wire [`DWIDTH-1:0] mul_out;
wire [2*`DWIDTH-1:0] add_out;
reg [`DWIDTH-1:0] a_flopped;
reg [`DWIDTH-1:0] b_flopped;
wire [2*`DWIDTH-1:0] mul_out_temp;
reg [2*`DWIDTH-1:0] mul_out_temp_reg;
always @(posedge clk) begin
if (reset) begin
a_flopped <= 0;
b_flopped <= 0;
end else begin
a_flopped <= a;
b_flopped <= b;
end
end
//assign mul_out = a * b;
qmult mult_u1(.i_multiplicand(a_flopped), .i_multiplier(b_flopped), .o_result(mul_out_temp));
always @(posedge clk) begin
if (reset) begin
mul_out_temp_reg <= 0;
end else begin
mul_out_temp_reg <= mul_out_temp;
end
end
//we just truncate the higher bits of the product
//assign add_out = mul_out + out;
qadd add_u1(.a(out_temp), .b(mul_out_temp_reg), .c(add_out));
always @(posedge clk) begin
if (reset) begin
out_temp <= 0;
end else begin
out_temp <= add_out;
end
end
//down cast the result
assign out =
(out_temp[2*`DWIDTH-1] == 0) ? //positive number
(
(|(out_temp[2*`DWIDTH-2 : `DWIDTH-1])) ? //is any bit from 14:7 is 1, that means overlfow
{out_temp[2*`DWIDTH-1] , {(`DWIDTH-1){1'b1}}} : //sign bit and then all 1s
{out_temp[2*`DWIDTH-1] , out_temp[`DWIDTH-2:0]}
)
: //negative number
(
(|(out_temp[2*`DWIDTH-2 : `DWIDTH-1])) ? //is any bit from 14:7 is 0, that means overlfow
{out_temp[2*`DWIDTH-1] , out_temp[`DWIDTH-2:0]} :
{out_temp[2*`DWIDTH-1] , {(`DWIDTH-1){1'b0}}} //sign bit and then all 0s
);
endmodule
module qmult(i_multiplicand,i_multiplier,o_result);
input [`DWIDTH-1:0] i_multiplicand;
input [`DWIDTH-1:0] i_multiplier;
output [2*`DWIDTH-1:0] o_result;
assign o_result = i_multiplicand * i_multiplier;
//DW02_mult #(`DWIDTH,`DWIDTH) u_mult(.A(i_multiplicand), .B(i_multiplier), .TC(1'b1), .PRODUCT(o_result));
endmodule
module qadd(a,b,c);
input [2*`DWIDTH-1:0] a;
input [2*`DWIDTH-1:0] b;
output [2*`DWIDTH-1:0] c;
assign c = a + b;
//DW01_add #(`DWIDTH) u_add(.A(a), .B(b), .CI(1'b0), .SUM(c), .CO());
endmodule
//////////////////////////////////////////////
// Configuration block
//////////////////////////////////////////////
module cfg(
input PCLK,
input PRESETn,
input [`REG_ADDRWIDTH-1:0] PADDR,
input PWRITE,
input PSEL,
input PENABLE,
input [`REG_DATAWIDTH-1:0] PWDATA,
output reg [`REG_DATAWIDTH-1:0] PRDATA,
output reg PREADY,
output reg start_tpu,
output reg enable_matmul,
output reg enable_norm,
output reg enable_pool,
output reg enable_activation,
output reg enable_conv_mode,
output reg [`DWIDTH-1:0] mean,
output reg [`DWIDTH-1:0] inv_var,
output reg [`MAX_BITS_POOL-1:0] pool_window_size,
output reg [`AWIDTH-1:0] address_mat_a,
output reg [`AWIDTH-1:0] address_mat_b,
output reg [`AWIDTH-1:0] address_mat_c,
output reg [`MASK_WIDTH-1:0] validity_mask_a_rows,
output reg [`MASK_WIDTH-1:0] validity_mask_a_cols,
output reg [`MASK_WIDTH-1:0] validity_mask_b_rows,
output reg [`MASK_WIDTH-1:0] validity_mask_b_cols,
output reg save_output_to_accum,
output reg add_accum_to_output,
output reg [`ADDR_STRIDE_WIDTH-1:0] address_stride_a,
output reg [`ADDR_STRIDE_WIDTH-1:0] address_stride_b,
output reg [`ADDR_STRIDE_WIDTH-1:0] address_stride_c,
output reg activation_type,
output reg [3:0] conv_filter_height,
output reg [3:0] conv_filter_width,
output reg [3:0] conv_stride_horiz,
output reg [3:0] conv_stride_verti,
output reg [3:0] conv_padding_left,
output reg [3:0] conv_padding_right,
output reg [3:0] conv_padding_top,
output reg [3:0] conv_padding_bottom,
output reg [15:0] num_channels_inp,
output reg [15:0] num_channels_out,
output reg [15:0] inp_img_height,
output reg [15:0] inp_img_width,
output reg [15:0] out_img_height,
output reg [15:0] out_img_width,
output reg [31:0] batch_size,
output reg pe_reset,
input done_tpu
);
//Dummy register to sync all other invalid/unimplemented addresses
reg [`REG_DATAWIDTH-1:0] reg_dummy;
//////////////////////////////////////////////////////
//Using a simple APB interface. Taken from:
// https://github.com/maomran/APB-Slave
// https://research.ijcaonline.org/volume95/number21/pxc3897047.pdf
reg [1:0] State;
`define IDLE 2'b00
`define W_ENABLE 2'b01
`define R_ENABLE 2'b10
always @(posedge PCLK) begin
if (PRESETn == 0) begin
State <= `IDLE;
PRDATA <= 0;
PREADY <= 0;
start_tpu <= 0;
enable_matmul <= 0;
enable_norm <= 0;
enable_pool <= 0;
enable_activation <= 0;
mean <= 0;
inv_var <= 0;
pool_window_size <= 1;
reg_dummy <= 0;
address_mat_a <= 0;
address_mat_b <= 0;
address_mat_c <= 0;
validity_mask_a_rows <= {`MASK_WIDTH{1'b1}};
validity_mask_a_cols <= {`MASK_WIDTH{1'b1}};
validity_mask_b_rows <= {`MASK_WIDTH{1'b1}};
validity_mask_b_cols <= {`MASK_WIDTH{1'b1}};
save_output_to_accum <= 0;
add_accum_to_output <= 0;
address_stride_a <= `DESIGN_SIZE;
address_stride_b <= `DESIGN_SIZE;
address_stride_c <= `DESIGN_SIZE;
activation_type <= 1;
conv_filter_height <= 2;
conv_filter_width <= 2;
conv_stride_horiz <= 1;
conv_stride_verti <= 1;
conv_padding_left <= 0;
conv_padding_right <= 0;
conv_padding_top <= 0;
conv_padding_bottom<= 0;
num_channels_inp <= 4;
num_channels_out <= 4;
inp_img_height <= 8;
inp_img_width <= 8;
out_img_height <= 7;
out_img_width <= 7;
batch_size <= 2;
enable_conv_mode <= 0;
pe_reset <= 0;
end
else begin
case (State)
`IDLE : begin
PRDATA <= 0;
if (PSEL) begin
if (PWRITE) begin
State <= `W_ENABLE;
end
else begin
State <= `R_ENABLE;
end
end
PREADY <= 0;
pe_reset <= 0; //this register bit auto resets itself
end
`W_ENABLE : begin
if (PSEL && PWRITE && PENABLE) begin
case (PADDR)
`REG_ENABLES_ADDR : begin
enable_conv_mode <= PWDATA[31];
enable_activation <= PWDATA[3];
enable_pool <= PWDATA[2];
enable_norm <= PWDATA[1];
enable_matmul <= PWDATA[0];
end
`REG_STDN_TPU_ADDR : begin
start_tpu <= PWDATA[0];
pe_reset <= PWDATA[15];
end
`REG_MEAN_ADDR : mean <= PWDATA[`DWIDTH-1:0];
`REG_INV_VAR_ADDR : inv_var <= PWDATA[`DWIDTH-1:0];
`REG_MATRIX_A_ADDR : address_mat_a <= PWDATA[`AWIDTH-1:0];
`REG_MATRIX_B_ADDR : address_mat_b <= PWDATA[`AWIDTH-1:0];
`REG_MATRIX_C_ADDR : address_mat_c <= PWDATA[`AWIDTH-1:0];
`REG_VALID_MASK_A_ROWS_ADDR: begin
validity_mask_a_rows <= PWDATA[`MASK_WIDTH-1:0];
end
`REG_VALID_MASK_A_COLS_ADDR: begin
validity_mask_a_cols <= PWDATA[`MASK_WIDTH-1:0];
end
`REG_VALID_MASK_B_ROWS_ADDR: begin
validity_mask_b_rows <= PWDATA[`MASK_WIDTH-1:0];
end
`REG_VALID_MASK_B_COLS_ADDR: begin
validity_mask_b_cols <= PWDATA[`MASK_WIDTH-1:0];
end
`REG_POOL_WINDOW_ADDR: pool_window_size <= PWDATA[`MAX_BITS_POOL-1:0];
`REG_ACCUM_ACTIONS_ADDR: begin
add_accum_to_output <= PWDATA[1];
save_output_to_accum <= PWDATA[0];
end
`REG_MATRIX_A_STRIDE_ADDR : address_stride_a <= PWDATA[`ADDR_STRIDE_WIDTH-1:0];
`REG_MATRIX_B_STRIDE_ADDR : address_stride_b <= PWDATA[`ADDR_STRIDE_WIDTH-1:0];
`REG_MATRIX_C_STRIDE_ADDR : address_stride_c <= PWDATA[`ADDR_STRIDE_WIDTH-1:0];
`REG_ACTIVATION_CSR_ADDR : activation_type <= PWDATA[0];
`REG_CONV_PARAMS_1_ADDR : begin
conv_filter_height <= PWDATA[3:0];
conv_filter_width <= PWDATA[7:4];
conv_stride_horiz <= PWDATA[11:8];
conv_stride_verti <= PWDATA[15:12];
conv_padding_left <= PWDATA[19:16];
conv_padding_right <= PWDATA[23:20];
conv_padding_top <= PWDATA[27:24];
conv_padding_bottom<= PWDATA[31:28];
end
`REG_CONV_PARAMS_2_ADDR : begin
num_channels_inp <= PWDATA[15:0];
num_channels_out <= PWDATA[31:16];
end
`REG_CONV_PARAMS_3_ADDR : begin
inp_img_height <= PWDATA[15:0];
inp_img_width <= PWDATA[31:16];
end
`REG_CONV_PARAMS_4_ADDR : begin
out_img_height <= PWDATA[15:0];
out_img_width <= PWDATA[31:16];
end
`REG_BATCH_SIZE_ADDR : batch_size <= PWDATA[31:0];
default: reg_dummy <= PWDATA; //sink writes to a dummy register
endcase
PREADY <=1;
end
State <= `IDLE;
end
`R_ENABLE : begin
if (PSEL && !PWRITE && PENABLE) begin
PREADY <= 1;
case (PADDR)
`REG_ENABLES_ADDR : PRDATA <= {28'b0, enable_activation, enable_pool, enable_norm, enable_matmul};
`REG_STDN_TPU_ADDR : PRDATA <= {done_tpu, 30'b0, start_tpu};
`REG_MEAN_ADDR : PRDATA <= mean;
`REG_INV_VAR_ADDR : PRDATA <= inv_var;
`REG_MATRIX_A_ADDR : PRDATA <= address_mat_a;
`REG_MATRIX_B_ADDR : PRDATA <= address_mat_b;
`REG_MATRIX_C_ADDR : PRDATA <= address_mat_c;
`REG_VALID_MASK_A_ROWS_ADDR: PRDATA <= validity_mask_a_rows;
`REG_VALID_MASK_A_COLS_ADDR: PRDATA <= validity_mask_a_cols;
`REG_VALID_MASK_B_ROWS_ADDR: PRDATA <= validity_mask_b_rows;
`REG_VALID_MASK_B_COLS_ADDR: PRDATA <= validity_mask_b_cols;
`REG_POOL_WINDOW_ADDR : PRDATA <= pool_window_size;
`REG_ACCUM_ACTIONS_ADDR: PRDATA <= {30'b0, add_accum_to_output, save_output_to_accum};
`REG_MATRIX_A_STRIDE_ADDR : PRDATA <= address_stride_a;
`REG_MATRIX_B_STRIDE_ADDR : PRDATA <= address_stride_b;
`REG_MATRIX_C_STRIDE_ADDR : PRDATA <= address_stride_c;
`REG_ACTIVATION_CSR_ADDR : PRDATA <= {31'b0, activation_type};
`REG_CONV_PARAMS_1_ADDR : PRDATA <= {
conv_filter_height,
conv_filter_width,
conv_stride_horiz,
conv_stride_verti,
conv_padding_left,
conv_padding_right,
conv_padding_top,
conv_padding_bottom
};
`REG_CONV_PARAMS_2_ADDR : PRDATA <= {
num_channels_inp,
num_channels_out
};
`REG_CONV_PARAMS_3_ADDR : PRDATA <= {
inp_img_height,
inp_img_width
};
`REG_CONV_PARAMS_4_ADDR : PRDATA <= {
out_img_height,
out_img_width
};
`REG_BATCH_SIZE_ADDR : PRDATA <= batch_size;
default : PRDATA <= reg_dummy; //read the dummy register for undefined addresses
endcase
end
State <= `IDLE;
end
default: begin
State <= `IDLE;
end
endcase
end
end
endmodule
////////////////////////////////////////////////
// Normalization block
////////////////////////////////////////////////
module norm(
input enable_norm,
input [`DWIDTH-1:0] mean,
input [`DWIDTH-1:0] inv_var,
input in_data_available,
input [`DESIGN_SIZE*`DWIDTH-1:0] inp_data,
output [`DESIGN_SIZE*`DWIDTH-1:0] out_data,
output out_data_available,
input [`MASK_WIDTH-1:0] validity_mask,
output done_norm,
input clk,
input reset
);
reg out_data_available_internal;
wire [`DESIGN_SIZE*`DWIDTH-1:0] out_data_internal;
reg [`DESIGN_SIZE*`DWIDTH-1:0] mean_applied_data;
reg [`DESIGN_SIZE*`DWIDTH-1:0] variance_applied_data;
reg done_norm_internal;
reg norm_in_progress;
reg in_data_available_flopped;
reg [`DESIGN_SIZE*`DWIDTH-1:0] inp_data_flopped;
//Muxing logic to handle the case when this block is disabled
assign out_data_available = (enable_norm) ? out_data_available_internal : in_data_available_flopped;
assign out_data = (enable_norm) ? out_data_internal : inp_data_flopped;
assign done_norm = (enable_norm) ? done_norm_internal : 1'b1;
//inp_data will have multiple elements in it. the number of elements is the same as size of the matmul.
//on each clock edge, if in_data_available is 1, then we will normalize the inputs.
//the code uses the funky part-select syntax. example:
//wire [7:0] byteN = word[byte_num*8 +: 8];
//byte_num*8 is the starting point. 8 is the width is the part-select (has to be constant).in_data_available
//+: indicates the part-select increases from the starting point
//-: indicates the part-select decreases from the starting point
//another example:
//loc = 3;
//PA[loc -:4] = PA[loc+1 +:4]; // equivalent to PA[3:0] = PA[7:4];
reg [31:0] cycle_count;
reg [31:0] i;
always @(posedge clk) begin
if ((reset || ~enable_norm)) begin
mean_applied_data <= 0;
variance_applied_data <= 0;
out_data_available_internal <= 0;
cycle_count <= 0;
done_norm_internal <= 0;
norm_in_progress <= 0;
in_data_available_flopped <= in_data_available;
inp_data_flopped <= inp_data;
end else if (in_data_available || norm_in_progress) begin
cycle_count = cycle_count + 1;
//Let's apply mean and variance as the input data comes in.
//We have a pipeline here. First stage does the add (to apply the mean)
//and second stage does the multiplication (to apply the variance).
//Note: the following loop is not a loop across multiple columns of data.
//This loop will run in 2 cycle on the same column of data that comes into
//this module in 1 clock.
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
if (validity_mask[i] == 1'b1) begin
mean_applied_data[i*`DWIDTH +: `DWIDTH] <= (inp_data[i*`DWIDTH +: `DWIDTH] - mean);
variance_applied_data[i*`DWIDTH +: `DWIDTH] <= (mean_applied_data[i*`DWIDTH +: `DWIDTH] * inv_var);
end
else begin
mean_applied_data[i*`DWIDTH +: `DWIDTH] <= (inp_data[i*`DWIDTH +: `DWIDTH]);
variance_applied_data[i*`DWIDTH +: `DWIDTH] <= (mean_applied_data[i*`DWIDTH +: `DWIDTH]);
end
end
//Out data is available starting with the second clock cycle because
//in the first cycle, we only apply the mean.
if(cycle_count==2) begin
out_data_available_internal <= 1;
end
//When we've normalized values N times, where N is the matmul
//size, that means we're done. But there is one additional cycle
//that is taken in the beginning (when we are applying the mean to the first
//column of data). We can call this the Initiation Interval of the pipeline.
//So, for a 4x4 matmul, this block takes 5 cycles.
if(cycle_count==(`DESIGN_SIZE+1)) begin
done_norm_internal <= 1'b1;
norm_in_progress <= 0;
end
else begin
norm_in_progress <= 1;
end
end
else begin
mean_applied_data <= 0;
variance_applied_data <= 0;
out_data_available_internal <= 0;
cycle_count <= 0;
done_norm_internal <= 0;
norm_in_progress <= 0;
end
end
assign out_data_internal = variance_applied_data;
endmodule
//////////////////////////////////
// Dual port RAM
//////////////////////////////////
module ram (
addr0,
d0,
we0,
q0,
addr1,
d1,
we1,
q1,
clk);
input [`AWIDTH-1:0] addr0;
input [`AWIDTH-1:0] addr1;
input [`DESIGN_SIZE*`DWIDTH-1:0] d0;
input [`DESIGN_SIZE*`DWIDTH-1:0] d1;
input [`DESIGN_SIZE-1:0] we0;
input [`DESIGN_SIZE-1:0] we1;
output reg [`DESIGN_SIZE*`DWIDTH-1:0] q0;
output reg [`DESIGN_SIZE*`DWIDTH-1:0] q1;
input clk;
`ifdef SIMULATION
reg [7:0] ram[((1<<`AWIDTH)-1):0];
reg [31:0] i;
always @(posedge clk)
begin
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
if (we0[i]) ram[addr0+i] <= d0[i*`DWIDTH +: `DWIDTH];
end
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
q0[i*`DWIDTH +: `DWIDTH] <= ram[addr0+i];
end
end
always @(posedge clk)
begin
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
if (we1[i]) ram[addr0+i] <= d1[i*`DWIDTH +: `DWIDTH];
end
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
q1[i*`DWIDTH +: `DWIDTH] <= ram[addr1+i];
end
end
`else
//BRAMs available in VTR FPGA architectures have one bit write-enables.
//So let's combine multiple bits into 1. We don't have a usecase of
//writing/not-writing only parts of the word anyway.
wire we0_coalesced;
assign we0_coalesced = |we0;
wire we1_coalesced;
assign we1_coalesced = |we1;
dual_port_ram u_dual_port_ram(
.addr1(addr0),
.we1(we0_coalesced),
.data1(d0),
.out1(q0),
.addr2(addr1),
.we2(we1_coalesced),
.data2(d1),
.out2(q1),
.clk(clk)
);
`endif
endmodule
////////////////////////////////////////////////
// Control unit
////////////////////////////////////////////////
module control(
input clk,
input reset,
input start_tpu,
input enable_matmul,
input enable_norm,
input enable_activation,
input enable_pool,
output reg start_mat_mul,
input done_mat_mul,
input done_norm,
input done_pool,
input done_activation,
input save_output_to_accum,
output reg done_tpu
);
reg [3:0] state;
`define STATE_INIT 4'b0000
`define STATE_MATMUL 4'b0001
`define STATE_NORM 4'b0010
`define STATE_POOL 4'b0011
`define STATE_ACTIVATION 4'b0100
`define STATE_DONE 4'b0101
//////////////////////////////////////////////////////
// Assumption: We will always run matmul first. That is, matmul is not optional.
// The other blocks - norm, act, pool - are optional.
// Assumption: Order is fixed: Matmul -> Norm -> Pool -> Activation
//////////////////////////////////////////////////////
always @( posedge clk) begin
if (reset) begin
state <= `STATE_INIT;
start_mat_mul <= 1'b0;
done_tpu <= 1'b0;
end else begin
case (state)
`STATE_INIT: begin
if ((start_tpu == 1'b1) && (done_tpu == 1'b0)) begin
if (enable_matmul == 1'b1) begin
start_mat_mul <= 1'b1;
state <= `STATE_MATMUL;
end
end
end
//start_mat_mul is kinda used as a reset in some logic
//inside the matmul unit. So, we can't make it 0 right away after
//asserting it.
`STATE_MATMUL: begin
if (done_mat_mul == 1'b1) begin
start_mat_mul <= 1'b0;
if(save_output_to_accum) begin
state <= `STATE_DONE;
end
else if (enable_norm) begin
state <= `STATE_NORM;
end
else if (enable_pool) begin
state <= `STATE_POOL;
end
else if (enable_activation) begin
state <= `STATE_ACTIVATION;
end
else begin
state <= `STATE_DONE;
end
end
else begin
start_mat_mul <= 1'b1;
end
end
`STATE_NORM: begin
if (done_norm == 1'b1) begin
if (enable_pool) begin
state <= `STATE_POOL;
end
else if (enable_activation) begin
state <= `STATE_ACTIVATION;
end
else begin
state <= `STATE_DONE;
end
end
end
`STATE_POOL: begin
if (done_pool == 1'b1) begin
if (enable_activation) begin
state <= `STATE_ACTIVATION;
end
else begin
state <= `STATE_DONE;
end
end
end
`STATE_ACTIVATION: begin
if (done_activation == 1'b1) begin
state <= `STATE_DONE;
end
end
`STATE_DONE: begin
//We need to write start_tpu to 0 in the CFG block to get out of this state
if (start_tpu == 1'b0) begin
state <= `STATE_INIT;
done_tpu <= 0;
end
else begin
done_tpu <= 1;
end
end
endcase
end
end
endmodule
////////////////////////////////////////////////
// Pooling block
////////////////////////////////////////////////
module pool(
input enable_pool,
input in_data_available,
input [`MAX_BITS_POOL-1:0] pool_window_size,
input [`DESIGN_SIZE*`DWIDTH-1:0] inp_data,
output [`DESIGN_SIZE*`DWIDTH-1:0] out_data,
output out_data_available,
input [`MASK_WIDTH-1:0] validity_mask,
output done_pool,
input clk,
input reset
);
reg in_data_available_flopped;
reg [`DESIGN_SIZE*`DWIDTH-1:0] inp_data_flopped;
reg [`DESIGN_SIZE*`DWIDTH-1:0] out_data_temp;
reg done_pool_temp;
reg out_data_available_temp;
reg [31:0] i,j;
reg [31:0] cycle_count;
always @(posedge clk) begin
if (reset || ~enable_pool || ~in_data_available) begin
out_data_temp <= 0;
done_pool_temp <= 0;
out_data_available_temp <= 0;
cycle_count <= 0;
in_data_available_flopped <= in_data_available;
inp_data_flopped <= inp_data;
end
else if (in_data_available) begin
cycle_count = cycle_count + 1;
out_data_available_temp <= 1;
case (pool_window_size)
1: begin
out_data_temp <= inp_data;
end
2: begin
for (i = 0; i < `DESIGN_SIZE/2; i = i + 8) begin
out_data_temp[ i +: 8] <= (inp_data[i*2 +: 8] + inp_data[i*2 + 8 +: 8]) >> 1;
end
end
4: begin
for (i = 0; i < `DESIGN_SIZE/4; i = i + 8) begin
//TODO: If 3 adders are the critical path, break into 2 cycles
out_data_temp[ i +: 8] <= (inp_data[i*4 +: 8] + inp_data[i*4 + 8 +: 8] + inp_data[i*4 + 16 +: 8] + inp_data[i*4 + 24 +: 8]) >> 2;
end
end
endcase
if(cycle_count==`DESIGN_SIZE) begin
done_pool_temp <= 1'b1;
end
end
end
assign out_data = enable_pool ? out_data_temp : inp_data_flopped;
assign out_data_available = enable_pool ? out_data_available_temp : in_data_available_flopped;
assign done_pool = enable_pool ? done_pool_temp : 1'b1;
//Adding a dummy signal to use validity_mask input, to make ODIN happy
wire [`MASK_WIDTH-1:0] dummy;
assign dummy = validity_mask;
endmodule
////////////////////////////////////////////////
// Activation block
////////////////////////////////////////////////
module activation(
input activation_type,
input enable_activation,
input in_data_available,
input [`DESIGN_SIZE*`DWIDTH-1:0] inp_data,
output [`DESIGN_SIZE*`DWIDTH-1:0] out_data,
output out_data_available,
input [`MASK_WIDTH-1:0] validity_mask,
output done_activation,
input clk,
input reset
);
reg done_activation_internal;
reg out_data_available_internal;
wire [`DESIGN_SIZE*`DWIDTH-1:0] out_data_internal;
reg [`DESIGN_SIZE*`DWIDTH-1:0] slope_applied_data_internal;
reg [`DESIGN_SIZE*`DWIDTH-1:0] intercept_applied_data_internal;
reg [`DESIGN_SIZE*`DWIDTH-1:0] relu_applied_data_internal;
reg [31:0] i;
reg [31:0] cycle_count;
reg activation_in_progress;
reg [(`DESIGN_SIZE*4)-1:0] address;
reg [(`DESIGN_SIZE*8)-1:0] data_slope;
reg [(`DESIGN_SIZE*8)-1:0] data_slope_flopped;
reg [(`DESIGN_SIZE*8)-1:0] data_intercept;
reg [(`DESIGN_SIZE*8)-1:0] data_intercept_delayed;
reg [(`DESIGN_SIZE*8)-1:0] data_intercept_flopped;
reg in_data_available_flopped;
reg [`DESIGN_SIZE*`DWIDTH-1:0] inp_data_flopped;
always @(posedge clk) begin
if (reset) begin
inp_data_flopped <= 0;
data_slope_flopped <= 0;
end else begin
inp_data_flopped <= inp_data;
data_slope_flopped <= data_slope;
end
end
// If the activation block is not enabled, just forward the input data
assign out_data = enable_activation ? out_data_internal : inp_data_flopped;
assign done_activation = enable_activation ? done_activation_internal : 1'b1;
assign out_data_available = enable_activation ? out_data_available_internal : in_data_available_flopped;
always @(posedge clk) begin
if (reset || ~enable_activation) begin
slope_applied_data_internal <= 0;
intercept_applied_data_internal <= 0;
relu_applied_data_internal <= 0;
data_intercept_delayed <= 0;
data_intercept_flopped <= 0;
done_activation_internal <= 0;
out_data_available_internal <= 0;
cycle_count <= 0;
activation_in_progress <= 0;
in_data_available_flopped <= in_data_available;
end else if(in_data_available || activation_in_progress) begin
cycle_count = cycle_count + 1;
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
if(activation_type==1'b1) begin // tanH
slope_applied_data_internal[i*`DWIDTH +:`DWIDTH] <= data_slope_flopped[i*8 +: 8] * inp_data_flopped[i*`DWIDTH +:`DWIDTH];
data_intercept_flopped[i*8 +: 8] <= data_intercept[i*8 +: 8];
data_intercept_delayed[i*8 +: 8] <= data_intercept_flopped[i*8 +: 8];
intercept_applied_data_internal[i*`DWIDTH +:`DWIDTH] <= slope_applied_data_internal[i*`DWIDTH +:`DWIDTH] + data_intercept_delayed[i*8 +: 8];
end else begin // ReLU
relu_applied_data_internal[i*`DWIDTH +:`DWIDTH] <= inp_data[i*`DWIDTH] ? {`DWIDTH{1'b0}} : inp_data[i*`DWIDTH +:`DWIDTH];
end
end
//TANH needs 1 extra cycle
if (activation_type==1'b1) begin
if (cycle_count==3) begin
out_data_available_internal <= 1;
end
end else begin
if (cycle_count==2) begin
out_data_available_internal <= 1;
end
end
//TANH needs 1 extra cycle
if (activation_type==1'b1) begin
if(cycle_count==(`DESIGN_SIZE+2)) begin
done_activation_internal <= 1'b1;
activation_in_progress <= 0;
end
else begin
activation_in_progress <= 1;
end
end else begin
if(cycle_count==(`DESIGN_SIZE+1)) begin
done_activation_internal <= 1'b1;
activation_in_progress <= 0;
end
else begin
activation_in_progress <= 1;
end
end
end
else begin
slope_applied_data_internal <= 0;
intercept_applied_data_internal <= 0;
relu_applied_data_internal <= 0;
data_intercept_delayed <= 0;
data_intercept_flopped <= 0;
done_activation_internal <= 0;
out_data_available_internal <= 0;
cycle_count <= 0;
activation_in_progress <= 0;
end
end
assign out_data_internal = (activation_type) ? intercept_applied_data_internal : relu_applied_data_internal;
//Our equation of tanh is Y=AX+B
//A is the slope and B is the intercept.
//We store A in one LUT and B in another.
//LUT for the slope
always @(address) begin
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
case (address[i*4+:4])
4'b0000: data_slope[i*8+:8] = 8'd0;
4'b0001: data_slope[i*8+:8] = 8'd0;
4'b0010: data_slope[i*8+:8] = 8'd2;
4'b0011: data_slope[i*8+:8] = 8'd3;
4'b0100: data_slope[i*8+:8] = 8'd4;
4'b0101: data_slope[i*8+:8] = 8'd0;
4'b0110: data_slope[i*8+:8] = 8'd4;
4'b0111: data_slope[i*8+:8] = 8'd3;
4'b1000: data_slope[i*8+:8] = 8'd2;
4'b1001: data_slope[i*8+:8] = 8'd0;
4'b1010: data_slope[i*8+:8] = 8'd0;
default: data_slope[i*8+:8] = 8'd0;
endcase
end
end
//LUT for the intercept
always @(address) begin
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
case (address[i*4+:4])
4'b0000: data_intercept[i*8+:8] = 8'd127;
4'b0001: data_intercept[i*8+:8] = 8'd99;
4'b0010: data_intercept[i*8+:8] = 8'd46;
4'b0011: data_intercept[i*8+:8] = 8'd18;
4'b0100: data_intercept[i*8+:8] = 8'd0;
4'b0101: data_intercept[i*8+:8] = 8'd0;
4'b0110: data_intercept[i*8+:8] = 8'd0;
4'b0111: data_intercept[i*8+:8] = -8'd18;
4'b1000: data_intercept[i*8+:8] = -8'd46;
4'b1001: data_intercept[i*8+:8] = -8'd99;
4'b1010: data_intercept[i*8+:8] = -8'd127;
default: data_intercept[i*8+:8] = 8'd0;
endcase
end
end
//Logic to find address
always @(inp_data) begin
for (i = 0; i < `DESIGN_SIZE; i=i+1) begin
if((inp_data[i*`DWIDTH +:`DWIDTH])>=90) begin
address[i*4+:4] = 4'b0000;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>=39 && (inp_data[i*`DWIDTH +:`DWIDTH])<90) begin
address[i*4+:4] = 4'b0001;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>=28 && (inp_data[i*`DWIDTH +:`DWIDTH])<39) begin
address[i*4+:4] = 4'b0010;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>=16 && (inp_data[i*`DWIDTH +:`DWIDTH])<28) begin
address[i*4+:4] = 4'b0011;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>=1 && (inp_data[i*`DWIDTH +:`DWIDTH])<16) begin
address[i*4+:4] = 4'b0100;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])==0) begin
address[i*4+:4] = 4'b0101;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>-16 && (inp_data[i*`DWIDTH +:`DWIDTH])<=-1) begin
address[i*4+:4] = 4'b0110;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>-28 && (inp_data[i*`DWIDTH +:`DWIDTH])<=-16) begin
address[i*4+:4] = 4'b0111;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>-39 && (inp_data[i*`DWIDTH +:`DWIDTH])<=-28) begin
address[i*4+:4] = 4'b1000;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])>-90 && (inp_data[i*`DWIDTH +:`DWIDTH])<=-39) begin
address[i*4+:4] = 4'b1001;
end
else if ((inp_data[i*`DWIDTH +:`DWIDTH])<=-90) begin
address[i*4+:4] = 4'b1010;
end
else begin
address[i*4+:4] = 4'b0101;
end
end
end
//Adding a dummy signal to use validity_mask input, to make ODIN happy
//TODO: Need to correctly use validity_mask
wire [`MASK_WIDTH-1:0] dummy;
assign dummy = validity_mask;
endmodule
//////////////////////////////////////////////////////
// Top module
//////////////////////////////////////////////////////
module top(
input clk,
input clk_mem,
input reset,
input resetn,
input [`REG_ADDRWIDTH-1:0] PADDR,
input PWRITE,
input PSEL,
input PENABLE,
input [`REG_DATAWIDTH-1:0] PWDATA,
output [`REG_DATAWIDTH-1:0] PRDATA,
output PREADY,
input [`AWIDTH-1:0] bram_addr_a_ext,
output [`DESIGN_SIZE*`DWIDTH-1:0] bram_rdata_a_ext,
input [`DESIGN_SIZE*`DWIDTH-1:0] bram_wdata_a_ext,
input [`DESIGN_SIZE-1:0] bram_we_a_ext,
input [`AWIDTH-1:0] bram_addr_b_ext,
output [`DESIGN_SIZE*`DWIDTH-1:0] bram_rdata_b_ext,
input [`DESIGN_SIZE*`DWIDTH-1:0] bram_wdata_b_ext,
input [`DESIGN_SIZE-1:0] bram_we_b_ext
);
wire [`AWIDTH-1:0] bram_addr_a;
wire [`AWIDTH-1:0] bram_addr_a_for_reading;
reg [`AWIDTH-1:0] bram_addr_a_for_writing;
wire [`DESIGN_SIZE*`DWIDTH-1:0] bram_rdata_a;
reg [`DESIGN_SIZE*`DWIDTH-1:0] bram_wdata_a;
wire [`DESIGN_SIZE-1:0] bram_we_a;
wire bram_en_a;
wire [`AWIDTH-1:0] bram_addr_b;
wire [`DESIGN_SIZE*`DWIDTH-1:0] bram_rdata_b;
wire [`DESIGN_SIZE*`DWIDTH-1:0] bram_wdata_b;
wire [`DESIGN_SIZE-1:0] bram_we_b;
wire bram_en_b;
reg bram_a_wdata_available;
wire [`AWIDTH-1:0] bram_addr_c_NC;
wire start_tpu;
wire done_tpu;
wire start_mat_mul;
wire done_mat_mul;
wire norm_out_data_available;
wire done_norm;
wire pool_out_data_available;
wire done_pool;
wire activation_out_data_available;
wire done_activation;
wire enable_matmul;
wire enable_norm;
wire enable_activation;
wire enable_pool;
wire [`DESIGN_SIZE*`DWIDTH-1:0] matmul_c_data_out;
wire [`DESIGN_SIZE*`DWIDTH-1:0] norm_data_out;
wire [`DESIGN_SIZE*`DWIDTH-1:0] pool_data_out;
wire [`DESIGN_SIZE*`DWIDTH-1:0] activation_data_out;
wire matmul_c_data_available;
wire [`DESIGN_SIZE*`DWIDTH-1:0] a_data_out_NC;
wire [`DESIGN_SIZE*`DWIDTH-1:0] b_data_out_NC;
wire [`DESIGN_SIZE*`DWIDTH-1:0] a_data_in_NC;
wire [`DESIGN_SIZE*`DWIDTH-1:0] b_data_in_NC;
wire [`DWIDTH-1:0] mean;
wire [`DWIDTH-1:0] inv_var;
wire [`AWIDTH-1:0] address_mat_a;
wire [`AWIDTH-1:0] address_mat_b;
wire [`AWIDTH-1:0] address_mat_c;
wire [`MASK_WIDTH-1:0] validity_mask_a_rows;
wire [`MASK_WIDTH-1:0] validity_mask_a_cols;
wire [`MASK_WIDTH-1:0] validity_mask_b_rows;
wire [`MASK_WIDTH-1:0] validity_mask_b_cols;
wire save_output_to_accum;
wire add_accum_to_output;
wire [`ADDR_STRIDE_WIDTH-1:0] address_stride_a;
wire [`ADDR_STRIDE_WIDTH-1:0] address_stride_b;
wire [`ADDR_STRIDE_WIDTH-1:0] address_stride_c;
wire [`MAX_BITS_POOL-1:0] pool_window_size;
wire activation_type;
wire [3:0] conv_filter_height;
wire [3:0] conv_filter_width;
wire [3:0] conv_stride_horiz;
wire [3:0] conv_stride_verti;
wire [3:0] conv_padding_left;
wire [3:0] conv_padding_right;
wire [3:0] conv_padding_top;
wire [3:0] conv_padding_bottom;
wire [15:0] num_channels_inp;
wire [15:0] num_channels_out;
wire [15:0] inp_img_height;
wire [15:0] inp_img_width;
wire [15:0] out_img_height;
wire [15:0] out_img_width;
wire [31:0] batch_size;
wire enable_conv_mode;
wire pe_reset;
//Connections for bram a (activation/input matrix)
//bram_addr_a -> connected to u_matmul_4x4
//bram_rdata_a -> connected to u_matmul_4x4
//bram_wdata_a -> will come from the last block that is enabled
//bram_we_a -> will be 1 when the last block's data is available
//bram_en_a -> hardcoded to 1
assign bram_addr_a = (bram_a_wdata_available) ? bram_addr_a_for_writing : bram_addr_a_for_reading;
assign bram_en_a = 1'b1;
assign bram_we_a = (bram_a_wdata_available) ? {`DESIGN_SIZE{1'b1}} : {`DESIGN_SIZE{1'b0}};
//Connections for bram b (weights matrix)
//bram_addr_b -> connected to u_matmul_4x4
//bram_rdata_b -> connected to u_matmul_4x4
//bram_wdata_b -> hardcoded to 0 (this block only reads from bram b)
//bram_we_b -> hardcoded to 0 (this block only reads from bram b)
//bram_en_b -> hardcoded to 1
assign bram_wdata_b = {`DESIGN_SIZE*`DWIDTH{1'b0}};
assign bram_en_b = 1'b1;
assign bram_we_b = {`DESIGN_SIZE{1'b0}};
////////////////////////////////////////////////////////////////
// BRAM matrix A (inputs/activations)
////////////////////////////////////////////////////////////////
ram matrix_A (
.addr0(bram_addr_a),
.d0(bram_wdata_a),
.we0(bram_we_a),
.q0(bram_rdata_a),
.addr1(bram_addr_a_ext),
.d1(bram_wdata_a_ext),
.we1(bram_we_a_ext),
.q1(bram_rdata_a_ext),
.clk(clk_mem));
////////////////////////////////////////////////////////////////
// BRAM matrix B (weights)
////////////////////////////////////////////////////////////////
ram matrix_B (
.addr0(bram_addr_b),
.d0(bram_wdata_b),
.we0(bram_we_b),
.q0(bram_rdata_b),
.addr1(bram_addr_b_ext),
.d1(bram_wdata_b_ext),
.we1(bram_we_b_ext),
.q1(bram_rdata_b_ext),
.clk(clk_mem));
////////////////////////////////////////////////////////////////
// Control logic that directs all the operation
////////////////////////////////////////////////////////////////
control u_control(
.clk(clk),
.reset(reset),
.start_tpu(start_tpu),
.enable_matmul(enable_matmul),
.enable_norm(enable_norm),
.enable_activation(enable_activation),
.enable_pool(enable_pool),
.start_mat_mul(start_mat_mul),
.done_mat_mul(done_mat_mul),
.done_norm(done_norm),
.done_pool(done_pool),
.done_activation(done_activation),
.save_output_to_accum(save_output_to_accum),
.done_tpu(done_tpu)
);
////////////////////////////////////////////////////////////////
// Configuration (register) block
////////////////////////////////////////////////////////////////
cfg u_cfg(
.PCLK(clk),
.PRESETn(resetn),
.PADDR(PADDR),
.PWRITE(PWRITE),
.PSEL(PSEL),
.PENABLE(PENABLE),
.PWDATA(PWDATA),
.PRDATA(PRDATA),
.PREADY(PREADY),
.start_tpu(start_tpu),
.enable_matmul(enable_matmul),
.enable_norm(enable_norm),
.enable_pool(enable_pool),
.enable_activation(enable_activation),
.enable_conv_mode(enable_conv_mode),
.mean(mean),
.inv_var(inv_var),
.pool_window_size(pool_window_size),
.address_mat_a(address_mat_a),
.address_mat_b(address_mat_b),
.address_mat_c(address_mat_c),
.validity_mask_a_rows(validity_mask_a_rows),
.validity_mask_a_cols(validity_mask_a_cols),
.validity_mask_b_rows(validity_mask_b_rows),
.validity_mask_b_cols(validity_mask_b_cols),
.save_output_to_accum(save_output_to_accum),
.add_accum_to_output(add_accum_to_output),
.address_stride_a(address_stride_a),
.address_stride_b(address_stride_b),
.address_stride_c(address_stride_c),
.activation_type(activation_type),
.conv_filter_height(conv_filter_height),
.conv_filter_width(conv_filter_width),
.conv_stride_horiz(conv_stride_horiz),
.conv_stride_verti(conv_stride_verti),
.conv_padding_left(conv_padding_left),
.conv_padding_right(conv_padding_right),
.conv_padding_top(conv_padding_top),
.conv_padding_bottom(conv_padding_bottom),
.num_channels_inp(num_channels_inp),
.num_channels_out(num_channels_out),
.inp_img_height(inp_img_height),
.inp_img_width(inp_img_width),
.out_img_height(out_img_height),
.out_img_width(out_img_width),
.batch_size(batch_size),
.pe_reset(pe_reset),
.done_tpu(done_tpu)
);
//TODO: We want to move the data setup part
//and the interface to BRAM_A and BRAM_B outside
//into its own modules. For now, it is all inside
//the matmul block
////////////////////////////////////////////////////////////////
//Matrix multiplier
//Note: the ports on this module to write data to bram c
//are not used in this top module.
////////////////////////////////////////////////////////////////
matmul_16x16_systolic u_matmul(
.clk(clk),
.reset(reset),
.pe_reset(pe_reset),
.start_mat_mul(start_mat_mul),
.done_mat_mul(done_mat_mul),
.address_mat_a(address_mat_a),
.address_mat_b(address_mat_b),
.address_mat_c(address_mat_c),
.address_stride_a(address_stride_a),
.address_stride_b(address_stride_b),
.address_stride_c(address_stride_c),
.a_data(bram_rdata_a),
.b_data(bram_rdata_b),
.a_data_in(a_data_in_NC),
.b_data_in(b_data_in_NC),
.c_data_in({`DESIGN_SIZE*`DWIDTH{1'b0}}),
.c_data_out(matmul_c_data_out),
.a_data_out(a_data_out_NC),
.b_data_out(b_data_out_NC),
.a_addr(bram_addr_a_for_reading),
.b_addr(bram_addr_b),
.c_addr(bram_addr_c_NC),
.c_data_available(matmul_c_data_available),
.validity_mask_a_rows(validity_mask_a_rows),
.validity_mask_a_cols(validity_mask_a_cols),
.validity_mask_b_rows(validity_mask_b_rows),
.validity_mask_b_cols(validity_mask_b_cols),
.final_mat_mul_size(8'd16),
.a_loc(8'd0),
.b_loc(8'd0)
);
////////////////////////////////////////////////////////////////
// Normalization module
////////////////////////////////////////////////////////////////
norm u_norm(
.enable_norm(enable_norm),
.mean(mean),
.inv_var(inv_var),
.in_data_available(matmul_c_data_available),
.inp_data(matmul_c_data_out),
.out_data(norm_data_out),
.out_data_available(norm_out_data_available),
.validity_mask(validity_mask_a_rows),
.done_norm(done_norm),
.clk(clk),
.reset(reset)
);
////////////////////////////////////////////////////////////////
// Pooling module
////////////////////////////////////////////////////////////////
pool u_pool(
.enable_pool(enable_pool),
.in_data_available(norm_out_data_available),
.pool_window_size(pool_window_size),
.inp_data(norm_data_out),
.out_data(pool_data_out),
.out_data_available(pool_out_data_available),
.validity_mask(validity_mask_a_rows),
.done_pool(done_pool),
.clk(clk),
.reset(reset)
);
////////////////////////////////////////////////////////////////
// Activation module
////////////////////////////////////////////////////////////////
activation u_activation(
.activation_type(activation_type),
.enable_activation(enable_activation),
.in_data_available(pool_out_data_available),
.inp_data(pool_data_out),
.out_data(activation_data_out),
.out_data_available(activation_out_data_available),
.validity_mask(validity_mask_a_rows),
.done_activation(done_activation),
.clk(clk),
.reset(reset)
);
//Interface to BRAM to write the output.
//Ideally, we could remove this flop stage. But then we'd
//have to generate the address for the output BRAM in each
//block that could potentially write the output.
always @(posedge clk) begin
if (reset) begin
bram_wdata_a <= 0;
bram_addr_a_for_writing <= address_mat_c + address_stride_c;
bram_a_wdata_available <= 0;
end
else if (activation_out_data_available) begin
bram_wdata_a <= activation_data_out;
bram_addr_a_for_writing <= bram_addr_a_for_writing - address_stride_c;
bram_a_wdata_available <= activation_out_data_available;
end
else begin
bram_wdata_a <= 0;
bram_addr_a_for_writing <= address_mat_c + address_stride_c;
bram_a_wdata_available <= 0;
end
end
endmodule