//auto-generated top.v
//top level module of LU factorization
//by Wei Zhang

`define NWIDTH 6'b010100
`define BLOCKWIDTH 4'b0111
`define DDRWIDTH 7'b0100000
`define DDRNUMDQS 4'b0100
`define DDRSIZEWIDTH 6'b011000
`define BURSTLEN 3'b010
`define MEMCONWIDTH 8'b01000000
`define MEMCONNUMBYTES 5'b01000
`define RAMWIDTH 12'b010000000000
`define RAMNUMBYTES 9'b010000000
`define RAMSIZEWIDTH 4'b0111
`define TOPWIDTH 7'b0100000
`define rFIFOINPUTWIDTH 8'b01000000
`define wFIFOINPUTWIDTH 12'b010000000000
`define mFIFOWIDTH 6'b011100
`define aFIFOWIDTH 4'b0111

module LU32PEEng (clk, //ref_clk, global_reset_n,
 start, N, offset, done,
		//mem_addr, mem_ba, mem_cas_n, mem_cke, mem_clk, mem_clk_n, mem_cs_n,
burst_begin,
mem_local_be,
mem_local_read_req,
mem_local_size,
mem_local_wdata,
mem_local_write_req,
mem_local_rdata,
mem_local_rdata_valid,
mem_local_ready,
mem_local_wdata_req,
reset_n,
mem_local_addr
//Cong: dummy output
//a_junk,
//w_junk,
//m_junk,
//r_junk,
//Cong:dummy output
//junk_r,
//junk_r1,
//junk_r2,
//junk_r3,
//junk_top
	);

input start;
input[`NWIDTH-1:0] N;
input[`DDRSIZEWIDTH-1:0] offset;
output done;
input clk;

output burst_begin;
output [`MEMCONNUMBYTES-1:0] mem_local_be;
output mem_local_read_req;
output [`BURSTLEN-1:0] mem_local_size;
output [`MEMCONWIDTH-1:0] mem_local_wdata;
output mem_local_write_req;
output [`DDRSIZEWIDTH-1:0] mem_local_addr;
input [`MEMCONWIDTH-1:0] mem_local_rdata;
input mem_local_rdata_valid;
input mem_local_ready;
input reset_n;
input mem_local_wdata_req;
wire[`BLOCKWIDTH-1:0] m, n, loop;
wire[1:0] mode;
wire comp_start, comp_done;
wire dtu_write_req, dtu_read_req, dtu_ack, dtu_done;
wire [`DDRSIZEWIDTH-1:0] dtu_mem_addr;
wire [`RAMSIZEWIDTH-1:0] dtu_ram_addr;
wire [`BLOCKWIDTH-1:0] dtu_size;
wire left_sel;

wire[`RAMWIDTH-1:0] curWriteDataMem, curReadDataMem;
wire[`RAMSIZEWIDTH-1:0] curWriteAddrMem, curReadAddrMem;
wire[`RAMNUMBYTES-1:0] curWriteByteEnMem;
wire curWriteEnMem;
wire[`RAMWIDTH-1:0] leftWriteDataMem;
wire[`RAMSIZEWIDTH-1:0] leftWriteAddrMem;
wire[`RAMNUMBYTES-1:0] leftWriteByteEnMem;
wire leftWriteEnMem;
wire curMemSel, leftMemSel;

wire burst_begin;
wire [`MEMCONNUMBYTES-1:0] mem_local_be;
wire mem_local_read_req;
wire [`BURSTLEN-1:0] mem_local_size;
wire [`MEMCONWIDTH-1:0] mem_local_wdata;
wire mem_local_write_req;
wire [`MEMCONWIDTH-1:0] mem_local_rdata;
wire mem_local_rdata_valid;
wire mem_local_ready;
wire mem_local_wdata_req;
wire reset_n;
wire [`DDRSIZEWIDTH-1:0] mem_local_addr;

wire[`RAMWIDTH-1:0] ram_write_data, ram_read_data;
wire[`RAMSIZEWIDTH-1:0] ram_write_addr, ram_read_addr;
wire[`RAMNUMBYTES-1:0] ram_write_byte_en;
wire ram_write_en;

MarshallerController MC (clk, start, done, N, offset,
	comp_start, m, n, loop, mode, comp_done, curMemSel, leftMemSel,
	dtu_write_req, dtu_read_req, dtu_mem_addr, dtu_ram_addr, dtu_size, dtu_ack, dtu_done, left_sel);

// block that computes the LU factorization, with answer stored back into ram block
LU compBlock (clk, comp_start, m, n, loop, mode, comp_done,
			curReadAddrMem, curReadDataMem, curWriteByteEnMem, curWriteDataMem, curWriteAddrMem, curWriteEnMem, curMemSel,
			leftWriteByteEnMem, leftWriteDataMem, leftWriteAddrMem, leftWriteEnMem, leftMemSel);

DataTransferUnit DTU (.clk(clk), .dtu_write_req(dtu_write_req), .dtu_read_req(dtu_read_req), .dtu_mem_addr(dtu_mem_addr), .dtu_ram_addr(dtu_ram_addr), .dtu_size(dtu_size), .dtu_ack(dtu_ack), .dtu_done(dtu_done),
		.ram_read_addr(ram_read_addr), .ram_read_data(ram_read_data), .ram_write_byte_en(ram_write_byte_en), .ram_write_data(ram_write_data), .ram_write_addr(ram_write_addr), .ram_write_en(ram_write_en),
		.mem_rdata(mem_local_rdata), .mem_rdata_valid(mem_local_rdata_valid), .mem_ready(mem_local_ready), .mem_wdata_req(mem_local_wdata_req), .reset_n(reset_n),
		.burst_begin(burst_begin), .mem_local_addr(mem_local_addr), .mem_be(mem_local_be), .mem_read_req(mem_local_read_req), .mem_size(mem_local_size),
		.mem_wdata(mem_local_wdata), .mem_write_req(mem_local_write_req)
		//Cong: dummy output
		);

assign curReadAddrMem = ram_read_addr;
assign curWriteByteEnMem = ram_write_byte_en;
assign curWriteDataMem = ram_write_data;
assign curWriteAddrMem = ram_write_addr;
assign curWriteEnMem = ram_write_en && (left_sel == 0);
assign leftWriteByteEnMem = ram_write_byte_en;
assign leftWriteDataMem = ram_write_data;
assign leftWriteAddrMem = ram_write_addr;
assign leftWriteEnMem = ram_write_en && (left_sel == 1);
assign ram_read_data = curReadDataMem;
endmodule
`define BLOCKM 8'b01000000
`define BLOCKN 8'b01000000
`define BLOCKMDIVK 3'b010
`define MEMBLOCKM 7'b0100000
`define MEMBLOCKN 7'b0100000
`define NWIDTH 6'b010100
`define BLOCKWIDTH 4'b0111
`define DDRSIZEWIDTH 6'b011000
`define RAMSIZEWIDTH 4'b0111
`define START 1'b0 //0
`define SETUP 2'b01 //1
`define FIRST 3'b010 //2
`define MODE0_SETUP 3'b011 //3
`define MODE0_WAIT 4'b0100 //4
`define MODE0 4'b0101 //5
`define MODE1_SETUP 4'b0110 //6
`define MODE1_WAIT 4'b0111 //7
`define MODE1 5'b01000 //8
`define MODE2_SETUP 5'b01001 //9
`define MODE2_WAIT 5'b01010 //10
`define MODE2 5'b01011 //11
`define MODE3_SETUP 5'b01100 //12
`define MODE3_WAIT 5'b01101 //13
`define MODE3 5'b01110 //14
`define STALL 5'b01111 //15
`define STALL_WAIT 6'b010000 //16
`define WAIT 6'b010001 //17
`define FINAL_WRITE 6'b010010 //18
`define FINAL_WAIT 6'b010011 //19
`define IDLE 6'b010100 //20
`define LAST_SETUP 6'b010101 //21
`define LAST_SETUP_WAIT 6'b010110 //22
`define LAST 6'b010111 //23
`define LAST_WAIT 6'b011000 //24
`define MEM_IDLE 1'b0 //0
`define MEM_WRITE 2'b01 //1
`define MEM_WRITE_WAIT 3'b010 //2
`define MEM_CHECK_DONE 3'b011 //3
`define MEM_READ 4'b0100 //4
`define MEM_READ_WAIT 4'b0101 //5
`define MEM_DONE 4'b0110 //6
`define MEM_WAIT_DONE 4'b0111 //7

module MarshallerController (clk, start, done, input_N, offset,
	comp_start, block_m, block_n, loop, mode, comp_done, cur_mem_sel, left_mem_sel,
	dtu_write_req, dtu_read_req, dtu_mem_addr, dtu_ram_addr, dtu_size, dtu_ack, dtu_done, left_sel);


input clk;
input start;
output done;
input [`NWIDTH-1:0] input_N;
input [`DDRSIZEWIDTH-1:0] offset;

// for computation section
output comp_start;
output [`BLOCKWIDTH-1:0] block_m, block_n, loop;
output [1:0] mode;
input comp_done;
output cur_mem_sel, left_mem_sel;

// for data marshaller section
output dtu_write_req,	dtu_read_req;
output [`DDRSIZEWIDTH-1:0] dtu_mem_addr;
output [`RAMSIZEWIDTH-1:0] dtu_ram_addr;
output [`BLOCKWIDTH-1:0] dtu_size;
input dtu_ack, dtu_done;
output left_sel;

reg [4:0] cur_state, next_state;
reg [`NWIDTH-1:0] comp_N, N, mcount, ncount, Ndivk, mem_N;
reg [1:0] mode;
reg [`BLOCKWIDTH-1:0] block_m, block_n, loop, read_n;
reg [`BLOCKWIDTH-1:0] write_n, write_n_buf;
reg left_mem_sel, cur_mem_sel, no_left_switch;

reg [3:0] cur_mem_state, next_mem_state;
reg [`RAMSIZEWIDTH-1:0] ram_addr;
reg [`DDRSIZEWIDTH-1:0] mem_addr;
reg [`DDRSIZEWIDTH-1:0] mem_base, mem_top, mem_write, mem_left, mem_cur;
reg [`DDRSIZEWIDTH-1:0] mem_write_buf;
reg [`BLOCKWIDTH-1:0] mem_count;
reg [1:0] mem_read;
reg [`BLOCKWIDTH-1:0] mem_write_size, mem_write_size_buf, mem_read_size;
wire mem_done;

assign done = (cur_state == `IDLE);
assign dtu_ram_addr = ram_addr;
assign dtu_mem_addr = mem_addr;
assign dtu_size = (cur_mem_state == `MEM_WRITE) ? mem_write_size : mem_read_size;
assign comp_start = (cur_state == `MODE0)||(cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||(cur_state == `FIRST)||(cur_state == `LAST);
assign dtu_write_req = (cur_mem_state == `MEM_WRITE);
assign dtu_read_req = (cur_mem_state == `MEM_READ);
assign mem_done = (cur_mem_state == `MEM_DONE)&&(dtu_done == 1'b1);
assign left_sel = mem_read == 2'b01 && (cur_mem_state == `MEM_READ || cur_mem_state == `MEM_READ_WAIT || cur_mem_state == `MEM_WAIT_DONE);

// FSM to produce memory instructions to DTU
always @ (posedge clk)
begin
	case (cur_mem_state)
	`MEM_IDLE:
	begin
		if (cur_state == `START)
			next_mem_state <= `MEM_CHECK_DONE;
		else
			next_mem_state <= `MEM_IDLE;
	end
	`MEM_DONE:
	begin
		if (cur_state == `MODE0 || cur_state == `MODE1 || cur_state == `MODE2 || 
			cur_state == `MODE3 || cur_state == `FINAL_WRITE || cur_state == `LAST_SETUP)
			next_mem_state <= `MEM_WRITE;
		else if (cur_state == `FIRST)
			next_mem_state <= `MEM_CHECK_DONE;
		else
			next_mem_state <= `MEM_DONE;
	end
	`MEM_WRITE:
	begin
		next_mem_state <= `MEM_WRITE_WAIT;
	end
	`MEM_WRITE_WAIT:
	begin
		if (dtu_ack == 1'b1)
		begin
			if (mem_count == write_n)
				next_mem_state <= `MEM_WAIT_DONE;
			else 
				next_mem_state <= `MEM_WRITE;
		end
		else
			next_mem_state <= `MEM_WRITE_WAIT;
	end
	`MEM_WAIT_DONE:
	begin
		if (dtu_done == 1'b1)
			next_mem_state <= `MEM_CHECK_DONE;
		else
			next_mem_state <= `MEM_WAIT_DONE;
	end
	`MEM_CHECK_DONE:
	begin
		if (mem_read == 2'b10)
			next_mem_state <= `MEM_DONE;
		else
			next_mem_state <= `MEM_READ;
	end
	`MEM_READ:
	begin
		next_mem_state <= `MEM_READ_WAIT;
	end
	`MEM_READ_WAIT:
	begin
		if (dtu_ack == 1'b1)
		begin
			if (mem_count == read_n)
				next_mem_state <= `MEM_WAIT_DONE;
			else
				next_mem_state <= `MEM_READ;
		end
		else
			next_mem_state <= `MEM_READ_WAIT;
	end
	default:
		next_mem_state <= `MEM_IDLE;
	endcase
end

always @ (posedge clk)
begin
	if (cur_mem_state == `MEM_DONE || cur_mem_state == `MEM_IDLE)
	begin
		ram_addr <= 7'b0;
		mem_addr <= mem_write;
		if (next_state == `LAST_WAIT || next_state == `FINAL_WAIT || next_state == `STALL)
			mem_read <= 2'b00;
		else if (next_state == `MODE0_SETUP || next_state == `SETUP || cur_state == `MODE0 || next_state == `LAST_SETUP_WAIT)
			mem_read <= 2'b01;
		else
			mem_read <= 2'b10;
		mem_count <= 7'b0;
	end
	else if (cur_mem_state == `MEM_CHECK_DONE)
	begin
		if (mem_read == 2'b10)
		begin
			mem_addr <= mem_left;
			read_n <= loop;
		end
		else
		begin
			mem_addr <= mem_cur;
			read_n <= block_n;
		end
		mem_read <= mem_read - 2'b01;
		mem_count <= 7'b0;
		ram_addr <= 7'b0;
	end
	else if (cur_mem_state == `MEM_WRITE || cur_mem_state == `MEM_READ)
	begin
		ram_addr <= ram_addr + `BLOCKMDIVK;
		mem_addr <= mem_addr + Ndivk;
		mem_count <= mem_count + 2'b01;
	end
	
end

// FSM to determine the block LU factorization algorithm
always @ (posedge clk)
begin
	case (cur_state)
	`START:
	begin
		next_state <= `SETUP;
	end
	`SETUP:
	begin
		next_state <= `WAIT;
	end
	`WAIT:
	begin
		if (mem_done == 1'b1)
			next_state <= `FIRST;
		else
			next_state <= `WAIT;

	end
	`FIRST:
	begin
		if (mcount < comp_N)
			next_state <= `MODE1_SETUP;
		else if (ncount < comp_N)
			next_state <= `MODE2_SETUP;
		else
			next_state <= `LAST_WAIT;
	end
	`MODE0_SETUP:
	begin
		next_state <= `MODE0_WAIT;
	end
	`MODE0_WAIT:
	begin
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `MODE0;
		else
			next_state <= `MODE0_WAIT;

	end
	`MODE0:
	begin
		if (mcount < comp_N)
			next_state <= `MODE1_SETUP;
		else if (ncount < comp_N)
			next_state <= `MODE2_SETUP;
		else
		begin
			next_state <= `LAST_WAIT;
		end
	end
	`MODE1_SETUP:
	begin
		next_state <= `MODE1_WAIT;
	end
	`MODE1_WAIT:
	begin
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `MODE1;
		else
			next_state <= `MODE1_WAIT;

	end
	`MODE1:
	begin
		if (mcount < comp_N)
			next_state <= `MODE1_SETUP;
		else if (ncount < comp_N)
			next_state <= `MODE2_SETUP;
		else if (comp_N <= `BLOCKN + `BLOCKN)
			next_state <= `STALL;
		else
			next_state <= `MODE0_SETUP;
	end
	`MODE2_SETUP:
	begin
		next_state <= `MODE2_WAIT;
	end
	`MODE2_WAIT:
	begin
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `MODE2;
		else
			next_state <= `MODE2_WAIT;
	end
	`MODE2:
	begin
		if (mcount < comp_N)
			next_state <= `MODE3_SETUP;
		else if (ncount < comp_N)
			next_state <= `MODE2_SETUP;
		else if (comp_N <= `BLOCKN + `BLOCKN)
			next_state <= `STALL;
		else
			next_state <= `MODE0_SETUP;
	end
	`MODE3_SETUP:
	begin
		next_state <= `MODE3_WAIT;
	end
	`MODE3_WAIT:
	begin
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `MODE3;
		else
			next_state <= `MODE3_WAIT;
	end
	`MODE3:
	begin
		if (mcount < comp_N)
			next_state <= `MODE3_SETUP;
		else if (ncount < comp_N)
			next_state <= `MODE2_SETUP;
		else if (comp_N <= `BLOCKN + `BLOCKN)
			next_state <= `STALL;
		else
			next_state <= `MODE0_SETUP;
	end
	`STALL:
		next_state <= `STALL_WAIT;
	`STALL_WAIT:
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `LAST_SETUP;
		else
			next_state <= `STALL_WAIT;
	`LAST_SETUP:
		next_state <= `LAST_SETUP_WAIT;
	`LAST_SETUP_WAIT:
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `LAST;
		else
			next_state <= `LAST_SETUP_WAIT;
	`LAST:
		next_state <= `LAST_WAIT;
	`LAST_WAIT:
		if (mem_done == 1'b1 && comp_done == 1'b1)
			next_state <= `FINAL_WRITE;
		else
			next_state <= `LAST_WAIT;
	`FINAL_WRITE:
		next_state <= `FINAL_WAIT;
	`FINAL_WAIT:
		if (mem_done == 1'b1)
			next_state <= `IDLE;
		else
			next_state <= `FINAL_WAIT;
	`IDLE:
		if (start)
			next_state <= `SETUP;
		else
			next_state <= `IDLE;
	default:
		next_state <= `START;
	endcase
end

always @ (posedge clk)
begin
	if (start)
	begin
		cur_state <= `START;
		cur_mem_state <= `MEM_IDLE;
	end
	else
	begin
		cur_state <= next_state;
		cur_mem_state <= next_mem_state;
	end
end

always @ (cur_state)
begin
	case (cur_state)
	`MODE1:
		mode = 2'b01;
	`MODE2:
		mode = 2'b10;
	`MODE3:
		mode = 2'b11;
	default:
		mode = 2'b00;
	endcase
end

always @ (posedge clk)
begin
	if (start)
	begin
		comp_N <= input_N;
		N <= input_N;
	end
	else if (next_state == `MODE0)
	begin
		comp_N <= comp_N - `BLOCKN;
	end

	Ndivk <= ((N+`BLOCKM-1)>>6)<<5;
	mem_N <= Ndivk<<6;

	if (start)
	begin
		mem_base <= offset;
		mem_top <= offset;
		mem_left <= offset;
		mem_cur <= offset;
	end
	else if (cur_state == `MODE0_SETUP)
	begin
		mem_base <= mem_base + mem_N+`MEMBLOCKN;
		mem_top <= mem_base + mem_N+`MEMBLOCKN;
		mem_cur <= mem_base + mem_N+`MEMBLOCKN;
		mem_left <= mem_base + mem_N+`MEMBLOCKN;
	end
	else if (cur_state == `MODE1_SETUP)
	begin
		mem_cur <= mem_cur + `MEMBLOCKM;
	end	
	else if (cur_state == `MODE3_SETUP)
	begin
		mem_cur <= mem_cur + `MEMBLOCKM;
		mem_left <= mem_left + `MEMBLOCKM;
	end
	else if (cur_state == `MODE2_SETUP)
	begin
		mem_cur <= mem_top + mem_N;
		mem_top <= mem_top + mem_N;
		mem_left <= mem_base;
	end

	if (cur_state == `SETUP)
	begin
		mem_write <= 24'b0;
		mem_write_buf <= 24'b0;
		mem_write_size <= `BLOCKMDIVK;
		mem_write_size_buf <= `BLOCKMDIVK;
		write_n <= block_n;
		write_n_buf <= block_n;
	end
	else if (cur_mem_state == `MEM_CHECK_DONE && mem_read == 0)
	begin
		mem_write <= mem_write_buf;
		mem_write_buf <= mem_cur;
		mem_write_size <= mem_write_size_buf;
		mem_write_size_buf <= mem_read_size;
		write_n <= write_n_buf;
		write_n_buf <= block_n;
	end

	mem_read_size <= `BLOCKMDIVK;

	if (start) begin
		loop <= `BLOCKN;
	end else if (next_state == `LAST) begin
		loop <= comp_N[8:0] - `BLOCKN;
	end

	if (cur_state == `MODE0_SETUP || cur_state == `MODE2_SETUP || start) begin
		mcount <= `BLOCKM;
	end else if (cur_state == `MODE1_SETUP || cur_state == `MODE3_SETUP) begin
		mcount <= mcount+`BLOCKM;
	end

	if (cur_state == `MODE0_SETUP || start) begin
		ncount <= `BLOCKN;
	end else if (cur_state == `MODE2_SETUP) begin
		ncount <= ncount+`BLOCKN;
	end

	if (mcount < comp_N) begin
		block_m <= `BLOCKM;
	end else begin
		block_m <= comp_N - mcount + `BLOCKM;
	end 

	if (ncount < comp_N) begin
		block_n <= `BLOCKN;
	end else begin
		block_n <= comp_N - ncount + `BLOCKN;
	end

	if (start) begin
		cur_mem_sel <= 1'b0;
	end else if ((cur_state == `MODE0)||(cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
		 (cur_state == `FIRST)||(cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP)||(cur_state == `LAST)) begin
		cur_mem_sel <= !cur_mem_sel;
	end 

	if (start) begin
		no_left_switch <= 1'b0;
	end else if ((cur_state == `MODE0)||(cur_state == `FIRST)) begin
		no_left_switch <= 1'b1;
	end else if ((cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
		 (cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP)) begin
		no_left_switch <= 1'b0;
	end

	if (start) begin
		left_mem_sel <= 1'b0;
	end else if (((cur_state == `MODE0)||(cur_state ==`MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
		 (cur_state == `FIRST)||(cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP))&&(no_left_switch == 1'b0)) begin
		left_mem_sel <= !left_mem_sel;
	end 
end

endmodule


//topoutputdelay = 1
//auto-generated LU.v
//datapath for computating LU factorization
//by Wei Zhang

`define rRAMSIZEWIDTH 7
`define cSETUP 4'b0000
`define cSTART 4'b0001
`define cFETCH_COL 4'b0010
`define cWAIT_COL 4'b0011
`define cFIND_REC 4'b0100
`define cMULT_COL 4'b0101
`define cUPDATE_J 4'b0110
`define cSTORE_MO 4'b0111
`define cMULT_SUB 4'b1000
`define cINCRE_I 4'b1001
`define cWAIT 4'b1010
`define cDONE 4'b1011
`define cSTORE_DIAG 4'b1100
`define cSTORE_DIAG2 4'b1101
`define cSTART_FETCH_ROW 4'b1110
`define cROW_WAIT 2'b00
`define cFETCH_ROW 2'b01
`define cDONE_FETCH_ROW 2'b10
`define cLOAD_ROW_INC_J 2'b11

`define PRECISION 7'b0100000
`define NUMPE 7'b0100000
`define PEWIDTH 4'b0101
`define BLOCKWIDTH 4'b0111
`define RAMWIDTH 12'b010000000000
`define RAMNUMBYTES 9'b010000000
`define RAMSIZEWIDTH 4'b0111
`define TOPSIZEWIDTH 5'b01100
`define TOPINPUTDELAY 3'b011
`define TOPOUTPUTDELAY 2'b01
`define MEMINPUTDELAY 3'b010
`define MEMOUTPUTDELAY 2'b01
`define TOPWIDTH 7'b0100000
 
module LU (clk, start, m, n, loop, mode, done, 
			curReadAddrMem, curReadDataMem, curWriteByteEnMem, curWriteDataMem, curWriteAddrMem, curWriteEnMem, curMemSel,
			leftWriteByteEnMem, leftWriteDataMem, leftWriteAddrMem, leftWriteEnMem, leftMemSel
);


input clk, start;
input[`BLOCKWIDTH-1:0] m, n, loop;
input[1:0] mode;
output done;
wire[`RAMWIDTH-1:0] curWriteData0, curWriteData1;
wire[`RAMSIZEWIDTH-1:0] curWriteAddr0, curReadAddr0, curWriteAddr1, curReadAddr1;
wire[`RAMWIDTH-1:0] curReadData0, curReadData1;
wire[`RAMNUMBYTES-1:0] curWriteByteEn0, curWriteByteEn1;
wire curWriteEn0, curWriteEn1;

input[`RAMWIDTH-1:0] curWriteDataMem;
output[`RAMWIDTH-1:0] curReadDataMem;
input[`RAMSIZEWIDTH-1:0] curWriteAddrMem, curReadAddrMem;
input[`RAMNUMBYTES-1:0] curWriteByteEnMem;
input curWriteEnMem;
input[`RAMWIDTH-1:0] leftWriteDataMem;
input[`RAMSIZEWIDTH-1:0] leftWriteAddrMem;
input[`RAMNUMBYTES-1:0] leftWriteByteEnMem;
input leftWriteEnMem;
input leftMemSel, curMemSel;

wire[`RAMWIDTH-1:0] curReadDataLU, curReadDataMem;
wire[`RAMWIDTH-1:0] curWriteDataLU, curWriteDataMem;
wire[`RAMSIZEWIDTH-1:0] curWriteAddrLU, curWriteAddrMem, curReadAddrLU, curReadAddrMem;
wire[`RAMNUMBYTES-1:0] curWriteByteEnLU, curWriteByteEnMem;
wire curWriteEnLU, curWriteEnMem;

reg[`RAMWIDTH-1:0] curReadData0Reg0;
reg[`RAMWIDTH-1:0] curReadData1Reg0;
reg[`RAMWIDTH-1:0] leftReadData0Reg0;
reg[`RAMWIDTH-1:0] leftReadData1Reg0;
reg[`RAMWIDTH-1:0] curWriteData0Reg0;
reg[`RAMWIDTH-1:0] curWriteData0Reg1;
reg[`RAMWIDTH-1:0] curWriteData1Reg0;
reg[`RAMWIDTH-1:0] curWriteData1Reg1;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] curReadAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] curReadAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr1Reg1;
reg[`RAMSIZEWIDTH-1:0] curReadAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] curReadAddr1Reg1;
reg[`RAMNUMBYTES-1:0] curWriteByteEn0Reg0;
reg[`RAMNUMBYTES-1:0] curWriteByteEn0Reg1;
reg[`RAMNUMBYTES-1:0] curWriteByteEn1Reg0;
reg[`RAMNUMBYTES-1:0] curWriteByteEn1Reg1;
reg curWriteEn0Reg0;
reg curWriteEn0Reg1;
reg curWriteEn1Reg0;
reg curWriteEn1Reg1;
reg[`RAMWIDTH-1:0] leftWriteData0Reg0;
reg[`RAMWIDTH-1:0] leftWriteData0Reg1;
reg[`RAMWIDTH-1:0] leftWriteData1Reg0;
reg[`RAMWIDTH-1:0] leftWriteData1Reg1;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr1Reg1;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr1Reg1;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn0Reg0;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn0Reg1;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn1Reg0;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn1Reg1;
reg leftWriteEn0Reg0;
reg leftWriteEn0Reg1;
reg leftWriteEn1Reg0;
reg leftWriteEn1Reg1;

reg[`PRECISION-1:0] multOperand;
reg[`PRECISION-1:0] diag;
wire[`PRECISION-1:0] recResult;
wire[`PRECISION-1:0] multA0;
wire[`PRECISION-1:0] multA1;
wire[`PRECISION-1:0] multA2;
wire[`PRECISION-1:0] multA3;
wire[`PRECISION-1:0] multA4;
wire[`PRECISION-1:0] multA5;
wire[`PRECISION-1:0] multA6;
wire[`PRECISION-1:0] multA7;
wire[`PRECISION-1:0] multA8;
wire[`PRECISION-1:0] multA9;
wire[`PRECISION-1:0] multA10;
wire[`PRECISION-1:0] multA11;
wire[`PRECISION-1:0] multA12;
wire[`PRECISION-1:0] multA13;
wire[`PRECISION-1:0] multA14;
wire[`PRECISION-1:0] multA15;
wire[`PRECISION-1:0] multA16;
wire[`PRECISION-1:0] multA17;
wire[`PRECISION-1:0] multA18;
wire[`PRECISION-1:0] multA19;
wire[`PRECISION-1:0] multA20;
wire[`PRECISION-1:0] multA21;
wire[`PRECISION-1:0] multA22;
wire[`PRECISION-1:0] multA23;
wire[`PRECISION-1:0] multA24;
wire[`PRECISION-1:0] multA25;
wire[`PRECISION-1:0] multA26;
wire[`PRECISION-1:0] multA27;
wire[`PRECISION-1:0] multA28;
wire[`PRECISION-1:0] multA29;
wire[`PRECISION-1:0] multA30;
wire[`PRECISION-1:0] multA31;
wire[`PRECISION-1:0] multResult0;
wire[`PRECISION-1:0] multResult1;
wire[`PRECISION-1:0] multResult2;
wire[`PRECISION-1:0] multResult3;
wire[`PRECISION-1:0] multResult4;
wire[`PRECISION-1:0] multResult5;
wire[`PRECISION-1:0] multResult6;
wire[`PRECISION-1:0] multResult7;
wire[`PRECISION-1:0] multResult8;
wire[`PRECISION-1:0] multResult9;
wire[`PRECISION-1:0] multResult10;
wire[`PRECISION-1:0] multResult11;
wire[`PRECISION-1:0] multResult12;
wire[`PRECISION-1:0] multResult13;
wire[`PRECISION-1:0] multResult14;
wire[`PRECISION-1:0] multResult15;
wire[`PRECISION-1:0] multResult16;
wire[`PRECISION-1:0] multResult17;
wire[`PRECISION-1:0] multResult18;
wire[`PRECISION-1:0] multResult19;
wire[`PRECISION-1:0] multResult20;
wire[`PRECISION-1:0] multResult21;
wire[`PRECISION-1:0] multResult22;
wire[`PRECISION-1:0] multResult23;
wire[`PRECISION-1:0] multResult24;
wire[`PRECISION-1:0] multResult25;
wire[`PRECISION-1:0] multResult26;
wire[`PRECISION-1:0] multResult27;
wire[`PRECISION-1:0] multResult28;
wire[`PRECISION-1:0] multResult29;
wire[`PRECISION-1:0] multResult30;
wire[`PRECISION-1:0] multResult31;
wire[`PRECISION-1:0] addA0;
wire[`PRECISION-1:0] addA1;
wire[`PRECISION-1:0] addA2;
wire[`PRECISION-1:0] addA3;
wire[`PRECISION-1:0] addA4;
wire[`PRECISION-1:0] addA5;
wire[`PRECISION-1:0] addA6;
wire[`PRECISION-1:0] addA7;
wire[`PRECISION-1:0] addA8;
wire[`PRECISION-1:0] addA9;
wire[`PRECISION-1:0] addA10;
wire[`PRECISION-1:0] addA11;
wire[`PRECISION-1:0] addA12;
wire[`PRECISION-1:0] addA13;
wire[`PRECISION-1:0] addA14;
wire[`PRECISION-1:0] addA15;
wire[`PRECISION-1:0] addA16;
wire[`PRECISION-1:0] addA17;
wire[`PRECISION-1:0] addA18;
wire[`PRECISION-1:0] addA19;
wire[`PRECISION-1:0] addA20;
wire[`PRECISION-1:0] addA21;
wire[`PRECISION-1:0] addA22;
wire[`PRECISION-1:0] addA23;
wire[`PRECISION-1:0] addA24;
wire[`PRECISION-1:0] addA25;
wire[`PRECISION-1:0] addA26;
wire[`PRECISION-1:0] addA27;
wire[`PRECISION-1:0] addA28;
wire[`PRECISION-1:0] addA29;
wire[`PRECISION-1:0] addA30;
wire[`PRECISION-1:0] addA31;
wire[`PRECISION-1:0] addResult0;
wire[`PRECISION-1:0] addResult1;
wire[`PRECISION-1:0] addResult2;
wire[`PRECISION-1:0] addResult3;
wire[`PRECISION-1:0] addResult4;
wire[`PRECISION-1:0] addResult5;
wire[`PRECISION-1:0] addResult6;
wire[`PRECISION-1:0] addResult7;
wire[`PRECISION-1:0] addResult8;
wire[`PRECISION-1:0] addResult9;
wire[`PRECISION-1:0] addResult10;
wire[`PRECISION-1:0] addResult11;
wire[`PRECISION-1:0] addResult12;
wire[`PRECISION-1:0] addResult13;
wire[`PRECISION-1:0] addResult14;
wire[`PRECISION-1:0] addResult15;
wire[`PRECISION-1:0] addResult16;
wire[`PRECISION-1:0] addResult17;
wire[`PRECISION-1:0] addResult18;
wire[`PRECISION-1:0] addResult19;
wire[`PRECISION-1:0] addResult20;
wire[`PRECISION-1:0] addResult21;
wire[`PRECISION-1:0] addResult22;
wire[`PRECISION-1:0] addResult23;
wire[`PRECISION-1:0] addResult24;
wire[`PRECISION-1:0] addResult25;
wire[`PRECISION-1:0] addResult26;
wire[`PRECISION-1:0] addResult27;
wire[`PRECISION-1:0] addResult28;
wire[`PRECISION-1:0] addResult29;
wire[`PRECISION-1:0] addResult30;
wire[`PRECISION-1:0] addResult31;
wire[`RAMWIDTH-1:0] leftReadData0, leftReadData1, leftWriteData0, leftWriteData1;
wire[`RAMSIZEWIDTH-1:0] leftWriteAddr0, leftWriteAddr1, leftReadAddr0, leftReadAddr1;
wire[`RAMNUMBYTES-1:0] leftWriteByteEn0, leftWriteByteEn1;
wire leftWriteEn0, leftWriteEn1;
wire[`RAMWIDTH-1:0] leftReadDataLU, leftWriteDataLU, leftWriteDataMem;
wire[`RAMSIZEWIDTH-1:0] leftWriteAddrLU, leftWriteAddrMem, leftReadAddrLU;
wire[`RAMNUMBYTES-1:0] leftWriteByteEnLU, leftWriteByteEnMem;
wire leftWriteEnLU, leftWriteEnMem;

wire[`PRECISION-1:0] topWriteData;
reg[`PRECISION-1:0] topWriteDataLU;
wire[`PRECISION-1:0] topReadData, topReadDataLU;
wire[`TOPSIZEWIDTH-1:0] topWriteAddr, topWriteAddrLU, topReadAddr, topReadAddrLU;
wire topWriteEn, topWriteEnLU;

reg[`PRECISION-1:0] topReadDataReg0;
reg[`PRECISION-1:0] topWriteDataReg0;
reg[`PRECISION-1:0] topWriteDataReg1;
reg[`PRECISION-1:0] topWriteDataReg2;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg0;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg1;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg2;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg0;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg1;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg2;
reg topWriteEnReg0;
reg topWriteEnReg1;
reg topWriteEnReg2;
wire[`RAMWIDTH-1:0] rcWriteData;
wire leftWriteSel, curWriteSel, topSourceSel;
wire diagEn;
wire[`PEWIDTH-1:0] topWriteSel;

wire MOSel;
wire MOEn;

// control block
LUControl conBlock (clk, start, m, n, loop, mode, done, 
                    curReadAddrLU, curWriteAddrLU, curWriteByteEnLU, curWriteEnLU, curWriteSel,
                    leftReadAddrLU, leftWriteAddrLU, leftWriteByteEnLU, leftWriteEnLU,  leftWriteSel,
                    topReadAddrLU, topWriteAddrLU, topWriteEnLU, topWriteSel, topSourceSel, diagEn, MOSel, MOEn);

// fp_div unit
//floating point divider here
fpu_div rec(.clock(clk), .n(32'h3F800000), .d(diag), .div(recResult));
// on-chip memory blocks that store the matrix to be LU factorized
// store current blocks data
ram currentBlock0 (curWriteByteEn0, clk, curWriteData0, curReadAddr0, curWriteAddr0, curWriteEn0, curReadData0 );
ram1 currentBlock1 (curWriteByteEn1, clk, curWriteData1, curReadAddr1, curWriteAddr1, curWriteEn1, curReadData1 );
// store left blocks data
ram2 leftBlock0(leftWriteByteEn0, clk, leftWriteData0, leftReadAddr0, leftWriteAddr0, leftWriteEn0, leftReadData0 );

ram3 leftBlock1(leftWriteByteEn1, clk, leftWriteData1, leftReadAddr1, leftWriteAddr1, leftWriteEn1, leftReadData1 );

// store top block data
top_ram topBlock(clk, topWriteData, topReadAddr, topWriteAddr, topWriteEn, topReadDataLU );

// processing elements that does the main computation of LU factorization
mult_add PE0 (clk, multA0, multOperand, addA0, multResult0, addResult0);
mult_add PE1 (clk, multA1, multOperand, addA1, multResult1, addResult1);
mult_add PE2 (clk, multA2, multOperand, addA2, multResult2, addResult2);
mult_add PE3 (clk, multA3, multOperand, addA3, multResult3, addResult3);
mult_add PE4 (clk, multA4, multOperand, addA4, multResult4, addResult4);
mult_add PE5 (clk, multA5, multOperand, addA5, multResult5, addResult5);
mult_add PE6 (clk, multA6, multOperand, addA6, multResult6, addResult6);
mult_add PE7 (clk, multA7, multOperand, addA7, multResult7, addResult7);
mult_add PE8 (clk, multA8, multOperand, addA8, multResult8, addResult8);
mult_add PE9 (clk, multA9, multOperand, addA9, multResult9, addResult9);
mult_add PE10 (clk, multA10, multOperand, addA10, multResult10, addResult10);
mult_add PE11 (clk, multA11, multOperand, addA11, multResult11, addResult11);
mult_add PE12 (clk, multA12, multOperand, addA12, multResult12, addResult12);
mult_add PE13 (clk, multA13, multOperand, addA13, multResult13, addResult13);
mult_add PE14 (clk, multA14, multOperand, addA14, multResult14, addResult14);
mult_add PE15 (clk, multA15, multOperand, addA15, multResult15, addResult15);
mult_add PE16 (clk, multA16, multOperand, addA16, multResult16, addResult16);
mult_add PE17 (clk, multA17, multOperand, addA17, multResult17, addResult17);
mult_add PE18 (clk, multA18, multOperand, addA18, multResult18, addResult18);
mult_add PE19 (clk, multA19, multOperand, addA19, multResult19, addResult19);
mult_add PE20 (clk, multA20, multOperand, addA20, multResult20, addResult20);
mult_add PE21 (clk, multA21, multOperand, addA21, multResult21, addResult21);
mult_add PE22 (clk, multA22, multOperand, addA22, multResult22, addResult22);
mult_add PE23 (clk, multA23, multOperand, addA23, multResult23, addResult23);
mult_add PE24 (clk, multA24, multOperand, addA24, multResult24, addResult24);
mult_add PE25 (clk, multA25, multOperand, addA25, multResult25, addResult25);
mult_add PE26 (clk, multA26, multOperand, addA26, multResult26, addResult26);
mult_add PE27 (clk, multA27, multOperand, addA27, multResult27, addResult27);
mult_add PE28 (clk, multA28, multOperand, addA28, multResult28, addResult28);
mult_add PE29 (clk, multA29, multOperand, addA29, multResult29, addResult29);
mult_add PE30 (clk, multA30, multOperand, addA30, multResult30, addResult30);
mult_add PE31 (clk, multA31, multOperand, addA31, multResult31, addResult31);

// connect to ports of the left blocks
assign leftWriteDataLU = (leftWriteSel == 1'b0) ? curReadDataLU : rcWriteData;
always @ (posedge clk)
begin
	if(leftMemSel == 1'b0)
	begin
		leftWriteData0Reg0 <= leftWriteDataMem;
		leftWriteAddr0Reg0 <= leftWriteAddrMem;
		leftWriteByteEn0Reg0 <= leftWriteByteEnMem;
		leftWriteEn0Reg0 <= leftWriteEnMem;
		leftWriteData1Reg0 <= leftWriteDataLU;
		leftWriteAddr1Reg0 <= leftWriteAddrLU;
		leftWriteByteEn1Reg0 <= leftWriteByteEnLU;
		leftWriteEn1Reg0 <= leftWriteEnLU;
	end
	else
	begin
		leftWriteData0Reg0 <= leftWriteDataLU;
		leftWriteAddr0Reg0 <= leftWriteAddrLU;
		leftWriteByteEn0Reg0 <= leftWriteByteEnLU;
		leftWriteEn0Reg0 <= leftWriteEnLU;
		leftWriteData1Reg0 <= leftWriteDataMem;
		leftWriteAddr1Reg0 <= leftWriteAddrMem;
		leftWriteByteEn1Reg0 <= leftWriteByteEnMem;
		leftWriteEn1Reg0 <= leftWriteEnMem;
	end
	leftReadAddr0Reg0 <= leftReadAddrLU;
	leftReadAddr1Reg0 <= leftReadAddrLU;
	leftWriteData0Reg1 <= leftWriteData0Reg0;
	leftWriteAddr0Reg1 <= leftWriteAddr0Reg0;
	leftReadAddr0Reg1 <= leftReadAddr0Reg0;
	leftWriteByteEn0Reg1 <= leftWriteByteEn0Reg0;
	leftWriteEn0Reg1 <= leftWriteEn0Reg0;
	leftWriteData1Reg1 <= leftWriteData1Reg0;
	leftWriteAddr1Reg1 <= leftWriteAddr1Reg0;
	leftReadAddr1Reg1 <= leftReadAddr1Reg0;
	leftWriteByteEn1Reg1 <= leftWriteByteEn1Reg0;
	leftWriteEn1Reg1 <= leftWriteEn1Reg0;
end
assign leftWriteData0 = leftWriteData0Reg1;
assign leftWriteAddr0 = leftWriteAddr0Reg1;
assign leftReadAddr0 = leftReadAddr0Reg1;
assign leftWriteByteEn0 = leftWriteByteEn0Reg1;
assign leftWriteEn0 = leftWriteEn0Reg1;
assign leftWriteData1 = leftWriteData1Reg1;
assign leftWriteAddr1 = leftWriteAddr1Reg1;
assign leftReadAddr1 = leftReadAddr1Reg1;
assign leftWriteByteEn1 = leftWriteByteEn1Reg1;
assign leftWriteEn1 = leftWriteEn1Reg1;

always @ (posedge clk)
begin
		leftReadData0Reg0 <= leftReadData0;
		leftReadData1Reg0 <= leftReadData1;
end
assign leftReadDataLU = (leftMemSel == 1'b0) ? leftReadData1Reg0 : leftReadData0Reg0;
// data feed to fp div unit
always @ (posedge clk)
begin
	if (diagEn == 1'b1)
	begin
			diag <= topReadData;
	end
end
// one of the inputs to the PE
always @ (posedge clk)
begin
	if (start == 1'b1)
		multOperand <= 0;
	else if (MOEn == 1'b1)
	begin
		if (MOSel == 1'b0)
			multOperand <= recResult;
		else
			multOperand <= topReadData;
	end
end

// connections to top block memory ports
always @ (topSourceSel or topWriteSel or curReadDataLU or addResult31 or addResult30 or addResult29 or addResult28 or addResult27 or addResult26 or addResult25 or addResult24 or addResult23 or addResult22 or addResult21 or addResult20 or addResult19 or addResult18 or addResult17 or addResult16 or addResult15 or addResult14 or addResult13 or addResult12 or addResult11 or addResult10 or addResult9 or addResult8 or addResult7 or addResult6 or addResult5 or addResult4 or addResult3 or addResult2 or addResult1 or addResult0)
begin
	if (topSourceSel == 1'b0)
		case (topWriteSel)
		0:
			topWriteDataLU = curReadDataLU[1023:992];
		1:
			topWriteDataLU = curReadDataLU[991:960];
		2:
			topWriteDataLU = curReadDataLU[959:928];
		3:
			topWriteDataLU = curReadDataLU[927:896];
		4:
			topWriteDataLU = curReadDataLU[895:864];
		5:
			topWriteDataLU = curReadDataLU[863:832];
		6:
			topWriteDataLU = curReadDataLU[831:800];
		7:
			topWriteDataLU = curReadDataLU[799:768];
		8:
			topWriteDataLU = curReadDataLU[767:736];
		9:
			topWriteDataLU = curReadDataLU[735:704];
		10:
			topWriteDataLU = curReadDataLU[703:672];
		11:
			topWriteDataLU = curReadDataLU[671:640];
		12:
			topWriteDataLU = curReadDataLU[639:608];
		13:
			topWriteDataLU = curReadDataLU[607:576];
		14:
			topWriteDataLU = curReadDataLU[575:544];
		15:
			topWriteDataLU = curReadDataLU[543:512];
		16:
			topWriteDataLU = curReadDataLU[511:480];
		17:
			topWriteDataLU = curReadDataLU[479:448];
		18:
			topWriteDataLU = curReadDataLU[447:416];
		19:
			topWriteDataLU = curReadDataLU[415:384];
		20:
			topWriteDataLU = curReadDataLU[383:352];
		21:
			topWriteDataLU = curReadDataLU[351:320];
		22:
			topWriteDataLU = curReadDataLU[319:288];
		23:
			topWriteDataLU = curReadDataLU[287:256];
		24:
			topWriteDataLU = curReadDataLU[255:224];
		25:
			topWriteDataLU = curReadDataLU[223:192];
		26:
			topWriteDataLU = curReadDataLU[191:160];
		27:
			topWriteDataLU = curReadDataLU[159:128];
		28:
			topWriteDataLU = curReadDataLU[127:96];
		29:
			topWriteDataLU = curReadDataLU[95:64];
		30:
			topWriteDataLU = curReadDataLU[63:32];
		31:
			topWriteDataLU = curReadDataLU[31:0];
		default:
			topWriteDataLU = curReadDataLU[`PRECISION-1:0];
		endcase
	else
		case (topWriteSel)
		0:
			topWriteDataLU = addResult31;
		1:
			topWriteDataLU = addResult30;
		2:
			topWriteDataLU = addResult29;
		3:
			topWriteDataLU = addResult28;
		4:
			topWriteDataLU = addResult27;
		5:
			topWriteDataLU = addResult26;
		6:
			topWriteDataLU = addResult25;
		7:
			topWriteDataLU = addResult24;
		8:
			topWriteDataLU = addResult23;
		9:
			topWriteDataLU = addResult22;
		10:
			topWriteDataLU = addResult21;
		11:
			topWriteDataLU = addResult20;
		12:
			topWriteDataLU = addResult19;
		13:
			topWriteDataLU = addResult18;
		14:
			topWriteDataLU = addResult17;
		15:
			topWriteDataLU = addResult16;
		16:
			topWriteDataLU = addResult15;
		17:
			topWriteDataLU = addResult14;
		18:
			topWriteDataLU = addResult13;
		19:
			topWriteDataLU = addResult12;
		20:
			topWriteDataLU = addResult11;
		21:
			topWriteDataLU = addResult10;
		22:
			topWriteDataLU = addResult9;
		23:
			topWriteDataLU = addResult8;
		24:
			topWriteDataLU = addResult7;
		25:
			topWriteDataLU = addResult6;
		26:
			topWriteDataLU = addResult5;
		27:
			topWriteDataLU = addResult4;
		28:
			topWriteDataLU = addResult3;
		29:
			topWriteDataLU = addResult2;
		30:
			topWriteDataLU = addResult1;
		31:
			topWriteDataLU = addResult0;
		default:
			topWriteDataLU = addResult0;
		endcase
end

always @ (posedge clk)
begin
	topWriteDataReg0 <= topWriteDataLU;
	topReadAddrReg0 <= topReadAddrLU;
	topWriteAddrReg0 <= topWriteAddrLU;
	topWriteEnReg0 <= topWriteEnLU;
	topWriteDataReg1 <= topWriteDataReg0;
	topReadAddrReg1 <= topReadAddrReg0;
	topWriteAddrReg1 <= topWriteAddrReg0;
	topWriteEnReg1 <= topWriteEnReg0;
	topWriteDataReg2 <= topWriteDataReg1;
	topReadAddrReg2 <= topReadAddrReg1;
	topWriteAddrReg2 <= topWriteAddrReg1;
	topWriteEnReg2 <= topWriteEnReg1;
end
assign topWriteData = topWriteDataReg2;
assign topReadAddr = topReadAddrReg2;
assign topWriteAddr = topWriteAddrReg2;
assign topWriteEn = topWriteEnReg2;
always @ (posedge clk)
begin
	topReadDataReg0 <= topReadDataLU;
end
assign topReadData = topReadDataReg0;

// connections to processing element
assign multA0 = leftReadDataLU[31:0];
assign multA1 = leftReadDataLU[63:32];
assign multA2 = leftReadDataLU[95:64];
assign multA3 = leftReadDataLU[127:96];
assign multA4 = leftReadDataLU[159:128];
assign multA5 = leftReadDataLU[191:160];
assign multA6 = leftReadDataLU[223:192];
assign multA7 = leftReadDataLU[255:224];
assign multA8 = leftReadDataLU[287:256];
assign multA9 = leftReadDataLU[319:288];
assign multA10 = leftReadDataLU[351:320];
assign multA11 = leftReadDataLU[383:352];
assign multA12 = leftReadDataLU[415:384];
assign multA13 = leftReadDataLU[447:416];
assign multA14 = leftReadDataLU[479:448];
assign multA15 = leftReadDataLU[511:480];
assign multA16 = leftReadDataLU[543:512];
assign multA17 = leftReadDataLU[575:544];
assign multA18 = leftReadDataLU[607:576];
assign multA19 = leftReadDataLU[639:608];
assign multA20 = leftReadDataLU[671:640];
assign multA21 = leftReadDataLU[703:672];
assign multA22 = leftReadDataLU[735:704];
assign multA23 = leftReadDataLU[767:736];
assign multA24 = leftReadDataLU[799:768];
assign multA25 = leftReadDataLU[831:800];
assign multA26 = leftReadDataLU[863:832];
assign multA27 = leftReadDataLU[895:864];
assign multA28 = leftReadDataLU[927:896];
assign multA29 = leftReadDataLU[959:928];
assign multA30 = leftReadDataLU[991:960];
assign multA31 = leftReadDataLU[1023:992];

assign addA0 = curReadDataLU[31:0];
assign addA1 = curReadDataLU[63:32];
assign addA2 = curReadDataLU[95:64];
assign addA3 = curReadDataLU[127:96];
assign addA4 = curReadDataLU[159:128];
assign addA5 = curReadDataLU[191:160];
assign addA6 = curReadDataLU[223:192];
assign addA7 = curReadDataLU[255:224];
assign addA8 = curReadDataLU[287:256];
assign addA9 = curReadDataLU[319:288];
assign addA10 = curReadDataLU[351:320];
assign addA11 = curReadDataLU[383:352];
assign addA12 = curReadDataLU[415:384];
assign addA13 = curReadDataLU[447:416];
assign addA14 = curReadDataLU[479:448];
assign addA15 = curReadDataLU[511:480];
assign addA16 = curReadDataLU[543:512];
assign addA17 = curReadDataLU[575:544];
assign addA18 = curReadDataLU[607:576];
assign addA19 = curReadDataLU[639:608];
assign addA20 = curReadDataLU[671:640];
assign addA21 = curReadDataLU[703:672];
assign addA22 = curReadDataLU[735:704];
assign addA23 = curReadDataLU[767:736];
assign addA24 = curReadDataLU[799:768];
assign addA25 = curReadDataLU[831:800];
assign addA26 = curReadDataLU[863:832];
assign addA27 = curReadDataLU[895:864];
assign addA28 = curReadDataLU[927:896];
assign addA29 = curReadDataLU[959:928];
assign addA30 = curReadDataLU[991:960];
assign addA31 = curReadDataLU[1023:992];

// connections to ports of the current blocks
assign rcWriteData[31:0] = (curWriteSel == 0) ? multResult0 : addResult0;
assign rcWriteData[63:32] = (curWriteSel == 0) ? multResult1 : addResult1;
assign rcWriteData[95:64] = (curWriteSel == 0) ? multResult2 : addResult2;
assign rcWriteData[127:96] = (curWriteSel == 0) ? multResult3 : addResult3;
assign rcWriteData[159:128] = (curWriteSel == 0) ? multResult4 : addResult4;
assign rcWriteData[191:160] = (curWriteSel == 0) ? multResult5 : addResult5;
assign rcWriteData[223:192] = (curWriteSel == 0) ? multResult6 : addResult6;
assign rcWriteData[255:224] = (curWriteSel == 0) ? multResult7 : addResult7;
assign rcWriteData[287:256] = (curWriteSel == 0) ? multResult8 : addResult8;
assign rcWriteData[319:288] = (curWriteSel == 0) ? multResult9 : addResult9;
assign rcWriteData[351:320] = (curWriteSel == 0) ? multResult10 : addResult10;
assign rcWriteData[383:352] = (curWriteSel == 0) ? multResult11 : addResult11;
assign rcWriteData[415:384] = (curWriteSel == 0) ? multResult12 : addResult12;
assign rcWriteData[447:416] = (curWriteSel == 0) ? multResult13 : addResult13;
assign rcWriteData[479:448] = (curWriteSel == 0) ? multResult14 : addResult14;
assign rcWriteData[511:480] = (curWriteSel == 0) ? multResult15 : addResult15;
assign rcWriteData[543:512] = (curWriteSel == 0) ? multResult16 : addResult16;
assign rcWriteData[575:544] = (curWriteSel == 0) ? multResult17 : addResult17;
assign rcWriteData[607:576] = (curWriteSel == 0) ? multResult18 : addResult18;
assign rcWriteData[639:608] = (curWriteSel == 0) ? multResult19 : addResult19;
assign rcWriteData[671:640] = (curWriteSel == 0) ? multResult20 : addResult20;
assign rcWriteData[703:672] = (curWriteSel == 0) ? multResult21 : addResult21;
assign rcWriteData[735:704] = (curWriteSel == 0) ? multResult22 : addResult22;
assign rcWriteData[767:736] = (curWriteSel == 0) ? multResult23 : addResult23;
assign rcWriteData[799:768] = (curWriteSel == 0) ? multResult24 : addResult24;
assign rcWriteData[831:800] = (curWriteSel == 0) ? multResult25 : addResult25;
assign rcWriteData[863:832] = (curWriteSel == 0) ? multResult26 : addResult26;
assign rcWriteData[895:864] = (curWriteSel == 0) ? multResult27 : addResult27;
assign rcWriteData[927:896] = (curWriteSel == 0) ? multResult28 : addResult28;
assign rcWriteData[959:928] = (curWriteSel == 0) ? multResult29 : addResult29;
assign rcWriteData[991:960] = (curWriteSel == 0) ? multResult30 : addResult30;
assign rcWriteData[1023:992] = (curWriteSel == 0) ? multResult31 : addResult31;
assign curWriteDataLU = rcWriteData;

always @ (posedge clk)
begin
	if(curMemSel == 1'b0)
	begin
		curWriteData0Reg0 <= curWriteDataMem;
		curWriteAddr0Reg0 <= curWriteAddrMem;
		curReadAddr0Reg0 <= curReadAddrMem;
		curWriteByteEn0Reg0 <= curWriteByteEnMem;
		curWriteEn0Reg0 <= curWriteEnMem;
		curWriteData1Reg0 <= curWriteDataLU;
		curWriteAddr1Reg0 <= curWriteAddrLU;
		curReadAddr1Reg0 <= curReadAddrLU;
		curWriteByteEn1Reg0 <= curWriteByteEnLU;
		curWriteEn1Reg0 <= curWriteEnLU;
	end
	else
	begin
		curWriteData0Reg0 <= curWriteDataLU;
		curWriteAddr0Reg0 <= curWriteAddrLU;
		curReadAddr0Reg0 <= curReadAddrLU;
		curWriteByteEn0Reg0 <= curWriteByteEnLU;
		curWriteEn0Reg0 <= curWriteEnLU;
		curWriteData1Reg0 <= curWriteDataMem;
		curWriteAddr1Reg0 <= curWriteAddrMem;
		curReadAddr1Reg0 <= curReadAddrMem;
		curWriteByteEn1Reg0 <= curWriteByteEnMem;
		curWriteEn1Reg0 <= curWriteEnMem;
	end
	curWriteData0Reg1 <= curWriteData0Reg0;
	curWriteAddr0Reg1 <= curWriteAddr0Reg0;
	curReadAddr0Reg1 <= curReadAddr0Reg0;
	curWriteByteEn0Reg1 <= curWriteByteEn0Reg0;
	curWriteEn0Reg1 <= curWriteEn0Reg0;
	curWriteData1Reg1 <= curWriteData1Reg0;
	curWriteAddr1Reg1 <= curWriteAddr1Reg0;
	curReadAddr1Reg1 <= curReadAddr1Reg0;
	curWriteByteEn1Reg1 <= curWriteByteEn1Reg0;
	curWriteEn1Reg1 <= curWriteEn1Reg0;
end
assign curWriteData0 = curWriteData0Reg1;
assign curWriteAddr0 = curWriteAddr0Reg1;
assign curReadAddr0 = curReadAddr0Reg1;
assign curWriteByteEn0 = curWriteByteEn0Reg1;
assign curWriteEn0 = curWriteEn0Reg1;
assign curWriteData1 = curWriteData1Reg1;
assign curWriteAddr1 = curWriteAddr1Reg1;
assign curReadAddr1 = curReadAddr1Reg1;
assign curWriteByteEn1 = curWriteByteEn1Reg1;
assign curWriteEn1 = curWriteEn1Reg1;

always @ (posedge clk)
begin
		curReadData0Reg0 <= curReadData0;
		curReadData1Reg0 <= curReadData1;
end
assign curReadDataMem = (curMemSel == 0) ? curReadData0Reg0 : curReadData1Reg0;
assign curReadDataLU = (curMemSel == 0) ? curReadData1Reg0 : curReadData0Reg0;
endmodule

module LUControl (clk, start_in, m_in, n_in, loop_in, mode_in, done,
					curReadAddr, curWriteAddr, curWriteByteEn, curWriteEn, curWriteSel, 
					leftReadAddr, leftWriteAddr, leftWriteByteEn, leftWriteEn,  leftWriteSel,
					topReadAddr, topWriteAddr, topWriteEn, topWriteSel, topSourceSel, diagEn, MOSel, MOEn);

input clk, start_in;
input[7-1:0] m_in, n_in, loop_in;
input[1:0] mode_in;
output done;

output[128-1:0] curWriteByteEn;
output[7-1:0] curWriteAddr, curReadAddr;
output curWriteEn;

output[128-1:0] leftWriteByteEn;
output[7-1:0] leftWriteAddr, leftReadAddr;
output leftWriteEn;

output[12-1:0] topWriteAddr, topReadAddr;
output topWriteEn;

output leftWriteSel, curWriteSel, topSourceSel, diagEn;
output[5-1:0] topWriteSel;

output MOSel;
output MOEn;

reg start;
reg[15:0]startDelay;
reg[7-1:0] m, n, stop, stop2, loop;
reg[1:0] mode;
reg[3:0] nextState, currentState;
reg[1:0] nextRowState, currentRowState;
reg startFetchRow, doneFetchRow, loadRow, writeRow;
reg updateCounter;

reg[7-1:0] i1, j;
reg[12-1:0] nextTopIdx, nextTopIdx2, curTopIdx, nextTopIdxCounter;
reg[2-1:0] topIdx, topIdxCounter, mdivk;
reg[7-1:0] diagIdx, leftIdx, msIdx;
reg[5-1:0] imodk, i1modk;
reg[7-1:0] diagIdxCounter, leftIdxCounter, msIdxCounter, readRowCounter, topWriteCounter;
reg[128-1:0] byteEn, i1modkByteEn;

reg done;

reg[128-1:0] curWriteByteEn;
reg[7-1:0] curWriteAddr, curReadAddr;
reg curWriteEn;

reg[128-1:0] leftWriteByteEn;
reg[7-1:0] leftWriteAddr, leftReadAddr;
reg leftWriteEn;

reg[12-1:0] topWriteAddr, topReadAddr;
reg topWriteEn;

reg leftWriteSel, curWriteSel, topSourceSel, diagEn;
reg[5-1:0] topWriteSel;

reg MOSel;
reg MOEn;

reg[7-1:0] counter;
reg[6-1:0] divCounter;

reg[128-1:0]writeByteEnDelay0; 
reg[128-1:0]writeByteEnDelay1; 
reg[128-1:0]writeByteEnDelay2; 
reg[128-1:0]writeByteEnDelay3; 
reg[128-1:0]writeByteEnDelay4; 
reg[128-1:0]writeByteEnDelay5; 
reg[128-1:0]writeByteEnDelay6; 
reg[128-1:0]writeByteEnDelay7; 
reg[128-1:0]writeByteEnDelay8; 
reg[128-1:0]writeByteEnDelay9; 
reg[128-1:0]writeByteEnDelay10; 
reg[128-1:0]writeByteEnDelay11; 
reg[128-1:0]writeByteEnDelay12; 
reg[128-1:0]writeByteEnDelay13; 
reg[128-1:0]writeByteEnDelay14; 
reg[128-1:0]writeByteEnDelay15; 
reg[128-1:0]writeByteEnDelay16; 
reg[128-1:0]writeByteEnDelay17; 
reg[128-1:0]writeByteEnDelay18; 
reg[128-1:0]writeByteEnDelay19; 
reg[128-1:0]writeByteEnDelay20; 
reg[128-1:0]writeByteEnDelay21; 
reg[128-1:0]writeByteEnDelay22; 
reg[128-1:0]writeByteEnDelay23; 
reg[128-1:0]writeByteEnDelay24; 
reg[128-1:0]writeByteEnDelay25; 
reg[128-1:0]writeByteEnDelay26; 
reg[128-1:0]writeByteEnDelay27; 
reg[128-1:0]writeByteEnDelay28; 
reg[128-1:0]writeByteEnDelay29; 
reg[128-1:0]writeByteEnDelay30; 
reg[128-1:0]writeByteEnDelay31; 

reg[7-1:0]curWriteAddrDelay0; 
reg[7-1:0]curWriteAddrDelay1; 
reg[7-1:0]curWriteAddrDelay2; 
reg[7-1:0]curWriteAddrDelay3; 
reg[7-1:0]curWriteAddrDelay4; 
reg[7-1:0]curWriteAddrDelay5; 
reg[7-1:0]curWriteAddrDelay6; 
reg[7-1:0]curWriteAddrDelay7; 
reg[7-1:0]curWriteAddrDelay8; 
reg[7-1:0]curWriteAddrDelay9; 
reg[7-1:0]curWriteAddrDelay10; 
reg[7-1:0]curWriteAddrDelay11; 
reg[7-1:0]curWriteAddrDelay12; 
reg[7-1:0]curWriteAddrDelay13; 
reg[7-1:0]curWriteAddrDelay14; 
reg[7-1:0]curWriteAddrDelay15; 
reg[7-1:0]curWriteAddrDelay16; 
reg[7-1:0]curWriteAddrDelay17; 
reg[7-1:0]curWriteAddrDelay18; 
reg[7-1:0]curWriteAddrDelay19; 
reg[7-1:0]curWriteAddrDelay20; 
reg[7-1:0]curWriteAddrDelay21; 
reg[7-1:0]curWriteAddrDelay22; 
reg[7-1:0]curWriteAddrDelay23; 
reg[7-1:0]curWriteAddrDelay24; 
reg[7-1:0]curWriteAddrDelay25; 
reg[7-1:0]curWriteAddrDelay26; 
reg[7-1:0]curWriteAddrDelay27; 
reg[7-1:0]curWriteAddrDelay28; 
reg[7-1:0]curWriteAddrDelay29; 
reg[7-1:0]curWriteAddrDelay30; 
reg[7-1:0]curWriteAddrDelay31; 

reg[7-1:0]curReadAddrDelay0; 
reg[7-1:0]curReadAddrDelay1; 
reg[7-1:0]curReadAddrDelay2; 
reg[7-1:0]curReadAddrDelay3; 
reg[7-1:0]curReadAddrDelay4; 
reg[7-1:0]curReadAddrDelay5; 
reg[7-1:0]curReadAddrDelay6; 
reg[7-1:0]curReadAddrDelay7; 
reg[7-1:0]curReadAddrDelay8; 
reg[7-1:0]curReadAddrDelay9; 
reg[7-1:0]curReadAddrDelay10; 
reg[7-1:0]curReadAddrDelay11; 

reg[32-1:0]leftWriteEnDelay; 
reg[32-1:0]curWriteEnDelay; 
reg[5-1:0]leftWriteSelDelay; 
reg[16-1:0]curWriteSelDelay; 
reg[7-1:0]leftReadAddrDelay0; 
reg[12-1:0]topWriteAddrDelay0; 
reg[12-1:0]topWriteAddrDelay1; 
reg[12-1:0]topWriteAddrDelay2; 
reg[12-1:0]topWriteAddrDelay3; 
reg[12-1:0]topWriteAddrDelay4; 
reg[12-1:0]topWriteAddrDelay5; 
reg[12-1:0]topWriteAddrDelay6; 
reg[12-1:0]topWriteAddrDelay7; 
reg[12-1:0]topWriteAddrDelay8; 
reg[12-1:0]topWriteAddrDelay9; 
reg[12-1:0]topWriteAddrDelay10; 
reg[12-1:0]topWriteAddrDelay11; 
reg[12-1:0]topWriteAddrDelay12; 
reg[12-1:0]topWriteAddrDelay13; 
reg[12-1:0]topWriteAddrDelay14; 
reg[12-1:0]topWriteAddrDelay15; 
reg[12-1:0]topWriteAddrDelay16; 
reg[12-1:0]topWriteAddrDelay17; 
reg[12-1:0]topWriteAddrDelay18; 
reg[12-1:0]topWriteAddrDelay19; 
reg[12-1:0]topWriteAddrDelay20; 
reg[12-1:0]topWriteAddrDelay21; 
reg[12-1:0]topWriteAddrDelay22; 
reg[12-1:0]topWriteAddrDelay23; 
reg[12-1:0]topWriteAddrDelay24; 
reg[12-1:0]topWriteAddrDelay25; 
reg[12-1:0]topWriteAddrDelay26; 
reg[12-1:0]topWriteAddrDelay27; 
reg[12-1:0]topWriteAddrDelay28; 
reg[12-1:0]topWriteAddrDelay29; 
reg[12-1:0]topWriteAddrDelay30; 
reg[12-1:0]topWriteAddrDelay31; 

reg [32-1:0]topWriteEnDelay;
reg [5-1:0]topSourceSelDelay;
reg[5-1:0]topWriteSelDelay0; 
reg[5-1:0]topWriteSelDelay1; 
reg[5-1:0]topWriteSelDelay2; 
reg[5-1:0]topWriteSelDelay3; 
reg[5-1:0]topWriteSelDelay4; 
reg[5-1:0]topWriteSelDelay5; 
reg[5-1:0]topWriteSelDelay6; 
reg[5-1:0]topWriteSelDelay7; 
reg[5-1:0]topWriteSelDelay8; 
reg[5-1:0]topWriteSelDelay9; 
reg[5-1:0]topWriteSelDelay10; 
reg[5-1:0]topWriteSelDelay11; 
reg[5-1:0]topWriteSelDelay12; 
reg[5-1:0]topWriteSelDelay13; 
reg[5-1:0]topWriteSelDelay14; 
reg[5-1:0]topWriteSelDelay15; 
reg[5-1:0]topWriteSelDelay16; 
reg[5-1:0]topWriteSelDelay17; 
reg[5-1:0]topWriteSelDelay18; 
reg[5-1:0]topWriteSelDelay19; 
reg[5-1:0]topWriteSelDelay20; 
reg[5-1:0]topWriteSelDelay21; 
reg[5-1:0]topWriteSelDelay22; 
reg[5-1:0]topWriteSelDelay23; 
reg[5-1:0]topWriteSelDelay24; 
reg[5-1:0]topWriteSelDelay25; 
reg[5-1:0]topWriteSelDelay26; 
reg[5-1:0]topWriteSelDelay27; 
reg[5-1:0]topWriteSelDelay28; 
reg[5-1:0]topWriteSelDelay29; 
reg[5-1:0]topWriteSelDelay30; 
reg[5-1:0]topWriteSelDelay31; 

reg [6-1:0]diagEnDelay;
reg[6-1:0]MOEnDelay;
reg [7-1:0]waitCycles;

// register store m, n and mdivk value
always @ (posedge clk)
begin
	if (start_in == 1'b1)
	begin
		n <= n_in;
		m <= m_in;
		loop <= loop_in;
		mode <= mode_in;
	end
	if (mode[0] == 1'b0 && m == loop)
		stop <= loop;
	else
		stop <= loop+1'b1;
	stop2 <= loop;
	startDelay[0] <= start_in;
	startDelay[1] <= startDelay[0];
	startDelay[2] <= startDelay[1];
	startDelay[3] <= startDelay[2];
	startDelay[4] <= startDelay[3];
	startDelay[5] <= startDelay[4];
	startDelay[6] <= startDelay[5];
	startDelay[7] <= startDelay[6];
	startDelay[8] <= startDelay[7];
	startDelay[9] <= startDelay[8];
	startDelay[10] <= startDelay[9];
	startDelay[11] <= startDelay[10];
	startDelay[12] <= startDelay[11];
	startDelay[13] <= startDelay[12];
	startDelay[14] <= startDelay[13];
	startDelay[15] <= startDelay[14];
	start <= startDelay[15];
	mdivk <= (m+32-1)>>5;
end

// registers that store values that are used in FSM, dependent on i and/or j
always @ (posedge clk)
begin
	if (start == 1'b1)
		topIdx <= 2'b00; //offset1divk;
	else if (currentState == `cINCRE_I && i1modk == 32-1 && mode[0] == 1'b0)
		topIdx <= topIdx + 1'b1;
		
	if (start == 1'b1)
		diagIdx <= 7'b0000000;
	else if (currentState == `cSTORE_DIAG && mode == 2'b01)
		diagIdx <= 2;	else if (currentState == `cINCRE_I)
	begin
		if ((imodk == 32-1 && mode == 2'b00) || (i1modk == 32-1 && mode == 2'b01))
			diagIdx <= diagIdx + 2 + 1;
		else
			diagIdx <= diagIdx + 2;
	end
	
	if (start == 1'b1)
		leftIdx <= 7'b0000000;
	else if (currentState == `cINCRE_I)
	begin
		if (i1modk == 32-1 && mode[0] == 1'b0)
			leftIdx <= leftIdx + 2 + 1;
		else
			leftIdx <= leftIdx + 2;
	end

	if (start == 1'b1)
		msIdx <= 7'b0000000;
	else if (currentState == `cUPDATE_J)
		if (mode[1] == 1'b0)
			msIdx <= leftIdx + 2;
		else
			msIdx <= topIdx;
	else if (nextRowState == `cLOAD_ROW_INC_J)
		msIdx <= msIdx + 2;

	if (start == 1'b1)
		imodk <= 5'b00000;
	else if (currentState == `cINCRE_I)
	begin
		if (imodk == 32-1)
		imodk <= 5'b00000;
		else
			imodk <= imodk + 1'b1;
	end
	
	if (start == 1'b1)
		i1modk <= 5'b00001;
	else if (currentState == `cINCRE_I)
	begin
		if (i1modk == 32-1)
		i1modk <= 5'b00000;
		else
			i1modk <= i1modk + 1'b1;
	end
	
	if (start == 1'b1)
		nextTopIdx <= 12'b000000000000;
	else if (currentState == `cINCRE_I)
		if (mode[1] == 0)
			nextTopIdx <= nextTopIdx + n + 1;
		else
			nextTopIdx <= nextTopIdx + n;
 nextTopIdx2 <= nextTopIdx + n + 1;

	if (start == 1'b1)
		curTopIdx <= 12'b000000000001;
	else if (currentState == `cUPDATE_J)
   if (mode[1] == 1'b0)
		  curTopIdx <= nextTopIdx+1;
   else
		  curTopIdx <= nextTopIdx;
	else if (nextRowState == `cLOAD_ROW_INC_J)
		curTopIdx <= curTopIdx + 1;
	
	if (start == 1'b1)
		i1 <= 7'b0000001;
	else if (currentState == `cINCRE_I)
	   i1 <= i1 + 1;

	if (start == 1'b1)
		j <= 7'b0000000;
	else if (currentState == `cUPDATE_J)
		if (mode[1] == 1'b0)
			j <= i1;
		else
		j <= 7'b0000000;
	else if (currentRowState == `cLOAD_ROW_INC_J)
		j <= j + 1;

// compute cycles of delay in FSM
	if (currentState == `cSTORE_MO)
		waitCycles <= 32-1;
	else if (currentState == `cINCRE_I)
	begin
		if (i1 == stop-1)
			if (mode[1] == 1'b1)
				waitCycles <= 32-1 + 6 - 3;
			else
				waitCycles <= waitCycles + 5 - 2;
		else if (mode == 2'b01 && waitCycles < 32-1 - (16-1) - 4)
			waitCycles <= 32-1 - (16-1) - 4;
		else if (mode == 2'b10 && i1modk == 32-1)
			waitCycles <= 32-1 + 6 - 3;
		else if (mode == 2'b00)
			waitCycles <= waitCycles + 6 ;
	end
else if (waitCycles >7'b0000000)
		waitCycles <= waitCycles - 1;

end

// determining next state of main FSM
always @ (currentState or start or mode or m or n or counter or mdivk or topIdxCounter or doneFetchRow or divCounter or j or stop2 or waitCycles or stop or i1)
begin
	case (currentState)
	`cSETUP:
	begin
		if (start == 1'b1)
			nextState = `cSTART;
		else
			nextState = `cSETUP;
		updateCounter = 1'b1;
	end
	`cSTART:
	begin
		if (mode == 2'b00)
		begin
			if (m == 1 && n == 1)
				nextState = `cDONE;
			else
				nextState = `cFETCH_COL;
		end
		else if (mode == 2'b01)
			nextState = `cSTORE_DIAG;
		else if (mode == 2'b10)
			nextState = `cSTART_FETCH_ROW;
		else
			nextState = `cUPDATE_J;
		updateCounter = 1'b1;
	end
	`cSTART_FETCH_ROW:
	begin
		if (counter == 5+6-1)
   begin
		  if (mode == 2'b00)
			  nextState = `cSTORE_DIAG;
		  else
			  nextState = `cUPDATE_J;
	  end
		else
			nextState = `cSTART_FETCH_ROW;
		updateCounter = 1'b0;
	end
	`cFETCH_COL:
		if (counter >= mdivk-1)
		begin
			if (mode == 2'b00 && counter < 5)
			begin
				nextState = `cWAIT_COL;
				updateCounter = 1'b0;
			end
			else
			begin
				if (mode == 2'b00)
					nextState = `cSTART_FETCH_ROW;
				else
					nextState = `cFIND_REC;
				updateCounter = 1'b1;
			end
		end
		else
		begin
			nextState = `cFETCH_COL;
			updateCounter = 1'b0;
		end
	`cWAIT_COL:
		if (counter >= 5)
		begin
			if (mode == 0)
				nextState = `cSTART_FETCH_ROW;
			else
				nextState = `cFIND_REC;
			updateCounter = 1;
		end
		else
		begin
			nextState = `cWAIT_COL;
			updateCounter = 0;
		end
	`cSTORE_DIAG:
	begin
		if (mode == 0)
			nextState =  `cFIND_REC;
		else
			nextState =  `cFETCH_COL;
		updateCounter = 1;
	end
	`cFIND_REC:
		if (divCounter == 56)
		begin
			if (mode == 0)
				nextState = `cMULT_COL;
			else
				nextState = `cSTORE_DIAG2;
			updateCounter = 1;
		end
		else
		begin
			nextState = `cFIND_REC;
			updateCounter = 0;
		end
	`cSTORE_DIAG2:
	begin
		nextState =  `cMULT_COL;
		updateCounter = 1;
	end
	`cMULT_COL:
		if (topIdxCounter == mdivk-1)
		begin
			nextState = `cUPDATE_J;
			updateCounter = 0;
		end
		else
		begin
			nextState = `cMULT_COL;
			updateCounter = 0;
		end
	`cUPDATE_J:
		if ((mode[1] == 1 || counter >= 16-1) && doneFetchRow == 1)
		begin
			nextState = `cSTORE_MO;
			updateCounter = 1;
		end
		else
		begin
			nextState = `cUPDATE_J;
			updateCounter = 0;
		end
	`cSTORE_MO:
	begin
		if (j == stop2)
		begin
			if (counter == mdivk-1+5-2)
				nextState = `cDONE;
			else
				nextState = `cSTORE_MO;
			updateCounter = 0;
		end
		else
		begin
			nextState =  `cMULT_SUB;
			updateCounter = 1;
		end
	end
	`cMULT_SUB:
		if (topIdxCounter == mdivk-1)
		begin
			if (j == n-1)
				nextState = `cINCRE_I;
			else
				nextState = `cMULT_SUB;
			updateCounter = 1;
		end
		else
		begin
			nextState = `cMULT_SUB;
			updateCounter = 0;
		end
	`cINCRE_I:
	begin
		nextState = `cWAIT;
		updateCounter = 1;
	end
	`cWAIT:
		if (waitCycles == 0)
		begin
			if (i1 == stop)
				nextState = `cDONE;
			else if (mode == 0)
				nextState = `cSTORE_DIAG;
			else if (mode == 1)
				nextState = `cFIND_REC;
			else
				nextState = `cUPDATE_J;
			updateCounter = 1;
		end
		else
		begin
			nextState = `cWAIT;
			updateCounter = 0;
		end
	`cDONE:
	begin
		nextState = `cDONE;
		updateCounter = 0;
	end
	default:
	begin
		nextState = `cSETUP;
		updateCounter = 1;
	end
	endcase
end

always @ (currentRowState or currentState or nextState or i1 or topIdxCounter or mdivk or msIdxCounter or readRowCounter or j or n or mode)
begin
	if (currentRowState == `cDONE_FETCH_ROW)
		doneFetchRow = 1;
	else
		doneFetchRow = 0;
		if((nextState == `cSTART_FETCH_ROW && currentState != `cSTART_FETCH_ROW && i1 == 1))
		startFetchRow = 1;
	else
		startFetchRow = 0;
	if (currentState == `cMULT_SUB && topIdxCounter+2 == mdivk)
		loadRow = 1;
	else
		loadRow = 0;
	writeRow = (msIdxCounter == readRowCounter)&&(currentState==`cMULT_SUB)&&(j!=n)&&(mode[0] == 0);
end

// second FSM that controls the control signals to temp_top block
always @ (currentRowState or nextTopIdxCounter or n or startFetchRow or loadRow or topIdx or mdivk or nextState)
begin
	case (currentRowState)
	`cFETCH_ROW:
		if (nextTopIdxCounter == n-1)
			nextRowState = `cDONE_FETCH_ROW;
		else
			nextRowState = `cFETCH_ROW;
	`cDONE_FETCH_ROW:
		if (startFetchRow == 1)
			nextRowState = `cFETCH_ROW;
		else if (loadRow == 1 || (topIdx+1 == mdivk && nextState == `cMULT_SUB))
			nextRowState = `cLOAD_ROW_INC_J;
		else
			nextRowState = `cDONE_FETCH_ROW;
	`cLOAD_ROW_INC_J:
		if (topIdx+1 == mdivk && nextState == `cMULT_SUB)
			nextRowState = `cLOAD_ROW_INC_J;
		else
			nextRowState = `cDONE_FETCH_ROW;
	default:
		nextRowState = `cDONE_FETCH_ROW;
	endcase
end

// address counters
always @ (posedge clk)
begin
	if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
		topIdxCounter <= topIdx;
	else
		topIdxCounter <= topIdxCounter + 1;

	if (updateCounter == 1)
		diagIdxCounter <= diagIdx;
	else
		diagIdxCounter <= diagIdxCounter + 1;

	if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
		msIdxCounter <= msIdx;
	else
		msIdxCounter <= msIdxCounter + 1;

	if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
		leftIdxCounter <= leftIdx;
	else
		leftIdxCounter <= leftIdxCounter + 1;
	
	if (currentState == `cFETCH_COL || currentState == `cSTORE_MO)
		topWriteCounter <= i1;
	else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
		topWriteCounter <= topWriteCounter + 1;

	if (currentState == `cSTART)
		nextTopIdxCounter <= nextTopIdx;
 else if (currentState == `cSTORE_MO)
		if (mode[1] == 0)
			nextTopIdxCounter <= nextTopIdx + n + 1;
		else
			nextTopIdxCounter <= nextTopIdx + n;
	else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
		nextTopIdxCounter <= nextTopIdxCounter + 1;

	if (currentState == `cSTART)
			readRowCounter <= 0; //offsetdivk;
	else if (currentState == `cSTORE_MO)
		if (mode[1] == 0)
			readRowCounter <= leftIdx + 2;
		else
			readRowCounter <= topIdx;
	else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
	   readRowCounter <= readRowCounter + 2;

	if (updateCounter == 1)
		counter <= 0;
	else
		counter <= counter + 1;

	if (currentState == `cSTORE_DIAG || currentState == `cSTORE_DIAG2)
		divCounter <= 0;
	else if (divCounter < 56)
		divCounter <= divCounter + 1;

	case (i1modk) 
		5'b00000: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00000<<2'b10);
		end
		5'b00001: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00001<<2'b10);
		end
		5'b00010: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00010<<2'b10);
		end
		5'b00011: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00011<<2'b10);
		end
		5'b00100: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00100<<2'b10);
		end
		5'b00101: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00101<<2'b10);
		end
		5'b00110: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00110<<2'b10);
		end
		5'b00111: begin
			i1modkByteEn <= ~(128'b0) >> (5'b00111<<2'b10);
		end
		5'b01000: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01000<<2'b10);
		end
		5'b01001: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01001<<2'b10);
		end
		5'b01010: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01010<<2'b10);
		end
		5'b01011: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01011<<2'b10);
		end
		5'b01100: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01100<<2'b10);
		end
		5'b01101: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01101<<2'b10);
		end
		5'b01110: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01110<<2'b10);
		end
		5'b01111: begin
			i1modkByteEn <= ~(128'b0) >> (5'b01111<<2'b10);
		end
		5'b10000: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10000<<2'b10);
		end
		5'b10001: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10001<<2'b10);
		end
		5'b10010: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10010<<2'b10);
		end
		5'b10011: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10011<<2'b10);
		end
		5'b10100: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10100<<2'b10);
		end
		5'b10101: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10101<<2'b10);
		end
		5'b10110: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10110<<2'b10);
		end
		5'b10111: begin
			i1modkByteEn <= ~(128'b0) >> (5'b10111<<2'b10);
		end
		5'b11000: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11000<<2'b10);
		end
		5'b11001: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11001<<2'b10);
		end
		5'b11010: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11010<<2'b10);
		end
		5'b11011: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11011<<2'b10);
		end
		5'b11100: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11100<<2'b10);
		end
		5'b11101: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11101<<2'b10);
		end
		5'b11110: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11110<<2'b10);
		end
		5'b11111: begin
			i1modkByteEn <= ~(128'b0) >> (5'b11111<<2'b10);
		end
		default: begin
			i1modkByteEn <= ~(128'b0);
		end
	endcase
end

// compute Byte Enable
always @ (posedge clk)
begin
	if ((nextState == `cMULT_COL && currentState != `cMULT_COL) || (currentState == `cSTORE_MO) || currentRowState == `cLOAD_ROW_INC_J)
		byteEn <= i1modkByteEn;
	else
		byteEn <= 128'b11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111;
end

// update FSM state register
always @ (posedge clk)
begin
	if (start_in == 1'b1)
		currentState <= `cSETUP;
	else
		currentState <= nextState;
	if (start == 1'b1)
		currentRowState <= `cDONE_FETCH_ROW;
	else
		currentRowState <= nextRowState;
end

// delay register for control signals
// control signals are delayed to match latency of operations and/or memory access
always @ (posedge clk)
begin
	curReadAddrDelay0 <= curReadAddrDelay1;
	curReadAddrDelay1 <= curReadAddrDelay2;
	curReadAddrDelay2 <= curReadAddrDelay3;
	curReadAddrDelay3 <= curReadAddrDelay4;
	curReadAddrDelay4 <= curReadAddrDelay5;
	curReadAddrDelay5 <= curReadAddrDelay6;
	curReadAddrDelay6 <= curReadAddrDelay7;
	curReadAddrDelay7 <= curReadAddrDelay8;
	curReadAddrDelay8 <= curReadAddrDelay9;
	curReadAddrDelay9 <= curReadAddrDelay10;
	curReadAddrDelay10 <= curReadAddrDelay11;
	curReadAddrDelay11 <= msIdxCounter;
	
	curWriteAddrDelay0 <= curWriteAddrDelay1;
	curWriteAddrDelay1 <= curWriteAddrDelay2;
	curWriteAddrDelay2 <= curWriteAddrDelay3;
	curWriteAddrDelay3 <= curWriteAddrDelay4;
	if (currentState == `cFETCH_COL)
		curWriteAddrDelay4 <= diagIdxCounter;
	else
		curWriteAddrDelay4 <= curWriteAddrDelay5;
	curWriteAddrDelay5 <= curWriteAddrDelay6;
	curWriteAddrDelay6 <= curWriteAddrDelay7;
	curWriteAddrDelay7 <= curWriteAddrDelay8;
	curWriteAddrDelay8 <= curWriteAddrDelay9;
	curWriteAddrDelay9 <= curWriteAddrDelay10;
	curWriteAddrDelay10 <= curWriteAddrDelay11;
	curWriteAddrDelay11 <= curWriteAddrDelay12;
	curWriteAddrDelay12 <= curWriteAddrDelay13;
	curWriteAddrDelay13 <= curWriteAddrDelay14;
	curWriteAddrDelay14 <= curWriteAddrDelay15;
	if (currentState == `cMULT_COL)
		curWriteAddrDelay15 <= leftIdxCounter;
	else
		curWriteAddrDelay15 <= curWriteAddrDelay16;
	curWriteAddrDelay16 <= curWriteAddrDelay17;
	curWriteAddrDelay17 <= curWriteAddrDelay18;
	curWriteAddrDelay18 <= curWriteAddrDelay19;
	curWriteAddrDelay19 <= curWriteAddrDelay20;
	curWriteAddrDelay20 <= curWriteAddrDelay21;
	curWriteAddrDelay21 <= curWriteAddrDelay22;
	curWriteAddrDelay22 <= curWriteAddrDelay23;
	curWriteAddrDelay23 <= curWriteAddrDelay24;
	curWriteAddrDelay24 <= curWriteAddrDelay25;
	curWriteAddrDelay25 <= curWriteAddrDelay26;
	curWriteAddrDelay26 <= curWriteAddrDelay27;
	curWriteAddrDelay27 <= curWriteAddrDelay28;
	curWriteAddrDelay28 <= curWriteAddrDelay29;
	curWriteAddrDelay29 <= curWriteAddrDelay30;
	curWriteAddrDelay30 <= curWriteAddrDelay31;
	curWriteAddrDelay31 <= msIdxCounter;
	
	writeByteEnDelay0 <= writeByteEnDelay1;
	writeByteEnDelay1 <= writeByteEnDelay2;
	writeByteEnDelay2 <= writeByteEnDelay3;
	writeByteEnDelay3 <= writeByteEnDelay4;
	if (mode[0] == 1'b1)
		writeByteEnDelay4 <= ~0;
	else if (currentState == `cFETCH_COL)
		writeByteEnDelay4 <= byteEn;
	else
		writeByteEnDelay4 <= writeByteEnDelay5;
	writeByteEnDelay5 <= writeByteEnDelay6;
	writeByteEnDelay6 <= writeByteEnDelay7;
	writeByteEnDelay7 <= writeByteEnDelay8;
	writeByteEnDelay8 <= writeByteEnDelay9;
	writeByteEnDelay9 <= writeByteEnDelay10;
	writeByteEnDelay10 <= writeByteEnDelay11;
	writeByteEnDelay11 <= writeByteEnDelay12;
	writeByteEnDelay12 <= writeByteEnDelay13;
	writeByteEnDelay13 <= writeByteEnDelay14;
	writeByteEnDelay14 <= writeByteEnDelay15;
	if (currentState == `cMULT_COL)
		writeByteEnDelay15 <= byteEn;
	else
		writeByteEnDelay15 <= writeByteEnDelay16;
	writeByteEnDelay16 <= writeByteEnDelay17;
	writeByteEnDelay17 <= writeByteEnDelay18;
	writeByteEnDelay18 <= writeByteEnDelay19;
	writeByteEnDelay19 <= writeByteEnDelay20;
	writeByteEnDelay20 <= writeByteEnDelay21;
	writeByteEnDelay21 <= writeByteEnDelay22;
	writeByteEnDelay22 <= writeByteEnDelay23;
	writeByteEnDelay23 <= writeByteEnDelay24;
	writeByteEnDelay24 <= writeByteEnDelay25;
	writeByteEnDelay25 <= writeByteEnDelay26;
	writeByteEnDelay26 <= writeByteEnDelay27;
	writeByteEnDelay27 <= writeByteEnDelay28;
	writeByteEnDelay28 <= writeByteEnDelay29;
	writeByteEnDelay29 <= writeByteEnDelay30;
	writeByteEnDelay30 <= writeByteEnDelay31;
	writeByteEnDelay31 <= byteEn;
	
	curWriteSelDelay[0] <= curWriteSelDelay[1];
	curWriteSelDelay[1] <= curWriteSelDelay[2];
	curWriteSelDelay[2] <= curWriteSelDelay[3];
	curWriteSelDelay[3] <= curWriteSelDelay[4];
	curWriteSelDelay[4] <= curWriteSelDelay[5];
	curWriteSelDelay[5] <= curWriteSelDelay[6];
	curWriteSelDelay[6] <= curWriteSelDelay[7];
	curWriteSelDelay[7] <= curWriteSelDelay[8];
	curWriteSelDelay[8] <= curWriteSelDelay[9];
	curWriteSelDelay[9] <= curWriteSelDelay[10];
	curWriteSelDelay[10] <= curWriteSelDelay[11];
	curWriteSelDelay[11] <= curWriteSelDelay[12];
	curWriteSelDelay[12] <= curWriteSelDelay[13];
	curWriteSelDelay[13] <= curWriteSelDelay[14];
	curWriteSelDelay[14] <= curWriteSelDelay[15];
	if (currentState == `cMULT_COL)
		curWriteSelDelay[15] <= 1'b0;
	else
		curWriteSelDelay[15] <= 1'b1;

	curWriteEnDelay[0] <= curWriteEnDelay[1];
	curWriteEnDelay[1] <= curWriteEnDelay[2];
	curWriteEnDelay[2] <= curWriteEnDelay[3];
	curWriteEnDelay[3] <= curWriteEnDelay[4];
	curWriteEnDelay[4] <= curWriteEnDelay[5];
	curWriteEnDelay[5] <= curWriteEnDelay[6];
	curWriteEnDelay[6] <= curWriteEnDelay[7];
	curWriteEnDelay[7] <= curWriteEnDelay[8];
	curWriteEnDelay[8] <= curWriteEnDelay[9];
	curWriteEnDelay[9] <= curWriteEnDelay[10];
	curWriteEnDelay[10] <= curWriteEnDelay[11];
	curWriteEnDelay[11] <= curWriteEnDelay[12];
	curWriteEnDelay[12] <= curWriteEnDelay[13];
	curWriteEnDelay[13] <= curWriteEnDelay[14];
	curWriteEnDelay[14] <= curWriteEnDelay[15];
	if (currentState == `cMULT_COL)
		curWriteEnDelay[15] <= 1'b1;
	else
		curWriteEnDelay[15] <= curWriteEnDelay[16];
	curWriteEnDelay[16] <= curWriteEnDelay[17];
	curWriteEnDelay[17] <= curWriteEnDelay[18];
	curWriteEnDelay[18] <= curWriteEnDelay[19];
	curWriteEnDelay[19] <= curWriteEnDelay[20];
	curWriteEnDelay[20] <= curWriteEnDelay[21];
	curWriteEnDelay[21] <= curWriteEnDelay[22];
	curWriteEnDelay[22] <= curWriteEnDelay[23];
	curWriteEnDelay[23] <= curWriteEnDelay[24];
	curWriteEnDelay[24] <= curWriteEnDelay[25];
	curWriteEnDelay[25] <= curWriteEnDelay[26];
	curWriteEnDelay[26] <= curWriteEnDelay[27];
	curWriteEnDelay[27] <= curWriteEnDelay[28];
	curWriteEnDelay[28] <= curWriteEnDelay[29];
	curWriteEnDelay[29] <= curWriteEnDelay[30];
	curWriteEnDelay[30] <= curWriteEnDelay[31];
	if (currentState == `cMULT_SUB)
		curWriteEnDelay[31] <= 1'b1;
	else
		curWriteEnDelay[31] <= 1'b0;

	leftWriteSelDelay[0] <= leftWriteSelDelay[1];
	leftWriteSelDelay[1] <= leftWriteSelDelay[2];
	leftWriteSelDelay[2] <= leftWriteSelDelay[3];
	leftWriteSelDelay[3] <= leftWriteSelDelay[4];
	if (currentState == `cFETCH_COL)
		leftWriteSelDelay[4] <= 1'b0;
	else
		leftWriteSelDelay[4] <= 1'b1;

	leftWriteEnDelay[0] <= leftWriteEnDelay[1];
	leftWriteEnDelay[1] <= leftWriteEnDelay[2];
	leftWriteEnDelay[2] <= leftWriteEnDelay[3];
	leftWriteEnDelay[3] <= leftWriteEnDelay[4];
	if (currentState == `cFETCH_COL)
		leftWriteEnDelay[4] <= 1'b1;
	else
		leftWriteEnDelay[4] <= leftWriteEnDelay[5];
	leftWriteEnDelay[5] <= leftWriteEnDelay[6];
	leftWriteEnDelay[6] <= leftWriteEnDelay[7];
	leftWriteEnDelay[7] <= leftWriteEnDelay[8];
	leftWriteEnDelay[8] <= leftWriteEnDelay[9];
	leftWriteEnDelay[9] <= leftWriteEnDelay[10];
	leftWriteEnDelay[10] <= leftWriteEnDelay[11];
	leftWriteEnDelay[11] <= leftWriteEnDelay[12];
	leftWriteEnDelay[12] <= leftWriteEnDelay[13];
	leftWriteEnDelay[13] <= leftWriteEnDelay[14];
	leftWriteEnDelay[14] <= leftWriteEnDelay[15];
	if (currentState == `cMULT_COL)
		leftWriteEnDelay[15] <= 1'b1;
	else
		leftWriteEnDelay[15] <= leftWriteEnDelay[16];
	leftWriteEnDelay[16] <= leftWriteEnDelay[17];
	leftWriteEnDelay[17] <= leftWriteEnDelay[18];
	leftWriteEnDelay[18] <= leftWriteEnDelay[19];
	leftWriteEnDelay[19] <= leftWriteEnDelay[20];
	leftWriteEnDelay[20] <= leftWriteEnDelay[21];
	leftWriteEnDelay[21] <= leftWriteEnDelay[22];
	leftWriteEnDelay[22] <= leftWriteEnDelay[23];
	leftWriteEnDelay[23] <= leftWriteEnDelay[24];
	leftWriteEnDelay[24] <= leftWriteEnDelay[25];
	leftWriteEnDelay[25] <= leftWriteEnDelay[26];
	leftWriteEnDelay[26] <= leftWriteEnDelay[27];
	leftWriteEnDelay[27] <= leftWriteEnDelay[28];
	leftWriteEnDelay[28] <= leftWriteEnDelay[29];
	leftWriteEnDelay[29] <= leftWriteEnDelay[30];
	leftWriteEnDelay[30] <= leftWriteEnDelay[31];
	if (currentState == `cMULT_SUB && (mode == 0 || (mode == 1 && j == i1)))
		leftWriteEnDelay[31] <= 1'b1;
	else
		leftWriteEnDelay[31] <= 1'b0;

	topWriteAddrDelay0 <= topWriteAddrDelay1;
	topWriteAddrDelay1 <= topWriteAddrDelay2;
	topWriteAddrDelay2 <= topWriteAddrDelay3;
	topWriteAddrDelay3 <= topWriteAddrDelay4;
	if (currentRowState == `cFETCH_ROW)
		topWriteAddrDelay4 <= nextTopIdxCounter;
	else
		topWriteAddrDelay4 <=  topWriteAddrDelay5;
	topWriteAddrDelay5 <= topWriteAddrDelay6;
	topWriteAddrDelay6 <= topWriteAddrDelay7;
	topWriteAddrDelay7 <= topWriteAddrDelay8;
	topWriteAddrDelay8 <= topWriteAddrDelay9;
	topWriteAddrDelay9 <= topWriteAddrDelay10;
	topWriteAddrDelay10 <= topWriteAddrDelay11;
	topWriteAddrDelay11 <= topWriteAddrDelay12;
	topWriteAddrDelay12 <= topWriteAddrDelay13;
	topWriteAddrDelay13 <= topWriteAddrDelay14;
	topWriteAddrDelay14 <= topWriteAddrDelay15;
	topWriteAddrDelay15 <= topWriteAddrDelay16;
	topWriteAddrDelay16 <= topWriteAddrDelay17;
	topWriteAddrDelay17 <= topWriteAddrDelay18;
	topWriteAddrDelay18 <= topWriteAddrDelay19;
	topWriteAddrDelay19 <= topWriteAddrDelay20;
	topWriteAddrDelay20 <= topWriteAddrDelay21;
	topWriteAddrDelay21 <= topWriteAddrDelay22;
	topWriteAddrDelay22 <= topWriteAddrDelay23;
	topWriteAddrDelay23 <= topWriteAddrDelay24;
	topWriteAddrDelay24 <= topWriteAddrDelay25;
	topWriteAddrDelay25 <= topWriteAddrDelay26;
	topWriteAddrDelay26 <= topWriteAddrDelay27;
	topWriteAddrDelay27 <= topWriteAddrDelay28;
	topWriteAddrDelay28 <= topWriteAddrDelay29;
	topWriteAddrDelay29 <= topWriteAddrDelay30;
	topWriteAddrDelay30 <= topWriteAddrDelay31;
		topWriteAddrDelay31 <= nextTopIdxCounter;

	topWriteEnDelay[0] <= topWriteEnDelay[1];
	topWriteEnDelay[1] <= topWriteEnDelay[2];
	topWriteEnDelay[2] <= topWriteEnDelay[3];
	topWriteEnDelay[3] <= topWriteEnDelay[4];
	if (currentRowState == `cFETCH_ROW)
		topWriteEnDelay[4] <= 1'b1;
	else
		topWriteEnDelay[4] <=  topWriteEnDelay[5];
	topWriteEnDelay[5] <= topWriteEnDelay[6];
	topWriteEnDelay[6] <= topWriteEnDelay[7];
	topWriteEnDelay[7] <= topWriteEnDelay[8];
	topWriteEnDelay[8] <= topWriteEnDelay[9];
	topWriteEnDelay[9] <= topWriteEnDelay[10];
	topWriteEnDelay[10] <= topWriteEnDelay[11];
	topWriteEnDelay[11] <= topWriteEnDelay[12];
	topWriteEnDelay[12] <= topWriteEnDelay[13];
	topWriteEnDelay[13] <= topWriteEnDelay[14];
	topWriteEnDelay[14] <= topWriteEnDelay[15];
	topWriteEnDelay[15] <= topWriteEnDelay[16];
	topWriteEnDelay[16] <= topWriteEnDelay[17];
	topWriteEnDelay[17] <= topWriteEnDelay[18];
	topWriteEnDelay[18] <= topWriteEnDelay[19];
	topWriteEnDelay[19] <= topWriteEnDelay[20];
	topWriteEnDelay[20] <= topWriteEnDelay[21];
	topWriteEnDelay[21] <= topWriteEnDelay[22];
	topWriteEnDelay[22] <= topWriteEnDelay[23];
	topWriteEnDelay[23] <= topWriteEnDelay[24];
	topWriteEnDelay[24] <= topWriteEnDelay[25];
	topWriteEnDelay[25] <= topWriteEnDelay[26];
	topWriteEnDelay[26] <= topWriteEnDelay[27];
	topWriteEnDelay[27] <= topWriteEnDelay[28];
	topWriteEnDelay[28] <= topWriteEnDelay[29];
	topWriteEnDelay[29] <= topWriteEnDelay[30];
	topWriteEnDelay[30] <= topWriteEnDelay[31];
	topWriteEnDelay[31] <= writeRow;

	topWriteSelDelay0 <= topWriteSelDelay1;
	topWriteSelDelay1 <= topWriteSelDelay2;
	topWriteSelDelay2 <= topWriteSelDelay3;
	topWriteSelDelay3 <= topWriteSelDelay4;
	if (currentRowState == `cFETCH_ROW || currentState == `cUPDATE_J && i1 == 1)
		topWriteSelDelay4 <= imodk;
	else
		topWriteSelDelay4 <=  topWriteSelDelay5;
	topWriteSelDelay5 <= topWriteSelDelay6;
	topWriteSelDelay6 <= topWriteSelDelay7;
	topWriteSelDelay7 <= topWriteSelDelay8;
	topWriteSelDelay8 <= topWriteSelDelay9;
	topWriteSelDelay9 <= topWriteSelDelay10;
	topWriteSelDelay10 <= topWriteSelDelay11;
	topWriteSelDelay11 <= topWriteSelDelay12;
	topWriteSelDelay12 <= topWriteSelDelay13;
	topWriteSelDelay13 <= topWriteSelDelay14;
	topWriteSelDelay14 <= topWriteSelDelay15;
	topWriteSelDelay15 <= topWriteSelDelay16;
	topWriteSelDelay16 <= topWriteSelDelay17;
	topWriteSelDelay17 <= topWriteSelDelay18;
	topWriteSelDelay18 <= topWriteSelDelay19;
	topWriteSelDelay19 <= topWriteSelDelay20;
	topWriteSelDelay20 <= topWriteSelDelay21;
	topWriteSelDelay21 <= topWriteSelDelay22;
	topWriteSelDelay22 <= topWriteSelDelay23;
	topWriteSelDelay23 <= topWriteSelDelay24;
	topWriteSelDelay24 <= topWriteSelDelay25;
	topWriteSelDelay25 <= topWriteSelDelay26;
	topWriteSelDelay26 <= topWriteSelDelay27;
	topWriteSelDelay27 <= topWriteSelDelay28;
	topWriteSelDelay28 <= topWriteSelDelay29;
	topWriteSelDelay29 <= topWriteSelDelay30;
	topWriteSelDelay30 <= topWriteSelDelay31;
	topWriteSelDelay31 <= i1modk;

	topSourceSelDelay[0] <= topSourceSelDelay[1];
	topSourceSelDelay[1] <= topSourceSelDelay[2];
	topSourceSelDelay[2] <= topSourceSelDelay[3];
	topSourceSelDelay[3] <= topSourceSelDelay[4];
	if (start == 1'b1)
		topSourceSelDelay[4] <= 1'b0;
	else if (currentState == `cSTORE_MO)
		topSourceSelDelay[4] <= 1'b1;

	leftReadAddrDelay0 <= leftIdxCounter;


	diagEnDelay[0] <= diagEnDelay[1];
	diagEnDelay[1] <= diagEnDelay[2];
	diagEnDelay[2] <= diagEnDelay[3];
	diagEnDelay[3] <= diagEnDelay[4];
	diagEnDelay[4] <= diagEnDelay[5];
	diagEnDelay[5] <= (currentState == `cSTORE_DIAG || currentState == `cSTORE_DIAG2);

	MOEnDelay[0] <= MOEnDelay[1];
	MOEnDelay[1] <= MOEnDelay[2];
	MOEnDelay[2] <= MOEnDelay[3];
	MOEnDelay[3] <= MOEnDelay[4];
	MOEnDelay[4] <= MOEnDelay[5];
	if (currentState == `cSTORE_MO || currentRowState == `cLOAD_ROW_INC_J)
		MOEnDelay[5] <= 1'b1;
	else
		MOEnDelay[5] <= 1'b0;
end

// output contorl signals
always @ (posedge clk)
begin
	if (currentState == `cFETCH_COL)
		curReadAddr <= diagIdxCounter;
	else if (currentRowState == `cFETCH_ROW)
	   curReadAddr <= readRowCounter;
	else
		curReadAddr <= curReadAddrDelay0;
	curWriteAddr <= curWriteAddrDelay0;
	curWriteByteEn <= writeByteEnDelay0;
	curWriteSel <= curWriteSelDelay;
	curWriteEn <= curWriteEnDelay;

	if (currentState == `cMULT_COL)
		leftReadAddr <= leftIdxCounter;
	else
		leftReadAddr <= leftReadAddrDelay0;
	leftWriteAddr <= curWriteAddrDelay0;
	leftWriteByteEn <= writeByteEnDelay0;
	leftWriteSel <= leftWriteSelDelay;
	leftWriteEn <= leftWriteEnDelay;

	if (currentState == `cSTORE_DIAG)
		topReadAddr <= nextTopIdx;
else if (currentState == `cSTORE_DIAG2)
   topReadAddr <= nextTopIdx2;
	else
	topReadAddr <= curTopIdx;
	topWriteAddr <= topWriteAddrDelay0;
	topWriteEn <= topWriteEnDelay;
	topWriteSel <= topWriteSelDelay0;
	topSourceSel <= topSourceSelDelay;

	MOSel <= ~(currentState == `cFIND_REC);
if (currentState == `cFIND_REC)
		MOEn <= 1'b1;
	else
		MOEn <= MOEnDelay;

	diagEn <= diagEnDelay;

	if (currentState == `cDONE)
		done <= 1'b1;
	else
		done <= 1'b0;
end

endmodule

module ram (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);

	input	[`RAMNUMBYTES-1:0]  byteena_a;
	input	  clk;
	input	[`RAMWIDTH-1:0]  data;
	input	[`rRAMSIZEWIDTH-1:0]  rdaddress;
	input	[`rRAMSIZEWIDTH-1:0]  wraddress;
	input	  wren;
	output	[`RAMWIDTH-1:0]  q;
	wire	[`RAMWIDTH-1:0]  value_out;
	wire [`RAMWIDTH-1:0] subwire;
	assign q = subwire | dummy;
	wire [`RAMWIDTH-1:0] uselessdata;
 assign uselessdata = 1024'b0;
wire j;
assign j = |byteena_a;
 wire [`RAMWIDTH-1:0]dummy;
 assign dummy = value_out & 1024'b0;
dual_port_ram inst1( 
.clk (clk),
.we1(wren),
.we2(1'b0),
.data1(data),
.data2(uselessdata),
.out1(value_out),
.out2(subwire),
.addr1(wraddress),
.addr2(rdaddress));


endmodule

module ram1 (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);

	input	[`RAMNUMBYTES-1:0]  byteena_a;
	input	  clk;
	input	[`RAMWIDTH-1:0]  data;
	input	[`rRAMSIZEWIDTH-1:0]  rdaddress;
	input	[`rRAMSIZEWIDTH-1:0]  wraddress;
	input	  wren;
	output	[`RAMWIDTH-1:0]  q;
	wire	[`RAMWIDTH-1:0]  value_out;
	wire [`RAMWIDTH-1:0] subwire;
	assign q = subwire | dummy;
	wire [`RAMWIDTH-1:0] uselessdata;
 assign uselessdata = 1024'b0;
wire j;
assign j = |byteena_a;
 wire [`RAMWIDTH-1:0]dummy;
 assign dummy = value_out & 1024'b0;
dual_port_ram inst1( 
.clk (clk),
.we1(wren),
.we2(1'b0),
.data1(data),
.data2(uselessdata),
.out1(value_out),
.out2(subwire),
.addr1(wraddress),
.addr2(rdaddress));


endmodule

module ram2 (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);

	input	[`RAMNUMBYTES-1:0]  byteena_a;
	input	  clk;
	input	[`RAMWIDTH-1:0]  data;
	input	[`rRAMSIZEWIDTH-1:0]  rdaddress;
	input	[`rRAMSIZEWIDTH-1:0]  wraddress;
	input	  wren;
	output	[`RAMWIDTH-1:0]  q;
	wire	[`RAMWIDTH-1:0]  value_out;
	wire [`RAMWIDTH-1:0] subwire;
	assign q = subwire | dummy;
	wire [`RAMWIDTH-1:0] uselessdata;
 assign uselessdata = 1024'b0;
wire j;
assign j = |byteena_a;
 wire [`RAMWIDTH-1:0]dummy;
 assign dummy = value_out & 1024'b0;
dual_port_ram inst1( 
.clk (clk),
.we1(wren),
.we2(1'b0),
.data1(data),
.data2(uselessdata),
.out1(value_out),
.out2(subwire),
.addr1(wraddress),
.addr2(rdaddress));


endmodule

module ram3 (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);

	input	[`RAMNUMBYTES-1:0]  byteena_a;
	input	  clk;
	input	[`RAMWIDTH-1:0]  data;
	input	[`rRAMSIZEWIDTH-1:0]  rdaddress;
	input	[`rRAMSIZEWIDTH-1:0]  wraddress;
	input	  wren;
	output	[`RAMWIDTH-1:0]  q;
	wire	[`RAMWIDTH-1:0]  value_out;
	wire [`RAMWIDTH-1:0] subwire;
	assign q = subwire | dummy;
	wire [`RAMWIDTH-1:0] uselessdata;
 assign uselessdata = 1024'b0;
wire j;
assign j = |byteena_a;
 wire [`RAMWIDTH-1:0]dummy;
 assign dummy = value_out & 1024'b0;
dual_port_ram inst1( 
.clk (clk),
.we1(wren),
.we2(1'b0),
.data1(data),
.data2(uselessdata),
.out1(value_out),
.out2(subwire),
.addr1(wraddress),
.addr2(rdaddress));


endmodule


module top_ram (
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);

	//parameter TOPSIZE = 4096, TOPSIZEWIDTH = 12, TOPWIDTH = 32;
	
	input	  clk;
	input	[32-1:0]  data;
	input	[12-1:0]  rdaddress;
	input	[12-1:0]  wraddress;
	input	  wren;
	output	[32-1:0]  q;

	wire [32-1:0] sub_wire0;
	wire [32-1:0] q;
	wire [32-1:0] junk_output;
	assign q = sub_wire0 | dummy;
	wire[32-1:0] dummy;
	assign dummy = junk_output & 32'b0;
 dual_port_ram_4096x32 inst2(
 .clk (clk),
 .we1(wren),
 .we2(1'b0),
 .data1(data),
 .data2(data),
 .out1(junk_output),
 .out2(sub_wire0),
 .addr1(wraddress),
 .addr2(rdaddress));

endmodule

module mult_add (clk, A, B, C, mult_result, add_result);
//parameter PRECISION = 32;
input clk;
input [32-1:0] A, B, C;
output [32-1:0] mult_result, add_result;
reg [32-1:0] mult_result;
reg [32-1:0] add_result;
wire [32-1:0] mult_comp_result;
reg [32-1:0] add_a, add_b;
wire [32-1:0] addition_result;
wire [31:0] dummy_wire;
assign dummy_wire = mult_comp_result>>2'b10;
//divsp MUL(.clk(clk), .rmode(2'b00), .fpu_op(3'b010), .opa(A), .opb(B), .ans(mult_comp_result) );
wire [4:0]dummy_wire_2;
fpmul MUL(.clk(clk), .a(A), .b(B), .y_out(mult_comp_result), .control(2'b00), .flags(dummy_wire_2));
fpu_add ADD(.clock(clk), .a1(C), .b1(dummy_wire), .sum(addition_result));
always @ (posedge clk)
begin
	add_result  <= addition_result;
	mult_result <= mult_comp_result[31:0];
end
endmodule


//`define rFIFOINPUTWIDTH 64
`define rFIFOSIZE 256
`define rFIFOSIZEWIDTH 8
`define rFIFOOUTPUTWIDTH 1024
`define rFIFORSIZEWIDTH 4
	`define wFIFOINPUTWIDTH 12'b010000000000
	`define wFIFOSIZE 6'b010000
	`define wFIFOSIZEWIDTH 4'b0100
	`define wFIFOOUTPUTWIDTH 8'b01000000
	`define wFIFORSIZEWIDTH 5'b01000
 //for addr_fifo
`define aFIFOSIZE 6'b010000
`define aFIFOSIZEWIDTH 4'b0100
`define aFIFOWIDTH 4'b0111
//for memfifo
`define mFIFOSIZE 16
`define mFIFOSIZEWIDTH 4
//`define mFIFOWIDTH 28

`define BURSTLEN 3'b010
`define BURSTWIDTH 3'b010
`define DATAWIDTH 12'b010000000000
`define DATANUMBYTES 9'b010000000
`define MEMCONWIDTH 8'b01000000
`define MEMCONNUMBYTES 5'b01000
`define DDRSIZEWIDTH 6'b011000
`define FIFOSIZE 6'b010000
`define FIFOSIZEWIDTH 4'b0100
`define RAMWIDTH 12'b010000000000
`define RAMNUMBYTES 9'b010000000
`define RAMSIZEWIDTH 4'b0111
`define RATIO 6'b010000
`define RAMLAT 4'b0101
 
`define dIDLE 0
`define dWRITE 1
`define dREAD 2

module DataTransferUnit (clk, dtu_write_req, dtu_read_req, dtu_mem_addr, dtu_ram_addr, dtu_size, dtu_ack, dtu_done,
		ram_read_addr, ram_read_data, ram_write_byte_en, ram_write_data, ram_write_addr, ram_write_en,
		mem_rdata, mem_rdata_valid, mem_ready, mem_wdata_req, reset_n,
		burst_begin, mem_local_addr, mem_be, mem_read_req, mem_size, mem_wdata, mem_write_req
		);

output burst_begin;
output [`DDRSIZEWIDTH-1:0] mem_local_addr;
output [`MEMCONNUMBYTES-1: 0] mem_be;
output mem_read_req;
output [`BURSTWIDTH-1:0] mem_size;
output [`MEMCONWIDTH-1:0] mem_wdata;
output mem_write_req;
input clk;
input [`MEMCONWIDTH-1:0] mem_rdata;
input mem_rdata_valid;
input mem_ready;
input mem_wdata_req;
input reset_n;

input dtu_write_req;
input dtu_read_req;
input [`DDRSIZEWIDTH-1:0] dtu_mem_addr;
input [`RAMSIZEWIDTH-1:0] dtu_ram_addr;
input [6:0] dtu_size;
output dtu_ack;
output dtu_done;

output[`RAMWIDTH-1:0] ram_write_data;
input[`RAMWIDTH-1:0] ram_read_data;
output[`RAMSIZEWIDTH-1:0] ram_write_addr, ram_read_addr;
output[`RAMNUMBYTES-1:0] ram_write_byte_en;
output ram_write_en;

reg[`DDRSIZEWIDTH-1:0] mem_addr0;
reg[`DDRSIZEWIDTH-1:0] mem_addr1;
reg[`DDRSIZEWIDTH-1:0] mem_addr2;
reg[`DDRSIZEWIDTH-1:0] mem_addr3;
reg[`DDRSIZEWIDTH-1:0] mem_addr4;
reg[`DDRSIZEWIDTH-1:0] mem_addr5;

reg [1:0] state;
wire [`DATAWIDTH-1:0] rdata, ram_write_dataw, ram_read_dataw;

wire [`RAMSIZEWIDTH-1:0] rfifo_addr;
reg [`RAMLAT-1:0]fifo_write_reg;
reg [`RAMLAT-1:0]write_req_reg;
reg [`RAMLAT-1:0]read_req_reg;
reg [0:0]fifo_read_reg;
reg rdata_valid;
reg [1:0]test_complete_reg;
reg [`BURSTWIDTH-1:0] size_count0;
reg [`BURSTWIDTH-1:0] size_count1;
reg [`BURSTWIDTH-1:0] size_count2;
reg [`BURSTWIDTH-1:0] size_count3;
reg [`BURSTWIDTH-1:0] size_count4;

reg [`RAMSIZEWIDTH-1:0] size;
reg [`RAMSIZEWIDTH-1:0]ram_addr0;
reg [`RAMSIZEWIDTH-1:0]ram_addr1;
reg [`RAMSIZEWIDTH-1:0]ram_addr2;
reg [`RAMSIZEWIDTH-1:0]ram_addr3;
reg [`RAMSIZEWIDTH-1:0]ram_addr4;

reg [4:0] data_count;
reg ram_write_en_reg;

wire read_req;
wire write_req;
wire [`FIFOSIZEWIDTH-1:0] wfifo_count;
wire rfull, wempty, rempty, rdcmd_empty, wrcmd_full, wrcmd_empty, rdata_empty;
wire [`DATAWIDTH-1:0] mem_data;
wire not_stall;
wire fifo_write, fifo_read;
wire rdata_req;
wire [`BURSTWIDTH+`DDRSIZEWIDTH+1:0] wrmem_cmd, rdmem_cmd;
wire mem_cmd_ready, mem_cmd_issue;

// FIFOs to interact with off-chip memory
memcmd_fifo cmd_store(
	//.aclr(~reset_n),
	//.rdclk(phy_clk),
	.clk(clk),
	.data(wrmem_cmd),
	.rdreq(mem_cmd_ready),
	//.rdempty(rdcmd_empty),
	.wrreq(mem_cmd_issue),
	.full(wrcmd_full),
	.empty(wrcmd_empty),
	.q(rdmem_cmd)
	);

wfifo wdata_store(
	//.rdclk(phy_clk),
	.clk(clk),
	.data(mem_data),
	.rdreq(mem_wdata_req),
	.wrreq(fifo_write),
	.empty(wempty),
	.q(mem_wdata),
	.usedw(wfifo_count)
	);

addr_fifo raddress_store (
	.clk(clk),
	.data(ram_addr3),
	.wrreq(fifo_read),
	.rdreq(rdata_req),
	.empty(rempty),
	.full(rfull),
	.q(rfifo_addr)
	);

rfifo rdata_store(
	.clk(clk),
	.data(mem_rdata),
	.rdreq(rdata_req),
	//.wrclk(phy_clk),
	.wrreq(mem_rdata_valid),
	.empty(rdata_empty),
	.q(rdata)
	);

assign mem_cmd_ready = (mem_ready == 1'b1);// && (rdcmd_empty == 0);
assign mem_cmd_issue = (wrcmd_full == 1'b0) && (write_req == 1 || read_req == 1'b1 || wrcmd_empty == 1'b1);
assign wrmem_cmd[27:26] = size_count0;
assign wrmem_cmd[`DDRSIZEWIDTH+1:2] = mem_addr0;
assign wrmem_cmd[1] = read_req;
assign wrmem_cmd[0] = write_req;
assign mem_write_req = rdmem_cmd[0];// && rdcmd_empty == 0;
assign mem_read_req = rdmem_cmd[1];// && rdcmd_empty == 0;
assign mem_local_addr = rdmem_cmd[`DDRSIZEWIDTH+1:2];
assign burst_begin = 0;
assign mem_size = rdmem_cmd[`BURSTWIDTH+`DDRSIZEWIDTH+1:`DDRSIZEWIDTH+2];
assign mem_be = ~0;
assign fifo_write = fifo_write_reg[0];
assign write_req = (not_stall) ? write_req_reg[0] : 0;
assign read_req = (not_stall) ? read_req_reg[0] : 0;
assign fifo_read = (not_stall) ? fifo_read_reg[0] : 0;
assign not_stall = (wfifo_count < `FIFOSIZE-5) && (rfull == 0) && (wrcmd_full == 0);
assign dtu_ack = (state == `dIDLE);
assign dtu_done = (state == `dIDLE) && wempty && rempty;

assign ram_write_dataw[63:0] = rdata[1023:960];
assign mem_data[63:0] = ram_read_dataw[1023:960];
assign ram_write_dataw[127:64] = rdata[959:896];
assign mem_data[127:64] = ram_read_dataw[959:896];
assign ram_write_dataw[191:128] = rdata[895:832];
assign mem_data[191:128] = ram_read_dataw[895:832];
assign ram_write_dataw[255:192] = rdata[831:768];
assign mem_data[255:192] = ram_read_dataw[831:768];
assign ram_write_dataw[319:256] = rdata[767:704];
assign mem_data[319:256] = ram_read_dataw[767:704];
assign ram_write_dataw[383:320] = rdata[703:640];
assign mem_data[383:320] = ram_read_dataw[703:640];
assign ram_write_dataw[447:384] = rdata[639:576];
assign mem_data[447:384] = ram_read_dataw[639:576];
assign ram_write_dataw[511:448] = rdata[575:512];
assign mem_data[511:448] = ram_read_dataw[575:512];
assign ram_write_dataw[575:512] = rdata[511:448];
assign mem_data[575:512] = ram_read_dataw[511:448];
assign ram_write_dataw[639:576] = rdata[447:384];
assign mem_data[639:576] = ram_read_dataw[447:384];
assign ram_write_dataw[703:640] = rdata[383:320];
assign mem_data[703:640] = ram_read_dataw[383:320];
assign ram_write_dataw[767:704] = rdata[319:256];
assign mem_data[767:704] = ram_read_dataw[319:256];
assign ram_write_dataw[831:768] = rdata[255:192];
assign mem_data[831:768] = ram_read_dataw[255:192];
assign ram_write_dataw[895:832] = rdata[191:128];
assign mem_data[895:832] = ram_read_dataw[191:128];
assign ram_write_dataw[959:896] = rdata[127:64];
assign mem_data[959:896] = ram_read_dataw[127:64];
assign ram_write_dataw[1023:960] = rdata[63:0];
assign mem_data[1023:960] = ram_read_dataw[63:0];
assign ram_write_data = ram_write_dataw[1023:0];
assign ram_read_dataw[1023:0] = ram_read_data;
assign ram_write_addr = rfifo_addr;
assign ram_read_addr = ram_addr4;
assign ram_write_byte_en = ~0;
assign ram_write_en = ram_write_en_reg;
assign rdata_req = !rdata_empty;

// FSM to produce off-chip memory commands
always @ (posedge clk)
begin
	if (reset_n == 1'b0)
	begin
		state <= `dIDLE;
	end
	else
	begin
		case (state)
		`dIDLE:
		begin
			if (dtu_write_req)
				state <= `dWRITE;
			else if (dtu_read_req)
				state <= `dREAD;
			else
				state <= `dIDLE;
		end
		`dWRITE:
		begin
			if (not_stall && size == 0 && data_count < `BURSTLEN)
				state <= `dIDLE;
			else
				state <= `dWRITE;
		end
		`dREAD:
		begin
			if (not_stall && size == 0 && data_count < `BURSTLEN)
				state <= `dIDLE;
			else
				state <= `dREAD;
		end
		default:
		begin
			state <= `dIDLE;
		end
		endcase
	end
end

always @ (posedge clk)
begin

	if (reset_n == 0)
	begin
		size <= 0;
		data_count <= 0;
		size_count4 <= 1;
		mem_addr5 <= 0;
		ram_addr4 <= 0;
		fifo_write_reg[`RAMLAT-1] <= 0;
		write_req_reg[`RAMLAT-1] <= 0;
		fifo_read_reg[0] <= 0;
		read_req_reg[`RAMLAT-1] <= 0;
	end
	else if (state == `dIDLE)
	begin
		size <= dtu_size;
		size_count4 <= `BURSTLEN;
		mem_addr5 <= dtu_mem_addr;
		ram_addr4 <= dtu_ram_addr;
		fifo_write_reg[`RAMLAT-1] <= 1'b0;
		write_req_reg[`RAMLAT-1] <= 1'b0;
		fifo_read_reg[0] <= 1'b0;
		read_req_reg[`RAMLAT-1] <= 1'b0;
		data_count <= 0;
	end
	else if (data_count >= `BURSTLEN && not_stall)
	begin
		data_count <= data_count - `BURSTLEN;
		mem_addr5 <= mem_addr5 + `BURSTLEN;
		fifo_write_reg[`RAMLAT-1] <= 1'b0;
		write_req_reg[`RAMLAT-1] <= state == `dWRITE;
		fifo_read_reg[0] <= 0;
		read_req_reg[`RAMLAT-1] <= state == `dREAD;
	end
	else if (size == 0 && data_count == 0 && not_stall==1'b1)
	begin
		fifo_write_reg[`RAMLAT-1] <= 0;
		write_req_reg[`RAMLAT-1] <= 0;
		fifo_read_reg[0] <= 0;
		read_req_reg[`RAMLAT-1] <= 0;
	end
	else if (size == 0 && not_stall==1'b1)
	begin
		size_count4 <= data_count[`BURSTWIDTH-1:0];
		fifo_write_reg[`RAMLAT-1] <= 0;
		write_req_reg[`RAMLAT-1] <= state == `dWRITE;
		fifo_read_reg[0] <= 0;
		read_req_reg[`RAMLAT-1] <= state == `dREAD;
	end
	else if (not_stall==1'b1)
	begin
		size <= size - 1;
		data_count <= data_count + `RATIO - `BURSTLEN;
		mem_addr5 <= mem_addr5 + `BURSTLEN;
		ram_addr4 <= ram_addr4+1;
		fifo_write_reg[`RAMLAT-1] <= state == `dWRITE;
		write_req_reg[`RAMLAT-1] <= state == `dWRITE;
		fifo_read_reg[0] <= state == `dREAD;
		read_req_reg[`RAMLAT-1] <= state == `dREAD;
	end
	else
	begin
		fifo_write_reg[`RAMLAT-1] <= 0;
	end
end


always @ (posedge clk)
begin
	if (reset_n == 0)
	begin
		fifo_write_reg[0] <= 1'b0;
		fifo_write_reg[1] <= 1'b0;
		fifo_write_reg[2] <= 1'b0;
		fifo_write_reg[3] <= 1'b0;
	end
	else
	begin
		fifo_write_reg[0] <= fifo_write_reg[1];
		fifo_write_reg[1] <= fifo_write_reg[2];
		fifo_write_reg[2] <= fifo_write_reg[3];
		fifo_write_reg[3] <= fifo_write_reg[4];
	end

	if (reset_n == 1'b0)
	begin
		mem_addr0 <= 0;
		ram_addr0 <= 0;
		size_count0 <= 1;
		write_req_reg[0] <= 0;
		read_req_reg[0] <= 0;
		mem_addr1 <= 0;
		ram_addr1 <= 0;
		size_count1 <= 1;
		write_req_reg[1] <= 0;
		read_req_reg[1] <= 0;
		mem_addr2 <= 0;
		ram_addr2 <= 0;
		size_count2 <= 1;
		write_req_reg[2] <= 0;
		read_req_reg[2] <= 0;
		mem_addr3 <= 0;
		ram_addr3 <= 0;
		size_count3 <= 1;
		write_req_reg[3] <= 0;
		read_req_reg[3] <= 0;
		mem_addr4 <= 0;
	end
	else if (not_stall)
	begin
		size_count0 <= size_count1;
		mem_addr0 <= mem_addr1;
		ram_addr0 <= ram_addr1;
		write_req_reg[0] <= write_req_reg[1];
		read_req_reg[0] <= read_req_reg[1];
		size_count1 <= size_count2;
		mem_addr1 <= mem_addr2;
		ram_addr1 <= ram_addr2;
		write_req_reg[1] <= write_req_reg[2];
		read_req_reg[1] <= read_req_reg[2];
		size_count2 <= size_count3;
		mem_addr2 <= mem_addr3;
		ram_addr2 <= ram_addr3;
		write_req_reg[2] <= write_req_reg[3];
		read_req_reg[2] <= read_req_reg[3];
		size_count3 <= size_count4;
		mem_addr3 <= mem_addr4;
		ram_addr3 <= ram_addr4;
		write_req_reg[3] <= write_req_reg[4];
		read_req_reg[3] <= read_req_reg[4];
		mem_addr4 <= mem_addr5;
	end
	
	ram_write_en_reg  <= rdata_req;
end

endmodule

module rfifo (
	clk,
	data,
	rdreq,
	wrreq,
	empty,
	q
	);


	input	  clk;
	input	  wrreq;
	input	  rdreq;
	input	[`rFIFOINPUTWIDTH-1:0]  data;
	output	  empty;
	output	[`rFIFOOUTPUTWIDTH-1:0]  q;

	reg [`rFIFORSIZEWIDTH-1:0] wr_pointer;
	reg [`rFIFORSIZEWIDTH-1:0] rd_pointer;
	reg [`rFIFORSIZEWIDTH:0] status_cnt;
	reg [`rFIFOOUTPUTWIDTH-1:0] q ;
	reg[3:0] counter;
	wire [`rFIFOINPUTWIDTH-1:0] data_ram;
assign empty = (status_cnt == 9'b000000000);
wire [`rFIFOINPUTWIDTH-1:0]junk_input;
wire [`rFIFOINPUTWIDTH-1:0]junk_output;
assign junk_input = 64'b0000000000000000000000000000000000000000000000000000000000000000;
 always @ (posedge clk)
 begin  //WRITE_POINTER
	if (wrreq) 
	begin
 		wr_pointer <= wr_pointer + 1'b1;
	end
end
always @ (posedge clk)
begin  //READ_POINTER
	if (rdreq) 
	begin
	rd_pointer <= rd_pointer + 2'b01;
	end
end
always  @ (posedge clk )
begin  //READ_DATA
if (rdreq) 
	counter <= 0;
else 
	counter <= counter + 2'b01;
if(counter == 0)
	q[`rFIFOINPUTWIDTH-1:0] <= data_ram;
else if (counter == 1)
	q[127:64] <= data_ram;
else if (counter == 2)
	q[191:128] <= data_ram;
else if (counter == 3)
	q[255:192] <= data_ram;
else if (counter == 4)
	q[319:256] <= data_ram;
else if (counter == 5)
	q[383:320] <= data_ram;
else if (counter == 6)
	q[447:384] <= data_ram;
else if (counter == 7)
	q[511:448] <= data_ram;
else if (counter == 8)
	q[575:512] <= data_ram;
else if (counter == 9)
	q[639:576] <= data_ram;
else if (counter == 10)
	q[703:640] <= data_ram;
else if (counter == 11)
	q[767:704] <= data_ram;
else if (counter == 12)
	q[831:768] <= data_ram;
else if (counter == 13)
	q[895:832] <= data_ram;
else if (counter == 14)
	q[959:896] <= data_ram;
else if (counter == 15)
	q[1023:960] <= data_ram;
end
always @ (posedge clk )
begin // : STATUS_COUNTER
	if ((rdreq) && (!wrreq) && (status_cnt != 0))
		status_cnt <= status_cnt - 1'b1;
// Write but no read.
	else if ((wrreq) && (!rdreq) && (status_cnt != 64 ))
		status_cnt <= status_cnt + 1'b1;
end 
  dual_port_ram_rfifo ram_addr(
.we1      (wrreq)      , // write enable
 .we2      (rdreq)       , // Read enable
.addr1 (wr_pointer) , // address_0 input 
.addr2 (rd_pointer) , // address_q input  
.data1    (data)    , // data_0 bi-directional
.data2    (junk_input),   // data_1 bi-directional
.clk(clk),
.out1	(data_ram),
.out2	(junk_output)
 ); 


endmodule


// synopsys translate_off
//`timescale 1 ps / 1 ps
// synopsys translate_on
module wfifo (
	clk,
	data,
	rdreq,
	wrreq,
	empty,
	q,
	usedw
	);

	input	clk;
	input	  wrreq;
	input	  rdreq;
	input	[`wFIFOINPUTWIDTH-1:0]  data;
	output	  empty;
	output	[`wFIFOOUTPUTWIDTH-1:0]  q;
	output	[`wFIFOSIZEWIDTH-1:0]  usedw;
//-----------Internal variables-------------------
reg [`wFIFOSIZEWIDTH-1:0] wr_pointer;
reg [`wFIFOSIZEWIDTH-1:0] rd_pointer;
reg [`wFIFOSIZEWIDTH:0] status_cnt;
reg [`wFIFOOUTPUTWIDTH-1:0] q ;
reg[3:0] counter;
wire [`wFIFOINPUTWIDTH-1:0] data_ram ;
assign empty = (status_cnt == 5'b00000);
wire [`wFIFOINPUTWIDTH-1:0]junk_input;
wire [`wFIFOINPUTWIDTH-1:0]junk_output;
assign junk_input = 1024'b0;
 always @ (posedge clk)
 begin  //WRITE_POINTER
	if (wrreq) 
	begin
 		wr_pointer <= wr_pointer + 1'b1;
	end
end
always @ (posedge clk)
begin  //READ_POINTER
	if (rdreq) 
	begin
	rd_pointer <= rd_pointer + 2'b01;
	end
end
always  @ (posedge clk )
begin  //READ_DATA
if (rdreq) 
	counter <= 0;
else 
	counter <= counter + 2'b01;
if(counter == 0)
	q <= data_ram[63:0];
else if(counter == 1)
	q <= data_ram[127:64];
else if(counter == 2)
	q <= data_ram[191:128];
else if(counter == 3)
	q <= data_ram[255:192];
else if(counter == 4)
	q <= data_ram[319:256];
else if(counter == 5)
	q <= data_ram[383:320];
else if(counter == 6)
	q <= data_ram[447:384];
else if(counter == 7)
	q <= data_ram[511:448];
else if(counter == 8)
	q <= data_ram[575:512];
else if(counter == 9)
	q <= data_ram[639:576];
else if(counter == 10)
	q <= data_ram[703:640];
else if(counter == 11)
	q <= data_ram[767:704];
else if(counter == 12)
	q <= data_ram[831:768];
else if(counter == 13)
	q <= data_ram[895:832];
else if(counter == 14)
	q <= data_ram[959:896];
else if(counter == 15)
	q <= data_ram[1023:960];
end
always @ (posedge clk )
begin // : STATUS_COUNTER
	if ((rdreq) && (!wrreq) && (status_cnt != 5'b00000))
		status_cnt <= status_cnt - 1'b1;
	// Write but no read.
	else if ((wrreq) && (!rdreq) && (status_cnt != 5'b10000 )) 
		status_cnt <= status_cnt + 1'b1;
end 
assign usedw = status_cnt[`wFIFOSIZEWIDTH-1:0];
  dual_port_ram_wfifo ram_addr(
.we1      (wrreq)      , // write enable
 .we2      (rdreq)       , // Read enable
.addr1 (wr_pointer) , // address_0 input 
.addr2 (rd_pointer) , // address_q input  
.data1    (data)    , // data_0 bi-directional
.data2    (junk_input),   // data_1 bi-directional
.clk(clk),
.out1	(data_ram),
.out2	(junk_output)
 ); 


endmodule

// synopsys translate_off
//`timescale 1 ps / 1 ps
// synopsys translate_on
module addr_fifo (
	clk,
	data,
	wrreq,
	rdreq,
	empty,
	full,
	q
	);

	input	  clk;
	input	[`aFIFOWIDTH-1:0]  data;
	input	  rdreq;
	input	  wrreq;
	output	  empty;
	output	  full;
	output	[`aFIFOWIDTH-1:0]  q;

reg [`aFIFOSIZEWIDTH-1:0] wr_pointer;
reg [`aFIFOSIZEWIDTH-1:0] rd_pointer;
reg [`aFIFOSIZEWIDTH:0] status_cnt;
reg [`aFIFOWIDTH-1:0] q ;
wire [`aFIFOWIDTH-1:0] data_ram ;
assign full = (status_cnt == 5'b01111);
assign empty = (status_cnt == 5'b00000);
wire [`aFIFOWIDTH-1:0]junk_input;
wire [`aFIFOWIDTH-1:0]junk_output;
assign junk_input = 7'b0000000;
always @ (posedge clk)
begin  //WRITE_POINTER
if (wrreq) 
begin
wr_pointer <= wr_pointer + 1'b1;
end
end
always @ (posedge clk)
begin  //READ_POINTER
if (rdreq) 
begin
rd_pointer <= rd_pointer + 1'b1;
end
end
always  @ (posedge clk )
begin  //READ_DATA
if (rdreq) begin
q <= data_ram; 
end
end
always @ (posedge clk )
begin // : STATUS_COUNTER
	if ((rdreq) && (!wrreq) && (status_cnt != 5'b00000))
		status_cnt <= status_cnt - 1'b1;
	// Write but no read.
	else if ((wrreq) && (!rdreq) && (status_cnt != 5'b10000))
		status_cnt <= status_cnt + 1;
end
  dual_port_ram_afifo ram_addr(
.we1      (wrreq)      , // write enable
 .we2      (rdreq)       , // Read enable
.addr1 (wr_pointer) , // address_0 input 
.addr2 (rd_pointer) , // address_q input  
.data1    (data)    , // data_0 bi-directional
.data2    (junk_input),   // data_1 bi-directional
.clk(clk),
.out1	(data_ram),
.out2	(junk_output)
 ); 


endmodule

module memcmd_fifo (
	clk,
	data,
	rdreq,
	wrreq,
	full,
	empty,
	q
	);
	
	input	  clk;
	input	[`mFIFOWIDTH-1:0]  data;
	input	  wrreq;
	input	  rdreq;
	output	  full;
	output	  empty;
	output	[`mFIFOWIDTH-1:0]  q;

	reg [`mFIFOSIZEWIDTH-1:0] wr_pointer;
	reg [`mFIFOSIZEWIDTH-1:0] rd_pointer;
	reg [`mFIFOSIZEWIDTH:0] status_cnt;
	reg [`mFIFOWIDTH-1:0] q ;
	wire [`mFIFOWIDTH-1:0] data_ram;
	assign full = (status_cnt ==5'b01111);
	assign empty = (status_cnt == 5'b00000);
	wire [`mFIFOWIDTH-1:0]junk_input;
	wire [`mFIFOWIDTH-1:0]junk_output;
	assign junk_input = 28'b0000000000000000000000000000;
	always @ (posedge clk)
	begin  //WRITE_POINTER
		if (wrreq)
			begin
				wr_pointer <= wr_pointer + 1'b1;
			end
	end
	always @ (posedge clk)
	begin  //READ_POINTER
		if (rdreq)
		begin
			rd_pointer <= rd_pointer + 1'b1;
		end
	end
	always  @ (posedge clk )
	begin  //READ_DATA
		if (rdreq) begin
			q <= data_ram;
		end
	end
always @ (posedge clk )
begin // : STATUS_COUNTER
	if ((rdreq) && (!wrreq) && (status_cnt != 0))
		status_cnt <= status_cnt - 1'b1;
	else if ((wrreq) && (!rdreq) && (status_cnt != 16 ))
		status_cnt <= status_cnt + 1'b1;
end
	dual_port_ram_mfifo ram_addr(
	.we1      (wrreq)      , // write enable
	.we2      (rdreq)       , // Read enable
	.addr1 (wr_pointer) , // address_0 input
	.addr2 (rd_pointer) , // address_q input
	.data1    (data)    , // data_0 bi-directional
	.data2    (junk_input),   // data_1 bi-directional
	.clk(clk),
	.out1	(data_ram),
	.out2	(junk_output));


endmodule


`define ZERO        8'b00000000  
`define ONE         8'b00000001  
`define TWO         8'b00000010  
`define THREE 		  8'b00000011  
`define FOUR		  8'b00000100  
`define FIVE		  8'b00000101  
`define SIX         8'b00000110  
`define SEVEN       8'b00000111  
`define EIGHT       8'b00001000  
`define NINE        8'b00001001  
`define TEN         8'b00001010  
`define ELEVEN      8'b00001011  
`define TWELVE      8'b00001100  
`define THIRTEEN    8'b00001101  
`define FOURTEEN    8'b00001110  
`define FIFTEEN     8'b00001111  
`define SIXTEEN     8'b00010000  
`define SEVENTEEN   8'b00010001  
`define EIGHTEEN	  8'b00010010  
`define NINETEEN    8'b00010011  
`define TWENTY		  8'b00010100  
`define TWENTYONE   8'b00010101  
`define TWENTYTWO   8'b00010110  
`define TWENTYTHREE 8'b00010111  
`define TWENTYFOUR  8'b00011000  
  
module fpu_add (clock, a1, b1, sum);  
	input clock;  
	input [31:0]a1;  
	input [31:0]b1;  
	output [31:0]sum;  
	reg [31:0]sum;  
	  
	//Split up the numbers into exponents and mantissa.  
	reg [7:0]a_exp; 
	//reg [7:0]b_exp;  
	reg [23:0]a_man; 
	reg [23:0]b_man; 
  
	reg [7:0]temp;  
	  
	reg [24:0]sum_man;  
	//reg [7:0]sum_exp;  
	  
	//introduce latency on inputs  
	reg [31:0]a;  
	reg [31:0]b;  
	  
	always @ (posedge clock) begin  
		a <= a1;  
		b <= b1;  
	end  
	  
	reg smaller; //smaller is 1 if a < b, 0 otherwise  
	  
	//Shift mantissa's to have the same exponent  
	always @ (a or b) begin  
		//a_exp = a[30:23];  
		//b_exp = b[30:23];  
		//a_man = {1'b1, a[22:0]};  
	   //b_man = {1'b1, b[22:0]};  
		  
		if (a[30:23] < b[30:23]) begin  
			temp = b[30:23] - a[30:23];  
			//a_man = {1'b1, a[22:0]} >> temp; //Expand into case statement, as below.  
			case (temp)   
				`ONE: begin  
					a_man = {1'b1, a[22:0]} >> `ONE;  
				end  
				`TWO: begin  
					a_man = {1'b1, a[22:0]} >> `TWO;  
				end  
				`THREE: begin  
					a_man = {1'b1, a[22:0]} >> `THREE;  
				end  
				`FOUR: begin  
					a_man = {1'b1, a[22:0]} >> `FOUR;  
				end  
				`FIVE: begin  
					a_man = {1'b1, a[22:0]} >> `FIVE;  
				end  
				`SIX: begin  
					a_man = {1'b1, a[22:0]} >> `SIX;  
				end  
				`SEVEN: begin  
					a_man = {1'b1, a[22:0]} >> `SEVEN;  
				end  
				`EIGHT: begin  
					a_man = {1'b1, a[22:0]} >> `EIGHT;  
				end  
				`NINE: begin  
					a_man = {1'b1, a[22:0]} >> `NINE;  
				end  
				`TEN: begin  
					a_man = {1'b1, a[22:0]} >> `TEN;  
				end  
				`ELEVEN: begin  
					a_man = {1'b1, a[22:0]} >> `ELEVEN;  
				end  
				`TWELVE: begin  
					a_man = {1'b1, a[22:0]} >> `TWELVE;  
				end  
				`THIRTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `THIRTEEN;  
				end  
				`FOURTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `FOURTEEN;  
				end  
				`FIFTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `FIFTEEN;  
				end  
				`SIXTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `SIXTEEN;  
				end  
				`SEVENTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `SEVENTEEN;  
				end  
				`EIGHTEEN: begin  
					a_man = {1'b1, a[22:0]} >> `EIGHTEEN;  
				end  
				`NINETEEN: begin  
					a_man = {1'b1, a[22:0]} >> `NINETEEN;  
				end  
				`TWENTY: begin  
					a_man = {1'b1, a[22:0]} >> `TWENTY;  
				end  
				`TWENTYONE: begin  
					a_man = {1'b1, a[22:0]} >> `TWENTYONE;  
				end  
				`TWENTYTWO: begin  
					a_man = {1'b1, a[22:0]} >> `TWENTYTWO;  
				end  
				`TWENTYTHREE: begin  
					a_man = {1'b1, a[22:0]} >> `TWENTYTHREE;  
				end  
				`TWENTYFOUR: begin  
					a_man = {1'b1, a[22:0]} >> `TWENTYFOUR;  
				end  
				default: begin //More than twenty-four, shift by twenty-four. It is a boundary case.  
					a_man = {1'b1, a[22:0]} >> `TWENTYFOUR;  
				end  
			endcase   
				  
			b_man = {1'b1, b[22:0]};  
			a_exp = b[30:23];  
			//b_exp = b[30:23];  
			  
		end else if (a[30:23] > b[30:23]) begin  
			temp = a[30:23] - b[30:23];  
			a_man = {1'b1, a[22:0]};  
			//b_man = {1'b1, b[22:0]} >> temp; //Expand into case statement, as below.  
			case (temp)   
				`ONE: begin  
					b_man = {1'b1, b[22:0]} >> `ONE;  
				end  
				`TWO: begin  
					b_man = {1'b1, b[22:0]} >> `TWO;  
				end  
				`THREE: begin  
					b_man = {1'b1, b[22:0]} >> `THREE;  
				end  
				`FOUR: begin  
					b_man = {1'b1, b[22:0]} >> `FOUR;  
				end  
				`FIVE: begin  
					b_man = {1'b1, b[22:0]} >> `FIVE;  
				end  
				`SIX: begin  
					b_man = {1'b1, b[22:0]} >> `SIX;  
				end  
				`SEVEN: begin  
					b_man = {1'b1, b[22:0]} >> `SEVEN;  
				end  
				`EIGHT: begin  
					b_man = {1'b1, b[22:0]} >> `EIGHT;  
				end  
				`NINE: begin  
					b_man = {1'b1, b[22:0]} >> `NINE;  
				end  
				`TEN: begin  
					b_man = {1'b1, b[22:0]} >> `TEN;  
				end  
				`ELEVEN: begin  
					b_man = {1'b1, b[22:0]} >> `ELEVEN;  
				end  
				`TWELVE: begin  
					b_man = {1'b1, b[22:0]} >> `TWELVE;  
				end  
				`THIRTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `THIRTEEN;  
				end  
				`FOURTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `FOURTEEN;  
				end  
				`FIFTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `FIFTEEN;  
				end  
				`SIXTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `SIXTEEN;  
				end  
				`SEVENTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `SEVENTEEN;  
				end  
				`EIGHTEEN: begin  
					b_man = {1'b1, b[22:0]} >> `EIGHTEEN;  
				end  
				`NINETEEN: begin  
					b_man = {1'b1, b[22:0]} >> `NINETEEN;  
				end  
				`TWENTY: begin  
					b_man = {1'b1, b[22:0]} >> `TWENTY;  
				end  
				`TWENTYONE: begin  
					b_man = {1'b1, b[22:0]} >> `TWENTYONE;  
				end  
				`TWENTYTWO: begin  
					b_man = {1'b1, b[22:0]} >> `TWENTYTWO;  
				end  
				`TWENTYTHREE: begin  
					b_man = {1'b1, b[22:0]} >> `TWENTYTHREE;  
				end  
				`TWENTYFOUR: begin  
					b_man = {1'b1, b[22:0]} >> `TWENTYFOUR;  
				end  
				default: begin //More than twenty-four, shift by twenty-four. It is a boundary case.  
					b_man = {1'b1, b[22:0]} >> `TWENTYFOUR;  
				end  
			endcase   
			  
			a_exp = a[30:23];  
			//b_exp = a[30:23];  
		end else begin  
			temp = 8'b0;  
			a_man = {1'b1, a[22:0]};  
			b_man = {1'b1, b[22:0]};  
			a_exp = a[30:23];  
		end  
		  
	end  
  
	//Perform the addition operation  
	always @ (a_man or b_man or a or b) begin  
		if (a_man < b_man) begin  
			smaller = 1'b1;  
		end else begin  
			smaller = 1'b0;  
		end  
	  
		//both positive  
		if (~a[31] && ~b[31]) begin  
			sum_man = a_man + b_man;  
			sum[31] = 1'b0;  
		end   
		  
		//both negative  
		else if (a[31] && b[31]) begin  
			sum_man = a_man + b_man;  
			sum[31] = 1'b1;  
		end  
		  
		//a pos, b neg  
		else if (~a[31] && b[31]) begin  
			if (smaller) begin //a < b  
				sum_man = b_man - a_man;  
				sum[31] = 1'b1;  
			end else begin  
				sum_man = a_man - b_man;  
				sum[31] = 1'b0;  
			end  
		end  
		  
		//a neg, b pos  
		else /*if (a[31] && ~b[31])*/ begin  
			if (smaller) begin //a < b  
				sum_man = b_man - a_man;  
				sum[31] = 1'b0;  
			end else begin  
				sum_man = a_man - b_man;  
				sum[31] = 1'b1;  
			end  
		end  
	end  
	  
	//Store the number  
	// we already have the sign.  
	  
	always @ (sum_man or a_exp) begin  
		if (sum_man[24])begin //shif sum >> by 1, add 1 to the exponent.  
			sum[22:0] = sum_man[23:1];  
			sum[30:23] = a_exp + 8'b00000001;  
			  
		end else if (sum_man[23]) begin //do nothing  
			sum[22:0] = sum_man[22:0];  
			sum[30:23] = a_exp;  
			  
		end else if (sum_man[22]) begin //shift << by 1, subtract 1 from exponent. 
			sum[22:0] = {sum_man[21:0], 1'b0}; 
			sum[30:23] = a_exp - 8'b00000001; 
 
		end else if (sum_man[21]) begin //shift << by 2, subtract 2 from exponent. 
			sum[22:0] = {sum_man[20:0], 2'b0}; 
			sum[30:23] = a_exp - 8'b00000010; 
 
		end else if (sum_man[20]) begin //shift << by 3, subtract 3 from exponent. 
			sum[22:0] = {sum_man[19:0], 3'b0}; 
			sum[30:23] = a_exp - 8'b00000011; 
 
		end else if (sum_man[19]) begin //shift << by 4, subtract 4 from exponent. 
			sum[22:0] = {sum_man[18:0], 4'b0}; 
			sum[30:23] = a_exp - 8'b00000100; 
 
		end else if (sum_man[18]) begin //shift << by 5, subtract 5 from exponent. 
			sum[22:0] = {sum_man[17:0], 5'b0}; 
			sum[30:23] = a_exp - 8'b00000101; 
 
		end else if (sum_man[17]) begin //shift << by 6, subtract 6 from exponent. 
			sum[22:0] = {sum_man[16:0], 6'b0}; 
			sum[30:23] = a_exp - 8'b00000110; 
 
		end else if (sum_man[16]) begin //shift << by 7, subtract 7 from exponent. 
			sum[22:0] = {sum_man[15:0], 7'b0}; 
			sum[30:23] = a_exp - 8'b00000111; 
 
		end else if (sum_man[15]) begin //shift << by 8, subtract 8 from exponent. 
			sum[22:0] = {sum_man[14:0], 8'b0}; 
			sum[30:23] = a_exp - 8'b00001000; 
 
		end else if (sum_man[14]) begin //shift << by 9, subtract 9 from exponent. 
			sum[22:0] = {sum_man[13:0], 9'b0}; 
			sum[30:23] = a_exp - 8'b00001001; 
 
		end else if (sum_man[13]) begin //shift << by 10, subtract 10 from exponent. 
			sum[22:0] = {sum_man[12:0], 10'b0}; 
			sum[30:23] = a_exp - 8'b00001010; 
 
		end else if (sum_man[12]) begin //shift << by 11, subtract 11 from exponent. 
			sum[22:0] = {sum_man[11:0], 11'b0}; 
			sum[30:23] = a_exp - 8'b00001011; 
 
		end else if (sum_man[11]) begin //shift << by 12, subtract 12 from exponent. 
			sum[22:0] = {sum_man[10:0], 12'b0}; 
			sum[30:23] = a_exp - 8'b00001100; 
 
		end else if (sum_man[10]) begin //shift << by 13, subtract 13 from exponent. 
			sum[22:0] = {sum_man[9:0], 13'b0}; 
			sum[30:23] = a_exp - 8'b00001101; 
 
		end else if (sum_man[9]) begin //shift << by 14, subtract 14 from exponent. 
			sum[22:0] = {sum_man[8:0], 14'b0}; 
			sum[30:23] = a_exp - 8'b00001110; 
 
		end else if (sum_man[8]) begin //shift << by 15, subtract 15 from exponent. 
			sum[22:0] = {sum_man[7:0], 15'b0}; 
			sum[30:23] = a_exp - 8'b00001111; 
 
		end else if (sum_man[7]) begin //shift << by 16, subtract 16 from exponent. 
			sum[22:0] = {sum_man[6:0], 16'b0}; 
			sum[30:23] = a_exp - 8'b00010000; 
 
		end else if (sum_man[6]) begin //shift << by 17, subtract 17 from exponent. 
			sum[22:0] = {sum_man[5:0], 17'b0}; 
			sum[30:23] = a_exp - 8'b00010001; 
 
		end else if (sum_man[5]) begin //shift << by 18, subtract 18 from exponent. 
			sum[22:0] = {sum_man[4:0], 18'b0}; 
			sum[30:23] = a_exp - 8'b00010010; 
 
		end else if (sum_man[4]) begin //shift << by 19, subtract 19 from exponent. 
			sum[22:0] = {sum_man[3:0], 19'b0}; 
			sum[30:23] = a_exp - 8'b00010011; 
 
		end else if (sum_man[3]) begin //shift << by 20, subtract 20 from exponent. 
			sum[22:0] = {sum_man[2:0], 20'b0}; 
			sum[30:23] = a_exp - 8'b00010100; 
 
		end else if (sum_man[2]) begin //shift << by 21, subtract 21 from exponent. 
			sum[22:0] = {sum_man[1:0], 21'b0}; 
			sum[30:23] = a_exp - 8'b00010101; 
 
		end else if (sum_man[1]) begin //shift << by 22, subtract 22 from exponent. 
			sum[22:0] = {sum_man[0:0], 22'b0}; 
			sum[30:23] = a_exp - 8'b00010110; 
 
		end else /*if (sum_man[0])*/ begin //shift << by 23, subtract 23 from exponent. 
			sum[22:0] = 23'b0; 
			sum[30:23] = a_exp - 8'b00010111;  
		end 
		  
	end  
  
endmodule   
	
module fpu_div(clock, n, d, div);  
//n = numerator  
//d = denomenator  
//div = result  
	input clock;  
  
	input [31:0]n;  
	input [31:0]d;  
	output [31:0]div;  
	reg [31:0]div;  
	  
	//Store the mantissa and exponents separately. Introduce the latency of 1.  
	reg [7:0]n_exp;  
	reg [7:0]d_exp;  
	reg [23:0]n_man;  
	reg [23:0]d_man;  
	reg n_sign;  
	reg d_sign;  
	  
	wire [23:0]div_man;  
	reg [7:0]div_exp;  
	  
	always @ (posedge clock) begin  
		n_exp <= n[30:23];  
		d_exp <= d[30:23];  
		n_man <= {1'b1, n[22:0]};  
		d_man <= {1'b1, d[22:0]};  
		n_sign <= n[31];  
		d_sign <= d[31];  
	end  
	  
	//Find the exponent, store in div_exp.  
	always @ (n_exp or d_exp) begin  
		if (n_exp >= d_exp) begin  
			div_exp = 8'b01111111 + (n_exp - d_exp);  
		end else begin  
			div_exp = 8'b01111111 - (d_exp - n_exp);  
		end  
	end  
	  
	//Divide the mantissas, store in div_man.  
	div_24b divide(.numer(n_man), .denom(d_man), .res(div_man));  
	  
	//Store the result. Shift exponents appropriately. Store sign.  
	//Sign  
	always @ (n_sign or d_sign) begin  
		div[31] = n_sign ^ d_sign;  
	end  
	  
	//Mantissa and Exponent  
	always @ (div_man or div_exp) begin  
		if (div_man[23]) begin //do nothing  
			div[22:0] = div_man[22:0];  
			div[30:23] = div_exp;  
		  
		end else if (div_man[22]) begin //shift << by 1, subtract 1 from exponent. 
			div[22:0] = {div_man[21:0], 1'b0}; 
			div[30:23] = div_exp - 8'b00000001; 
 
		end else if (div_man[21]) begin //shift << by 2, subtract 2 from exponent. 
			div[22:0] = {div_man[20:0], 2'b0}; 
			div[30:23] = div_exp - 8'b00000010; 
 
		end else if (div_man[20]) begin //shift << by 3, subtract 3 from exponent. 
			div[22:0] = {div_man[19:0], 3'b0}; 
			div[30:23] = div_exp - 8'b00000011; 
 
		end else if (div_man[19]) begin //shift << by 4, subtract 4 from exponent. 
			div[22:0] = {div_man[18:0], 4'b0}; 
			div[30:23] = div_exp - 8'b00000100; 
 
		end else if (div_man[18]) begin //shift << by 5, subtract 5 from exponent. 
			div[22:0] = {div_man[17:0], 5'b0}; 
			div[30:23] = div_exp - 8'b00000101; 
 
		end else if (div_man[17]) begin //shift << by 6, subtract 6 from exponent. 
			div[22:0] = {div_man[16:0], 6'b0}; 
			div[30:23] = div_exp - 8'b00000110; 
 
		end else if (div_man[16]) begin //shift << by 7, subtract 7 from exponent. 
			div[22:0] = {div_man[15:0], 7'b0}; 
			div[30:23] = div_exp - 8'b00000111; 
 
		end else if (div_man[15]) begin //shift << by 8, subtract 8 from exponent. 
			div[22:0] = {div_man[14:0], 8'b0}; 
			div[30:23] = div_exp - 8'b00001000; 
 
		end else if (div_man[14]) begin //shift << by 9, subtract 9 from exponent. 
			div[22:0] = {div_man[13:0], 9'b0}; 
			div[30:23] = div_exp - 8'b00001001; 
 
		end else if (div_man[13]) begin //shift << by 10, subtract 10 from exponent. 
			div[22:0] = {div_man[12:0], 10'b0}; 
			div[30:23] = div_exp - 8'b00001010; 
 
		end else if (div_man[12]) begin //shift << by 11, subtract 11 from exponent. 
			div[22:0] = {div_man[11:0], 11'b0}; 
			div[30:23] = div_exp - 8'b00001011; 
 
		end else if (div_man[11]) begin //shift << by 12, subtract 12 from exponent. 
			div[22:0] = {div_man[10:0], 12'b0}; 
			div[30:23] = div_exp - 8'b00001100; 
 
		end else if (div_man[10]) begin //shift << by 13, subtract 13 from exponent. 
			div[22:0] = {div_man[9:0], 13'b0}; 
			div[30:23] = div_exp - 8'b00001101; 
 
		end else if (div_man[9]) begin //shift << by 14, subtract 14 from exponent. 
			div[22:0] = {div_man[8:0], 14'b0}; 
			div[30:23] = div_exp - 8'b00001110; 
 
		end else if (div_man[8]) begin //shift << by 15, subtract 15 from exponent. 
			div[22:0] = {div_man[7:0], 15'b0}; 
			div[30:23] = div_exp - 8'b00001111; 
 
		end else if (div_man[7]) begin //shift << by 16, subtract 16 from exponent. 
			div[22:0] = {div_man[6:0], 16'b0}; 
			div[30:23] = div_exp - 8'b00010000; 
 
		end else if (div_man[6]) begin //shift << by 17, subtract 17 from exponent. 
			div[22:0] = {div_man[5:0], 17'b0}; 
			div[30:23] = div_exp - 8'b00010001; 
 
		end else if (div_man[5]) begin //shift << by 18, subtract 18 from exponent. 
			div[22:0] = {div_man[4:0], 18'b0}; 
			div[30:23] = div_exp - 8'b00010010; 
 
		end else if (div_man[4]) begin //shift << by 19, subtract 19 from exponent. 
			div[22:0] = {div_man[3:0], 19'b0}; 
			div[30:23] = div_exp - 8'b00010011; 
 
		end else if (div_man[3]) begin //shift << by 20, subtract 20 from exponent. 
			div[22:0] = {div_man[2:0], 20'b0}; 
			div[30:23] = div_exp - 8'b00010100; 
 
		end else if (div_man[2]) begin //shift << by 21, subtract 21 from exponent. 
			div[22:0] = {div_man[1:0], 21'b0}; 
			div[30:23] = div_exp - 8'b00010101; 
 
		end else if (div_man[1]) begin //shift << by 22, subtract 22 from exponent. 
			div[22:0] = {div_man[0:0], 22'b0}; 
			div[30:23] = div_exp - 8'b00010110; 
  
		end else /*if (div_man[0])*/ begin //shift << by 23, subtract 23 from exponent. 
			div[22:0] = 23'b0; 
			div[30:23] = div_exp - 8'b00010111; 
		end 
  
	end  
	  
endmodule   
  
  
  
  
  
module div_24b(numer, denom, res);  
	//input clock;  
  
	input [23:0]numer;  
	input [23:0]denom;  
	output [23:0]res;  
	reg [23:0]res;  
	  
	//Pad with 23 zeros.  
	wire [46:0]denom_pad;  
	wire [46:0]numer23; 
	reg [46:0]numer22; 
	reg [46:0]numer21; 
	reg [46:0]numer20; 
	reg [46:0]numer19; 
	reg [46:0]numer18; 
	reg [46:0]numer17; 
	reg [46:0]numer16; 
	reg [46:0]numer15; 
	reg [46:0]numer14; 
	reg [46:0]numer13; 
	reg [46:0]numer12; 
	reg [46:0]numer11; 
	reg [46:0]numer10; 
	reg [46:0]numer9; 
	reg [46:0]numer8; 
	reg [46:0]numer7; 
	reg [46:0]numer6; 
	reg [46:0]numer5; 
	reg [46:0]numer4; 
	reg [46:0]numer3; 
	reg [46:0]numer2; 
	reg [46:0]numer1;  
	reg [46:0]numer0;  
	  
	//always @ (posedge clock) begin  
	assign denom_pad = {23'b0, denom};  
	assign numer23 = {numer, 23'b0};  
	// end  
	  
	//res[23]  
	always @ (denom_pad or numer23) begin  
	  
		if (denom_pad[23:0] <= numer23[46:23]) begin 
			res[23] = 1'b1; 
			numer22 = {numer23[46:23] - denom_pad[23:0], 23'b0}; 
		end else begin 
			res[23] = 1'b0; 
			numer22 = numer23; 
		end 
 
		if (denom_pad[24:0] <= numer22[46:22]) begin 
			res[22] = 1'b1; 
			numer21 = {numer22[46:22] - denom_pad[24:0], 22'b0}; 
		end else begin 
			res[22] = 1'b0; 
			numer21 = numer22; 
		end 
 
		if (denom_pad[25:0] <= numer21[46:21]) begin 
			res[21] = 1'b1; 
			numer20 = {numer21[46:21] - denom_pad[25:0], 21'b0}; 
		end else begin 
			res[21] = 1'b0; 
			numer20 = numer21; 
		end 
 
		if (denom_pad[26:0] <= numer20[46:20]) begin 
			res[20] = 1'b1; 
			numer19 = {numer20[46:20] - denom_pad[26:0], 20'b0}; 
		end else begin 
			res[20] = 1'b0; 
			numer19 = numer20; 
		end 
 
		if (denom_pad[27:0] <= numer19[46:19]) begin 
			res[19] = 1'b1; 
			numer18 = {numer19[46:19] - denom_pad[27:0], 19'b0}; 
		end else begin 
			res[19] = 1'b0; 
			numer18 = numer19; 
		end 
 
		if (denom_pad[28:0] <= numer18[46:18]) begin 
			res[18] = 1'b1; 
			numer17 = {numer18[46:18] - denom_pad[28:0], 18'b0}; 
		end else begin 
			res[18] = 1'b0; 
			numer17 = numer18; 
		end 
 
		if (denom_pad[29:0] <= numer17[46:17]) begin 
			res[17] = 1'b1; 
			numer16 = {numer17[46:17] - denom_pad[29:0], 17'b0}; 
		end else begin 
			res[17] = 1'b0; 
			numer16 = numer17; 
		end 
 
		if (denom_pad[30:0] <= numer16[46:16]) begin 
			res[16] = 1'b1; 
			numer15 = {numer16[46:16] - denom_pad[30:0], 16'b0}; 
		end else begin 
			res[16] = 1'b0; 
			numer15 = numer16; 
		end 
 
		if (denom_pad[31:0] <= numer15[46:15]) begin 
			res[15] = 1'b1; 
			numer14 = {numer15[46:15] - denom_pad[31:0], 15'b0}; 
		end else begin 
			res[15] = 1'b0; 
			numer14 = numer15; 
		end 
 
		if (denom_pad[32:0] <= numer14[46:14]) begin 
			res[14] = 1'b1; 
			numer13 = {numer14[46:14] - denom_pad[32:0], 14'b0}; 
		end else begin 
			res[14] = 1'b0; 
			numer13 = numer14; 
		end 
 
		if (denom_pad[33:0] <= numer13[46:13]) begin 
			res[13] = 1'b1; 
			numer12 = {numer13[46:13] - denom_pad[33:0], 13'b0}; 
		end else begin 
			res[13] = 1'b0; 
			numer12 = numer13; 
		end 
 
		if (denom_pad[34:0] <= numer12[46:12]) begin 
			res[12] = 1'b1; 
			numer11 = {numer12[46:12] - denom_pad[34:0], 12'b0}; 
		end else begin 
			res[12] = 1'b0; 
			numer11 = numer12; 
		end 
 
		if (denom_pad[35:0] <= numer11[46:11]) begin 
			res[11] = 1'b1; 
			numer10 = {numer11[46:11] - denom_pad[35:0], 11'b0}; 
		end else begin 
			res[11] = 1'b0; 
			numer10 = numer11; 
		end 
 
		if (denom_pad[36:0] <= numer10[46:10]) begin 
			res[10] = 1'b1; 
			numer9 = {numer10[46:10] - denom_pad[36:0], 10'b0}; 
		end else begin 
			res[10] = 1'b0; 
			numer9 = numer10; 
		end 
 
		if (denom_pad[37:0] <= numer9[46:9]) begin 
			res[9] = 1'b1; 
			numer8 = {numer9[46:9] - denom_pad[37:0], 9'b0}; 
		end else begin 
			res[9] = 1'b0; 
			numer8 = numer9; 
		end 
 
		if (denom_pad[38:0] <= numer8[46:8]) begin 
			res[8] = 1'b1; 
			numer7 = {numer8[46:8] - denom_pad[38:0], 8'b0}; 
		end else begin 
			res[8] = 1'b0; 
			numer7 = numer8; 
		end 
 
		if (denom_pad[39:0] <= numer7[46:7]) begin 
			res[7] = 1'b1; 
			numer6 = {numer7[46:7] - denom_pad[39:0], 7'b0}; 
		end else begin 
			res[7] = 1'b0; 
			numer6 = numer7; 
		end 
 
		if (denom_pad[40:0] <= numer6[46:6]) begin 
			res[6] = 1'b1; 
			numer5 = {numer6[46:6] - denom_pad[40:0], 6'b0}; 
		end else begin 
			res[6] = 1'b0; 
			numer5 = numer6; 
		end 
 
		if (denom_pad[41:0] <= numer5[46:5]) begin 
			res[5] = 1'b1; 
			numer4 = {numer5[46:5] - denom_pad[41:0], 5'b0}; 
		end else begin 
			res[5] = 1'b0; 
			numer4 = numer5; 
		end 
 
		if (denom_pad[42:0] <= numer4[46:4]) begin 
			res[4] = 1'b1; 
			numer3 = {numer4[46:4] - denom_pad[42:0], 4'b0}; 
		end else begin 
			res[4] = 1'b0; 
			numer3 = numer4; 
		end 
 
		if (denom_pad[43:0] <= numer3[46:3]) begin 
			res[3] = 1'b1; 
			numer2 = {numer3[46:3] - denom_pad[43:0], 3'b0}; 
		end else begin 
			res[3] = 1'b0; 
			numer2 = numer3; 
		end 
 
		if (denom_pad[44:0] <= numer2[46:2]) begin 
			res[2] = 1'b1; 
			numer1 = {numer2[46:2] - denom_pad[44:0], 2'b0}; 
		end else begin 
			res[2] = 1'b0; 
			numer1 = numer2; 
		end 
 
		if (denom_pad[45:0] <= numer1[46:1]) begin 
			res[1] = 1'b1; 
			numer0 = {numer1[46:1] - denom_pad[45:0], 1'b0}; 
		end else begin 
			res[1] = 1'b0; 
			numer0 = numer1; 
		end 
  
		if (denom_pad <= numer0) begin  
			res[0] = 1'b1;  
		end else begin  
			res[0] = 1'b0;  
		end 
  
	end  
	  
endmodule   


//////////////////////////////////////////////  
//   
// constants.v  
//  
// Version 1.3  
// Written 7/11/01 David_Harris@hmc.edu & Mark_Phair@hmc.edu  
// Modifed 8/20/01 Mark_Phair@hmc.edu and Justin_Schauer@hmc.edu  
//  
// A set of constants for a parameterized floating point multiplier and adder.  
//  
//////////////////////////////////////////////  
  
//////////////////////////////////////////////  
// FREE VARIABLES  
//////////////////////////////////////////////  
  
// Widths of Fields  
`define WEXP	8  
`define WSIG	23  
`define WFLAG	5  
`define WCONTROL 5  
  
// output flag select (flags[x])  
`define DIVZERO 	0  
`define INVALID 	1  
`define INEXACT 	2  
`define OVERFLOW 	3  
`define UNDERFLOW	4  
  
//////////////////////////////////////////////  
// DEPENDENT VARIABLES  
//////////////////////////////////////////////  
  
`define WIDTH 		32 	//(`WEXP + `WSIG + 1)  
`define PRODWIDTH	48 	//(2 * (`WSIG + 1))  
`define SHIFTWIDTH	96 	//(2 * `PRODWIDTH))  
`define WPRENORM	24	// `WSIG + 1  
`define WEXPSUM		10	// `WEXP + 2  
`define BIAS		127	// (2^(`WEXP)) - 1  
`define WSIGMINUS1	22	// `WSIG - 1, used for rounding  
`define WSHIFTAMT	5	// log2(`WSIG + 1) rounded up  
  
// for trapped over/underflow  
`define UNDERBIAS	192	// 3 * 2 ^ (`WEXP -2)  
`define OVERBIAS	-192	// -`UNDERBIAS  
  
// specialized constants for fpadd  
`define	EXTRASIG	25		// `WSIG+2 this is the amount of precision needed so no  
					// subtraction errors occur  
`define	SHIFT		5		// # bits the max alignment shift will fit in (log2(`WSIG+2)  
					// rounded up to nearest int)  
`define	MAX_EXP		8'b11111110	// the maximum non-infinite exponent,  
					// `WEXP bits, the most significant  
					// `WEXP-1 bits are 1, the LSB is 0  
`define	INF_EXP		8'b11111111	// Infinity exponent, `WEXP bits, all 1  
// Max significand, `WSIG bits, all 1  
`define	MAX_SIG		23'b11111111111111111111111  
`define	WEXP_0		8'b0		// Exponent equals `WEXP'b0  
`define	WEXP_1		8'b1		// Exponent equals one `WEXP'b1  
`define	WSIG_0		23'b0		// Significand equals zero `WSIG'b0  
`define	WSIG_1		23'b1		// Significand equals one `WSIG'b1  
`define	EXTRASIG_0	25'b0		// All result bits for adder zero `EXTRASIG'b0  
  
// specialized constants for fpmul  
`define	MAXSHIFT	24		// `WSIG + 1  
  
// GENERAL SPECIAL NUMBERS - Exp + Significand of special numbers  
// plain NaN `WIDTH-1, all 1  
`define CONSTNAN	{9'b111111111,22'b0}  
// zero `WIDTH-1, all 0  
`define CONSTZERO	31'b0  
// infinity `WEXP all 1, `WSIG all 0  
`define CONSTINFINITY	{8'b11111111, 23'b0}  
// largest number maximum exponent(all 1's - 1) and maximum significand (all 1's)  
`define CONSTLARGEST	{`MAX_EXP, `MAX_SIG}  
`define PRESHIFTZEROS  48'b0 // `PRODWIDTH'b0  
  
//////////////////////////////////////////////  
//   
// fpmul.v  
//  
// Version 1.6  
// Written 07/11/01 David_Harris@hmc.edu & Mark_Phair@hmc.edu  
// Modifed 08/20/01 Mark_Phair@hmc.edu  
//  
// A parameterized floating point multiplier.  
//  
// BLOCK DESCRIPTIONS  
//  
// preprocess 	- general processing, such as zero detection, computing sign, NaN  
//  
// prenorm	- normalize denorms  
//  
// exponent	- sum the exponents, check for tininess before rounding  
//  
// multiply	- multiply the mantissae  
//  
// special	- calculate special cases, such as NaN and infinities  
//  
// shift	- shift the sig and exp if nesc.  
//  
// round	- round product  
//  
// normalize	- normalizes the result if appropriate (i.e. not a denormalized #)  
//  
// flag 	- general flag processing  
//  
// assemble	- assemble results  
//  
//////////////////////////////////////////////  
  
//////////////////////////////////////////////  
// Includes  
//////////////////////////////////////////////  
  
  
  
//////////////////////////////////////////////  
// fpmul module  
//////////////////////////////////////////////  
  
module fpmul(clk, a, b, y_out, control, flags) ; 
	  
	input clk;  
	 
  // external signals 
  input	 [`WIDTH-1:0] 	a, b;		// floating-point inputs 
  output [`WIDTH-1:0] 	y_out;		// floating-point product 
  reg [`WIDTH-1:0] y_out; 
  input  [1:0] control;	// control including rounding mode 
  output [`WFLAG-1:0]	flags;		// DIVZERO, INVALID, INEXACT,  
					// OVERFLOW, UNDERFLOW (defined in constant.v) 
 
	//intermediate y_out 
	wire [`WIDTH-1:0]y; 
					 
  // internal signals 
  wire			multsign;	// sign of product 
  wire			specialsign;	// sign of special 
 
  wire  [`WSIG:0] 	norma;		// normal-form mantissa a, 1 bit larger to hold leading 1 
  wire  [`WSIG:0] 	normb;		// normal-form mantissa b, 1 bit larger to hold leading 1 
 
  wire	[`WEXPSUM-1:0]	expa, expb;	// the two exponents, after prenormalization 
  wire	[`WEXPSUM-1:0] 	expsum;		// sum of exponents (two's complement) 
  wire	[`WEXPSUM-1:0] 	shiftexp;	// shifted exponent 
  wire	[`WEXP-1:0] 	roundexp;	// rounded, correct exponent 
 
  wire	[`PRODWIDTH-1:0] prod;		// product of mantissae 
  wire	[`PRODWIDTH-1:0] normalized;	// Normalized product 
  wire	[`SHIFTWIDTH-1:0] shiftprod;	// shifted product 
  wire	[`WSIG-1:0]	roundprod;	// rounded product 
  wire	[`WIDTH-2:0]	special;	// special case exponent and product 
 
  wire			twoormore;	// product is outside range [1,2) 
  wire			zero;		// zero detected 
  wire			infinity;	// infinity detected 
  wire			aisnan;		// NaN detected in A 
  wire			bisnan;		// NaN detected in B 
  wire			aisdenorm;	// Denormalized number detected in A 
  wire			bisdenorm;	// Denormalized number detected in B 
  wire			specialcase;	// This is a special case 
  wire			specialsigncase; // Use the special case sign 
  wire			roundoverflow;	// overflow in rounding, need to add 1 to exponent 
  wire			invalid;	// invalid operation 
  wire			overflow;	// exponent result too high, standard overflow 
  wire			inexact;	// inexact flag 
  wire			shiftloss;	// lost digits due to a shift, result inaccurate 
  wire	[1:0]		roundmode;	// rounding mode information extracted from control field   
  wire			tiny;		// Result is tiny (denormalized #) after multiplication 
  wire			stilltiny;	// Result is tiny (denormalized #) after rounding 
  wire			denormround;	// rounding occured only because the initial result was 
					//	a denormalized number. This is used to determine 
					//	underflow in cases of denormalized numbers rounding 
					//	up to normalized numbers 
 
  preprocess	preprocesser(a, b, zero, aisnan, bisnan,  
				aisdenorm, bisdenorm, infinity,  
				control, roundmode, sign);   
 
  special	specialer(a, b, special, specialsign, zero,  
				aisnan, bisnan,  
				infinity, invalid,  
				specialcase, specialsigncase); 
   
  prenorm	prenormer(a[`WIDTH-2:0], b[`WIDTH-2:0], norma, normb, expa, expb, aisdenorm, bisdenorm); 
 
  multiply_a	multiplier(norma, normb, prod, twoormore); 
 
  exponent	exponenter(expa, expb, expsum, twoormore, tiny); 
 
  normalize	normalizer(prod, normalized, tiny, twoormore);   
 
  shift		shifter(normalized, expsum, shiftprod,  
			shiftexp, shiftloss); 
 
  round		rounder(shiftprod, shiftexp, shiftloss, 
			roundprod, roundexp,  
			roundmode, sign, tiny, inexact,  
			overflow, stilltiny, denormround); 
 
		// *** To check for tininess before rounding, use tiny 
		//	To check after rounding, use stilltiny 
		// *** for underflow detect: 
		//	To check for inexact result use (inexact | (shiftloss & stilltiny)),  
		//	To check for denormilization loss use (shiftloss & stilltiny) 
//  flag		flager(invalid, overflow, inexact | shiftloss,  
//			shiftloss | inexact, 
//			/* tiny */ (stilltiny | (tiny & denormround)),  
//			specialcase, flags);  
			  
  //ODIN cannot have operations in module instantiations.  
  wire inexact_or_shiftloss;  
  assign inexact_or_shiftloss = inexact | shiftloss;  
  wire shiftloss_or_inexact;  
  assign shiftloss_or_inexact = shiftloss | inexact;  
  wire still_tiny_or_tiny_and_denormround;  
  assign still_tiny_or_tiny_and_denormround = stilltiny | (tiny & denormround);  
  
    flag		flager(invalid, overflow, inexact_or_shiftloss,  
			shiftloss_or_inexact, 
			/* tiny */ stilltiny_or_tiny_and_denormround,  
			specialcase, flags);  
	  
 
  assemble	assembler(roundprod, special, y,  
			sign, specialsign, roundexp,  
			specialcase, specialsigncase, 
			roundmode, flags[`OVERFLOW]); 
			 
	always @ (posedge clk) begin 
		y_out <= y; 
	end 
 
endmodule  
 
  
 
 
module preprocess(a, b, zero, aisnan, bisnan, aisdenorm, bisdenorm, infinity, control, roundmode, sign); 
 
  // external signals 
  input	[`WIDTH-1:0] 	a, b;		// floating-point inputs 
  output 		zero;		// is there a zero? 
  //input	[`WCONTROL-1:0]	control;	// control field  
  input [1:0] control; //the rest is unused, not necessary for ODIN. 
  output [1:0]		roundmode;	// 00 = RN; 01 = RZ; 10 = RP; 11 = RM  
  output		aisnan;		// NaN detected in A 
  output		bisnan;		// NaN detected in B 
  output		aisdenorm;	// denormalized number detected in A 
  output		bisdenorm;	// denormalized number detected in B 
  output		infinity;	// infinity detected in A 
  output		sign;		// sign of product 
 
  // internal signals 
  wire			signa, signb;	// sign of a and b 
  wire [`WEXP-1:0]	expa, expb;	// the exponents of a and b 
  wire [`WSIG-1:0]	siga, sigb;	// the significands of a and b 
  wire			aexpfull;	// the exponent of a is all 1's 
  wire			bexpfull;	// the exponent of b is all 1's 
  wire			aexpzero;	// the exponent of a is all 0's 
  wire			bexpzero;	// the exponent of b is all 0's 
  wire			asigzero;	// the significand of a is all 0's 
  wire			bsigzero;	// the significand of b is all 0's 
 
  // Sign calculation 
  assign signa 		= a[`WIDTH-1]; 
  assign signb 		= b[`WIDTH-1]; 
  assign sign = signa ^ signb; 
 
  // Significand calcuations 
 
  assign siga		= a[`WSIG-1:0]; 
  assign sigb		= b[`WSIG-1:0]; 
  // Are the significands all 0's? 
  assign asigzero	= ~|siga; 
  assign bsigzero	= ~|sigb; 
 
  // Exponent calculations 
 
  assign expa		= a[`WIDTH-2:`WIDTH-`WEXP-1]; 
  assign expb		= b[`WIDTH-2:`WIDTH-`WEXP-1]; 
  // Are the exponents all 0's? 
  assign aexpzero	= ~|expa; 
  assign bexpzero	= ~|expb; 
  // Are the exponents all 1's? 
  assign aexpfull	= &expa; 
  assign bexpfull	= &expb; 
 
  // General calculations 
 
  // Zero Detect 
  assign zero 		= (aexpzero & asigzero) | (bexpzero & bsigzero); 
 
  // NaN detect 
  assign aisnan		= aexpfull & ~asigzero; 
  assign bisnan		= bexpfull & ~bsigzero; 
 
  // Infinity detect 
  assign infinity	= (aexpfull & asigzero) | (bexpfull & bsigzero); 
 
  // Denorm detect 
  assign aisdenorm	= aexpzero & ~asigzero; 
  assign bisdenorm	= bexpzero & ~bsigzero; 
 
  // Round mode extraction 
  assign roundmode	= control[1:0]; 
 
endmodule  
 
module special (a, b, special, specialsign,  
		zero, aisnan, bisnan, infinity,  
		invalid, specialcase, specialsigncase); 
 
  // external signals 
  input	[`WIDTH-1:0] 	a, b;		// floating-point inputs 
  output [`WIDTH-2:0]	special;	// special case output, exp + sig 
  output		specialsign;	// the special-case sign 
  input			zero;		// is there a zero? 
  input			aisnan;		// NaN detected in A 
  input			bisnan;		// NaN detected in B 
  input			infinity;	// infinity detected 
  output		invalid;	// invalid operation 
  output		specialcase;	// this is a special case 
  output		specialsigncase; // Use the special sign 
 
  // internal signals 
  wire			infandzero;	// infinity and zero detected 
  wire	[`WIDTH-2:0]	highernan;	// holds inputed NaN, the higher if two are input, 
					// and dont care if neither a nor b are NaNs 
  wire			aishighernan;	// a is the higher NaN 
 
  assign infandzero	= (infinity & zero); 
 
  //#######SPECIAL ASSIGNMENT###### 
  // #######return higher NaN########## 
  // Use this block if you want to return the higher of two NaNs 
 
  assign aishighernan = (aisnan & ((a[`WSIG-1:0] >= b[`WSIG-1:0]) | ~bisnan)); 
 
  assign highernan[`WIDTH-2:0] = aishighernan ? a[`WIDTH-2:0] : b[`WIDTH-2:0]; 
 
  assign special[`WIDTH-2:0] = (aisnan | bisnan) ? (highernan[`WIDTH-2:0]) :  
			(zero ?  
			(infinity ? (`CONSTNAN) : (`CONSTZERO)) : (`CONSTINFINITY)); 
  // #######return first NaN########## 
  // Use this block to return the first NaN encountered 
//  assign special	= aisnan ? (a[`WIDTH-2:0]) :  
//			(bisnan ? (b[`WIDTH-2:0]) :  
//			(zero ?  
//			(infinity ? (`CONSTNAN) : (`CONSTZERO)) : (`CONSTINFINITY))); 
  //######END SPECIAL ASSIGNMENT####### 
 
  assign specialcase	= zero | aisnan | bisnan | infinity; 
 
  assign invalid	= infandzero; //*** need to include something about signaling NaNs here 
 
  // dont need to check if b is NaN, if it defaults to that point, and b isnt NAN 
  // then it wont be used anyway 
  assign specialsign	= infandzero ? (1'b1) : (aishighernan ? a[`WIDTH-1] : b[`WIDTH-1]); 
 
  assign specialsigncase = infandzero | aisnan | bisnan; 
 
endmodule   
 
module prenorm(a, b, norma, normb, modexpa, modexpb, aisdenorm, bisdenorm); 
 
  //input	[`WIDTH-1:0]	a, b;			// the input floating point numbers  
  input [`WIDTH-2:0] a, b;  //We don't need bit 31 here, unused in ODIN. 
  output [`WSIG:0]	norma, normb;		// the mantissae in normal form 
  output [`WEXPSUM-1:0]	modexpa, modexpb;	// the output exponents, larger to accomodate 
						//	two's complement form 
  input			aisdenorm;		// a is a denormalized number 
  input			bisdenorm;		// b is a denormalized nubmer 
 
  // internal signals 
  wire	[`WEXPSUM-1:0]	expa, expb;		// exponents in two's complement form 
						//	are negative if shifted for a 
						// 	denormalized number 
  wire	[`SHIFT-1:0]	shifta, shiftb; 	// the shift amounts 
  reg [`WSIG:0]	shifteda, shiftedb;	// the shifted significands, used to be wire, changed for ODIN. 
 
  // pull out the exponents 
  assign expa 	= a[`WIDTH-2:`WIDTH-1-`WEXP]; 
  assign expb 	= b[`WIDTH-2:`WIDTH-1-`WEXP]; 
 
  // when breaking appart for paramaterizing: 
  // ### RUN ./prenormshift.pl wsig_in ### 
assign shifta = a[23 - 1] ? 1 : 
                 a[23 - 2] ? 2 : 
                 a[23 - 3] ? 3 : 
                 a[23 - 4] ? 4 : 
                 a[23 - 5] ? 5 : 
                 a[23 - 6] ? 6 : 
                 a[23 - 7] ? 7 : 
                 a[23 - 8] ? 8 : 
                 a[23 - 9] ? 9 : 
                 a[23 - 10] ? 10 : 
                 a[23 - 11] ? 11 : 
                 a[23 - 12] ? 12 : 
                 a[23 - 13] ? 13 : 
                 a[23 - 14] ? 14 : 
                 a[23 - 15] ? 15 : 
                 a[23 - 16] ? 16 : 
                 a[23 - 17] ? 17 : 
                 a[23 - 18] ? 18 : 
                 a[23 - 19] ? 19 : 
                 a[23 - 20] ? 20 : 
                 a[23 - 21] ? 21 : 
                 a[23 - 22] ? 22 : 
                 23; // dont need to check last bit 
// if the second to last isn't 1, then the last one must be 
 
assign shiftb = b[23 - 1] ? 1 : 
                 b[23 - 2] ? 2 : 
                 b[23 - 3] ? 3 : 
                 b[23 - 4] ? 4 : 
                 b[23 - 5] ? 5 : 
                 b[23 - 6] ? 6 : 
                 b[23 - 7] ? 7 : 
                 b[23 - 8] ? 8 : 
                 b[23 - 9] ? 9 : 
                 b[23 - 10] ? 10 : 
                 b[23 - 11] ? 11 : 
                 b[23 - 12] ? 12 : 
                 b[23 - 13] ? 13 : 
                 b[23 - 14] ? 14 : 
                 b[23 - 15] ? 15 : 
                 b[23 - 16] ? 16 : 
                 b[23 - 17] ? 17 : 
                 b[23 - 18] ? 18 : 
                 b[23 - 19] ? 19 : 
                 b[23 - 20] ? 20 : 
                 b[23 - 21] ? 21 : 
                 b[23 - 22] ? 22 : 
                 23; // dont need to check last bit 
// if the second to last isn't 1, then the last one must be 
 
 
 
  // If number is a denorm, the exponent must be  
  //	decremented by the shift amount 
  assign modexpa = aisdenorm ? 1 - shifta : expa;  
  assign modexpb = bisdenorm ? 1 - shiftb : expb;  
 
  // If number is denorm, shift the significand the appropriate amount 
//  assign shifteda = a[`WSIG-1:0] << shifta;  
	//Must have constant shifts for ODIN  
	always @ (shifta or a) begin  
		case (shifta)   
			5'b00001: begin  
				shifteda = a[`WSIG-1:0] << 5'b00001; 
			end 
 
			5'b00010: begin  
				shifteda = a[`WSIG-1:0] << 5'b00010; 
			end 
 
			5'b00011: begin  
				shifteda = a[`WSIG-1:0] << 5'b00011; 
			end 
 
			5'b00100: begin  
				shifteda = a[`WSIG-1:0] << 5'b00100; 
			end 
 
			5'b00101: begin  
				shifteda = a[`WSIG-1:0] << 5'b00101; 
			end 
 
			5'b00110: begin  
				shifteda = a[`WSIG-1:0] << 5'b00110; 
			end 
 
			5'b00111: begin  
				shifteda = a[`WSIG-1:0] << 5'b00111; 
			end 
 
			5'b01000: begin  
				shifteda = a[`WSIG-1:0] << 5'b01000; 
			end 
 
			5'b01001: begin  
				shifteda = a[`WSIG-1:0] << 5'b01001; 
			end 
 
			5'b01010: begin  
				shifteda = a[`WSIG-1:0] << 5'b01010; 
			end 
 
			5'b01011: begin  
				shifteda = a[`WSIG-1:0] << 5'b01011; 
			end 
 
			5'b01100: begin  
				shifteda = a[`WSIG-1:0] << 5'b01100; 
			end 
 
			5'b01101: begin  
				shifteda = a[`WSIG-1:0] << 5'b01101; 
			end 
 
			5'b01110: begin  
				shifteda = a[`WSIG-1:0] << 5'b01110; 
			end 
 
			5'b01111: begin  
				shifteda = a[`WSIG-1:0] << 5'b01111; 
			end 
 
			5'b10000: begin  
				shifteda = a[`WSIG-1:0] << 5'b10000; 
			end 
 
			5'b10001: begin  
				shifteda = a[`WSIG-1:0] << 5'b10001; 
			end 
 
			5'b10010: begin  
				shifteda = a[`WSIG-1:0] << 5'b10010; 
			end 
 
			5'b10011: begin  
				shifteda = a[`WSIG-1:0] << 5'b10011; 
			end 
 
			5'b10100: begin  
				shifteda = a[`WSIG-1:0] << 5'b10100; 
			end 
 
			5'b10101: begin  
				shifteda = a[`WSIG-1:0] << 5'b10101; 
			end 
 
			5'b10110: begin  
				shifteda = a[`WSIG-1:0] << 5'b10110; 
			end 
 
			5'b10111: begin  
				shifteda = a[`WSIG-1:0] << 5'b10111; 
			end 
 
			default: begin //Won't be higher than 23.  
				shifteda = a[`WSIG-1:0];  
			end  
		endcase  
	end  
 
  assign norma 	= aisdenorm ? shifteda : {1'b1, a[`WSIG-1:0]}; 
 
 // assign shiftedb = b[`WSIG-1:0] << shiftb;  
	always @ (shiftb or b) begin  
		case (shiftb)   
			5'b00001: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00001; 
			end 
 
			5'b00010: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00010; 
			end 
 
			5'b00011: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00011; 
			end 
 
			5'b00100: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00100; 
			end 
 
			5'b00101: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00101; 
			end 
 
			5'b00110: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00110; 
			end 
 
			5'b00111: begin  
				shiftedb = b[`WSIG-1:0] << 5'b00111; 
			end 
 
			5'b01000: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01000; 
			end 
 
			5'b01001: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01001; 
			end 
 
			5'b01010: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01010; 
			end 
 
			5'b01011: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01011; 
			end 
 
			5'b01100: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01100; 
			end 
 
			5'b01101: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01101; 
			end 
 
			5'b01110: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01110; 
			end 
 
			5'b01111: begin  
				shiftedb = b[`WSIG-1:0] << 5'b01111; 
			end 
 
			5'b10000: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10000; 
			end 
 
			5'b10001: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10001; 
			end 
 
			5'b10010: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10010; 
			end 
 
			5'b10011: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10011; 
			end 
 
			5'b10100: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10100; 
			end 
 
			5'b10101: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10101; 
			end 
 
			5'b10110: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10110; 
			end 
 
			5'b10111: begin  
				shiftedb = b[`WSIG-1:0] << 5'b10111; 
			end 
  
			default: begin // Won't be higher than 23.  
				shiftedb = b[`WSIG-1:0];  
			end  
		endcase  
	end 
		  
  
  assign normb 	= bisdenorm ? shiftedb : {1'b1, b[`WSIG-1:0]}; 
 
endmodule  
 
module multiply_a (norma, normb, prod, twoormore); 
 
  input	 [`WSIG:0]		norma, normb;	// normalized mantissae 
 
  output [`PRODWIDTH-1:0] 	prod;		// product of mantissae 
  output			twoormore;	// Product overflowed range [1,2) 
 
  // multiplier array  
  //	(*** need a more effecient multiplier,  
  //	designware might work, though) 
  assign prod		= norma * normb; 
 
  // did the multiply overflow the range [1,2)? 
  assign twoormore	= prod[`PRODWIDTH-1]; 
 
endmodule   
 
 
  
module exponent(expa, expb, expsum, twoormore, tiny); 
 
  input	[`WEXPSUM-1:0]	expa, expb;	// the input exponents in 2's complement form 
					//	to accomodate denorms that have been 
					//	prenormalized 
  input			twoormore;	// product is outside range [1,2) 
 
  output [`WEXPSUM-1:0]	expsum;		// the sum of the exponents 
  output		tiny;		// Result is tiny (denormalized #) 
 
  // Sum the exponents, subtract the bias 
  // 	and add 1 (twoormore) if multiply went out of [1,2) range 
  assign expsum = expa + expb - `BIAS + twoormore; 
 
  // The result is tiny if the exponent is less than 1. 
  //	Because the exponent sum is in 2's-complement form, 
  //	it is negative if the first bit is 1, and zero if 
  //    all the bits are zero 
  assign tiny	= ~|expsum[`WEXPSUM-2:0] | expsum[`WEXPSUM-1]; 
 
 
endmodule  
  
 
 
  
module normalize(prod, normalized, tiny, twoormore); 
 
  // external signals 
  input  [`PRODWIDTH-1:0]	prod;		// Product of multiplication 
  output [`PRODWIDTH-1:0]	normalized;	// Normalized product 
  input				tiny;		// Result is tiny (denormalized #) 
  input				twoormore;	// Product overflowed range [1,2) 
 
  // normalize product if appropriate 
  //	There are three possible cases here: 
  //	1) tiny and prod overfl. [1,2)	-> take the whole prod, including the leading 1 
  //	2) tiny or prod overfl. [1,2)	-> dont take the first bit. its zero if its tiny, 
  //				 		and it's the implied 1 if its not 
  //	3) neither tiny nor prod overfl.-> dont take the first 2 bits, the 2nd one is the 
  //						implied 1 
  assign normalized = (tiny & twoormore) ? prod[`PRODWIDTH-1:0] : 
			((tiny ^ twoormore) ? {prod[`PRODWIDTH-2:0],1'b0} : 
			{prod[`PRODWIDTH-3:0],2'b0}); 
 
endmodule   
 
module shift(normalized, selectedexp, shiftprod, shiftexp, shiftloss); 
 
  // external signals 
  input	[`PRODWIDTH-1:0] normalized;	// normalized product of mantissae 
  input	[`WEXPSUM-1:0] 	selectedexp;	// sum of exponents 
  output [`SHIFTWIDTH-1:0] shiftprod;	// shifted and normalized product 
  output [`WEXPSUM-1:0]	shiftexp;	// shifted exponent 
  output		shiftloss;	// loss of accuaracy due to shifting 
 
  // internal signals 
  wire	[`WEXPSUM-1:0]	roundedexp;		// selected exponent + 1 if rounding caused overflow 
//  wire			negexp;		// exponent is negative 
  wire	[`WEXPSUM-1:0]	shiftamt;		// theoretical amount to shift product by 
  wire	[`WSHIFTAMT-1:0] actualshiftamt;	// actual amount to shift product by 
  wire			tozero;		// need more shifts than possible with width of significand 
  wire			doshift;	// only shift if value is nonnegative 
  wire	[`SHIFTWIDTH-1:0] preshift; 	// value before shifting, with more room to ensure lossless shifting 
  reg	[`SHIFTWIDTH-1:0] postshift;	// value after shifting, with more room to ensure lossless shifting, used to be wire, changed for ODIN. 
 
  // set up value for shifting 
  assign preshift	= {normalized, `PRESHIFTZEROS}; 
 
  // determine shift amount 
  assign shiftamt	=  -selectedexp; 
 
  // make sure shift amount is nonnegative 
  //	If the exponent is negative, the shift amount should 
  //	come out positive, otherwise there shouldn't be any 
  //	shifting to be done 
  assign doshift	= ~shiftamt[`WEXPSUM-1]; 
   
  // Determine if the result must be shifted more than 
  //	will show up in the significand, even if it rounds up 
  assign tozero		= doshift & (shiftamt > `MAXSHIFT); 
 
  // If the shift is big enough to shift all the bits out of the final significand, 
  //	then it stops being relevent how much it has been shifted. 
  assign actualshiftamt	= tozero ? `MAXSHIFT : shiftamt[`WSHIFTAMT-1:0]; 
 
  // shift significand 
  //assign postshift	= preshift >> actualshiftamt;  
  //We can only have constant shifts for ODIN:  
  always @ (actualshiftamt or preshift) begin  
		case (actualshiftamt)   
			5'b00001: begin  
				postshift = preshift >> 5'b00001; 
			end 
 
			5'b00010: begin  
				postshift = preshift >> 5'b00010; 
			end 
 
			5'b00011: begin  
				postshift = preshift >> 5'b00011; 
			end 
 
			5'b00100: begin  
				postshift = preshift >> 5'b00100; 
			end 
 
			5'b00101: begin  
				postshift = preshift >> 5'b00101; 
			end 
 
			5'b00110: begin  
				postshift = preshift >> 5'b00110; 
			end 
 
			5'b00111: begin  
				postshift = preshift >> 5'b00111; 
			end 
 
			5'b01000: begin  
				postshift = preshift >> 5'b01000; 
			end 
 
			5'b01001: begin  
				postshift = preshift >> 5'b01001; 
			end 
 
			5'b01010: begin  
				postshift = preshift >> 5'b01010; 
			end 
 
			5'b01011: begin  
				postshift = preshift >> 5'b01011; 
			end 
 
			5'b01100: begin  
				postshift = preshift >> 5'b01100; 
			end 
 
			5'b01101: begin  
				postshift = preshift >> 5'b01101; 
			end 
 
			5'b01110: begin  
				postshift = preshift >> 5'b01110; 
			end 
 
			5'b01111: begin  
				postshift = preshift >> 5'b01111; 
			end 
 
			5'b10000: begin  
				postshift = preshift >> 5'b10000; 
			end 
 
			5'b10001: begin  
				postshift = preshift >> 5'b10001; 
			end 
 
			5'b10010: begin  
				postshift = preshift >> 5'b10010; 
			end 
 
			5'b10011: begin  
				postshift = preshift >> 5'b10011; 
			end 
 
			5'b10100: begin  
				postshift = preshift >> 5'b10100; 
			end 
 
			5'b10101: begin  
				postshift = preshift >> 5'b10101; 
			end 
 
			5'b10110: begin  
				postshift = preshift >> 5'b10110; 
			end 
 
			5'b10111: begin  
				postshift = preshift >> 5'b10111; 
			end 
 
			5'b11000: begin  
				postshift = preshift >> 5'b11000; 
			end 
 
			5'b11001: begin  
				postshift = preshift >> 5'b11001; 
			end 
 
			5'b11010: begin  
				postshift = preshift >> 5'b11010; 
			end 
 
			5'b11011: begin  
				postshift = preshift >> 5'b11011; 
			end 
 
			5'b11100: begin  
				postshift = preshift >> 5'b11100; 
			end 
 
			5'b11101: begin  
				postshift = preshift >> 5'b11101; 
			end 
 
			5'b11110: begin  
				postshift = preshift >> 5'b11110; 
			end 
 
			5'b11111: begin  
				postshift = preshift >> 5'b11111; 
			end  
		  
			default: begin  
				postshift = preshift;  
			end  
		endcase  
	end 
 
 
  // assign appropriate significand 
  assign shiftprod	= doshift ? postshift :	preshift; 
 
  // determine if any bits were lost from the shift 
  //assign shiftloss	= tozero | (negexp & |postshift[`WSIG-1:0]);  
  assign shiftloss	= tozero | (doshift & |postshift[`SHIFTWIDTH-`PRODWIDTH-1:0]);  
 
  // assign appropriate exponent 
  assign shiftexp	= doshift ? 0 : selectedexp;   
 
endmodule   
  
 
 
module round(shiftprod, shiftexp, shiftloss, roundprod, roundexp, roundmode,  
		sign, tiny, inexact, overflow, stilltiny, denormround); 
 
  // external signals 
  input	[`SHIFTWIDTH-1:0] shiftprod;	// normalized and shifted product of mantissae 
  input [`WEXPSUM-1:0]	shiftexp;	// shifted exponent 
  input			shiftloss;	// bits were lost in the shifting process 
  output [`WSIG-1:0] 	roundprod;	// rounded floating-point product 
  output [`WEXP-1:0] 	roundexp;	// rounded exponent 
  input  [1:0] 		roundmode;	// 00 = RN; 01 = RZ; 10 = RP; 11 = RM 
  input			sign;		// sign bit for rounding mode direction 
  input			tiny;		// denormalized number after rounding 
  output		inexact;	// rounding occured 
  output		overflow;	// overflow occured 
  output		stilltiny;	// Result is tiny (denormalized #) after rounding 
  output		denormround;	// result was rounded only because it was a denormalized number 
 
  // internal signals 
  wire			roundzero;	// rounding towards zero 
  wire			roundinf;	// rounding towards infinity 
  wire 			stickybit;	// there one or more 1 bits in the LS bits 
  wire			denormsticky;	// sticky bit if this weren't a denorm 
  wire [`WSIG-1:0] 	MSBits;		// most significant bits 
  wire [`WSIG:0] 	MSBitsplus1; 	// most significant bits plus 1 
					//	for rounding purposes. needs to be one 
					//	bit bigger for overflow 
  wire [1:0]		roundbits;	// bits used to compute rounding decision 
  wire 			rounddecision;	// round up 
  wire			roundoverflow;	// rounding overflow occured 
  wire [`WEXPSUM-1:0]	tempexp;	// exponent after rounding 
 
  //reduce round mode to three modes 
  //	dont need round nearest, it is implied 
  //	by roundzero and roundinf being false 
  //assign roundnearest 	= ~&roundmode; 
//  assign roundzero	= &roundmode || (^roundmode && (roundmode[0] || sign)); 
  assign roundzero	= (~roundmode[1] & roundmode[0]) | (roundmode[1] & (roundmode[0] ^ sign)); 
  assign roundinf	= roundmode[1] & ~(sign ^ roundmode[0]); 
 
  // pull out the most significant bits for the product 
  assign MSBits = shiftprod[`SHIFTWIDTH-1:`SHIFTWIDTH-`WSIG]; 
 
  // add a 1 to the end of MSBits for round up 
  assign MSBitsplus1 = MSBits + 1; 
 
  // pull out the last of the most significant bits  
  //	and the first of the least significant bits 
  //	to use for calculating the rounding decision 
  assign roundbits[1:0]	= shiftprod[`SHIFTWIDTH-`WSIG:`SHIFTWIDTH-`WSIG-1]; 
 
  // calculate the sticky bit. Are any of the least significant bits 1? 
  //	also: was anything lost while shifting? 
  // *** Optimization: some of these bits are already checked from the shiftloss *** 
  // *** Optimization: stickybit can be calculated from denormsticky  
  //			with only 1 more gate, instead of duplication of effort *** 
  assign stickybit 	= |shiftprod[`SHIFTWIDTH-`WSIG-2:0] | shiftloss; 
  assign denormsticky 	= |shiftprod[`SHIFTWIDTH-`WSIG-3:0] | shiftloss; 
 
  // Compute rounding decision 
  assign rounddecision	= ~roundzero & 	( (roundbits[0]	& (roundinf | roundbits[1])) 
					| (stickybit	& (roundinf | roundbits[0])) 
					); 
 
  // Was this only rounded because it is a denorm? 
  assign denormround	= tiny & rounddecision & ~denormsticky & roundbits[0]; 
 
  // detect rounding overflow. it only overflows if: 
  // 1) the top bit of MSBitsplus1 is 1 
  // 2) it decides to round up 
  assign roundoverflow	= MSBitsplus1[`WSIG] & rounddecision; 
 
  // assign significand (and postnormalize) 
  //  rounddecision decides whether to use msbits+1 or msbits. 
  //  if using msbits+1 and there is an rounding overflow (i.e. result=2), 
  //  then should return 1 instead 
  assign roundprod = rounddecision ?  
 			(roundoverflow ? 0 :  
			MSBitsplus1[`WSIG-1:0]) : 
			MSBits; 
 
  // detect inexact 
  assign inexact	= rounddecision | stickybit | roundbits[0]; 
 
  // compensate for a rounding overflow 
  assign tempexp 	= roundoverflow + shiftexp; 
 
  // check for overflow in exponent 
  //	overflow occured if the number 
  //	is too large to be represented, 
  //	i.e. can't fit in `WEXP bits, or 
  //	all `WEXP bits are 1's 
  assign overflow	= &tempexp[`WEXP-1:0] | |tempexp[`WEXPSUM-1:`WEXP]; 
 
  // two possible cases: 
  //	1) Overflow: then exponent doesnt matter, 
  //	it will be changed to infinity anyway 
  //	2) not overflow: the leading bits will be 0 
  assign roundexp	= tempexp[`WEXP-1:0]; 
 
  // The result is tiny if the exponent is less than 1. 
  //	Because the exponent sum is NOT in 2's-complement form, 
  //	it is only less than one if its is zero, i.e. 
  //    all the bits are 0 
  assign stilltiny	= ~|roundexp; 
 
endmodule  
 
 
module flag (invalid, overflow, inexact, underflow, tiny, specialcase, flags); 
 
  input			invalid;	// invalid operation 
  input			overflow;	// the result was too large 
  input			inexact;	// The result was rounded 
  input			specialcase;	// Using special result, shouldn't throw flags 
  input			underflow;	// Underflow detected 
  input			tiny;		// The result is tiny 
 
  output [`WFLAG-1:0]	flags;		// DIVZERO, INVALID, INEXACT,  
					// OVERFLOW, UNDERFLOW (defined in constant.v) 
 
  // flags 
  assign flags[`DIVZERO]	= 1'b0; 
  assign flags[`INVALID]	= invalid; 
  assign flags[`INEXACT]	= ~specialcase & (inexact | underflow | overflow); 
  assign flags[`OVERFLOW]	= ~specialcase & overflow; 
  assign flags[`UNDERFLOW]	= tiny; //~specialcase & tiny & underflow & ~overflow; 
 
endmodule  
 
module assemble(roundprod, special, y, sign, specialsign,  
		shiftexp, specialcase, specialsigncase, 
		roundmode, overflow); 
 
  // external signals 
  input	[`WSIG-1:0] 	roundprod;	// shifted, rounded and normalized  
					// 	product of mantissae 
  input	[`WIDTH-2:0]	special;	// special case product + exponent 
  output [`WIDTH-1:0] 	y;		// floating-point product 
  input			sign;		// sign of product (+ = 0, - = 1) 
  input			specialsign;	// special case sign 
  input	[`WEXP-1:0] 	shiftexp;	// shifted exponent 
  input			specialcase;	// this is a special case 
  input			specialsigncase; // use the special case sign 
  input	[1:0]		roundmode;	// rounding mode information extracted from control field   
  input			overflow;	// overflow detected 
   
  // internal signals 
  wire	[`WIDTH-2:0]	rounded;	// final product + exponent 
  wire	[`WIDTH-2:0]	overflowvalue;	// product + exponent for overflow condition 
  wire			undenormed;	// the result was denormalized before rounding, but rounding  
					//	caused it to become a small normalized number. 
 
  // SET UP ROUNDED PRODUCT + EXPONENT 
   
  // assign significand 
  assign rounded[`WSIG-1:0]	= roundprod; 
 
  // assign exponent 
  assign rounded[`WIDTH-2:`WIDTH-`WEXP-1] = shiftexp; 
 
  // SET UP OVERFLOW CONDITION 
  assign overflowvalue[`WIDTH-2:0] = roundmode[1] ?  
				(sign ^ roundmode[0] ? `CONSTLARGEST : `CONSTINFINITY) : 
				(roundmode[0] ? `CONSTLARGEST: `CONSTINFINITY); 
 
  // FINAL PRODUCT ASSIGN  
 
  // assign sign 
  assign y[`WIDTH-1]	= specialsigncase ? specialsign : sign; 
 
  // assign product vs special vs overflowed 
  assign y[`WIDTH-2:0]	= specialcase ? special[`WIDTH-2:0] : 
				(overflow ? overflowvalue[`WIDTH-2:0] : 
				rounded[`WIDTH-2:0]); 
 
endmodule 

//---------------------------------------
// A dual-port RAM
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram (
	input clk,
	input we1,
	input we2,
	input [`rRAMSIZEWIDTH - 1 : 0] addr1,
	input [`RAMWIDTH - 1 : 0] data1,
	output [`RAMWIDTH - 1 : 0] out1,
	input [`rRAMSIZEWIDTH - 1 : 0] addr2,
	input [`RAMWIDTH - 1 : 0] data2,
	output [`RAMWIDTH - 1 : 0] out2
);
	reg [`RAMWIDTH - 1 : 0] ram[2**`rRAMSIZEWIDTH - 1 : 0];
	reg [`RAMWIDTH - 1 : 0] data_out1;
	reg [`RAMWIDTH - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule

//---------------------------------------
// A dual-port RAM 4096x32
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_4096x32 (
	input clk,
	input we1,
	input we2,
	input [12 - 1 : 0] addr1,
	input [32 - 1 : 0] data1,
	output [32 - 1 : 0] out1,
	input [12 - 1 : 0] addr2,
	input [32 - 1 : 0] data2,
	output [32 - 1 : 0] out2
);
	reg [32 - 1 : 0] ram[2**12 - 1 : 0];
	reg [32 - 1 : 0] data_out1;
	reg [32 - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule

//---------------------------------------
// A dual-port RAM rFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_rfifo (
	input clk,
	input we1,
	input we2,
	input [`rFIFOSIZEWIDTH - 1 : 0] addr1,
	input [`rFIFOINPUTWIDTH - 1 : 0] data1,
	output [`rFIFOINPUTWIDTH - 1 : 0] out1,
	input [`rFIFOSIZEWIDTH - 1 : 0] addr2,
	input [`rFIFOINPUTWIDTH - 1 : 0] data2,
	output [`rFIFOINPUTWIDTH - 1 : 0] out2
);
	reg [`rFIFOINPUTWIDTH - 1 : 0] ram[2**`rFIFOSIZEWIDTH - 1 : 0];
	reg [`rFIFOINPUTWIDTH - 1 : 0] data_out1;
	reg [`rFIFOINPUTWIDTH - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule

//---------------------------------------
// A dual-port RAM wFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_wfifo (
	input clk,
	input we1,
	input we2,
	input [`wFIFOSIZEWIDTH - 1 : 0] addr1,
	input [`wFIFOINPUTWIDTH - 1 : 0] data1,
	output [`wFIFOINPUTWIDTH - 1 : 0] out1,
	input [`wFIFOSIZEWIDTH - 1 : 0] addr2,
	input [`wFIFOINPUTWIDTH - 1 : 0] data2,
	output [`wFIFOINPUTWIDTH - 1 : 0] out2
);
	reg [`wFIFOINPUTWIDTH - 1 : 0] ram[2**`wFIFOSIZEWIDTH - 1 : 0];
	reg [`wFIFOINPUTWIDTH - 1 : 0] data_out1;
	reg [`wFIFOINPUTWIDTH - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule

//---------------------------------------
// A dual-port RAM wFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_afifo (
	input clk,
	input we1,
	input we2,
	input [`aFIFOSIZEWIDTH - 1 : 0] addr1,
	input [`aFIFOWIDTH - 1 : 0] data1,
	output [`aFIFOWIDTH - 1 : 0] out1,
	input [`aFIFOSIZEWIDTH - 1 : 0] addr2,
	input [`aFIFOWIDTH - 1 : 0] data2,
	output [`aFIFOWIDTH - 1 : 0] out2
);
	reg [`aFIFOWIDTH - 1 : 0] ram[2**`aFIFOSIZEWIDTH - 1 : 0];
	reg [`aFIFOWIDTH - 1 : 0] data_out1;
	reg [`aFIFOWIDTH - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule

//---------------------------------------
// A dual-port RAM mFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_mfifo (
	input clk,
	input we1,
	input we2,
	input [`mFIFOSIZEWIDTH - 1 : 0] addr1,
	input [`mFIFOWIDTH - 1 : 0] data1,
	output [`mFIFOWIDTH - 1 : 0] out1,
	input [`mFIFOSIZEWIDTH - 1 : 0] addr2,
	input [`mFIFOWIDTH - 1 : 0] data2,
	output [`mFIFOWIDTH - 1 : 0] out2
);
	reg [`mFIFOWIDTH - 1 : 0] ram[2**`mFIFOSIZEWIDTH - 1 : 0];
	reg [`mFIFOWIDTH - 1 : 0] data_out1;
	reg [`mFIFOWIDTH - 1 : 0] data_out2;

	assign out1 = data_out1;
	assign out2 = data_out2;

    // If writen enable 1 is activated,
    // data1 will be loaded through addr1
    // Otherwise, data will be read out through addr1
	always @(posedge clk) begin
		if (we1) begin
			ram[addr1] <= data1;
        end else begin
			data_out1 <= ram[addr1];
		end
	end

    // If writen enable 2 is activated,
    // data1 will be loaded through addr2
    // Otherwise, data will be read out through addr2
	always @(posedge clk) begin
		if (we2) begin
			ram[addr2] <= data2;
		end else begin
			data_out2 <= ram[addr2];
		end
	end

endmodule