// Source: OpenFPGA/openfpga_flow/benchmarks/vtr_benchmark/LU8PEEng.v
// (file listing metadata: 5278 lines, 144 KiB, Verilog, executable file)

//auto-generated top.v
//top level module of LU factorization
//by Wei Zhang
// Width/size parameters for the top-level module. Every macro is a sized
// binary literal; the decimal value is given in the comment above it.
// NWIDTH = 20: bit width of the matrix-size input N.
`define NWIDTH 6'b010100
// BLOCKWIDTH = 5: bit width of the block-size signals (m, n, loop, dtu_size).
`define BLOCKWIDTH 4'b0101
// DDRWIDTH = 32.
`define DDRWIDTH 7'b0100000
// DDRNUMDQS = 4.
`define DDRNUMDQS 4'b0100
// DDRSIZEWIDTH = 24: bit width of memory addresses (offset, mem_local_addr).
`define DDRSIZEWIDTH 6'b011000
// BURSTLEN = 2: bit width of mem_local_size.
`define BURSTLEN 3'b010
// MEMCONWIDTH = 64: bit width of mem_local_wdata / mem_local_rdata.
`define MEMCONWIDTH 8'b01000000
// MEMCONNUMBYTES = 8: bit width of mem_local_be (one bit per byte of MEMCONWIDTH).
`define MEMCONNUMBYTES 5'b01000
// RAMWIDTH = 256: bit width of one on-chip RAM data word.
`define RAMWIDTH 10'b0100000000
// RAMNUMBYTES = 32: bit width of the RAM byte-enable vectors.
`define RAMNUMBYTES 7'b0100000
// RAMSIZEWIDTH = 5: bit width of on-chip RAM addresses.
`define RAMSIZEWIDTH 4'b0101
// TOPWIDTH = 32.
`define TOPWIDTH 7'b0100000
// rFIFOINPUTWIDTH = 64.
`define rFIFOINPUTWIDTH 8'b01000000
// wFIFOINPUTWIDTH = 256.
`define wFIFOINPUTWIDTH 10'b0100000000
// mFIFOWIDTH = 28.
`define mFIFOWIDTH 6'b011100
// aFIFOWIDTH = 5.
`define aFIFOWIDTH 4'b0101
// LU8PEEng -- top level of the LU factorization engine.
//
// Instantiates and connects three sub-blocks:
//   MC        (MarshallerController) : sequences the computation and the memory requests
//   compBlock (LU)                   : computes the factorization on the on-chip RAM blocks
//   DTU       (DataTransferUnit)     : sits between the mem_local_* interface and the RAM ports
//
// The DTU's single RAM write port is fanned out to both the "current" and the
// "left" write ports of compBlock; left_sel (driven by MC) decides which of the
// two sees the write enable. DTU read data always comes from the "current" port.
module LU8PEEng (
    clk,
    start, N, offset, done,
    burst_begin,
    mem_local_be,
    mem_local_read_req,
    mem_local_size,
    mem_local_wdata,
    mem_local_write_req,
    mem_local_rdata,
    mem_local_rdata_valid,
    mem_local_ready,
    mem_local_wdata_req,
    reset_n,
    mem_local_addr
);

    // ---- clock / control ----
    input                         clk;
    input                         reset_n;
    input                         start;
    input  [`NWIDTH-1:0]          N;
    input  [`DDRSIZEWIDTH-1:0]    offset;
    output                        done;

    // ---- local memory interface (ports are implicitly nets) ----
    input  [`MEMCONWIDTH-1:0]     mem_local_rdata;
    input                         mem_local_rdata_valid;
    input                         mem_local_ready;
    input                         mem_local_wdata_req;
    output                        burst_begin;
    output [`MEMCONNUMBYTES-1:0]  mem_local_be;
    output                        mem_local_read_req;
    output [`BURSTLEN-1:0]        mem_local_size;
    output [`MEMCONWIDTH-1:0]     mem_local_wdata;
    output                        mem_local_write_req;
    output [`DDRSIZEWIDTH-1:0]    mem_local_addr;

    // ---- MC <-> compBlock ----
    wire [`BLOCKWIDTH-1:0]        m, n, loop;
    wire [1:0]                    mode;
    wire                          comp_start, comp_done;
    wire                          curMemSel, leftMemSel;

    // ---- MC <-> DTU ----
    wire                          dtu_write_req, dtu_read_req, dtu_ack, dtu_done;
    wire [`DDRSIZEWIDTH-1:0]      dtu_mem_addr;
    wire [`RAMSIZEWIDTH-1:0]      dtu_ram_addr;
    wire [`BLOCKWIDTH-1:0]        dtu_size;
    wire                          left_sel;

    // ---- DTU RAM-side port ----
    wire [`RAMWIDTH-1:0]          ram_write_data, ram_read_data;
    wire [`RAMSIZEWIDTH-1:0]      ram_write_addr, ram_read_addr;
    wire [`RAMNUMBYTES-1:0]       ram_write_byte_en;
    wire                          ram_write_en;

    // ---- compBlock "current" block port ----
    wire [`RAMWIDTH-1:0]          curWriteDataMem, curReadDataMem;
    wire [`RAMSIZEWIDTH-1:0]      curWriteAddrMem, curReadAddrMem;
    wire [`RAMNUMBYTES-1:0]       curWriteByteEnMem;
    wire                          curWriteEnMem;

    // ---- compBlock "left" block port (write only) ----
    wire [`RAMWIDTH-1:0]          leftWriteDataMem;
    wire [`RAMSIZEWIDTH-1:0]      leftWriteAddrMem;
    wire [`RAMNUMBYTES-1:0]       leftWriteByteEnMem;
    wire                          leftWriteEnMem;

    // Read path: the DTU reads from the current block only.
    assign curReadAddrMem     = ram_read_addr;
    assign ram_read_data      = curReadDataMem;

    // Write path: data, address and byte enables go to both blocks ...
    assign curWriteDataMem    = ram_write_data;
    assign leftWriteDataMem   = ram_write_data;
    assign curWriteAddrMem    = ram_write_addr;
    assign leftWriteAddrMem   = ram_write_addr;
    assign curWriteByteEnMem  = ram_write_byte_en;
    assign leftWriteByteEnMem = ram_write_byte_en;

    // ... and left_sel routes the write enable to exactly one of them.
    assign curWriteEnMem      = ram_write_en & ~left_sel;
    assign leftWriteEnMem     = ram_write_en &  left_sel;

    // Controller for computation sequencing and memory requests.
    MarshallerController MC (
        .clk           (clk),
        .start         (start),
        .done          (done),
        .input_N       (N),
        .offset        (offset),
        .comp_start    (comp_start),
        .block_m       (m),
        .block_n       (n),
        .loop          (loop),
        .mode          (mode),
        .comp_done     (comp_done),
        .cur_mem_sel   (curMemSel),
        .left_mem_sel  (leftMemSel),
        .dtu_write_req (dtu_write_req),
        .dtu_read_req  (dtu_read_req),
        .dtu_mem_addr  (dtu_mem_addr),
        .dtu_ram_addr  (dtu_ram_addr),
        .dtu_size      (dtu_size),
        .dtu_ack       (dtu_ack),
        .dtu_done      (dtu_done),
        .left_sel      (left_sel)
    );

    // Block that computes the LU factorization, with the answer stored back
    // into the RAM block.
    LU compBlock (
        .clk                (clk),
        .start              (comp_start),
        .m                  (m),
        .n                  (n),
        .loop               (loop),
        .mode               (mode),
        .done               (comp_done),
        .curReadAddrMem     (curReadAddrMem),
        .curReadDataMem     (curReadDataMem),
        .curWriteByteEnMem  (curWriteByteEnMem),
        .curWriteDataMem    (curWriteDataMem),
        .curWriteAddrMem    (curWriteAddrMem),
        .curWriteEnMem      (curWriteEnMem),
        .curMemSel          (curMemSel),
        .leftWriteByteEnMem (leftWriteByteEnMem),
        .leftWriteDataMem   (leftWriteDataMem),
        .leftWriteAddrMem   (leftWriteAddrMem),
        .leftWriteEnMem     (leftWriteEnMem),
        .leftMemSel         (leftMemSel)
    );

    // Bridge between the MC requests, the RAM-side port and the local memory
    // interface.
    DataTransferUnit DTU (
        .clk               (clk),
        .dtu_write_req     (dtu_write_req),
        .dtu_read_req      (dtu_read_req),
        .dtu_mem_addr      (dtu_mem_addr),
        .dtu_ram_addr      (dtu_ram_addr),
        .dtu_size          (dtu_size),
        .dtu_ack           (dtu_ack),
        .dtu_done          (dtu_done),
        .ram_read_addr     (ram_read_addr),
        .ram_read_data     (ram_read_data),
        .ram_write_byte_en (ram_write_byte_en),
        .ram_write_data    (ram_write_data),
        .ram_write_addr    (ram_write_addr),
        .ram_write_en      (ram_write_en),
        .mem_rdata         (mem_local_rdata),
        .mem_rdata_valid   (mem_local_rdata_valid),
        .mem_ready         (mem_local_ready),
        .mem_wdata_req     (mem_local_wdata_req),
        .reset_n           (reset_n),
        .burst_begin       (burst_begin),
        .mem_local_addr    (mem_local_addr),
        .mem_be            (mem_local_be),
        .mem_read_req      (mem_local_read_req),
        .mem_size          (mem_local_size),
        .mem_wdata         (mem_local_wdata),
        .mem_write_req     (mem_local_write_req)
    );

endmodule
// Block geometry used by MarshallerController (decimal values in comments).
// BLOCKM = 16, BLOCKN = 16: step sizes for the mcount / ncount counters.
`define BLOCKM 6'b010000
`define BLOCKN 6'b010000
// BLOCKMDIVK = 2: per-request increment of the on-chip RAM address.
`define BLOCKMDIVK 3'b010
// MEMBLOCKM = 8, MEMBLOCKN = 8: increments applied to the external memory pointers.
`define MEMBLOCKM 5'b01000
`define MEMBLOCKN 5'b01000
// Re-definitions of width macros with the same values as at the top of the
// file: NWIDTH = 20, BLOCKWIDTH = 5, DDRSIZEWIDTH = 24, RAMSIZEWIDTH = 5.
`define NWIDTH 6'b010100
`define BLOCKWIDTH 4'b0101
`define DDRSIZEWIDTH 6'b011000
`define RAMSIZEWIDTH 4'b0101
// State encodings of the algorithm FSM (cur_state / next_state).
// The literals carry differing declared widths; the values range from 0 to 24.
`define START 1'b0 //0
`define SETUP 2'b01 //1
`define FIRST 3'b010 //2
`define MODE0_SETUP 3'b011 //3
`define MODE0_WAIT 4'b0100 //4
`define MODE0 4'b0101 //5
`define MODE1_SETUP 4'b0110 //6
`define MODE1_WAIT 4'b0111 //7
`define MODE1 5'b01000 //8
`define MODE2_SETUP 5'b01001 //9
`define MODE2_WAIT 5'b01010 //10
`define MODE2 5'b01011 //11
`define MODE3_SETUP 5'b01100 //12
`define MODE3_WAIT 5'b01101 //13
`define MODE3 5'b01110 //14
`define STALL 5'b01111 //15
`define STALL_WAIT 6'b010000 //16
`define WAIT 6'b010001 //17
`define FINAL_WRITE 6'b010010 //18
`define FINAL_WAIT 6'b010011 //19
`define IDLE 6'b010100 //20
`define LAST_SETUP 6'b010101 //21
`define LAST_SETUP_WAIT 6'b010110 //22
`define LAST 6'b010111 //23
`define LAST_WAIT 6'b011000 //24
// State encodings of the memory FSM (cur_mem_state / next_mem_state), values 0 to 7.
`define MEM_IDLE 1'b0 //0
`define MEM_WRITE 2'b01 //1
`define MEM_WRITE_WAIT 3'b010 //2
`define MEM_CHECK_DONE 3'b011 //3
`define MEM_READ 4'b0100 //4
`define MEM_READ_WAIT 4'b0101 //5
`define MEM_DONE 4'b0110 //6
`define MEM_WAIT_DONE 4'b0111 //7
// MarshallerController
// Sequences the block LU factorization and the memory requests that go with it.
// Two state machines run side by side:
//   cur_state     - algorithm FSM (START, SETUP, WAIT, FIRST, MODE0..MODE3,
//                   STALL, LAST, FINAL_WRITE/FINAL_WAIT, IDLE)
//   cur_mem_state - memory FSM that issues read/write requests to the DTU
// next_state and next_mem_state are themselves written with non-blocking
// assignments on posedge clk, so a transition chosen from cur_state is first
// latched into next_state and reaches cur_state one clock edge later.
// While start is high, cur_state is forced to START and cur_mem_state to
// MEM_IDLE (synchronously).
//
// Ports:
//   clk, start          - clock and synchronous (re)start
//   done                - high while cur_state == IDLE
//   input_N, offset     - sampled into comp_N/N and the mem_* pointers while start is high
//   comp_start ... left_mem_sel - interface to the computation section
//   dtu_*, left_sel     - interface to the data marshaller section
module MarshallerController (clk, start, done, input_N, offset,
comp_start, block_m, block_n, loop, mode, comp_done, cur_mem_sel, left_mem_sel,
dtu_write_req, dtu_read_req, dtu_mem_addr, dtu_ram_addr, dtu_size, dtu_ack, dtu_done, left_sel);
input clk;
input start;
output done;
input [`NWIDTH-1:0] input_N;
input [`DDRSIZEWIDTH-1:0] offset;
// for computation section
output comp_start;
output [`BLOCKWIDTH-1:0] block_m, block_n, loop;
output [1:0] mode;
input comp_done;
output cur_mem_sel, left_mem_sel;
// for data marshaller section
output dtu_write_req, dtu_read_req;
output [`DDRSIZEWIDTH-1:0] dtu_mem_addr;
output [`RAMSIZEWIDTH-1:0] dtu_ram_addr;
output [`BLOCKWIDTH-1:0] dtu_size;
input dtu_ack, dtu_done;
output left_sel;
// Algorithm FSM state: 5 bits, holds the state values 0..24.
reg [4:0] cur_state, next_state;
reg [`NWIDTH-1:0] comp_N, N, mcount, ncount, Ndivk, mem_N;
reg [1:0] mode;
reg [`BLOCKWIDTH-1:0] block_m, block_n, loop, read_n;
reg [`BLOCKWIDTH-1:0] write_n, write_n_buf;
reg left_mem_sel, cur_mem_sel, no_left_switch;
// Memory FSM state: 4 bits, holds the MEM_* values 0..7.
reg [3:0] cur_mem_state, next_mem_state;
reg [`RAMSIZEWIDTH-1:0] ram_addr;
reg [`DDRSIZEWIDTH-1:0] mem_addr;
reg [`DDRSIZEWIDTH-1:0] mem_base, mem_top, mem_write, mem_left, mem_cur;
reg [`DDRSIZEWIDTH-1:0] mem_write_buf;
reg [`BLOCKWIDTH-1:0] mem_count;
reg [1:0] mem_read;
reg [`BLOCKWIDTH-1:0] mem_write_size, mem_write_size_buf, mem_read_size;
wire mem_done;
assign done = (cur_state == `IDLE);
assign dtu_ram_addr = ram_addr;
assign dtu_mem_addr = mem_addr;
// Request size: the write size only while in MEM_WRITE, the read size otherwise.
assign dtu_size = (cur_mem_state == `MEM_WRITE) ? mem_write_size : mem_read_size;
// comp_start is high for as long as cur_state sits in one of the compute states.
assign comp_start = (cur_state == `MODE0)||(cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||(cur_state == `FIRST)||(cur_state == `LAST);
assign dtu_write_req = (cur_mem_state == `MEM_WRITE);
assign dtu_read_req = (cur_mem_state == `MEM_READ);
// Memory side is finished when the memory FSM rests in MEM_DONE and the DTU reports done.
assign mem_done = (cur_mem_state == `MEM_DONE)&&(dtu_done == 1'b1);
// left_sel is high during the read phases (MEM_READ, MEM_READ_WAIT, MEM_WAIT_DONE)
// while mem_read == 2'b01.
assign left_sel = mem_read == 2'b01 && (cur_mem_state == `MEM_READ || cur_mem_state == `MEM_READ_WAIT || cur_mem_state == `MEM_WAIT_DONE);
// FSM to produce memory instructions to DTU
// (next_mem_state is registered here; it is copied into cur_mem_state by the
// state-update block further down.)
always @ (posedge clk)
begin
case (cur_mem_state)
`MEM_IDLE:
begin
// Leave idle once the algorithm FSM is in START.
if (cur_state == `START)
next_mem_state <= `MEM_CHECK_DONE;
else
next_mem_state <= `MEM_IDLE;
end
`MEM_DONE:
begin
// A compute state (or FINAL_WRITE / LAST_SETUP) triggers a write phase;
// FIRST goes straight to MEM_CHECK_DONE.
if (cur_state == `MODE0 || cur_state == `MODE1 || cur_state == `MODE2 ||
cur_state == `MODE3 || cur_state == `FINAL_WRITE || cur_state == `LAST_SETUP)
next_mem_state <= `MEM_WRITE;
else if (cur_state == `FIRST)
next_mem_state <= `MEM_CHECK_DONE;
else
next_mem_state <= `MEM_DONE;
end
`MEM_WRITE:
begin
next_mem_state <= `MEM_WRITE_WAIT;
end
`MEM_WRITE_WAIT:
begin
// On each DTU ack: issue another write until mem_count reaches write_n.
if (dtu_ack == 1'b1)
begin
if (mem_count == write_n)
next_mem_state <= `MEM_WAIT_DONE;
else
next_mem_state <= `MEM_WRITE;
end
else
next_mem_state <= `MEM_WRITE_WAIT;
end
`MEM_WAIT_DONE:
begin
if (dtu_done == 1'b1)
next_mem_state <= `MEM_CHECK_DONE;
else
next_mem_state <= `MEM_WAIT_DONE;
end
`MEM_CHECK_DONE:
begin
// mem_read == 2'b10 ends the sequence; any other value starts a read phase.
if (mem_read == 2'b10)
next_mem_state <= `MEM_DONE;
else
next_mem_state <= `MEM_READ;
end
`MEM_READ:
begin
next_mem_state <= `MEM_READ_WAIT;
end
`MEM_READ_WAIT:
begin
// On each DTU ack: issue another read until mem_count reaches read_n.
if (dtu_ack == 1'b1)
begin
if (mem_count == read_n)
next_mem_state <= `MEM_WAIT_DONE;
else
next_mem_state <= `MEM_READ;
end
else
next_mem_state <= `MEM_READ_WAIT;
end
default:
next_mem_state <= `MEM_IDLE;
endcase
end
// Address and counter generation for the memory FSM.
always @ (posedge clk)
begin
if (cur_mem_state == `MEM_DONE || cur_mem_state == `MEM_IDLE)
begin
// Resting states: clear the RAM address and request counter, point
// mem_addr at the pending write address, and preload mem_read from the
// algorithm FSM state (0, 1 or 2).
ram_addr <= 5'b0;
mem_addr <= mem_write;
if (next_state == `LAST_WAIT || next_state == `FINAL_WAIT || next_state == `STALL)
mem_read <= 2'b00;
else if (next_state == `MODE0_SETUP || next_state == `SETUP || cur_state == `MODE0 || next_state == `LAST_SETUP_WAIT)
mem_read <= 2'b01;
else
mem_read <= 2'b10;
mem_count <= 5'b0;
end
else if (cur_mem_state == `MEM_CHECK_DONE)
begin
// Select the source and length of the next read, then step mem_read
// down by one (2-bit arithmetic, so 0 wraps to 3).
if (mem_read == 2'b10)
begin
mem_addr <= mem_left;
read_n <= loop;
end
else
begin
mem_addr <= mem_cur;
read_n <= block_n;
end
mem_read <= mem_read - 2'b01;
mem_count <= 5'b0;
ram_addr <= 5'b0;
end
else if (cur_mem_state == `MEM_WRITE || cur_mem_state == `MEM_READ)
begin
// One request issued: advance both addresses and count it.
ram_addr <= ram_addr + `BLOCKMDIVK;
mem_addr <= mem_addr + Ndivk;
mem_count <= mem_count + 2'b01;
end
end
// FSM to determine the block LU factorization algorithm
// (next_state is registered here; it is copied into cur_state by the
// state-update block below.)
always @ (posedge clk)
begin
case (cur_state)
`START:
begin
next_state <= `SETUP;
end
`SETUP:
begin
next_state <= `WAIT;
end
`WAIT:
begin
if (mem_done == 1'b1)
next_state <= `FIRST;
else
next_state <= `WAIT;
end
`FIRST:
begin
// mcount / ncount below comp_N select the MODE1 / MODE2 paths;
// otherwise go to LAST_WAIT.
if (mcount < comp_N)
next_state <= `MODE1_SETUP;
else if (ncount < comp_N)
next_state <= `MODE2_SETUP;
else
next_state <= `LAST_WAIT;
end
`MODE0_SETUP:
begin
next_state <= `MODE0_WAIT;
end
`MODE0_WAIT:
begin
// Every MODEx_WAIT state holds until both memory and compute are done.
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `MODE0;
else
next_state <= `MODE0_WAIT;
end
`MODE0:
begin
if (mcount < comp_N)
next_state <= `MODE1_SETUP;
else if (ncount < comp_N)
next_state <= `MODE2_SETUP;
else
begin
next_state <= `LAST_WAIT;
end
end
`MODE1_SETUP:
begin
next_state <= `MODE1_WAIT;
end
`MODE1_WAIT:
begin
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `MODE1;
else
next_state <= `MODE1_WAIT;
end
`MODE1:
begin
// Once both counters have reached comp_N: STALL when comp_N is at most
// two blocks (BLOCKN + BLOCKN), otherwise start the next MODE0 pass.
if (mcount < comp_N)
next_state <= `MODE1_SETUP;
else if (ncount < comp_N)
next_state <= `MODE2_SETUP;
else if (comp_N <= `BLOCKN + `BLOCKN)
next_state <= `STALL;
else
next_state <= `MODE0_SETUP;
end
`MODE2_SETUP:
begin
next_state <= `MODE2_WAIT;
end
`MODE2_WAIT:
begin
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `MODE2;
else
next_state <= `MODE2_WAIT;
end
`MODE2:
begin
if (mcount < comp_N)
next_state <= `MODE3_SETUP;
else if (ncount < comp_N)
next_state <= `MODE2_SETUP;
else if (comp_N <= `BLOCKN + `BLOCKN)
next_state <= `STALL;
else
next_state <= `MODE0_SETUP;
end
`MODE3_SETUP:
begin
next_state <= `MODE3_WAIT;
end
`MODE3_WAIT:
begin
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `MODE3;
else
next_state <= `MODE3_WAIT;
end
`MODE3:
begin
if (mcount < comp_N)
next_state <= `MODE3_SETUP;
else if (ncount < comp_N)
next_state <= `MODE2_SETUP;
else if (comp_N <= `BLOCKN + `BLOCKN)
next_state <= `STALL;
else
next_state <= `MODE0_SETUP;
end
`STALL:
next_state <= `STALL_WAIT;
`STALL_WAIT:
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `LAST_SETUP;
else
next_state <= `STALL_WAIT;
`LAST_SETUP:
next_state <= `LAST_SETUP_WAIT;
`LAST_SETUP_WAIT:
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `LAST;
else
next_state <= `LAST_SETUP_WAIT;
`LAST:
next_state <= `LAST_WAIT;
`LAST_WAIT:
if (mem_done == 1'b1 && comp_done == 1'b1)
next_state <= `FINAL_WRITE;
else
next_state <= `LAST_WAIT;
`FINAL_WRITE:
next_state <= `FINAL_WAIT;
`FINAL_WAIT:
// Only the memory side is waited for here.
if (mem_done == 1'b1)
next_state <= `IDLE;
else
next_state <= `FINAL_WAIT;
`IDLE:
if (start)
next_state <= `SETUP;
else
next_state <= `IDLE;
default:
next_state <= `START;
endcase
end
// State update: start overrides both FSMs; otherwise load the registered next states.
always @ (posedge clk)
begin
if (start)
begin
cur_state <= `START;
cur_mem_state <= `MEM_IDLE;
end
else
begin
cur_state <= next_state;
cur_mem_state <= next_mem_state;
end
end
// mode output: 01/10/11 while in MODE1/MODE2/MODE3, 00 in every other state
// (including MODE0).
always @ (cur_state)
begin
case (cur_state)
`MODE1:
mode = 2'b01;
`MODE2:
mode = 2'b10;
`MODE3:
mode = 2'b11;
default:
mode = 2'b00;
endcase
end
// Bookkeeping registers: problem size, memory pointers, write queue, block
// counters/sizes and the RAM bank selects. All updates are non-blocking, so
// every right-hand side sees the values from before this clock edge.
always @ (posedge clk)
begin
// Remaining problem size: loaded from input_N on start, reduced by BLOCKN
// on every edge at which next_state == MODE0.
if (start)
begin
comp_N <= input_N;
N <= input_N;
end
else if (next_state == `MODE0)
begin
comp_N <= comp_N - `BLOCKN;
end
// Ndivk = ((N + 15) >> 4) << 3, i.e. ceil(N/16) * 8; mem_N = Ndivk * 16.
// Both are registered, so mem_N lags Ndivk by one clock.
Ndivk <= ((N+`BLOCKM-1)>>4)<<3;
mem_N <= Ndivk<<4;
// External-memory pointers. All four start at offset; the *_SETUP states
// advance them.
if (start)
begin
mem_base <= offset;
mem_top <= offset;
mem_left <= offset;
mem_cur <= offset;
end
else if (cur_state == `MODE0_SETUP)
begin
// All pointers move to mem_base + mem_N + MEMBLOCKN.
mem_base <= mem_base + mem_N+`MEMBLOCKN;
mem_top <= mem_base + mem_N+`MEMBLOCKN;
mem_cur <= mem_base + mem_N+`MEMBLOCKN;
mem_left <= mem_base + mem_N+`MEMBLOCKN;
end
else if (cur_state == `MODE1_SETUP)
begin
mem_cur <= mem_cur + `MEMBLOCKM;
end
else if (cur_state == `MODE3_SETUP)
begin
mem_cur <= mem_cur + `MEMBLOCKM;
mem_left <= mem_left + `MEMBLOCKM;
end
else if (cur_state == `MODE2_SETUP)
begin
// mem_top and mem_cur advance by mem_N; mem_left returns to mem_base.
mem_cur <= mem_top + mem_N;
mem_top <= mem_top + mem_N;
mem_left <= mem_base;
end
// Write-back parameters are kept in a two-entry queue (active + _buf).
// When the memory FSM is in MEM_CHECK_DONE with mem_read == 0, the buffered
// entry becomes active and the current read parameters are buffered.
if (cur_state == `SETUP)
begin
mem_write <= 24'b0;
mem_write_buf <= 24'b0;
mem_write_size <= `BLOCKMDIVK;
mem_write_size_buf <= `BLOCKMDIVK;
write_n <= block_n;
write_n_buf <= block_n;
end
else if (cur_mem_state == `MEM_CHECK_DONE && mem_read == 0)
begin
mem_write <= mem_write_buf;
mem_write_buf <= mem_cur;
mem_write_size <= mem_write_size_buf;
mem_write_size_buf <= mem_read_size;
write_n <= write_n_buf;
write_n_buf <= block_n;
end
// The read size is constant (BLOCKMDIVK).
mem_read_size <= `BLOCKMDIVK;
// loop: BLOCKN after start; comp_N[8:0] - BLOCKN (truncated to the width of
// loop) when next_state == LAST.
if (start) begin
loop <= `BLOCKN;
end else if (next_state == `LAST) begin
loop <= comp_N[8:0] - `BLOCKN;
end
// mcount restarts at BLOCKM in MODE0_SETUP / MODE2_SETUP (and on start) and
// advances by BLOCKM in MODE1_SETUP / MODE3_SETUP.
if (cur_state == `MODE0_SETUP || cur_state == `MODE2_SETUP || start) begin
mcount <= `BLOCKM;
end else if (cur_state == `MODE1_SETUP || cur_state == `MODE3_SETUP) begin
mcount <= mcount+`BLOCKM;
end
// ncount restarts at BLOCKN in MODE0_SETUP (and on start) and advances by
// BLOCKN in MODE2_SETUP.
if (cur_state == `MODE0_SETUP || start) begin
ncount <= `BLOCKN;
end else if (cur_state == `MODE2_SETUP) begin
ncount <= ncount+`BLOCKN;
end
// block_m / block_n: a full block while the counter is below comp_N,
// otherwise comp_N - count + block size (presumably the size of the final
// partial block -- TODO confirm against the LU datapath).
if (mcount < comp_N) begin
block_m <= `BLOCKM;
end else begin
block_m <= comp_N - mcount + `BLOCKM;
end
if (ncount < comp_N) begin
block_n <= `BLOCKN;
end else begin
block_n <= comp_N - ncount + `BLOCKN;
end
// cur_mem_sel toggles on every clock spent in a compute state, FINAL_WRITE,
// LAST_SETUP or LAST.
if (start) begin
cur_mem_sel <= 1'b0;
end else if ((cur_state == `MODE0)||(cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
(cur_state == `FIRST)||(cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP)||(cur_state == `LAST)) begin
cur_mem_sel <= !cur_mem_sel;
end
// no_left_switch is set by MODE0 / FIRST and cleared by MODE1..3,
// FINAL_WRITE and LAST_SETUP; while set it blocks the left_mem_sel toggle below.
if (start) begin
no_left_switch <= 1'b0;
end else if ((cur_state == `MODE0)||(cur_state == `FIRST)) begin
no_left_switch <= 1'b1;
end else if ((cur_state == `MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
(cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP)) begin
no_left_switch <= 1'b0;
end
// left_mem_sel toggles in the listed states only when no_left_switch is 0.
if (start) begin
left_mem_sel <= 1'b0;
end else if (((cur_state == `MODE0)||(cur_state ==`MODE1)||(cur_state == `MODE2)||(cur_state == `MODE3)||
(cur_state == `FIRST)||(cur_state == `FINAL_WRITE)||(cur_state == `LAST_SETUP))&&(no_left_switch == 1'b0)) begin
left_mem_sel <= !left_mem_sel;
end
end
endmodule
//topoutputdelay = 1
//auto-generated LU.v
//datapath for computing LU factorization
//by Wei Zhang
// rRAMSIZEWIDTH = 5 (plain decimal, unlike the sized literals below).
`define rRAMSIZEWIDTH 5
// 4-bit state encodings (values 0..14) with the c prefix.
`define cSETUP 4'b0000
`define cSTART 4'b0001
`define cFETCH_COL 4'b0010
`define cWAIT_COL 4'b0011
`define cFIND_REC 4'b0100
`define cMULT_COL 4'b0101
`define cUPDATE_J 4'b0110
`define cSTORE_MO 4'b0111
`define cMULT_SUB 4'b1000
`define cINCRE_I 4'b1001
`define cWAIT 4'b1010
`define cDONE 4'b1011
`define cSTORE_DIAG 4'b1100
`define cSTORE_DIAG2 4'b1101
`define cSTART_FETCH_ROW 4'b1110
// 2-bit state encodings (values 0..3) for a second, row-fetch related state set.
`define cROW_WAIT 2'b00
`define cFETCH_ROW 2'b01
`define cDONE_FETCH_ROW 2'b10
`define cLOAD_ROW_INC_J 2'b11
// PRECISION = 32: bit width of one data element in the LU datapath.
`define PRECISION 7'b0100000
// NUMPE = 8.
`define NUMPE 5'b01000
// PEWIDTH = 3: bit width of topWriteSel.
`define PEWIDTH 3'b011
// Re-definitions with the same values as earlier in the file:
// BLOCKWIDTH = 5, RAMWIDTH = 256, RAMNUMBYTES = 32, RAMSIZEWIDTH = 5.
`define BLOCKWIDTH 4'b0101
`define RAMWIDTH 10'b0100000000
`define RAMNUMBYTES 7'b0100000
`define RAMSIZEWIDTH 4'b0101
// TOPSIZEWIDTH = 8: bit width of the top-block RAM addresses.
`define TOPSIZEWIDTH 5'b01000
// Delay constants: TOPINPUTDELAY = 3, TOPOUTPUTDELAY = 1,
// MEMINPUTDELAY = 2, MEMOUTPUTDELAY = 1.
`define TOPINPUTDELAY 3'b011
`define TOPOUTPUTDELAY 2'b01
`define MEMINPUTDELAY 3'b010
`define MEMOUTPUTDELAY 2'b01
// TOPWIDTH = 32 (same value as at the top of the file).
`define TOPWIDTH 7'b0100000
// LU
// Datapath for the LU factorization. It contains:
//   - two "current" block RAMs (currentBlock0/1) and two "left" block RAMs (leftBlock0/1)
//   - one "top" block RAM (topBlock)
//   - eight mult_add processing elements (PE0..PE7) that share one multOperand
//   - an fpu_div instance (rec) whose numerator input is the constant 32'h3F800000
//   - LUControl (conBlock), which drives every *LU address, enable and select
//
// Each RAM pair is shared between two masters: the external *Mem ports and the
// internal *LU signals. curMemSel / leftMemSel pick which RAM of the pair each
// master talks to (select == 0: Mem side uses RAM 0 and LU side uses RAM 1;
// select == 1: swapped).
//
// Pipelining visible in this module: RAM inputs pass through two register
// stages (Reg0, Reg1) and RAM outputs through one; the top RAM inputs pass
// through three stages (Reg0..Reg2) and its output through one.
module LU (clk, start, m, n, loop, mode, done,
curReadAddrMem, curReadDataMem, curWriteByteEnMem, curWriteDataMem, curWriteAddrMem, curWriteEnMem, curMemSel,
leftWriteByteEnMem, leftWriteDataMem, leftWriteAddrMem, leftWriteEnMem, leftMemSel
);
input clk, start;
input[`BLOCKWIDTH-1:0] m, n, loop;
input[1:0] mode;
output done;
// Signals at the pins of the two current-block RAMs.
wire[`RAMWIDTH-1:0] curWriteData0, curWriteData1;
wire[`RAMSIZEWIDTH-1:0] curWriteAddr0, curReadAddr0, curWriteAddr1, curReadAddr1;
wire[`RAMWIDTH-1:0] curReadData0, curReadData1;
wire[`RAMNUMBYTES-1:0] curWriteByteEn0, curWriteByteEn1;
wire curWriteEn0, curWriteEn1;
// External (Mem-side) access to the current block: read and write.
input[`RAMWIDTH-1:0] curWriteDataMem;
output[`RAMWIDTH-1:0] curReadDataMem;
input[`RAMSIZEWIDTH-1:0] curWriteAddrMem, curReadAddrMem;
input[`RAMNUMBYTES-1:0] curWriteByteEnMem;
input curWriteEnMem;
// External (Mem-side) access to the left block: write only.
input[`RAMWIDTH-1:0] leftWriteDataMem;
input[`RAMSIZEWIDTH-1:0] leftWriteAddrMem;
input[`RAMNUMBYTES-1:0] leftWriteByteEnMem;
input leftWriteEnMem;
input leftMemSel, curMemSel;
wire[`RAMWIDTH-1:0] curReadDataLU, curReadDataMem;
wire[`RAMWIDTH-1:0] curWriteDataLU, curWriteDataMem;
wire[`RAMSIZEWIDTH-1:0] curWriteAddrLU, curWriteAddrMem, curReadAddrLU, curReadAddrMem;
wire[`RAMNUMBYTES-1:0] curWriteByteEnLU, curWriteByteEnMem;
wire curWriteEnLU, curWriteEnMem;
// Output-side pipeline registers (one stage) for the four block RAMs.
reg[`RAMWIDTH-1:0] curReadData0Reg0;
reg[`RAMWIDTH-1:0] curReadData1Reg0;
reg[`RAMWIDTH-1:0] leftReadData0Reg0;
reg[`RAMWIDTH-1:0] leftReadData1Reg0;
// Input-side pipeline registers (two stages, Reg0 then Reg1) for the current-block RAMs.
reg[`RAMWIDTH-1:0] curWriteData0Reg0;
reg[`RAMWIDTH-1:0] curWriteData0Reg1;
reg[`RAMWIDTH-1:0] curWriteData1Reg0;
reg[`RAMWIDTH-1:0] curWriteData1Reg1;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] curReadAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] curReadAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] curWriteAddr1Reg1;
reg[`RAMSIZEWIDTH-1:0] curReadAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] curReadAddr1Reg1;
reg[`RAMNUMBYTES-1:0] curWriteByteEn0Reg0;
reg[`RAMNUMBYTES-1:0] curWriteByteEn0Reg1;
reg[`RAMNUMBYTES-1:0] curWriteByteEn1Reg0;
reg[`RAMNUMBYTES-1:0] curWriteByteEn1Reg1;
reg curWriteEn0Reg0;
reg curWriteEn0Reg1;
reg curWriteEn1Reg0;
reg curWriteEn1Reg1;
// Input-side pipeline registers (two stages) for the left-block RAMs.
reg[`RAMWIDTH-1:0] leftWriteData0Reg0;
reg[`RAMWIDTH-1:0] leftWriteData0Reg1;
reg[`RAMWIDTH-1:0] leftWriteData1Reg0;
reg[`RAMWIDTH-1:0] leftWriteData1Reg1;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr0Reg0;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr0Reg1;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] leftWriteAddr1Reg1;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr1Reg0;
reg[`RAMSIZEWIDTH-1:0] leftReadAddr1Reg1;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn0Reg0;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn0Reg1;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn1Reg0;
reg[`RAMNUMBYTES-1:0] leftWriteByteEn1Reg1;
reg leftWriteEn0Reg0;
reg leftWriteEn0Reg1;
reg leftWriteEn1Reg0;
reg leftWriteEn1Reg1;
// Shared multiplier operand and the divider input / output.
reg[`PRECISION-1:0] multOperand;
reg[`PRECISION-1:0] diag;
wire[`PRECISION-1:0] recResult;
// Per-PE operands and results (one PRECISION-bit lane each).
wire[`PRECISION-1:0] multA0;
wire[`PRECISION-1:0] multA1;
wire[`PRECISION-1:0] multA2;
wire[`PRECISION-1:0] multA3;
wire[`PRECISION-1:0] multA4;
wire[`PRECISION-1:0] multA5;
wire[`PRECISION-1:0] multA6;
wire[`PRECISION-1:0] multA7;
wire[`PRECISION-1:0] multResult0;
wire[`PRECISION-1:0] multResult1;
wire[`PRECISION-1:0] multResult2;
wire[`PRECISION-1:0] multResult3;
wire[`PRECISION-1:0] multResult4;
wire[`PRECISION-1:0] multResult5;
wire[`PRECISION-1:0] multResult6;
wire[`PRECISION-1:0] multResult7;
wire[`PRECISION-1:0] addA0;
wire[`PRECISION-1:0] addA1;
wire[`PRECISION-1:0] addA2;
wire[`PRECISION-1:0] addA3;
wire[`PRECISION-1:0] addA4;
wire[`PRECISION-1:0] addA5;
wire[`PRECISION-1:0] addA6;
wire[`PRECISION-1:0] addA7;
wire[`PRECISION-1:0] addResult0;
wire[`PRECISION-1:0] addResult1;
wire[`PRECISION-1:0] addResult2;
wire[`PRECISION-1:0] addResult3;
wire[`PRECISION-1:0] addResult4;
wire[`PRECISION-1:0] addResult5;
wire[`PRECISION-1:0] addResult6;
wire[`PRECISION-1:0] addResult7;
// Signals at the pins of the two left-block RAMs, plus the LU/Mem side views.
wire[`RAMWIDTH-1:0] leftReadData0, leftReadData1, leftWriteData0, leftWriteData1;
wire[`RAMSIZEWIDTH-1:0] leftWriteAddr0, leftWriteAddr1, leftReadAddr0, leftReadAddr1;
wire[`RAMNUMBYTES-1:0] leftWriteByteEn0, leftWriteByteEn1;
wire leftWriteEn0, leftWriteEn1;
wire[`RAMWIDTH-1:0] leftReadDataLU, leftWriteDataLU, leftWriteDataMem;
wire[`RAMSIZEWIDTH-1:0] leftWriteAddrLU, leftWriteAddrMem, leftReadAddrLU;
wire[`RAMNUMBYTES-1:0] leftWriteByteEnLU, leftWriteByteEnMem;
wire leftWriteEnLU, leftWriteEnMem;
// Top-block RAM signals and their pipeline registers.
wire[`PRECISION-1:0] topWriteData;
reg[`PRECISION-1:0] topWriteDataLU;
wire[`PRECISION-1:0] topReadData, topReadDataLU;
wire[`TOPSIZEWIDTH-1:0] topWriteAddr, topWriteAddrLU, topReadAddr, topReadAddrLU;
wire topWriteEn, topWriteEnLU;
reg[`PRECISION-1:0] topReadDataReg0;
reg[`PRECISION-1:0] topWriteDataReg0;
reg[`PRECISION-1:0] topWriteDataReg1;
reg[`PRECISION-1:0] topWriteDataReg2;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg0;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg1;
reg[`TOPSIZEWIDTH-1:0] topWriteAddrReg2;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg0;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg1;
reg[`TOPSIZEWIDTH-1:0] topReadAddrReg2;
reg topWriteEnReg0;
reg topWriteEnReg1;
reg topWriteEnReg2;
// PE result bus (eight lanes) and the select/enable signals from LUControl.
wire[`RAMWIDTH-1:0] rcWriteData;
wire leftWriteSel, curWriteSel, topSourceSel;
wire diagEn;
wire[`PEWIDTH-1:0] topWriteSel;
wire MOSel;
wire MOEn;
// control block
LUControl conBlock (clk, start, m, n, loop, mode, done,
curReadAddrLU, curWriteAddrLU, curWriteByteEnLU, curWriteEnLU, curWriteSel,
leftReadAddrLU, leftWriteAddrLU, leftWriteByteEnLU, leftWriteEnLU, leftWriteSel,
topReadAddrLU, topWriteAddrLU, topWriteEnLU, topWriteSel, topSourceSel, diagEn, MOSel, MOEn);
// fp_div unit
//floating point divider here
// The numerator 32'h3F800000 is the IEEE-754 single-precision encoding of 1.0,
// so recResult is presumably 1/diag (fpu_div is defined elsewhere -- TODO confirm).
fpu_div rec(.clock(clk), .n(32'h3F800000), .d(diag), .div(recResult));
// on-chip memory blocks that store the matrix to be LU factorized
// store current blocks data
ram currentBlock0 (curWriteByteEn0, clk, curWriteData0, curReadAddr0, curWriteAddr0, curWriteEn0, curReadData0 );
ram1 currentBlock1 (curWriteByteEn1, clk, curWriteData1, curReadAddr1, curWriteAddr1, curWriteEn1, curReadData1 );
// store left blocks data
ram2 leftBlock0(leftWriteByteEn0, clk, leftWriteData0, leftReadAddr0, leftWriteAddr0, leftWriteEn0, leftReadData0 );
ram3 leftBlock1(leftWriteByteEn1, clk, leftWriteData1, leftReadAddr1, leftWriteAddr1, leftWriteEn1, leftReadData1 );
// store top block data
top_ram topBlock(clk, topWriteData, topReadAddr, topWriteAddr, topWriteEn, topReadDataLU );
// processing elements that does the main computation of LU factorization
mult_add PE0 (clk, multA0, multOperand, addA0, multResult0, addResult0);
mult_add PE1 (clk, multA1, multOperand, addA1, multResult1, addResult1);
mult_add PE2 (clk, multA2, multOperand, addA2, multResult2, addResult2);
mult_add PE3 (clk, multA3, multOperand, addA3, multResult3, addResult3);
mult_add PE4 (clk, multA4, multOperand, addA4, multResult4, addResult4);
mult_add PE5 (clk, multA5, multOperand, addA5, multResult5, addResult5);
mult_add PE6 (clk, multA6, multOperand, addA6, multResult6, addResult6);
mult_add PE7 (clk, multA7, multOperand, addA7, multResult7, addResult7);
// connect to ports of the left blocks
// LU-side write data for the left block: current-block read data when
// leftWriteSel is 0, otherwise the PE result bus.
assign leftWriteDataLU = (leftWriteSel == 1'b0) ? curReadDataLU : rcWriteData;
always @ (posedge clk)
begin
// First pipeline stage: leftMemSel decides which left RAM receives the
// Mem-side write port and which receives the LU-side write port.
if(leftMemSel == 1'b0)
begin
leftWriteData0Reg0 <= leftWriteDataMem;
leftWriteAddr0Reg0 <= leftWriteAddrMem;
leftWriteByteEn0Reg0 <= leftWriteByteEnMem;
leftWriteEn0Reg0 <= leftWriteEnMem;
leftWriteData1Reg0 <= leftWriteDataLU;
leftWriteAddr1Reg0 <= leftWriteAddrLU;
leftWriteByteEn1Reg0 <= leftWriteByteEnLU;
leftWriteEn1Reg0 <= leftWriteEnLU;
end
else
begin
leftWriteData0Reg0 <= leftWriteDataLU;
leftWriteAddr0Reg0 <= leftWriteAddrLU;
leftWriteByteEn0Reg0 <= leftWriteByteEnLU;
leftWriteEn0Reg0 <= leftWriteEnLU;
leftWriteData1Reg0 <= leftWriteDataMem;
leftWriteAddr1Reg0 <= leftWriteAddrMem;
leftWriteByteEn1Reg0 <= leftWriteByteEnMem;
leftWriteEn1Reg0 <= leftWriteEnMem;
end
// Both left RAMs always get the LU read address (there is no Mem-side read
// of the left block).
leftReadAddr0Reg0 <= leftReadAddrLU;
leftReadAddr1Reg0 <= leftReadAddrLU;
// Second pipeline stage.
leftWriteData0Reg1 <= leftWriteData0Reg0;
leftWriteAddr0Reg1 <= leftWriteAddr0Reg0;
leftReadAddr0Reg1 <= leftReadAddr0Reg0;
leftWriteByteEn0Reg1 <= leftWriteByteEn0Reg0;
leftWriteEn0Reg1 <= leftWriteEn0Reg0;
leftWriteData1Reg1 <= leftWriteData1Reg0;
leftWriteAddr1Reg1 <= leftWriteAddr1Reg0;
leftReadAddr1Reg1 <= leftReadAddr1Reg0;
leftWriteByteEn1Reg1 <= leftWriteByteEn1Reg0;
leftWriteEn1Reg1 <= leftWriteEn1Reg0;
end
// The RAM pins are driven from the second pipeline stage.
assign leftWriteData0 = leftWriteData0Reg1;
assign leftWriteAddr0 = leftWriteAddr0Reg1;
assign leftReadAddr0 = leftReadAddr0Reg1;
assign leftWriteByteEn0 = leftWriteByteEn0Reg1;
assign leftWriteEn0 = leftWriteEn0Reg1;
assign leftWriteData1 = leftWriteData1Reg1;
assign leftWriteAddr1 = leftWriteAddr1Reg1;
assign leftReadAddr1 = leftReadAddr1Reg1;
assign leftWriteByteEn1 = leftWriteByteEn1Reg1;
assign leftWriteEn1 = leftWriteEn1Reg1;
// Register the left RAM read data.
always @ (posedge clk)
begin
leftReadData0Reg0 <= leftReadData0;
leftReadData1Reg0 <= leftReadData1;
end
// The LU side reads the left RAM that the Mem side is not writing.
assign leftReadDataLU = (leftMemSel == 1'b0) ? leftReadData1Reg0 : leftReadData0Reg0;
// data feed to fp div unit
// diag captures the top-block read data while diagEn is high and holds otherwise.
always @ (posedge clk)
begin
if (diagEn == 1'b1)
begin
diag <= topReadData;
end
end
// one of the inputs to the PE
// multOperand: cleared by start; when MOEn is high it loads the divider
// result (MOSel == 0) or the top-block read data (MOSel == 1).
always @ (posedge clk)
begin
if (start == 1'b1)
multOperand <= 0;
else if (MOEn == 1'b1)
begin
if (MOSel == 1'b0)
multOperand <= recResult;
else
multOperand <= topReadData;
end
end
// connections to top block memory ports
// Combinational mux for the top-block write data. The lane order is reversed
// relative to the selector: topWriteSel == 0 picks the highest 32-bit lane
// (bits 255:224, or addResult7) and topWriteSel == 7 picks the lowest
// (bits 31:0, or addResult0). topWriteSel is 3 bits wide, so the eight
// numbered items cover every defined value.
always @ (topSourceSel or topWriteSel or curReadDataLU or addResult7 or addResult6 or addResult5 or addResult4 or addResult3 or addResult2 or addResult1 or addResult0)
begin
// topSourceSel == 0: source is the current-block read data.
if (topSourceSel == 1'b0)
case (topWriteSel)
0:
topWriteDataLU = curReadDataLU[255:224];
1:
topWriteDataLU = curReadDataLU[223:192];
2:
topWriteDataLU = curReadDataLU[191:160];
3:
topWriteDataLU = curReadDataLU[159:128];
4:
topWriteDataLU = curReadDataLU[127:96];
5:
topWriteDataLU = curReadDataLU[95:64];
6:
topWriteDataLU = curReadDataLU[63:32];
7:
topWriteDataLU = curReadDataLU[31:0];
default:
topWriteDataLU = curReadDataLU[`PRECISION-1:0];
endcase
// topSourceSel == 1: source is the PE adder results.
else
case (topWriteSel)
0:
topWriteDataLU = addResult7;
1:
topWriteDataLU = addResult6;
2:
topWriteDataLU = addResult5;
3:
topWriteDataLU = addResult4;
4:
topWriteDataLU = addResult3;
5:
topWriteDataLU = addResult2;
6:
topWriteDataLU = addResult1;
7:
topWriteDataLU = addResult0;
default:
topWriteDataLU = addResult0;
endcase
end
// Three-stage input pipeline for the top-block RAM (data, both addresses, enable).
always @ (posedge clk)
begin
topWriteDataReg0 <= topWriteDataLU;
topReadAddrReg0 <= topReadAddrLU;
topWriteAddrReg0 <= topWriteAddrLU;
topWriteEnReg0 <= topWriteEnLU;
topWriteDataReg1 <= topWriteDataReg0;
topReadAddrReg1 <= topReadAddrReg0;
topWriteAddrReg1 <= topWriteAddrReg0;
topWriteEnReg1 <= topWriteEnReg0;
topWriteDataReg2 <= topWriteDataReg1;
topReadAddrReg2 <= topReadAddrReg1;
topWriteAddrReg2 <= topWriteAddrReg1;
topWriteEnReg2 <= topWriteEnReg1;
end
assign topWriteData = topWriteDataReg2;
assign topReadAddr = topReadAddrReg2;
assign topWriteAddr = topWriteAddrReg2;
assign topWriteEn = topWriteEnReg2;
// One-stage output register for the top-block read data.
always @ (posedge clk)
begin
topReadDataReg0 <= topReadDataLU;
end
assign topReadData = topReadDataReg0;
// connections to processing element
// PE k takes lane k (bits 32k+31 : 32k) of the left-block word as its
// multiplier input and the same lane of the current-block word as its adder input.
assign multA0 = leftReadDataLU[31:0];
assign multA1 = leftReadDataLU[63:32];
assign multA2 = leftReadDataLU[95:64];
assign multA3 = leftReadDataLU[127:96];
assign multA4 = leftReadDataLU[159:128];
assign multA5 = leftReadDataLU[191:160];
assign multA6 = leftReadDataLU[223:192];
assign multA7 = leftReadDataLU[255:224];
assign addA0 = curReadDataLU[31:0];
assign addA1 = curReadDataLU[63:32];
assign addA2 = curReadDataLU[95:64];
assign addA3 = curReadDataLU[127:96];
assign addA4 = curReadDataLU[159:128];
assign addA5 = curReadDataLU[191:160];
assign addA6 = curReadDataLU[223:192];
assign addA7 = curReadDataLU[255:224];
// connections to ports of the current blocks
// PE result bus: each lane carries the multiplier result when curWriteSel is 0
// and the adder result otherwise.
assign rcWriteData[31:0] = (curWriteSel == 0) ? multResult0 : addResult0;
assign rcWriteData[63:32] = (curWriteSel == 0) ? multResult1 : addResult1;
assign rcWriteData[95:64] = (curWriteSel == 0) ? multResult2 : addResult2;
assign rcWriteData[127:96] = (curWriteSel == 0) ? multResult3 : addResult3;
assign rcWriteData[159:128] = (curWriteSel == 0) ? multResult4 : addResult4;
assign rcWriteData[191:160] = (curWriteSel == 0) ? multResult5 : addResult5;
assign rcWriteData[223:192] = (curWriteSel == 0) ? multResult6 : addResult6;
assign rcWriteData[255:224] = (curWriteSel == 0) ? multResult7 : addResult7;
assign curWriteDataLU = rcWriteData;
always @ (posedge clk)
begin
// First pipeline stage: curMemSel decides which current RAM is owned by the
// Mem side (read and write) and which by the LU side.
if(curMemSel == 1'b0)
begin
curWriteData0Reg0 <= curWriteDataMem;
curWriteAddr0Reg0 <= curWriteAddrMem;
curReadAddr0Reg0 <= curReadAddrMem;
curWriteByteEn0Reg0 <= curWriteByteEnMem;
curWriteEn0Reg0 <= curWriteEnMem;
curWriteData1Reg0 <= curWriteDataLU;
curWriteAddr1Reg0 <= curWriteAddrLU;
curReadAddr1Reg0 <= curReadAddrLU;
curWriteByteEn1Reg0 <= curWriteByteEnLU;
curWriteEn1Reg0 <= curWriteEnLU;
end
else
begin
curWriteData0Reg0 <= curWriteDataLU;
curWriteAddr0Reg0 <= curWriteAddrLU;
curReadAddr0Reg0 <= curReadAddrLU;
curWriteByteEn0Reg0 <= curWriteByteEnLU;
curWriteEn0Reg0 <= curWriteEnLU;
curWriteData1Reg0 <= curWriteDataMem;
curWriteAddr1Reg0 <= curWriteAddrMem;
curReadAddr1Reg0 <= curReadAddrMem;
curWriteByteEn1Reg0 <= curWriteByteEnMem;
curWriteEn1Reg0 <= curWriteEnMem;
end
// Second pipeline stage.
curWriteData0Reg1 <= curWriteData0Reg0;
curWriteAddr0Reg1 <= curWriteAddr0Reg0;
curReadAddr0Reg1 <= curReadAddr0Reg0;
curWriteByteEn0Reg1 <= curWriteByteEn0Reg0;
curWriteEn0Reg1 <= curWriteEn0Reg0;
curWriteData1Reg1 <= curWriteData1Reg0;
curWriteAddr1Reg1 <= curWriteAddr1Reg0;
curReadAddr1Reg1 <= curReadAddr1Reg0;
curWriteByteEn1Reg1 <= curWriteByteEn1Reg0;
curWriteEn1Reg1 <= curWriteEn1Reg0;
end
// The RAM pins are driven from the second pipeline stage.
assign curWriteData0 = curWriteData0Reg1;
assign curWriteAddr0 = curWriteAddr0Reg1;
assign curReadAddr0 = curReadAddr0Reg1;
assign curWriteByteEn0 = curWriteByteEn0Reg1;
assign curWriteEn0 = curWriteEn0Reg1;
assign curWriteData1 = curWriteData1Reg1;
assign curWriteAddr1 = curWriteAddr1Reg1;
assign curReadAddr1 = curReadAddr1Reg1;
assign curWriteByteEn1 = curWriteByteEn1Reg1;
assign curWriteEn1 = curWriteEn1Reg1;
// Register the current RAM read data.
always @ (posedge clk)
begin
curReadData0Reg0 <= curReadData0;
curReadData1Reg0 <= curReadData1;
end
// Read-data steering mirrors the write steering: with curMemSel == 0 the Mem
// side sees RAM 0 and the LU side sees RAM 1; with curMemSel == 1 it is swapped.
assign curReadDataMem = (curMemSel == 0) ? curReadData0Reg0 : curReadData1Reg0;
assign curReadDataLU = (curMemSel == 0) ? curReadData1Reg0 : curReadData0Reg0;
endmodule
// LUControl: control unit for the LU engine.
// It holds two state machines (a main FSM on currentState and a row FSM on
// currentRowState) plus banks of delay registers, and produces the registered
// read/write addresses, byte enables, write enables and select signals for the
// "cur", "left" and "top" memories, together with diagEn, MOSel and MOEn.
// m_in, n_in, loop_in and mode_in are sampled while start_in is high; done is
// driven high while the main FSM sits in `cDONE.
module LUControl (clk, start_in, m_in, n_in, loop_in, mode_in, done,
curReadAddr, curWriteAddr, curWriteByteEn, curWriteEn, curWriteSel,
leftReadAddr, leftWriteAddr, leftWriteByteEn, leftWriteEn, leftWriteSel,
topReadAddr, topWriteAddr, topWriteEn, topWriteSel, topSourceSel, diagEn, MOSel, MOEn);
// clock and start strobe (start_in also forces the main FSM back to `cSETUP)
input clk, start_in;
// 5-bit block parameters latched into m, n and loop on start_in
input[5-1:0] m_in, n_in, loop_in;
// 2-bit operating mode latched into mode on start_in
input[1:0] mode_in;
// high for as long as the main FSM is in `cDONE (registered)
output done;
// "cur" memory port: 32-bit byte enable, 5-bit addresses, write enable
output[32-1:0] curWriteByteEn;
output[5-1:0] curWriteAddr, curReadAddr;
output curWriteEn;
// "left" memory port: shares the write address / byte-enable pipelines with "cur"
output[32-1:0] leftWriteByteEn;
output[5-1:0] leftWriteAddr, leftReadAddr;
output leftWriteEn;
// "top" memory port: 8-bit addresses, write enable
output[8-1:0] topWriteAddr, topReadAddr;
output topWriteEn;
// single-bit select / enable outputs taken from the ends of the delay lines
output leftWriteSel, curWriteSel, topSourceSel, diagEn;
// 3-bit select delayed from imodk / i1modk
output[3-1:0] topWriteSel;
// MOSel is low only in the cycle after the FSM is in `cFIND_REC
output MOSel;
// MOEn is forced high for `cFIND_REC, otherwise follows the MOEn delay line
output MOEn;
// ---------------------------------------------------------------------------
// Internal state of LUControl. Registers of equal width are declared together;
// names and widths are unchanged.
// ---------------------------------------------------------------------------

// Delayed start strobe: start_in -> startDelay[0..15] -> start.
reg         start;
reg [15:0]  startDelay;

// Latched block parameters and derived loop bounds.
reg [4:0]   m, n, stop, stop2, loop;
reg [1:0]   mode;

// Main FSM and row FSM state.
reg [3:0]   nextState, currentState;
reg [1:0]   nextRowState, currentRowState;

// Combinational flags exchanged between the two FSMs.
reg         startFetchRow, doneFetchRow, loadRow, writeRow;
reg         updateCounter;

// Loop indices and base indices into the memories.
reg [4:0]   i1, j;
reg [7:0]   nextTopIdx, nextTopIdx2, curTopIdx, nextTopIdxCounter;
reg [1:0]   topIdx, topIdxCounter, mdivk;
reg [4:0]   diagIdx, leftIdx, msIdx;
reg [2:0]   imodk, i1modk;
reg [4:0]   diagIdxCounter, leftIdxCounter, msIdxCounter, readRowCounter, topWriteCounter;
reg [31:0]  byteEn, i1modkByteEn;

// Registered module outputs.
reg         done;
reg [31:0]  curWriteByteEn;
reg [4:0]   curWriteAddr, curReadAddr;
reg         curWriteEn;
reg [31:0]  leftWriteByteEn;
reg [4:0]   leftWriteAddr, leftReadAddr;
reg         leftWriteEn;
reg [7:0]   topWriteAddr, topReadAddr;
reg         topWriteEn;
reg         leftWriteSel, curWriteSel, topSourceSel, diagEn;
reg [2:0]   topWriteSel;
reg         MOSel;
reg         MOEn;

// General-purpose cycle counter and divider latency counter.
reg [4:0]   counter;
reg [5:0]   divCounter;

// 32-stage byte-enable delay line (stage 31 is the input, stage 0 the output).
reg [31:0]  writeByteEnDelay0,  writeByteEnDelay1,  writeByteEnDelay2,  writeByteEnDelay3,
            writeByteEnDelay4,  writeByteEnDelay5,  writeByteEnDelay6,  writeByteEnDelay7,
            writeByteEnDelay8,  writeByteEnDelay9,  writeByteEnDelay10, writeByteEnDelay11,
            writeByteEnDelay12, writeByteEnDelay13, writeByteEnDelay14, writeByteEnDelay15,
            writeByteEnDelay16, writeByteEnDelay17, writeByteEnDelay18, writeByteEnDelay19,
            writeByteEnDelay20, writeByteEnDelay21, writeByteEnDelay22, writeByteEnDelay23,
            writeByteEnDelay24, writeByteEnDelay25, writeByteEnDelay26, writeByteEnDelay27,
            writeByteEnDelay28, writeByteEnDelay29, writeByteEnDelay30, writeByteEnDelay31;

// 32-stage write-address delay line shared by the cur and left memories.
reg [4:0]   curWriteAddrDelay0,  curWriteAddrDelay1,  curWriteAddrDelay2,  curWriteAddrDelay3,
            curWriteAddrDelay4,  curWriteAddrDelay5,  curWriteAddrDelay6,  curWriteAddrDelay7,
            curWriteAddrDelay8,  curWriteAddrDelay9,  curWriteAddrDelay10, curWriteAddrDelay11,
            curWriteAddrDelay12, curWriteAddrDelay13, curWriteAddrDelay14, curWriteAddrDelay15,
            curWriteAddrDelay16, curWriteAddrDelay17, curWriteAddrDelay18, curWriteAddrDelay19,
            curWriteAddrDelay20, curWriteAddrDelay21, curWriteAddrDelay22, curWriteAddrDelay23,
            curWriteAddrDelay24, curWriteAddrDelay25, curWriteAddrDelay26, curWriteAddrDelay27,
            curWriteAddrDelay28, curWriteAddrDelay29, curWriteAddrDelay30, curWriteAddrDelay31;

// 12-stage read-address delay line for the cur memory.
reg [4:0]   curReadAddrDelay0, curReadAddrDelay1, curReadAddrDelay2,  curReadAddrDelay3,
            curReadAddrDelay4, curReadAddrDelay5, curReadAddrDelay6,  curReadAddrDelay7,
            curReadAddrDelay8, curReadAddrDelay9, curReadAddrDelay10, curReadAddrDelay11;

// Single-bit control pipelines packed into vectors (bit 0 is the output end).
reg [31:0]  leftWriteEnDelay;
reg [31:0]  curWriteEnDelay;
reg [4:0]   leftWriteSelDelay;
reg [15:0]  curWriteSelDelay;

// One-stage delay of the left read address.
reg [4:0]   leftReadAddrDelay0;

// 32-stage write-address delay line for the top memory.
reg [7:0]   topWriteAddrDelay0,  topWriteAddrDelay1,  topWriteAddrDelay2,  topWriteAddrDelay3,
            topWriteAddrDelay4,  topWriteAddrDelay5,  topWriteAddrDelay6,  topWriteAddrDelay7,
            topWriteAddrDelay8,  topWriteAddrDelay9,  topWriteAddrDelay10, topWriteAddrDelay11,
            topWriteAddrDelay12, topWriteAddrDelay13, topWriteAddrDelay14, topWriteAddrDelay15,
            topWriteAddrDelay16, topWriteAddrDelay17, topWriteAddrDelay18, topWriteAddrDelay19,
            topWriteAddrDelay20, topWriteAddrDelay21, topWriteAddrDelay22, topWriteAddrDelay23,
            topWriteAddrDelay24, topWriteAddrDelay25, topWriteAddrDelay26, topWriteAddrDelay27,
            topWriteAddrDelay28, topWriteAddrDelay29, topWriteAddrDelay30, topWriteAddrDelay31;

reg [31:0]  topWriteEnDelay;
reg [4:0]   topSourceSelDelay;

// 32-stage delay line for the 3-bit top write select.
reg [2:0]   topWriteSelDelay0,  topWriteSelDelay1,  topWriteSelDelay2,  topWriteSelDelay3,
            topWriteSelDelay4,  topWriteSelDelay5,  topWriteSelDelay6,  topWriteSelDelay7,
            topWriteSelDelay8,  topWriteSelDelay9,  topWriteSelDelay10, topWriteSelDelay11,
            topWriteSelDelay12, topWriteSelDelay13, topWriteSelDelay14, topWriteSelDelay15,
            topWriteSelDelay16, topWriteSelDelay17, topWriteSelDelay18, topWriteSelDelay19,
            topWriteSelDelay20, topWriteSelDelay21, topWriteSelDelay22, topWriteSelDelay23,
            topWriteSelDelay24, topWriteSelDelay25, topWriteSelDelay26, topWriteSelDelay27,
            topWriteSelDelay28, topWriteSelDelay29, topWriteSelDelay30, topWriteSelDelay31;

reg [5:0]   diagEnDelay;
reg [5:0]   MOEnDelay;

// Down-counter used by the `cWAIT state.
reg [4:0]   waitCycles;
// Parameter latch and start pipeline.
//  * m, n, loop and mode are captured from the *_in ports while start_in is high.
//  * stop / stop2 are loop bounds derived from the latched values.
//  * start is start_in delayed by the 16-bit startDelay shift register plus one
//    more flop (17 clock edges in total).
//  * mdivk is (m + 7) >> 3, i.e. the number of 8-row groups covering m rows;
//    only the low two bits are kept because mdivk is a 2-bit register.
always @ (posedge clk)
begin
    if (start_in == 1'b1)
    begin
        m    <= m_in;
        n    <= n_in;
        mode <= mode_in;
        loop <= loop_in;
    end

    // stop defaults to loop + 1 and is overridden to loop when mode[0] is
    // clear and m equals loop (the later non-blocking assignment wins).
    stop2 <= loop;
    stop  <= loop + 1'b1;
    if (mode[0] == 1'b0 && m == loop)
        stop <= loop;

    // Shift start_in through bit 0 towards bit 15, then register it once more.
    startDelay <= {startDelay[14:0], start_in};
    start      <= startDelay[15];

    mdivk <= (m + 7) >> 3;
end
// registers that store values that are used in FSM, dependent on i and/or j
// All index registers below are re-initialised while the delayed `start` pulse
// is high and otherwise advance only in specific main-FSM / row-FSM states.
// waitCycles (at the end of the block) is the one register with no start branch.
always @ (posedge clk)
begin
// topIdx: cleared on start; incremented in `cINCRE_I when i1modk == 7 and mode[0] == 0.
if (start == 1'b1)
topIdx <= 2'b00; //offset1divk;
else if (currentState == `cINCRE_I && i1modk == 8-1 && mode[0] == 1'b0)
topIdx <= topIdx + 1'b1;
// diagIdx: cleared on start; forced to 2 in `cSTORE_DIAG for mode 01; in `cINCRE_I
// it steps by 2, or by 3 when the relevant modulo-8 counter is at 7.
// The "else if" sharing a line with "diagIdx <= 2;" continues the same if-chain.
if (start == 1'b1)
diagIdx <= 5'b00000;
else if (currentState == `cSTORE_DIAG && mode == 2'b01)
diagIdx <= 2; else if (currentState == `cINCRE_I)
begin
if ((imodk == 8-1 && mode == 2'b00) || (i1modk == 8-1 && mode == 2'b01))
diagIdx <= diagIdx + 2 + 1;
else
diagIdx <= diagIdx + 2;
end
// leftIdx: cleared on start; in `cINCRE_I steps by 3 when i1modk == 7 and mode[0] == 0, else by 2.
if (start == 1'b1)
leftIdx <= 5'b00000;
else if (currentState == `cINCRE_I)
begin
if (i1modk == 8-1 && mode[0] == 1'b0)
leftIdx <= leftIdx + 2 + 1;
else
leftIdx <= leftIdx + 2;
end
// msIdx: cleared on start; reloaded in `cUPDATE_J (leftIdx + 2 when mode[1] == 0,
// topIdx otherwise); steps by 2 whenever the row FSM is about to enter `cLOAD_ROW_INC_J.
// The inner if/else is complete, so the trailing "else if" pairs with the `cUPDATE_J test.
if (start == 1'b1)
msIdx <= 5'b00000;
else if (currentState == `cUPDATE_J)
if (mode[1] == 1'b0)
msIdx <= leftIdx + 2;
else
msIdx <= topIdx;
else if (nextRowState == `cLOAD_ROW_INC_J)
msIdx <= msIdx + 2;
// imodk: modulo-8 counter starting at 0, advanced once per `cINCRE_I.
if (start == 1'b1)
imodk <= 3'b000;
else if (currentState == `cINCRE_I)
begin
if (imodk == 8-1)
imodk <= 3'b000;
else
imodk <= imodk + 1'b1;
end
// i1modk: same modulo-8 counter but starting at 1, so it runs one ahead of imodk.
if (start == 1'b1)
i1modk <= 3'b001;
else if (currentState == `cINCRE_I)
begin
if (i1modk == 8-1)
i1modk <= 3'b000;
else
i1modk <= i1modk + 1'b1;
end
// nextTopIdx: cleared on start; in `cINCRE_I advances by n + 1 (mode[1] == 0) or by n.
if (start == 1'b1)
nextTopIdx <= 8'b00000000;
else if (currentState == `cINCRE_I)
if (mode[1] == 0)
nextTopIdx <= nextTopIdx + n + 1;
else
nextTopIdx <= nextTopIdx + n;
// nextTopIdx2 is updated unconditionally every clock: nextTopIdx + n + 1.
nextTopIdx2 <= nextTopIdx + n + 1;
// curTopIdx: 1 on start; reloaded in `cUPDATE_J from nextTopIdx (+1 when mode[1] == 0);
// incremented whenever the row FSM is about to enter `cLOAD_ROW_INC_J.
if (start == 1'b1)
curTopIdx <= 8'b00000001;
else if (currentState == `cUPDATE_J)
if (mode[1] == 1'b0)
curTopIdx <= nextTopIdx+1;
else
curTopIdx <= nextTopIdx;
else if (nextRowState == `cLOAD_ROW_INC_J)
curTopIdx <= curTopIdx + 1;
// i1: starts at 1 and increments once per `cINCRE_I.
if (start == 1'b1)
i1 <= 5'b00001;
else if (currentState == `cINCRE_I)
i1 <= i1 + 1;
// j: cleared on start; reloaded in `cUPDATE_J (i1 when mode[1] == 0, else 0).
// Unlike msIdx / curTopIdx it increments on currentRowState (not nextRowState)
// being `cLOAD_ROW_INC_J, i.e. one cycle later.
if (start == 1'b1)
j <= 5'b00000;
else if (currentState == `cUPDATE_J)
if (mode[1] == 1'b0)
j <= i1;
else
j <= 5'b00000;
else if (currentRowState == `cLOAD_ROW_INC_J)
j <= j + 1;
// compute cycles of delay in FSM
// waitCycles is loaded in `cSTORE_MO / `cINCRE_I and otherwise counts down to 0;
// the main FSM leaves `cWAIT once it reaches 0.
// NOTE(review): waitCycles is 5 bits wide, so "32-1 + 6 - 3" (= 34) is stored
// as 2 after truncation - confirm that this wrap-around is intended.
if (currentState == `cSTORE_MO)
waitCycles <= 32-1;
else if (currentState == `cINCRE_I)
begin
// last iteration (i1 == stop-1): mode[1] selects a fixed load vs. adding 3
if (i1 == stop-1)
if (mode[1] == 1'b1)
waitCycles <= 32-1 + 6 - 3;
else
waitCycles <= waitCycles + 5 - 2;
// mode 01: raise waitCycles to at least 12 (32-1 - (16-1) - 4)
else if (mode == 2'b01 && waitCycles < 32-1 - (16-1) - 4)
waitCycles <= 32-1 - (16-1) - 4;
else if (mode == 2'b10 && i1modk == 8-1)
waitCycles <= 32-1 + 6 - 3;
else if (mode == 2'b00)
waitCycles <= waitCycles + 6 ;
end
else if (waitCycles >5'b00000)
waitCycles <= waitCycles - 1;
end
// determining next state of main FSM
// Purely combinational: every branch assigns both nextState and updateCounter,
// so no latch is inferred. updateCounter == 1 makes the address-counter block
// reload `counter` (to 0) and the *IdxCounter registers on the next clock edge.
always @ (currentState or start or mode or m or n or counter or mdivk or topIdxCounter or doneFetchRow or divCounter or j or stop2 or waitCycles or stop or i1)
begin
case (currentState)
// idle until the delayed start pulse arrives
`cSETUP:
begin
if (start == 1'b1)
nextState = `cSTART;
else
nextState = `cSETUP;
updateCounter = 1'b1;
end
// dispatch on mode; mode 00 with m == 1 and n == 1 finishes immediately
`cSTART:
begin
if (mode == 2'b00)
begin
if (m == 1 && n == 1)
nextState = `cDONE;
else
nextState = `cFETCH_COL;
end
else if (mode == 2'b01)
nextState = `cSTORE_DIAG;
else if (mode == 2'b10)
nextState = `cSTART_FETCH_ROW;
else
nextState = `cUPDATE_J;
updateCounter = 1'b1;
end
// stay here until counter reaches 10 (5+6-1); counter keeps running (updateCounter = 0)
`cSTART_FETCH_ROW:
begin
if (counter == 5+6-1)
begin
if (mode == 2'b00)
nextState = `cSTORE_DIAG;
else
nextState = `cUPDATE_J;
end
else
nextState = `cSTART_FETCH_ROW;
updateCounter = 1'b0;
end
// run for mdivk cycles; in mode 00 pad via `cWAIT_COL if fewer than 5 cycles elapsed
`cFETCH_COL:
if (counter >= mdivk-1)
begin
if (mode == 2'b00 && counter < 5)
begin
nextState = `cWAIT_COL;
updateCounter = 1'b0;
end
else
begin
if (mode == 2'b00)
nextState = `cSTART_FETCH_ROW;
else
nextState = `cFIND_REC;
updateCounter = 1'b1;
end
end
else
begin
nextState = `cFETCH_COL;
updateCounter = 1'b0;
end
// padding state: wait until counter >= 5, then continue as `cFETCH_COL would have
`cWAIT_COL:
if (counter >= 5)
begin
if (mode == 0)
nextState = `cSTART_FETCH_ROW;
else
nextState = `cFIND_REC;
updateCounter = 1;
end
else
begin
nextState = `cWAIT_COL;
updateCounter = 0;
end
// single-cycle state
`cSTORE_DIAG:
begin
if (mode == 0)
nextState = `cFIND_REC;
else
nextState = `cFETCH_COL;
updateCounter = 1;
end
// wait until divCounter (restarted in `cSTORE_DIAG / `cSTORE_DIAG2) saturates at 56
`cFIND_REC:
if (divCounter == 56)
begin
if (mode == 0)
nextState = `cMULT_COL;
else
nextState = `cSTORE_DIAG2;
updateCounter = 1;
end
else
begin
nextState = `cFIND_REC;
updateCounter = 0;
end
// single-cycle state
`cSTORE_DIAG2:
begin
nextState = `cMULT_COL;
updateCounter = 1;
end
// iterate until topIdxCounter reaches mdivk-1; updateCounter stays 0 on both paths
`cMULT_COL:
if (topIdxCounter == mdivk-1)
begin
nextState = `cUPDATE_J;
updateCounter = 0;
end
else
begin
nextState = `cMULT_COL;
updateCounter = 0;
end
// wait for the row FSM (doneFetchRow); when mode[1] == 0 also require counter >= 15
`cUPDATE_J:
if ((mode[1] == 1 || counter >= 16-1) && doneFetchRow == 1)
begin
nextState = `cSTORE_MO;
updateCounter = 1;
end
else
begin
nextState = `cUPDATE_J;
updateCounter = 0;
end
// if j has reached stop2, drain for mdivk+2 more counts and finish; otherwise go multiply
`cSTORE_MO:
begin
if (j == stop2)
begin
if (counter == mdivk-1+5-2)
nextState = `cDONE;
else
nextState = `cSTORE_MO;
updateCounter = 0;
end
else
begin
nextState = `cMULT_SUB;
updateCounter = 1;
end
end
// each pass ends when topIdxCounter == mdivk-1; leave once j == n-1
`cMULT_SUB:
if (topIdxCounter == mdivk-1)
begin
if (j == n-1)
nextState = `cINCRE_I;
else
nextState = `cMULT_SUB;
updateCounter = 1;
end
else
begin
nextState = `cMULT_SUB;
updateCounter = 0;
end
// single-cycle state in which the index registers advance
`cINCRE_I:
begin
nextState = `cWAIT;
updateCounter = 1;
end
// hold until waitCycles has counted down to 0, then finish or start the next iteration
`cWAIT:
if (waitCycles == 0)
begin
if (i1 == stop)
nextState = `cDONE;
else if (mode == 0)
nextState = `cSTORE_DIAG;
else if (mode == 1)
nextState = `cFIND_REC;
else
nextState = `cUPDATE_J;
updateCounter = 1;
end
else
begin
nextState = `cWAIT;
updateCounter = 0;
end
// terminal state; only start_in (in the state-register block) leaves it
`cDONE:
begin
nextState = `cDONE;
updateCounter = 0;
end
// unknown encodings fall back to `cSETUP
default:
begin
nextState = `cSETUP;
updateCounter = 1;
end
endcase
end
// Combinational handshake flags between the main FSM and the row FSM.
//   doneFetchRow  - row FSM is parked in `cDONE_FETCH_ROW
//   startFetchRow - main FSM is entering `cSTART_FETCH_ROW while i1 == 1
//   loadRow       - `cMULT_SUB pass is two counts before mdivk
//   writeRow      - in `cMULT_SUB (mode[0] == 0, j != n) when msIdxCounter
//                   matches readRowCounter
// Each flag gets a default of 0 and is raised by its condition.
always @ (currentRowState or currentState or nextState or i1 or topIdxCounter or mdivk or msIdxCounter or readRowCounter or j or n or mode)
begin
    doneFetchRow  = 0;
    startFetchRow = 0;
    loadRow       = 0;

    if (currentRowState == `cDONE_FETCH_ROW)
        doneFetchRow = 1;

    if ((nextState == `cSTART_FETCH_ROW && currentState != `cSTART_FETCH_ROW && i1 == 1))
        startFetchRow = 1;

    if (currentState == `cMULT_SUB && topIdxCounter+2 == mdivk)
        loadRow = 1;

    writeRow = (msIdxCounter == readRowCounter)
            && (currentState == `cMULT_SUB)
            && (j != n)
            && (mode[0] == 0);
end
// Row FSM next-state logic (drives the control signals of the temp_top block).
// The resting state `cDONE_FETCH_ROW is the default; each case only names the
// conditions that move the machine somewhere else.
always @ (currentRowState or nextTopIdxCounter or n or startFetchRow or loadRow or topIdx or mdivk or nextState)
begin
    nextRowState = `cDONE_FETCH_ROW;

    case (currentRowState)
        // keep fetching until the running top index reaches n-1
        `cFETCH_ROW:
            if (nextTopIdxCounter == n-1)
                nextRowState = `cDONE_FETCH_ROW;
            else
                nextRowState = `cFETCH_ROW;

        // parked: a start request wins over a load request
        `cDONE_FETCH_ROW:
            if (startFetchRow == 1)
                nextRowState = `cFETCH_ROW;
            else if (loadRow == 1 || (topIdx+1 == mdivk && nextState == `cMULT_SUB))
                nextRowState = `cLOAD_ROW_INC_J;

        // repeat the load only while the single-group condition still holds
        `cLOAD_ROW_INC_J:
            if (topIdx+1 == mdivk && nextState == `cMULT_SUB)
                nextRowState = `cLOAD_ROW_INC_J;

        default:
            nextRowState = `cDONE_FETCH_ROW;
    endcase
end
// address counters
// Running copies of the index registers. They are reloaded when the main FSM
// asserts updateCounter (and, for some, while the row FSM is in
// `cLOAD_ROW_INC_J) and otherwise count up every clock.
always @ (posedge clk)
begin
    // top / diag / ms / left counters: reload from their base index or increment
    if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
        topIdxCounter <= topIdx;
    else
        topIdxCounter <= topIdxCounter + 1;

    if (updateCounter == 1)
        diagIdxCounter <= diagIdx;
    else
        diagIdxCounter <= diagIdxCounter + 1;

    if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
        msIdxCounter <= msIdx;
    else
        msIdxCounter <= msIdxCounter + 1;

    if (updateCounter == 1 || currentRowState == `cLOAD_ROW_INC_J)
        leftIdxCounter <= leftIdx;
    else
        leftIdxCounter <= leftIdxCounter + 1;

    // topWriteCounter: reloaded with i1, advanced while rows are written/fetched
    if (currentState == `cFETCH_COL || currentState == `cSTORE_MO)
        topWriteCounter <= i1;
    else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
        topWriteCounter <= topWriteCounter + 1;

    // nextTopIdxCounter: running write index into the top memory
    if (currentState == `cSTART)
        nextTopIdxCounter <= nextTopIdx;
    else if (currentState == `cSTORE_MO)
        if (mode[1] == 0)
            nextTopIdxCounter <= nextTopIdx + n + 1;
        else
            nextTopIdxCounter <= nextTopIdx + n;
    else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
        nextTopIdxCounter <= nextTopIdxCounter + 1;

    // readRowCounter: steps by 2 while rows are written/fetched
    if (currentState == `cSTART)
        readRowCounter <= 0; //offsetdivk;
    else if (currentState == `cSTORE_MO)
        if (mode[1] == 0)
            readRowCounter <= leftIdx + 2;
        else
            readRowCounter <= topIdx;
    else if (writeRow == 1 || currentRowState == `cFETCH_ROW)
        readRowCounter <= readRowCounter + 2;

    // free-running cycle counter, cleared by updateCounter
    if (updateCounter == 1)
        counter <= 0;
    else
        counter <= counter + 1;

    // divCounter restarts in the STORE_DIAG states and saturates at 56
    if (currentState == `cSTORE_DIAG || currentState == `cSTORE_DIAG2)
        divCounter <= 0;
    else if (divCounter < 56)
        divCounter <= divCounter + 1;

    // Byte-enable mask for row (i1 mod 8): all ones shifted right by 4 bits per
    // row, i.e. by i1modk * 4.
    // The previous form "~(32'b0) >> (3'bNNN << 2'b10)" computed the shift
    // amount as a self-determined 3-bit expression, so NNN << 2 was truncated to
    // 3 bits and only ever produced a shift of 0 or 4. Building the amount as
    // the 5-bit value {i1modk, 2'b00} yields the full 0, 4, 8, ..., 28 range.
    i1modkByteEn <= 32'hFFFFFFFF >> {i1modk, 2'b00};
end
// Byte-enable selection.
// byteEn is all ones by default; the partial mask i1modkByteEn is used on the
// cycle the main FSM enters `cMULT_COL, throughout `cSTORE_MO, and while the
// row FSM is in `cLOAD_ROW_INC_J.
always @ (posedge clk)
begin
    byteEn <= 32'hFFFFFFFF;
    if ((nextState == `cMULT_COL && currentState != `cMULT_COL) || (currentState == `cSTORE_MO) || currentRowState == `cLOAD_ROW_INC_J)
        byteEn <= i1modkByteEn;
end
// State registers for both FSMs.
// Each register normally follows its next-state signal. The main FSM is forced
// to `cSETUP directly by start_in, whereas the row FSM is forced to
// `cDONE_FETCH_ROW by the delayed start pulse. The override is written last so
// that it takes precedence over the default non-blocking assignment.
always @ (posedge clk)
begin
    currentState <= nextState;
    if (start_in == 1'b1)
        currentState <= `cSETUP;

    currentRowState <= nextRowState;
    if (start == 1'b1)
        currentRowState <= `cDONE_FETCH_ROW;
end
// Delay pipelines for control signals.
// Control signals are delayed to match the latency of the arithmetic operations
// and/or memory accesses. Every pipeline shifts from its highest-numbered stage
// towards stage 0 (or bit 0); some pipelines have a second, shorter entry point
// that is selected in a particular FSM state. Stages are listed from the input
// end to the output end; all assignments are non-blocking, so order is
// irrelevant to behaviour.
always @ (posedge clk)
begin
    // ------------------------------------------------------------------
    // cur read address: 12 stages fed by msIdxCounter
    // ------------------------------------------------------------------
    curReadAddrDelay11 <= msIdxCounter;
    curReadAddrDelay10 <= curReadAddrDelay11;
    curReadAddrDelay9  <= curReadAddrDelay10;
    curReadAddrDelay8  <= curReadAddrDelay9;
    curReadAddrDelay7  <= curReadAddrDelay8;
    curReadAddrDelay6  <= curReadAddrDelay7;
    curReadAddrDelay5  <= curReadAddrDelay6;
    curReadAddrDelay4  <= curReadAddrDelay5;
    curReadAddrDelay3  <= curReadAddrDelay4;
    curReadAddrDelay2  <= curReadAddrDelay3;
    curReadAddrDelay1  <= curReadAddrDelay2;
    curReadAddrDelay0  <= curReadAddrDelay1;

    // ------------------------------------------------------------------
    // cur/left write address: 32 stages fed by msIdxCounter, with extra
    // entry points at stage 15 (`cMULT_COL) and stage 4 (`cFETCH_COL)
    // ------------------------------------------------------------------
    curWriteAddrDelay31 <= msIdxCounter;
    curWriteAddrDelay30 <= curWriteAddrDelay31;
    curWriteAddrDelay29 <= curWriteAddrDelay30;
    curWriteAddrDelay28 <= curWriteAddrDelay29;
    curWriteAddrDelay27 <= curWriteAddrDelay28;
    curWriteAddrDelay26 <= curWriteAddrDelay27;
    curWriteAddrDelay25 <= curWriteAddrDelay26;
    curWriteAddrDelay24 <= curWriteAddrDelay25;
    curWriteAddrDelay23 <= curWriteAddrDelay24;
    curWriteAddrDelay22 <= curWriteAddrDelay23;
    curWriteAddrDelay21 <= curWriteAddrDelay22;
    curWriteAddrDelay20 <= curWriteAddrDelay21;
    curWriteAddrDelay19 <= curWriteAddrDelay20;
    curWriteAddrDelay18 <= curWriteAddrDelay19;
    curWriteAddrDelay17 <= curWriteAddrDelay18;
    curWriteAddrDelay16 <= curWriteAddrDelay17;
    if (currentState == `cMULT_COL)
        curWriteAddrDelay15 <= leftIdxCounter;
    else
        curWriteAddrDelay15 <= curWriteAddrDelay16;
    curWriteAddrDelay14 <= curWriteAddrDelay15;
    curWriteAddrDelay13 <= curWriteAddrDelay14;
    curWriteAddrDelay12 <= curWriteAddrDelay13;
    curWriteAddrDelay11 <= curWriteAddrDelay12;
    curWriteAddrDelay10 <= curWriteAddrDelay11;
    curWriteAddrDelay9  <= curWriteAddrDelay10;
    curWriteAddrDelay8  <= curWriteAddrDelay9;
    curWriteAddrDelay7  <= curWriteAddrDelay8;
    curWriteAddrDelay6  <= curWriteAddrDelay7;
    curWriteAddrDelay5  <= curWriteAddrDelay6;
    if (currentState == `cFETCH_COL)
        curWriteAddrDelay4 <= diagIdxCounter;
    else
        curWriteAddrDelay4 <= curWriteAddrDelay5;
    curWriteAddrDelay3 <= curWriteAddrDelay4;
    curWriteAddrDelay2 <= curWriteAddrDelay3;
    curWriteAddrDelay1 <= curWriteAddrDelay2;
    curWriteAddrDelay0 <= curWriteAddrDelay1;

    // ------------------------------------------------------------------
    // write byte enable: 32 stages fed by byteEn, extra entry points at
    // stage 15 (`cMULT_COL) and stage 4 (all ones when mode[0] is set,
    // byteEn in `cFETCH_COL)
    // ------------------------------------------------------------------
    writeByteEnDelay31 <= byteEn;
    writeByteEnDelay30 <= writeByteEnDelay31;
    writeByteEnDelay29 <= writeByteEnDelay30;
    writeByteEnDelay28 <= writeByteEnDelay29;
    writeByteEnDelay27 <= writeByteEnDelay28;
    writeByteEnDelay26 <= writeByteEnDelay27;
    writeByteEnDelay25 <= writeByteEnDelay26;
    writeByteEnDelay24 <= writeByteEnDelay25;
    writeByteEnDelay23 <= writeByteEnDelay24;
    writeByteEnDelay22 <= writeByteEnDelay23;
    writeByteEnDelay21 <= writeByteEnDelay22;
    writeByteEnDelay20 <= writeByteEnDelay21;
    writeByteEnDelay19 <= writeByteEnDelay20;
    writeByteEnDelay18 <= writeByteEnDelay19;
    writeByteEnDelay17 <= writeByteEnDelay18;
    writeByteEnDelay16 <= writeByteEnDelay17;
    if (currentState == `cMULT_COL)
        writeByteEnDelay15 <= byteEn;
    else
        writeByteEnDelay15 <= writeByteEnDelay16;
    writeByteEnDelay14 <= writeByteEnDelay15;
    writeByteEnDelay13 <= writeByteEnDelay14;
    writeByteEnDelay12 <= writeByteEnDelay13;
    writeByteEnDelay11 <= writeByteEnDelay12;
    writeByteEnDelay10 <= writeByteEnDelay11;
    writeByteEnDelay9  <= writeByteEnDelay10;
    writeByteEnDelay8  <= writeByteEnDelay9;
    writeByteEnDelay7  <= writeByteEnDelay8;
    writeByteEnDelay6  <= writeByteEnDelay7;
    writeByteEnDelay5  <= writeByteEnDelay6;
    if (mode[0] == 1'b1)
        writeByteEnDelay4 <= ~0;
    else if (currentState == `cFETCH_COL)
        writeByteEnDelay4 <= byteEn;
    else
        writeByteEnDelay4 <= writeByteEnDelay5;
    writeByteEnDelay3 <= writeByteEnDelay4;
    writeByteEnDelay2 <= writeByteEnDelay3;
    writeByteEnDelay1 <= writeByteEnDelay2;
    writeByteEnDelay0 <= writeByteEnDelay1;

    // ------------------------------------------------------------------
    // cur write select: 16 stages, input is 0 in `cMULT_COL and 1 otherwise
    // ------------------------------------------------------------------
    if (currentState == `cMULT_COL)
        curWriteSelDelay[15] <= 1'b0;
    else
        curWriteSelDelay[15] <= 1'b1;
    curWriteSelDelay[14:0] <= curWriteSelDelay[15:1];

    // ------------------------------------------------------------------
    // cur write enable: 32 stages fed in `cMULT_SUB, stage 15 forced high
    // in `cMULT_COL
    // ------------------------------------------------------------------
    if (currentState == `cMULT_SUB)
        curWriteEnDelay[31] <= 1'b1;
    else
        curWriteEnDelay[31] <= 1'b0;
    curWriteEnDelay[30:16] <= curWriteEnDelay[31:17];
    if (currentState == `cMULT_COL)
        curWriteEnDelay[15] <= 1'b1;
    else
        curWriteEnDelay[15] <= curWriteEnDelay[16];
    curWriteEnDelay[14:0] <= curWriteEnDelay[15:1];

    // ------------------------------------------------------------------
    // left write select: 5 stages, input is 0 in `cFETCH_COL and 1 otherwise
    // ------------------------------------------------------------------
    if (currentState == `cFETCH_COL)
        leftWriteSelDelay[4] <= 1'b0;
    else
        leftWriteSelDelay[4] <= 1'b1;
    leftWriteSelDelay[3:0] <= leftWriteSelDelay[4:1];

    // ------------------------------------------------------------------
    // left write enable: 32 stages; fed in `cMULT_SUB (mode 0, or mode 1
    // with j == i1); stage 15 forced high in `cMULT_COL, stage 4 in
    // `cFETCH_COL
    // ------------------------------------------------------------------
    if (currentState == `cMULT_SUB && (mode == 0 || (mode == 1 && j == i1)))
        leftWriteEnDelay[31] <= 1'b1;
    else
        leftWriteEnDelay[31] <= 1'b0;
    leftWriteEnDelay[30:16] <= leftWriteEnDelay[31:17];
    if (currentState == `cMULT_COL)
        leftWriteEnDelay[15] <= 1'b1;
    else
        leftWriteEnDelay[15] <= leftWriteEnDelay[16];
    leftWriteEnDelay[14:5] <= leftWriteEnDelay[15:6];
    if (currentState == `cFETCH_COL)
        leftWriteEnDelay[4] <= 1'b1;
    else
        leftWriteEnDelay[4] <= leftWriteEnDelay[5];
    leftWriteEnDelay[3:0] <= leftWriteEnDelay[4:1];

    // ------------------------------------------------------------------
    // top write address: 32 stages fed by nextTopIdxCounter, with a short
    // entry at stage 4 while the row FSM is in `cFETCH_ROW
    // ------------------------------------------------------------------
    topWriteAddrDelay31 <= nextTopIdxCounter;
    topWriteAddrDelay30 <= topWriteAddrDelay31;
    topWriteAddrDelay29 <= topWriteAddrDelay30;
    topWriteAddrDelay28 <= topWriteAddrDelay29;
    topWriteAddrDelay27 <= topWriteAddrDelay28;
    topWriteAddrDelay26 <= topWriteAddrDelay27;
    topWriteAddrDelay25 <= topWriteAddrDelay26;
    topWriteAddrDelay24 <= topWriteAddrDelay25;
    topWriteAddrDelay23 <= topWriteAddrDelay24;
    topWriteAddrDelay22 <= topWriteAddrDelay23;
    topWriteAddrDelay21 <= topWriteAddrDelay22;
    topWriteAddrDelay20 <= topWriteAddrDelay21;
    topWriteAddrDelay19 <= topWriteAddrDelay20;
    topWriteAddrDelay18 <= topWriteAddrDelay19;
    topWriteAddrDelay17 <= topWriteAddrDelay18;
    topWriteAddrDelay16 <= topWriteAddrDelay17;
    topWriteAddrDelay15 <= topWriteAddrDelay16;
    topWriteAddrDelay14 <= topWriteAddrDelay15;
    topWriteAddrDelay13 <= topWriteAddrDelay14;
    topWriteAddrDelay12 <= topWriteAddrDelay13;
    topWriteAddrDelay11 <= topWriteAddrDelay12;
    topWriteAddrDelay10 <= topWriteAddrDelay11;
    topWriteAddrDelay9  <= topWriteAddrDelay10;
    topWriteAddrDelay8  <= topWriteAddrDelay9;
    topWriteAddrDelay7  <= topWriteAddrDelay8;
    topWriteAddrDelay6  <= topWriteAddrDelay7;
    topWriteAddrDelay5  <= topWriteAddrDelay6;
    if (currentRowState == `cFETCH_ROW)
        topWriteAddrDelay4 <= nextTopIdxCounter;
    else
        topWriteAddrDelay4 <= topWriteAddrDelay5;
    topWriteAddrDelay3 <= topWriteAddrDelay4;
    topWriteAddrDelay2 <= topWriteAddrDelay3;
    topWriteAddrDelay1 <= topWriteAddrDelay2;
    topWriteAddrDelay0 <= topWriteAddrDelay1;

    // ------------------------------------------------------------------
    // top write enable: 32 stages fed by writeRow, stage 4 forced high
    // while the row FSM is in `cFETCH_ROW
    // ------------------------------------------------------------------
    topWriteEnDelay[31]   <= writeRow;
    topWriteEnDelay[30:5] <= topWriteEnDelay[31:6];
    if (currentRowState == `cFETCH_ROW)
        topWriteEnDelay[4] <= 1'b1;
    else
        topWriteEnDelay[4] <= topWriteEnDelay[5];
    topWriteEnDelay[3:0] <= topWriteEnDelay[4:1];

    // ------------------------------------------------------------------
    // top write select: 32 stages fed by i1modk; stage 4 takes imodk while
    // fetching a row, or in `cUPDATE_J when i1 == 1 (&& binds tighter than
    // ||, made explicit with parentheses)
    // ------------------------------------------------------------------
    topWriteSelDelay31 <= i1modk;
    topWriteSelDelay30 <= topWriteSelDelay31;
    topWriteSelDelay29 <= topWriteSelDelay30;
    topWriteSelDelay28 <= topWriteSelDelay29;
    topWriteSelDelay27 <= topWriteSelDelay28;
    topWriteSelDelay26 <= topWriteSelDelay27;
    topWriteSelDelay25 <= topWriteSelDelay26;
    topWriteSelDelay24 <= topWriteSelDelay25;
    topWriteSelDelay23 <= topWriteSelDelay24;
    topWriteSelDelay22 <= topWriteSelDelay23;
    topWriteSelDelay21 <= topWriteSelDelay22;
    topWriteSelDelay20 <= topWriteSelDelay21;
    topWriteSelDelay19 <= topWriteSelDelay20;
    topWriteSelDelay18 <= topWriteSelDelay19;
    topWriteSelDelay17 <= topWriteSelDelay18;
    topWriteSelDelay16 <= topWriteSelDelay17;
    topWriteSelDelay15 <= topWriteSelDelay16;
    topWriteSelDelay14 <= topWriteSelDelay15;
    topWriteSelDelay13 <= topWriteSelDelay14;
    topWriteSelDelay12 <= topWriteSelDelay13;
    topWriteSelDelay11 <= topWriteSelDelay12;
    topWriteSelDelay10 <= topWriteSelDelay11;
    topWriteSelDelay9  <= topWriteSelDelay10;
    topWriteSelDelay8  <= topWriteSelDelay9;
    topWriteSelDelay7  <= topWriteSelDelay8;
    topWriteSelDelay6  <= topWriteSelDelay7;
    topWriteSelDelay5  <= topWriteSelDelay6;
    if (currentRowState == `cFETCH_ROW || (currentState == `cUPDATE_J && i1 == 1))
        topWriteSelDelay4 <= imodk;
    else
        topWriteSelDelay4 <= topWriteSelDelay5;
    topWriteSelDelay3 <= topWriteSelDelay4;
    topWriteSelDelay2 <= topWriteSelDelay3;
    topWriteSelDelay1 <= topWriteSelDelay2;
    topWriteSelDelay0 <= topWriteSelDelay1;

    // ------------------------------------------------------------------
    // top source select: 5 stages; the input bit is sticky - cleared by
    // start, set in `cSTORE_MO, otherwise it holds its value
    // ------------------------------------------------------------------
    if (start == 1'b1)
        topSourceSelDelay[4] <= 1'b0;
    else if (currentState == `cSTORE_MO)
        topSourceSelDelay[4] <= 1'b1;
    topSourceSelDelay[3:0] <= topSourceSelDelay[4:1];

    // left read address: single register stage
    leftReadAddrDelay0 <= leftIdxCounter;

    // diag enable: 6 stages fed while in either STORE_DIAG state
    diagEnDelay[5]   <= (currentState == `cSTORE_DIAG || currentState == `cSTORE_DIAG2);
    diagEnDelay[4:0] <= diagEnDelay[5:1];

    // MO enable: 6 stages fed in `cSTORE_MO or while the row FSM loads a row
    if (currentState == `cSTORE_MO || currentRowState == `cLOAD_ROW_INC_J)
        MOEnDelay[5] <= 1'b1;
    else
        MOEnDelay[5] <= 1'b0;
    MOEnDelay[4:0] <= MOEnDelay[5:1];
end
// output control signals
// Final register stage for every module output. Single-bit outputs are taken
// from bit 0 (the output end) of their delay vectors; the explicit [0] select
// matches what assigning the whole vector to a 1-bit register keeps.
always @ (posedge clk)
begin
    // ---- cur memory ----
    // read address: direct index while fetching a column or a row,
    // otherwise the delayed msIdxCounter
    if (currentState == `cFETCH_COL)
        curReadAddr <= diagIdxCounter;
    else if (currentRowState == `cFETCH_ROW)
        curReadAddr <= readRowCounter;
    else
        curReadAddr <= curReadAddrDelay0;
    curWriteAddr   <= curWriteAddrDelay0;
    curWriteByteEn <= writeByteEnDelay0;
    curWriteSel    <= curWriteSelDelay[0];
    curWriteEn     <= curWriteEnDelay[0];

    // ---- left memory (shares write address and byte enable with cur) ----
    if (currentState == `cMULT_COL)
        leftReadAddr <= leftIdxCounter;
    else
        leftReadAddr <= leftReadAddrDelay0;
    leftWriteAddr   <= curWriteAddrDelay0;
    leftWriteByteEn <= writeByteEnDelay0;
    leftWriteSel    <= leftWriteSelDelay[0];
    leftWriteEn     <= leftWriteEnDelay[0];

    // ---- top memory ----
    if (currentState == `cSTORE_DIAG)
        topReadAddr <= nextTopIdx;
    else if (currentState == `cSTORE_DIAG2)
        topReadAddr <= nextTopIdx2;
    else
        topReadAddr <= curTopIdx;
    topWriteAddr <= topWriteAddrDelay0;
    topWriteEn   <= topWriteEnDelay[0];
    topWriteSel  <= topWriteSelDelay0;
    topSourceSel <= topSourceSelDelay[0];

    // ---- MO / diag controls ----
    MOSel <= ~(currentState == `cFIND_REC);
    if (currentState == `cFIND_REC)
        MOEn <= 1'b1;
    else
        MOEn <= MOEnDelay[0];
    diagEn <= diagEnDelay[0];

    // ---- completion flag ----
    if (currentState == `cDONE)
        done <= 1'b1;
    else
        done <= 1'b0;
end
endmodule
// ram: `RAMWIDTH-bit wide memory with one write port and one read port, built
// on the dual_port_ram primitive.
//   port 1 of the primitive - write side (wren / wraddress / data)
//   port 2 of the primitive - read side  (rdaddress -> q), never written
// byteena_a is accepted for interface compatibility but is NOT applied to the
// write: it is only OR-reduced into the otherwise unused wire j.
// All nets are declared before their first use (previously `dummy` was
// referenced by the q assignment ahead of its declaration).
module ram (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);
	input [`RAMNUMBYTES-1:0] byteena_a;
	input clk;
	input [`RAMWIDTH-1:0] data;
	input [`rRAMSIZEWIDTH-1:0] rdaddress;
	input [`rRAMSIZEWIDTH-1:0] wraddress;
	input wren;
	output [`RAMWIDTH-1:0] q;

	wire [`RAMWIDTH-1:0] value_out;    // port-1 read data (unused content)
	wire [`RAMWIDTH-1:0] subwire;      // port-2 read data
	wire [`RAMWIDTH-1:0] uselessdata;  // constant-zero write data for port 2
	wire [`RAMWIDTH-1:0] dummy;        // value_out masked to zero
	wire j;                            // reduction of the ignored byte enables

	assign uselessdata = 256'b0;
	assign j = |byteena_a;
	// dummy is always zero, so q equals subwire; presumably this only keeps
	// out1 connected to logic - TODO confirm against the tool flow.
	assign dummy = value_out & 256'b0;
	assign q = subwire | dummy;

	dual_port_ram inst1(
	.clk (clk),
	.we1(wren),
	.we2(1'b0),
	.data1(data),
	.data2(uselessdata),
	.out1(value_out),
	.out2(subwire),
	.addr1(wraddress),
	.addr2(rdaddress));
endmodule
// ram1: `RAMWIDTH-bit wide memory with one write port and one read port, built
// on the dual_port_ram primitive.
//   port 1 of the primitive - write side (wren / wraddress / data)
//   port 2 of the primitive - read side  (rdaddress -> q), never written
// byteena_a is accepted for interface compatibility but is NOT applied to the
// write: it is only OR-reduced into the otherwise unused wire j.
// All nets are declared before their first use (previously `dummy` was
// referenced by the q assignment ahead of its declaration).
module ram1 (
	byteena_a,
	clk,
	data,
	rdaddress,
	wraddress,
	wren,
	q
	);
	input [`RAMNUMBYTES-1:0] byteena_a;
	input clk;
	input [`RAMWIDTH-1:0] data;
	input [`rRAMSIZEWIDTH-1:0] rdaddress;
	input [`rRAMSIZEWIDTH-1:0] wraddress;
	input wren;
	output [`RAMWIDTH-1:0] q;

	wire [`RAMWIDTH-1:0] value_out;    // port-1 read data (unused content)
	wire [`RAMWIDTH-1:0] subwire;      // port-2 read data
	wire [`RAMWIDTH-1:0] uselessdata;  // constant-zero write data for port 2
	wire [`RAMWIDTH-1:0] dummy;        // value_out masked to zero
	wire j;                            // reduction of the ignored byte enables

	assign uselessdata = 256'b0;
	assign j = |byteena_a;
	// dummy is always zero, so q equals subwire; presumably this only keeps
	// out1 connected to logic - TODO confirm against the tool flow.
	assign dummy = value_out & 256'b0;
	assign q = subwire | dummy;

	dual_port_ram inst1(
	.clk (clk),
	.we1(wren),
	.we2(1'b0),
	.data1(data),
	.data2(uselessdata),
	.out1(value_out),
	.out2(subwire),
	.addr1(wraddress),
	.addr2(rdaddress));
endmodule
// ram2: two-port memory wrapper built on the dual_port_ram primitive.
//   Port 1 of the primitive is the write port (wraddress / data / wren).
//   Port 2 is read-only here (we2 tied low) and drives q from rdaddress.
//   byteena_a does NOT gate writes in this wrapper: it is only OR-reduced into the
//   otherwise unused net j.
// All internal nets are declared before their first use so that tools which enforce
// declare-before-use accept the module.
module ram2 (
    byteena_a,
    clk,
    data,
    rdaddress,
    wraddress,
    wren,
    q
);
    input  [`RAMNUMBYTES-1:0]   byteena_a;
    input                       clk;
    input  [`RAMWIDTH-1:0]      data;
    input  [`rRAMSIZEWIDTH-1:0] rdaddress;
    input  [`rRAMSIZEWIDTH-1:0] wraddress;
    input                       wren;
    output [`RAMWIDTH-1:0]      q;

    wire [`RAMWIDTH-1:0] value_out;    // out1 of the primitive (write-port side)
    wire [`RAMWIDTH-1:0] subwire;      // out2 of the primitive (read-port side)
    wire [`RAMWIDTH-1:0] uselessdata;  // constant zero fed to the unused data2 input
    wire [`RAMWIDTH-1:0] dummy;        // value_out masked with zero, i.e. always zero
    wire                 j;            // OR-reduction of byteena_a; never read

    assign uselessdata = 256'b0;
    assign j           = |byteena_a;
    // dummy is always zero; presumably it exists only so that out1 has a consumer.
    assign dummy       = value_out & 256'b0;
    assign q           = subwire | dummy;

    dual_port_ram inst1(
        .clk  (clk),
        .we1  (wren),
        .we2  (1'b0),
        .data1(data),
        .data2(uselessdata),
        .out1 (value_out),
        .out2 (subwire),
        .addr1(wraddress),
        .addr2(rdaddress));
endmodule
// ram3: two-port memory wrapper built on the dual_port_ram primitive.
//   Port 1 of the primitive is the write port (wraddress / data / wren).
//   Port 2 is read-only here (we2 tied low) and drives q from rdaddress.
//   byteena_a does NOT gate writes in this wrapper: it is only OR-reduced into the
//   otherwise unused net j.
// All internal nets are declared before their first use so that tools which enforce
// declare-before-use accept the module.
module ram3 (
    byteena_a,
    clk,
    data,
    rdaddress,
    wraddress,
    wren,
    q
);
    input  [`RAMNUMBYTES-1:0]   byteena_a;
    input                       clk;
    input  [`RAMWIDTH-1:0]      data;
    input  [`rRAMSIZEWIDTH-1:0] rdaddress;
    input  [`rRAMSIZEWIDTH-1:0] wraddress;
    input                       wren;
    output [`RAMWIDTH-1:0]      q;

    wire [`RAMWIDTH-1:0] value_out;    // out1 of the primitive (write-port side)
    wire [`RAMWIDTH-1:0] subwire;      // out2 of the primitive (read-port side)
    wire [`RAMWIDTH-1:0] uselessdata;  // constant zero fed to the unused data2 input
    wire [`RAMWIDTH-1:0] dummy;        // value_out masked with zero, i.e. always zero
    wire                 j;            // OR-reduction of byteena_a; never read

    assign uselessdata = 256'b0;
    assign j           = |byteena_a;
    // dummy is always zero; presumably it exists only so that out1 has a consumer.
    assign dummy       = value_out & 256'b0;
    assign q           = subwire | dummy;

    dual_port_ram inst1(
        .clk  (clk),
        .we1  (wren),
        .we2  (1'b0),
        .data1(data),
        .data2(uselessdata),
        .out1 (value_out),
        .out2 (subwire),
        .addr1(wraddress),
        .addr2(rdaddress));
endmodule
// top_ram: 32-bit wide, 8-bit addressed two-port memory built on dual_port_ram_256x32.
//   Port 1 of the primitive is the write port (wraddress / data / wren).
//   Port 2 is read-only (we2 tied low) and drives q from rdaddress; its data2 input is
//   wired to data but is never written because we2 is constant 0.
// All internal nets are declared before their first use so that tools which enforce
// declare-before-use accept the module.
module top_ram (
    clk,
    data,
    rdaddress,
    wraddress,
    wren,
    q
);
    //parameter TOPSIZE = 256, TOPSIZEWIDTH = 8, TOPWIDTH = 32;
    input           clk;
    input  [32-1:0] data;
    input  [8-1:0]  rdaddress;
    input  [8-1:0]  wraddress;
    input           wren;
    output [32-1:0] q;

    wire [32-1:0] q;
    wire [32-1:0] sub_wire0;     // out2 of the primitive (read-port side)
    wire [32-1:0] junk_output;   // out1 of the primitive (write-port side)
    wire [32-1:0] dummy;         // junk_output masked with zero, i.e. always zero

    // dummy is always zero; presumably it exists only so that out1 has a consumer.
    assign dummy = junk_output & 32'b0;
    assign q     = sub_wire0 | dummy;

    dual_port_ram_256x32 inst2(
        .clk  (clk),
        .we1  (wren),
        .we2  (1'b0),
        .data1(data),
        .data2(data),
        .out1 (junk_output),
        .out2 (sub_wire0),
        .addr1(wraddress),
        .addr2(rdaddress));
endmodule
// mult_add: multiplier feeding an adder, both outputs registered.
//   mult_result <= 32-bit output word of fpmul(A, B)
//   add_result  <= output of fpu_add(C, product_shifted), where product_shifted is the
//                  raw multiplier output word logically shifted right by two bit positions.
// Inputs : clk, A, B, C (32-bit words)
// Outputs: mult_result, add_result (32-bit, updated on the rising edge of clk)
module mult_add (clk, A, B, C, mult_result, add_result);
    //parameter PRECISION = 32;
    input         clk;
    input  [31:0] A, B, C;
    output [31:0] mult_result, add_result;

    reg  [31:0] mult_result;
    reg  [31:0] add_result;
    reg  [31:0] add_a, add_b;       // declared but never driven or read in this module

    wire [31:0] product;            // fpmul y_out
    wire [31:0] product_shifted;    // product >> 2 (bit shift of the raw word)
    wire [31:0] sum_w;              // fpu_add result
    wire [4:0]  mul_flags;          // fpmul flags output; not consumed here

    assign product_shifted = product >> 2'b10;

    fpmul MUL(
        .clk    (clk),
        .a      (A),
        .b      (B),
        .y_out  (product),
        .control(2'b00),
        .flags  (mul_flags));

    fpu_add ADD(
        .clock(clk),
        .a1   (C),
        .b1   (product_shifted),
        .sum  (sum_w));

    // Output registers.
    always @ (posedge clk)
    begin
        mult_result <= product[31:0];
        add_result  <= sum_w;
    end
endmodule
// ---------------------------------------------------------------------------
// Constants for the DataTransferUnit and its FIFOs.
// Several of these macros (wFIFOINPUTWIDTH, aFIFOWIDTH, BURSTLEN, MEMCONWIDTH,
// MEMCONNUMBYTES, DDRSIZEWIDTH, RAMWIDTH, RAMNUMBYTES, RAMSIZEWIDTH) are also
// defined at the top of this file with the same text; they are repeated here.
// Binary literals are sized; their decimal values are noted per group.
// ---------------------------------------------------------------------------
// rfifo (read-data FIFO): 64-bit input side (rFIFOINPUTWIDTH is defined at the top of the
// file), 256-bit output side.
//`define rFIFOINPUTWIDTH 64
`define rFIFOSIZE 64
`define rFIFOSIZEWIDTH 6
`define rFIFOOUTPUTWIDTH 256
`define rFIFORSIZEWIDTH 4
// wfifo (write-data FIFO): input width 256, size 16, size width 4, output width 64,
// read-side size width 6.
`define wFIFOINPUTWIDTH 10'b0100000000
`define wFIFOSIZE 6'b010000
`define wFIFOSIZEWIDTH 4'b0100
`define wFIFOOUTPUTWIDTH 8'b01000000
`define wFIFORSIZEWIDTH 4'b0110
//for addr_fifo
// size 16, size width 4, data width 5.
`define aFIFOSIZE 6'b010000
`define aFIFOSIZEWIDTH 4'b0100
`define aFIFOWIDTH 4'b0101
//for memfifo
// mFIFOWIDTH itself is defined at the top of the file.
`define mFIFOSIZE 16
`define mFIFOSIZEWIDTH 4
//`define mFIFOWIDTH 28
// DataTransferUnit parameters: BURSTLEN = 2, BURSTWIDTH = 2, DATAWIDTH = 256,
// DATANUMBYTES = 32, MEMCONWIDTH = 64, MEMCONNUMBYTES = 8, DDRSIZEWIDTH = 24,
// FIFOSIZE = 16, FIFOSIZEWIDTH = 4, RAMWIDTH = 256, RAMNUMBYTES = 32,
// RAMSIZEWIDTH = 5, RATIO = 4, RAMLAT = 5.
`define BURSTLEN 3'b010
`define BURSTWIDTH 3'b010
`define DATAWIDTH 10'b0100000000
`define DATANUMBYTES 7'b0100000
`define MEMCONWIDTH 8'b01000000
`define MEMCONNUMBYTES 5'b01000
`define DDRSIZEWIDTH 6'b011000
`define FIFOSIZE 6'b010000
`define FIFOSIZEWIDTH 4'b0100
`define RAMWIDTH 10'b0100000000
`define RAMNUMBYTES 7'b0100000
`define RAMSIZEWIDTH 4'b0101
`define RATIO 4'b0100
`define RAMLAT 4'b0101
// DataTransferUnit FSM state encodings.
`define dIDLE 0
`define dWRITE 1
`define dREAD 2
// DataTransferUnit: moves data between the on-chip RAM ports (ram_*) and the external
// memory-controller local interface (mem_*).
//
// Request interface:
//   dtu_ack is high while the FSM is in `dIDLE. In that state dtu_write_req starts a
//   `dWRITE transfer and dtu_read_req starts a `dREAD transfer (write wins if both are
//   set). dtu_mem_addr, dtu_ram_addr and dtu_size are sampled while idle.
//   dtu_done is high when the FSM is idle and both wdata_store and raddress_store are empty.
//
// Data paths:
//   `dWRITE: ram_read_data is pushed into wdata_store (fifo_write) and memory write
//            commands are queued in cmd_store.
//   `dREAD : memory read commands are queued in cmd_store and the destination RAM address
//            is pushed into raddress_store (fifo_read); returning mem_rdata is collected
//            in rdata_store and written to the RAM at the address popped from
//            raddress_store.
//
// Reset is synchronous and active low (reset_n).
module DataTransferUnit (clk, dtu_write_req, dtu_read_req, dtu_mem_addr, dtu_ram_addr, dtu_size, dtu_ack, dtu_done,
ram_read_addr, ram_read_data, ram_write_byte_en, ram_write_data, ram_write_addr, ram_write_en,
mem_rdata, mem_rdata_valid, mem_ready, mem_wdata_req, reset_n,
burst_begin, mem_local_addr, mem_be, mem_read_req, mem_size, mem_wdata, mem_write_req
);
// ---- external memory interface ----
output burst_begin;
output [`DDRSIZEWIDTH-1:0] mem_local_addr;
output [`MEMCONNUMBYTES-1: 0] mem_be;
output mem_read_req;
output [`BURSTWIDTH-1:0] mem_size;
output [`MEMCONWIDTH-1:0] mem_wdata;
output mem_write_req;
input clk;
input [`MEMCONWIDTH-1:0] mem_rdata;
input mem_rdata_valid;
input mem_ready;
input mem_wdata_req;
input reset_n;
// ---- transfer request interface ----
input dtu_write_req;
input dtu_read_req;
input [`DDRSIZEWIDTH-1:0] dtu_mem_addr;
input [`RAMSIZEWIDTH-1:0] dtu_ram_addr;
input [4:0] dtu_size;
output dtu_ack;
output dtu_done;
// ---- on-chip RAM interface ----
output[`RAMWIDTH-1:0] ram_write_data;
input[`RAMWIDTH-1:0] ram_read_data;
output[`RAMSIZEWIDTH-1:0] ram_write_addr, ram_read_addr;
output[`RAMNUMBYTES-1:0] ram_write_byte_en;
output ram_write_en;
// Memory address pipeline: mem_addr5 is the working address; it is shifted down to
// mem_addr0, which is the address placed in the queued command.
reg[`DDRSIZEWIDTH-1:0] mem_addr0;
reg[`DDRSIZEWIDTH-1:0] mem_addr1;
reg[`DDRSIZEWIDTH-1:0] mem_addr2;
reg[`DDRSIZEWIDTH-1:0] mem_addr3;
reg[`DDRSIZEWIDTH-1:0] mem_addr4;
reg[`DDRSIZEWIDTH-1:0] mem_addr5;
reg [1:0] state;
wire [`DATAWIDTH-1:0] rdata, ram_write_dataw, ram_read_dataw;
wire [`RAMSIZEWIDTH-1:0] rfifo_addr;
// Request shift registers: the top stage [`RAMLAT-1] is loaded by the control logic,
// stage [0] is the one consumed.
reg [`RAMLAT-1:0]fifo_write_reg;
reg [`RAMLAT-1:0]write_req_reg;
reg [`RAMLAT-1:0]read_req_reg;
reg [0:0]fifo_read_reg;
// rdata_valid and test_complete_reg are declared but not referenced elsewhere in this module.
reg rdata_valid;
reg [1:0]test_complete_reg;
// Burst-size pipeline: size_count4 is loaded by the control logic and shifted down to
// size_count0, which is placed in the queued command.
reg [`BURSTWIDTH-1:0] size_count0;
reg [`BURSTWIDTH-1:0] size_count1;
reg [`BURSTWIDTH-1:0] size_count2;
reg [`BURSTWIDTH-1:0] size_count3;
reg [`BURSTWIDTH-1:0] size_count4;
// Remaining transfer length; decremented each time ram_addr4 advances.
reg [`RAMSIZEWIDTH-1:0] size;
// RAM address pipeline: ram_addr4 is the working address (also ram_read_addr);
// ram_addr3 is the value pushed into raddress_store.
reg [`RAMSIZEWIDTH-1:0]ram_addr0;
reg [`RAMSIZEWIDTH-1:0]ram_addr1;
reg [`RAMSIZEWIDTH-1:0]ram_addr2;
reg [`RAMSIZEWIDTH-1:0]ram_addr3;
reg [`RAMSIZEWIDTH-1:0]ram_addr4;
// Amount of the current RAM word still to be requested from/to memory, in the same
// units as `BURSTLEN.
reg [2:0] data_count;
reg ram_write_en_reg;
wire read_req;
wire write_req;
wire [`FIFOSIZEWIDTH-1:0] wfifo_count;
wire rfull, wempty, rempty, rdcmd_empty, wrcmd_full, wrcmd_empty, rdata_empty;
wire [`DATAWIDTH-1:0] mem_data;
wire not_stall;
wire fifo_write, fifo_read;
wire rdata_req;
// Command word layout: {burst size, memory address, read_req, write_req}.
wire [`BURSTWIDTH+`DDRSIZEWIDTH+1:0] wrmem_cmd, rdmem_cmd;
wire mem_cmd_ready, mem_cmd_issue;
// FIFOs to interact with off-chip memory
// cmd_store: queue of memory commands; popped whenever mem_ready is high.
memcmd_fifo cmd_store(
//.aclr(~reset_n),
//.rdclk(phy_clk),
.clk(clk),
.data(wrmem_cmd),
.rdreq(mem_cmd_ready),
//.rdempty(rdcmd_empty),
.wrreq(mem_cmd_issue),
.full(wrcmd_full),
.empty(wrcmd_empty),
.q(rdmem_cmd)
);
// wdata_store: RAM read data queued for the memory controller; popped on mem_wdata_req.
wfifo wdata_store(
//.rdclk(phy_clk),
.clk(clk),
.data(mem_data),
.rdreq(mem_wdata_req),
.wrreq(fifo_write),
.empty(wempty),
.q(mem_wdata),
.usedw(wfifo_count)
);
// raddress_store: RAM destination addresses for outstanding reads.
addr_fifo raddress_store (
.clk(clk),
.data(ram_addr3),
.wrreq(fifo_read),
.rdreq(rdata_req),
.empty(rempty),
.full(rfull),
.q(rfifo_addr)
);
// rdata_store: data returned by the memory controller; pushed on mem_rdata_valid.
rfifo rdata_store(
.clk(clk),
.data(mem_rdata),
.rdreq(rdata_req),
//.wrclk(phy_clk),
.wrreq(mem_rdata_valid),
.empty(rdata_empty),
.q(rdata)
);
assign mem_cmd_ready = (mem_ready == 1'b1);// && (rdcmd_empty == 0);
// A command word is pushed when the command FIFO is not full and either a request is
// active or the command FIFO is empty.
assign mem_cmd_issue = (wrcmd_full == 1'b0) && (write_req == 1 || read_req == 1'b1 || wrcmd_empty == 1'b1);
// Pack the command word: bits [27:26] burst size, [`DDRSIZEWIDTH+1:2] address,
// bit 1 read request, bit 0 write request.
assign wrmem_cmd[27:26] = size_count0;
assign wrmem_cmd[`DDRSIZEWIDTH+1:2] = mem_addr0;
assign wrmem_cmd[1] = read_req;
assign wrmem_cmd[0] = write_req;
// Unpack the command word at the head of cmd_store onto the memory interface.
assign mem_write_req = rdmem_cmd[0];// && rdcmd_empty == 0;
assign mem_read_req = rdmem_cmd[1];// && rdcmd_empty == 0;
assign mem_local_addr = rdmem_cmd[`DDRSIZEWIDTH+1:2];
assign burst_begin = 0;
assign mem_size = rdmem_cmd[`BURSTWIDTH+`DDRSIZEWIDTH+1:`DDRSIZEWIDTH+2];
// All byte enables asserted.
assign mem_be = ~0;
assign fifo_write = fifo_write_reg[0];
// Requests are suppressed while stalled.
assign write_req = (not_stall) ? write_req_reg[0] : 0;
assign read_req = (not_stall) ? read_req_reg[0] : 0;
assign fifo_read = (not_stall) ? fifo_read_reg[0] : 0;
// The pipeline may advance only while the write FIFO has headroom and neither the
// address FIFO nor the command FIFO is full.
assign not_stall = (wfifo_count < `FIFOSIZE-5) && (rfull == 0) && (wrcmd_full == 0);
assign dtu_ack = (state == `dIDLE);
assign dtu_done = (state == `dIDLE) && wempty && rempty;
// The four 64-bit words of each 256-bit word are placed in reverse order between the
// RAM side and the FIFO side, in both directions.
assign ram_write_dataw[63:0] = rdata[255:192];
assign mem_data[63:0] = ram_read_dataw[255:192];
assign ram_write_dataw[127:64] = rdata[191:128];
assign mem_data[127:64] = ram_read_dataw[191:128];
assign ram_write_dataw[191:128] = rdata[127:64];
assign mem_data[191:128] = ram_read_dataw[127:64];
assign ram_write_dataw[255:192] = rdata[63:0];
assign mem_data[255:192] = ram_read_dataw[63:0];
assign ram_write_data = ram_write_dataw[255:0];
assign ram_read_dataw[255:0] = ram_read_data;
assign ram_write_addr = rfifo_addr;
assign ram_read_addr = ram_addr4;
assign ram_write_byte_en = ~0;
assign ram_write_en = ram_write_en_reg;
// Pop returned read data (and its RAM address) whenever rdata_store is not empty.
assign rdata_req = !rdata_empty;
// FSM to produce off-chip memory commands
always @ (posedge clk)
begin
if (reset_n == 1'b0)
begin
state <= `dIDLE;
end
else
begin
case (state)
`dIDLE:
begin
// Write requests take priority over read requests.
if (dtu_write_req)
state <= `dWRITE;
else if (dtu_read_req)
state <= `dREAD;
else
state <= `dIDLE;
end
`dWRITE:
begin
// Return to idle once no RAM words remain and less than one full burst is pending.
if (not_stall && size == 0 && data_count < `BURSTLEN)
state <= `dIDLE;
else
state <= `dWRITE;
end
`dREAD:
begin
// Same exit condition as `dWRITE.
if (not_stall && size == 0 && data_count < `BURSTLEN)
state <= `dIDLE;
else
state <= `dREAD;
end
default:
begin
state <= `dIDLE;
end
endcase
end
end
// Transfer control: loads the top stage of the request pipeline and maintains the
// working counters/addresses. The branches are evaluated in priority order.
always @ (posedge clk)
begin
if (reset_n == 0)
begin
size <= 0;
data_count <= 0;
size_count4 <= 1;
mem_addr5 <= 0;
ram_addr4 <= 0;
fifo_write_reg[`RAMLAT-1] <= 0;
write_req_reg[`RAMLAT-1] <= 0;
fifo_read_reg[0] <= 0;
read_req_reg[`RAMLAT-1] <= 0;
end
// Idle: capture the transfer parameters and clear all pending requests.
else if (state == `dIDLE)
begin
size <= dtu_size;
size_count4 <= `BURSTLEN;
mem_addr5 <= dtu_mem_addr;
ram_addr4 <= dtu_ram_addr;
fifo_write_reg[`RAMLAT-1] <= 1'b0;
write_req_reg[`RAMLAT-1] <= 1'b0;
fifo_read_reg[0] <= 1'b0;
read_req_reg[`RAMLAT-1] <= 1'b0;
data_count <= 0;
end
// At least one full burst of the current RAM word is still pending: issue another
// memory request for it without advancing the RAM address or touching the FIFOs.
else if (data_count >= `BURSTLEN && not_stall)
begin
data_count <= data_count - `BURSTLEN;
mem_addr5 <= mem_addr5 + `BURSTLEN;
fifo_write_reg[`RAMLAT-1] <= 1'b0;
write_req_reg[`RAMLAT-1] <= state == `dWRITE;
fifo_read_reg[0] <= 0;
read_req_reg[`RAMLAT-1] <= state == `dREAD;
end
// Nothing left to transfer: clear the requests.
else if (size == 0 && data_count == 0 && not_stall==1'b1)
begin
fifo_write_reg[`RAMLAT-1] <= 0;
write_req_reg[`RAMLAT-1] <= 0;
fifo_read_reg[0] <= 0;
read_req_reg[`RAMLAT-1] <= 0;
end
// No RAM words remain but a partial burst is pending (0 < data_count < `BURSTLEN):
// issue one last request whose burst size is the remaining data_count.
else if (size == 0 && not_stall==1'b1)
begin
size_count4 <= data_count[`BURSTWIDTH-1:0];
fifo_write_reg[`RAMLAT-1] <= 0;
write_req_reg[`RAMLAT-1] <= state == `dWRITE;
fifo_read_reg[0] <= 0;
read_req_reg[`RAMLAT-1] <= state == `dREAD;
end
// Start the next RAM word: advance the RAM address, add `RATIO to data_count while
// consuming one burst, and raise both the FIFO operation and the memory request.
else if (not_stall==1'b1)
begin
size <= size - 1;
data_count <= data_count + `RATIO - `BURSTLEN;
mem_addr5 <= mem_addr5 + `BURSTLEN;
ram_addr4 <= ram_addr4+1;
fifo_write_reg[`RAMLAT-1] <= state == `dWRITE;
write_req_reg[`RAMLAT-1] <= state == `dWRITE;
fifo_read_reg[0] <= state == `dREAD;
read_req_reg[`RAMLAT-1] <= state == `dREAD;
end
// Stalled: only the FIFO-write request is cleared; everything else holds.
else
begin
fifo_write_reg[`RAMLAT-1] <= 0;
end
end
// Pipeline registers.
always @ (posedge clk)
begin
// fifo_write_reg stages 3..0 shift on every clock, independent of not_stall.
if (reset_n == 0)
begin
fifo_write_reg[0] <= 1'b0;
fifo_write_reg[1] <= 1'b0;
fifo_write_reg[2] <= 1'b0;
fifo_write_reg[3] <= 1'b0;
end
else
begin
fifo_write_reg[0] <= fifo_write_reg[1];
fifo_write_reg[1] <= fifo_write_reg[2];
fifo_write_reg[2] <= fifo_write_reg[3];
fifo_write_reg[3] <= fifo_write_reg[4];
end
// Address, burst-size and request stages advance only while not stalled.
if (reset_n == 1'b0)
begin
mem_addr0 <= 0;
ram_addr0 <= 0;
size_count0 <= 1;
write_req_reg[0] <= 0;
read_req_reg[0] <= 0;
mem_addr1 <= 0;
ram_addr1 <= 0;
size_count1 <= 1;
write_req_reg[1] <= 0;
read_req_reg[1] <= 0;
mem_addr2 <= 0;
ram_addr2 <= 0;
size_count2 <= 1;
write_req_reg[2] <= 0;
read_req_reg[2] <= 0;
mem_addr3 <= 0;
ram_addr3 <= 0;
size_count3 <= 1;
write_req_reg[3] <= 0;
read_req_reg[3] <= 0;
mem_addr4 <= 0;
end
else if (not_stall)
begin
size_count0 <= size_count1;
mem_addr0 <= mem_addr1;
ram_addr0 <= ram_addr1;
write_req_reg[0] <= write_req_reg[1];
read_req_reg[0] <= read_req_reg[1];
size_count1 <= size_count2;
mem_addr1 <= mem_addr2;
ram_addr1 <= ram_addr2;
write_req_reg[1] <= write_req_reg[2];
read_req_reg[1] <= read_req_reg[2];
size_count2 <= size_count3;
mem_addr2 <= mem_addr3;
ram_addr2 <= ram_addr3;
write_req_reg[2] <= write_req_reg[3];
read_req_reg[2] <= read_req_reg[3];
size_count3 <= size_count4;
mem_addr3 <= mem_addr4;
ram_addr3 <= ram_addr4;
write_req_reg[3] <= write_req_reg[4];
read_req_reg[3] <= read_req_reg[4];
mem_addr4 <= mem_addr5;
end
// RAM write enable is rdata_req delayed by one clock (not reset).
ram_write_en_reg <= rdata_req;
end
endmodule
// rfifo: read-data FIFO with a 64-bit input (data) and a 256-bit output (q).
//   wrreq advances the write pointer; rdreq advances the read pointer and restarts the
//   2-bit lane counter. Each clock, the 64-bit RAM output is stored into the q lane
//   selected by the lane counter (0 -> q[63:0] ... 3 -> q[255:192]).
//   empty is asserted while the occupancy counter is zero.
//   None of the registers has a reset.
// NOTE(review): status_cnt is `rFIFORSIZEWIDTH+1 bits wide, yet it is compared against a
//   7-bit zero and against 64; with the macro values in this file (5 bits) the "!= 64"
//   guard can never fail. Behaviour is kept as-is -- confirm the intended depth.
// NOTE(review): q is loaded from out1, the port addressed by wr_pointer, while rdreq
//   drives we2 of the port addressed by rd_pointer -- confirm against dual_port_ram_rfifo.
module rfifo (
    clk,
    data,
    rdreq,
    wrreq,
    empty,
    q
);
    input                          clk;
    input                          wrreq;
    input                          rdreq;
    input  [`rFIFOINPUTWIDTH-1:0]  data;
    output                         empty;
    output [`rFIFOOUTPUTWIDTH-1:0] q;

    reg  [`rFIFOOUTPUTWIDTH-1:0] q;
    reg  [`rFIFORSIZEWIDTH-1:0]  wr_pointer;
    reg  [`rFIFORSIZEWIDTH-1:0]  rd_pointer;
    reg  [`rFIFORSIZEWIDTH:0]    status_cnt;   // occupancy counter
    reg  [1:0]                   lane;         // which 64-bit slice of q is loaded next
    wire [`rFIFOINPUTWIDTH-1:0]  ram_q;        // out1 of the RAM
    wire [`rFIFOINPUTWIDTH-1:0]  junk_input;   // constant zero for data2
    wire [`rFIFOINPUTWIDTH-1:0]  junk_output;  // out2 of the RAM, unused

    assign junk_input = 64'b0;
    assign empty      = (status_cnt == 7'b0000000);

    // Write and read pointers.
    always @ (posedge clk)
    begin
        if (wrreq)
            wr_pointer <= wr_pointer + 1'b1;
        if (rdreq)
            rd_pointer <= rd_pointer + 1'b1;
    end

    // Lane counter and output assembly.
    always @ (posedge clk)
    begin
        if (rdreq)
            lane <= 2'b00;
        else
            lane <= lane + 2'b01;

        case (lane)
            2'b00: q[63:0]    <= ram_q;
            2'b01: q[127:64]  <= ram_q;
            2'b10: q[191:128] <= ram_q;
            2'b11: q[255:192] <= ram_q;
        endcase
    end

    // Occupancy counter: changes only when exactly one of rdreq / wrreq is active.
    always @ (posedge clk)
    begin
        case ({wrreq, rdreq})
            2'b01:   // read without write
                if (status_cnt != 0)
                    status_cnt <= status_cnt - 1'b1;
            2'b10:   // write without read
                if (status_cnt != 64)
                    status_cnt <= status_cnt + 1'b1;
            default: ;
        endcase
    end

    dual_port_ram_rfifo ram_addr(
        .clk  (clk),
        .we1  (wrreq),
        .we2  (rdreq),
        .addr1(wr_pointer),
        .addr2(rd_pointer),
        .data1(data),
        .data2(junk_input),
        .out1 (ram_q),
        .out2 (junk_output)
    );
endmodule
// synopsys translate_off
//`timescale 1 ps / 1 ps
// synopsys translate_on
// wfifo: write-data FIFO with a 256-bit input (data) and a 64-bit output (q).
//   wrreq advances the write pointer; rdreq advances the read pointer and restarts the
//   2-bit lane counter. Each clock, q is loaded with the 64-bit slice of the RAM output
//   selected by the lane counter (0 -> [63:0] ... 3 -> [255:192]).
//   empty is asserted while the occupancy counter is zero; usedw exposes the low
//   `wFIFOSIZEWIDTH bits of that counter.
//   None of the registers has a reset.
// NOTE(review): usedw drops the top bit of status_cnt, so a count of 16 reads back as 0
//   -- confirm this is acceptable to the consumer.
module wfifo (
    clk,
    data,
    rdreq,
    wrreq,
    empty,
    q,
    usedw
);
    input                          clk;
    input                          wrreq;
    input                          rdreq;
    input  [`wFIFOINPUTWIDTH-1:0]  data;
    output                         empty;
    output [`wFIFOOUTPUTWIDTH-1:0] q;
    output [`wFIFOSIZEWIDTH-1:0]   usedw;

    reg  [`wFIFOOUTPUTWIDTH-1:0] q;
    reg  [`wFIFOSIZEWIDTH-1:0]   wr_pointer;
    reg  [`wFIFOSIZEWIDTH-1:0]   rd_pointer;
    reg  [`wFIFOSIZEWIDTH:0]     status_cnt;   // occupancy counter
    reg  [1:0]                   lane;         // which 64-bit slice of the RAM word is output
    wire [`wFIFOINPUTWIDTH-1:0]  ram_q;        // out1 of the RAM
    wire [`wFIFOINPUTWIDTH-1:0]  junk_input;   // constant zero for data2
    wire [`wFIFOINPUTWIDTH-1:0]  junk_output;  // out2 of the RAM, unused

    assign junk_input = 256'b0;
    assign empty      = (status_cnt == 5'b00000);
    assign usedw      = status_cnt[`wFIFOSIZEWIDTH-1:0];

    // Write and read pointers.
    always @ (posedge clk)
    begin
        if (wrreq)
            wr_pointer <= wr_pointer + 1'b1;
        if (rdreq)
            rd_pointer <= rd_pointer + 1'b1;
    end

    // Lane counter and output slice selection.
    always @ (posedge clk)
    begin
        if (rdreq)
            lane <= 2'b00;
        else
            lane <= lane + 2'b01;

        case (lane)
            2'b00: q <= ram_q[63:0];
            2'b01: q <= ram_q[127:64];
            2'b10: q <= ram_q[191:128];
            2'b11: q <= ram_q[255:192];
        endcase
    end

    // Occupancy counter: changes only when exactly one of rdreq / wrreq is active,
    // and saturates at 0 and at 16.
    always @ (posedge clk)
    begin
        case ({wrreq, rdreq})
            2'b01:   // read without write
                if (status_cnt != 5'b00000)
                    status_cnt <= status_cnt - 1'b1;
            2'b10:   // write without read
                if (status_cnt != 5'b10000)
                    status_cnt <= status_cnt + 1'b1;
            default: ;
        endcase
    end

    dual_port_ram_wfifo ram_addr(
        .clk  (clk),
        .we1  (wrreq),
        .we2  (rdreq),
        .addr1(wr_pointer),
        .addr2(rd_pointer),
        .data1(data),
        .data2(junk_input),
        .out1 (ram_q),
        .out2 (junk_output)
    );
endmodule
// synopsys translate_off
//`timescale 1 ps / 1 ps
// synopsys translate_on
// addr_fifo: FIFO of `aFIFOWIDTH-bit RAM addresses.
//   wrreq advances the write pointer, rdreq advances the read pointer and loads q from
//   the RAM output. empty is asserted at a count of 0, full at a count of 15; the
//   occupancy counter itself saturates at 16.
//   None of the registers has a reset.
module addr_fifo (
    clk,
    data,
    wrreq,
    rdreq,
    empty,
    full,
    q
);
    input                    clk;
    input  [`aFIFOWIDTH-1:0] data;
    input                    rdreq;
    input                    wrreq;
    output                   empty;
    output                   full;
    output [`aFIFOWIDTH-1:0] q;

    reg  [`aFIFOWIDTH-1:0]     q;
    reg  [`aFIFOSIZEWIDTH-1:0] wr_pointer;
    reg  [`aFIFOSIZEWIDTH-1:0] rd_pointer;
    reg  [`aFIFOSIZEWIDTH:0]   status_cnt;   // occupancy counter
    wire [`aFIFOWIDTH-1:0]     ram_q;        // out1 of the RAM
    wire [`aFIFOWIDTH-1:0]     junk_input;   // constant zero for data2
    wire [`aFIFOWIDTH-1:0]     junk_output;  // out2 of the RAM, unused

    assign junk_input = 5'b00000;
    assign empty      = (status_cnt == 5'b00000);
    assign full       = (status_cnt == 5'b01111);

    // Pointers and output register.
    always @ (posedge clk)
    begin
        if (wrreq)
            wr_pointer <= wr_pointer + 1'b1;
        if (rdreq)
        begin
            rd_pointer <= rd_pointer + 1'b1;
            q          <= ram_q;
        end
    end

    // Occupancy counter: changes only when exactly one of rdreq / wrreq is active.
    always @ (posedge clk)
    begin
        case ({wrreq, rdreq})
            2'b01:   // read without write
                if (status_cnt != 5'b00000)
                    status_cnt <= status_cnt - 1'b1;
            2'b10:   // write without read
                if (status_cnt != 5'b10000)
                    status_cnt <= status_cnt + 1'b1;
            default: ;
        endcase
    end

    dual_port_ram_afifo ram_addr(
        .clk  (clk),
        .we1  (wrreq),
        .we2  (rdreq),
        .addr1(wr_pointer),
        .addr2(rd_pointer),
        .data1(data),
        .data2(junk_input),
        .out1 (ram_q),
        .out2 (junk_output)
    );
endmodule
// memcmd_fifo: FIFO of `mFIFOWIDTH-bit memory command words.
//   wrreq advances the write pointer, rdreq advances the read pointer and loads q from
//   the RAM output. empty is asserted at a count of 0, full at a count of 15; the
//   occupancy counter itself saturates at 16.
//   None of the registers has a reset.
module memcmd_fifo (
    clk,
    data,
    rdreq,
    wrreq,
    full,
    empty,
    q
);
    input                    clk;
    input  [`mFIFOWIDTH-1:0] data;
    input                    wrreq;
    input                    rdreq;
    output                   full;
    output                   empty;
    output [`mFIFOWIDTH-1:0] q;

    reg  [`mFIFOWIDTH-1:0]     q;
    reg  [`mFIFOSIZEWIDTH-1:0] wr_pointer;
    reg  [`mFIFOSIZEWIDTH-1:0] rd_pointer;
    reg  [`mFIFOSIZEWIDTH:0]   status_cnt;   // occupancy counter
    wire [`mFIFOWIDTH-1:0]     ram_q;        // out1 of the RAM
    wire [`mFIFOWIDTH-1:0]     junk_input;   // constant zero for data2
    wire [`mFIFOWIDTH-1:0]     junk_output;  // out2 of the RAM, unused

    assign junk_input = 28'b0;
    assign empty      = (status_cnt == 5'b00000);
    assign full       = (status_cnt == 5'b01111);

    // Pointers and output register.
    always @ (posedge clk)
    begin
        if (wrreq)
            wr_pointer <= wr_pointer + 1'b1;
        if (rdreq)
        begin
            rd_pointer <= rd_pointer + 1'b1;
            q          <= ram_q;
        end
    end

    // Occupancy counter: changes only when exactly one of rdreq / wrreq is active.
    always @ (posedge clk)
    begin
        case ({wrreq, rdreq})
            2'b01:   // read without write
                if (status_cnt != 0)
                    status_cnt <= status_cnt - 1'b1;
            2'b10:   // write without read
                if (status_cnt != 16)
                    status_cnt <= status_cnt + 1'b1;
            default: ;
        endcase
    end

    dual_port_ram_mfifo ram_addr(
        .clk  (clk),
        .we1  (wrreq),
        .we2  (rdreq),
        .addr1(wr_pointer),
        .addr2(rd_pointer),
        .data1(data),
        .data2(junk_input),
        .out1 (ram_q),
        .out2 (junk_output));
endmodule
// 8-bit named constants for the values 0 through 24.
// fpu_add uses `ONE..`TWENTYFOUR as case labels and shift amounts when aligning mantissas.
`define ZERO 8'b00000000
`define ONE 8'b00000001
`define TWO 8'b00000010
`define THREE 8'b00000011
`define FOUR 8'b00000100
`define FIVE 8'b00000101
`define SIX 8'b00000110
`define SEVEN 8'b00000111
`define EIGHT 8'b00001000
`define NINE 8'b00001001
`define TEN 8'b00001010
`define ELEVEN 8'b00001011
`define TWELVE 8'b00001100
`define THIRTEEN 8'b00001101
`define FOURTEEN 8'b00001110
`define FIFTEEN 8'b00001111
`define SIXTEEN 8'b00010000
`define SEVENTEEN 8'b00010001
`define EIGHTEEN 8'b00010010
`define NINETEEN 8'b00010011
`define TWENTY 8'b00010100
`define TWENTYONE 8'b00010101
`define TWENTYTWO 8'b00010110
`define TWENTYTHREE 8'b00010111
`define TWENTYFOUR 8'b00011000
// fpu_add: adder for 32-bit words laid out as {sign[31], exponent[30:23], fraction[22:0]}.
//   Inputs a1/b1 are registered once on clock; everything after that is combinational.
//   Steps:
//     1. Align: prepend an implicit 1 to each 23-bit fraction and right-shift the
//        mantissa of the operand with the smaller exponent by the exponent difference
//        (differences above 24 are treated as 24). a_exp holds the larger exponent.
//     2. Add or subtract the aligned mantissas according to the two sign bits, always
//        subtracting the smaller mantissa from the larger, and pick the result sign.
//     3. Normalise: locate the leading 1 of the 25-bit result, shift it to bit 23 and
//        adjust the exponent by the same amount.
//   No rounding is performed, and the exponent arithmetic wraps at 8 bits.
//   The variable shifts are written out as case statements on purpose (see the
//   "Expand into case statement" remark in the original source).
module fpu_add (clock, a1, b1, sum);
    input         clock;
    input  [31:0] a1;
    input  [31:0] b1;
    output [31:0] sum;
    reg    [31:0] sum;

    reg [31:0] a;         // registered copy of a1
    reg [31:0] b;         // registered copy of b1
    reg [7:0]  a_exp;     // exponent common to both aligned mantissas
    reg [23:0] a_man;     // aligned mantissa of a (implicit 1 included)
    reg [23:0] b_man;     // aligned mantissa of b (implicit 1 included)
    reg [7:0]  temp;      // absolute exponent difference
    reg [24:0] sum_man;   // mantissa sum/difference, one extra bit for carry
    reg        smaller;   // 1 when a_man < b_man

    // Input registers.
    always @ (posedge clock) begin
        a <= a1;
        b <= b1;
    end

    // Step 1: mantissa alignment.
    always @ (a or b) begin
        if (a[30:23] < b[30:23]) begin
            // a has the smaller exponent: shift a's mantissa right.
            temp = b[30:23] - a[30:23];
            case (temp)
                `ONE:         a_man = {1'b1, a[22:0]} >> `ONE;
                `TWO:         a_man = {1'b1, a[22:0]} >> `TWO;
                `THREE:       a_man = {1'b1, a[22:0]} >> `THREE;
                `FOUR:        a_man = {1'b1, a[22:0]} >> `FOUR;
                `FIVE:        a_man = {1'b1, a[22:0]} >> `FIVE;
                `SIX:         a_man = {1'b1, a[22:0]} >> `SIX;
                `SEVEN:       a_man = {1'b1, a[22:0]} >> `SEVEN;
                `EIGHT:       a_man = {1'b1, a[22:0]} >> `EIGHT;
                `NINE:        a_man = {1'b1, a[22:0]} >> `NINE;
                `TEN:         a_man = {1'b1, a[22:0]} >> `TEN;
                `ELEVEN:      a_man = {1'b1, a[22:0]} >> `ELEVEN;
                `TWELVE:      a_man = {1'b1, a[22:0]} >> `TWELVE;
                `THIRTEEN:    a_man = {1'b1, a[22:0]} >> `THIRTEEN;
                `FOURTEEN:    a_man = {1'b1, a[22:0]} >> `FOURTEEN;
                `FIFTEEN:     a_man = {1'b1, a[22:0]} >> `FIFTEEN;
                `SIXTEEN:     a_man = {1'b1, a[22:0]} >> `SIXTEEN;
                `SEVENTEEN:   a_man = {1'b1, a[22:0]} >> `SEVENTEEN;
                `EIGHTEEN:    a_man = {1'b1, a[22:0]} >> `EIGHTEEN;
                `NINETEEN:    a_man = {1'b1, a[22:0]} >> `NINETEEN;
                `TWENTY:      a_man = {1'b1, a[22:0]} >> `TWENTY;
                `TWENTYONE:   a_man = {1'b1, a[22:0]} >> `TWENTYONE;
                `TWENTYTWO:   a_man = {1'b1, a[22:0]} >> `TWENTYTWO;
                `TWENTYTHREE: a_man = {1'b1, a[22:0]} >> `TWENTYTHREE;
                `TWENTYFOUR:  a_man = {1'b1, a[22:0]} >> `TWENTYFOUR;
                // Differences larger than 24 are clamped to a shift of 24.
                default:      a_man = {1'b1, a[22:0]} >> `TWENTYFOUR;
            endcase
            b_man = {1'b1, b[22:0]};
            a_exp = b[30:23];
        end else if (a[30:23] > b[30:23]) begin
            // b has the smaller exponent: shift b's mantissa right.
            temp  = a[30:23] - b[30:23];
            a_man = {1'b1, a[22:0]};
            case (temp)
                `ONE:         b_man = {1'b1, b[22:0]} >> `ONE;
                `TWO:         b_man = {1'b1, b[22:0]} >> `TWO;
                `THREE:       b_man = {1'b1, b[22:0]} >> `THREE;
                `FOUR:        b_man = {1'b1, b[22:0]} >> `FOUR;
                `FIVE:        b_man = {1'b1, b[22:0]} >> `FIVE;
                `SIX:         b_man = {1'b1, b[22:0]} >> `SIX;
                `SEVEN:       b_man = {1'b1, b[22:0]} >> `SEVEN;
                `EIGHT:       b_man = {1'b1, b[22:0]} >> `EIGHT;
                `NINE:        b_man = {1'b1, b[22:0]} >> `NINE;
                `TEN:         b_man = {1'b1, b[22:0]} >> `TEN;
                `ELEVEN:      b_man = {1'b1, b[22:0]} >> `ELEVEN;
                `TWELVE:      b_man = {1'b1, b[22:0]} >> `TWELVE;
                `THIRTEEN:    b_man = {1'b1, b[22:0]} >> `THIRTEEN;
                `FOURTEEN:    b_man = {1'b1, b[22:0]} >> `FOURTEEN;
                `FIFTEEN:     b_man = {1'b1, b[22:0]} >> `FIFTEEN;
                `SIXTEEN:     b_man = {1'b1, b[22:0]} >> `SIXTEEN;
                `SEVENTEEN:   b_man = {1'b1, b[22:0]} >> `SEVENTEEN;
                `EIGHTEEN:    b_man = {1'b1, b[22:0]} >> `EIGHTEEN;
                `NINETEEN:    b_man = {1'b1, b[22:0]} >> `NINETEEN;
                `TWENTY:      b_man = {1'b1, b[22:0]} >> `TWENTY;
                `TWENTYONE:   b_man = {1'b1, b[22:0]} >> `TWENTYONE;
                `TWENTYTWO:   b_man = {1'b1, b[22:0]} >> `TWENTYTWO;
                `TWENTYTHREE: b_man = {1'b1, b[22:0]} >> `TWENTYTHREE;
                `TWENTYFOUR:  b_man = {1'b1, b[22:0]} >> `TWENTYFOUR;
                // Differences larger than 24 are clamped to a shift of 24.
                default:      b_man = {1'b1, b[22:0]} >> `TWENTYFOUR;
            endcase
            a_exp = a[30:23];
        end else begin
            // Equal exponents: no shift needed.
            temp  = 8'b0;
            a_man = {1'b1, a[22:0]};
            b_man = {1'b1, b[22:0]};
            a_exp = a[30:23];
        end
    end

    // Step 2: mantissa add/subtract and result sign.
    always @ (a_man or b_man or a or b) begin
        if (a_man < b_man) begin
            smaller = 1'b1;
        end else begin
            smaller = 1'b0;
        end

        case ({a[31], b[31]})
            2'b00: begin                // both positive
                sum_man = a_man + b_man;
                sum[31] = 1'b0;
            end
            2'b11: begin                // both negative
                sum_man = a_man + b_man;
                sum[31] = 1'b1;
            end
            2'b01: begin                // a positive, b negative
                if (smaller) begin
                    sum_man = b_man - a_man;
                    sum[31] = 1'b1;
                end else begin
                    sum_man = a_man - b_man;
                    sum[31] = 1'b0;
                end
            end
            default: begin              // a negative, b positive
                if (smaller) begin
                    sum_man = b_man - a_man;
                    sum[31] = 1'b0;
                end else begin
                    sum_man = a_man - b_man;
                    sum[31] = 1'b1;
                end
            end
        endcase
    end

    // Step 3: normalise. Each branch writes {exponent, fraction} into sum[30:0];
    // the sign bit sum[31] was set in step 2.
    always @ (sum_man or a_exp) begin
        if      (sum_man[24]) sum[30:0] = {a_exp + 8'd1,  sum_man[23:1]};          // carry out: shift right by 1
        else if (sum_man[23]) sum[30:0] = {a_exp,         sum_man[22:0]};          // already normalised
        else if (sum_man[22]) sum[30:0] = {a_exp - 8'd1,  sum_man[21:0],  1'b0};
        else if (sum_man[21]) sum[30:0] = {a_exp - 8'd2,  sum_man[20:0],  2'b0};
        else if (sum_man[20]) sum[30:0] = {a_exp - 8'd3,  sum_man[19:0],  3'b0};
        else if (sum_man[19]) sum[30:0] = {a_exp - 8'd4,  sum_man[18:0],  4'b0};
        else if (sum_man[18]) sum[30:0] = {a_exp - 8'd5,  sum_man[17:0],  5'b0};
        else if (sum_man[17]) sum[30:0] = {a_exp - 8'd6,  sum_man[16:0],  6'b0};
        else if (sum_man[16]) sum[30:0] = {a_exp - 8'd7,  sum_man[15:0],  7'b0};
        else if (sum_man[15]) sum[30:0] = {a_exp - 8'd8,  sum_man[14:0],  8'b0};
        else if (sum_man[14]) sum[30:0] = {a_exp - 8'd9,  sum_man[13:0],  9'b0};
        else if (sum_man[13]) sum[30:0] = {a_exp - 8'd10, sum_man[12:0], 10'b0};
        else if (sum_man[12]) sum[30:0] = {a_exp - 8'd11, sum_man[11:0], 11'b0};
        else if (sum_man[11]) sum[30:0] = {a_exp - 8'd12, sum_man[10:0], 12'b0};
        else if (sum_man[10]) sum[30:0] = {a_exp - 8'd13, sum_man[9:0],  13'b0};
        else if (sum_man[9])  sum[30:0] = {a_exp - 8'd14, sum_man[8:0],  14'b0};
        else if (sum_man[8])  sum[30:0] = {a_exp - 8'd15, sum_man[7:0],  15'b0};
        else if (sum_man[7])  sum[30:0] = {a_exp - 8'd16, sum_man[6:0],  16'b0};
        else if (sum_man[6])  sum[30:0] = {a_exp - 8'd17, sum_man[5:0],  17'b0};
        else if (sum_man[5])  sum[30:0] = {a_exp - 8'd18, sum_man[4:0],  18'b0};
        else if (sum_man[4])  sum[30:0] = {a_exp - 8'd19, sum_man[3:0],  19'b0};
        else if (sum_man[3])  sum[30:0] = {a_exp - 8'd20, sum_man[2:0],  20'b0};
        else if (sum_man[2])  sum[30:0] = {a_exp - 8'd21, sum_man[1:0],  21'b0};
        else if (sum_man[1])  sum[30:0] = {a_exp - 8'd22, sum_man[0:0],  22'b0};
        // Only bit 0 set, or the mantissa is all zero: fraction 0, exponent reduced by 23.
        else                  sum[30:0] = {a_exp - 8'd23, 23'b0};
    end
endmodule
// fpu_div: divider for 32-bit words laid out as {sign[31], exponent[30:23], fraction[22:0]}.
//   n   = numerator, d = denominator, div = result.
//   The sign, exponent and mantissa (fraction with an implicit leading 1) of both
//   operands are registered once on clock; everything after that is combinational.
//   Result sign     : n_sign XOR d_sign.
//   Result exponent : 127 plus the exponent difference (8-bit wrapping arithmetic),
//                     then reduced by the normalisation shift.
//   Result mantissa : quotient from div_24b, left-shifted until its leading 1 is at bit 23.
module fpu_div(clock, n, d, div);
    input         clock;
    input  [31:0] n;
    input  [31:0] d;
    output [31:0] div;
    reg    [31:0] div;

    // Registered operand fields.
    reg        n_sign;
    reg        d_sign;
    reg [7:0]  n_exp;
    reg [7:0]  d_exp;
    reg [23:0] n_man;
    reg [23:0] d_man;

    wire [23:0] div_man;   // raw mantissa quotient
    reg  [7:0]  div_exp;   // exponent before normalisation

    always @ (posedge clock) begin
        n_sign <= n[31];
        d_sign <= d[31];
        n_exp  <= n[30:23];
        d_exp  <= d[30:23];
        n_man  <= {1'b1, n[22:0]};
        d_man  <= {1'b1, d[22:0]};
    end

    // Exponent: 127 +/- |n_exp - d_exp|, written so that each subtraction is non-negative.
    always @ (n_exp or d_exp) begin
        if (n_exp >= d_exp)
            div_exp = 8'b01111111 + (n_exp - d_exp);
        else
            div_exp = 8'b01111111 - (d_exp - n_exp);
    end

    // Mantissa quotient.
    div_24b divide(.numer(n_man), .denom(d_man), .res(div_man));

    // Sign.
    always @ (n_sign or d_sign) begin
        div[31] = n_sign ^ d_sign;
    end

    // Normalise: each branch writes {exponent, fraction} into div[30:0].
    always @ (div_man or div_exp) begin
        if      (div_man[23]) div[30:0] = {div_exp,         div_man[22:0]};          // already normalised
        else if (div_man[22]) div[30:0] = {div_exp - 8'd1,  div_man[21:0],  1'b0};
        else if (div_man[21]) div[30:0] = {div_exp - 8'd2,  div_man[20:0],  2'b0};
        else if (div_man[20]) div[30:0] = {div_exp - 8'd3,  div_man[19:0],  3'b0};
        else if (div_man[19]) div[30:0] = {div_exp - 8'd4,  div_man[18:0],  4'b0};
        else if (div_man[18]) div[30:0] = {div_exp - 8'd5,  div_man[17:0],  5'b0};
        else if (div_man[17]) div[30:0] = {div_exp - 8'd6,  div_man[16:0],  6'b0};
        else if (div_man[16]) div[30:0] = {div_exp - 8'd7,  div_man[15:0],  7'b0};
        else if (div_man[15]) div[30:0] = {div_exp - 8'd8,  div_man[14:0],  8'b0};
        else if (div_man[14]) div[30:0] = {div_exp - 8'd9,  div_man[13:0],  9'b0};
        else if (div_man[13]) div[30:0] = {div_exp - 8'd10, div_man[12:0], 10'b0};
        else if (div_man[12]) div[30:0] = {div_exp - 8'd11, div_man[11:0], 11'b0};
        else if (div_man[11]) div[30:0] = {div_exp - 8'd12, div_man[10:0], 12'b0};
        else if (div_man[10]) div[30:0] = {div_exp - 8'd13, div_man[9:0],  13'b0};
        else if (div_man[9])  div[30:0] = {div_exp - 8'd14, div_man[8:0],  14'b0};
        else if (div_man[8])  div[30:0] = {div_exp - 8'd15, div_man[7:0],  15'b0};
        else if (div_man[7])  div[30:0] = {div_exp - 8'd16, div_man[6:0],  16'b0};
        else if (div_man[6])  div[30:0] = {div_exp - 8'd17, div_man[5:0],  17'b0};
        else if (div_man[5])  div[30:0] = {div_exp - 8'd18, div_man[4:0],  18'b0};
        else if (div_man[4])  div[30:0] = {div_exp - 8'd19, div_man[3:0],  19'b0};
        else if (div_man[3])  div[30:0] = {div_exp - 8'd20, div_man[2:0],  20'b0};
        else if (div_man[2])  div[30:0] = {div_exp - 8'd21, div_man[1:0],  21'b0};
        else if (div_man[1])  div[30:0] = {div_exp - 8'd22, div_man[0:0],  22'b0};
        // Only bit 0 set, or the quotient is all zero: fraction 0, exponent reduced by 23.
        else                  div[30:0] = {div_exp - 8'd23, 23'b0};
    end
endmodule
module div_24b(numer, denom, res);
// Purely combinational, fully unrolled restoring divider.
//   numer : 24-bit dividend, treated as {numer, 23'b0} (47 bits)
//   denom : 24-bit divisor
//   res   : 24 quotient bits, produced MSB first, one per stage
// Every stage compares the divisor against the upper part of the running
// remainder; when the divisor fits, the quotient bit is set and the divisor
// is subtracted from that upper part. Only constant part-selects are used.
input  [23:0] numer;
input  [23:0] denom;
output [23:0] res;
reg    [23:0] res;

// Running remainders: remN is the remainder that feeds the stage producing res[N].
reg [46:0] rem23, rem22, rem21, rem20, rem19, rem18, rem17, rem16;
reg [46:0] rem15, rem14, rem13, rem12, rem11, rem10, rem9, rem8;
reg [46:0] rem7, rem6, rem5, rem4, rem3, rem2, rem1, rem0;

always @ (numer or denom) begin
    // Dividend padded with 23 zeros below the LSB.
    rem23 = {numer, 23'b0};

    // Each stage: assume "does not fit" first, then override when it does.
    res[23] = 1'b0;
    rem22 = rem23;
    if (denom <= rem23[46:23]) begin
        res[23] = 1'b1;
        rem22 = {rem23[46:23] - denom, 23'b0};
    end

    res[22] = 1'b0;
    rem21 = rem22;
    if (denom <= rem22[46:22]) begin
        res[22] = 1'b1;
        rem21 = {rem22[46:22] - denom, 22'b0};
    end

    res[21] = 1'b0;
    rem20 = rem21;
    if (denom <= rem21[46:21]) begin
        res[21] = 1'b1;
        rem20 = {rem21[46:21] - denom, 21'b0};
    end

    res[20] = 1'b0;
    rem19 = rem20;
    if (denom <= rem20[46:20]) begin
        res[20] = 1'b1;
        rem19 = {rem20[46:20] - denom, 20'b0};
    end

    res[19] = 1'b0;
    rem18 = rem19;
    if (denom <= rem19[46:19]) begin
        res[19] = 1'b1;
        rem18 = {rem19[46:19] - denom, 19'b0};
    end

    res[18] = 1'b0;
    rem17 = rem18;
    if (denom <= rem18[46:18]) begin
        res[18] = 1'b1;
        rem17 = {rem18[46:18] - denom, 18'b0};
    end

    res[17] = 1'b0;
    rem16 = rem17;
    if (denom <= rem17[46:17]) begin
        res[17] = 1'b1;
        rem16 = {rem17[46:17] - denom, 17'b0};
    end

    res[16] = 1'b0;
    rem15 = rem16;
    if (denom <= rem16[46:16]) begin
        res[16] = 1'b1;
        rem15 = {rem16[46:16] - denom, 16'b0};
    end

    res[15] = 1'b0;
    rem14 = rem15;
    if (denom <= rem15[46:15]) begin
        res[15] = 1'b1;
        rem14 = {rem15[46:15] - denom, 15'b0};
    end

    res[14] = 1'b0;
    rem13 = rem14;
    if (denom <= rem14[46:14]) begin
        res[14] = 1'b1;
        rem13 = {rem14[46:14] - denom, 14'b0};
    end

    res[13] = 1'b0;
    rem12 = rem13;
    if (denom <= rem13[46:13]) begin
        res[13] = 1'b1;
        rem12 = {rem13[46:13] - denom, 13'b0};
    end

    res[12] = 1'b0;
    rem11 = rem12;
    if (denom <= rem12[46:12]) begin
        res[12] = 1'b1;
        rem11 = {rem12[46:12] - denom, 12'b0};
    end

    res[11] = 1'b0;
    rem10 = rem11;
    if (denom <= rem11[46:11]) begin
        res[11] = 1'b1;
        rem10 = {rem11[46:11] - denom, 11'b0};
    end

    res[10] = 1'b0;
    rem9 = rem10;
    if (denom <= rem10[46:10]) begin
        res[10] = 1'b1;
        rem9 = {rem10[46:10] - denom, 10'b0};
    end

    res[9] = 1'b0;
    rem8 = rem9;
    if (denom <= rem9[46:9]) begin
        res[9] = 1'b1;
        rem8 = {rem9[46:9] - denom, 9'b0};
    end

    res[8] = 1'b0;
    rem7 = rem8;
    if (denom <= rem8[46:8]) begin
        res[8] = 1'b1;
        rem7 = {rem8[46:8] - denom, 8'b0};
    end

    res[7] = 1'b0;
    rem6 = rem7;
    if (denom <= rem7[46:7]) begin
        res[7] = 1'b1;
        rem6 = {rem7[46:7] - denom, 7'b0};
    end

    res[6] = 1'b0;
    rem5 = rem6;
    if (denom <= rem6[46:6]) begin
        res[6] = 1'b1;
        rem5 = {rem6[46:6] - denom, 6'b0};
    end

    res[5] = 1'b0;
    rem4 = rem5;
    if (denom <= rem5[46:5]) begin
        res[5] = 1'b1;
        rem4 = {rem5[46:5] - denom, 5'b0};
    end

    res[4] = 1'b0;
    rem3 = rem4;
    if (denom <= rem4[46:4]) begin
        res[4] = 1'b1;
        rem3 = {rem4[46:4] - denom, 4'b0};
    end

    res[3] = 1'b0;
    rem2 = rem3;
    if (denom <= rem3[46:3]) begin
        res[3] = 1'b1;
        rem2 = {rem3[46:3] - denom, 3'b0};
    end

    res[2] = 1'b0;
    rem1 = rem2;
    if (denom <= rem2[46:2]) begin
        res[2] = 1'b1;
        rem1 = {rem2[46:2] - denom, 2'b0};
    end

    res[1] = 1'b0;
    rem0 = rem1;
    if (denom <= rem1[46:1]) begin
        res[1] = 1'b1;
        rem0 = {rem1[46:1] - denom, 1'b0};
    end

    // Last stage only needs the quotient bit; no further remainder is kept.
    res[0] = 1'b0;
    if (denom <= rem0) begin
        res[0] = 1'b1;
    end
end
endmodule
//////////////////////////////////////////////
//
// constants.v
//
// Version 1.3
// Written 7/11/01 David_Harris@hmc.edu & Mark_Phair@hmc.edu
// Modified 8/20/01 Mark_Phair@hmc.edu and Justin_Schauer@hmc.edu
//
// A set of constants for a parameterized floating point multiplier and adder.
//
//////////////////////////////////////////////
//////////////////////////////////////////////
// FREE VARIABLES
//////////////////////////////////////////////
// Widths of Fields
// Field widths of the floating-point format used by the multiplier/adder.
// With WEXP = 8 and WSIG = 23 the total width below is 32 bits.
`define WEXP 8
`define WSIG 23
`define WFLAG 5
`define WCONTROL 5
// output flag select: bit position of each flag inside flags[`WFLAG-1:0]
`define DIVZERO 0
`define INVALID 1
`define INEXACT 2
`define OVERFLOW 3
`define UNDERFLOW 4
//////////////////////////////////////////////
// DEPENDENT VARIABLES
// (hard-coded numbers; the trailing comment gives the formula in terms of
//  the free variables, so they must be updated by hand if those change)
//////////////////////////////////////////////
`define WIDTH 32 // `WEXP + `WSIG + 1
`define PRODWIDTH 48 // 2 * (`WSIG + 1)
`define SHIFTWIDTH 96 // 2 * `PRODWIDTH
`define WPRENORM 24 // `WSIG + 1
`define WEXPSUM 10 // `WEXP + 2
`define BIAS 127 // 2^(`WEXP - 1) - 1
`define WSIGMINUS1 22 // `WSIG - 1, used for rounding
`define WSHIFTAMT 5 // log2(`WSIG + 1) rounded up
// for trapped over/underflow
`define UNDERBIAS 192 // 3 * 2^(`WEXP - 2)
// NOTE(review): OVERBIAS expands to a bare negative literal with no
// parentheses; take care when using it inside a larger expression.
`define OVERBIAS -192 // -`UNDERBIAS
// specialized constants for fpadd
`define EXTRASIG 25 // `WSIG + 2: the amount of precision needed so that no
// subtraction errors occur
`define SHIFT 5 // number of bits the max alignment shift fits in:
// log2(`WSIG + 2) rounded up to the nearest integer
`define MAX_EXP 8'b11111110 // the maximum non-infinite exponent,
// `WEXP bits, the most significant
// `WEXP-1 bits are 1, the LSB is 0
`define INF_EXP 8'b11111111 // Infinity exponent, `WEXP bits, all 1
// Max significand, `WSIG bits, all 1
`define MAX_SIG 23'b11111111111111111111111
`define WEXP_0 8'b0 // Exponent equal to zero (`WEXP bits)
`define WEXP_1 8'b1 // Exponent equal to one (`WEXP bits)
`define WSIG_0 23'b0 // Significand equal to zero (`WSIG bits)
`define WSIG_1 23'b1 // Significand equal to one (`WSIG bits)
`define EXTRASIG_0 25'b0 // All adder result bits zero (`EXTRASIG bits)
// specialized constants for fpmul
`define MAXSHIFT 24 // `WSIG + 1
// GENERAL SPECIAL NUMBERS - exponent + significand of special numbers
// (each is `WIDTH-1 = 31 bits wide, i.e. everything except the sign bit)
// plain NaN: exponent all 1, most significant significand bit 1, remaining bits 0
`define CONSTNAN {9'b111111111,22'b0}
// zero: all `WIDTH-1 bits 0
`define CONSTZERO 31'b0
// infinity: `WEXP exponent bits all 1, `WSIG significand bits all 0
`define CONSTINFINITY {8'b11111111, 23'b0}
// largest finite number: maximum exponent (all 1's minus 1) and maximum significand (all 1's)
`define CONSTLARGEST {`MAX_EXP, `MAX_SIG}
`define PRESHIFTZEROS 48'b0 // `PRODWIDTH zero bits
//////////////////////////////////////////////
//
// fpmul.v
//
// Version 1.6
// Written 07/11/01 David_Harris@hmc.edu & Mark_Phair@hmc.edu
// Modified 08/20/01 Mark_Phair@hmc.edu
//
// A parameterized floating point multiplier.
//
// BLOCK DESCRIPTIONS
//
// preprocess - general processing, such as zero detection, computing sign, NaN
//
// prenorm - normalize denorms
//
// exponent - sum the exponents, check for tininess before rounding
//
// multiply - multiply the mantissae
//
// special - calculate special cases, such as NaN and infinities
//
// shift - shift the sig and exp if necessary.
//
// round - round product
//
// normalize - normalizes the result if appropriate (i.e. not a denormalized #)
//
// flag - general flag processing
//
// assemble - assemble results
//
//////////////////////////////////////////////
//////////////////////////////////////////////
// Includes
//////////////////////////////////////////////
//////////////////////////////////////////////
// fpmul module
//////////////////////////////////////////////
module fpmul(clk, a, b, y_out, control, flags) ;
// Floating-point multiplier assembled from the sub-blocks instantiated below.
// The product y is computed combinationally and captured into y_out on every
// rising edge of clk. flags is driven combinationally by the flag block and
// is not registered in this module.
input clk;
// external signals
input [`WIDTH-1:0] a, b; // floating-point inputs
output [`WIDTH-1:0] y_out; // floating-point product (registered copy of y)
reg [`WIDTH-1:0] y_out;
input [1:0] control; // control including rounding mode
output [`WFLAG-1:0] flags; // DIVZERO, INVALID, INEXACT,
// OVERFLOW, UNDERFLOW (defined in constant.v)
//intermediate y_out
wire [`WIDTH-1:0]y;
// internal signals
wire sign; // sign of product (driven by preprocess); declared explicitly
           // so the design does not depend on implicit net creation
wire specialsign; // sign of special
wire [`WSIG:0] norma; // normal-form mantissa a, 1 bit larger to hold leading 1
wire [`WSIG:0] normb; // normal-form mantissa b, 1 bit larger to hold leading 1
wire [`WEXPSUM-1:0] expa, expb; // the two exponents, after prenormalization
wire [`WEXPSUM-1:0] expsum; // sum of exponents (two's complement)
wire [`WEXPSUM-1:0] shiftexp; // shifted exponent
wire [`WEXP-1:0] roundexp; // rounded, correct exponent
wire [`PRODWIDTH-1:0] prod; // product of mantissae
wire [`PRODWIDTH-1:0] normalized; // Normalized product
wire [`SHIFTWIDTH-1:0] shiftprod; // shifted product
wire [`WSIG-1:0] roundprod; // rounded product
wire [`WIDTH-2:0] special; // special case exponent and product
wire twoormore; // product is outside range [1,2)
wire zero; // zero detected
wire infinity; // infinity detected
wire aisnan; // NaN detected in A
wire bisnan; // NaN detected in B
wire aisdenorm; // Denormalized number detected in A
wire bisdenorm; // Denormalized number detected in B
wire specialcase; // This is a special case
wire specialsigncase; // Use the special case sign
wire invalid; // invalid operation
wire overflow; // exponent result too high, standard overflow
wire inexact; // inexact flag
wire shiftloss; // lost digits due to a shift, result inaccurate
wire [1:0] roundmode; // rounding mode information extracted from control field
wire tiny; // Result is tiny (denormalized #) after multiplication
wire stilltiny; // Result is tiny (denormalized #) after rounding
wire denormround; // rounding occurred only because the initial result was
// a denormalized number. This is used to determine
// underflow in cases of denormalized numbers rounding
// up to normalized numbers
preprocess preprocesser(a, b, zero, aisnan, bisnan,
aisdenorm, bisdenorm, infinity,
control, roundmode, sign);
special specialer(a, b, special, specialsign, zero,
aisnan, bisnan,
infinity, invalid,
specialcase, specialsigncase);
prenorm prenormer(a[`WIDTH-2:0], b[`WIDTH-2:0], norma, normb, expa, expb, aisdenorm, bisdenorm);
multiply_a multiplier(norma, normb, prod, twoormore);
exponent exponenter(expa, expb, expsum, twoormore, tiny);
normalize normalizer(prod, normalized, tiny, twoormore);
shift shifter(normalized, expsum, shiftprod,
shiftexp, shiftloss);
round rounder(shiftprod, shiftexp, shiftloss,
roundprod, roundexp,
roundmode, sign, tiny, inexact,
overflow, stilltiny, denormround);
// *** To check for tininess before rounding, use tiny
// To check after rounding, use stilltiny
// *** for underflow detect:
// To check for inexact result use (inexact | (shiftloss & stilltiny)),
// To check for denormalization loss use (shiftloss & stilltiny)
//ODIN cannot have operations in module instantiations, so the expressions
//passed to the flag block are computed on named wires first.
wire inexact_or_shiftloss;
assign inexact_or_shiftloss = inexact | shiftloss;
wire shiftloss_or_inexact;
assign shiftloss_or_inexact = shiftloss | inexact;
wire still_tiny_or_tiny_and_denormround;
assign still_tiny_or_tiny_and_denormround = stilltiny | (tiny & denormround);
// The "tiny" connection must use exactly the wire declared above: any other
// spelling silently creates an implicit, undriven 1-bit net and leaves the
// UNDERFLOW flag floating.
flag flager(invalid, overflow, inexact_or_shiftloss,
shiftloss_or_inexact,
/* tiny */ still_tiny_or_tiny_and_denormround,
specialcase, flags);
assemble assembler(roundprod, special, y,
sign, specialsign, roundexp,
specialcase, specialsigncase,
roundmode, flags[`OVERFLOW]);
// Output register: one clock of latency between inputs and y_out.
always @ (posedge clk) begin
y_out <= y;
end
endmodule
module preprocess(a, b, zero, aisnan, bisnan, aisdenorm, bisdenorm, infinity, control, roundmode, sign);
// Operand classification for the multiplier.
// Splits each operand into sign / exponent / significand fields and reports
// whether either operand is zero or infinity, and whether a or b individually
// is a NaN or a denormalized number. Also forwards the rounding mode and
// computes the product sign.
input [`WIDTH-1:0] a, b;    // floating-point operands
input [1:0] control;        // only the two rounding-mode bits are used
output zero;                // at least one operand is zero
output [1:0] roundmode;     // 00 = RN; 01 = RZ; 10 = RP; 11 = RM
output aisnan;              // a is a NaN
output bisnan;              // b is a NaN
output aisdenorm;           // a is a denormalized number
output bisdenorm;           // b is a denormalized number
output infinity;            // at least one operand is infinity
output sign;                // sign of the product

// Per-operand field tests.
wire a_exp_ones, b_exp_ones;   // exponent field is all 1's
wire a_exp_none, b_exp_none;   // exponent field is all 0's
wire a_frac_none, b_frac_none; // significand field is all 0's

assign a_exp_ones  =  &a[`WIDTH-2:`WIDTH-`WEXP-1];
assign b_exp_ones  =  &b[`WIDTH-2:`WIDTH-`WEXP-1];
assign a_exp_none  = ~|a[`WIDTH-2:`WIDTH-`WEXP-1];
assign b_exp_none  = ~|b[`WIDTH-2:`WIDTH-`WEXP-1];
assign a_frac_none = ~|a[`WSIG-1:0];
assign b_frac_none = ~|b[`WSIG-1:0];

// Product sign is the XOR of the operand sign bits.
assign sign = a[`WIDTH-1] ^ b[`WIDTH-1];

// Exponent all 0's: zero when the significand is also 0, denorm otherwise.
assign zero      = (a_exp_none & a_frac_none) | (b_exp_none & b_frac_none);
assign aisdenorm = a_exp_none & ~a_frac_none;
assign bisdenorm = b_exp_none & ~b_frac_none;

// Exponent all 1's: infinity when the significand is 0, NaN otherwise.
assign infinity = (a_exp_ones & a_frac_none) | (b_exp_ones & b_frac_none);
assign aisnan   = a_exp_ones & ~a_frac_none;
assign bisnan   = b_exp_ones & ~b_frac_none;

// Rounding mode is passed straight through from the control input.
assign roundmode = control[1:0];
endmodule
module special (a, b, special, specialsign,
                zero, aisnan, bisnan, infinity,
                invalid, specialcase, specialsigncase);
// Produces the result (exponent + significand, without the sign bit) and the
// sign to use when the multiplication hits a special case: a NaN operand,
// a zero operand, or an infinite operand.
input [`WIDTH-1:0] a, b;       // floating-point operands
output [`WIDTH-2:0] special;   // special-case result, exp + sig
output specialsign;            // sign for the special-case result
input zero;                    // an operand is zero
input aisnan;                  // a is a NaN
input bisnan;                  // b is a NaN
input infinity;                // an operand is infinity
output invalid;                // invalid operation (infinity times zero)
output specialcase;            // the special result must be used
output specialsigncase;        // the special sign must be used

wire inf_times_zero;           // both an infinity and a zero are present
wire any_nan;                  // at least one operand is a NaN
wire take_a;                   // a supplies the NaN payload and sign
wire [`WIDTH-2:0] nan_result;  // propagated NaN (don't care without a NaN input)
wire [`WIDTH-2:0] other_result;// result when neither operand is a NaN

assign inf_times_zero = (infinity & zero);
assign any_nan = (aisnan | bisnan);

// Return the "higher" NaN: a wins when it is a NaN and either b is not a
// NaN or a's significand is greater than or equal to b's.
assign take_a = (aisnan & ((a[`WSIG-1:0] >= b[`WSIG-1:0]) | ~bisnan));
assign nan_result[`WIDTH-2:0] = take_a ? a[`WIDTH-2:0] : b[`WIDTH-2:0];

// Without a NaN input: zero * infinity -> NaN, zero * finite -> zero,
// anything else reaching here -> infinity.
assign other_result[`WIDTH-2:0] = (zero ?
                                   (infinity ? (`CONSTNAN) : (`CONSTZERO)) : (`CONSTINFINITY));

assign special[`WIDTH-2:0] = any_nan ? (nan_result[`WIDTH-2:0]) : other_result[`WIDTH-2:0];

assign specialcase = zero | aisnan | bisnan | infinity;
assign invalid = inf_times_zero; //*** signaling NaNs are not handled here

// infinity * zero forces the sign bit to 1; otherwise the sign follows the
// operand selected by take_a (only meaningful when a NaN is present, which
// is the only other case in which specialsigncase is asserted).
assign specialsign = inf_times_zero ? (1'b1) : (take_a ? a[`WIDTH-1] : b[`WIDTH-1]);
assign specialsigncase = inf_times_zero | aisnan | bisnan;
endmodule
module prenorm(a, b, norma, normb, modexpa, modexpb, aisdenorm, bisdenorm);
// Pre-normalization of the two multiplier operands.
// For a normal operand the mantissa is {1, significand} and the exponent is
// passed through. For a denormalized operand the significand is shifted left
// until its leading 1 reaches the top bit, and the exponent becomes
// 1 - (shift amount), expressed in two's complement.
input [`WIDTH-2:0] a, b;                 // operands without the sign bit
output [`WSIG:0] norma, normb;           // mantissae in normal form
output [`WEXPSUM-1:0] modexpa, modexpb;  // exponents, widened for two's complement
input aisdenorm;                         // a is a denormalized number
input bisdenorm;                         // b is a denormalized number

wire [`WEXPSUM-1:0] raw_exp_a, raw_exp_b; // exponent fields, zero-extended
wire [`SHIFT-1:0] lead_a, lead_b;         // left-shift needed to normalize a denorm
reg [`WSIG:0] sig_a_shl, sig_b_shl;       // significands after that shift (reg for ODIN)

assign raw_exp_a = a[`WIDTH-2:`WIDTH-1-`WEXP];
assign raw_exp_b = b[`WIDTH-2:`WIDTH-1-`WEXP];

// Priority encoders: position of the first 1 in the significand, scanning
// from the top bit down. Bit 0 is not tested; it is the fallback (23).
assign lead_a = a[22] ? 1 :
                a[21] ? 2 :
                a[20] ? 3 :
                a[19] ? 4 :
                a[18] ? 5 :
                a[17] ? 6 :
                a[16] ? 7 :
                a[15] ? 8 :
                a[14] ? 9 :
                a[13] ? 10 :
                a[12] ? 11 :
                a[11] ? 12 :
                a[10] ? 13 :
                a[9] ? 14 :
                a[8] ? 15 :
                a[7] ? 16 :
                a[6] ? 17 :
                a[5] ? 18 :
                a[4] ? 19 :
                a[3] ? 20 :
                a[2] ? 21 :
                a[1] ? 22 :
                23;
assign lead_b = b[22] ? 1 :
                b[21] ? 2 :
                b[20] ? 3 :
                b[19] ? 4 :
                b[18] ? 5 :
                b[17] ? 6 :
                b[16] ? 7 :
                b[15] ? 8 :
                b[14] ? 9 :
                b[13] ? 10 :
                b[12] ? 11 :
                b[11] ? 12 :
                b[10] ? 13 :
                b[9] ? 14 :
                b[8] ? 15 :
                b[7] ? 16 :
                b[6] ? 17 :
                b[5] ? 18 :
                b[4] ? 19 :
                b[3] ? 20 :
                b[2] ? 21 :
                b[1] ? 22 :
                23;

// A denorm's exponent is decremented by the shift amount.
assign modexpa = aisdenorm ? 1 - lead_a : raw_exp_a;
assign modexpb = bisdenorm ? 1 - lead_b : raw_exp_b;

// Left-shift of a's significand. Written as a case over constant shift
// amounts because ODIN only accepts constant shifts.
always @ (lead_a or a) begin
    case (lead_a)
        5'd1:  sig_a_shl = a[`WSIG-1:0] << 1;
        5'd2:  sig_a_shl = a[`WSIG-1:0] << 2;
        5'd3:  sig_a_shl = a[`WSIG-1:0] << 3;
        5'd4:  sig_a_shl = a[`WSIG-1:0] << 4;
        5'd5:  sig_a_shl = a[`WSIG-1:0] << 5;
        5'd6:  sig_a_shl = a[`WSIG-1:0] << 6;
        5'd7:  sig_a_shl = a[`WSIG-1:0] << 7;
        5'd8:  sig_a_shl = a[`WSIG-1:0] << 8;
        5'd9:  sig_a_shl = a[`WSIG-1:0] << 9;
        5'd10: sig_a_shl = a[`WSIG-1:0] << 10;
        5'd11: sig_a_shl = a[`WSIG-1:0] << 11;
        5'd12: sig_a_shl = a[`WSIG-1:0] << 12;
        5'd13: sig_a_shl = a[`WSIG-1:0] << 13;
        5'd14: sig_a_shl = a[`WSIG-1:0] << 14;
        5'd15: sig_a_shl = a[`WSIG-1:0] << 15;
        5'd16: sig_a_shl = a[`WSIG-1:0] << 16;
        5'd17: sig_a_shl = a[`WSIG-1:0] << 17;
        5'd18: sig_a_shl = a[`WSIG-1:0] << 18;
        5'd19: sig_a_shl = a[`WSIG-1:0] << 19;
        5'd20: sig_a_shl = a[`WSIG-1:0] << 20;
        5'd21: sig_a_shl = a[`WSIG-1:0] << 21;
        5'd22: sig_a_shl = a[`WSIG-1:0] << 22;
        5'd23: sig_a_shl = a[`WSIG-1:0] << 23;
        default: sig_a_shl = a[`WSIG-1:0]; // lead_a is never above 23
    endcase
end
assign norma = aisdenorm ? sig_a_shl : {1'b1, a[`WSIG-1:0]};

// Same shifter for b.
always @ (lead_b or b) begin
    case (lead_b)
        5'd1:  sig_b_shl = b[`WSIG-1:0] << 1;
        5'd2:  sig_b_shl = b[`WSIG-1:0] << 2;
        5'd3:  sig_b_shl = b[`WSIG-1:0] << 3;
        5'd4:  sig_b_shl = b[`WSIG-1:0] << 4;
        5'd5:  sig_b_shl = b[`WSIG-1:0] << 5;
        5'd6:  sig_b_shl = b[`WSIG-1:0] << 6;
        5'd7:  sig_b_shl = b[`WSIG-1:0] << 7;
        5'd8:  sig_b_shl = b[`WSIG-1:0] << 8;
        5'd9:  sig_b_shl = b[`WSIG-1:0] << 9;
        5'd10: sig_b_shl = b[`WSIG-1:0] << 10;
        5'd11: sig_b_shl = b[`WSIG-1:0] << 11;
        5'd12: sig_b_shl = b[`WSIG-1:0] << 12;
        5'd13: sig_b_shl = b[`WSIG-1:0] << 13;
        5'd14: sig_b_shl = b[`WSIG-1:0] << 14;
        5'd15: sig_b_shl = b[`WSIG-1:0] << 15;
        5'd16: sig_b_shl = b[`WSIG-1:0] << 16;
        5'd17: sig_b_shl = b[`WSIG-1:0] << 17;
        5'd18: sig_b_shl = b[`WSIG-1:0] << 18;
        5'd19: sig_b_shl = b[`WSIG-1:0] << 19;
        5'd20: sig_b_shl = b[`WSIG-1:0] << 20;
        5'd21: sig_b_shl = b[`WSIG-1:0] << 21;
        5'd22: sig_b_shl = b[`WSIG-1:0] << 22;
        5'd23: sig_b_shl = b[`WSIG-1:0] << 23;
        default: sig_b_shl = b[`WSIG-1:0]; // lead_b is never above 23
    endcase
end
assign normb = bisdenorm ? sig_b_shl : {1'b1, b[`WSIG-1:0]};
endmodule
module multiply_a (norma, normb, prod, twoormore);
// Mantissa multiplier: full-width product of the two normalized mantissae.
input [`WSIG:0] norma, normb;   // normalized mantissae (with leading 1)
output [`PRODWIDTH-1:0] prod;   // product of the mantissae
output twoormore;               // product left the range [1,2)

// Plain multiplication operator; the synthesis tool picks the multiplier.
assign prod = normb * norma;

// The top product bit is set exactly when the product is 2 or more.
assign twoormore = prod[`PRODWIDTH-1];
endmodule
module exponent(expa, expb, expsum, twoormore, tiny);
// Exponent path of the multiplier: adds the two prenormalized exponents,
// removes one bias and adds one when the mantissa product reached 2 or more.
input [`WEXPSUM-1:0] expa, expb;  // exponents in two's complement form
                                  // (prenormalized denorms can be negative)
input twoormore;                  // mantissa product is outside [1,2)
output [`WEXPSUM-1:0] expsum;     // resulting exponent, two's complement
output tiny;                      // result is tiny (denormalized #)

assign expsum = expa + expb + twoormore - `BIAS;

// Tiny means the exponent is below 1: either the sign bit is set
// (negative) or all remaining bits are zero.
assign tiny = expsum[`WEXPSUM-1] | ~|expsum[`WEXPSUM-2:0];
endmodule
module normalize(prod, normalized, tiny, twoormore);
// Aligns the mantissa product so that the bits to keep start at the MSB.
input [`PRODWIDTH-1:0] prod;         // mantissa product
output [`PRODWIDTH-1:0] normalized;  // aligned product
input tiny;                          // result is tiny (denormalized #)
input twoormore;                     // product left the range [1,2)

wire keep_all;   // tiny AND product >= 2: keep every bit, leading 1 included
wire drop_one;   // exactly one of the two: drop only the top bit

assign keep_all = tiny & twoormore;
assign drop_one = tiny ^ twoormore;

// Three cases:
//   keep_all          -> whole product
//   drop_one          -> shift left by 1 (top bit is 0 if tiny, implied 1 otherwise)
//   neither           -> shift left by 2 (second bit is the implied 1)
assign normalized = keep_all ? prod[`PRODWIDTH-1:0] :
                    (drop_one ? {prod[`PRODWIDTH-2:0],1'b0} :
                                {prod[`PRODWIDTH-3:0],2'b0});
endmodule
module shift(normalized, selectedexp, shiftprod, shiftexp, shiftloss);
// Right-shifts the normalized product when the exponent is zero or negative.
// The product is widened with `PRODWIDTH zero bits below it so that shifted-out
// bits can still be inspected, and the exponent is forced to 0 when a shift
// is applied.
input [`PRODWIDTH-1:0] normalized;   // normalized product of mantissae
input [`WEXPSUM-1:0] selectedexp;    // sum of exponents
output [`SHIFTWIDTH-1:0] shiftprod;  // shifted and normalized product
output [`WEXPSUM-1:0] shiftexp;      // exponent after shifting
output shiftloss;                    // accuracy was lost by the shift

wire [`WEXPSUM-1:0] roundedexp;      // declared but not used in this module
wire [`WEXPSUM-1:0] neg_exp;         // negated exponent = requested shift amount
wire [`WSHIFTAMT-1:0] shift_sel;     // shift amount actually applied
wire flush;                          // requested shift exceeds `MAXSHIFT
wire shift_en;                       // negated exponent is non-negative
wire [`SHIFTWIDTH-1:0] wide_in;      // product extended with low zero bits
reg [`SHIFTWIDTH-1:0] wide_out;      // extended product after the shift (reg for ODIN)

assign wide_in = {normalized, `PRESHIFTZEROS};
assign neg_exp = -selectedexp;

// Shift only when -exponent has its sign bit clear.
assign shift_en = ~neg_exp[`WEXPSUM-1];

// Beyond `MAXSHIFT the exact amount no longer matters; clamp it.
assign flush = shift_en & (neg_exp > `MAXSHIFT);
assign shift_sel = flush ? `MAXSHIFT : neg_exp[`WSHIFTAMT-1:0];

// Right shifter written as a case over constant amounts, because ODIN only
// accepts constant shifts. A shift amount of 0 takes the default arm.
always @ (shift_sel or wide_in) begin
    case (shift_sel)
        5'd1:  wide_out = wide_in >> 1;
        5'd2:  wide_out = wide_in >> 2;
        5'd3:  wide_out = wide_in >> 3;
        5'd4:  wide_out = wide_in >> 4;
        5'd5:  wide_out = wide_in >> 5;
        5'd6:  wide_out = wide_in >> 6;
        5'd7:  wide_out = wide_in >> 7;
        5'd8:  wide_out = wide_in >> 8;
        5'd9:  wide_out = wide_in >> 9;
        5'd10: wide_out = wide_in >> 10;
        5'd11: wide_out = wide_in >> 11;
        5'd12: wide_out = wide_in >> 12;
        5'd13: wide_out = wide_in >> 13;
        5'd14: wide_out = wide_in >> 14;
        5'd15: wide_out = wide_in >> 15;
        5'd16: wide_out = wide_in >> 16;
        5'd17: wide_out = wide_in >> 17;
        5'd18: wide_out = wide_in >> 18;
        5'd19: wide_out = wide_in >> 19;
        5'd20: wide_out = wide_in >> 20;
        5'd21: wide_out = wide_in >> 21;
        5'd22: wide_out = wide_in >> 22;
        5'd23: wide_out = wide_in >> 23;
        5'd24: wide_out = wide_in >> 24;
        5'd25: wide_out = wide_in >> 25;
        5'd26: wide_out = wide_in >> 26;
        5'd27: wide_out = wide_in >> 27;
        5'd28: wide_out = wide_in >> 28;
        5'd29: wide_out = wide_in >> 29;
        5'd30: wide_out = wide_in >> 30;
        5'd31: wide_out = wide_in >> 31;
        default: wide_out = wide_in;
    endcase
end

// Use the shifted value only when shifting is enabled.
assign shiftprod = shift_en ? wide_out : wide_in;

// Loss: either the shift was clamped, or non-zero bits ended up in the
// extension area below the original product width.
assign shiftloss = flush | (shift_en & |wide_out[`SHIFTWIDTH-`PRODWIDTH-1:0]);

// A shifted (denormalized) result has exponent 0.
assign shiftexp = shift_en ? 0 : selectedexp;
endmodule
module round(shiftprod, shiftexp, shiftloss, roundprod, roundexp, roundmode,
             sign, tiny, inexact, overflow, stilltiny, denormround);
// Rounds the shifted product to `WSIG significand bits according to the
// rounding mode and sign, adjusts the exponent when rounding carries out,
// and reports inexact / overflow / tininess.
input [`SHIFTWIDTH-1:0] shiftprod; // normalized and shifted product of mantissae
input [`WEXPSUM-1:0] shiftexp;     // shifted exponent
input shiftloss;                   // bits were lost in the shifting process
output [`WSIG-1:0] roundprod;      // rounded significand
output [`WEXP-1:0] roundexp;       // rounded exponent
input [1:0] roundmode;             // 00 = RN; 01 = RZ; 10 = RP; 11 = RM
input sign;                        // sign bit, gives the direction for RP/RM
input tiny;                        // tininess indication from the exponent stage
output inexact;                    // rounding occurred
output overflow;                   // overflow occurred
output stilltiny;                  // result exponent is zero after rounding
output denormround;                // rounded up only because it was a denorm

wire toward_zero;                  // effective mode truncates
wire toward_inf;                   // effective mode rounds away from zero
wire lsb_bit;                      // lowest kept bit
wire guard_bit;                    // first discarded bit
wire sticky;                       // any 1 below the guard bit, or shift loss
wire sticky_denorm;                // same, one bit position lower
wire [`WSIG-1:0] kept;             // the `WSIG most significant bits
wire [`WSIG:0] kept_inc;           // kept + 1, one bit wider to hold the carry
wire round_up;                     // final decision to use kept + 1
wire carry_out;                    // rounding carried out of the significand
wire [`WEXPSUM-1:0] exp_adj;       // exponent after the carry is added

// Reduce the four modes to truncate / away-from-zero; round-to-nearest is
// implied when both are 0. RP and RM map to one or the other by sign.
assign toward_zero = (roundmode[0] & ~roundmode[1]) | ((sign ^ roundmode[0]) & roundmode[1]);
assign toward_inf  = ~(sign ^ roundmode[0]) & roundmode[1];

// Slice the product: kept bits, then guard bit, then everything below.
assign kept      = shiftprod[`SHIFTWIDTH-1:`SHIFTWIDTH-`WSIG];
assign kept_inc  = kept + 1;
assign lsb_bit   = shiftprod[`SHIFTWIDTH-`WSIG];
assign guard_bit = shiftprod[`SHIFTWIDTH-`WSIG-1];

assign sticky        = |shiftprod[`SHIFTWIDTH-`WSIG-2:0] | shiftloss;
assign sticky_denorm = |shiftprod[`SHIFTWIDTH-`WSIG-3:0] | shiftloss;

// Round up unless truncating, when either the guard bit is set and
// (away-from-zero or odd LSB), or sticky is set and (away-from-zero or guard).
assign round_up = ~toward_zero & ( (guard_bit & (toward_inf | lsb_bit))
                                 | (sticky & (toward_inf | guard_bit))
                                 );

// Was this only rounded because it is a denorm?
assign denormround = tiny & round_up & ~sticky_denorm & guard_bit;

// Carry out of the significand counts only if the increment is actually used.
assign carry_out = kept_inc[`WSIG] & round_up;

// Select the significand; on a carry the significand wraps to 0 and the
// exponent takes the increment instead.
assign roundprod = round_up ?
                   (carry_out ? 0 :
                    kept_inc[`WSIG-1:0]) :
                   kept;

assign inexact = round_up | sticky | guard_bit;

assign exp_adj = carry_out + shiftexp;

// Overflow: the low `WEXP bits are all 1's, or any higher bit is set.
assign overflow = (&exp_adj[`WEXP-1:0]) | (|exp_adj[`WEXPSUM-1:`WEXP]);

// On overflow the exponent is replaced later; otherwise the dropped
// leading bits are 0, so truncation is safe.
assign roundexp = exp_adj[`WEXP-1:0];

// roundexp is not two's complement, so "less than 1" means exactly zero.
assign stilltiny = ~|roundexp;
endmodule
module flag (invalid, overflow, inexact, underflow, tiny, specialcase, flags);
  // Builds the exception-flag vector from the individual status signals.
  // Bit positions come from the `DIVZERO / `INVALID / `INEXACT / `OVERFLOW /
  // `UNDERFLOW macros (the original comment places them in constant.v).

  input invalid;      // invalid operation
  input overflow;     // the result was too large
  input inexact;      // the result was rounded
  input underflow;    // underflow detected
  input tiny;         // the result is tiny
  input specialcase;  // a special result is in use; gates INEXACT and OVERFLOW

  output [`WFLAG-1:0] flags;

  // High when flag reporting is not suppressed by a special-case result.
  wire report_enable;
  assign report_enable = ~specialcase;

  // Any condition that makes the delivered result differ from the exact one.
  wire result_imprecise;
  assign result_imprecise = inexact | underflow | overflow;

  // UNDERFLOW follows `tiny` directly and is NOT gated by specialcase.
  // The fuller form (~specialcase & tiny & underflow & ~overflow) was left
  // disabled by the original author and remains disabled here.
  assign flags[`UNDERFLOW] = tiny;

  assign flags[`OVERFLOW]  = report_enable & overflow;
  assign flags[`INEXACT]   = report_enable & result_imprecise;

  // INVALID is passed through without special-case gating.
  assign flags[`INVALID]   = invalid;

  // This unit never reports divide-by-zero.
  assign flags[`DIVZERO]   = 1'b0;
endmodule
module assemble(roundprod, special, y, sign, specialsign,
                shiftexp, specialcase, specialsigncase,
                roundmode, overflow);
  // Packs the final floating-point product `y` from one of three sources:
  //   * the special-case value, when `specialcase` is set;
  //   * a saturated overflow value, when `overflow` is set;
  //   * otherwise the rounded significand and exponent.
  // The sign bit is chosen independently via `specialsigncase`.

  // external signals
  input  [`WSIG-1:0]  roundprod;        // shifted, rounded and normalized
                                        // product of mantissae
  input  [`WIDTH-2:0] special;          // special case product + exponent
  output [`WIDTH-1:0] y;                // floating-point product
  input               sign;             // sign of product (+ = 0, - = 1)
  input               specialsign;      // special case sign
  input  [`WEXP-1:0]  shiftexp;         // shifted exponent
  input               specialcase;      // this is a special case
  input               specialsigncase;  // use the special case sign
  input  [1:0]        roundmode;        // rounding mode information extracted
                                        // from control field
  input               overflow;         // overflow detected

  // internal signals
  // (the previously declared `undenormed` wire was never driven or read and
  //  has been removed)
  wire [`WIDTH-2:0] rounded;            // final product + exponent
  wire [`WIDTH-2:0] overflowvalue;      // product + exponent for overflow condition

  // SET UP ROUNDED PRODUCT + EXPONENT
  // significand occupies the low `WSIG bits
  assign rounded[`WSIG-1:0] = roundprod;
  // exponent occupies the `WEXP bits directly below the sign position
  assign rounded[`WIDTH-2:`WIDTH-`WEXP-1] = shiftexp;

  // SET UP OVERFLOW CONDITION
  // roundmode[1] set:   the choice depends on the sign, (sign ^ roundmode[0])
  //                     selects `CONSTLARGEST, otherwise `CONSTINFINITY.
  // roundmode[1] clear: roundmode[0] alone selects `CONSTLARGEST over
  //                     `CONSTINFINITY.
  assign overflowvalue[`WIDTH-2:0] = roundmode[1] ?
                                     ((sign ^ roundmode[0]) ? `CONSTLARGEST : `CONSTINFINITY) :
                                     (roundmode[0] ? `CONSTLARGEST : `CONSTINFINITY);

  // FINAL PRODUCT ASSIGN
  // sign bit
  assign y[`WIDTH-1] = specialsigncase ? specialsign : sign;

  // magnitude: special case has priority over overflow, which has priority
  // over the normally rounded result
  assign y[`WIDTH-2:0] = specialcase ? special[`WIDTH-2:0] :
                         (overflow ? overflowvalue[`WIDTH-2:0] :
                                     rounded[`WIDTH-2:0]);
endmodule
//---------------------------------------
// A dual-port RAM
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram (
    input clk,
    input we1,
    input we2,
    input [`rRAMSIZEWIDTH - 1 : 0] addr1,
    input [`RAMWIDTH - 1 : 0] data1,
    output [`RAMWIDTH - 1 : 0] out1,
    input [`rRAMSIZEWIDTH - 1 : 0] addr2,
    input [`RAMWIDTH - 1 : 0] data2,
    output [`RAMWIDTH - 1 : 0] out2
);
    // Two-port synchronous memory sharing one storage array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**`rRAMSIZEWIDTH words, each `RAMWIDTH bits wide.
    reg [`RAMWIDTH - 1 : 0] mem [2**`rRAMSIZEWIDTH - 1 : 0];

    // Registered read data, one register per port.
    reg [`RAMWIDTH - 1 : 0] rd_q1;
    reg [`RAMWIDTH - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule
//---------------------------------------
// A dual-port RAM 256x32
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_256x32 (
    input clk,
    input we1,
    input we2,
    input [8 - 1 : 0] addr1,
    input [32 - 1 : 0] data1,
    output [32 - 1 : 0] out1,
    input [8- 1 : 0] addr2,
    input [32 - 1 : 0] data2,
    output [32 - 1 : 0] out2
);
    // Two-port synchronous memory, 256 words of 32 bits, sharing one array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**8 = 256 words, each 32 bits wide.
    reg [32 - 1 : 0] mem [2**8 - 1 : 0];

    // Registered read data, one register per port.
    reg [32 - 1 : 0] rd_q1;
    reg [32 - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule
//---------------------------------------
// A dual-port RAM rFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_rfifo (
    input clk,
    input we1,
    input we2,
    input [`rFIFOSIZEWIDTH - 1 : 0] addr1,
    input [`rFIFOINPUTWIDTH - 1 : 0] data1,
    output [`rFIFOINPUTWIDTH - 1 : 0] out1,
    input [`rFIFOSIZEWIDTH - 1 : 0] addr2,
    input [`rFIFOINPUTWIDTH - 1 : 0] data2,
    output [`rFIFOINPUTWIDTH - 1 : 0] out2
);
    // Two-port synchronous memory sharing one storage array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**`rFIFOSIZEWIDTH words, each `rFIFOINPUTWIDTH bits wide.
    reg [`rFIFOINPUTWIDTH - 1 : 0] mem [2**`rFIFOSIZEWIDTH - 1 : 0];

    // Registered read data, one register per port.
    reg [`rFIFOINPUTWIDTH - 1 : 0] rd_q1;
    reg [`rFIFOINPUTWIDTH - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule
//---------------------------------------
// A dual-port RAM wFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_wfifo (
    input clk,
    input we1,
    input we2,
    input [`wFIFOSIZEWIDTH - 1 : 0] addr1,
    input [`wFIFOINPUTWIDTH - 1 : 0] data1,
    output [`wFIFOINPUTWIDTH - 1 : 0] out1,
    input [`wFIFOSIZEWIDTH - 1 : 0] addr2,
    input [`wFIFOINPUTWIDTH - 1 : 0] data2,
    output [`wFIFOINPUTWIDTH - 1 : 0] out2
);
    // Two-port synchronous memory sharing one storage array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**`wFIFOSIZEWIDTH words, each `wFIFOINPUTWIDTH bits wide.
    reg [`wFIFOINPUTWIDTH - 1 : 0] mem [2**`wFIFOSIZEWIDTH - 1 : 0];

    // Registered read data, one register per port.
    reg [`wFIFOINPUTWIDTH - 1 : 0] rd_q1;
    reg [`wFIFOINPUTWIDTH - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule
//---------------------------------------
// A dual-port RAM aFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_afifo (
    input clk,
    input we1,
    input we2,
    input [`aFIFOSIZEWIDTH - 1 : 0] addr1,
    input [`aFIFOWIDTH - 1 : 0] data1,
    output [`aFIFOWIDTH - 1 : 0] out1,
    input [`aFIFOSIZEWIDTH - 1 : 0] addr2,
    input [`aFIFOWIDTH - 1 : 0] data2,
    output [`aFIFOWIDTH - 1 : 0] out2
);
    // Two-port synchronous memory sharing one storage array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**`aFIFOSIZEWIDTH words, each `aFIFOWIDTH bits wide.
    reg [`aFIFOWIDTH - 1 : 0] mem [2**`aFIFOSIZEWIDTH - 1 : 0];

    // Registered read data, one register per port.
    reg [`aFIFOWIDTH - 1 : 0] rd_q1;
    reg [`aFIFOWIDTH - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule
//---------------------------------------
// A dual-port RAM mFIFO
// This module is tuned for VTR's benchmarks
//---------------------------------------
module dual_port_ram_mfifo (
    input clk,
    input we1,
    input we2,
    input [`mFIFOSIZEWIDTH - 1 : 0] addr1,
    input [`mFIFOWIDTH - 1 : 0] data1,
    output [`mFIFOWIDTH - 1 : 0] out1,
    input [`mFIFOSIZEWIDTH - 1 : 0] addr2,
    input [`mFIFOWIDTH - 1 : 0] data2,
    output [`mFIFOWIDTH - 1 : 0] out2
);
    // Two-port synchronous memory sharing one storage array.
    // Each port either writes (weN high) or performs a registered read
    // (weN low) on every rising clock edge; a port's output register keeps
    // its previous value during that port's write cycles.
    // No arbitration is implemented for simultaneous writes from both ports
    // to the same address.

    // Storage: 2**`mFIFOSIZEWIDTH words, each `mFIFOWIDTH bits wide.
    reg [`mFIFOWIDTH - 1 : 0] mem [2**`mFIFOSIZEWIDTH - 1 : 0];

    // Registered read data, one register per port.
    reg [`mFIFOWIDTH - 1 : 0] rd_q1;
    reg [`mFIFOWIDTH - 1 : 0] rd_q2;

    // Port 1: write data1 at addr1 when we1 is set, otherwise read addr1.
    always @(posedge clk) begin
        if (we1)
            mem[addr1] <= data1;
        else
            rd_q1 <= mem[addr1];
    end

    // Port 2: write data2 at addr2 when we2 is set, otherwise read addr2.
    always @(posedge clk) begin
        if (we2)
            mem[addr2] <= data2;
        else
            rd_q2 <= mem[addr2];
    end

    // Drive the module outputs from the read registers.
    assign out1 = rd_q1;
    assign out2 = rd_q2;
endmodule