[Arch] Update heterogenous architecture for vtr benchmark by adding mult36

This commit is contained in:
tangxifan 2021-03-20 18:04:59 -06:00
parent 1185f7b8bf
commit 911979a731
3 changed files with 104 additions and 1 deletions

View File

@ -206,6 +206,16 @@
<port type="output" prefix="data_out" size="8"/>
<port type="clock" prefix="clk" size="1" is_global="true" default_val="0"/>
</circuit_model>
<circuit_model type="hard_logic" name="mult_36x36" prefix="mult_36x36" spice_netlist="${OPENFPGA_PATH}/openfpga_flow/openfpga_cell_library/spice/mult_36x36.sp" verilog_netlist="${OPENFPGA_PATH}/openfpga_flow/openfpga_cell_library/verilog/mult_36x36.v">
<design_technology type="cmos"/>
<input_buffer exist="true" circuit_model_name="INVTX1"/>
<output_buffer exist="true" circuit_model_name="INVTX1"/>
<port type="input" prefix="A" lib_name="A" size="36"/>
<port type="input" prefix="B" lib_name="B" size="36"/>
<port type="output" prefix="Y" lib_name="out" size="72"/>
<!-- As a fracturable multiplier, it requires 2 configuration bits to operate in 4 different modes -->
<port type="sram" prefix="mode" size="2" mode_select="true" circuit_model_name="DFFR" default_val="1"/>
</circuit_model>
</circuit_library>
<configuration_protocol>
<organization type="scan_chain" circuit_model_name="DFFR"/>
@ -265,6 +275,10 @@
<pb_type name="clb.fle[n1_lut6].ble6.ff" physical_pb_type_name="clb.fle[physical].fabric.ff" physical_pb_type_index_factor="2" physical_pb_type_index_offset="0"/>
<!-- End physical pb_type binding in complex block clb -->
<!-- physical pb_type binding in complex block dsp -->
<pb_type name="mult_36" physical_mode_name="mult_36x36" idle_mode_name="mult_36x36"/>
<!-- Bind the primitive pb_type in the physical mode to a circuit model -->
<pb_type name="mult_36[mult_36x36].mult_36x36_slice.mult_36x36" circuit_model_name="mult_36x36" mode_bits="00"/>
<!-- physical pb_type binding in complex block memory -->
<pb_type name="memory[mem_1024x8_dp].mem_1024x8_dp" circuit_model_name="dpram_1024x8"/>

View File

@ -138,6 +138,15 @@
<port name="data_out" clock="clk"/>
</output_ports>
</model>
<model name="mult_36">
<input_ports>
<port name="A" combinational_sink_ports="Y"/>
<port name="B" combinational_sink_ports="Y"/>
</input_ports>
<output_ports>
<port name="Y"/>
</output_ports>
</model>
</models>
<tiles>
<tile name="io" capacity="8" area="0">
@ -196,6 +205,23 @@
<loc side="bottom">memory.waddr[9:5] memory.raddr[9:5] memory.data_in[7:4] memory.ren memory.data_out[7:4]</loc>
</pinlocations>
</tile>
<tile name="mult_36" height="6" area="396000">
<equivalent_sites>
<site pb_type="mult_36" pin_mapping="direct"/>
</equivalent_sites>
<input name="a" num_pins="36"/>
<input name="b" num_pins="36"/>
<output name="out" num_pins="72"/>
<fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
<!-- Highly recommand to customize pin location when direct connection is used!!! -->
<!-- pinlocations are designed to spread pin on 4 sides evenly -->
<pinlocations pattern="custom">
<loc side="left">mult_36.b[0:9] mult_36.b[10:35] mult_36.out[36:71]</loc>
<loc side="top"></loc>
<loc side="right">mult_36.a[0:9] mult_36.a[10:35] mult_36.out[0:35]</loc>
<loc side="bottom"></loc>
</pinlocations>
</tile>
</tiles>
<!-- ODIN II specific config ends -->
<!-- Physical descriptions begin -->
@ -208,6 +234,8 @@
<fill type="clb" priority="10"/>
<!--Column of 'memory' with 'EMPTY' blocks wherever a 'memory' does not fit. Vertical offset by 1 for perimeter.-->
<col type="memory" startx="2" starty="1" repeatx="8" priority="20"/>
<!--Column of 'mult_36' with 'EMPTY' blocks wherever a 'mult_36' does not fit. Vertical offset by 1 for perimeter.-->
<col type="mult_36" startx="6" starty="1" repeatx="8" priority="20"/>
<col type="EMPTY" startx="2" repeatx="8" starty="1" priority="19"/>
</auto_layout>
<fixed_layout name="3x2" width="5" height="4">
@ -686,6 +714,58 @@
</interconnect>
</pb_type>
<!-- Define general purpose logic block (CLB) ends -->
<!-- Define 36-bit multiplier begin -->
<pb_type name="mult_36">
<input name="a" num_pins="36"/>
<input name="b" num_pins="36"/>
<output name="out" num_pins="72"/>
<mode name="mult_36x36">
<pb_type name="mult_36x36_slice" num_pb="1">
<input name="A_cfg" num_pins="36"/>
<input name="B_cfg" num_pins="36"/>
<output name="OUT_cfg" num_pins="72"/>
<pb_type name="mult_36x36" blif_model=".subckt mult_36" num_pb="1">
<input name="A" num_pins="36"/>
<input name="B" num_pins="36"/>
<output name="Y" num_pins="72"/>
<delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_36x36.A" out_port="mult_36x36.Y"/>
<delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_36x36.B" out_port="mult_36x36.Y"/>
</pb_type>
<interconnect>
<direct name="a2a" input="mult_36x36_slice.A_cfg" output="mult_36x36.A">
</direct>
<direct name="b2b" input="mult_36x36_slice.B_cfg" output="mult_36x36.B">
</direct>
<direct name="out2out" input="mult_36x36.Y" output="mult_36x36_slice.OUT_cfg">
</direct>
</interconnect>
<power method="pin-toggle">
<port name="A_cfg" energy_per_toggle="2.13e-12"/>
<port name="B_cfg" energy_per_toggle="2.13e-12"/>
<static_power power_per_instance="0.0"/>
</power>
</pb_type>
<interconnect>
<!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier.
Subtract 72.5 ps delay, which is already in the connection block input mux, leading
to a 134 ps delay.
The interconnect difference for DSP blocks is 0.5523, which leads to a minimum delay of 74 ps
-->
<direct name="a2a" input="mult_36.a" output="mult_36x36_slice.A_cfg">
<delay_constant max="134e-12" min="74e-12" in_port="mult_36.a" out_port="mult_36x36_slice.A_cfg"/>
</direct>
<direct name="b2b" input="mult_36.b" output="mult_36x36_slice.B_cfg">
<delay_constant max="134e-12" min="74e-12" in_port="mult_36.b" out_port="mult_36x36_slice.B_cfg"/>
</direct>
<direct name="out2out" input="mult_36x36_slice.OUT_cfg" output="mult_36.out">
<delay_constant max="1.93e-9" min="74e-12" in_port="mult_36x36_slice.OUT_cfg" out_port="mult_36.out"/>
</direct>
</interconnect>
</mode>
<!-- Place this multiplier block every 8 columns from (and including) the sixth column -->
<power method="sum-of-children"/>
</pb_type>
<!-- Define fracturable multiplier end -->
<!-- Define single-mode dual-port memory begin -->
<pb_type name="memory">
<input name="waddr" num_pins="10"/>

View File

@ -193,7 +193,16 @@
<!--Column of 'memory' with 'EMPTY' blocks wherever a 'memory' does not fit. Vertical offset by 1 for perimeter.-->
<col type="memory" startx="16" starty="1" repeatx="16" priority="20"/>
<col type="EMPTY" startx="16" repeatx="16" starty="1" priority="19"/>
</auto_layout-->
</auto_layout>
<fixed_layout name="6x6" width="8" height="8">
<!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
<perimeter type="io" priority="100"/>
<corners type="EMPTY" priority="101"/>
<!--Fill with 'clb'-->
<fill type="clb" priority="10"/>
<!--Column of 'memory' with 'EMPTY' blocks wherever a 'memory' does not fit. Vertical offset by 1 for perimeter.-->
<col type="EMPTY" startx="16" repeatx="16" starty="1" priority="19"/>
</fixed_layout>
<!-- Apply a fixed layout of 2x2 core array.
VPR8 considers the I/O ring in the array size
Therefore the height and width are both 4