Merge pull request #669 from lnis-uofu/runtime_membank
Reduce Peak Memory Usage of Fabric Bitstream Database
This commit is contained in:
commit
b824f94751
|
@ -71,7 +71,8 @@ std::vector<char> FabricBitstream::bit_address(const FabricBitId& bit_id) const
|
|||
VTR_ASSERT(true == valid_bit_id(bit_id));
|
||||
VTR_ASSERT(true == use_address_);
|
||||
|
||||
return bit_addresses_[bit_id];
|
||||
/* Decode address bits */
|
||||
return decode_address_bits(bit_address_1bits_[bit_id], bit_address_xbits_[bit_id]);
|
||||
}
|
||||
|
||||
std::vector<char> FabricBitstream::bit_bl_address(const FabricBitId& bit_id) const {
|
||||
|
@ -84,7 +85,7 @@ std::vector<char> FabricBitstream::bit_wl_address(const FabricBitId& bit_id) con
|
|||
VTR_ASSERT(true == use_address_);
|
||||
VTR_ASSERT(true == use_wl_address_);
|
||||
|
||||
return bit_wl_addresses_[bit_id];
|
||||
return decode_wl_address_bits(bit_wl_address_1bits_[bit_id], bit_wl_address_xbits_[bit_id]);
|
||||
}
|
||||
|
||||
char FabricBitstream::bit_din(const FabricBitId& bit_id) const {
|
||||
|
@ -110,11 +111,13 @@ void FabricBitstream::reserve_bits(const size_t& num_bits) {
|
|||
config_bit_ids_.reserve(num_bits);
|
||||
|
||||
if (true == use_address_) {
|
||||
bit_addresses_.reserve(num_bits);
|
||||
bit_address_1bits_.reserve(num_bits);
|
||||
bit_address_xbits_.reserve(num_bits);
|
||||
bit_dins_.reserve(num_bits);
|
||||
|
||||
if (true == use_wl_address_) {
|
||||
bit_wl_addresses_.reserve(num_bits);
|
||||
bit_wl_address_1bits_.reserve(num_bits);
|
||||
bit_wl_address_xbits_.reserve(num_bits);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -126,15 +129,16 @@ FabricBitId FabricBitstream::add_bit(const ConfigBitId& config_bit_id) {
|
|||
config_bit_ids_.push_back(config_bit_id);
|
||||
|
||||
if (true == use_address_) {
|
||||
bit_addresses_.emplace_back();
|
||||
bit_address_1bits_.emplace_back();
|
||||
bit_address_xbits_.emplace_back();
|
||||
bit_dins_.emplace_back();
|
||||
|
||||
if (true == use_wl_address_) {
|
||||
bit_wl_addresses_.emplace_back();
|
||||
bit_wl_address_1bits_.emplace_back();
|
||||
bit_wl_address_xbits_.emplace_back();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return bit;
|
||||
}
|
||||
|
||||
|
@ -148,7 +152,9 @@ void FabricBitstream::set_bit_address(const FabricBitId& bit_id,
|
|||
} else {
|
||||
VTR_ASSERT(address_length_ == address.size());
|
||||
}
|
||||
bit_addresses_[bit_id] = address;
|
||||
/* Encode bit '1' and bit 'x' into two numbers */
|
||||
bit_address_1bits_[bit_id] = encode_address_1bits(address);
|
||||
bit_address_xbits_[bit_id] = encode_address_xbits(address);
|
||||
}
|
||||
|
||||
void FabricBitstream::set_bit_bl_address(const FabricBitId& bit_id,
|
||||
|
@ -168,7 +174,9 @@ void FabricBitstream::set_bit_wl_address(const FabricBitId& bit_id,
|
|||
} else {
|
||||
VTR_ASSERT(wl_address_length_ == address.size());
|
||||
}
|
||||
bit_wl_addresses_[bit_id] = address;
|
||||
/* Encode bit '1' and bit 'x' into two numbers */
|
||||
bit_wl_address_1bits_[bit_id] = encode_address_1bits(address);
|
||||
bit_wl_address_xbits_[bit_id] = encode_address_xbits(address);
|
||||
}
|
||||
|
||||
void FabricBitstream::set_bit_din(const FabricBitId& bit_id,
|
||||
|
@ -233,11 +241,13 @@ void FabricBitstream::reverse() {
|
|||
std::reverse(config_bit_ids_.begin(), config_bit_ids_.end());
|
||||
|
||||
if (true == use_address_) {
|
||||
std::reverse(bit_addresses_.begin(), bit_addresses_.end());
|
||||
std::reverse(bit_address_1bits_.begin(), bit_address_1bits_.end());
|
||||
std::reverse(bit_address_xbits_.begin(), bit_address_xbits_.end());
|
||||
std::reverse(bit_dins_.begin(), bit_dins_.end());
|
||||
|
||||
if (true == use_wl_address_) {
|
||||
std::reverse(bit_wl_addresses_.begin(), bit_wl_addresses_.end());
|
||||
std::reverse(bit_wl_address_1bits_.begin(), bit_wl_address_1bits_.end());
|
||||
std::reverse(bit_wl_address_xbits_.begin(), bit_wl_address_xbits_.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -259,4 +269,60 @@ bool FabricBitstream::valid_region_id(const FabricBitRegionId& region_id) const
|
|||
return (size_t(region_id) < num_regions_);
|
||||
}
|
||||
|
||||
size_t FabricBitstream::encode_address_1bits(const std::vector<char>& address) const {
|
||||
/* Convert all the 'x' bit into 0 */
|
||||
std::vector<char> binary_address = address;
|
||||
for (char& bit : binary_address) {
|
||||
if (bit == 'x') {
|
||||
bit = '0';
|
||||
}
|
||||
}
|
||||
/* Convert the binary address to a number */
|
||||
return bintoi_charvec(binary_address);
|
||||
}
|
||||
|
||||
size_t FabricBitstream::encode_address_xbits(const std::vector<char>& address) const {
|
||||
/* Convert all the '1' bit into 0 and Convert all the 'x' bit into 1 */
|
||||
std::vector<char> binary_address = address;
|
||||
for (char& bit : binary_address) {
|
||||
if (bit == '1') {
|
||||
bit = '0';
|
||||
}
|
||||
if (bit == 'x') {
|
||||
bit = '1';
|
||||
}
|
||||
}
|
||||
/* Convert the binary address to a number */
|
||||
return bintoi_charvec(binary_address);
|
||||
}
|
||||
|
||||
std::vector<char> FabricBitstream::decode_address_bits(const size_t& bit1, const size_t& bitx) const {
|
||||
/* Decode the bit1 number to a binary vector */
|
||||
std::vector<char> ret_vec = itobin_charvec(bit1, address_length_);
|
||||
/* Decode the bitx number to a binary vector */
|
||||
std::vector<char> bitx_vec = itobin_charvec(bitx, address_length_);
|
||||
/* Combine the two vectors: 'x' overwrite any bit '0' and '1' */
|
||||
for (size_t ibit = 0; ibit < ret_vec.size(); ++ibit) {
|
||||
if (bitx_vec[ibit] == '1') {
|
||||
ret_vec[ibit] = 'x';
|
||||
}
|
||||
}
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
std::vector<char> FabricBitstream::decode_wl_address_bits(const size_t& bit1, const size_t& bitx) const {
|
||||
/* Decode the bit1 number to a binary vector */
|
||||
std::vector<char> ret_vec = itobin_charvec(bit1, wl_address_length_);
|
||||
/* Decode the bitx number to a binary vector */
|
||||
std::vector<char> bitx_vec = itobin_charvec(bitx, wl_address_length_);
|
||||
/* Combine the two vectors: 'x' overwrite any bit '0' and '1' */
|
||||
for (size_t ibit = 0; ibit < ret_vec.size(); ++ibit) {
|
||||
if (bitx_vec[ibit] == '1') {
|
||||
ret_vec[ibit] = 'x';
|
||||
}
|
||||
}
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
|
||||
} /* end namespace openfpga */
|
||||
|
|
|
@ -187,6 +187,12 @@ class FabricBitstream {
|
|||
bool valid_bit_id(const FabricBitId& bit_id) const;
|
||||
bool valid_region_id(const FabricBitRegionId& bit_id) const;
|
||||
|
||||
private: /* Private APIs */
|
||||
size_t encode_address_1bits(const std::vector<char>& address) const;
|
||||
size_t encode_address_xbits(const std::vector<char>& address) const;
|
||||
std::vector<char> decode_address_bits(const size_t& bit1, const size_t& bitx) const;
|
||||
std::vector<char> decode_wl_address_bits(const size_t& bit1, const size_t& bitx) const;
|
||||
|
||||
private: /* Internal data */
|
||||
/* Unique id of a region in the Bitstream */
|
||||
size_t num_regions_;
|
||||
|
@ -206,17 +212,27 @@ class FabricBitstream {
|
|||
size_t wl_address_length_;
|
||||
|
||||
/* Address bits: this is designed for memory decoders
|
||||
* Here we store the binary format of the address, which can be loaded
|
||||
* Here we store the encoded format of the address, which is decoded to the binary format that can be loaded
|
||||
* to the configuration protocol directly
|
||||
*
|
||||
* We use a 2-element array, as we may have a BL address and a WL address
|
||||
* Encoding strategy is as follows:
|
||||
* - An address bit which may contain '0', '1', 'x'. For example
|
||||
* 101x1
|
||||
* - The string can be encoded into two integer numbers:
|
||||
* - bit-one number: which encodes the '0' and '1' bits into a number. For example,
|
||||
* 101x1 -> 10101 -> 21
|
||||
* - bit-x number: which encodes the 'x' bits into a number. For example,
|
||||
* 101x1 -> 00010 -> 2
|
||||
*
|
||||
* TODO: use nested vector may cause large memory footprint
|
||||
* when bitstream size increases
|
||||
* NEED TO THINK ABOUT A COMPACT MODELING
|
||||
* TODO: There is a limitation here: when the length of the address vector is more than 64,
|
||||
* a size_t number overflows (it cannot represent a binary number wider than 64 bits).
|
||||
* This can easily happen even in a medium-sized FPGA.
|
||||
* A solution could be to use multiple size_t numbers. But clearly, we should not use a vector of vectors, which causes large memory overhead!
|
||||
*/
|
||||
vtr::vector<FabricBitId, std::vector<char>> bit_addresses_;
|
||||
vtr::vector<FabricBitId, std::vector<char>> bit_wl_addresses_;
|
||||
vtr::vector<FabricBitId, size_t> bit_address_1bits_;
|
||||
vtr::vector<FabricBitId, size_t> bit_address_xbits_;
|
||||
vtr::vector<FabricBitId, size_t> bit_wl_address_1bits_;
|
||||
vtr::vector<FabricBitId, size_t> bit_wl_address_xbits_;
|
||||
|
||||
/* Data input (Din) bits: this is designed for memory decoders */
|
||||
vtr::vector<FabricBitId, char> bit_dins_;
|
||||
|
|
|
@ -445,6 +445,7 @@ int write_fabric_bitstream_to_text_file(const BitstreamManager& bitstream_manage
|
|||
status = write_memory_bank_shift_register_fabric_bitstream_to_text_file(fp,
|
||||
apply_fast_configuration,
|
||||
bit_value_to_skip,
|
||||
|
||||
fabric_bitstream,
|
||||
blwl_sr_banks,
|
||||
keep_dont_care_bits);
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
/* Headers from vtrutil library */
|
||||
#include "vtr_assert.h"
|
||||
#include "vtr_log.h"
|
||||
#include "vtr_time.h"
|
||||
|
||||
/* Headers from openfpgautil library */
|
||||
#include "openfpga_reserved_words.h"
|
||||
|
@ -506,6 +507,7 @@ MemoryBankShiftRegisterFabricBitstream build_memory_bank_shift_register_fabric_b
|
|||
const bool& fast_configuration,
|
||||
const bool& bit_value_to_skip,
|
||||
const char& dont_care_bit) {
|
||||
vtr::ScopedStartFinishTimer timer("Reshape fabric bitstream for memory bank using shift registers");
|
||||
MemoryBankFlattenFabricBitstream raw_fabric_bits = build_memory_bank_flatten_fabric_bitstream(fabric_bitstream, fast_configuration, bit_value_to_skip, dont_care_bit);
|
||||
MemoryBankShiftRegisterFabricBitstream fabric_bits;
|
||||
|
||||
|
|
|
@ -42,7 +42,8 @@ build_fabric_bitstream --verbose
|
|||
|
||||
# Write fabric-dependent bitstream
|
||||
write_fabric_bitstream --file fabric_bitstream.txt --format plain_text
|
||||
write_fabric_bitstream --file fabric_bitstream.xml --format xml
|
||||
# Skip this because the XML file is too large to fit Github runners
|
||||
#write_fabric_bitstream --file fabric_bitstream.xml --format xml
|
||||
|
||||
# Finish and exit OpenFPGA
|
||||
exit
|
||||
|
|
|
@ -9,13 +9,16 @@ PYTHON_EXEC=python3.8
|
|||
echo -e "FPGA-Bitstream regression tests";
|
||||
|
||||
echo -e "Testing bitstream generation for an auto-sized device";
|
||||
run-task fpga_bitstream/generate_bitstream/device_auto $@
|
||||
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_auto $@
|
||||
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_auto $@
|
||||
|
||||
echo -e "Testing bitstream generation for an 48x48 FPGA device";
|
||||
run-task fpga_bitstream/generate_bitstream/device_48x48 $@
|
||||
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_48x48 $@
|
||||
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_48x48 $@
|
||||
|
||||
echo -e "Testing bitstream generation for an 96x96 FPGA device";
|
||||
run-task fpga_bitstream/generate_bitstream/device_96x96 $@
|
||||
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_96x96 $@
|
||||
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_96x96 $@
|
||||
|
||||
echo -e "Testing loading architecture bitstream from an external file";
|
||||
run-task fpga_bitstream/load_external_architecture_bitstream $@
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# Configuration file for running experiments
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
|
||||
# Each job executes the fpga_flow script on a combination of architecture & benchmark
|
||||
# timeout_each_job is timeout for each job
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
|
||||
[GENERAL]
|
||||
run_engine=openfpga_shell
|
||||
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
|
||||
power_analysis = false
|
||||
spice_output=false
|
||||
verilog_output=true
|
||||
# Runtime of this bitstream generation should not exceed 3 minutes as a QoR requirement
|
||||
timeout_each_job = 3*60
|
||||
fpga_flow=yosys_vpr
|
||||
|
||||
[OpenFPGA_SHELL]
|
||||
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_fix_device_example_script.openfpga
|
||||
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
|
||||
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/fixed_sim_openfpga.xml
|
||||
# VPR parameters
|
||||
openfpga_vpr_route_chan_width=50
|
||||
openfpga_vpr_device_layout=48x48
|
||||
|
||||
[ARCHITECTURES]
|
||||
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
|
||||
|
||||
[BENCHMARKS]
|
||||
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/SAPone/rtl/*
|
||||
|
||||
[SYNTHESIS_PARAM]
|
||||
# Yosys script parameters
|
||||
bench_read_verilog_options_common = -nolatches
|
||||
bench0_top = SAPone
|
||||
|
||||
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
|
|
@ -0,0 +1,36 @@
|
|||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# Configuration file for running experiments
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
|
||||
# Each job executes the fpga_flow script on a combination of architecture & benchmark
|
||||
# timeout_each_job is timeout for each job
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
|
||||
[GENERAL]
|
||||
run_engine=openfpga_shell
|
||||
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
|
||||
power_analysis = false
|
||||
spice_output=false
|
||||
verilog_output=true
|
||||
# Runtime of this bitstream generation should not exceed 6 minutes as a QoR requirement
|
||||
timeout_each_job = 6*60
|
||||
fpga_flow=yosys_vpr
|
||||
|
||||
[OpenFPGA_SHELL]
|
||||
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_fix_device_example_script.openfpga
|
||||
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
|
||||
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/fixed_sim_openfpga.xml
|
||||
openfpga_vpr_route_chan_width=100
|
||||
openfpga_vpr_device_layout=96x96
|
||||
|
||||
[ARCHITECTURES]
|
||||
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
|
||||
|
||||
[BENCHMARKS]
|
||||
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/RISC_posedge_clk/rtl/*.v
|
||||
|
||||
[SYNTHESIS_PARAM]
|
||||
bench_read_verilog_options_common = -nolatches
|
||||
bench0_top = RISC_core_top
|
||||
|
||||
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
|
|
@ -0,0 +1,33 @@
|
|||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# Configuration file for running experiments
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
|
||||
# Each job executes the fpga_flow script on a combination of architecture & benchmark
|
||||
# timeout_each_job is timeout for each job
|
||||
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
||||
|
||||
[GENERAL]
|
||||
run_engine=openfpga_shell
|
||||
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
|
||||
power_analysis = true
|
||||
spice_output=false
|
||||
verilog_output=true
|
||||
timeout_each_job = 20*60
|
||||
fpga_flow=yosys_vpr
|
||||
|
||||
[OpenFPGA_SHELL]
|
||||
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_example_script.openfpga
|
||||
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
|
||||
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
|
||||
|
||||
[ARCHITECTURES]
|
||||
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
|
||||
|
||||
[BENCHMARKS]
|
||||
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
|
||||
|
||||
[SYNTHESIS_PARAM]
|
||||
bench_read_verilog_options_common = -nolatches
|
||||
bench0_top = and2
|
||||
|
||||
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
|
|
@ -84,6 +84,20 @@
|
|||
<!--Fill with 'clb'-->
|
||||
<fill type="clb" priority="10"/>
|
||||
</fixed_layout>
|
||||
<fixed_layout name="48x48" width="50" height="50">
|
||||
<!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
|
||||
<perimeter type="io" priority="100"/>
|
||||
<corners type="EMPTY" priority="101"/>
|
||||
<!--Fill with 'clb'-->
|
||||
<fill type="clb" priority="10"/>
|
||||
</fixed_layout>
|
||||
<fixed_layout name="96x96" width="98" height="98">
|
||||
<!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
|
||||
<perimeter type="io" priority="100"/>
|
||||
<corners type="EMPTY" priority="101"/>
|
||||
<!--Fill with 'clb'-->
|
||||
<fill type="clb" priority="10"/>
|
||||
</fixed_layout>
|
||||
</layout>
|
||||
<device>
|
||||
<!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM
|
||||
|
|
Loading…
Reference in New Issue