Merge pull request #669 from lnis-uofu/runtime_membank

Reduce Peak Memory Usage of Fabric Bitstream Database
This commit is contained in:
tangxifan 2022-05-25 21:17:58 +08:00 committed by GitHub
commit b824f94751
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 232 additions and 22 deletions

View File

@ -71,7 +71,8 @@ std::vector<char> FabricBitstream::bit_address(const FabricBitId& bit_id) const
VTR_ASSERT(true == valid_bit_id(bit_id));
VTR_ASSERT(true == use_address_);
return bit_addresses_[bit_id];
/* Decode address bits */
return decode_address_bits(bit_address_1bits_[bit_id], bit_address_xbits_[bit_id]);
}
std::vector<char> FabricBitstream::bit_bl_address(const FabricBitId& bit_id) const {
@ -84,7 +85,7 @@ std::vector<char> FabricBitstream::bit_wl_address(const FabricBitId& bit_id) con
VTR_ASSERT(true == use_address_);
VTR_ASSERT(true == use_wl_address_);
return bit_wl_addresses_[bit_id];
return decode_wl_address_bits(bit_wl_address_1bits_[bit_id], bit_wl_address_xbits_[bit_id]);
}
char FabricBitstream::bit_din(const FabricBitId& bit_id) const {
@ -110,11 +111,13 @@ void FabricBitstream::reserve_bits(const size_t& num_bits) {
config_bit_ids_.reserve(num_bits);
if (true == use_address_) {
bit_addresses_.reserve(num_bits);
bit_address_1bits_.reserve(num_bits);
bit_address_xbits_.reserve(num_bits);
bit_dins_.reserve(num_bits);
if (true == use_wl_address_) {
bit_wl_addresses_.reserve(num_bits);
bit_wl_address_1bits_.reserve(num_bits);
bit_wl_address_xbits_.reserve(num_bits);
}
}
}
@ -126,15 +129,16 @@ FabricBitId FabricBitstream::add_bit(const ConfigBitId& config_bit_id) {
config_bit_ids_.push_back(config_bit_id);
if (true == use_address_) {
bit_addresses_.emplace_back();
bit_address_1bits_.emplace_back();
bit_address_xbits_.emplace_back();
bit_dins_.emplace_back();
if (true == use_wl_address_) {
bit_wl_addresses_.emplace_back();
bit_wl_address_1bits_.emplace_back();
bit_wl_address_xbits_.emplace_back();
}
}
return bit;
}
@ -148,7 +152,9 @@ void FabricBitstream::set_bit_address(const FabricBitId& bit_id,
} else {
VTR_ASSERT(address_length_ == address.size());
}
bit_addresses_[bit_id] = address;
/* Encode bit '1' and bit 'x' into two numbers */
bit_address_1bits_[bit_id] = encode_address_1bits(address);
bit_address_xbits_[bit_id] = encode_address_xbits(address);
}
void FabricBitstream::set_bit_bl_address(const FabricBitId& bit_id,
@ -168,7 +174,9 @@ void FabricBitstream::set_bit_wl_address(const FabricBitId& bit_id,
} else {
VTR_ASSERT(wl_address_length_ == address.size());
}
bit_wl_addresses_[bit_id] = address;
/* Encode bit '1' and bit 'x' into two numbers */
bit_wl_address_1bits_[bit_id] = encode_address_1bits(address);
bit_wl_address_xbits_[bit_id] = encode_address_xbits(address);
}
void FabricBitstream::set_bit_din(const FabricBitId& bit_id,
@ -233,11 +241,13 @@ void FabricBitstream::reverse() {
std::reverse(config_bit_ids_.begin(), config_bit_ids_.end());
if (true == use_address_) {
std::reverse(bit_addresses_.begin(), bit_addresses_.end());
std::reverse(bit_address_1bits_.begin(), bit_address_1bits_.end());
std::reverse(bit_address_xbits_.begin(), bit_address_xbits_.end());
std::reverse(bit_dins_.begin(), bit_dins_.end());
if (true == use_wl_address_) {
std::reverse(bit_wl_addresses_.begin(), bit_wl_addresses_.end());
std::reverse(bit_wl_address_1bits_.begin(), bit_wl_address_1bits_.end());
std::reverse(bit_wl_address_xbits_.begin(), bit_wl_address_xbits_.end());
}
}
}
@ -259,4 +269,60 @@ bool FabricBitstream::valid_region_id(const FabricBitRegionId& region_id) const
return (size_t(region_id) < num_regions_);
}
size_t FabricBitstream::encode_address_1bits(const std::vector<char>& address) const {
  /* Work on a private copy of the address in which every 'x' bit
   * is replaced by '0'; all the other characters are kept as they are
   */
  std::vector<char> ones_only(address.begin(), address.end());
  std::replace(ones_only.begin(), ones_only.end(), 'x', '0');
  /* Turn the resulting binary vector into a single number */
  return bintoi_charvec(ones_only);
}
size_t FabricBitstream::encode_address_xbits(const std::vector<char>& address) const {
  /* Build a mask vector from the address:
   * - each 'x' bit becomes '1'
   * - each '1' bit becomes '0'
   * - any other character is copied unchanged
   */
  std::vector<char> x_mask;
  x_mask.reserve(address.size());
  for (const char& addr_bit : address) {
    switch (addr_bit) {
      case 'x':
        x_mask.push_back('1');
        break;
      case '1':
        x_mask.push_back('0');
        break;
      default:
        x_mask.push_back(addr_bit);
        break;
    }
  }
  /* Turn the mask vector into a single number */
  return bintoi_charvec(x_mask);
}
std::vector<char> FabricBitstream::decode_address_bits(const size_t& bit1, const size_t& bitx) const {
  /* Expand the bit-one number into a binary vector of address_length_ bits */
  std::vector<char> address = itobin_charvec(bit1, address_length_);
  /* Expand the bit-x number the same way; a '1' here marks an 'x' position */
  const std::vector<char> x_mask = itobin_charvec(bitx, address_length_);
  /* Overlay the mask on the address: wherever the mask holds '1',
   * the address bit becomes 'x' regardless of its current value
   */
  size_t pos = 0;
  for (char& addr_bit : address) {
    if ('1' == x_mask[pos]) {
      addr_bit = 'x';
    }
    ++pos;
  }
  return address;
}
std::vector<char> FabricBitstream::decode_wl_address_bits(const size_t& bit1, const size_t& bitx) const {
  /* Expand the bit-one number into a binary vector of wl_address_length_ bits */
  std::vector<char> wl_address = itobin_charvec(bit1, wl_address_length_);
  /* Expand the bit-x number the same way; a '1' here marks an 'x' position */
  const std::vector<char> x_mask = itobin_charvec(bitx, wl_address_length_);
  /* Overlay the mask on the address: wherever the mask holds '1',
   * the address bit becomes 'x' regardless of its current value
   */
  size_t pos = 0;
  for (char& addr_bit : wl_address) {
    if ('1' == x_mask[pos]) {
      addr_bit = 'x';
    }
    ++pos;
  }
  return wl_address;
}
} /* end namespace openfpga */

View File

@ -187,6 +187,12 @@ class FabricBitstream {
bool valid_bit_id(const FabricBitId& bit_id) const;
bool valid_region_id(const FabricBitRegionId& bit_id) const;
private: /* Private APIs */
size_t encode_address_1bits(const std::vector<char>& address) const;
size_t encode_address_xbits(const std::vector<char>& address) const;
std::vector<char> decode_address_bits(const size_t& bit1, const size_t& bitx) const;
std::vector<char> decode_wl_address_bits(const size_t& bit1, const size_t& bitx) const;
private: /* Internal data */
/* Unique id of a region in the Bitstream */
size_t num_regions_;
@ -206,17 +212,27 @@ class FabricBitstream {
size_t wl_address_length_;
/* Address bits: this is designed for memory decoders
* Here we store the binary format of the address, which can be loaded
* Here we store the encoded format of the address, which is decoded into the binary format that can be loaded
* to the configuration protocol directly
*
* We use a 2-element array, as we may have a BL address and a WL address
* Encoding strategy is as follows:
* - An address is a string of bits, each of which may be '0', '1' or 'x'. For example
* 101x1
* - The string can be encoded into two integer numbers:
* - bit-one number: which encodes the '0' and '1' bits into a number. For example,
* 101x1 -> 10101 -> 21
* - bit-x number: which encodes the 'x' bits into a number. For example,
* 101x1 -> 00010 -> 2
*
* TODO: use nested vector may cause large memory footprint
* when bitstream size increases
* NEED TO THINK ABOUT A COMPACT MODELING
* TODO: There is a limitation here: when the length of the address vector is more than 64,
* a size_t number overflows (it cannot represent a binary number wider than 64 bits).
* This can easily happen even in a medium-sized FPGA.
* A solution could be to use multiple size_t numbers per address. But clearly, we should not use a vector of vectors, which causes a large memory overhead!
*/
vtr::vector<FabricBitId, std::vector<char>> bit_addresses_;
vtr::vector<FabricBitId, std::vector<char>> bit_wl_addresses_;
vtr::vector<FabricBitId, size_t> bit_address_1bits_;
vtr::vector<FabricBitId, size_t> bit_address_xbits_;
vtr::vector<FabricBitId, size_t> bit_wl_address_1bits_;
vtr::vector<FabricBitId, size_t> bit_wl_address_xbits_;
/* Data input (Din) bits: this is designed for memory decoders */
vtr::vector<FabricBitId, char> bit_dins_;

View File

@ -445,6 +445,7 @@ int write_fabric_bitstream_to_text_file(const BitstreamManager& bitstream_manage
status = write_memory_bank_shift_register_fabric_bitstream_to_text_file(fp,
apply_fast_configuration,
bit_value_to_skip,
fabric_bitstream,
blwl_sr_banks,
keep_dont_care_bits);

View File

@ -10,6 +10,7 @@
/* Headers from vtrutil library */
#include "vtr_assert.h"
#include "vtr_log.h"
#include "vtr_time.h"
/* Headers from openfpgautil library */
#include "openfpga_reserved_words.h"
@ -506,6 +507,7 @@ MemoryBankShiftRegisterFabricBitstream build_memory_bank_shift_register_fabric_b
const bool& fast_configuration,
const bool& bit_value_to_skip,
const char& dont_care_bit) {
vtr::ScopedStartFinishTimer timer("Reshape fabric bitstream for memory bank using shift registers");
MemoryBankFlattenFabricBitstream raw_fabric_bits = build_memory_bank_flatten_fabric_bitstream(fabric_bitstream, fast_configuration, bit_value_to_skip, dont_care_bit);
MemoryBankShiftRegisterFabricBitstream fabric_bits;

View File

@ -42,7 +42,8 @@ build_fabric_bitstream --verbose
# Write fabric-dependent bitstream
write_fabric_bitstream --file fabric_bitstream.txt --format plain_text
write_fabric_bitstream --file fabric_bitstream.xml --format xml
# Skip this because the XML file is too large for GitHub runners
#write_fabric_bitstream --file fabric_bitstream.xml --format xml
# Finish and exit OpenFPGA
exit

View File

@ -9,13 +9,16 @@ PYTHON_EXEC=python3.8
echo -e "FPGA-Bitstream regression tests";
echo -e "Testing bitstream generation for an auto-sized device";
run-task fpga_bitstream/generate_bitstream/device_auto $@
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_auto $@
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_auto $@
echo -e "Testing bitstream generation for an 48x48 FPGA device";
run-task fpga_bitstream/generate_bitstream/device_48x48 $@
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_48x48 $@
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_48x48 $@
echo -e "Testing bitstream generation for an 96x96 FPGA device";
run-task fpga_bitstream/generate_bitstream/device_96x96 $@
run-task fpga_bitstream/generate_bitstream/configuration_chain/device_96x96 $@
run-task fpga_bitstream/generate_bitstream/ql_memory_bank_shift_register/device_96x96 $@
echo -e "Testing loading architecture bitstream from an external file";
run-task fpga_bitstream/load_external_architecture_bitstream $@

View File

@ -0,0 +1,38 @@
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Configuration file for running experiments
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# timeout_each_job : The FPGA task script splits the FPGA flow into multiple jobs
# Each job executes the fpga_flow script on a combination of architecture & benchmark
# timeout_each_job is the timeout for each job
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
[GENERAL]
run_engine=openfpga_shell
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
power_analysis = false
spice_output=false
verilog_output=true
# Runtime of this bitstream generation should not exceed 3 minutes as a QoR requirement
timeout_each_job = 3*60
fpga_flow=yosys_vpr
[OpenFPGA_SHELL]
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_fix_device_example_script.openfpga
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/fixed_sim_openfpga.xml
# VPR parameters
openfpga_vpr_route_chan_width=50
openfpga_vpr_device_layout=48x48
[ARCHITECTURES]
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
[BENCHMARKS]
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/SAPone/rtl/*
[SYNTHESIS_PARAM]
# Yosys script parameters
bench_read_verilog_options_common = -nolatches
bench0_top = SAPone
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]

View File

@ -0,0 +1,36 @@
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Configuration file for running experiments
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# timeout_each_job : The FPGA task script splits the FPGA flow into multiple jobs
# Each job executes the fpga_flow script on a combination of architecture & benchmark
# timeout_each_job is the timeout for each job
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
[GENERAL]
run_engine=openfpga_shell
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
power_analysis = false
spice_output=false
verilog_output=true
# Runtime of this bitstream generation should not exceed 6 minutes as a QoR requirement
timeout_each_job = 6*60
fpga_flow=yosys_vpr
[OpenFPGA_SHELL]
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_fix_device_example_script.openfpga
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/fixed_sim_openfpga.xml
openfpga_vpr_route_chan_width=100
openfpga_vpr_device_layout=96x96
[ARCHITECTURES]
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
[BENCHMARKS]
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/RISC_posedge_clk/rtl/*.v
[SYNTHESIS_PARAM]
bench_read_verilog_options_common = -nolatches
bench0_top = RISC_core_top
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]

View File

@ -0,0 +1,33 @@
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Configuration file for running experiments
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# timeout_each_job : The FPGA task script splits the FPGA flow into multiple jobs
# Each job executes the fpga_flow script on a combination of architecture & benchmark
# timeout_each_job is the timeout for each job
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
[GENERAL]
run_engine=openfpga_shell
power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
power_analysis = true
spice_output=false
verilog_output=true
timeout_each_job = 20*60
fpga_flow=yosys_vpr
[OpenFPGA_SHELL]
openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/generate_bitstream_example_script.openfpga
openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_openfpga.xml
openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
[ARCHITECTURES]
arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
[BENCHMARKS]
bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
[SYNTHESIS_PARAM]
bench_read_verilog_options_common = -nolatches
bench0_top = and2
[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]

View File

@ -84,6 +84,20 @@
<!--Fill with 'clb'-->
<fill type="clb" priority="10"/>
</fixed_layout>
<fixed_layout name="48x48" width="50" height="50">
<!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
<perimeter type="io" priority="100"/>
<corners type="EMPTY" priority="101"/>
<!--Fill with 'clb'-->
<fill type="clb" priority="10"/>
</fixed_layout>
<fixed_layout name="96x96" width="98" height="98">
<!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
<perimeter type="io" priority="100"/>
<corners type="EMPTY" priority="101"/>
<!--Fill with 'clb'-->
<fill type="clb" priority="10"/>
</fixed_layout>
</layout>
<device>
<!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM