[Engine] Fixed a critical bug in WL arrangement. Previously, we always considered the square root of a local tile's memory count. Now we apply a global optimization where the number of WLs is determined by the max. number of BLs per column

This commit is contained in:
tangxifan 2021-09-10 17:03:44 -07:00
parent 73d21c9730
commit 4af6413c97
7 changed files with 31 additions and 21 deletions

View File

@ -20,6 +20,7 @@
#include "openfpga_naming.h"
#include "memory_utils.h"
#include "memory_bank_utils.h"
#include "decoder_library_utils.h"
#include "module_manager_utils.h"
#include "build_decoder_modules.h"
@ -691,18 +692,17 @@ TopModuleNumConfigBits find_top_module_regional_num_config_bit(const ModuleManag
* - each column has independent BLs
*/
for (const ConfigRegionId& config_region : module_manager.regions(top_module)) {
std::map<int, size_t> num_bls;
std::map<int, size_t> num_wls;
for (size_t child_id = 0; child_id < module_manager.region_configurable_children(top_module, config_region).size(); ++child_id) {
ModuleId child_module = module_manager.region_configurable_children(top_module, config_region)[child_id];
vtr::Point<int> coord = module_manager.region_configurable_child_coordinates(top_module, config_region)[child_id];
num_bls[coord.x()] = std::max(num_bls[coord.x()], find_memory_decoder_data_size(find_module_num_config_bits(module_manager, child_module, circuit_lib, sram_model, config_protocol_type)));
num_wls[coord.y()] = std::max(num_wls[coord.y()], find_memory_wl_decoder_data_size(find_module_num_config_bits(module_manager, child_module, circuit_lib, sram_model, config_protocol_type)));
}
for (const auto& kv : num_bls) {
std::map<int, size_t> num_bls_per_tile = compute_memory_bank_regional_bitline_numbers_per_tile(module_manager, top_module,
config_region,
circuit_lib, sram_model);
std::map<int, size_t> num_wls_per_tile = compute_memory_bank_regional_wordline_numbers_per_tile(module_manager, top_module,
config_region,
circuit_lib, sram_model,
num_bls_per_tile);
for (const auto& kv : num_bls_per_tile) {
num_config_bits[config_region].first += kv.second;
}
for (const auto& kv : num_wls) {
for (const auto& kv : num_wls_per_tile) {
num_config_bits[config_region].second += kv.second;
}
}

View File

@ -290,7 +290,8 @@ void add_top_module_nets_cmos_ql_memory_bank_config_bus(ModuleManager& module_ma
circuit_lib, sram_model);
std::map<int, size_t> num_wls_per_tile = compute_memory_bank_regional_wordline_numbers_per_tile(module_manager, top_module,
config_region,
circuit_lib, sram_model);
circuit_lib, sram_model,
num_bls_per_tile);
std::map<int, size_t> bl_start_index_per_tile = compute_memory_bank_regional_blwl_start_index_per_tile(child_x_range, num_bls_per_tile);
std::map<int, size_t> wl_start_index_per_tile = compute_memory_bank_regional_blwl_start_index_per_tile(child_y_range, num_wls_per_tile);

View File

@ -243,7 +243,8 @@ void build_module_fabric_dependent_bitstream_ql_memory_bank(const ConfigProtocol
circuit_lib, config_protocol.memory_model());
std::map<int, size_t> num_wls_per_tile = compute_memory_bank_regional_wordline_numbers_per_tile(module_manager, top_module,
config_region,
circuit_lib, config_protocol.memory_model());
circuit_lib, config_protocol.memory_model(),
num_bls_per_tile);
std::map<int, size_t> bl_start_index_per_tile = compute_memory_bank_regional_blwl_start_index_per_tile(child_x_range, num_bls_per_tile);
std::map<int, size_t> wl_start_index_per_tile = compute_memory_bank_regional_blwl_start_index_per_tile(child_y_range, num_wls_per_tile);

View File

@ -96,12 +96,12 @@ size_t find_memory_decoder_data_size(const size_t& num_mems) {
* Considering the example of 203 memory cells again, when 15 BLs are used, we just need
* 203 / 15 = 13.5333 -> 14 WLs
***************************************************************************************/
size_t find_memory_wl_decoder_data_size(const size_t& num_mems) {
/* Handle exception: zero memory should have zero WLs */
if (0 == num_mems) {
size_t find_memory_wl_decoder_data_size(const size_t& num_mems, const size_t& num_bls) {
/* Handle exception: zero BLs should have zero WLs */
if (0 == num_bls) {
return 0;
}
return std::ceil(num_mems / (size_t)std::ceil(std::sqrt((float)num_mems)));
return std::ceil((float)num_mems / (float)num_bls);
}
/***************************************************************************************

View File

@ -17,7 +17,7 @@ size_t find_memory_decoder_addr_size(const size_t& num_mems);
size_t find_memory_decoder_data_size(const size_t& num_mems);
size_t find_memory_wl_decoder_data_size(const size_t& num_mems);
size_t find_memory_wl_decoder_data_size(const size_t& num_mems, const size_t& num_bls);
DecoderId add_mux_local_decoder_to_library(DecoderLibrary& decoder_lib,
const size_t data_size);

View File

@ -76,13 +76,14 @@ std::map<int, size_t> compute_memory_bank_regional_wordline_numbers_per_tile(con
const ModuleId& top_module,
const ConfigRegionId& config_region,
const CircuitLibrary& circuit_lib,
const CircuitModelId& sram_model) {
const CircuitModelId& sram_model,
const std::map<int, size_t>& num_bls_per_tile) {
std::map<int, size_t> num_wls_per_tile;
for (size_t child_id = 0; child_id < module_manager.region_configurable_children(top_module, config_region).size(); ++child_id) {
ModuleId child_module = module_manager.region_configurable_children(top_module, config_region)[child_id];
vtr::Point<int> coord = module_manager.region_configurable_child_coordinates(top_module, config_region)[child_id];
num_wls_per_tile[coord.y()] = std::max(num_wls_per_tile[coord.y()], find_memory_wl_decoder_data_size(find_module_num_config_bits(module_manager, child_module, circuit_lib, sram_model, CONFIG_MEM_QL_MEMORY_BANK)));
num_wls_per_tile[coord.y()] = std::max(num_wls_per_tile[coord.y()], find_memory_wl_decoder_data_size(find_module_num_config_bits(module_manager, child_module, circuit_lib, sram_model, CONFIG_MEM_QL_MEMORY_BANK), num_bls_per_tile.at(coord.x())));
}
return num_wls_per_tile;

View File

@ -53,14 +53,21 @@ std::map<int, size_t> compute_memory_bank_regional_bitline_numbers_per_tile(cons
/**
* @brief Precompute the number of word lines required by each tile under a specific configuration region
* @note
* Not every index in the range computed by the compute_memory_bank_regional_configurable_child_x_range() function has a postive number of word lines
* Not every index in the range computed by the compute_memory_bank_regional_configurable_child_y_range() function has a positive number of word lines
* If an empty entry is found (e.g., std::map::find(y) returns end()), it means that no word lines are required in that tile
* @note
* This function requires an input argument which describes the number of bit lines per tile. Based on this information, the number of word lines is inferred
* as the total number of memories / the number of bit lines at a given tile location (rounded up)
* This strategy is chosen because, in each column, the number of bit lines is bounded by the tile which consumes the most configuration bits. It may reduce
* the use of word lines. For example, tile[0][0] has only 8 bits, from which we may infer 3 BLs and 3 WLs. However, when tile[0][1] contains 100 bits,
* the number of BLs is forced to be 10. In such a case, tile[0][0] only requires 1 WL
*/
std::map<int, size_t> compute_memory_bank_regional_wordline_numbers_per_tile(const ModuleManager& module_manager,
const ModuleId& top_module,
const ConfigRegionId& config_region,
const CircuitLibrary& circuit_lib,
const CircuitModelId& sram_model);
const CircuitModelId& sram_model,
const std::map<int, size_t>& num_bls_per_tile);
/**
* @brief Precompute the BLs and WLs distribution across the FPGA fabric