From 0d779160413f88bba4f3fd50e061fc80cc6ab6fa Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 10:43:11 -0600 Subject: [PATCH 01/11] [Tool] Support multi-region frame-based configuration protocol --- .../src/fabric/build_top_module_memory.cpp | 346 +++++++++++++++++- .../fpga_bitstream/build_device_bitstream.cpp | 5 +- .../fpga_bitstream/build_fabric_bitstream.cpp | 43 ++- .../fpga_verilog/verilog_top_testbench.cpp | 77 +++- 4 files changed, 440 insertions(+), 31 deletions(-) diff --git a/openfpga/src/fabric/build_top_module_memory.cpp b/openfpga/src/fabric/build_top_module_memory.cpp index 50a1a7f15..8a86d4d69 100644 --- a/openfpga/src/fabric/build_top_module_memory.cpp +++ b/openfpga/src/fabric/build_top_module_memory.cpp @@ -781,7 +781,12 @@ void add_top_module_sram_ports(ModuleManager& module_manager, BasicPort en_port(std::string(DECODER_ENABLE_PORT_NAME), 1); module_manager.add_port(module_id, en_port, ModuleManager::MODULE_INPUT_PORT); - BasicPort addr_port(std::string(DECODER_ADDRESS_PORT_NAME), total_num_config_bits); + size_t max_num_config_bits = 0; + for (const size_t& curr_num_config_bits : num_config_bits) { + max_num_config_bits = std::max(max_num_config_bits, curr_num_config_bits); + } + + BasicPort addr_port(std::string(DECODER_ADDRESS_PORT_NAME), max_num_config_bits); module_manager.add_port(module_id, addr_port, ModuleManager::MODULE_INPUT_PORT); BasicPort din_port(std::string(DECODER_DATA_IN_PORT_NAME), sram_port_size); @@ -1258,6 +1263,343 @@ void add_top_module_nets_cmos_memory_chain_config_bus(ModuleManager& module_mana } } +/******************************************************************** + * This function will create nets for the following types of connections: + * - Connect the enable signal to the EN of memory module + * - Connect the address port to the address port of memory module + * - Connect the data_in (Din) to the data_in of the memory module + * Note that the top-level module may have multiple regions and + * 
therefore the Din port have multiple pins. The Din of local decoder + * should be connected the Din pin indexed by current configuration region + * + * EN ADDR[X-1:0] DATA_IN[N-1:0] + * | | | + * | | | Top module + * +----+-----+------------+------------------ + * | | | | + * | v v v + * | +-------------------------------+ + * | | EN ADDR[X-1:0] DATA_IN[N-1:0] | + * | | | + * | | Configurable Child | + * | | | + * | +-------------------------------+ + * + * Note: + * - This function is ONLY applicable to single configurable child case!!! + * - This function is applicable to the configurable child in a specific region!!! + *********************************************************************/ +static +void add_top_module_nets_cmos_memory_frame_short_config_bus(ModuleManager& module_manager, + const ModuleId& top_module, + const ConfigRegionId& config_region) { + std::vector configurable_children = module_manager.region_configurable_children(top_module, config_region); + + VTR_ASSERT(1 == configurable_children.size()); + ModuleId child_module = configurable_children[0]; + size_t child_instance = module_manager.region_configurable_child_instances(top_module, config_region)[0]; + + /* Connect the enable (EN) port of the parent module + * to the EN port of memory module + */ + ModulePortId parent_en_port = module_manager.find_module_port(top_module, std::string(DECODER_ENABLE_PORT_NAME)); + ModulePortId child_en_port = module_manager.find_module_port(child_module, std::string(DECODER_ENABLE_PORT_NAME)); + add_module_bus_nets(module_manager, top_module, + top_module, 0, parent_en_port, + child_module, child_instance, child_en_port); + + /* Connect the address port of the parent module to the child module address port */ + ModulePortId parent_addr_port = module_manager.find_module_port(top_module, std::string(DECODER_ADDRESS_PORT_NAME)); + ModulePortId child_addr_port = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); + 
add_module_bus_nets(module_manager, top_module, + top_module, 0, parent_addr_port, + child_module, child_instance, child_addr_port); + + /* Connect the data_in (Din) of parent module to the data_in of the memory module + */ + ModulePortId parent_din_port = module_manager.find_module_port(top_module, std::string(DECODER_DATA_IN_PORT_NAME)); + BasicPort parent_din_port_info = module_manager.module_port(top_module, parent_din_port); + ModulePortId child_din_port = module_manager.find_module_port(child_module, std::string(DECODER_DATA_IN_PORT_NAME)); + BasicPort child_din_port_info = module_manager.module_port(child_module, child_din_port); + + /* Ensure pin indices are in range! */ + VTR_ASSERT(size_t(config_region) < parent_din_port_info.get_width()); + VTR_ASSERT(1 == child_din_port_info.get_width()); + + /* Create a net for the Din[config_region] pin */ + ModuleNetId din_net = create_module_source_pin_net(module_manager, top_module, + top_module, 0, + parent_din_port, + parent_din_port_info.pins()[size_t(config_region)]); + VTR_ASSERT(ModuleNetId::INVALID() != din_net); + + /* Configure the net sink */ + module_manager.add_module_net_sink(top_module, din_net, child_module, child_instance, child_din_port, child_din_port_info.pins()[0]); +} + +/******************************************************************** + * This function will + * - Add a frame decoder to the parent module + * - If the decoder exists in the library, we use the module + * - If the decoder does not exist, we create a new module and use it + * - Create nets for the following types of connections: + * - Connect the EN signal, first few bits of address of parent module + * to the frame decoder inputs + * Note that the top-level module may have more address bits than + * what is required for this configuration region. 
+ * A decoder will be created anyway to avoid address collision + * to other configuration regions + * - Connect the enable (EN) port of memory modules under the parent module + * to the frame decoder outputs + * - Connect the data_in (Din) of parent module to the data_in of the all + * the memory modules + * Note that the top-level module may have multiple regions and + * therefore the Din port have multiple pins. The Din of local decoder + * should be connected the Din pin indexed by current configuration region + * + * EN ADDR[X-1:0] DATA_IN[Y-1:0] + * | | | + * | | | Top module + * +--------+-------+------------+------------------ + * | | | + * | v v + * | EN ADDR[X - 1: X - log(N)/log2] + * | | | + * | v v + * | +--------------------------------------------+ + * | | Frame-based decoder | + * | | | + * | | Data out | + * | +--------------------------------------------+ + * | | + * | +-------------+--------------------+ + * | | | | + * | Din | Din | Din | + * | [Y] | [Y] | [Y] | + * | | | | | | | + * | v v v v v v + * | +--------+ +--------+ +--------+ + * | | Memory | | Memory | ... | Memory | + * | | Module | | Module | | Module | + * | | [0] | | [1] | | [N-1] | + * | +--------+ +--------+ +--------+ + * | ^ ^ ^ + * | | | | + * | +-------------+--------------------+ + * | | + * | ADDR[X - log(N)/log2 - 1: 0] + * + * Note: + * - X is the port size of address port of the parent module + * - the address port of child memory modules may be smaller than + * X - log(N)/log2. In such case, we will drop the MSBs until it fit + * - This function is only applicable to 2+ configurable children!!! 
+ * + *********************************************************************/ +static +void add_top_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& module_manager, + DecoderLibrary& decoder_lib, + const ModuleId& parent_module, + const ConfigRegionId& config_region) { + std::vector configurable_children = module_manager.region_configurable_children(parent_module, config_region); + std::vector configurable_child_instances = module_manager.region_configurable_child_instances(parent_module, config_region); + + /* Find the decoder specification */ + size_t addr_size = find_mux_local_decoder_addr_size(configurable_children.size()); + /* Data input should match the WL (data_in) of a SRAM */ + size_t data_size = configurable_children.size(); + + /* Find the number of address bits that are wired directly to configurable children */ + size_t max_child_addr_size = 0; + for (size_t mem_index = 0; mem_index < configurable_children.size(); ++mem_index) { + ModuleId child_module = configurable_children[mem_index]; + ModulePortId child_addr_port = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); + BasicPort child_addr_port_info = module_manager.module_port(child_module, child_addr_port); + max_child_addr_size = std::max(max_child_addr_size, child_addr_port_info.get_width()); + } + + /* Search the decoder library and try to find one + * If not found, create a new module and add it to the module manager + */ + DecoderId decoder_id = decoder_lib.find_decoder(addr_size, data_size, true, false, false); + if (DecoderId::INVALID() == decoder_id) { + decoder_id = decoder_lib.add_decoder(addr_size, data_size, true, false, false); + } + VTR_ASSERT(DecoderId::INVALID() != decoder_id); + + /* Create a module if not existed yet */ + std::string decoder_module_name = generate_memory_decoder_subckt_name(addr_size, data_size); + ModuleId decoder_module = module_manager.find_module(decoder_module_name); + if (ModuleId::INVALID() == 
decoder_module) { + decoder_module = build_frame_memory_decoder_module(module_manager, + decoder_lib, + decoder_id); + } + VTR_ASSERT(ModuleId::INVALID() != decoder_module); + + /* Instanciate the decoder module here */ + size_t decoder_instance = module_manager.num_instance(parent_module, decoder_module); + module_manager.add_child_module(parent_module, decoder_module); + + /* Connect the enable (EN) port of memory modules under the parent module + * to the frame decoder inputs + */ + ModulePortId parent_en_port = module_manager.find_module_port(parent_module, std::string(DECODER_ENABLE_PORT_NAME)); + ModulePortId decoder_en_port = module_manager.find_module_port(decoder_module, std::string(DECODER_ENABLE_PORT_NAME)); + add_module_bus_nets(module_manager, parent_module, + parent_module, 0, parent_en_port, + decoder_module, decoder_instance, decoder_en_port); + + /* Connect the address port of the parent module to the frame decoder address port + * Note that we only connect to the first few bits of address port + */ + ModulePortId parent_addr_port = module_manager.find_module_port(parent_module, std::string(DECODER_ADDRESS_PORT_NAME)); + ModulePortId decoder_addr_port = module_manager.find_module_port(decoder_module, std::string(DECODER_ADDRESS_PORT_NAME)); + BasicPort parent_addr_port_info = module_manager.module_port(parent_module, parent_addr_port); + BasicPort decoder_addr_port_info = module_manager.module_port(decoder_module, decoder_addr_port); + for (size_t ipin = 0; ipin < decoder_addr_port_info.get_width(); ++ipin) { + /* Create a net for the addr pin */ + ModuleNetId addr_net = create_module_source_pin_net(module_manager, parent_module, + parent_module, 0, + parent_addr_port, + parent_addr_port_info.pins()[ipin + max_child_addr_size]); + VTR_ASSERT(ModuleNetId::INVALID() != addr_net); + + /* Configure the net sink */ + module_manager.add_module_net_sink(parent_module, addr_net, + decoder_module, decoder_instance, + decoder_addr_port, + 
decoder_addr_port_info.pins()[ipin]); + } + + /* Connect the address port of the parent module to the address port of configurable children + * Note that we only connect to the last few bits of address port + */ + for (size_t mem_index = 0; mem_index < configurable_children.size(); ++mem_index) { + ModuleId child_module = configurable_children[mem_index]; + size_t child_instance = configurable_child_instances[mem_index]; + ModulePortId child_addr_port = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); + BasicPort child_addr_port_info = module_manager.module_port(child_module, child_addr_port); + for (size_t ipin = 0; ipin < child_addr_port_info.get_width(); ++ipin) { + ModuleNetId addr_net = create_module_source_pin_net(module_manager, parent_module, + parent_module, 0, + parent_addr_port, + parent_addr_port_info.pins()[ipin]); + VTR_ASSERT(ModuleNetId::INVALID() != addr_net); + + /* Configure the net sink */ + module_manager.add_module_net_sink(parent_module, addr_net, + child_module, child_instance, + child_addr_port, + child_addr_port_info.pins()[ipin]); + } + } + + /* Connect the data_in (Din) of parent module to the data_in of the all + * the memory modules + */ + ModulePortId parent_din_port = module_manager.find_module_port(parent_module, std::string(DECODER_DATA_IN_PORT_NAME)); + BasicPort parent_din_port_info = module_manager.module_port(parent_module, parent_din_port); + for (size_t mem_index = 0; mem_index < configurable_children.size(); ++mem_index) { + ModuleId child_module = configurable_children[mem_index]; + size_t child_instance = module_manager.configurable_child_instances(parent_module)[mem_index]; + ModulePortId child_din_port = module_manager.find_module_port(child_module, std::string(DECODER_DATA_IN_PORT_NAME)); + BasicPort child_din_port_info = module_manager.module_port(child_module, child_din_port); + + /* Ensure pin indices are in range! 
*/ + VTR_ASSERT(size_t(config_region) < parent_din_port_info.get_width()); + VTR_ASSERT(1 == child_din_port_info.get_width()); + + /* Create a net for the Din[config_region] pin */ + ModuleNetId din_net = create_module_source_pin_net(module_manager, parent_module, + parent_module, 0, + parent_din_port, + parent_din_port_info.pins()[size_t(config_region)]); + VTR_ASSERT(ModuleNetId::INVALID() != din_net); + + /* Configure the net sink */ + module_manager.add_module_net_sink(parent_module, din_net, child_module, child_instance, child_din_port, child_din_port_info.pins()[0]); + } + + /* Connect the data_out port of the decoder module + * to the enable port of configurable children + */ + ModulePortId decoder_dout_port = module_manager.find_module_port(decoder_module, std::string(DECODER_DATA_OUT_PORT_NAME)); + BasicPort decoder_dout_port_info = module_manager.module_port(decoder_module, decoder_dout_port); + VTR_ASSERT(decoder_dout_port_info.get_width() == configurable_children.size()); + for (size_t mem_index = 0; mem_index < configurable_children.size(); ++mem_index) { + ModuleId child_module = configurable_children[mem_index]; + size_t child_instance = module_manager.configurable_child_instances(parent_module)[mem_index]; + ModulePortId child_en_port = module_manager.find_module_port(child_module, std::string(DECODER_ENABLE_PORT_NAME)); + BasicPort child_en_port_info = module_manager.module_port(child_module, child_en_port); + for (size_t ipin = 0; ipin < child_en_port_info.get_width(); ++ipin) { + ModuleNetId en_net = create_module_source_pin_net(module_manager, parent_module, + decoder_module, decoder_instance, + decoder_dout_port, + decoder_dout_port_info.pins()[mem_index]); + VTR_ASSERT(ModuleNetId::INVALID() != en_net); + + /* Configure the net sink */ + module_manager.add_module_net_sink(parent_module, en_net, + child_module, child_instance, + child_en_port, + child_en_port_info.pins()[ipin]); + } + } + + /* Add the decoder as the last configurable children 
*/ + module_manager.add_configurable_child(parent_module, decoder_module, decoder_instance); + /* Register the configurable child to configuration region */ + module_manager.add_configurable_child_to_region(parent_module, + config_region, + decoder_module, + decoder_instance, + module_manager.configurable_children(parent_module).size() - 1); +} + +/********************************************************************* + * Add framed-based decoders to the top-level module + * and build net connections between decoders and subblocks + * + * For each configuration region, we create an independent decoder + * Note that to avoid parasitic programming, all the decoders will + * be in the same size, sharing the same principle as memory banks + * + * For each region, decoder and net addition will depend on the following cases: + * - If there is no configurable child, nothing to do. + * - If there is only one configurable child, short wire the EN, ADDR and DATA_IN to it + * - If there are more than two configurable childern, add a decoder and build interconnection + * between it and the children + **********************************************************************/ +static +void add_top_module_nets_cmos_memory_frame_config_bus(ModuleManager& module_manager, + DecoderLibrary& decoder_lib, + const ModuleId& top_module, + const vtr::vector& num_config_bits) { + /* Find the number of address bits for the top-level module */ + size_t top_addr_size = 0; + for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { + top_addr_size = std::max(top_addr_size, num_config_bits[config_region]); + } + + for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { + if (0 == module_manager.region_configurable_children(top_module, config_region).size()) { + continue; + } + + /* Short-wiring is applicable only when all the following situations are met: + * - There is only 1 configurable child in the region + * - The number of address bits of the 
configurable child is the same as top-level + */ + if ( (1 == module_manager.region_configurable_children(top_module, config_region).size()) + && (num_config_bits[config_region] == top_addr_size)) { + add_top_module_nets_cmos_memory_frame_short_config_bus(module_manager, top_module, config_region); + } else { + add_top_module_nets_cmos_memory_frame_decoder_config_bus(module_manager, decoder_lib, top_module, config_region); + } + } +} + /********************************************************************* * Add the port-to-port connection between all the memory modules * and their parent module @@ -1323,7 +1665,7 @@ void add_top_module_nets_cmos_memory_config_bus(ModuleManager& module_manager, add_top_module_nets_cmos_memory_bank_config_bus(module_manager, decoder_lib, parent_module, num_config_bits); break; case CONFIG_MEM_FRAME_BASED: - add_module_nets_cmos_memory_frame_config_bus(module_manager, decoder_lib, parent_module); + add_top_module_nets_cmos_memory_frame_config_bus(module_manager, decoder_lib, parent_module, num_config_bits); break; default: VTR_LOGF_ERROR(__FILE__, __LINE__, diff --git a/openfpga/src/fpga_bitstream/build_device_bitstream.cpp b/openfpga/src/fpga_bitstream/build_device_bitstream.cpp index e4ded1821..5172bbef9 100644 --- a/openfpga/src/fpga_bitstream/build_device_bitstream.cpp +++ b/openfpga/src/fpga_bitstream/build_device_bitstream.cpp @@ -85,14 +85,13 @@ size_t rec_estimate_device_bitstream_num_bits(const ModuleManager& module_manage for (const ConfigRegionId& config_region : module_manager.regions(parent_module)) { size_t curr_region_num_config_child = module_manager.region_configurable_children(parent_module, config_region).size(); - /* FIXME: This will be uncommented when multi-region support is extended for frame-based - * Frame-based configuration protocol will have 1 decoder + /* Frame-based configuration protocol will have 1 decoder * if there are more than 1 configurable children + */ if ( (CONFIG_MEM_FRAME_BASED == 
config_protocol_type) && (2 <= curr_region_num_config_child)) { curr_region_num_config_child--; } - */ /* Memory configuration protocol will have 2 decoders * at the top-level diff --git a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp index 04d7fe714..311995a72 100644 --- a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp +++ b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp @@ -281,6 +281,8 @@ static void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& bitstream_manager, const std::vector& parent_blocks, const ModuleManager& module_manager, + const ModuleId& top_module, + const ConfigRegionId& config_region, const std::vector& parent_modules, const std::vector& addr_code, FabricBitstream& fabric_bitstream, @@ -293,7 +295,18 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b const ConfigBlockId& parent_block = parent_blocks.back(); const ModuleId& parent_module = parent_modules.back(); - size_t num_configurable_children = module_manager.configurable_children(parent_modules.back()).size(); + std::vector configurable_children; + std::vector configurable_child_instances; + if (top_module == parent_module) { + configurable_children = module_manager.region_configurable_children(parent_module, config_region); + configurable_child_instances = module_manager.region_configurable_child_instances(parent_module, config_region); + } else { + VTR_ASSERT(top_module != parent_module); + configurable_children = module_manager.configurable_children(parent_module); + configurable_child_instances = module_manager.configurable_child_instances(parent_module); + } + + size_t num_configurable_children = configurable_children.size(); size_t max_child_addr_code_size = 0; bool add_addr_code = true; @@ -318,11 +331,11 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b */ VTR_ASSERT(2 < num_configurable_children); 
num_configurable_children--; - decoder_module = module_manager.configurable_children(parent_module).back(); + decoder_module = configurable_children.back(); /* The address code size is the max. of address port of all the configurable children */ for (size_t child_id = 0; child_id < num_configurable_children; ++child_id) { - ModuleId child_module = module_manager.configurable_children(parent_module)[child_id]; + ModuleId child_module = configurable_children[child_id]; const ModulePortId& child_addr_port_id = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); const BasicPort& child_addr_port = module_manager.module_port(child_module, child_addr_port_id); max_child_addr_code_size = std::max((int)child_addr_port.get_width(), (int)max_child_addr_code_size); @@ -330,15 +343,14 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b } for (size_t child_id = 0; child_id < num_configurable_children; ++child_id) { - ModuleId child_module = module_manager.configurable_children(parent_module)[child_id]; - size_t child_instance = module_manager.configurable_child_instances(parent_module)[child_id]; + ModuleId child_module = configurable_children[child_id]; + size_t child_instance = configurable_child_instances[child_id]; /* Get the instance name and ensure it is not empty */ std::string instance_name = module_manager.instance_name(parent_module, child_module, child_instance); /* Find the child block that matches the instance name! */ ConfigBlockId child_block = bitstream_manager.find_child_block(parent_block, instance_name); /* We must have one valid block id! 
*/ - if (true != bitstream_manager.valid_block_id(child_block)) VTR_ASSERT(true == bitstream_manager.valid_block_id(child_block)); /* Pass on the list of blocks, modules and address lists */ @@ -400,7 +412,10 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b /* Go recursively */ rec_build_module_fabric_dependent_frame_bitstream(bitstream_manager, child_blocks, - module_manager, child_modules, + module_manager, + top_module, + config_region, + child_modules, child_addr_code, fabric_bitstream, fabric_bitstream_region); @@ -417,9 +432,15 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b * We will find the address bit and add it to addr_code * Then we can add the configuration bits to the fabric_bitstream. */ - if (!(1 < module_manager.configurable_children(parent_modules.back()).size())) - VTR_ASSERT(1 < module_manager.configurable_children(parent_modules.back()).size()); - ModuleId decoder_module = module_manager.configurable_children(parent_modules.back()).back(); + std::vector configurable_children; + if (top_module == parent_modules.back()) { + configurable_children = module_manager.region_configurable_children(parent_modules.back(), config_region); + } else { + VTR_ASSERT(top_module != parent_modules.back()); + configurable_children = module_manager.configurable_children(parent_modules.back()); + } + + ModuleId decoder_module = configurable_children.back(); /* Find the address port from the decoder module */ const ModulePortId& decoder_addr_port_id = module_manager.find_module_port(decoder_module, std::string(DECODER_ADDRESS_PORT_NAME)); const BasicPort& decoder_addr_port = module_manager.module_port(decoder_module, decoder_addr_port_id); @@ -557,6 +578,8 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc rec_build_module_fabric_dependent_frame_bitstream(bitstream_manager, std::vector(1, top_block), module_manager, + top_module, + config_region, std::vector(1, 
top_module), std::vector(), fabric_bitstream, diff --git a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp index b6da8f813..1d2d9c83c 100644 --- a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp +++ b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp @@ -1676,29 +1676,74 @@ void print_verilog_top_testbench_frame_decoder_bitstream(std::fstream& fp, fp << std::endl; - /* Attention: the configuration chain protcol requires the last configuration bit is fed first - * We will visit the fabric bitstream in a reverse way + /* Reorganize the fabric bitstream by the same address across regions: + * This is due to that the length of fabric bitstream could be different in each region. + * Template: + *
+ * An example: + * 000000 1011 + * + * Note: the std::map may cause large memory footprint for large bitstream databases! */ - for (const FabricBitId& bit_id : fabric_bitstream.bits()) { - /* When fast configuration is enabled, we skip zero data_in values */ - if ((true == fast_configuration) - && (bit_value_to_skip == fabric_bitstream.bit_din(bit_id))) { - continue; + std::map> fabric_bits_by_addr; + for (const FabricBitRegionId& region : fabric_bitstream.regions()) { + for (const FabricBitId& bit_id : fabric_bitstream.region_bits(region)) { + /* Create string for address */ + VTR_ASSERT(addr_port.get_width() == fabric_bitstream.bit_address(bit_id).size()); + std::string addr_str; + for (const char& addr_bit : fabric_bitstream.bit_address(bit_id)) { + addr_str.push_back(addr_bit); + } + + /* Place the config bit */ + auto result = fabric_bits_by_addr.find(addr_str); + if (result == fabric_bits_by_addr.end()) { + /* This is a new bit, resize the vector to the number of regions + * and deposit '0' to all the bits + */ + fabric_bits_by_addr[addr_str] = std::vector(fabric_bitstream.regions().size(), false); + fabric_bits_by_addr[addr_str][size_t(region)] = fabric_bitstream.bit_din(bit_id); + } else { + VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); + result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); + } + } + } + + for (const auto& addr_din_pair : fabric_bits_by_addr) { + /* When fast configuration is enabled, + * the rule to skip any configuration bit should consider the whole data input values. + * Only all the bits in the din port match the value to be skipped, + * the programming cycle can be skipped! 
+ */ + if (true == fast_configuration) { + bool skip_curr_bits = true; + for (const bool& bit : addr_din_pair.second) { + if (bit_value_to_skip != bit) { + skip_curr_bits = false; + break; + } + } + + if (true == skip_curr_bits) { + continue; + } } fp << "\t\t" << std::string(TOP_TESTBENCH_PROG_TASK_NAME); fp << "(" << addr_port.get_width() << "'b"; - VTR_ASSERT(addr_port.get_width() == fabric_bitstream.bit_address(bit_id).size()); - for (const char& addr_bit : fabric_bitstream.bit_address(bit_id)) { - fp << addr_bit; - } + VTR_ASSERT(addr_port.get_width() == addr_din_pair.first.size()); + fp << addr_din_pair.first; fp << ", "; fp <<"1'b"; - if (true == fabric_bitstream.bit_din(bit_id)) { - fp << "1"; - } else { - VTR_ASSERT(false == fabric_bitstream.bit_din(bit_id)); - fp << "0"; + VTR_ASSERT(din_port.get_width() == addr_din_pair.second.size()); + for (const bool& din_value : addr_din_pair.second) { + if (true == din_value) { + fp << "1"; + } else { + VTR_ASSERT(false == din_value); + fp << "0"; + } } fp << ");" << std::endl; } From b701bd26407eefba2fd3d84072555d750e5feb58 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 10:45:14 -0600 Subject: [PATCH 02/11] [Arch] Add multi-region architecture example for frame-based protocol --- ...k4_N4_40nm_multi_region_frame_openfpga.xml | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_openfpga.xml diff --git a/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_openfpga.xml b/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_openfpga.xml new file mode 100644 index 000000000..116f26f14 --- /dev/null +++ b/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_openfpga.xml @@ -0,0 +1,198 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + + + + + 10e-12 5e-12 5e-12 + + + 10e-12 5e-12 
5e-12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 29da368742b6fe51f4a54701f72b92b8375351dc Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 10:46:47 -0600 Subject: [PATCH 03/11] [Arch] Add architecture example for multi-region frame-based architecture using both set/reset for configurable memories --- ...gion_frame_use_both_set_reset_openfpga.xml | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_use_both_set_reset_openfpga.xml diff --git a/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_use_both_set_reset_openfpga.xml b/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_use_both_set_reset_openfpga.xml new file mode 100644 index 000000000..b8f617dc5 --- /dev/null +++ b/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_use_both_set_reset_openfpga.xml @@ -0,0 +1,200 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + 10e-12 + + + 10e-12 + + + + + + + + + + + + + 10e-12 5e-12 5e-12 + + + 10e-12 5e-12 5e-12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From ca7d43275d8355eb0e1b8aa46fe454b8c2201dcb Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 10:48:29 -0600 Subject: [PATCH 04/11] [Test] Add test case for multi_region configuration frame --- .../config/task.conf | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 openfpga_flow/tasks/basic_tests/full_testbench/multi_region_configuration_frame/config/task.conf diff --git 
a/openfpga_flow/tasks/basic_tests/full_testbench/multi_region_configuration_frame/config/task.conf b/openfpga_flow/tasks/basic_tests/full_testbench/multi_region_configuration_frame/config/task.conf new file mode 100644 index 000000000..d1e73545c --- /dev/null +++ b/openfpga_flow/tasks/basic_tests/full_testbench/multi_region_configuration_frame/config/task.conf @@ -0,0 +1,34 @@ +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# Configuration file for running experiments +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs +# Each job execute fpga_flow script on combination of architecture & benchmark +# timeout_each_job is timeout for each job +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = + +[GENERAL] +run_engine=openfpga_shell +power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml +power_analysis = true +spice_output=false +verilog_output=true +timeout_each_job = 20*60 +fpga_flow=yosys_vpr + +[OpenFPGA_SHELL] +openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/OpenFPGAShellScripts/full_testbench_example_script.openfpga +openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_openfpga.xml +openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml + +[ARCHITECTURES] +arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml + +[BENCHMARKS] +bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v + +[SYNTHESIS_PARAM] +bench0_top = and2 +bench0_chan_width = 300 + +[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH] +end_flow_with_test= From 4c14428400884fe68e2fa90978e1d22be6438d60 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 10:50:00 -0600 Subject: [PATCH 05/11] [Test] Add test case for fast configuration support on multi-region 
frame-based configuration protocol --- .../config/task.conf | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 openfpga_flow/tasks/basic_tests/full_testbench/smart_fast_multi_region_configuration_frame/config/task.conf diff --git a/openfpga_flow/tasks/basic_tests/full_testbench/smart_fast_multi_region_configuration_frame/config/task.conf b/openfpga_flow/tasks/basic_tests/full_testbench/smart_fast_multi_region_configuration_frame/config/task.conf new file mode 100644 index 000000000..b4b6a7759 --- /dev/null +++ b/openfpga_flow/tasks/basic_tests/full_testbench/smart_fast_multi_region_configuration_frame/config/task.conf @@ -0,0 +1,34 @@ +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# Configuration file for running experiments +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs +# Each job execute fpga_flow script on combination of architecture & benchmark +# timeout_each_job is timeout for each job +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = + +[GENERAL] +run_engine=openfpga_shell +power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml +power_analysis = true +spice_output=false +verilog_output=true +timeout_each_job = 20*60 +fpga_flow=yosys_vpr + +[OpenFPGA_SHELL] +openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/OpenFPGAShellScripts/fast_configuration_example_script.openfpga +openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_multi_region_frame_use_both_set_reset_openfpga.xml +openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml + +[ARCHITECTURES] +arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml + +[BENCHMARKS] +bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v + +[SYNTHESIS_PARAM] 
+bench0_top = and2 +bench0_chan_width = 300 + +[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH] +end_flow_with_test= From 5bcd559851ef87249b487f75d285bfe916391298 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 17:29:04 -0600 Subject: [PATCH 06/11] [Tool] Many bug fix in the multi-region support for both memory banks and framed-based. Still have problems in multi-region framed-based verification --- .../src/fabric/build_top_module_memory.cpp | 30 ++-- .../fpga_bitstream/build_fabric_bitstream.cpp | 84 +++++++-- .../fpga_verilog/verilog_top_testbench.cpp | 107 +++-------- openfpga/src/utils/fabric_bitstream_utils.cpp | 168 ++++++++++++++++++ openfpga/src/utils/fabric_bitstream_utils.h | 11 ++ 5 files changed, 280 insertions(+), 120 deletions(-) diff --git a/openfpga/src/fabric/build_top_module_memory.cpp b/openfpga/src/fabric/build_top_module_memory.cpp index 8a86d4d69..91275b3bd 100644 --- a/openfpga/src/fabric/build_top_module_memory.cpp +++ b/openfpga/src/fabric/build_top_module_memory.cpp @@ -1349,6 +1349,7 @@ void add_top_module_nets_cmos_memory_frame_short_config_bus(ModuleManager& modul * what is required for this configuration region. * A decoder will be created anyway to avoid address collision * to other configuration regions + * The address lines will be aligned from the MSB of top-level address lines!!! * - Connect the enable (EN) port of memory modules under the parent module * to the frame decoder outputs * - Connect the data_in (Din) of parent module to the data_in of the all @@ -1387,14 +1388,12 @@ void add_top_module_nets_cmos_memory_frame_short_config_bus(ModuleManager& modul * | | | | * | +-------------+--------------------+ * | | - * | ADDR[X - log(N)/log2 - 1: 0] + * | ADDR[log(N)/log2 - 1: 0] * * Note: * - X is the port size of address port of the parent module * - the address port of child memory modules may be smaller than * X - log(N)/log2. 
In such case, we will drop the MSBs until it fit - * - This function is only applicable to 2+ configurable children!!! - * *********************************************************************/ static void add_top_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& module_manager, @@ -1404,20 +1403,14 @@ void add_top_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& mod std::vector configurable_children = module_manager.region_configurable_children(parent_module, config_region); std::vector configurable_child_instances = module_manager.region_configurable_child_instances(parent_module, config_region); + ModulePortId parent_addr_port = module_manager.find_module_port(parent_module, std::string(DECODER_ADDRESS_PORT_NAME)); + BasicPort parent_addr_port_info = module_manager.module_port(parent_module, parent_addr_port); + /* Find the decoder specification */ size_t addr_size = find_mux_local_decoder_addr_size(configurable_children.size()); /* Data input should match the WL (data_in) of a SRAM */ size_t data_size = configurable_children.size(); - /* Find the number of address bits that are wired directly to configurable children */ - size_t max_child_addr_size = 0; - for (size_t mem_index = 0; mem_index < configurable_children.size(); ++mem_index) { - ModuleId child_module = configurable_children[mem_index]; - ModulePortId child_addr_port = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); - BasicPort child_addr_port_info = module_manager.module_port(child_module, child_addr_port); - max_child_addr_size = std::max(max_child_addr_size, child_addr_port_info.get_width()); - } - /* Search the decoder library and try to find one * If not found, create a new module and add it to the module manager */ @@ -1453,23 +1446,21 @@ void add_top_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& mod /* Connect the address port of the parent module to the frame decoder address port * Note that we only connect 
to the first few bits of address port */ - ModulePortId parent_addr_port = module_manager.find_module_port(parent_module, std::string(DECODER_ADDRESS_PORT_NAME)); ModulePortId decoder_addr_port = module_manager.find_module_port(decoder_module, std::string(DECODER_ADDRESS_PORT_NAME)); - BasicPort parent_addr_port_info = module_manager.module_port(parent_module, parent_addr_port); BasicPort decoder_addr_port_info = module_manager.module_port(decoder_module, decoder_addr_port); for (size_t ipin = 0; ipin < decoder_addr_port_info.get_width(); ++ipin) { /* Create a net for the addr pin */ ModuleNetId addr_net = create_module_source_pin_net(module_manager, parent_module, parent_module, 0, parent_addr_port, - parent_addr_port_info.pins()[ipin + max_child_addr_size]); + parent_addr_port_info.pins()[parent_addr_port_info.get_width() - 1 - ipin]); VTR_ASSERT(ModuleNetId::INVALID() != addr_net); /* Configure the net sink */ module_manager.add_module_net_sink(parent_module, addr_net, decoder_module, decoder_instance, decoder_addr_port, - decoder_addr_port_info.pins()[ipin]); + decoder_addr_port_info.pins()[decoder_addr_port_info.get_width() - 1 - ipin]); } /* Connect the address port of the parent module to the address port of configurable children @@ -1577,10 +1568,9 @@ void add_top_module_nets_cmos_memory_frame_config_bus(ModuleManager& module_mana const ModuleId& top_module, const vtr::vector& num_config_bits) { /* Find the number of address bits for the top-level module */ - size_t top_addr_size = 0; - for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { - top_addr_size = std::max(top_addr_size, num_config_bits[config_region]); - } + ModulePortId top_addr_port = module_manager.find_module_port(top_module, std::string(DECODER_ADDRESS_PORT_NAME)); + BasicPort top_addr_port_info = module_manager.module_port(top_module, top_addr_port); + size_t top_addr_size = top_addr_port_info.get_width(); for (const ConfigRegionId& config_region : 
module_manager.regions(top_module)) { if (0 == module_manager.region_configurable_children(top_module, config_region).size()) { diff --git a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp index 311995a72..a327e9c4e 100644 --- a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp +++ b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp @@ -325,21 +325,26 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b if (1 == num_configurable_children) { add_addr_code = false; } else { - /* For more than 2 children, there is a decoder in the tail of the list - * We will not decode that, but will access the address size from that module - * So, we reduce the number of children by 1 - */ + /* For more than 2 children, there is a decoder in the tail of the list + * We will not decode that, but will access the address size from that module + * So, we reduce the number of children by 1 + */ VTR_ASSERT(2 < num_configurable_children); num_configurable_children--; decoder_module = configurable_children.back(); - /* The address code size is the max. 
of address port of all the configurable children */ - for (size_t child_id = 0; child_id < num_configurable_children; ++child_id) { - ModuleId child_module = configurable_children[child_id]; + /* The max address code size is the max address code size of all the + * configurable children in all the regions + */ + for (const ModuleId& child_module : module_manager.configurable_children(parent_module)) { + /* Bypass any decoder module (which has no configurable children) */ + if (module_manager.configurable_children(child_module).empty()) { + continue; + } const ModulePortId& child_addr_port_id = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); const BasicPort& child_addr_port = module_manager.module_port(child_module, child_addr_port_id); - max_child_addr_code_size = std::max((int)child_addr_port.get_width(), (int)max_child_addr_code_size); - } + max_child_addr_code_size = std::max(child_addr_port.get_width(), max_child_addr_code_size); + } } for (size_t child_id = 0; child_id < num_configurable_children; ++child_id) { @@ -369,7 +374,15 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b const BasicPort& decoder_addr_port = module_manager.module_port(decoder_module, decoder_addr_port_id); std::vector addr_bits_vec = itobin_charvec(child_id, decoder_addr_port.get_width()); - child_addr_code.insert(child_addr_code.begin(), addr_bits_vec.begin(), addr_bits_vec.end()); + /* For top-level module, the child address should be added to the tail + * For other modules, the child address should be added to the head + */ + if (top_module == parent_module) { + child_addr_code.insert(child_addr_code.end(), addr_bits_vec.begin(), addr_bits_vec.end()); + } else { + VTR_ASSERT_SAFE(top_module != parent_module); + child_addr_code.insert(child_addr_code.begin(), addr_bits_vec.begin(), addr_bits_vec.end()); + } /* Note that the address port size of the child module may be smaller than the maximum * of other child modules 
at this level. @@ -382,8 +395,8 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b * we should add dummy '0' to fill the gap * * Addr_code for child[0]: '000' + addr_bits_vec - * Addr_code for child[1]: '0' + addr_bits_vec - * Addr_code for child[2]: addr_bits_vec + * Addr_code for child[1]: '00' + addr_bits_vec + * Addr_code for child[2]: '0' + addr_bits_vec * * Addr[6:8] * | @@ -392,7 +405,7 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b * | Decoder Module | * +-------------------------------------------+ * - * Addr[0:2] Addr[0:4] Addr[0:5] + * Addr[0:2] Addr[0:3] Addr[0:4] * | | | * v v v * +-----------+ +-------------+ +------------+ @@ -570,10 +583,49 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc fabric_bitstream.reserve_bits(bitstream_manager.num_bits()); fabric_bitstream.set_address_length(addr_port_info.get_width()); - /* TODO: Currently only support 1 region. Will expand later! 
*/ - VTR_ASSERT(1 == module_manager.regions(top_module).size()); + /* Find the maximum decoder address among all the configurable regions */ + size_t max_decoder_addr_size = 0; + for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { + std::vector configurable_children = module_manager.region_configurable_children(top_module, config_region); + /* Bypass the regions that have no decoders */ + if ( (0 == configurable_children.size()) + || (1 == configurable_children.size())) { + continue; + } + ModuleId decoder_module = configurable_children.back(); + ModulePortId decoder_addr_port_id = module_manager.find_module_port(decoder_module, DECODER_ADDRESS_PORT_NAME); + BasicPort decoder_addr_port = module_manager.module_port(decoder_module, decoder_addr_port_id); + max_decoder_addr_size = std::max(max_decoder_addr_size, decoder_addr_port.get_width()); + } for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { + std::vector configurable_children = module_manager.region_configurable_children(top_module, config_region); + + /* Bypass non-configurable regions */ + if (0 == configurable_children.size()) { + continue; + } + + /* Find the idle address bit which should be added to the head of the address bit + * This depends on the number of address bits required by this region + * For example: + * Top-level address is addr[0:4] + * There are 4 decoders in the top-level module, whose address sizes are + * decoder A: addr[0:4] + * decoder B: addr[0:3] + * decoder C: addr[0:2] + * decoder D: addr[0:3] + * For decoder A, the address fit well + * For decoder B, an idle bit should be added '0' + addr[0:3] + * For decoder C, two idle bits should be added '00' + addr[0:2] + * For decoder D, an idle bit should be added '0' + addr[0:3] + */ + ModuleId decoder_module = configurable_children.back(); + ModulePortId decoder_addr_port_id = module_manager.find_module_port(decoder_module, DECODER_ADDRESS_PORT_NAME); + BasicPort decoder_addr_port 
= module_manager.module_port(decoder_module, decoder_addr_port_id); + VTR_ASSERT(max_decoder_addr_size >= decoder_addr_port.get_width()); + std::vector idle_addr_bits(max_decoder_addr_size - decoder_addr_port.get_width(), '0'); + FabricBitRegionId fabric_bitstream_region = fabric_bitstream.add_region(); rec_build_module_fabric_dependent_frame_bitstream(bitstream_manager, std::vector(1, top_block), @@ -581,7 +633,7 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc top_module, config_region, std::vector(1, top_module), - std::vector(), + idle_addr_bits, fabric_bitstream, fabric_bitstream_region); } diff --git a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp index 1d2d9c83c..6753f6626 100644 --- a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp +++ b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp @@ -699,17 +699,24 @@ size_t calculate_num_config_clock_cycles(const e_config_protocol_type& sram_orgz 100. * ((float)num_config_clock_cycles / (float)(1 + regional_bitstream_max_size) - 1.)); } break; - case CONFIG_MEM_MEMORY_BANK: - case CONFIG_MEM_FRAME_BASED: { + case CONFIG_MEM_MEMORY_BANK: { /* For fast configuration, we will skip all the zero data points */ + num_config_clock_cycles = 1 + build_memory_bank_fabric_bitstream_by_address(fabric_bitstream).size(); if (true == fast_configuration) { size_t full_num_config_clock_cycles = num_config_clock_cycles; - num_config_clock_cycles = 1; - for (const FabricBitId& bit_id : fabric_bitstream.bits()) { - if (bit_value_to_skip != fabric_bitstream.bit_din(bit_id)) { - num_config_clock_cycles++; - } - } + num_config_clock_cycles = 1 + find_memory_bank_fast_configuration_fabric_bitstream_size(fabric_bitstream, bit_value_to_skip); + VTR_LOG("Fast configuration reduces number of configuration clock cycles from %lu to %lu (compression_rate = %f%)\n", + full_num_config_clock_cycles, + num_config_clock_cycles, + 100. 
* ((float)num_config_clock_cycles / (float)full_num_config_clock_cycles - 1.)); + } + break; + } + case CONFIG_MEM_FRAME_BASED: { + num_config_clock_cycles = 1 + build_frame_based_fabric_bitstream_by_address(fabric_bitstream).size(); + if (true == fast_configuration) { + size_t full_num_config_clock_cycles = num_config_clock_cycles; + num_config_clock_cycles = 1 + find_frame_based_fast_configuration_fabric_bitstream_size(fabric_bitstream, bit_value_to_skip); VTR_LOG("Fast configuration reduces number of configuration clock cycles from %lu to %lu (compression_rate = %f%)\n", full_num_config_clock_cycles, num_config_clock_cycles, @@ -1529,46 +1536,8 @@ void print_verilog_top_testbench_memory_bank_bitstream(std::fstream& fp, fp << std::endl; - /* Reorganize the fabric bitstream by the same address across regions: - * This is due to that the length of fabric bitstream could be different in each region. - * Template: - * - * An example: - * 000000 00000 1011 - * - * Note: the std::map may cause large memory footprint for large bitstream databases! 
- */ - std::map, std::vector> fabric_bits_by_addr; - for (const FabricBitRegionId& region : fabric_bitstream.regions()) { - for (const FabricBitId& bit_id : fabric_bitstream.region_bits(region)) { - /* Create string for BL address */ - VTR_ASSERT(bl_addr_port.get_width() == fabric_bitstream.bit_bl_address(bit_id).size()); - std::string bl_addr_str; - for (const char& addr_bit : fabric_bitstream.bit_bl_address(bit_id)) { - bl_addr_str.push_back(addr_bit); - } - - /* Create string for WL address */ - VTR_ASSERT(wl_addr_port.get_width() == fabric_bitstream.bit_wl_address(bit_id).size()); - std::string wl_addr_str; - for (const char& addr_bit : fabric_bitstream.bit_wl_address(bit_id)) { - wl_addr_str.push_back(addr_bit); - } - - /* Place the config bit */ - auto result = fabric_bits_by_addr.find(std::make_pair(bl_addr_str, wl_addr_str)); - if (result == fabric_bits_by_addr.end()) { - /* This is a new bit, resize the vector to the number of regions - * and deposit '0' to all the bits - */ - fabric_bits_by_addr[std::make_pair(bl_addr_str, wl_addr_str)] = std::vector(fabric_bitstream.regions().size(), false); - fabric_bits_by_addr[std::make_pair(bl_addr_str, wl_addr_str)][size_t(region)] = fabric_bitstream.bit_din(bit_id); - } else { - VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); - result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); - } - } - } + /* Reorganize the fabric bitstream by the same address across regions */ + std::map, std::vector> fabric_bits_by_addr = build_memory_bank_fabric_bitstream_by_address(fabric_bitstream); for (const auto& addr_din_pair : fabric_bits_by_addr) { /* When fast configuration is enabled, @@ -1676,39 +1645,8 @@ void print_verilog_top_testbench_frame_decoder_bitstream(std::fstream& fp, fp << std::endl; - /* Reorganize the fabric bitstream by the same address across regions: - * This is due to that the length of fabric bitstream could be different in each region. - * Template: - *
- * An example: - * 000000 1011 - * - * Note: the std::map may cause large memory footprint for large bitstream databases! - */ - std::map> fabric_bits_by_addr; - for (const FabricBitRegionId& region : fabric_bitstream.regions()) { - for (const FabricBitId& bit_id : fabric_bitstream.region_bits(region)) { - /* Create string for address */ - VTR_ASSERT(addr_port.get_width() == fabric_bitstream.bit_address(bit_id).size()); - std::string addr_str; - for (const char& addr_bit : fabric_bitstream.bit_address(bit_id)) { - addr_str.push_back(addr_bit); - } - - /* Place the config bit */ - auto result = fabric_bits_by_addr.find(addr_str); - if (result == fabric_bits_by_addr.end()) { - /* This is a new bit, resize the vector to the number of regions - * and deposit '0' to all the bits - */ - fabric_bits_by_addr[addr_str] = std::vector(fabric_bitstream.regions().size(), false); - fabric_bits_by_addr[addr_str][size_t(region)] = fabric_bitstream.bit_din(bit_id); - } else { - VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); - result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); - } - } - } + /* Reorganize the fabric bitstream by the same address across regions */ + std::map> fabric_bits_by_addr = build_frame_based_fabric_bitstream_by_address(fabric_bitstream); for (const auto& addr_din_pair : fabric_bits_by_addr) { /* When fast configuration is enabled, @@ -1735,7 +1673,7 @@ void print_verilog_top_testbench_frame_decoder_bitstream(std::fstream& fp, VTR_ASSERT(addr_port.get_width() == addr_din_pair.first.size()); fp << addr_din_pair.first; fp << ", "; - fp <<"1'b"; + fp << din_port.get_width() << "'b"; VTR_ASSERT(din_port.get_width() == addr_din_pair.second.size()); for (const bool& din_value : addr_din_pair.second) { if (true == din_value) { @@ -1748,7 +1686,7 @@ void print_verilog_top_testbench_frame_decoder_bitstream(std::fstream& fp, fp << ");" << std::endl; } - /* Disable the address and din */ + /* Disable the address and din fp << "\t\t" << 
std::string(TOP_TESTBENCH_PROG_TASK_NAME); fp << "(" << addr_port.get_width() << "'b"; std::vector all_zero_addr(addr_port.get_width(), 0); @@ -1756,8 +1694,9 @@ void print_verilog_top_testbench_frame_decoder_bitstream(std::fstream& fp, fp << addr_bit; } fp << ", "; - fp <<"1'b0"; + fp << generate_verilog_constant_values(initial_din_values); fp << ");" << std::endl; + */ /* Raise the flag of configuration done when bitstream loading is complete */ BasicPort prog_clock_port(std::string(TOP_TB_PROG_CLOCK_PORT_NAME), 1); diff --git a/openfpga/src/utils/fabric_bitstream_utils.cpp b/openfpga/src/utils/fabric_bitstream_utils.cpp index 53d99d5a1..fa0fd9e84 100644 --- a/openfpga/src/utils/fabric_bitstream_utils.cpp +++ b/openfpga/src/utils/fabric_bitstream_utils.cpp @@ -64,4 +64,172 @@ size_t find_configuration_chain_fabric_bitstream_size_to_be_skipped(const Fabric return num_bits_to_skip; } +/******************************************************************** + * Reorganize the fabric bitstream for frame-based protocol + * by the same address across regions: + * This is due to that the length of fabric bitstream could be different in each region. + * Template: + *
+ * An example: + * 000000 1011 + * + * Note: the std::map may cause large memory footprint for large bitstream databases! + *******************************************************************/ +std::map> build_frame_based_fabric_bitstream_by_address(const FabricBitstream& fabric_bitstream) { + std::map> fabric_bits_by_addr; + for (const FabricBitRegionId& region : fabric_bitstream.regions()) { + for (const FabricBitId& bit_id : fabric_bitstream.region_bits(region)) { + /* Create string for address */ + std::string addr_str; + for (const char& addr_bit : fabric_bitstream.bit_address(bit_id)) { + addr_str.push_back(addr_bit); + } + + /* Place the config bit */ + auto result = fabric_bits_by_addr.find(addr_str); + if (result == fabric_bits_by_addr.end()) { + /* This is a new bit, resize the vector to the number of regions + * and deposit '0' to all the bits + */ + fabric_bits_by_addr[addr_str] = std::vector(fabric_bitstream.regions().size(), false); + fabric_bits_by_addr[addr_str][size_t(region)] = fabric_bitstream.bit_din(bit_id); + } else { + VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); + result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); + } + } + } + + return fabric_bits_by_addr; +} + +/******************************************************************** + * For fast configuration, the number of bits to be skipped + * the rule to skip any configuration bit should consider the whole data input values. + * Only all the bits in the din port match the value to be skipped, + * the programming cycle can be skipped! 
+ * For example: + * Address: 010101 + * Region 0: 0 + * Region 1: 1 + * Region 2: 0 + * This bit cannot be skipped if the bit_value_to_skip is 0 + * + * Address: 010101 + * Region 0: 0 + * Region 1: 0 + * Region 2: 0 + * This bit can be skipped if the bit_value_to_skip is 0 + *******************************************************************/ +size_t find_frame_based_fast_configuration_fabric_bitstream_size(const FabricBitstream& fabric_bitstream, + const bool& bit_value_to_skip) { + std::map> fabric_bits_by_addr = build_frame_based_fabric_bitstream_by_address(fabric_bitstream); + + size_t num_bits = 0; + + for (const auto& addr_din_pair : fabric_bits_by_addr) { + bool skip_curr_bits = true; + for (const bool& bit : addr_din_pair.second) { + if (bit_value_to_skip != bit) { + skip_curr_bits = false; + break; + } + } + + if (false == skip_curr_bits) { + num_bits++; + } + } + + return num_bits; +} + +/******************************************************************** + * Reorganize the fabric bitstream for memory banks + * by the same address across regions: + * This is due to that the length of fabric bitstream could be different in each region. + * Template: + * + * An example: + * 000000 00000 1011 + * + * Note: the std::map may cause large memory footprint for large bitstream databases! 
+ *******************************************************************/ +std::map, std::vector> build_memory_bank_fabric_bitstream_by_address(const FabricBitstream& fabric_bitstream) { + std::map, std::vector> fabric_bits_by_addr; + for (const FabricBitRegionId& region : fabric_bitstream.regions()) { + for (const FabricBitId& bit_id : fabric_bitstream.region_bits(region)) { + /* Create string for BL address */ + std::string bl_addr_str; + for (const char& addr_bit : fabric_bitstream.bit_bl_address(bit_id)) { + bl_addr_str.push_back(addr_bit); + } + + /* Create string for WL address */ + std::string wl_addr_str; + for (const char& addr_bit : fabric_bitstream.bit_wl_address(bit_id)) { + wl_addr_str.push_back(addr_bit); + } + + /* Place the config bit */ + auto result = fabric_bits_by_addr.find(std::make_pair(bl_addr_str, wl_addr_str)); + if (result == fabric_bits_by_addr.end()) { + /* This is a new bit, resize the vector to the number of regions + * and deposit '0' to all the bits + */ + fabric_bits_by_addr[std::make_pair(bl_addr_str, wl_addr_str)] = std::vector(fabric_bitstream.regions().size(), false); + fabric_bits_by_addr[std::make_pair(bl_addr_str, wl_addr_str)][size_t(region)] = fabric_bitstream.bit_din(bit_id); + } else { + VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); + result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); + } + } + } + + return fabric_bits_by_addr; +} + +/******************************************************************** + * For fast configuration, the number of bits to be skipped + * the rule to skip any configuration bit should consider the whole data input values. + * Only all the bits in the din port match the value to be skipped, + * the programming cycle can be skipped! 
+ * For example: + * BL Address: 010101 + * WL Address: 010101 + * Region 0: 0 + * Region 1: 1 + * Region 2: 0 + * This bit cannot be skipped if the bit_value_to_skip is 0 + * + * BL Address: 010101 + * WL Address: 010101 + * Region 0: 0 + * Region 1: 0 + * Region 2: 0 + * This bit can be skipped if the bit_value_to_skip is 0 + *******************************************************************/ +size_t find_memory_bank_fast_configuration_fabric_bitstream_size(const FabricBitstream& fabric_bitstream, + const bool& bit_value_to_skip) { + std::map, std::vector> fabric_bits_by_addr = build_memory_bank_fabric_bitstream_by_address(fabric_bitstream); + + size_t num_bits = 0; + + for (const auto& addr_din_pair : fabric_bits_by_addr) { + bool skip_curr_bits = true; + for (const bool& bit : addr_din_pair.second) { + if (bit_value_to_skip != bit) { + skip_curr_bits = false; + break; + } + } + + if (false == skip_curr_bits) { + num_bits++; + } + } + + return num_bits; +} + } /* end namespace openfpga */ diff --git a/openfpga/src/utils/fabric_bitstream_utils.h b/openfpga/src/utils/fabric_bitstream_utils.h index 998332ccf..e34464a89 100644 --- a/openfpga/src/utils/fabric_bitstream_utils.h +++ b/openfpga/src/utils/fabric_bitstream_utils.h @@ -8,6 +8,7 @@ * Include header files that are required by function declaration *******************************************************************/ #include +#include #include "bitstream_manager.h" #include "fabric_bitstream.h" @@ -24,6 +25,16 @@ size_t find_configuration_chain_fabric_bitstream_size_to_be_skipped(const Fabric const BitstreamManager& bitstream_manager, const bool& bit_value_to_skip); +std::map> build_frame_based_fabric_bitstream_by_address(const FabricBitstream& fabric_bitstream); + +size_t find_frame_based_fast_configuration_fabric_bitstream_size(const FabricBitstream& fabric_bitstream, + const bool& bit_value_to_skip); + +std::map, std::vector> build_memory_bank_fabric_bitstream_by_address(const FabricBitstream& 
fabric_bitstream); + +size_t find_memory_bank_fast_configuration_fabric_bitstream_size(const FabricBitstream& fabric_bitstream, + const bool& bit_value_to_skip); + } /* end namespace openfpga */ #endif From b78f8bec16af36b3f7b6072df2ed760d73e861a9 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 21:19:20 -0600 Subject: [PATCH 07/11] [Tool] Bug fixed for multi-region configuration frame --- .../libopenfpgautil/src/openfpga_decode.cpp | 50 +++++++++++++++++++ .../libopenfpgautil/src/openfpga_decode.h | 8 +++ .../fpga_bitstream/build_fabric_bitstream.cpp | 14 +++++- .../src/fpga_bitstream/fabric_bitstream.cpp | 18 +++++-- .../src/fpga_bitstream/fabric_bitstream.h | 4 +- .../src/fpga_verilog/verilog_decoders.cpp | 9 ++-- openfpga/src/utils/fabric_bitstream_utils.cpp | 28 +++++++---- 7 files changed, 109 insertions(+), 22 deletions(-) diff --git a/libopenfpga/libopenfpgautil/src/openfpga_decode.cpp b/libopenfpga/libopenfpgautil/src/openfpga_decode.cpp index 168f98e42..b4dae065f 100644 --- a/libopenfpga/libopenfpgautil/src/openfpga_decode.cpp +++ b/libopenfpga/libopenfpgautil/src/openfpga_decode.cpp @@ -128,4 +128,54 @@ size_t bintoi_charvec(const std::vector& bin) { return ret; } +/******************************************************************** + * Expand all the don't care bits in a string + * A don't care 'x' can be decoded to either '0' or '1' + * For example: + * input: 0x1x + * output: 0010 + * 0011 + * 0110 + * 0111 + * + * Return all the strings + ********************************************************************/ +std::vector expand_dont_care_bin_str(const std::string& input_str) { + std::vector ret; + + /* If the input is don't care free, we can return */ + bool has_dont_care = false; + for (const char& bit : input_str) { + if (DONT_CARE_CHAR == bit) { + has_dont_care = true; + break; + } + } + + if (false == has_dont_care) { + ret.push_back(input_str); + return ret; + } + + /* Recursively expand all the don't care bits */ + for (size_t i = 0; i
< input_str.length(); ++i) { + if (DONT_CARE_CHAR == input_str[i]) { + std::string temp_input_str = input_str; + /* Flip to '0' and go recursively */ + temp_input_str[i] = '0'; + for (const std::string& expanded_str : expand_dont_care_bin_str(temp_input_str)) { + ret.push_back(expanded_str); + } + /* Flip to '1' and go recursively */ + temp_input_str[i] = '1'; + for (const std::string& expanded_str : expand_dont_care_bin_str(temp_input_str)) { + ret.push_back(expanded_str); + } + break; + } + } + + return ret; +} + } /* end namespace openfpga */ diff --git a/libopenfpga/libopenfpgautil/src/openfpga_decode.h b/libopenfpga/libopenfpgautil/src/openfpga_decode.h index 85809fee2..2e92bb9f4 100644 --- a/libopenfpga/libopenfpgautil/src/openfpga_decode.h +++ b/libopenfpga/libopenfpgautil/src/openfpga_decode.h @@ -6,6 +6,7 @@ *******************************************************************/ #include #include +#include /******************************************************************** * Function declaration @@ -13,6 +14,11 @@ /* namespace openfpga begins */ namespace openfpga { +/**************************************** + * Constants + */ +constexpr char DONT_CARE_CHAR = 'x'; + std::vector ito1hot_vec(const size_t& in_int, const size_t& bin_len); @@ -24,6 +30,8 @@ std::vector itobin_charvec(const size_t& in_int, size_t bintoi_charvec(const std::vector& bin); +std::vector expand_dont_care_bin_str(const std::string& input_str); + } /* namespace openfpga ends */ #endif diff --git a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp index a327e9c4e..cf028b084 100644 --- a/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp +++ b/openfpga/src/fpga_bitstream/build_fabric_bitstream.cpp @@ -285,6 +285,7 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b const ConfigRegionId& config_region, const std::vector& parent_modules, const std::vector& addr_code, + const char& 
bitstream_dont_care_char, FabricBitstream& fabric_bitstream, FabricBitRegionId& fabric_bitstream_region) { @@ -418,7 +419,8 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b const ModulePortId& child_addr_port_id = module_manager.find_module_port(child_module, std::string(DECODER_ADDRESS_PORT_NAME)); const BasicPort& child_addr_port = module_manager.module_port(child_module, child_addr_port_id); if (0 < max_child_addr_code_size - child_addr_port.get_width()) { - std::vector dummy_codes(max_child_addr_code_size - child_addr_port.get_width(), '0'); + /* Deposit don't care state for the dummy bits */ + std::vector dummy_codes(max_child_addr_code_size - child_addr_port.get_width(), bitstream_dont_care_char); child_addr_code.insert(child_addr_code.begin(), dummy_codes.begin(), dummy_codes.end()); } } @@ -430,6 +432,7 @@ void rec_build_module_fabric_dependent_frame_bitstream(const BitstreamManager& b config_region, child_modules, child_addr_code, + bitstream_dont_care_char, fabric_bitstream, fabric_bitstream_region); } @@ -583,6 +586,12 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc fabric_bitstream.reserve_bits(bitstream_manager.num_bits()); fabric_bitstream.set_address_length(addr_port_info.get_width()); + /* Avoid using don't care bits if there is only one region */ + char bitstream_dont_care_char = DONT_CARE_CHAR; + if (1 == module_manager.regions(top_module).size()) { + bitstream_dont_care_char = '0'; + } + /* Find the maximum decoder address among all the configurable regions */ size_t max_decoder_addr_size = 0; for (const ConfigRegionId& config_region : module_manager.regions(top_module)) { @@ -624,7 +633,7 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc ModulePortId decoder_addr_port_id = module_manager.find_module_port(decoder_module, DECODER_ADDRESS_PORT_NAME); BasicPort decoder_addr_port = module_manager.module_port(decoder_module, decoder_addr_port_id);
VTR_ASSERT(max_decoder_addr_size >= decoder_addr_port.get_width()); - std::vector idle_addr_bits(max_decoder_addr_size - decoder_addr_port.get_width(), '0'); + std::vector idle_addr_bits(max_decoder_addr_size - decoder_addr_port.get_width(), bitstream_dont_care_char); FabricBitRegionId fabric_bitstream_region = fabric_bitstream.add_region(); rec_build_module_fabric_dependent_frame_bitstream(bitstream_manager, @@ -634,6 +643,7 @@ void build_module_fabric_dependent_bitstream(const ConfigProtocol& config_protoc config_region, std::vector(1, top_module), idle_addr_bits, + bitstream_dont_care_char, fabric_bitstream, fabric_bitstream_region); } diff --git a/openfpga/src/fpga_bitstream/fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/fabric_bitstream.cpp index 5949a0bfb..6ea5c8168 100644 --- a/openfpga/src/fpga_bitstream/fabric_bitstream.cpp +++ b/openfpga/src/fpga_bitstream/fabric_bitstream.cpp @@ -68,7 +68,7 @@ std::vector FabricBitstream::bit_address(const FabricBitId& bit_id) const VTR_ASSERT(true == valid_bit_id(bit_id)); VTR_ASSERT(true == use_address_); - return itobin_charvec(bit_addresses_[bit_id], address_length_); + return bit_addresses_[bit_id]; } std::vector FabricBitstream::bit_bl_address(const FabricBitId& bit_id) const { @@ -81,7 +81,7 @@ std::vector FabricBitstream::bit_wl_address(const FabricBitId& bit_id) con VTR_ASSERT(true == use_address_); VTR_ASSERT(true == use_wl_address_); - return itobin_charvec(bit_wl_addresses_[bit_id], wl_address_length_); + return bit_wl_addresses_[bit_id]; } char FabricBitstream::bit_din(const FabricBitId& bit_id) const { @@ -122,6 +122,16 @@ FabricBitId FabricBitstream::add_bit(const ConfigBitId& config_bit_id) { num_bits_++; config_bit_ids_.push_back(config_bit_id); + if (true == use_address_) { + bit_addresses_.emplace_back(); + bit_dins_.emplace_back(); + + if (true == use_wl_address_) { + bit_wl_addresses_.emplace_back(); + } + } + + return bit; } @@ -130,7 +140,7 @@ void FabricBitstream::set_bit_address(const 
FabricBitId& bit_id, VTR_ASSERT(true == valid_bit_id(bit_id)); VTR_ASSERT(true == use_address_); VTR_ASSERT(address_length_ == address.size()); - bit_addresses_[bit_id] = bintoi_charvec(address); + bit_addresses_[bit_id] = address; } void FabricBitstream::set_bit_bl_address(const FabricBitId& bit_id, @@ -144,7 +154,7 @@ void FabricBitstream::set_bit_wl_address(const FabricBitId& bit_id, VTR_ASSERT(true == use_address_); VTR_ASSERT(true == use_wl_address_); VTR_ASSERT(wl_address_length_ == address.size()); - bit_wl_addresses_[bit_id] = bintoi_charvec(address); + bit_wl_addresses_[bit_id] = address; } void FabricBitstream::set_bit_din(const FabricBitId& bit_id, diff --git a/openfpga/src/fpga_bitstream/fabric_bitstream.h b/openfpga/src/fpga_bitstream/fabric_bitstream.h index 0761ff7c2..1c41250d3 100644 --- a/openfpga/src/fpga_bitstream/fabric_bitstream.h +++ b/openfpga/src/fpga_bitstream/fabric_bitstream.h @@ -208,8 +208,8 @@ class FabricBitstream { * * We use a 2-element array, as we may have a BL address and a WL address */ - vtr::vector bit_addresses_; - vtr::vector bit_wl_addresses_; + vtr::vector> bit_addresses_; + vtr::vector> bit_wl_addresses_; /* Data input (Din) bits: this is designed for memory decoders */ vtr::vector bit_dins_; diff --git a/openfpga/src/fpga_verilog/verilog_decoders.cpp b/openfpga/src/fpga_verilog/verilog_decoders.cpp index b0259f247..5804a918e 100644 --- a/openfpga/src/fpga_verilog/verilog_decoders.cpp +++ b/openfpga/src/fpga_verilog/verilog_decoders.cpp @@ -295,14 +295,17 @@ void print_verilog_arch_decoder_module(std::fstream& fp, /* Print the truth table of this decoder */ /* Internal logics */ /* Early exit: Corner case for data size = 1 the logic is very simple: - * data = addr; - * data_inv = ~data_inv + * when enable is '1' and address is '0' + * data_out is driven by '1' + * else data_out is driven by '0' */ if (1 == data_size) { fp << "always@(" << generate_verilog_port(VERILOG_PORT_CONKT, addr_port); fp << " or " <<
generate_verilog_port(VERILOG_PORT_CONKT, enable_port); fp << ") begin" << std::endl; - fp << "\tif (" << generate_verilog_port(VERILOG_PORT_CONKT, enable_port) << " == 1'b1) begin" << std::endl; + fp << "\tif ((" << generate_verilog_port(VERILOG_PORT_CONKT, enable_port) << " == 1'b1) && ("; + fp << generate_verilog_port(VERILOG_PORT_CONKT, addr_port) << " == 1'b0))"; + fp << " begin" << std::endl; fp << "\t\t" << generate_verilog_port_constant_values(data_port, std::vector(1, 1)) << ";" << std::endl; fp << "\t" << "end else begin" << std::endl; fp << "\t\t" << generate_verilog_port_constant_values(data_port, std::vector(1, 0)) << ";" << std::endl; diff --git a/openfpga/src/utils/fabric_bitstream_utils.cpp b/openfpga/src/utils/fabric_bitstream_utils.cpp index fa0fd9e84..b1beeb00d 100644 --- a/openfpga/src/utils/fabric_bitstream_utils.cpp +++ b/openfpga/src/utils/fabric_bitstream_utils.cpp @@ -11,6 +11,9 @@ #include "vtr_assert.h" #include "vtr_log.h" +/* Headers from openfpgautil library */ +#include "openfpga_decode.h" + #include "fabric_bitstream_utils.h" /* begin namespace openfpga */ @@ -85,17 +88,20 @@ std::map> build_frame_based_fabric_bitstream_by_a addr_str.push_back(addr_bit); } - /* Place the config bit */ - auto result = fabric_bits_by_addr.find(addr_str); - if (result == fabric_bits_by_addr.end()) { - /* This is a new bit, resize the vector to the number of regions - * and deposit '0' to all the bits - */ - fabric_bits_by_addr[addr_str] = std::vector(fabric_bitstream.regions().size(), false); - fabric_bits_by_addr[addr_str][size_t(region)] = fabric_bitstream.bit_din(bit_id); - } else { - VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); - result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); + /* Expand all the don't care bits */ + for (const std::string& curr_addr_str : expand_dont_care_bin_str(addr_str)) { + /* Place the config bit */ + auto result = fabric_bits_by_addr.find(curr_addr_str); + if (result == fabric_bits_by_addr.end()) { 
+ /* This is a new bit, resize the vector to the number of regions + * and deposit '0' to all the bits + */ + fabric_bits_by_addr[curr_addr_str] = std::vector(fabric_bitstream.regions().size(), false); + fabric_bits_by_addr[curr_addr_str][size_t(region)] = fabric_bitstream.bit_din(bit_id); + } else { + VTR_ASSERT_SAFE(result != fabric_bits_by_addr.end()); + result->second[size_t(region)] = fabric_bitstream.bit_din(bit_id); + } } } } From 940eb937f2b6796c76ee9a1c448d2c41cfd4ade9 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 21:21:11 -0600 Subject: [PATCH 08/11] [Test] add multi-region configuration frame test cases to CI --- .travis/basic_reg_test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis/basic_reg_test.sh b/.travis/basic_reg_test.sh index e6e04412e..eac3b3e48 100755 --- a/.travis/basic_reg_test.sh +++ b/.travis/basic_reg_test.sh @@ -36,6 +36,8 @@ python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/config python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/configuration_frame_use_set --debug --show_thread_logs python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/configuration_frame_use_setb --debug --show_thread_logs python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/configuration_frame_use_set_reset --debug --show_thread_logs +python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/multi_region_configuration_frame --debug --show_thread_logs +python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/smart_fast_multi_region_configuration_frame --debug --show_thread_logs python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/preconfig_testbench/configuration_frame --debug --show_thread_logs echo -e "Testing memory bank configuration protocol of a K4N4 FPGA"; @@ -97,4 +99,3 @@ echo -e "Testing K4N5 with pattern based local routing"; python3 openfpga_flow/scripts/run_fpga_task.py 
basic_tests/k4_series/k4n5_pattern_local_routing --debug --show_thread_logs end_section "OpenFPGA.TaskTun" -python3 openfpga_flow/scripts/run_fpga_task.py basic_tests/full_testbench/multi_region_memory_bank --debug --show_thread_logs From 7e940980e16f289f609c61da893da76f0b0c9a04 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 21:52:01 -0600 Subject: [PATCH 09/11] [Doc] Update documentation about configuration regions for frame-based protocol --- docs/source/manual/arch_lang/config_protocol.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/source/manual/arch_lang/config_protocol.rst b/docs/source/manual/arch_lang/config_protocol.rst index d52399cdf..44549d7c0 100644 --- a/docs/source/manual/arch_lang/config_protocol.rst +++ b/docs/source/manual/arch_lang/config_protocol.rst @@ -41,6 +41,13 @@ Template - ``memory_bank`` requires a circuit model type of ``sram`` - ``standalone`` requires a circuit model type of ``sram`` +.. option:: num_regions="" + + Specify the number of configuration regions to be used across the fabrics. By default, there will be only 1 configuration region. Each configuration region contains independent configuration protocols, but the whole fabric should employ the same type of configuration protocols. For example, an FPGA fabric consists of 4 configuration regions, each of which includes a configuration chain. The more configuration chains are used, the faster the configuration will be, but at the cost of more I/Os in the FPGA fabrics. The organization of each configurable region can be customized through the fabric key (see details in :ref:`fabric_key`). + + .. warning:: Currently, multiple configuration regions are not applicable to the ``standalone`` configuration protocol. + + Configuration Chain Example ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The following XML code describes a scan-chain circuitry to configure the core logic of FPGA, as illustrated in :numref:`fig_ccff_fpga`.
@@ -60,9 +67,6 @@ It will use the circuit model defined in :numref:`fig_ccff`. Example of a configuration chain to program core logic of a FPGA -.. option:: num_regions="" - - Specify the number of configuration chains to be used across the fabrics. By default, it will be only 1 configuration chain. The more configuration chain to be used, the fast configuration runtime will be, but at the cost of more I/Os in the FPGA fabrics. The organization of each configurable region can be customized through the fabric key (see details in :ref:`fabric_key`). .. figure:: figures/multi_region_config_chains.png :scale: 100% @@ -113,6 +117,8 @@ When the decoder of sub block, e.g., the LUT, is enabled, each memory cells can .. warning:: Please do NOT add inverted Bit-Line and Word-Line inputs. It is not supported yet! +When multiple configuration regions are applied, the configuration frames will be grouped into different configuration regions. Each region has a separate data input bus and dedicated address decoders. As such, the configuration frame groups can be programmed in parallel. + Memory bank Example ~~~~~~~~~~~~~~~~~~~ The following XML code describes a memory-bank circuitry to configure the core logic of FPGA, as illustrated in :numref:`fig_memory_bank`.
From 6b25cf720d39a90c86d56e19333574a37fe8d316 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 22:09:48 -0600 Subject: [PATCH 10/11] [Tool] Comment on the memory efficiency on fabric bitstream address storage --- openfpga/src/fpga_bitstream/fabric_bitstream.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openfpga/src/fpga_bitstream/fabric_bitstream.h b/openfpga/src/fpga_bitstream/fabric_bitstream.h index 1c41250d3..028452a58 100644 --- a/openfpga/src/fpga_bitstream/fabric_bitstream.h +++ b/openfpga/src/fpga_bitstream/fabric_bitstream.h @@ -207,6 +207,10 @@ class FabricBitstream { * to the configuration protocol directly * * We use a 2-element array, as we may have a BL address and a WL address + * + * TODO: use nested vector may cause large memory footprint + * when bitstream size increases + * NEED TO THINK ABOUT A COMPACT MODELING */ vtr::vector> bit_addresses_; vtr::vector> bit_wl_addresses_; From be7f7592ae6d0beb9936e9da56d74671dbbe58e8 Mon Sep 17 00:00:00 2001 From: tangxifan Date: Fri, 30 Oct 2020 22:13:28 -0600 Subject: [PATCH 11/11] [Doc] Update documentation about don't care bit in frame address --- .../manual/fpga_bitstream/fabric_dependent_bitstream.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/manual/fpga_bitstream/fabric_dependent_bitstream.rst b/docs/source/manual/fpga_bitstream/fabric_dependent_bitstream.rst index 825760a5e..5bdb91145 100644 --- a/docs/source/manual/fpga_bitstream/fabric_dependent_bitstream.rst +++ b/docs/source/manual/fpga_bitstream/fabric_dependent_bitstream.rst @@ -47,6 +47,8 @@ The information depends on the type of configuration procotol. .. option:: frame_based Multiple lines will be included, each of which is organized as
. + Note that the address may include don't care bits, which are denoted as ``x``. + OpenFPGA automatically converts don't care bits to logic ``0`` when generating testbenches. For example .. code-block:: xml @@ -97,10 +99,12 @@ Other information may depend on the type of configuration procotol. - ``frame``: frame address information + .. note:: The frame address may include don't care bits, which are denoted as ``x``. + A quick example: .. code-block:: xml - +