Merge pull request #1259 from chungshien/openfpga-issue-1256

Address issue 1256
tangxifan 2023-08-07 18:18:14 -07:00 committed by GitHub
commit 1064520103
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 587 additions and 49 deletions


@@ -200,6 +200,11 @@ ShellCommandId add_write_fabric_bitstream_command_template(
"Keep don't care bits in bitstream file; If not enabled, don't care bits "
"are converted to logic '0' or '1'");
/* Add an option '--wl_decremental_order' */
shell_cmd.add_option(
"wl_decremental_order", false,
"Generate bitstream in WL decremental addressing order if supported");
/* Add an option '--no_time_stamp' */
shell_cmd.add_option("no_time_stamp", false,
"Do not print time stamp in output files");


@@ -91,6 +91,7 @@ int write_fabric_bitstream_template(const T& openfpga_ctx, const Command& cmd,
CommandOptionId opt_file_format = cmd.option("format");
CommandOptionId opt_fast_config = cmd.option("fast_configuration");
CommandOptionId opt_keep_dont_care_bits = cmd.option("keep_dont_care_bits");
CommandOptionId opt_wl_decremental_order = cmd.option("wl_decremental_order");
CommandOptionId opt_no_time_stamp = cmd.option("no_time_stamp");
/* Write fabric bitstream if required */
@@ -127,6 +128,7 @@ int write_fabric_bitstream_template(const T& openfpga_ctx, const Command& cmd,
cmd_context.option_value(cmd, opt_file),
cmd_context.option_enable(cmd, opt_fast_config),
cmd_context.option_enable(cmd, opt_keep_dont_care_bits),
!cmd_context.option_enable(cmd, opt_wl_decremental_order),
!cmd_context.option_enable(cmd, opt_no_time_stamp),
cmd_context.option_enable(cmd, opt_verbose));
}


@@ -176,6 +176,11 @@ static void rec_build_module_fabric_dependent_ql_memory_bank_regional_bitstream(
bitstream_manager.block_bits(parent_block)) {
FabricBitId fabric_bit = fabric_bitstream.add_bit(config_bit);
/*
If both BL and WL protocols are flatten, we use a new way of
storing information in fabric_bitstream. This avoids the previously
high memory usage and is much faster to process.
*/
/* The BL address to be decoded depends on the protocol
* - flatten BLs: use 1-hot decoding
* - BL decoders: fully encoded
@@ -183,38 +188,57 @@ static void rec_build_module_fabric_dependent_ql_memory_bank_regional_bitstream(
*/
size_t cur_bl_index = bl_start_index_per_tile.at(tile_coord.x()) +
cur_mem_index[tile_coord] % num_bls_cur_tile;
std::vector<char> bl_addr_bits_vec;
if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
bl_addr_bits_vec = itobin_charvec(cur_bl_index, bl_addr_size);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() ||
BLWL_PROTOCOL_SHIFT_REGISTER ==
config_protocol.bl_protocol_type()) {
bl_addr_bits_vec =
ito1hot_charvec(cur_bl_index, bl_addr_size, DONT_CARE_CHAR);
if (BLWL_PROTOCOL_FLATTEN != config_protocol.bl_protocol_type() ||
BLWL_PROTOCOL_FLATTEN != config_protocol.wl_protocol_type()) {
// This is the old way
// We only fall back to this resource-wasting storage if
// either protocol is not flatten
std::vector<char> bl_addr_bits_vec;
if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
bl_addr_bits_vec = itobin_charvec(cur_bl_index, bl_addr_size);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() ||
BLWL_PROTOCOL_SHIFT_REGISTER ==
config_protocol.bl_protocol_type()) {
bl_addr_bits_vec =
ito1hot_charvec(cur_bl_index, bl_addr_size, DONT_CARE_CHAR);
}
/* Set BL address */
fabric_bitstream.set_bit_bl_address(
fabric_bit, bl_addr_bits_vec,
BLWL_PROTOCOL_DECODER != config_protocol.bl_protocol_type());
}
/* Find WL address */
size_t cur_wl_index =
wl_start_index_per_tile.at(tile_coord.y()) +
std::floor(cur_mem_index[tile_coord] / num_bls_cur_tile);
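/* Worked example (illustrative, not from this commit): with
 * num_bls_cur_tile = 4 and cur_mem_index[tile_coord] = 10, this bit maps to
 * BL offset 10 % 4 = 2 and WL offset 10 / 4 = 2 within the tile */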
std::vector<char> wl_addr_bits_vec;
if (BLWL_PROTOCOL_DECODER == config_protocol.wl_protocol_type()) {
wl_addr_bits_vec = itobin_charvec(cur_wl_index, wl_addr_size);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type() ||
BLWL_PROTOCOL_SHIFT_REGISTER ==
config_protocol.wl_protocol_type()) {
wl_addr_bits_vec = ito1hot_charvec(cur_wl_index, wl_addr_size);
if (BLWL_PROTOCOL_FLATTEN != config_protocol.bl_protocol_type() ||
BLWL_PROTOCOL_FLATTEN != config_protocol.wl_protocol_type()) {
// This is the old way
// We only fall back to this resource-wasting storage if
// either protocol is not flatten
std::vector<char> wl_addr_bits_vec;
if (BLWL_PROTOCOL_DECODER == config_protocol.wl_protocol_type()) {
wl_addr_bits_vec = itobin_charvec(cur_wl_index, wl_addr_size);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type() ||
BLWL_PROTOCOL_SHIFT_REGISTER ==
config_protocol.wl_protocol_type()) {
wl_addr_bits_vec = ito1hot_charvec(cur_wl_index, wl_addr_size);
}
/* Set WL address */
fabric_bitstream.set_bit_wl_address(
fabric_bit, wl_addr_bits_vec,
BLWL_PROTOCOL_DECODER != config_protocol.wl_protocol_type());
}
if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type()) {
// New way of storing the information compactly
// Only for the flatten protocol (shift register could easily be supported
// as well); the decoder protocol needs further assessment
fabric_bitstream.set_memory_bank_info(
fabric_bit, fabric_bitstream_region, cur_bl_index, cur_wl_index,
bl_addr_size, wl_addr_size, bitstream_manager.bit_value(config_bit));
}
/* Set BL address */
fabric_bitstream.set_bit_bl_address(
fabric_bit, bl_addr_bits_vec,
BLWL_PROTOCOL_DECODER != config_protocol.bl_protocol_type());
/* Set WL address */
fabric_bitstream.set_bit_wl_address(
fabric_bit, wl_addr_bits_vec,
BLWL_PROTOCOL_DECODER != config_protocol.wl_protocol_type());
/* Set data input */
fabric_bitstream.set_bit_din(fabric_bit,


@@ -11,6 +11,128 @@
/* begin namespace openfpga */
namespace openfpga {
/**************************************************
* FabricBitstreamMemoryBank
*************************************************/
void FabricBitstreamMemoryBank::add_bit(const fabric_size_t& bit_id,
const fabric_size_t& region_id,
const fabric_size_t& bl,
const fabric_size_t& wl,
const fabric_size_t& bl_addr_size,
const fabric_size_t& wl_addr_size,
bool bit) {
// Fabric bits are added sequentially and each bit is unique
VTR_ASSERT((size_t)(bit_id) == fabric_bit_datas.size());
// Regions are added sequentially, but a region is not unique from the
// fabric bit perspective
VTR_ASSERT((size_t)(region_id) <= blwl_lengths.size());
if ((size_t)(region_id) == blwl_lengths.size()) {
// Add the region if this is the first time we see it
blwl_lengths.push_back(fabric_blwl_length(bl_addr_size, wl_addr_size));
} else {
// Otherwise, if the region was added before, its sizes must stay consistent
VTR_ASSERT(blwl_lengths[region_id].bl == bl_addr_size);
VTR_ASSERT(blwl_lengths[region_id].wl == wl_addr_size);
}
// The BL/WL index must be within respective length
VTR_ASSERT(bl < blwl_lengths[region_id].bl);
VTR_ASSERT(wl < blwl_lengths[region_id].wl);
// We might not need to track the raw data at all,
// but since it does not use much memory, we track it anyway
fabric_bit_datas.push_back(fabric_bit_data((fabric_size_t)(size_t)(region_id),
(fabric_size_t)(bl),
(fabric_size_t)(wl), bit));
// This is the real compact data
VTR_ASSERT(datas.size() == masks.size());
while ((size_t)(region_id) >= datas.size()) {
datas.emplace_back();
masks.emplace_back();
}
VTR_ASSERT(datas[region_id].size() == masks[region_id].size());
while ((size_t)(wl) >= datas[region_id].size()) {
datas[region_id].push_back(std::vector<uint8_t>((bl_addr_size + 7) / 8, 0));
masks[region_id].push_back(std::vector<uint8_t>((bl_addr_size + 7) / 8, 0));
}
// The same unique config bit cannot be set twice
VTR_ASSERT((masks[region_id][wl][bl >> 3] & (1 << (bl & 7))) == 0);
if (bit) {
// Mark the data value if the bit (or din) is true
datas[region_id][wl][bl >> 3] |= (1 << (bl & 7));
}
// Mark the mask to indicate this bit is used
masks[region_id][wl][bl >> 3] |= (1 << (bl & 7));
}
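// Illustrative sketch (not part of this commit): the packed-row addressing
// used above, in isolation. 'row' stands in for datas[region][wl] or
// masks[region][wl]; the helper name is hypothetical.
static inline bool example_bl_bit_is_set(const std::vector<uint8_t>& row,
                                         size_t bl) {
  // (bl >> 3) selects the byte, (1 << (bl & 7)) selects the bit inside it
  return (row[bl >> 3] & (1 << (bl & 7))) != 0;
}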
void FabricBitstreamMemoryBank::fast_configuration(
const bool& fast, const bool& bit_value_to_skip) {
for (auto& wls : wls_to_skip) {
wls.clear();
}
wls_to_skip.clear();
for (size_t region = 0; region < datas.size(); region++) {
wls_to_skip.emplace_back();
if (fast) {
for (fabric_size_t wl = 0; wl < blwl_lengths[region].wl; wl++) {
VTR_ASSERT((size_t)(wl) < datas[region].size());
bool skip_wl = true;
for (fabric_size_t bl = 0; bl < blwl_lengths[region].bl && skip_wl;
bl++) {
// Only check bits that are being used (marked in the mask);
// otherwise the bit is a don't care and can be skipped
if (masks[region][wl][bl >> 3] & (1 << (bl & 7))) {
if (datas[region][wl][bl >> 3] & (1 << (bl & 7))) {
// If bit_value_to_skip=true and the din (recorded in
// datas) is also 1, then we can skip
skip_wl = bit_value_to_skip;
} else {
skip_wl = !bit_value_to_skip;
}
}
}
if (skip_wl) {
// Record that we will skip this WL for this region
wls_to_skip[region].push_back(wl);
}
}
}
}
}
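// Illustrative helper (not part of this commit): the skip rule above as a
// single predicate. A WL can be skipped iff every used (masked) bit holds
// bit_value_to_skip; the helper name is hypothetical.
static bool example_wl_can_be_skipped(const std::vector<uint8_t>& data,
                                      const std::vector<uint8_t>& mask,
                                      size_t num_bls, bool bit_value_to_skip) {
  for (size_t bl = 0; bl < num_bls; bl++) {
    if (mask[bl >> 3] & (1 << (bl & 7))) {
      bool din = (data[bl >> 3] & (1 << (bl & 7))) != 0;
      if (din != bit_value_to_skip) {
        return false;  // one mismatching bit forces us to program this WL
      }
    }
  }
  return true;  // all used bits already hold the value we skip
}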
fabric_size_t FabricBitstreamMemoryBank::get_longest_effective_wl_count()
const {
// This function checks the effective WL count of every region,
// where effective WLs are those we still need to program after
// considering fast configuration, and returns the longest count
fabric_size_t longest_wl = 0;
for (size_t region = 0; region < datas.size(); region++) {
VTR_ASSERT((size_t)(region) < wls_to_skip.size());
fabric_size_t current_wl =
(fabric_size_t)(datas[region].size() - wls_to_skip[region].size());
if (current_wl > longest_wl) {
longest_wl = current_wl;
}
}
return longest_wl;
}
fabric_size_t FabricBitstreamMemoryBank::get_total_bl_addr_size() const {
// Simply sum up the BL address sizes of all regions
fabric_size_t bl = 0;
for (size_t region = 0; region < datas.size(); region++) {
bl += blwl_lengths[region].bl;
}
return bl;
}
fabric_size_t FabricBitstreamMemoryBank::get_total_wl_addr_size() const {
// Simply sum up the WL address sizes of all regions
fabric_size_t wl = 0;
for (size_t region = 0; region < datas.size(); region++) {
wl += blwl_lengths[region].wl;
}
return wl;
}
/**************************************************
* Public Constructor
*************************************************/
@@ -129,6 +251,15 @@ bool FabricBitstream::use_address() const { return use_address_; }
bool FabricBitstream::use_wl_address() const { return use_wl_address_; }
const FabricBitstreamMemoryBank& FabricBitstream::memory_bank_info(
const bool& fast, const bool& bit_value_to_skip) const {
VTR_ASSERT(true == use_address_);
VTR_ASSERT(true == use_wl_address_);
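// Note: fast_configuration() refreshes the cached wls_to_skip data, so this
// const accessor casts away constness before returning the member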
(const_cast<FabricBitstreamMemoryBank*>(&memory_bank_data_))
->fast_configuration(fast, bit_value_to_skip);
return memory_bank_data_;
}
/******************************************************************************
* Public Mutators
******************************************************************************/
@@ -243,6 +374,27 @@ void FabricBitstream::set_bl_address_length(const size_t& length) {
set_address_length(length);
}
void FabricBitstream::set_memory_bank_info(const FabricBitId& bit_id,
const FabricBitRegionId& region_id,
const size_t& bl, const size_t& wl,
const size_t& bl_addr_size,
const size_t& wl_addr_size,
bool bit) {
// The bit must be a valid one
// We only support this in protocols that use BL and WL addresses
VTR_ASSERT(true == valid_bit_id(bit_id));
VTR_ASSERT(true == use_address_);
VTR_ASSERT(true == use_wl_address_);
VTR_ASSERT(bl_addr_size);
VTR_ASSERT(wl_addr_size);
// All the basic checks have passed; we can add the data into
// memory_bank_data_
memory_bank_data_.add_bit(
(fabric_size_t)(size_t)(bit_id), (fabric_size_t)(size_t)(region_id),
(fabric_size_t)(bl), (fabric_size_t)(wl), (fabric_size_t)(bl_addr_size),
(fabric_size_t)(wl_addr_size), bit);
}
void FabricBitstream::set_use_wl_address(const bool& enable) {
/* Add a lock: this can only be modified while the number of bits is zero */
if (0 == num_bits_) {


@@ -41,6 +41,85 @@
/* begin namespace openfpga */
namespace openfpga {
// uint32_t (maximum of ~4 billion) is good enough: with both BL and WL,
// their combination holds up to 18 quintillion configuration bits (+ don't
// cares)
typedef uint32_t fabric_size_t;
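// (Worked arithmetic, illustrative: 2^32 BLs x 2^32 WLs = 2^64, i.e. about
// 1.8e19, the "18 quintillion" configuration bits mentioned above)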
struct fabric_bit_data {
fabric_bit_data(fabric_size_t r, fabric_size_t b, fabric_size_t w, bool bi)
: region(r), bl(b), wl(w), bit(bi) {}
const fabric_size_t region = 0;
const fabric_size_t bl = 0;
const fabric_size_t wl = 0;
const bool bit = false;
};
struct fabric_blwl_length {
fabric_blwl_length(fabric_size_t b, fabric_size_t w) : bl(b), wl(w) {}
const fabric_size_t bl = 0;
const fabric_size_t wl = 0;
};
/*
This struct arranges the memory bank database in a compact way
*/
struct FabricBitstreamMemoryBank {
void add_bit(const fabric_size_t& bit_id, const fabric_size_t& region_id,
const fabric_size_t& bl, const fabric_size_t& wl,
const fabric_size_t& bl_addr_size,
const fabric_size_t& wl_addr_size, bool bit);
void fast_configuration(const bool& fast, const bool& bit_value_to_skip);
fabric_size_t get_longest_effective_wl_count() const;
fabric_size_t get_total_bl_addr_size() const;
fabric_size_t get_total_wl_addr_size() const;
/*************************
* All the databases (except fabric_bit_datas) are sorted by region:
* 1. The first vector layer is the region
* For datas and masks:
* 1. They are sorted by WL, hence the second layer is the WL
* 2. The last layer is the BL data, stored in a vector of uint8_t
* 3. Each uint8_t stores up to 8 configuration bits
**************************/
// Store the BL/WL address lengths of each region
std::vector<fabric_blwl_length> blwl_lengths;
// Store the raw data of each config bit ID. Not used by bitstream
// generation, only by XML generation
/*
fabric_bit_datas[Bit #0] = (region, bl, wl)
fabric_bit_datas[Bit #1] = (region, bl, wl)
fabric_bit_datas[Bit #2] = (region, bl, wl)
*/
std::vector<fabric_bit_data> fabric_bit_datas;
// A 100K LE FPGA only needs a few megabytes
/*
datas represents the din value (1 bit) of a given WL and BL
datas[region #0][wl #0] = std::vector<uint8_t> to represent BLs
where uint8_t #0 = MSB{ BL#7, BL#6, .... BL #1, BL #0 } LSB
where uint8_t #1 = MSB{ BL#15, BL#14, .... BL #9, BL #8 } LSB
datas[region #0][wl #1] = std::vector<uint8_t> to represent BLs
datas[region #0][wl #2] = std::vector<uint8_t> to represent BLs
......
datas[region #0][wl #n-1] = std::vector<uint8_t> to represent BLs
......
datas[region #1][wl #0] = std::vector<uint8_t> to represent BLs
datas[region #1][wl #1] = std::vector<uint8_t> to represent BLs
......
*/
std::vector<std::vector<std::vector<uint8_t>>> datas;
/*
masks has the same structure as datas,
but masks records which data bits are being used
for example:
if a mask's uint8_t #0 value = 0x41, it means that for this WL
a. BL #0 is being used, and its din is recorded in datas
b. BL #6 is being used, and its din is recorded in datas
c. The other BLs #1, 2, 3, 4, 5, 7 are don't care bits (not being used)
*/
std::vector<std::vector<std::vector<uint8_t>>> masks;
// This tracks which WLs to skip because of fast configuration
std::vector<std::vector<fabric_size_t>> wls_to_skip;
};
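/*
Usage sketch (illustrative, not from this commit), following the add_bit()
checks above: bit IDs are sequential, and every WL row up to the largest WL
used must exist before fast_configuration() walks the full WL range.
  FabricBitstreamMemoryBank bank;
  // bit #0 of region #0 at BL 5, WL 7; address sizes 16 (BL) and 8 (WL)
  bank.add_bit(0, 0, 5, 7, 16, 8, true);   // din = 1, rows WL 0..7 created
  bank.fast_configuration(true, false);    // skip WLs whose used bits are all 0
  // WLs 0..6 have no used bits -> skipped; WL 7 has din = 1 -> kept
  fabric_size_t n = bank.get_longest_effective_wl_count();  // n == 1
*/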
class FabricBitstream {
public: /* Type implementations */
/*
@@ -144,6 +223,9 @@ class FabricBitstream {
bool use_address() const;
bool use_wl_address() const;
const FabricBitstreamMemoryBank& memory_bank_info(
const bool& fast = false, const bool& bit_value_to_skip = false) const;
public: /* Public Mutators */
/* Reserve config bits */
void reserve_bits(const size_t& num_bits);
@@ -193,6 +275,18 @@ class FabricBitstream {
void set_address_length(const size_t& length);
void set_bl_address_length(const size_t& length);
/*
This sets the memory bank protocol in a more efficient way.
Building the full BL/WL bit database instead (a BL or WL address can be
thousands of bits long) means even a small device like 100K LE (small
compared to other vendors' offerings) might end up using tens of gigabytes.
*/
void set_memory_bank_info(const FabricBitId& bit_id,
const FabricBitRegionId& region_id,
const size_t& bl, const size_t& wl,
const size_t& bl_addr_size,
const size_t& wl_addr_size, bool bit);
/* Enable the use of WL-address related data
* Same principle as set_use_address()
*/
@@ -250,6 +344,9 @@ class FabricBitstream {
/* Data input (Din) bits: this is designed for memory decoders */
vtr::vector<FabricBitId, char> bit_dins_;
/* New way of dealing with memory bank protocol - fast and compact */
FabricBitstreamMemoryBank memory_bank_data_;
};
} /* end namespace openfpga */


@@ -245,6 +245,177 @@ static int write_memory_bank_flatten_fabric_bitstream_to_text_file(
return status;
}
/********************************************************************
* Write the fabric bitstream fitting a memory bank protocol
* to a plain text file using an efficient method
*
* The old function is write_memory_bank_flatten_fabric_bitstream_to_text_file()
*
* Compared to the original function, based on a 100K LE FPGA:
* 1. The original function took 600 seconds and needed a lot of memory
* 2. This new function needs only 1 second and 4 MBytes
*
* The old function only prints WLs in decremental order. This is not
* intentional; it is a consequence of the map-key ordering.
* In QL Memory Bank with flatten BL/WL, data is stored by WL address,
* where we use the WL string as the map key
* WL #0 --- "1000000000000 .... 0000"
* WL #1 --- "0100000000000 .... 0000"
* WL #n-1 --- "0000000000000 .... 0001"
* String-comparison-wise, WL #n-1 comes first and WL #0 comes last
* The WL sequence does not really matter, but some ordering is preferable,
* and relying on the map key cannot guarantee a deterministic one
*
* This new way of writing the fabric bitstream guarantees a 100%
* deterministic WL order: either incremental (default) or decremental
*
* Return:
* - 0 if succeed
* - 1 if critical errors occurred
*******************************************************************/
static int fast_write_memory_bank_flatten_fabric_bitstream_to_text_file(
std::fstream& fp, const bool& fast_configuration,
const bool& bit_value_to_skip, const FabricBitstream& fabric_bitstream,
const bool& keep_dont_care_bits, const bool& wl_incremental_order) {
int status = 0;
std::string dont_care_bit = "0";
if (keep_dont_care_bits) {
dont_care_bit = "x";
}
const FabricBitstreamMemoryBank& memory_bank =
fabric_bitstream.memory_bank_info(fast_configuration, bit_value_to_skip);
fabric_size_t longest_effective_wl_count =
memory_bank.get_longest_effective_wl_count();
/* Output information about how to interpret the bitstream */
fp << "// Bitstream length: " << longest_effective_wl_count << std::endl;
fp << "// Bitstream width (LSB -> MSB): ";
fp << "<bl_address " << memory_bank.get_total_bl_addr_size() << " bits>";
fp << "<wl_address " << memory_bank.get_total_wl_addr_size() << " bits>";
fp << std::endl;
// Step 1
// Initialize wl_indexes for every region
// The initialization depends on the ordering of the WLs
// It could either be 0 (if wl_incremental_order=true) or
// last WL index (if wl_incremental_order=false)
std::vector<fabric_size_t> wl_indexes;
for (size_t region = 0; region < memory_bank.datas.size(); region++) {
if (wl_incremental_order) {
wl_indexes.push_back(0);
} else {
wl_indexes.push_back(
(fabric_size_t)(memory_bank.datas[region].size() - 1));
}
}
// Step 2
// Loop through total WL count that we would like to configure
for (size_t wl_index = 0; wl_index < longest_effective_wl_count; wl_index++) {
// Step 3
// Write BL address
// We cascade all regions: 0, 1, 2 ...
for (size_t region = 0; region < memory_bank.datas.size(); region++) {
// Step 3a
// The WL configuration sequence differs from region to region,
// since the WLs to skip are not the same for each region
// If the current WL that we are about to program happens to be
// one of the WLs (stored in wls_to_skip) that we had determined
// to skip, we increment or decrement to the next one,
// depending on wl_incremental_order
const fabric_blwl_length& lengths = memory_bank.blwl_lengths[region];
fabric_size_t current_wl = wl_indexes[region];
while (std::find(memory_bank.wls_to_skip[region].begin(),
memory_bank.wls_to_skip[region].end(),
current_wl) != memory_bank.wls_to_skip[region].end()) {
// We would like to skip this
if (wl_incremental_order) {
wl_indexes[region]++;
} else {
wl_indexes[region]--;
}
current_wl = wl_indexes[region];
}
// Step 3b
// If the current WL is still within the valid range, we print the BLs
// Otherwise the index has either
// overflowed (wl_incremental_order=true) or
// underflowed (wrapped to the maximum value when wl_incremental_order=false)
// Since the index type is unsigned, an underflow to -1 shows up as an
// overflow as well
// On overflow/underflow, we just print don't care bits
if (current_wl < memory_bank.datas[region].size()) {
const std::vector<uint8_t>& data =
memory_bank.datas[region][current_wl];
const std::vector<uint8_t>& mask =
memory_bank.masks[region][current_wl];
// Step 3c
// The real code that prints the BL data we had stored
// mask tells you whether each BL is valid;
// for an invalid BL, we print a don't care bit
// data tells you the real din value
// (bl >> 3) - finds the byte index of the BL
// (1 << (bl & 7)) - finds the bit index of the BL
// within that byte
// ANDing both tells us whether that BL is set or unset
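// Worked example (illustrative): bl = 13 selects byte 13 >> 3 = 1 and
// bit mask 1 << (13 & 7) = 1 << 5 = 0x20 within that byte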
for (size_t bl = 0; bl < lengths.bl; bl++) {
if (mask[bl >> 3] & (1 << (bl & 7))) {
if (data[bl >> 3] & (1 << (bl & 7))) {
fp << "1";
} else {
fp << "0";
}
} else {
fp << dont_care_bit.c_str();
}
}
} else {
/* However, not all regions have an equal WL count; for those that are
* shorter, print the don't care bit for all BLs */
for (size_t bl = 0; bl < lengths.bl; bl++) {
fp << dont_care_bit.c_str();
}
}
}
// Step 4
// Write WL address
// We cascade all regions: 0, 1, 2 ...
for (size_t region = 0; region < memory_bank.datas.size(); region++) {
const fabric_blwl_length& lengths = memory_bank.blwl_lengths[region];
fabric_size_t current_wl = wl_indexes[region];
// Step 4a
// If the current WL is still within the valid range, we print the WL
// Otherwise it has overflowed/underflowed and we print don't care bits
if (current_wl < memory_bank.datas[region].size()) {
// Step 4b
// One hot printing
for (size_t wl_temp = 0; wl_temp < lengths.wl; wl_temp++) {
if (wl_temp == current_wl) {
fp << "1";
} else {
fp << "0";
}
}
// Step 4c
// Increment or decrement to the next WL depending on wl_incremental_order
if (wl_incremental_order) {
wl_indexes[region]++;
} else {
wl_indexes[region]--;
}
} else {
/* However, not all regions have an equal WL count; for those that are
* shorter, print the don't care bit for all WLs */
for (size_t wl_temp = 0; wl_temp < lengths.wl; wl_temp++) {
fp << dont_care_bit.c_str();
}
}
}
fp << std::endl;
}
return status;
}
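// Illustrative sketch (not part of this commit) of the map-key ordering
// issue described in the function comment above: one-hot WL strings used as
// std::map keys sort in reverse WL order. Assumes <map> is available; the
// function name is hypothetical.
static void example_map_key_wl_ordering() {
  std::map<std::string, size_t> wl_map;
  wl_map["100"] = 0;  // WL #0
  wl_map["010"] = 1;  // WL #1
  wl_map["001"] = 2;  // WL #2
  // String comparison orders "001" < "010" < "100", so iteration visits
  // WL #2, WL #1, WL #0: decremental, purely as a map-key side effect
  for (const auto& kv : wl_map) {
    VTR_LOG("WL string %s -> WL #%zu\n", kv.first.c_str(), kv.second);
  }
}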
/********************************************************************
* Write the fabric bitstream fitting a memory bank protocol
* to a plain text file
@@ -393,7 +564,8 @@ int write_fabric_bitstream_to_text_file(
const ConfigProtocol& config_protocol,
const FabricGlobalPortInfo& global_ports, const std::string& fname,
const bool& fast_configuration, const bool& keep_dont_care_bits,
const bool& include_time_stamp, const bool& verbose) {
const bool& wl_incremental_order, const bool& include_time_stamp,
const bool& verbose) {
/* Ensure that we have a valid file name */
if (true == fname.empty()) {
VTR_LOG_ERROR(
@@ -454,6 +626,14 @@ int write_fabric_bitstream_to_text_file(
if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
status = write_memory_bank_fabric_bitstream_to_text_file(
fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type()) {
// If both BL and WL protocols are flatten, use the new way to write the
// bitstream
status = fast_write_memory_bank_flatten_fabric_bitstream_to_text_file(
fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream,
keep_dont_care_bits, wl_incremental_order);
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type()) {
status = write_memory_bank_flatten_fabric_bitstream_to_text_file(
fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream,


@@ -27,7 +27,8 @@ int write_fabric_bitstream_to_text_file(
const ConfigProtocol& config_protocol,
const FabricGlobalPortInfo& global_ports, const std::string& fname,
const bool& fast_configuration, const bool& keep_dont_care_bits,
const bool& include_time_stamp, const bool& verbose);
const bool& wl_incremental_order, const bool& include_time_stamp,
const bool& verbose);
} /* end namespace openfpga */


@@ -71,7 +71,8 @@ static void write_fabric_bitstream_xml_file_head(
static int write_fabric_config_bit_to_xml_file(
std::fstream& fp, const BitstreamManager& bitstream_manager,
const FabricBitstream& fabric_bitstream, const FabricBitId& fabric_bit,
const e_config_protocol_type& config_type, const int& xml_hierarchy_depth) {
const e_config_protocol_type& config_type, bool fast_xml,
const int& xml_hierarchy_depth, std::string& bl_addr, std::string& wl_addr) {
if (false == valid_file_stream(fp)) {
return 1;
}
@@ -106,22 +107,60 @@
case CONFIG_MEM_STANDALONE:
case CONFIG_MEM_SCAN_CHAIN:
break;
case CONFIG_MEM_QL_MEMORY_BANK:
case CONFIG_MEM_MEMORY_BANK: {
/* Bit line address */
write_tab_to_file(fp, xml_hierarchy_depth + 1);
fp << "<bl address=\"";
for (const char& addr_bit : fabric_bitstream.bit_bl_address(fabric_bit)) {
fp << addr_bit;
}
fp << "\"/>\n";
case CONFIG_MEM_MEMORY_BANK:
case CONFIG_MEM_QL_MEMORY_BANK: {
if (fast_xml) {
// New way of printing the XML
// This is fast (less than 100 s) compared to the original 1300 s seen on
// a 100K LE FPGA
const FabricBitstreamMemoryBank& memory_bank =
fabric_bitstream.memory_bank_info();
/* Bit line address */
write_tab_to_file(fp, xml_hierarchy_depth + 1);
const fabric_bit_data& bit =
memory_bank.fabric_bit_datas[(size_t)(fabric_bit)];
const fabric_blwl_length& lengths =
memory_bank.blwl_lengths[bit.region];
if (bl_addr.size() == 0) {
VTR_ASSERT(wl_addr.size() == 0);
bl_addr.resize(lengths.bl);
wl_addr.resize(lengths.wl);
bl_addr.assign(lengths.bl, 'x');
wl_addr.assign(lengths.wl, '0');
} else {
VTR_ASSERT((fabric_size_t)(bl_addr.size()) == lengths.bl);
VTR_ASSERT((fabric_size_t)(wl_addr.size()) == lengths.wl);
}
fp << "<bl address=\"";
memset(&bl_addr[bit.bl], '1', 1);
fp << bl_addr.c_str();
memset(&bl_addr[bit.bl], 'x', 1);
fp << "\"/>\n";
/* Word line address */
write_tab_to_file(fp, xml_hierarchy_depth + 1);
fp << "<wl address=\"";
memset(&wl_addr[bit.wl], '1', 1);
fp << wl_addr.c_str();
memset(&wl_addr[bit.wl], '0', 1);
fp << "\"/>\n";
} else {
/* Bit line address */
write_tab_to_file(fp, xml_hierarchy_depth + 1);
fp << "<bl address=\"";
for (const char& addr_bit :
fabric_bitstream.bit_bl_address(fabric_bit)) {
fp << addr_bit;
}
fp << "\"/>\n";
write_tab_to_file(fp, xml_hierarchy_depth + 1);
fp << "<wl address=\"";
for (const char& addr_bit : fabric_bitstream.bit_wl_address(fabric_bit)) {
fp << addr_bit;
write_tab_to_file(fp, xml_hierarchy_depth + 1);
fp << "<wl address=\"";
for (const char& addr_bit :
fabric_bitstream.bit_wl_address(fabric_bit)) {
fp << addr_bit;
}
fp << "\"/>\n";
}
fp << "\"/>\n";
break;
}
case CONFIG_MEM_FRAME_BASED: {
@@ -156,13 +195,25 @@ static int write_fabric_regional_config_bit_to_xml_file(
std::fstream& fp, const BitstreamManager& bitstream_manager,
const FabricBitstream& fabric_bitstream,
const FabricBitRegionId& fabric_region,
const e_config_protocol_type& config_type, const int& xml_hierarchy_depth) {
const e_config_protocol_type& config_type, bool fast_xml,
const int& xml_hierarchy_depth) {
if (false == valid_file_stream(fp)) {
return 1;
}
int status = 0;
// Use a string to print, instead of printing char by char
// This is for the flatten BL/WL protocol
// You will find this much faster than printing char by char
// We do not need to rebuild the string for every BL/WL:
// the addresses are one-hot and sequential
// We start with all '0' (WL) or 'x' (BL)
// By setting '1' and resetting ('0' or 'x') at the appropriate bit position
// we can create each one-hot string much faster
// Using a 100K LE FPGA as an example: the old way needs 1300 seconds to
// write an 85 GB XML; the new way only needs 80 seconds to write an
// identical XML
std::string bl_addr = "";
std::string wl_addr = "";
write_tab_to_file(fp, xml_hierarchy_depth);
fp << "<region ";
fp << "id=\"";
@@ -170,14 +221,24 @@
fp << "\"";
fp << ">\n";
size_t bit_index = 0;
size_t total_bits = fabric_bitstream.region_bits(fabric_region).size();
size_t percentage = 0;
for (const FabricBitId& fabric_bit :
fabric_bitstream.region_bits(fabric_region)) {
status = write_fabric_config_bit_to_xml_file(
fp, bitstream_manager, fabric_bitstream, fabric_bit, config_type,
xml_hierarchy_depth + 1);
fast_xml, xml_hierarchy_depth + 1, bl_addr, wl_addr);
if (1 == status) {
return status;
}
// Misc: print the progress percentage
bit_index++;
size_t temp = (bit_index * 100) / total_bits;
if (temp != percentage) {
percentage = temp;
VTR_LOG(" Progress: %lu%%\r", percentage);
}
}
write_tab_to_file(fp, xml_hierarchy_depth);
@@ -231,6 +292,8 @@ int write_fabric_bitstream_to_xml_file(
for (const FabricBitRegionId& region : fabric_bitstream.regions()) {
status = write_fabric_regional_config_bit_to_xml_file(
fp, bitstream_manager, fabric_bitstream, region, config_protocol.type(),
BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type(),
xml_hierarchy_depth + 1);
if (1 == status) {
break;

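// Illustrative sketch (not part of this commit) of the set/print/reset trick
// described above: keep one pre-filled string and flip a single character per
// bit instead of rebuilding the one-hot address string every time. The
// function name is hypothetical.
static void example_print_one_hot_wl_addresses(std::fstream& fp,
                                               size_t wl_len) {
  std::string wl_addr(wl_len, '0');  // start with all '0'
  for (size_t wl = 0; wl < wl_len; wl++) {
    wl_addr[wl] = '1';               // set the single hot position
    fp << wl_addr.c_str() << "\n";
    wl_addr[wl] = '0';               // reset it for the next address
  }
}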

@@ -1061,6 +1061,17 @@ static size_t calculate_num_config_clock_cycles(
(float)full_num_config_clock_cycles -
1.));
}
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type()) {
// Only support the new fast way if both BL/WL protocols are flatten
// Based on a 100K LE FPGA, we were wasting a lot of time building
// MemoryBankFlattenFabricBitstream
// just to get the effective WL address size; a waste of resources
const FabricBitstreamMemoryBank& memory_bank =
fabric_bitstream.memory_bank_info(fast_configuration,
bit_value_to_skip);
num_config_clock_cycles =
1 + memory_bank.get_longest_effective_wl_count();
} else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type()) {
num_config_clock_cycles =
1 + build_memory_bank_flatten_fabric_bitstream(


@@ -565,9 +565,12 @@ static void print_verilog_full_testbench_ql_memory_bank_flatten_bitstream(
valid_file_stream(fp);
/* Reorganize the fabric bitstream by the same address across regions */
MemoryBankFlattenFabricBitstream fabric_bits_by_addr =
build_memory_bank_flatten_fabric_bitstream(
fabric_bitstream, fast_configuration, bit_value_to_skip);
// New way to get the effective WL address size.
// Based on a 100K LE FPGA, we were wasting a lot of time building
// MemoryBankFlattenFabricBitstream just to get its size(); a waste of
// resources
const FabricBitstreamMemoryBank& memory_bank =
fabric_bitstream.memory_bank_info(fast_configuration, bit_value_to_skip);
/* Feed address and data input pair one by one
* Note: the first cycle is reserved for programming reset
@@ -604,7 +607,7 @@
/* Define a constant for the bitstream length */
print_verilog_define_flag(fp, std::string(TOP_TB_BITSTREAM_LENGTH_VARIABLE),
fabric_bits_by_addr.size());
memory_bank.get_longest_effective_wl_count());
print_verilog_define_flag(fp, std::string(TOP_TB_BITSTREAM_WIDTH_VARIABLE),
bl_port_width + wl_port_width);