From 39934f9d16742a101216467c2d01e5b46d09f50f Mon Sep 17 00:00:00 2001
From: Chung Shien Chai <chungshien.chai@rapidsilicon.com>
Date: Thu, 20 Jul 2023 22:34:18 -0700
Subject: [PATCH] Address issue 1256

---
 .../openfpga_bitstream_command_template.h     |   5 +
 .../src/base/openfpga_bitstream_template.h    |   2 +
 .../build_fabric_bitstream_memory_bank.cpp    |  68 +++++---
 .../src/fpga_bitstream/fabric_bitstream.cpp   | 154 ++++++++++++++++++
 .../src/fpga_bitstream/fabric_bitstream.h     |  96 +++++++++++
 .../write_text_fabric_bitstream.cpp           | 147 ++++++++++++++++-
 .../write_text_fabric_bitstream.h             |   3 +-
 .../write_xml_fabric_bitstream.cpp            |  63 ++++++-
 .../fpga_verilog/verilog_top_testbench.cpp    |  13 ++
 .../verilog_top_testbench_memory_bank.cpp     |  14 +-
 10 files changed, 531 insertions(+), 34 deletions(-)
diff --git a/openfpga/src/base/openfpga_bitstream_command_template.h b/openfpga/src/base/openfpga_bitstream_command_template.h
index b11c058cd..bc781dcdc 100644
--- a/openfpga/src/base/openfpga_bitstream_command_template.h
+++ b/openfpga/src/base/openfpga_bitstream_command_template.h
@@ -200,6 +200,11 @@ ShellCommandId add_write_fabric_bitstream_command_template(
     "Keep don't care bits in bitstream file; If not enabled, don't care bits "
     "are converted to logic '0' or '1'");
 
+  /* Add an option '--wl_decremental_order' */
+  shell_cmd.add_option(
+    "wl_decremental_order", false,
+    "Generate bitstream in WL decremental addressing order if supported");
+
   /* Add an option '--no_time_stamp' */
   shell_cmd.add_option("no_time_stamp", false,
                        "Do not print time stamp in output files");
diff --git a/openfpga/src/base/openfpga_bitstream_template.h b/openfpga/src/base/openfpga_bitstream_template.h
index de01d1df9..196d9ef30 100644
--- a/openfpga/src/base/openfpga_bitstream_template.h
+++ b/openfpga/src/base/openfpga_bitstream_template.h
@@ -91,6 +91,7 @@ int write_fabric_bitstream_template(const T& openfpga_ctx, const Command& cmd,
   CommandOptionId opt_file_format = cmd.option("format");
   CommandOptionId opt_fast_config = cmd.option("fast_configuration");
   CommandOptionId opt_keep_dont_care_bits = cmd.option("keep_dont_care_bits");
+  CommandOptionId opt_wl_decremental_order = cmd.option("wl_decremental_order");
   CommandOptionId opt_no_time_stamp = cmd.option("no_time_stamp");
 
   /* Write fabric bitstream if required */
@@ -127,6 +128,7 @@ int write_fabric_bitstream_template(const T& openfpga_ctx, const Command& cmd,
       cmd_context.option_value(cmd, opt_file),
       cmd_context.option_enable(cmd, opt_fast_config),
       cmd_context.option_enable(cmd, opt_keep_dont_care_bits),
+      !cmd_context.option_enable(cmd, opt_wl_decremental_order),
       !cmd_context.option_enable(cmd, opt_no_time_stamp),
       cmd_context.option_enable(cmd, opt_verbose));
   }
diff --git a/openfpga/src/fpga_bitstream/build_fabric_bitstream_memory_bank.cpp b/openfpga/src/fpga_bitstream/build_fabric_bitstream_memory_bank.cpp
index 97fe97d90..1659e3b65 100644
--- a/openfpga/src/fpga_bitstream/build_fabric_bitstream_memory_bank.cpp
+++ b/openfpga/src/fpga_bitstream/build_fabric_bitstream_memory_bank.cpp
@@ -174,6 +174,11 @@ static void rec_build_module_fabric_dependent_ql_memory_bank_regional_bitstream(
        bitstream_manager.block_bits(parent_block)) {
     FabricBitId fabric_bit = fabric_bitstream.add_bit(config_bit);
 
+    /*
+      When both BL and WL protocols are flatten, the per-bit BL/WL address
+      vectors are not built; the compact memory-bank record stored through
+      set_memory_bank_info() is used instead, which saves memory and time
+    */
     /* The BL address to be decoded depends on the protocol
      * - flatten BLs: use 1-hot decoding
      * - BL decoders: fully encoded
@@ -181,38 +186,53 @@ static void rec_build_module_fabric_dependent_ql_memory_bank_regional_bitstream(
      */
     size_t cur_bl_index = bl_start_index_per_tile.at(tile_coord.x()) +
                           cur_mem_index[tile_coord] % num_bls_cur_tile;
-    std::vector<char> bl_addr_bits_vec;
-    if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
-      bl_addr_bits_vec = itobin_charvec(cur_bl_index, bl_addr_size);
-    } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() ||
-               BLWL_PROTOCOL_SHIFT_REGISTER ==
-                 config_protocol.bl_protocol_type()) {
-      bl_addr_bits_vec =
-        ito1hot_charvec(cur_bl_index, bl_addr_size, DONT_CARE_CHAR);
+    if (BLWL_PROTOCOL_FLATTEN != config_protocol.bl_protocol_type() ||
+        BLWL_PROTOCOL_FLATTEN != config_protocol.wl_protocol_type()) {
+      // This is using old way
+      // We only do this kind of resource wasting storing if
+      // either protocol is not flatten
+      std::vector<char> bl_addr_bits_vec;
+      if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
+        bl_addr_bits_vec = itobin_charvec(cur_bl_index, bl_addr_size);
+      } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() ||
+                 BLWL_PROTOCOL_SHIFT_REGISTER ==
+                   config_protocol.bl_protocol_type()) {
+        bl_addr_bits_vec =
+          ito1hot_charvec(cur_bl_index, bl_addr_size, DONT_CARE_CHAR);
+      }
+      /* Set BL address */
+      fabric_bitstream.set_bit_bl_address(
+        fabric_bit, bl_addr_bits_vec,
+        BLWL_PROTOCOL_DECODER != config_protocol.bl_protocol_type());
     }
 
     /* Find WL address */
     size_t cur_wl_index =
       wl_start_index_per_tile.at(tile_coord.y()) +
       std::floor(cur_mem_index[tile_coord] / num_bls_cur_tile);
-    std::vector<char> wl_addr_bits_vec;
-    if (BLWL_PROTOCOL_DECODER == config_protocol.wl_protocol_type()) {
-      wl_addr_bits_vec = itobin_charvec(cur_wl_index, wl_addr_size);
-    } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type() ||
-               BLWL_PROTOCOL_SHIFT_REGISTER ==
-                 config_protocol.wl_protocol_type()) {
-      wl_addr_bits_vec = ito1hot_charvec(cur_wl_index, wl_addr_size);
+    if (BLWL_PROTOCOL_FLATTEN != config_protocol.bl_protocol_type() ||
+        BLWL_PROTOCOL_FLATTEN != config_protocol.wl_protocol_type()) {
+      // This is using old way
+      // We only do this kind of resource wasting storing if
+      // either protocol is not flatten
+      std::vector<char> wl_addr_bits_vec;
+      if (BLWL_PROTOCOL_DECODER == config_protocol.wl_protocol_type()) {
+        wl_addr_bits_vec = itobin_charvec(cur_wl_index, wl_addr_size);
+      } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type() ||
+                 BLWL_PROTOCOL_SHIFT_REGISTER ==
+                   config_protocol.wl_protocol_type()) {
+        wl_addr_bits_vec = ito1hot_charvec(cur_wl_index, wl_addr_size);
+      }
+      /* Set WL address */
+      fabric_bitstream.set_bit_wl_address(
+        fabric_bit, wl_addr_bits_vec,
+        BLWL_PROTOCOL_DECODER != config_protocol.wl_protocol_type());
     }
 
-    /* Set BL address */
-    fabric_bitstream.set_bit_bl_address(
-      fabric_bit, bl_addr_bits_vec,
-      BLWL_PROTOCOL_DECODER != config_protocol.bl_protocol_type());
-
-    /* Set WL address */
-    fabric_bitstream.set_bit_wl_address(
-      fabric_bit, wl_addr_bits_vec,
-      BLWL_PROTOCOL_DECODER != config_protocol.wl_protocol_type());
+    /* New way of storing information in compact way*/
+    fabric_bitstream.set_memory_bank_info(
+      fabric_bit, fabric_bitstream_region, cur_bl_index, cur_wl_index,
+      bl_addr_size, wl_addr_size, bitstream_manager.bit_value(config_bit));
 
     /* Set data input */
     fabric_bitstream.set_bit_din(fabric_bit,
diff --git a/openfpga/src/fpga_bitstream/fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/fabric_bitstream.cpp
index 5649388b8..9778b76e6 100644
--- a/openfpga/src/fpga_bitstream/fabric_bitstream.cpp
+++ b/openfpga/src/fpga_bitstream/fabric_bitstream.cpp
@@ -11,6 +11,133 @@
 /* begin namespace openfpga */
 namespace openfpga {
 
+/**************************************************
+ * FabricBitstreamMemoryBank
+ *************************************************/
+void FabricBitstreamMemoryBank::add_bit(const fabric_size_t& bit_id,
+                                        const fabric_size_t& region_id,
+                                        const fabric_size_t& bl,
+                                        const fabric_size_t& wl,
+                                        const fabric_size_t& bl_addr_size,
+                                        const fabric_size_t& wl_addr_size,
+                                        bool bit) {
+  // Bits arrive strictly in order: the id must be the next free slot
+  VTR_ASSERT((size_t)(bit_id) == fabric_bit_datas.size());
+  // Regions also arrive in order, but many bits share the same region,
+  // so the id is either an existing region or the next new one
+  VTR_ASSERT((size_t)(region_id) <= blwl_lengths.size());
+  if ((size_t)(region_id) != blwl_lengths.size()) {
+    // Known region: the BL/WL lengths must match what was first recorded
+    VTR_ASSERT(blwl_lengths[region_id].bl == bl_addr_size);
+    VTR_ASSERT(blwl_lengths[region_id].wl == wl_addr_size);
+  } else {
+    // First bit of a new region: record its BL/WL lengths
+    blwl_lengths.emplace_back(bl_addr_size, wl_addr_size);
+  }
+  // The BL/WL index must fall inside the region's BL/WL length
+  VTR_ASSERT(bl < blwl_lengths[region_id].bl);
+  VTR_ASSERT(wl < blwl_lengths[region_id].wl);
+  // Keep the raw (region, bl, wl, din) tuple of each bit, indexed by bit id
+  fabric_bit_datas.emplace_back(region_id, bl, wl, bit);
+  // Compact storage: make sure the region exists in both tables
+  VTR_ASSERT(datas.size() == masks.size());
+  if (datas.size() <= (size_t)(region_id)) {
+    datas.resize((size_t)(region_id) + 1);
+    masks.resize((size_t)(region_id) + 1);
+  }
+  // Grow the region up to this WL; a row packs the BLs 8 per byte, all 0
+  VTR_ASSERT(datas[region_id].size() == masks[region_id].size());
+  if (datas[region_id].size() <= (size_t)(wl)) {
+    const std::vector<uint8_t> empty_row((bl_addr_size + 7) / 8, 0);
+    datas[region_id].resize((size_t)(wl) + 1, empty_row);
+    masks[region_id].resize((size_t)(wl) + 1, empty_row);
+  }
+  // The same (region, wl, bl) location cannot be programmed twice
+  VTR_ASSERT((masks[region_id][wl][bl >> 3] & (1 << (bl & 7))) == 0);
+  const size_t byte_index = bl >> 3;
+  const uint8_t bit_flag = (uint8_t)(1 << (bl & 7));
+  // Din is stored only when it is 1; the mask always marks the bit as used
+  if (bit) {
+    datas[region_id][wl][byte_index] |= bit_flag;
+  }
+  masks[region_id][wl][byte_index] |= bit_flag;
+}
+
+void FabricBitstreamMemoryBank::fast_configuration(
+  const bool& fast, const bool& bit_value_to_skip) {
+  // Prepare wls_to_skip: for every region, the WLs (in ascending order)
+  // whose used bits (mask = 1) all hold bit_value_to_skip. A WL without
+  // any used bit qualifies as well.
+  //   fast              - when false, every region gets an empty skip list
+  //   bit_value_to_skip - Din value that makes a used bit skippable
+  // The previous result is always discarded and rebuilt, so nothing is
+  // cached between calls and each call may use different arguments.
+  wls_to_skip.clear();
+  wls_to_skip.resize(datas.size());
+  if (!fast) {
+    return;
+  }
+  // Rows are expected to be laid out as built by add_bit(): one bit per BL,
+  // 8 BLs per byte, and a mask bit set only for BLs that were really added
+  // (so the padding bits in the last byte of a row are never marked used).
+  // Byte whose 8 bits all hold the value to skip
+  const uint8_t skip_pattern = bit_value_to_skip ? 0xFF : 0x00;
+  for (size_t region = 0; region < datas.size(); region++) {
+    for (fabric_size_t wl = 0; wl < blwl_lengths[region].wl; wl++) {
+      VTR_ASSERT((size_t)(wl) < datas[region].size());
+      const std::vector<uint8_t>& data = datas[region][wl];
+      const std::vector<uint8_t>& mask = masks[region][wl];
+      // Check 8 BLs at a time: a byte prevents skipping as soon as one of
+      // its used bits differs from the value to skip. Unused bits
+      // (mask = 0), including the padding bits of the last byte, are
+      // don't care and never prevent skipping.
+      bool skip_wl = true;
+      for (size_t byte = 0; byte < data.size() && skip_wl; byte++) {
+        skip_wl = (0 == ((data[byte] ^ skip_pattern) & mask[byte]));
+      }
+      if (skip_wl) {
+        // Record down that for this region, we will skip this WL
+        wls_to_skip[region].push_back(wl);
+      }
+    }
+  }
+}
+
+fabric_size_t FabricBitstreamMemoryBank::get_lontest_effective_wl_addr_size()
+  const {
+  // A region's effective WL count is the number of WL rows it stores minus
+  // the number of WLs listed for it in wls_to_skip (the list prepared by
+  // fast_configuration(), which therefore has to be called beforehand).
+  // The largest effective count over all the regions is returned, or 0
+  // when there is no region at all.
+  fabric_size_t max_effective = 0;
+  for (size_t region = 0; region != datas.size(); ++region) {
+    VTR_ASSERT((size_t)(region) < wls_to_skip.size());
+    const size_t kept = datas[region].size() - wls_to_skip[region].size();
+    const fabric_size_t effective = (fabric_size_t)(kept);
+    max_effective = (effective > max_effective) ? effective : max_effective;
+  }
+  return max_effective;
+}
+
+fabric_size_t FabricBitstreamMemoryBank::get_total_bl_addr_size() const {
+  // Sum of the BL address lengths of all the regions holding data
+  fabric_size_t total_bl = 0;
+  for (size_t idx = 0, count = datas.size(); idx != count; ++idx) {
+    total_bl += blwl_lengths[idx].bl;
+  }
+  return total_bl;
+}
+
+fabric_size_t FabricBitstreamMemoryBank::get_total_wl_addr_size() const {
+  // Sum of the WL address lengths of all the regions holding data
+  fabric_size_t total_wl = 0;
+  for (size_t idx = 0, count = datas.size(); idx != count; ++idx) {
+    total_wl += blwl_lengths[idx].wl;
+  }
+  return total_wl;
+}
+
 /**************************************************
  * Public Constructor
  *************************************************/
@@ -129,6 +256,12 @@ bool FabricBitstream::use_address() const { return use_address_; }
 
 bool FabricBitstream::use_wl_address() const { return use_wl_address_; }
 
+const FabricBitstreamMemoryBank* FabricBitstream::memory_bank_info() const {
+  VTR_ASSERT(true == use_address_);     // address use must be enabled
+  VTR_ASSERT(true == use_wl_address_);  // and WL address use as well
+  return &memory_bank_data_;  // points at the member; caller does not own it
+}
+
 /******************************************************************************
  * Public Mutators
  ******************************************************************************/
@@ -243,6 +376,27 @@ void FabricBitstream::set_bl_address_length(const size_t& length) {
   set_address_length(length);
 }
 
+void FabricBitstream::set_memory_bank_info(const FabricBitId& bit_id,
+                                           const FabricBitRegionId& region_id,
+                                           const size_t& bl, const size_t& wl,
+                                           const size_t& bl_addr_size,
+                                           const size_t& wl_addr_size,
+                                           bool bit) {
+  // Requires a valid bit, BL and WL addressing, and non-zero address sizes
+  VTR_ASSERT(true == valid_bit_id(bit_id));
+  VTR_ASSERT(true == use_address_);
+  VTR_ASSERT(true == use_wl_address_);
+  VTR_ASSERT(bl_addr_size);
+  VTR_ASSERT(wl_addr_size);
+  // Narrow the ids/sizes to fabric_size_t for the compact memory bank record
+  const auto bit_index = static_cast<fabric_size_t>(size_t(bit_id));
+  const auto region_index = static_cast<fabric_size_t>(size_t(region_id));
+  memory_bank_data_.add_bit(
+    bit_index, region_index, static_cast<fabric_size_t>(bl),
+    static_cast<fabric_size_t>(wl), static_cast<fabric_size_t>(bl_addr_size),
+    static_cast<fabric_size_t>(wl_addr_size), bit);
+}
+
 void FabricBitstream::set_use_wl_address(const bool& enable) {
   /* Add a lock, only can be modified when num bits are zero*/
   if (0 == num_bits_) {
diff --git a/openfpga/src/fpga_bitstream/fabric_bitstream.h b/openfpga/src/fpga_bitstream/fabric_bitstream.h
index 68a972636..dfe8840a2 100644
--- a/openfpga/src/fpga_bitstream/fabric_bitstream.h
+++ b/openfpga/src/fpga_bitstream/fabric_bitstream.h
@@ -41,6 +41,85 @@
 /* begin namespace openfpga */
 namespace openfpga {
 
+// uint32_t (up to 4G values) is wide enough for any single index: with one
+// BL index and one WL index, the pair addresses up to ~18 quintillion
+// configuration bits (don't care bits included)
+typedef uint32_t fabric_size_t;
+struct fabric_bit_data {  // location and value of one configuration bit
+  fabric_bit_data(fabric_size_t r, fabric_size_t b, fabric_size_t w, bool bi)
+    : region(r), bl(b), wl(w), bit(bi) {}
+  const fabric_size_t region = 0;  // index of the region holding the bit
+  const fabric_size_t bl = 0;      // BL index within that region
+  const fabric_size_t wl = 0;      // WL index within that region
+  const bool bit = false;          // value of the bit (Din)
+};
+struct fabric_blwl_length {  // BL/WL address lengths of one region
+  fabric_blwl_length(fabric_size_t b, fabric_size_t w) : bl(b), wl(w) {}
+  const fabric_size_t bl = 0;  // BL address length
+  const fabric_size_t wl = 0;  // WL address length
+};
+
+/*
+  This struct arranges the Memory Bank database in a compact way
+*/
+struct FabricBitstreamMemoryBank {
+  void add_bit(const fabric_size_t& bit_id, const fabric_size_t& region_id,
+               const fabric_size_t& bl, const fabric_size_t& wl,
+               const fabric_size_t& bl_addr_size,
+               const fabric_size_t& wl_addr_size, bool bit);
+  void fast_configuration(const bool& fast, const bool& bit_value_to_skip);
+  fabric_size_t get_lontest_effective_wl_addr_size() const;
+  fabric_size_t get_total_bl_addr_size() const;
+  fabric_size_t get_total_wl_addr_size() const;
+
+  /*************************
+   * All the databases (except fabric_bit_datas) are indexed by region
+   *  1. The very first layer of vector is the region
+   * For the datas and masks
+   *  1. They are indexed by WL, hence the second layer is the WL
+   *  2. The third layer is the BL data stored in a vector of uint8_t
+   *  3. Each uint8_t stores the info of up to 8 configuration bits
+   **************************/
+  // Store the BL and WL address lengths of each region
+  std::vector<fabric_blwl_length> blwl_lengths;
+  // Store the raw data of each config bit, indexed by bit id.
+  // Not used by the text bitstream writer; used by the XML writer
+  /*
+      fabric_bit_datas[Bit #0] = (region, bl, wl, bit)
+      fabric_bit_datas[Bit #1] = (region, bl, wl, bit)
+      fabric_bit_datas[Bit #2] = (region, bl, wl, bit)
+    */
+  std::vector<fabric_bit_data> fabric_bit_datas;
+  // A 100K LE FPGA only needs a few megabytes
+  /*
+    datas represents the Din value of a given WL and BL (1 bit each)
+      datas[region #0][wl #0] = std::vector<uint8_t> to represent BLs
+        where uint8_t #0 = MSB{ BL#7, BL#6, .... BL #1, BL #0 } LSB
+        where uint8_t #1 = MSB{ BL#15, BL#14, .... BL #9, BL #8 } LSB
+      datas[region #0][wl #1] = std::vector<uint8_t> to represent BLs
+      datas[region #0][wl #2] = std::vector<uint8_t> to represent BLs
+      ......
+      datas[region #0][wl #n-1] = std::vector<uint8_t> to represent BLs
+      ......
+      datas[region #1][wl #0] = std::vector<uint8_t> to represent BLs
+      datas[region #1][wl #1] = std::vector<uint8_t> to represent BLs
+      ......
+  */
+  std::vector<std::vector<std::vector<uint8_t>>> datas;
+  /*
+    masks has the same structure as datas,
+    but masks marks which bits are actually being used
+    for example:
+      if mask's uint8_t #0 value = 0x41 it means for this WL
+        a. BL #0 is being used, and its Din is recorded in datas
+        b. BL #6 is being used, and its Din is recorded in datas
+        c. Other BLs #1, 2, 3, 4, 5, 7 are don't care bits (not being used)
+  */
+  std::vector<std::vector<std::vector<uint8_t>>> masks;
+  // Per region, the WLs to skip because of fast configuration
+  std::vector<std::vector<fabric_size_t>> wls_to_skip;
+};
+
 class FabricBitstream {
  public: /* Type implementations */
   /*
@@ -144,6 +223,8 @@ class FabricBitstream {
   bool use_address() const;
   bool use_wl_address() const;
 
+  const FabricBitstreamMemoryBank* memory_bank_info() const;
+
  public: /* Public Mutators */
   /* Reserve config bits */
   void reserve_bits(const size_t& num_bits);
@@ -193,6 +274,18 @@ class FabricBitstream {
   void set_address_length(const size_t& length);
   void set_bl_address_length(const size_t& length);
 
+  /*
+    This is setting memory bank protocol in a more efficient way
+    Instead of building lengthy BL/WL bits of database (BL or Wl could be in
+    thousand bits of size), a small device like 100K LE (compared to other
+    vendors offer) might end up using tens of gig bytes.
+  */
+  void set_memory_bank_info(const FabricBitId& bit_id,
+                            const FabricBitRegionId& region_id,
+                            const size_t& bl, const size_t& wl,
+                            const size_t& bl_addr_size,
+                            const size_t& wl_addr_size, bool bit);
+
   /* Enable the use of WL-address related data
    * Same priniciple as the set_use_address()
    */
@@ -250,6 +343,9 @@ class FabricBitstream {
 
   /* Data input (Din) bits: this is designed for memory decoders */
   vtr::vector<FabricBitId, char> bit_dins_;
+
+  /* New way of dealing with memory bank protocol - fast and compact */
+  FabricBitstreamMemoryBank memory_bank_data_;
 };
 
 } /* end namespace openfpga */
diff --git a/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.cpp
index 9cb2a3f9c..2ade6174b 100644
--- a/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.cpp
+++ b/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.cpp
@@ -245,6 +245,142 @@ static int write_memory_bank_flatten_fabric_bitstream_to_text_file(
   return status;
 }
 
+/********************************************************************
+ * Write the fabric bitstream fitting a memory bank protocol
+ * to a plain text file in efficient method
+ *
+ * Old function is write_memory_bank_flatten_fabric_bitstream_to_text_file()
+ *
+ * As compared to original function, based on 100K LE FPGA:
+ *  1. Original function used 600 seconds and need 80G Bytes of memory
+ *  2. This new function only needs 1 second and 4M Bytes
+ *
+ * The old function only prints WLs in decremental order. This is not
+ * intentional; it is a side effect of the map-key ordering.
+ * In QL Memory Bank with Flatten BL/WL, data is stored by WL address,
+ *   where we use the WL string as the map key
+ *     WL #0 --- "1000000000000 .... 0000"
+ *     WL #1 --- "0100000000000 .... 0000"
+ *     WL #n-1 --- "0000000000000 .... 0001"
+ * In string comparison, WL #n-1 comes first and WL #0 comes last.
+ * The sequence of WLs does not really matter, but a well-defined order is
+ *   preferable. Relying on map-key ordering cannot guarantee determinism.
+ *
+ * This new way of writing the fabric bitstream guarantees a fully
+ *   deterministic WL order: either incremental (default) or decremental
+ *
+ * Return:
+ *  - 0 if succeed
+ *  - 1 if critical errors occured
+ *******************************************************************/
+static int fast_write_memory_bank_flatten_fabric_bitstream_to_text_file(
+  std::fstream& fp, const bool& fast_configuration,
+  const bool& bit_value_to_skip, const FabricBitstream& fabric_bitstream,
+  const bool& keep_dont_care_bits, const bool& wl_incremental_order) {
+  int status = 0;
+
+  std::string dont_care_bit = "0";  // printed for unused BL/WL positions
+  if (keep_dont_care_bits) {
+    dont_care_bit = "x";
+  }
+  const FabricBitstreamMemoryBank* memory_bank =
+    fabric_bitstream.memory_bank_info();
+
+  // Must call this to prepare wls_to_skip (ascending WLs for each region)
+  (const_cast<FabricBitstreamMemoryBank*>(memory_bank))
+    ->fast_configuration(fast_configuration, bit_value_to_skip);
+
+  fabric_size_t lontest_effective_wl_addr_size =
+    memory_bank->get_lontest_effective_wl_addr_size();
+  /* Output information about how to interpret the bitstream */
+  fp << "// Bitstream length: " << lontest_effective_wl_addr_size << std::endl;
+  fp << "// Bitstream width (LSB -> MSB): ";
+  fp << "<bl_address " << memory_bank->get_total_bl_addr_size() << " bits>";
+  fp << "<wl_address " << memory_bank->get_total_wl_addr_size() << " bits>";
+  fp << std::endl;
+
+  std::vector<fabric_size_t> wl_indexes;  // next WL to visit, per region
+  for (size_t region = 0; region < memory_bank->datas.size(); region++) {
+    if (wl_incremental_order) {
+      wl_indexes.push_back(0);
+    } else {
+      wl_indexes.push_back(
+        (fabric_size_t)(memory_bank->datas[region].size() - 1));
+    }
+  }
+  for (size_t wl_index = 0; wl_index < lontest_effective_wl_addr_size;
+       wl_index++) {
+    /* Write BL address code */
+    /* cascade region 0, 1, 2, 3 ... */
+    for (size_t region = 0; region < memory_bank->datas.size(); region++) {
+      const fabric_blwl_length& lengths = memory_bank->blwl_lengths[region];
+      fabric_size_t current_wl = wl_indexes[region];
+      while (std::binary_search(memory_bank->wls_to_skip[region].begin(),
+                                memory_bank->wls_to_skip[region].end(),
+                                current_wl)) {
+        // This WL is to be skipped: step to the next one in print order
+        if (wl_incremental_order) {
+          wl_indexes[region]++;
+        } else {
+          wl_indexes[region]--;
+        }
+        current_wl = wl_indexes[region];
+      }
+      if (current_wl < memory_bank->datas[region].size()) {
+        const std::vector<uint8_t>& data =
+          memory_bank->datas[region][current_wl];
+        const std::vector<uint8_t>& mask =
+          memory_bank->masks[region][current_wl];
+        for (size_t bl = 0; bl < lengths.bl; bl++) {
+          if (mask[bl >> 3] & (1 << (bl & 7))) {
+            if (data[bl >> 3] & (1 << (bl & 7))) {
+              fp << "1";
+            } else {
+              fp << "0";
+            }
+          } else {
+            fp << dont_care_bit;
+          }
+        }
+      } else {
+        /* Regions may have fewer WLs than the longest one: once a region
+         * runs out of WLs, pad all its BLs with the don't care character */
+        for (size_t bl = 0; bl < lengths.bl; bl++) {
+          fp << dont_care_bit;
+        }
+      }
+    }
+    /* Write WL address code */
+    /* cascade region 0, 1, 2, 3 ... */
+    for (size_t region = 0; region < memory_bank->datas.size(); region++) {
+      const fabric_blwl_length& lengths = memory_bank->blwl_lengths[region];
+      fabric_size_t current_wl = wl_indexes[region];
+      if (current_wl < memory_bank->datas[region].size()) {
+        for (size_t wl_temp = 0; wl_temp < lengths.wl; wl_temp++) {
+          if (wl_temp == current_wl) {
+            fp << "1";
+          } else {
+            fp << "0";
+          }
+        }
+        if (wl_incremental_order) {
+          wl_indexes[region]++;
+        } else {
+          wl_indexes[region]--;
+        }
+      } else {
+        /* Regions may have fewer WLs than the longest one: once a region
+         * runs out of WLs, pad all its WLs with the don't care character */
+        for (size_t wl_temp = 0; wl_temp < lengths.wl; wl_temp++) {
+          fp << dont_care_bit;
+        }
+      }
+    }
+    fp << '\n';  // no flush per row; the stream is flushed when it is closed
+  }
+  return status;
+}
+
 /********************************************************************
  * Write the fabric bitstream fitting a memory bank protocol
  * to a plain text file
@@ -393,7 +529,8 @@ int write_fabric_bitstream_to_text_file(
   const ConfigProtocol& config_protocol,
   const FabricGlobalPortInfo& global_ports, const std::string& fname,
   const bool& fast_configuration, const bool& keep_dont_care_bits,
-  const bool& include_time_stamp, const bool& verbose) {
+  const bool& wl_incremental_order, const bool& include_time_stamp,
+  const bool& verbose) {
   /* Ensure that we have a valid file name */
   if (true == fname.empty()) {
     VTR_LOG_ERROR(
@@ -454,6 +591,14 @@ int write_fabric_bitstream_to_text_file(
       if (BLWL_PROTOCOL_DECODER == config_protocol.bl_protocol_type()) {
         status = write_memory_bank_fabric_bitstream_to_text_file(
           fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream);
+      } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
+                 BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type()) {
+        // If both BL and WL protocols are flatten, use new way to write the
+        // bitstream
+        status = fast_write_memory_bank_flatten_fabric_bitstream_to_text_file(
+          fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream,
+          keep_dont_care_bits, wl_incremental_order);
+
       } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type()) {
         status = write_memory_bank_flatten_fabric_bitstream_to_text_file(
           fp, apply_fast_configuration, bit_value_to_skip, fabric_bitstream,
diff --git a/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.h b/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.h
index 0d8682739..59f4774de 100644
--- a/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.h
+++ b/openfpga/src/fpga_bitstream/write_text_fabric_bitstream.h
@@ -27,7 +27,8 @@ int write_fabric_bitstream_to_text_file(
   const ConfigProtocol& config_protocol,
   const FabricGlobalPortInfo& global_ports, const std::string& fname,
   const bool& fast_configuration, const bool& keep_dont_care_bits,
-  const bool& include_time_stamp, const bool& verbose);
+  const bool& wl_incremental_order, const bool& include_time_stamp,
+  const bool& verbose);
 
 } /* end namespace openfpga */
 
diff --git a/openfpga/src/fpga_bitstream/write_xml_fabric_bitstream.cpp b/openfpga/src/fpga_bitstream/write_xml_fabric_bitstream.cpp
index ec8a036fd..786a3768e 100644
--- a/openfpga/src/fpga_bitstream/write_xml_fabric_bitstream.cpp
+++ b/openfpga/src/fpga_bitstream/write_xml_fabric_bitstream.cpp
@@ -71,7 +71,8 @@ static void write_fabric_bitstream_xml_file_head(
 static int write_fabric_config_bit_to_xml_file(
   std::fstream& fp, const BitstreamManager& bitstream_manager,
   const FabricBitstream& fabric_bitstream, const FabricBitId& fabric_bit,
-  const e_config_protocol_type& config_type, const int& xml_hierarchy_depth) {
+  const e_config_protocol_type& config_type, const int& xml_hierarchy_depth,
+  std::string& bl_addr, std::string& wl_addr) {
   if (false == valid_file_stream(fp)) {
     return 1;
   }
@@ -106,7 +107,6 @@ static int write_fabric_config_bit_to_xml_file(
     case CONFIG_MEM_STANDALONE:
     case CONFIG_MEM_SCAN_CHAIN:
       break;
-    case CONFIG_MEM_QL_MEMORY_BANK:
     case CONFIG_MEM_MEMORY_BANK: {
       /* Bit line address */
       write_tab_to_file(fp, xml_hierarchy_depth + 1);
@@ -124,6 +124,41 @@ static int write_fabric_config_bit_to_xml_file(
       fp << "\"/>\n";
       break;
     }
+    case CONFIG_MEM_QL_MEMORY_BANK: {
+      // New way of printing XML
+      // This is fast (less than 100s) as compared to original 1300s seen in
+      // 100K LE FPGA
+      const FabricBitstreamMemoryBank* memory_bank =
+        fabric_bitstream.memory_bank_info();
+      /* Bit line address */
+      write_tab_to_file(fp, xml_hierarchy_depth + 1);
+      const fabric_bit_data& bit =
+        memory_bank->fabric_bit_datas[(size_t)(fabric_bit)];
+      const fabric_blwl_length& lengths = memory_bank->blwl_lengths[bit.region];
+      if (bl_addr.size() == 0) {
+        VTR_ASSERT(wl_addr.size() == 0);
+        bl_addr.resize(lengths.bl);
+        wl_addr.resize(lengths.wl);
+        memset(&bl_addr[0], 'x', lengths.bl);
+        memset(&wl_addr[0], '0', lengths.wl);
+      } else {
+        VTR_ASSERT((fabric_size_t)(bl_addr.size()) == lengths.bl);
+        VTR_ASSERT((fabric_size_t)(wl_addr.size()) == lengths.wl);
+      }
+      fp << "<bl address=\"";
+      memset(&bl_addr[bit.bl], '1', 1);
+      fp << bl_addr.c_str();
+      memset(&bl_addr[bit.bl], 'x', 1);
+      fp << "\"/>\n";
+      /* Word line address */
+      write_tab_to_file(fp, xml_hierarchy_depth + 1);
+      fp << "<wl address=\"";
+      memset(&wl_addr[bit.wl], '1', 1);
+      fp << wl_addr.c_str();
+      memset(&wl_addr[bit.wl], '0', 1);
+      fp << "\"/>\n";
+      break;
+    }
     case CONFIG_MEM_FRAME_BASED: {
       write_tab_to_file(fp, xml_hierarchy_depth + 1);
       fp << "<frame address=\"";
@@ -162,7 +197,17 @@ static int write_fabric_regional_config_bit_to_xml_file(
   }
 
   int status = 0;
-
+  // Use string to print, instead of char by char
+  // You will find this much faster than char by char
+  // We do not need to build the string for every BL/WL
+  // It is a one-hot and sequential address
+  // We start with all '0' (WL) or 'x' (BL)
+  // By setting '1' and resetting ('0' or 'x') at the appropriate bit position
+  // We could create one-hot string much faster
+  // Use FPGA 100K as example: old way needs 1300 seconds to write 85Gig XML
+  // New way only needs 80 seconds to write identical XML
+  std::string bl_addr = "";
+  std::string wl_addr = "";
   write_tab_to_file(fp, xml_hierarchy_depth);
   fp << "<region ";
   fp << "id=\"";
@@ -170,14 +215,24 @@ static int write_fabric_regional_config_bit_to_xml_file(
   fp << "\"";
   fp << ">\n";
 
+  size_t bit_index = 0;
+  size_t total_bits = fabric_bitstream.region_bits(fabric_region).size();
+  size_t percentage = 0;
   for (const FabricBitId& fabric_bit :
        fabric_bitstream.region_bits(fabric_region)) {
     status = write_fabric_config_bit_to_xml_file(
       fp, bitstream_manager, fabric_bitstream, fabric_bit, config_type,
-      xml_hierarchy_depth + 1);
+      xml_hierarchy_depth + 1, bl_addr, wl_addr);
     if (1 == status) {
       return status;
     }
+    // Log progress whenever the integer percentage changes ("%%" prints '%')
+    bit_index++;
+    size_t temp = (bit_index * 100) / total_bits;
+    if (temp != percentage) {
+      percentage = temp;  // update first so the new value is what gets logged
+      VTR_LOG("  Progress: %zu%%\r", percentage);
+    }
   }
 
   write_tab_to_file(fp, xml_hierarchy_depth);
diff --git a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp
index 4146fa870..1ac006cca 100644
--- a/openfpga/src/fpga_verilog/verilog_top_testbench.cpp
+++ b/openfpga/src/fpga_verilog/verilog_top_testbench.cpp
@@ -1061,6 +1061,19 @@ static size_t calculate_num_config_clock_cycles(
                       (float)full_num_config_clock_cycles -
                     1.));
         }
+      } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type() &&
+                 BLWL_PROTOCOL_FLATTEN == config_protocol.wl_protocol_type()) {
+        // The fast path is only supported when both BL and WL protocols are
+        // flatten. On a 100K-LE FPGA, building a full
+        // MemoryBankFlattenFabricBitstream just to obtain the effective WL
+        // address size wastes a lot of time, so query the memory bank instead
+        const FabricBitstreamMemoryBank* memory_bank =
+          fabric_bitstream.memory_bank_info();
+        // Must call this to prepare wls_to_skip
+        (const_cast<FabricBitstreamMemoryBank*>(memory_bank))
+          ->fast_configuration(fast_configuration, bit_value_to_skip);
+        num_config_clock_cycles =
+          1 + memory_bank->get_lontest_effective_wl_addr_size();
       } else if (BLWL_PROTOCOL_FLATTEN == config_protocol.bl_protocol_type()) {
         num_config_clock_cycles =
           1 + build_memory_bank_flatten_fabric_bitstream(
diff --git a/openfpga/src/fpga_verilog/verilog_top_testbench_memory_bank.cpp b/openfpga/src/fpga_verilog/verilog_top_testbench_memory_bank.cpp
index 8b70ef3fb..7b964be62 100644
--- a/openfpga/src/fpga_verilog/verilog_top_testbench_memory_bank.cpp
+++ b/openfpga/src/fpga_verilog/verilog_top_testbench_memory_bank.cpp
@@ -565,9 +565,15 @@ static void print_verilog_full_testbench_ql_memory_bank_flatten_bitstream(
   valid_file_stream(fp);
 
   /* Reorganize the fabric bitstream by the same address across regions */
-  MemoryBankFlattenFabricBitstream fabric_bits_by_addr =
-    build_memory_bank_flatten_fabric_bitstream(
-      fabric_bitstream, fast_configuration, bit_value_to_skip);
+  // Fast way to get the effective WL address size.
+  // On a 100K-LE FPGA, building a full MemoryBankFlattenFabricBitstream just
+  // to call size() on it wastes a lot of time, so query the memory bank
+  // directly instead
+  const FabricBitstreamMemoryBank* memory_bank =
+    fabric_bitstream.memory_bank_info();
+  // Must call this to prepare wls_to_skip
+  (const_cast<FabricBitstreamMemoryBank*>(memory_bank))
+    ->fast_configuration(fast_configuration, bit_value_to_skip);
 
   /* Feed address and data input pair one by one
    * Note: the first cycle is reserved for programming reset
@@ -604,7 +610,7 @@ static void print_verilog_full_testbench_ql_memory_bank_flatten_bitstream(
 
   /* Define a constant for the bitstream length */
   print_verilog_define_flag(fp, std::string(TOP_TB_BITSTREAM_LENGTH_VARIABLE),
-                            fabric_bits_by_addr.size());
+                            memory_bank->get_lontest_effective_wl_addr_size());
   print_verilog_define_flag(fp, std::string(TOP_TB_BITSTREAM_WIDTH_VARIABLE),
                             bl_port_width + wl_port_width);