diff --git a/docs/source/manual/arch_lang/config_protocol.rst b/docs/source/manual/arch_lang/config_protocol.rst
index 04b47d638..9b5cfe3ef 100644
--- a/docs/source/manual/arch_lang/config_protocol.rst
+++ b/docs/source/manual/arch_lang/config_protocol.rst
@@ -167,10 +167,39 @@ The BL and WL protocols can be customized through the XML syntax ``bl`` and ``wl
-.. option:: protocol="decoder|flatten"
+.. option:: protocol="decoder|flatten|shift_register"
- - ``decoder``: BLs or WLs are controlled by decoders with address lines. For BLs, the decoder includes an enable signal as well as a data input signal. This is the default option if not specified.
- - ``flatten``: BLs or WLs are directly available at the FPGA fabric. In this way, all the configurable memorys on the same WL can be written through the BL signals in one clock cycle
+ - ``decoder``: BLs or WLs are controlled by decoders with address lines. For BLs, the decoder includes an enable signal as well as a data input signal. This is the default option if not specified. See an illustrative example in :numref:`fig_memory_bank_decoder_based`.
+ - ``flatten``: BLs or WLs are directly available at the FPGA fabric. In this way, all the configurable memories on the same WL can be written through the BL signals in one clock cycle. See an illustrative example in :numref:`fig_memory_bank_flatten`.
+ - ``shift_register``: BLs or WLs are controlled by shift register chains. The BLs/WLs are programmed each time the shift register chains are fully loaded. See an illustrative example in :numref:`fig_memory_bank_shift_register`.
+
+.. _fig_memory_bank_decoder_based:
+
+.. figure:: figures/memory_bank_decoder.svg
+ :scale: 30%
+ :alt: Memory bank organization using address decoders for BLs and WLs
+
+ Example of (a) a memory organization using address decoders; (b) single memory bank across the fabric; and (c) multiple memory banks across the fabric.
+
+
+.. _fig_memory_bank_flatten:
+
+.. figure:: figures/memory_bank_flatten.svg
+ :scale: 30%
+ :alt: Memory bank organization with direct (flatten) access to BL and WL signals
+
+ Example of (a) a memory organization with direct access to BL/WL signals; (b) single memory bank across the fabric; and (c) multiple memory banks across the fabric.
+
+.. _fig_memory_bank_shift_register:
+
+.. figure:: figures/memory_bank_shift_register.svg
+ :scale: 30%
+ :alt: Memory bank organization using shift register chains to control BLs and WLs
+
+ Example of (a) a memory organization using shift register chains to control BL/WLs; (b) single memory bank across the fabric; and (c) multiple memory banks across the fabric.
+
+
+.. note:: The flip-flops of the WL shift registers require an enable signal to gate the WL signals while the WL shift registers are being loaded.
.. note:: Memory-bank decoders does require a memory cell to have
diff --git a/docs/source/manual/arch_lang/figures/memory_bank_decoder.svg b/docs/source/manual/arch_lang/figures/memory_bank_decoder.svg
new file mode 100644
index 000000000..d392fe785
--- /dev/null
+++ b/docs/source/manual/arch_lang/figures/memory_bank_decoder.svg
@@ -0,0 +1,2449 @@
+
+
+
diff --git a/docs/source/manual/arch_lang/figures/memory_bank_flatten.svg b/docs/source/manual/arch_lang/figures/memory_bank_flatten.svg
new file mode 100644
index 000000000..c7c87c656
--- /dev/null
+++ b/docs/source/manual/arch_lang/figures/memory_bank_flatten.svg
@@ -0,0 +1,2440 @@
+
+
+
diff --git a/docs/source/manual/arch_lang/figures/memory_bank_shift_register.svg b/docs/source/manual/arch_lang/figures/memory_bank_shift_register.svg
new file mode 100644
index 000000000..04ab9fe34
--- /dev/null
+++ b/docs/source/manual/arch_lang/figures/memory_bank_shift_register.svg
@@ -0,0 +1,2770 @@
+
+
+
diff --git a/docs/source/manual/file_formats/fabric_bitstream.rst b/docs/source/manual/file_formats/fabric_bitstream.rst
index 2def5d269..6cd8085de 100644
--- a/docs/source/manual/file_formats/fabric_bitstream.rst
+++ b/docs/source/manual/file_formats/fabric_bitstream.rst
@@ -65,6 +65,112 @@ The information depends on the type of configuration procotol.
.. note:: When there are multiple configuration regions, each ``<bit_value>`` may consist of multiple bits. For example, ``0110`` represents the bits for 4 configuration regions, where the 4 digits correspond to the bits from region ``0, 1, 2, 3`` respectively.
+.. option:: ql_memory_bank using decoders
+
+ Multiple lines will be included, each of which is organized as ``<bl_address><wl_address><bit_value>``.
+ The sizes of the address lines and data input bits are shown as a comment in the bitstream file, which eases the development of a bitstream downloader.
+ For example:
+
+ .. code-block:: verilog
+
+ // Bitstream width (LSB -> MSB): <bl_address 5 bits><wl_address 5 bits><data input 1 bits>
+
+ The first part represents the Bit-Line address.
+ The second part represents the Word-Line address.
+ The third part represents the configuration bit.
+ For example
+
+ .. code-block:: xml
+
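+ <bitline_address><wordline_address><bit_value>
+ <bitline_address><wordline_address><bit_value>
+ ...
+ <bitline_address><wordline_address><bit_value>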
+
+ .. note:: When there are multiple configuration regions, each ``<bit_value>`` may consist of multiple bits. For example, ``0110`` represents the bits for 4 configuration regions, where the 4 digits correspond to the bits from region ``0, 1, 2, 3`` respectively.
+
+.. option:: ql_memory_bank using flatten BL and WLs
+
+ Multiple lines will be included, each of which is organized as ``<bl_data><wl_data>``.
+ The size of the data is shown as a comment in the bitstream file, which eases the development of a bitstream downloader.
+ For example:
+
+ .. code-block:: verilog
+
+ // Bitstream width (LSB -> MSB): <bl_data 5 bits><wl_data 5 bits>
+
+ The first part represents the Bit-Line data from multiple configuration regions.
+ The second part represents the Word-Line data from multiple configuration regions.
+ For example
+
+ .. code-block:: xml
+
+ <bitline_data><wordline_data>
+ <bitline_data><wordline_data>
+ ...
+ <bitline_data><wordline_data>
+
+ .. note:: The WL data of each region is one-hot.
+
+.. option:: ql_memory_bank using shift registers
+
+ Multiple lines will be included, each of which is organized as ``<bl_data>`` or ``<wl_data>``.
+ The size of the data is shown as a comment in the bitstream file, which eases the development of a bitstream downloader.
+ For example:
+
+ .. code-block:: verilog
+
+ // Bitstream word count: 36
+ // Bitstream bl word size: 39
+ // Bitstream wl word size: 37
+ // Bitstream width (LSB -> MSB): <bl shift register heads 1 bits><wl shift register heads 1 bits>
+
+ The bitstream data are organized by words. Each word consists of two parts: the BL data to be loaded to the BL shift register chains, and the WL data to be loaded to the WL shift register chains.
+ For example:
+
+ .. code-block:: xml
+
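+ // Word 0
+ // BL Part
+ <bitline_shift_register_data@clock_0> ----
+ <bitline_shift_register_data@clock_1> ^
+ <bitline_shift_register_data@clock_2> |
+ ... BL word size
+ <bitline_shift_register_data@clock_n-2> |
+ <bitline_shift_register_data@clock_n-1> v
+ <bitline_shift_register_data@clock_n> ----
+ // Word 0
+ // WL Part
+ <wordline_shift_register_data@clock_0> ----
+ <wordline_shift_register_data@clock_1> ^
+ <wordline_shift_register_data@clock_2> |
+ ... WL word size
+ <wordline_shift_register_data@clock_n-2> |
+ <wordline_shift_register_data@clock_n-1> v
+ <wordline_shift_register_data@clock_n> ----
+ // Word 1
+ // BL Part
+ <bitline_shift_register_data@clock_0> ----
+ <bitline_shift_register_data@clock_1> ^
+ <bitline_shift_register_data@clock_2> |
+ ... BL word size
+ <bitline_shift_register_data@clock_n-2> |
+ <bitline_shift_register_data@clock_n-1> v
+ <bitline_shift_register_data@clock_n> ----
+ // Word 1
+ // WL Part
+ <wordline_shift_register_data@clock_0> ----
+ <wordline_shift_register_data@clock_1> ^
+ <wordline_shift_register_data@clock_2> |
+ ... WL word size
+ <wordline_shift_register_data@clock_n-2> |
+ <wordline_shift_register_data@clock_n-1> v
+ <wordline_shift_register_data@clock_n> ----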
+ ... // More words
+
+ .. note:: The BL/WL data may be multi-bit, where each bit corresponds to a configuration region.
+ .. note:: The WL data of each region is one-hot.
+
.. option:: frame_based
Multiple lines will be included, each of which is organized as ````.
diff --git a/openfpga/src/fabric/build_top_module_memory_bank.cpp b/openfpga/src/fabric/build_top_module_memory_bank.cpp
index dd3dceafb..18accc9bb 100644
--- a/openfpga/src/fabric/build_top_module_memory_bank.cpp
+++ b/openfpga/src/fabric/build_top_module_memory_bank.cpp
@@ -301,11 +301,18 @@ ModuleId build_wl_shift_register_chain_module(ModuleManager& module_manager,
circuit_lib.port_size(sram_output_ports[0]));
module_manager.add_port(mem_module, chain_tail_port, ModuleManager::MODULE_OUTPUT_PORT);
- /* Add the output ports to output BL signals */
+ /* Add the output ports to output BL/WL signals */
BasicPort chain_wl_port(WL_SHIFT_REGISTER_CHAIN_WL_OUT_NAME,
num_mems);
module_manager.add_port(mem_module, chain_wl_port, ModuleManager::MODULE_OUTPUT_PORT);
+ /* Add the output ports to output WLR signals */
+ if (!sram_wlr_ports.empty()) {
+ BasicPort chain_wlr_port(WL_SHIFT_REGISTER_CHAIN_WLR_OUT_NAME,
+ num_mems);
+ module_manager.add_port(mem_module, chain_wlr_port, ModuleManager::MODULE_OUTPUT_PORT);
+ }
+
/* Find the sram module in the module manager */
ModuleId sram_mem_module = module_manager.find_module(circuit_lib.model_name(sram_model));
diff --git a/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_wlr_openfpga.xml b/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_wlr_openfpga.xml
index da0e46ded..d78c980e5 100644
--- a/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_wlr_openfpga.xml
+++ b/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_wlr_openfpga.xml
@@ -166,11 +166,11 @@
-
+
-
+
@@ -181,7 +181,7 @@
-
+
diff --git a/openfpga_flow/regression_test_scripts/basic_reg_test.sh b/openfpga_flow/regression_test_scripts/basic_reg_test.sh
index 09c4d7f10..0b7a3dc95 100755
--- a/openfpga_flow/regression_test_scripts/basic_reg_test.sh
+++ b/openfpga_flow/regression_test_scripts/basic_reg_test.sh
@@ -60,6 +60,7 @@ run-task basic_tests/full_testbench/multi_region_ql_memory_bank --debug --show_t
run-task basic_tests/full_testbench/ql_memory_bank_flatten --debug --show_thread_logs
run-task basic_tests/full_testbench/ql_memory_bank_flatten_use_wlr --debug --show_thread_logs
run-task basic_tests/full_testbench/ql_memory_bank_shift_register --debug --show_thread_logs
+run-task basic_tests/full_testbench/ql_memory_bank_shift_register_use_wlr --debug --show_thread_logs
echo -e "Testing testbenches without self checking features";
run-task basic_tests/full_testbench/full_testbench_without_self_checking --debug --show_thread_logs
diff --git a/openfpga_flow/tasks/basic_tests/full_testbench/ql_memory_bank_shift_register_use_wlr/config/task.conf b/openfpga_flow/tasks/basic_tests/full_testbench/ql_memory_bank_shift_register_use_wlr/config/task.conf
new file mode 100644
index 000000000..56dee6c68
--- /dev/null
+++ b/openfpga_flow/tasks/basic_tests/full_testbench/ql_memory_bank_shift_register_use_wlr/config/task.conf
@@ -0,0 +1,45 @@
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# Configuration file for running experiments
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# timeout_each_job : The FPGA task script splits the fpga flow into multiple jobs.
+# Each job executes the fpga_flow script on a combination of architecture & benchmark.
+# timeout_each_job is the timeout for each job.
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+
+[GENERAL]
+run_engine=openfpga_shell
+power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
+power_analysis = true
+spice_output=false
+verilog_output=true
+timeout_each_job = 20*60
+fpga_flow=yosys_vpr
+
+[OpenFPGA_SHELL]
+openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/write_full_testbench_example_script.openfpga
+openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_qlbanksr_wlr_openfpga.xml
+openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_shift_register_sim_openfpga.xml
+
+openfpga_vpr_device_layout=
+openfpga_fast_configuration=
+
+[ARCHITECTURES]
+arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_40nm.xml
+
+[BENCHMARKS]
+bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
+bench1=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/or2/or2.v
+bench2=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2_latch/and2_latch.v
+
+[SYNTHESIS_PARAM]
+bench0_top = and2
+bench0_chan_width = 300
+
+bench1_top = or2
+bench1_chan_width = 300
+
+bench2_top = and2_latch
+bench2_chan_width = 300
+
+[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
+end_flow_with_test=