Merge pull request #86 from lnis-uofu/k4_N8_interface

Merging registered/non-registered related IO definition in k4_N8 device
2021-01-25 10:38:13 -08:00 · 2021-01-25 10:38:13 -08:00 · d2240d8539
parent 658edb47f7 f1eb4c4f88
commit d2240d8539
4 changed files with 574 additions and 190 deletions
--- a/ARCH/openfpga_arch_template/k4_N8_reset_softadder_register_scan_chain_caravel_io_skywater130nm_fdhd_cc_openfpga.xml
+++ b/ARCH/openfpga_arch_template/k4_N8_reset_softadder_register_scan_chain_caravel_io_skywater130nm_fdhd_cc_openfpga.xml
@ -1,14 +1,14 @@
 <!-- Architecture annotation for OpenFPGA framework
-     This annotation supports the k4_frac_cc_sky130nm.xml
-     - General purpose logic block
-       - K = 6, N = 10, I = 40
-       - Single mode
-     - Routing architecture
-       - L = 4, fc_in = 0.15, fc_out = 0.1
-     - Skywater 130nm PDK
-       - circuit models are binded to the opensource skywater
-         foundry middle-speed (ms) standard cell library
-  -->
+This annotation supports the k4_frac_cc_sky130nm.xml
+- General purpose logic block
+- K = 4, N = 8 (I = 40 carried over from the previous header; TODO confirm)
+- Single mode
+- Routing architecture
+- L = 4, fc_in = 0.15, fc_out = 0.1
+- Skywater 130nm PDK
+- circuit models are bound to the open-source SkyWater
+foundry high-density (hd) standard cell library
+-->
 <openfpga_architecture>
  <technology_library>
    <device_library>
@ -43,6 +43,18 @@
        10e-12
      </delay_matrix>
    </circuit_model>
+    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__buf_1" prefix="sky130_fd_sc_hd__buf_1" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/buf/sky130_fd_sc_hd__buf_1.v">
+      <design_technology type="cmos" topology="buffer" size="1"/>
+      <device_technology device_model_name="logic"/>
+      <port type="input" prefix="in" lib_name="A" size="1"/>
+      <port type="output" prefix="out" lib_name="X" size="1"/>
+      <delay_matrix type="rise" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+      <delay_matrix type="fall" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+    </circuit_model>
    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__buf_2" prefix="sky130_fd_sc_hd__buf_2" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/buf/sky130_fd_sc_hd__buf_2.v">
      <design_technology type="cmos" topology="buffer" size="1" num_level="2" f_per_stage="2"/>
      <device_technology device_model_name="logic"/>
@ -67,6 +79,30 @@
        10e-12
      </delay_matrix>
    </circuit_model>
+    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__buf_8" prefix="sky130_fd_sc_hd__buf_8" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/buf/sky130_fd_sc_hd__buf_8.v">
+      <design_technology type="cmos" topology="buffer" size="1" num_level="2" f_per_stage="4"/>
+      <device_technology device_model_name="logic"/>
+      <port type="input" prefix="in" lib_name="A" size="1"/>
+      <port type="output" prefix="out" lib_name="X" size="1"/>
+      <delay_matrix type="rise" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+      <delay_matrix type="fall" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+    </circuit_model>
+    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__buf_16" prefix="sky130_fd_sc_hd__buf_16" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/buf/sky130_fd_sc_hd__buf_16.v">
+      <design_technology type="cmos" topology="buffer" size="1" num_level="2" f_per_stage="4"/>
+      <device_technology device_model_name="logic"/>
+      <port type="input" prefix="in" lib_name="A" size="1"/>
+      <port type="output" prefix="out" lib_name="X" size="1"/>
+      <delay_matrix type="rise" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+      <delay_matrix type="fall" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+    </circuit_model>
    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__inv_2" prefix="sky130_fd_sc_hd__inv_2" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/inv/sky130_fd_sc_hd__inv_2.v">
      <design_technology type="cmos" topology="buffer" size="1"/>
      <device_technology device_model_name="logic"/>
@ -79,6 +115,35 @@
        10e-12
      </delay_matrix>
    </circuit_model>
+    <circuit_model type="inv_buf" name="sky130_fd_sc_hd__inv_4" prefix="sky130_fd_sc_hd__inv_4" is_default="false" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/inv/sky130_fd_sc_hd__inv_4.v">
+      <design_technology type="cmos" topology="buffer" size="1"/>
+      <device_technology device_model_name="logic"/>
+      <port type="input" prefix="in" lib_name="A" size="1"/>
+      <port type="output" prefix="out" lib_name="Y" size="1"/>
+      <delay_matrix type="rise" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+      <delay_matrix type="fall" in_port="in" out_port="out">
+        10e-12
+      </delay_matrix>
+    </circuit_model>
+    <!-- Trick OpenFPGA into not auto-generating TGATE modules, which are not used in PnR -->
+    <circuit_model type="pass_gate" name="TGATE" prefix="TGATE" is_default="true" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/HDL/common/fd_hd_mux_custom_cells_tt.v">
+      <design_technology type="cmos" topology="transmission_gate" nmos_size="1" pmos_size="2"/>
+      <device_technology device_model_name="logic"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="false"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="input" prefix="sel" size="1"/>
+      <port type="input" prefix="selb" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <delay_matrix type="rise" in_port="in sel selb" out_port="out">
+        10e-12 5e-12 5e-12
+      </delay_matrix>
+      <delay_matrix type="fall" in_port="in sel selb" out_port="out">
+        10e-12 5e-12 5e-12
+      </delay_matrix>
+    </circuit_model>
    <circuit_model type="gate" name="sky130_fd_sc_hd__or2_1" prefix="sky130_fd_sc_hd__or2_1" is_default="true" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/or2/sky130_fd_sc_hd__or2_1.v">
      <design_technology type="cmos" topology="OR"/>
      <device_technology device_model_name="logic"/>
@ -95,12 +160,12 @@
      </delay_matrix>
    </circuit_model>
    <!-- Define a circuit model for the standard cell MUX2
-         OpenFPGA requires the following truth table for the MUX2
-         When the select signal sel is enabled, the first input, i.e., in0
-         will be propagated to the output, i.e., out
-         If your standard cell provider does not offer the exact truth table,
-         you can simply swap the inputs as shown in the example below
-      -->
+      OpenFPGA requires the following truth table for the MUX2
+      When the select signal sel is enabled, the first input, i.e., in0
+      will be propagated to the output, i.e., out
+      If your standard cell provider does not offer the exact truth table,
+      you can simply swap the inputs as shown in the example below
+    -->
    <circuit_model type="gate" name="sky130_fd_sc_hd__mux2_1" prefix="sky130_fd_sc_hd__mux2_1" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/mux2/sky130_fd_sc_hd__mux2_1.v">
      <design_technology type="cmos" topology="MUX2"/>
      <device_technology device_model_name="logic"/>
@ -118,7 +183,7 @@
      <port type="input" prefix="in" size="1"/>
      <port type="output" prefix="out" size="1"/>
      <wire_param model_type="pi" R="101" C="22.5e-15" num_level="1"/>
-      <!-- model_type could be T, res_val and cap_val DON'T CARE -->
+        <!-- model_type could be T, res_val and cap_val DON'T CARE -->
    </circuit_model>
    <circuit_model type="wire" name="direct_interc" prefix="direct_interc" is_default="true">
      <design_technology type="cmos"/>
@ -127,7 +192,79 @@
      <port type="input" prefix="in" size="1"/>
      <port type="output" prefix="out" size="1"/>
      <wire_param model_type="pi" R="0" C="0" num_level="1"/>
-      <!-- model_type could be T, res_val cap_val should be defined -->
+        <!-- model_type could be T, res_val cap_val should be defined -->
+    </circuit_model>
+    <circuit_model type="mux" name="mux_2level" prefix="mux_2level" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="2" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_1"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_2level_tapbuf4" prefix="mux_2level_tapbuf4" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="2" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_4"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_2level_tapbuf8" prefix="mux_2level_tapbuf8" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="2" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_8"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_2level_tapbuf16" prefix="mux_2level_tapbuf16" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="2" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_16"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_1level" prefix="mux_1level" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="1" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_1level_io" prefix="mux_1level_io" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="1" local_encoder="false"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_1"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_1level_fabric" prefix="mux_1level_fabric" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="1" local_encoder="false"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_1"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
+    </circuit_model>
+    <circuit_model type="mux" name="mux_1level_tapbuf" prefix="mux_1level_tapbuf" dump_structural_verilog="true">
+      <design_technology type="cmos" structure="multi_level" num_level="1" add_const_input="true" const_input_val="1" local_encoder="true"/>
+      <input_buffer exist="false"/>
+      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_4"/>
+      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
+      <port type="input" prefix="in" size="1"/>
+      <port type="output" prefix="out" size="1"/>
+      <port type="sram" prefix="sram" size="1"/>
    </circuit_model>
    <circuit_model type="mux" name="mux_tree" prefix="mux_tree" is_default="true" dump_structural_verilog="true">
      <design_technology type="cmos" structure="tree" add_const_input="true" const_input_val="1"/>
@ -147,7 +284,6 @@
      <port type="output" prefix="out" size="1"/>
      <port type="sram" prefix="sram" size="1"/>
    </circuit_model>
-    <!--DFF subckt ports should be defined as <D> <Q> <CLK> <RESET> <SET>  -->
    <circuit_model type="ff" name="sky130_fd_sc_hd__sdfrtp_1" prefix="sky130_fd_sc_hd__sdfrtp_1" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/sdfrtp/sky130_fd_sc_hd__sdfrtp_1.v">
      <design_technology type="cmos"/>
      <input_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
@ -167,33 +303,37 @@
      <lut_input_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_2"/>
      <lut_intermediate_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__buf_2" location_map="-1-"/>
      <pass_gate_logic circuit_model_name="sky130_fd_sc_hd__mux2_1"/>
-      <port type="input" prefix="in" size="4" tri_state_map="---1" circuit_model_name="sky130_fd_sc_hd__or2_1"/>
+      <port type="input" prefix="in" size="4"/>
      <port type="output" prefix="lut2_out" size="2" lut_frac_level="2" lut_output_mask="2,3"/>
      <port type="output" prefix="lut4_out" size="1" lut_output_mask="0"/>
      <port type="sram" prefix="sram" size="16"/>
-      <port type="sram" prefix="mode" size="1" mode_select="true" circuit_model_name="sky130_fd_sc_hd__dfrtp_1" default_val="1"/>
    </circuit_model>
-    <!--Scan-chain DFF subckt ports should be defined as <D> <Q> <Qb> <CLK> <RESET> <SET>  -->
-    <circuit_model type="ccff" name="sky130_fd_sc_hd__dfrtp_1" prefix="sky130_fd_sc_hd__dfrtp_1" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/PDK/skywater-pdk/libraries/sky130_fd_sc_hd/latest/cells/dfrtp/sky130_fd_sc_hd__dfrtp_1.v">
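+    <!-- New ccff model CFGSDFFR (adds SI, Test_en and CFG_DONE ports); replaces sky130_fd_sc_hd__dfrtp_1 as the configuration flip-flop -->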
+    <circuit_model type="ccff" name="CFGSDFFR" prefix="CFGSDFFR" spice_netlist="${OPENFPGA_PATH}/openfpga_flow/openfpga_cell_library/spice/dff.sp" verilog_netlist="${OPENFPGA_PATH}/openfpga_flow/openfpga_cell_library/verilog/dff.v">
      <design_technology type="cmos"/>
      <input_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
+      <port type="input" prefix="pReset" lib_name="RST" size="1" is_global="true" default_val="0" is_reset="true" is_prog="true"/>
+      <port type="input" prefix="Test_en" lib_name="SE" size="1" is_global="true" default_val="0"/>
+      <port type="input" prefix="CFG_DONE" lib_name="CFGE" size="1" is_global="true" default_val="0" is_config_enable="true"/>
      <port type="input" prefix="D" size="1"/>
+      <port type="input" prefix="SI" size="1"/>
      <port type="output" prefix="Q" size="1"/>
-      <port type="clock" prefix="prog_clk" lib_name="CLK" size="1" is_global="true" default_val="0" is_prog="true"/>
-      <port type="input" prefix="pReset" lib_name="RESET_B" size="1" is_global="true" default_val="1" is_prog="true" is_reset="true"/> 
+      <port type="output" prefix="CFGQN" size="1"/>
+      <port type="output" prefix="CFGQ" size="1"/>
+      <port type="clock" prefix="prog_clk" lib_name="CK" size="1" is_global="true" default_val="0" is_prog="true"/>
    </circuit_model>
-    <circuit_model type="iopad" name="EMBEDDED_IO_HD" prefix="EMBEDDED_IO_HD" is_default="true" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/HDL/common/digital_io_hd.v">
+    <circuit_model type="iopad" name="IO" prefix="IO" is_default="true" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/HDL/common/ql_iso_io_logic.v">
      <design_technology type="cmos"/>
      <input_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
      <output_buffer exist="true" circuit_model_name="sky130_fd_sc_hd__inv_1"/>
-      <port type="input" prefix="SOC_IN" lib_name="SOC_IN" size="1" is_global="true" is_io="true" is_data_io="true"/>
-      <port type="output" prefix="SOC_OUT" lib_name="SOC_OUT" size="1" is_global="true" is_io="true" is_data_io="true"/>
-      <port type="output" prefix="SOC_DIR" lib_name="SOC_DIR" size="1" is_global="true" is_io="true"/>
-      <port type="input" prefix="IO_ISOL_N" lib_name="IO_ISOL_N" size="1" is_global="true" default_val="1"/>
+      <port type="input" prefix="A2F" lib_name="SOC_IN" size="1" is_global="true" is_io="true" is_data_io="true"/>
+      <port type="output" prefix="F2A" lib_name="SOC_OUT" size="1" is_global="true" is_io="true" is_data_io="true"/>
      <port type="output" prefix="inpad" lib_name="FPGA_IN" size="1"/>
      <port type="input" prefix="outpad" lib_name="FPGA_OUT" size="1"/>
-      <port type="sram" prefix="en" lib_name="FPGA_DIR" size="1" mode_select="true" circuit_model_name="sky130_fd_sc_hd__dfrtp_1" default_val="1"/>
+      <port type="input" prefix="IO_ISOL_N" lib_name="IO_ISOL_N" size="1" is_global="true" default_val="1" is_config_enable="true"/>
+      <port type="input" prefix="CFG_DONE" lib_name="CFG_DONE" size="1" is_global="true" default_val="0" is_config_enable="true"/>
+      <port type="sram" prefix="io_dir" lib_name="FPGA_IO_DIR" size="1" mode_select="true" circuit_model_name="CFGSDFFR" default_val="1"/>
    </circuit_model>
    <circuit_model type="hard_logic" name="sky130_fd_sc_hd__mux2_1_wrapper" prefix="sky130_fd_sc_hd__mux2_1_wrapper" verilog_netlist="${SKYWATER_OPENFPGA_HOME}/HDL/common/sky130_fd_sc_hd_wrapper.v">
      <design_technology type="cmos"/>
@ -207,7 +347,7 @@
    </circuit_model>
  </circuit_library>
  <configuration_protocol>
-    <organization type="scan_chain" circuit_model_name="sky130_fd_sc_hd__dfrtp_1" num_regions="1"/>
+    <organization type="scan_chain" circuit_model_name="CFGSDFFR" num_regions="1"/>
  </configuration_protocol>
  <connection_block>
    <switch name="ipin_cblock" circuit_model_name="mux_tree_tapbuf"/>
@ -228,37 +368,60 @@
    <direct name="scan_chain" circuit_model_name="direct_interc" type="column" x_dir="positive" y_dir="positive"/>
  </direct_connection>
  <tile_annotations>
-      <global_port name="clk" is_clock="true" default_val="0">
-          <tile name="clb" port="clk[0:3]" x="-1" y="-1"/>
-      </global_port>
-      <global_port name="Reset" is_reset="true" default_val="1">
-          <tile name="clb" port="reset" x="-1" y="-1"/>
-      </global_port>
+    <global_port name="clk" is_clock="true" default_val="0">
+      <tile name="clb" port="clk[0:3]" x="-1" y="-1"/>
+      <tile name="io_top" port="clk[0:3]" x="-1" y="-1"/>
+      <tile name="io_right" port="clk[0:3]" x="-1" y="-1"/>
+      <tile name="io_bottom" port="clk[0:3]" x="-1" y="-1"/>
+      <tile name="io_left" port="clk[0:3]" x="-1" y="-1"/>
+    </global_port>
+    <global_port name="reset" is_reset="true" default_val="0">
+      <tile name="clb" port="reset" x="-1" y="-1"/>
+      <tile name="io_top" port="reset" x="-1" y="-1"/>
+      <tile name="io_right" port="reset" x="-1" y="-1"/>
+      <tile name="io_bottom" port="reset" x="-1" y="-1"/>
+      <tile name="io_left" port="reset" x="-1" y="-1"/>
+    </global_port>
  </tile_annotations>
  <pb_type_annotations>
    <!-- physical pb_type binding in complex block IO -->
    <pb_type name="io" physical_mode_name="physical" idle_mode_name="inpad"/>
-    <!-- IMPORTANT: must set unused I/Os to operating in INPUT mode !!! -->
-    <pb_type name="io[physical].iopad" circuit_model_name="EMBEDDED_IO_HD" mode_bits="1"/> 
-    <pb_type name="io[inpad].inpad" physical_pb_type_name="io[physical].iopad" mode_bits="1"/> 
-    <pb_type name="io[outpad].outpad" physical_pb_type_name="io[physical].iopad" mode_bits="0"/> 
-    <!-- End physical pb_type binding in complex block IO -->
+      <!-- IMPORTANT: unused I/Os must be set to operate in INPUT mode! -->
+    <pb_type name="io[physical].iopad">
+      <interconnect name="mux1" circuit_model_name="mux_1level_io"/>
+      <interconnect name="mux2" circuit_model_name="mux_1level_io"/>
+    </pb_type>
+    <pb_type name="io[physical].iopad.pad" circuit_model_name="IO" mode_bits="1"/>
+    <pb_type name="io[io_input].io_input.inpad" physical_pb_type_name="io[physical].iopad.pad" mode_bits="1"/>
+    <pb_type name="io[io_output].io_output.outpad" physical_pb_type_name="io[physical].iopad.pad" mode_bits="0"/>

-    <!-- physical pb_type binding in complex block CLB -->
+    <pb_type name="io[physical].iopad.ff" circuit_model_name="sky130_fd_sc_hd__sdfrtp_1"/>
+    <pb_type name="io[io_input].io_input.ff" physical_pb_type_name="io[physical].iopad.ff"/>
+    <pb_type name="io[io_output].io_output.ff" physical_pb_type_name="io[physical].iopad.ff"/>
+      <!-- End physical pb_type binding in complex block IO -->
+
+      <!-- physical pb_type binding in complex block CLB -->
+    <pb_type name="clb.fle[physical].fabric">
+      <!-- Bind each interconnect to the circuit model that physically implements it; if no binding is given, the default model is used -->
+      <interconnect name="mux1" circuit_model_name="mux_1level_fabric"/>
+      <interconnect name="mux2" circuit_model_name="mux_1level_fabric"/>
+    </pb_type>  
+    <pb_type name="clb.fle[physical].fabric.frac_logic">
+      <interconnect name="mux2" circuit_model_name="mux_1level_fabric"/>
+    </pb_type>	
    <!-- physical mode will be the default mode if not specified -->
    <pb_type name="clb.fle" physical_mode_name="physical"/>
-    <pb_type name="clb.fle[physical].fabric.frac_logic.frac_lut4" circuit_model_name="frac_lut4" mode_bits="0"/>
+    <pb_type name="clb.fle[physical].fabric.frac_logic.frac_lut4" circuit_model_name="frac_lut4"/>
    <pb_type name="clb.fle[physical].fabric.frac_logic.carry_follower" circuit_model_name="sky130_fd_sc_hd__mux2_1_wrapper"/>
    <pb_type name="clb.fle[physical].fabric.ff" circuit_model_name="sky130_fd_sc_hd__sdfrtp_1"/>
    <!-- Binding operating pb_type to physical pb_type -->
-    <!-- Binding operating pb_types in mode 'ble4' -->
-    <pb_type name="clb.fle[n1_lut4].ble4.lut4" physical_pb_type_name="clb.fle[physical].fabric.frac_logic.frac_lut4" mode_bits="0">
+    <pb_type name="clb.fle[n1_lut4].ble4.lut4" physical_pb_type_name="clb.fle[physical].fabric.frac_logic.frac_lut4">
      <!-- Binding the lut4 to the first 4 inputs of fracturable lut4 -->
      <port name="in" physical_mode_port="in[0:3]"/>
      <port name="out" physical_mode_port="lut4_out"/>
    </pb_type>
    <pb_type name="clb.fle[n1_lut4].ble4.ff" physical_pb_type_name="clb.fle[physical].fabric.ff"/>
-    <!-- Binding operating pb_types in mode 'shift_register' -->
+      <!-- Binding operating pb_types in mode 'shift_register' -->
    <pb_type name="clb.fle[shift_register].shift_reg.ff" physical_pb_type_name="clb.fle[physical].fabric.ff"/>
    <!-- End physical pb_type binding in complex block IO -->
  </pb_type_annotations>
--- a/ARCH/vpr_arch/k4_N8_tileable_reset_softadder_register_scan_chain_nonLR_caravel_io_skywater130nm.xml
+++ b/ARCH/vpr_arch/k4_N8_tileable_reset_softadder_register_scan_chain_nonLR_caravel_io_skywater130nm.xml
@ -1,27 +1,27 @@
 <!-- 
-  Low-cost homogeneous FPGA Architecture.
+Low-cost homogeneous FPGA Architecture.

-  - Skywater 130 nm technology
-  - General purpose logic block: 
-    K = 4, N = 8, fracturable 4 LUTs (can operate as one 4-LUT or two 3-LUTs with all 3 inputs shared) 
-    with optionally registered outputs
-  - Routing architecture:
-      - 10% L = 1, fc_in = 0.15, Fc_out = 0.10
-      - 10% L = 2, fc_in = 0.15, Fc_out = 0.10
-      - 80% L = 4, fc_in = 0.15, Fc_out = 0.10
-      - 100 routing tracks per channel
+- Skywater 130 nm technology
+- General purpose logic block: 
+K = 4, N = 8, fracturable 4 LUTs (can operate as one 4-LUT or two 3-LUTs with all 3 inputs shared) 
+with optionally registered outputs
+- Routing architecture:
+- 10% L = 1, fc_in = 0.15, Fc_out = 0.10
+- 10% L = 2, fc_in = 0.15, Fc_out = 0.10
+- 80% L = 4, fc_in = 0.15, Fc_out = 0.10
+- 100 routing tracks per channel

-  Authors: Xifan Tang
+Authors: Xifan Tang
 -->
 <architecture>
  <!-- 
-       ODIN II specific config begins 
-       Describes the types of user-specified netlist blocks (in blif, this corresponds to 
-       ".model [type_of_block]") that this architecture supports.
+    ODIN II specific config begins 
+    Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+    ".model [type_of_block]") that this architecture supports.

-       Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
-       already special structures in blif (.names, .input, .output, and .latch) 
-       that describe them.
+    Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+    already special structures in blif (.names, .input, .output, and .latch) 
+    that describe them.
  -->
  <models>
    <!-- A virtual model for I/O to be used in the physical mode of io block -->
@ -68,19 +68,28 @@
  </models>
  <tiles>
    <!-- Do NOT add clock pins to I/O here!!! VPR does not build clock network in the way that OpenFPGA can support
-         If you need to register the I/O, define clocks in the circuit models
-         These clocks can be handled in back-end
-     -->
+      If you need to register the I/O, define clocks in the circuit models
+      These clocks can be handled in back-end
+    -->
    <!-- Top-side has 1 I/O per tile -->
    <tile name="io_top" capacity="16" area="0">
      <equivalent_sites>
        <site pb_type="io"/>
      </equivalent_sites>
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+      <clock name="clk" num_pins="4"/>
+      <input name="f2a_i" num_pins="1"/>
+      <output name="a2f_o" num_pins="1"/>
+      <input name="sc_in" num_pins="1"/>
+      <output name="sc_out" num_pins="1"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+        <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_in" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_out" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="reset" fc_type="frac" fc_val="0"/>
+      </fc>
      <pinlocations pattern="custom">
-        <loc side="bottom">io_top.outpad io_top.inpad</loc>
+        <loc side="bottom">io_top.a2f_o io_top.f2a_i io_top.clk io_top.sc_in io_top.sc_out io_top.reset</loc>
      </pinlocations>
    </tile>
    <!-- Right-side has 1 I/O per tile -->
@ -88,11 +97,20 @@
      <equivalent_sites>
        <site pb_type="io"/>
      </equivalent_sites>
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+      <clock name="clk" num_pins="4"/>
+      <input name="f2a_i" num_pins="1"/>
+      <output name="a2f_o" num_pins="1"/>
+      <input name="sc_in" num_pins="1"/>
+      <output name="sc_out" num_pins="1"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+        <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_in" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_out" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="reset" fc_type="frac" fc_val="0"/>
+      </fc>
      <pinlocations pattern="custom">
-        <loc side="left">io_right.outpad io_right.inpad</loc>
+        <loc side="left">io_right.a2f_o io_right.f2a_i io_right.clk io_right.sc_in io_right.sc_out io_right.reset</loc>
      </pinlocations>
    </tile>
    <!-- Bottom-side has 9 I/O per tile -->
@ -100,11 +118,20 @@
      <equivalent_sites>
        <site pb_type="io"/>
      </equivalent_sites>
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+      <clock name="clk" num_pins="4"/>
+      <input name="f2a_i" num_pins="1"/>
+      <output name="a2f_o" num_pins="1"/>
+      <input name="sc_in" num_pins="1"/>
+      <output name="sc_out" num_pins="1"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+        <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_in" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_out" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="reset" fc_type="frac" fc_val="0"/>
+      </fc>
      <pinlocations pattern="custom">
-        <loc side="top">io_bottom.outpad io_bottom.inpad</loc>
+        <loc side="top">io_bottom.a2f_o io_bottom.f2a_i io_bottom.clk io_bottom.sc_in io_bottom.sc_out io_bottom.reset</loc>
      </pinlocations>
    </tile>
    <!-- Left-side has 1 I/O per tile -->
@ -112,11 +139,20 @@
      <equivalent_sites>
        <site pb_type="io"/>
      </equivalent_sites>
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+      <clock name="clk" num_pins="4"/>
+      <input name="f2a_i" num_pins="1"/>
+      <output name="a2f_o" num_pins="1"/>
+      <input name="sc_in" num_pins="1"/>
+      <output name="sc_out" num_pins="1"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+        <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_in" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="sc_out" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="reset" fc_type="frac" fc_val="0"/>
+      </fc>
      <pinlocations pattern="custom">
-        <loc side="right">io_left.outpad io_left.inpad</loc>
+        <loc side="right">io_left.a2f_o io_left.f2a_i io_left.clk io_left.sc_in io_left.sc_out io_left.reset</loc>
      </pinlocations>
    </tile>
    <!-- CLB has most pins on the top and right sides -->
@ -164,7 +200,7 @@
      <col type="io_left" startx="0" priority="100"/>
      <col type="io_right" startx="W-1" priority="100"/>
      <corners type="EMPTY" priority="101"/>
-      <!--Fill with 'clb'-->
+        <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </auto_layout>
    <fixed_layout name="2x2" width="4" height="4">
@ -174,7 +210,7 @@
      <col type="io_left" startx="0" priority="100"/>
      <col type="io_right" startx="W-1" priority="100"/>
      <corners type="EMPTY" priority="101"/>
-      <!--Fill with 'clb'-->
+        <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
    <fixed_layout name="12x12" width="14" height="14">
@ -184,7 +220,7 @@
      <col type="io_left" startx="0" priority="100"/>
      <col type="io_right" startx="W-1" priority="100"/>
      <corners type="EMPTY" priority="101"/>
-      <!--Fill with 'clb'-->
+        <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
    <fixed_layout name="32x32" width="34" height="34">
@ -194,30 +230,30 @@
      <col type="io_left" startx="0" priority="100"/>
      <col type="io_right" startx="W-1" priority="100"/>
      <corners type="EMPTY" priority="101"/>
-      <!--Fill with 'clb'-->
+        <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
  </layout>
  <device>
    <!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM 
-			     models. We are modifying the delay values however, to include metal C and R, which allows more architecture
-			     experimentation. We are also modifying the relative resistance of PMOS to be 1.8x that of NMOS
-			     (vs. Ian's 3x) as 1.8x lines up with Jeff G's data from a 45 nm process (and is more typical of 
-			     45 nm in general). I'm upping the Rmin_nmos from Ian's just over 6k to nearly 9k, and dropping 
-			     RminW_pmos from 18k to 16k to hit this 1.8x ratio, while keeping the delays of buffers approximately
-			     lined up with Stratix IV. 
-			     We are using Jeff G.'s capacitance data for 45 nm (in tech/ptm_45nm).
-			     Jeff's tables list C in for transistors with widths in multiples of the minimum feature size (45 nm).
-			     The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply drive strength sizes in this file
-	                     by 2.5x when looking up in Jeff's tables.
-			     The delay values are lined up with Stratix IV, which has an architecture similar to this
-			     proposed FPGA, and which is also 40 nm 
-			     C_ipin_cblock: input capacitance of a track buffer, which VPR assumes is a single-stage
-			     4x minimum drive strength buffer. -->
+      models. We are modifying the delay values however, to include metal C and R, which allows more architecture
+      experimentation. We are also modifying the relative resistance of PMOS to be 1.8x that of NMOS
+      (vs. Ian's 3x) as 1.8x lines up with Jeff G's data from a 45 nm process (and is more typical of 
+      45 nm in general). I'm upping the Rmin_nmos from Ian's just over 6k to nearly 9k, and dropping 
+      RminW_pmos from 18k to 16k to hit this 1.8x ratio, while keeping the delays of buffers approximately
+      lined up with Stratix IV. 
+      We are using Jeff G.'s capacitance data for 45 nm (in tech/ptm_45nm).
+      Jeff's tables list C in for transistors with widths in multiples of the minimum feature size (45 nm).
+      The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply drive strength sizes in this file
+      by 2.5x when looking up in Jeff's tables.
+      The delay values are lined up with Stratix IV, which has an architecture similar to this
+      proposed FPGA, and which is also 40 nm 
+      C_ipin_cblock: input capacitance of a track buffer, which VPR assumes is a single-stage
+    4x minimum drive strength buffer. -->
    <sizing R_minW_nmos="8926" R_minW_pmos="16067"/>
-    <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing)
-     	  area; set to 0 since we explicitly set the area of all blocks currently in this architecture file.
-	  -->
+      <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing)
+        area; set to 0 since we explicitly set the area of all blocks currently in this architecture file.
+      -->
    <area grid_logic_tile_area="0"/>
    <chan_width_distr>
      <x distr="uniform" peak="1.000000"/>
@ -228,28 +264,28 @@
  </device>
  <switchlist>
    <!-- VB: the mux_trans_size and buf_size data below is in minimum width transistor *areas*, assuming the purple
-	       book area formula. This means the mux transistors are about 5x minimum drive strength.
-	       We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large 
-	       mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. We assume
-	       the n and p transistors in the first stage are equal-sized to lower the buffer trip point, since it's fed
-	       by a pass transistor mux. We can then reverse engineer the buffer second stage to hit the specified 
-	       buf_size (really buffer area) - 16.2x minimum drive nmos and 1.8*16.2 = 29.2x minimum drive.
-	       I then took the data from Jeff G.'s PTM modeling of 45 nm to get the Cin (gate of first stage) and Cout 
-	       (diff of second stage) listed below.  Jeff's models are in tech/ptm_45nm, and are in min feature multiples.
-	       The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply the drive strength sizes above by 
-	       2.5x when looking up in Jeff's tables.
-	       Finally, we choose a switch delay (58 ps) that leads to length 4 wires having a delay equal to that of SIV of 126 ps.
-	       This also leads to the switch being 46% of the total wire delay, which is reasonable. -->
+      book area formula. This means the mux transistors are about 5x minimum drive strength.
+      We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large 
+      mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. We assume
+      the n and p transistors in the first stage are equal-sized to lower the buffer trip point, since it's fed
+      by a pass transistor mux. We can then reverse engineer the buffer second stage to hit the specified 
+      buf_size (really buffer area) - 16.2x minimum drive nmos and 1.8*16.2 = 29.2x minimum drive.
+      I then took the data from Jeff G.'s PTM modeling of 45 nm to get the Cin (gate of first stage) and Cout 
+      (diff of second stage) listed below.  Jeff's models are in tech/ptm_45nm, and are in min feature multiples.
+      The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply the drive strength sizes above by 
+      2.5x when looking up in Jeff's tables.
+      Finally, we choose a switch delay (58 ps) that leads to length 4 wires having a delay equal to that of SIV of 126 ps.
+    This also leads to the switch being 46% of the total wire delay, which is reasonable. -->
    <switch type="mux" name="L1_mux" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
    <switch type="mux" name="L2_mux" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
    <switch type="mux" name="L4_mux" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
-    <!--switch ipin_cblock resistance set to yeild for 4x minimum drive strength buffer-->
+      <!--switch ipin_cblock resistance set to yield for 4x minimum drive strength buffer-->
    <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="7.247000e-11" mux_trans_size="1.222260" buf_size="auto"/>
  </switchlist>
  <segmentlist>
    <!--- VB & JL: using ITRS metal stack data, 96 nm half pitch wires, which are intermediate metal width/space.  
-			     With the 96 nm half pitch, such wires would take 60 um of height, vs. a 90 nm high (approximated as square) Stratix IV tile so this seems
-			     reasonable. Using a tile length of 90 nm, corresponding to the length of a Stratix IV tile if it were square. -->
+      With the 96 nm half pitch, such wires would take 60 um of height, vs. a 90 nm high (approximated as square) Stratix IV tile so this seems
+    reasonable. Using a tile length of 90 nm, corresponding to the length of a Stratix IV tile if it were square. -->
    <!-- GIVE a specific name for the segment! OpenFPGA appreciates that! -->
    <segment name="L1" freq="0.20" length="1" type="unidir" Rmetal="101" Cmetal="22.5e-15">
      <mux name="L1_mux"/>
@ -275,64 +311,131 @@
  <complexblocklist>
    <!-- Define input pads begin -->
    <pb_type name="io">
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <!-- Do NOT add clock pins to I/O here!!! VPR does not build clock network in the way that OpenFPGA can support
-           If you need to register the I/O, define clocks in the circuit models
-           These clocks can be handled in back-end
-       -->
-      <!-- A mode denotes the physical implementation of an I/O 
-           This mode will be not packable but is mainly used for fabric verilog generation   
-        -->
+      <clock name="clk" num_pins="4"/>
+      <input name="f2a_i" num_pins="1"/>
+      <output name="a2f_o" num_pins="1"/>
+      <input name="sc_in" num_pins="1"/>
+      <output name="sc_out" num_pins="1"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+        <!-- Physical mode definition begin (physical implementation of the io) -->
      <mode name="physical" disabled_in_pack="true">
-        <pb_type name="iopad" blif_model=".subckt io" num_pb="1">
-          <input name="outpad" num_pins="1"/>
-          <output name="inpad" num_pins="1"/>
+        <pb_type name="iopad" num_pb="1">
+          <clock name="clk" num_pins="1"/>
+          <input name="f2a_i" num_pins="1"/>
+          <output name="a2f_o" num_pins="1"/>
+          <input name="sc_in" num_pins="1"/>
+          <input name="reset" num_pins="1"/>
+          <output name="sc_out" num_pins="1"/>
+          <pb_type name="ff" blif_model=".subckt scff" num_pb="2">
+            <input name="D" num_pins="1" port_class="D"/>
+            <input name="DI" num_pins="1"/>
+            <input name="reset" num_pins="1"/>
+            <output name="Q" num_pins="1" port_class="Q"/>
+            <clock name="clk" num_pins="1" port_class="clock"/>
+            <T_setup value="66e-12" port="ff.D" clock="clk"/>
+            <T_setup value="66e-12" port="ff.DI" clock="clk"/>
+            <T_setup value="66e-12" port="ff.reset" clock="clk"/>
+            <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/>
+          </pb_type>
+          <pb_type name="pad" blif_model=".subckt io" num_pb="1">
+            <input name="outpad" num_pins="1"/>
+            <output name="inpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="ff[0:0]-clk" input="iopad.clk" output="ff[0:0].clk"/>
+            <direct name="ff[1:1]-clk" input="iopad.clk" output="ff[1:1].clk"/>
+            <direct name="ff[0:0]-D" input="iopad.f2a_i" output="ff[0:0].D" />
+            <direct name="ff[1:1]-D" input="pad.inpad" output="ff[1:1].D"/>
+            <direct name="ff[0:0]-DI" input="iopad.sc_in" output="ff[0:0].DI"/>
+            <direct name="ff[1:1]-DI" input="ff[0:0].Q" output="ff[1:1].DI"/>
+            <direct name="iopad-sc_out" input="ff[1:1].Q" output="iopad.sc_out"/>
+            <complete name="complete1" input="iopad.reset" output="ff[1:0].reset"/>
+            <mux name="mux1" input="iopad.f2a_i ff[0:0].Q" output="pad.outpad">
+              <delay_constant max="25e-12" in_port="iopad.f2a_i" out_port="pad.outpad"/>
+              <delay_constant max="45e-12" in_port="ff[0:0].Q" out_port="pad.outpad"/>
+            </mux>
+            <mux name="mux2" input="pad.inpad ff[1:1].Q" output="iopad.a2f_o">
+              <delay_constant max="25e-12" in_port="pad.inpad" out_port="iopad.a2f_o"/>
+              <delay_constant max="45e-12" in_port="ff[1:1].Q" out_port="iopad.a2f_o"/>
+            </mux>
+          </interconnect>
        </pb_type>
        <interconnect>
-          <direct name="outpad" input="io.outpad" output="iopad.outpad">
-            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="iopad.outpad"/>
-          </direct>
-          <direct name="inpad" input="iopad.inpad" output="io.inpad">
-            <delay_constant max="4.243e-11" in_port="iopad.inpad" out_port="io.inpad"/>
-          </direct>
+          <complete name="clks" input="io.clk" output="iopad.clk"/>
+          <direct name="direct3" input="io.f2a_i" output="iopad.f2a_i"/>
+          <direct name="direct4" input="iopad.a2f_o" output="io.a2f_o"/>
+          <direct name="direct6" input="io.sc_in" output="iopad.sc_in"/>
+          <direct name="direct7" input="iopad.sc_out" output="io.sc_out"/>
+          <direct name="direct8" input="io.reset" output="iopad.reset"/>
        </interconnect>
      </mode>
-
-      <!-- IOs can operate as either inputs or outputs.
-	     Delays below come from Ian Kuon. They are small, so they should be interpreted as
-	     the delays to and from registers in the I/O (and generally I/Os are registered 
-	     today and that is when you timing analyze them.
-	     -->
-      <mode name="inpad">
-        <pb_type name="inpad" blif_model=".input" num_pb="1">
-          <output name="inpad" num_pins="1"/>
+      <!-- Physical mode definition end (physical implementation of the io) -->
+      <mode name="io_output">
+        <pb_type name="io_output" num_pb="1">
+          <clock name="clk" num_pins="1"/>
+          <input name="f2a_i" num_pins="1"/>
+          <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+            <input name="D" num_pins="1" port_class="D"/>
+            <output name="Q" num_pins="1" port_class="Q"/>
+            <clock name="clk" num_pins="1" port_class="clock"/>
+            <T_setup value="66e-12" port="ff.D" clock="clk"/>
+            <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/>
+          </pb_type>
+          <pb_type name="outpad" blif_model=".output" num_pb="1">
+            <input name="outpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="ff-clk" input="io_output.clk" output="ff.clk"/>
+            <direct name="ff-D" input="io_output.f2a_i" output="ff.D"/>
+            <mux name="mux1" input="ff.Q io_output.f2a_i" output="outpad.outpad">
+              <pack_pattern name="pack-OREG" in_port="ff.Q" out_port="outpad.outpad"/>
+              <delay_constant max="25e-12" in_port="io_output.f2a_i" out_port="outpad.outpad"/>
+              <delay_constant max="45e-12" in_port="ff.Q" out_port="outpad.outpad"/>
+            </mux>
+          </interconnect>
        </pb_type>
        <interconnect>
-          <direct name="inpad" input="inpad.inpad" output="io.inpad">
-            <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
-          </direct>
+          <complete name="io_output-clk" input="io.clk" output="io_output.clk"/>
+          <direct name="io_output-f2a_i" input="io.f2a_i" output="io_output.f2a_i"/>
        </interconnect>
      </mode>
-      <mode name="outpad">
-        <pb_type name="outpad" blif_model=".output" num_pb="1">
-          <input name="outpad" num_pins="1"/>
+      <mode name="io_input">
+        <pb_type name="io_input" num_pb="1">
+          <clock name="clk" num_pins="1"/>
+          <output name="a2f_o" num_pins="1"/>
+          <pb_type name="inpad" blif_model=".input" num_pb="1">
+            <output name="inpad" num_pins="1"/>
+          </pb_type>
+          <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+            <input name="D" num_pins="1" port_class="D"/>
+            <output name="Q" num_pins="1" port_class="Q"/>
+            <clock name="clk" num_pins="1" port_class="clock"/>
+            <T_setup value="66e-12" port="ff.D" clock="clk"/>
+            <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="ff-clk" input="io_input.clk" output="ff.clk"/>
+            <direct name="ff-D" input="inpad.inpad" output="ff.D"/>
+            <mux name="mux2" input="inpad.inpad ff.Q" output="io_input.a2f_o">
+              <pack_pattern name="pack-IREG" in_port="ff.Q" out_port="io_input.a2f_o"/>
+              <delay_constant max="25e-12" in_port="inpad.inpad" out_port="io_input.a2f_o"/>
+              <delay_constant max="45e-12" in_port="ff.Q" out_port="io_input.a2f_o"/>
+            </mux>
+          </interconnect>
        </pb_type>
        <interconnect>
-          <direct name="outpad" input="io.outpad" output="outpad.outpad">
-            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
-          </direct>
+          <direct name="io-a2f_o" input="io_input.a2f_o" output="io.a2f_o"/>
+          <complete name="io_input-clk" input="io.clk" output="io_input.clk"/>
        </interconnect>
      </mode>
-      <power method="ignore"/>
    </pb_type>
    <!-- Define I/O pads ends -->
    <!-- Define general purpose logic block (CLB) begin -->
    <!-- -Due to the absence of local routing, 
-         the 4 inputs of fracturable LUT4 are no longer equivalent, 
-         because the 4th input can not be switched when the dual-LUT3 modes are used.
-         So pin equivalence should be applied to the first 3 inputs only
-	  -->
+      the 4 inputs of fracturable LUT4 are no longer equivalent, 
+      because the 4th input can not be switched when the dual-LUT3 modes are used.
+      So pin equivalence should be applied to the first 3 inputs only
+    -->
    <pb_type name="clb">
      <input name="I" num_pins="24" equivalent="full"/>
      <input name="reg_in" num_pins="1"/>
@ -345,9 +448,9 @@
      <output name="cout" num_pins="1"/>
      <output name="cout_copy" num_pins="1"/>
      <clock name="clk" num_pins="4"/>
-      <!-- Describe fracturable logic element.  
-             Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. 
-             The outputs of the fracturable logic element can be optionally registered
+        <!-- Describe fracturable logic element.  
+          Each fracturable logic element has a 4-LUT that can alternatively operate as two 3-LUTs with shared inputs. 
+          The outputs of the fracturable logic element can be optionally registered
        -->
      <pb_type name="fle" num_pb="8">
        <input name="in" num_pins="4"/>
@ -360,7 +463,7 @@
        <output name="sc_out" num_pins="1"/>
        <output name="cout" num_pins="1"/>
        <clock name="clk" num_pins="1"/>
-        <!-- Physical mode definition begin (physical implementation of the fle) -->
+          <!-- Physical mode definition begin (physical implementation of the fle) -->
        <mode name="physical" disabled_in_pack="true">
          <pb_type name="fabric" num_pb="1">
            <input name="in" num_pins="4"/>
@ -378,7 +481,7 @@
              <input name="cin" num_pins="1"/>
              <output name="out" num_pins="1"/>
              <output name="cout" num_pins="1"/>
-              <!-- Define LUT -->
+                <!-- Define LUT -->
              <pb_type name="frac_lut4" blif_model=".subckt frac_lut4" num_pb="1">
                <input name="in" num_pins="4"/>
                <output name="lut2_out" num_pins="2"/>
@ -397,7 +500,6 @@
                <direct name="direct4" input="frac_lut4.lut2_out[1:1]" output="carry_follower.a"/>
                <direct name="direct5" input="frac_lut4.lut2_out[0:0]" output="carry_follower.cin"/>
                <direct name="direct6" input="carry_follower.cout" output="frac_logic.cout"/>
-                <!-- Xifan Tang: I use out[0] because the output of lut6 in lut6 mode is wired to the out[0] -->
                <direct name="direct7" input="frac_lut4.lut4_out" output="frac_logic.out"/>
                <mux name="mux2" input="frac_logic.cin frac_logic.in[2:2]" output="frac_lut4.in[2:2]"/>
              </interconnect>
@ -416,8 +518,8 @@
            </pb_type>         
            <interconnect>
              <direct name="direct1" input="fabric.in" output="frac_logic.in"/>
-	      <direct name="direct2" input="fabric.sc_in" output="ff.DI"/>
-	      <direct name="direct3" input="fabric.cin" output="frac_logic.cin"/>
+              <direct name="direct2" input="fabric.sc_in" output="ff.DI"/>
+              <direct name="direct3" input="fabric.cin" output="frac_logic.cin"/>
              <direct name="direct4" input="ff.Q" output="fabric.sc_out"/>
              <direct name="direct5" input="ff.Q" output="fabric.reg_out"/>
              <direct name="direct6" input="frac_logic.cout" output="fabric.cout"/>
@ -454,20 +556,20 @@
            <input name="in" num_pins="4"/>
            <output name="out" num_pins="1"/>
            <clock name="clk" num_pins="1"/>
-            <!-- Define LUT -->
+              <!-- Define LUT -->
            <pb_type name="lut4" blif_model=".names" num_pb="1" class="lut">
              <input name="in" num_pins="4" port_class="lut_in"/>
              <output name="out" num_pins="1" port_class="lut_out"/>
-              <!-- LUT timing using delay matrix -->
-              <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
-                       we instead take the average of these numbers to get more stable results
+                <!-- LUT timing using delay matrix -->
+                <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                  we instead take the average of these numbers to get more stable results
                  82e-12
                  173e-12
                  261e-12
                  263e-12
                  398e-12
                  397e-12
-                  -->
+                -->
              <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
                261e-12
                261e-12
@ -536,13 +638,13 @@
      </pb_type>
      <interconnect>
        <!-- We use direct connections to reduce the area to the most
-             The global local routing is going to compensate the loss in routability
-          -->
+          The global local routing is going to compensate the loss in routability
+        -->
        <!-- FIXME: The implicit port definition results in I0[0] connected to
-                    in[2]. Such twisted connection is not expected.
-                    I[0] should be connected to in[0]
-	  -->
-	<complete name="crossbar" input="clb.I fle[7:0].out" output="fle[7:0].in">
+          in[2]. Such twisted connection is not expected.
+          I[0] should be connected to in[0]
+        -->
+        <complete name="crossbar" input="clb.I fle[7:0].out" output="fle[7:0].in">
          <!-- TODO: Timing should be backannotated from post-PnR results -->
        </complete>
        <complete name="clks" input="clb.clk" output="fle[7:0].clk">
@ -550,24 +652,21 @@
        <complete name="resets" input="clb.reset" output="fle[7:0].reset">
        </complete>
        <!-- This way of specifying direct connection to clb outputs is important because this architecture uses automatic spreading of opins.  
-               By grouping to output pins in this fashion, if a logic block is completely filled by 6-LUTs, 
-               then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
-               naive specification).
-          -->
+          By grouping to output pins in this fashion, if a logic block is completely filled by 6-LUTs, 
+          then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
+          naive specification).
+        -->
        <direct name="clbouts1" input="fle[3:0].out" output="clb.O[3:0]"/>
-	<direct name="clbouts2" input="fle[7:4].out" output="clb.O[7:4]"/>
-	<direct name="cout_copy" input="fle[7:7].cout" output="clb.cout_copy"/>
-        <!-- Shift register chain links -->
+        <direct name="clbouts2" input="fle[7:4].out" output="clb.O[7:4]"/>
+        <direct name="cout_copy" input="fle[7:7].cout" output="clb.cout_copy"/>
+          <!-- Shift register chain links -->
        <direct name="shift_register_in" input="clb.reg_in" output="fle[0:0].reg_in">
          <!-- Put all inter-block carry chain delay on this one edge -->
          <delay_constant max="0.16e-9" in_port="clb.reg_in" out_port="fle[0:0].reg_in"/>
-          <!--pack_pattern name="chain" in_port="clb.reg_in" out_port="fle[0:0].reg_in"/-->
        </direct>
        <direct name="shift_register_out" input="fle[7:7].reg_out" output="clb.reg_out">
-          <!--pack_pattern name="chain" in_port="fle[7:7].reg_out" out_port="clb.reg_out"/-->
        </direct>
        <direct name="shift_register_link" input="fle[6:0].reg_out" output="fle[7:1].reg_in">
-          <!--pack_pattern name="chain" in_port="fle[6:0].reg_out" out_port="fle[7:1].reg_in"/-->
        </direct>
        <!-- Scan chain links -->
        <direct name="scan_chain_in" input="clb.sc_in" output="fle[0:0].sc_in">
--- a/HDL/common/ql_io_logic.v
+++ b/HDL/common/ql_io_logic.v
@ -0,0 +1,52 @@
+`timescale 1ns/1ps
+
+//-----------------------------------------------------
+// Function    : An embedded I/O with
+//               - A configuration-done gate: while CFG_DONE is
+//                 low, SOC_OUT is held high. This is to avoid
+//                 any unexpected output signals damaging
+//                 circuits outside the FPGA while the configurable
+//                 memories are not properly initialized
+//                 This feature may not be needed if the configurable
+//                 memory cell has a built-in set/reset functionality
+//               - Internal protection circuitry to ensure
+//                 clean signals at all the SOC I/O ports
+//                 This is to avoid 
+//                   - outputting any random signal
+//                     when the I/O is in input mode, and to avoid
+//                   - being driven by any random signal
+//                     when the I/O is in output mode
+//
+// Note: This cell is built with Standard Cells from HD library
+//       It is already technology mapped and can be directly used
+//       for physical design
+//-----------------------------------------------------
+module EMBEDDED_IO_HD (
+  input SOC_IN,   // Input to drive the inpad signal
+  output SOC_OUT, // Output the outpad signal
+  output FPGA_IN, // Input data to FPGA
+  input FPGA_OUT, // Output data from FPGA
+  input FPGA_IO_DIR, // Direction control: high = input mode (SOC_IN reaches FPGA_IN, SOC_OUT forced high)
+  input CFG_DONE // Configuration-done flag: while low, SOC_OUT is forced high
+);
+  // NOTE(review): comments below assume the usual sky130_fd_sc_hd functions (inv: Y=~A, or3: X=A|B|C, and2: X=A&B)
+  wire cfg_done_b; // Inverted CFG_DONE (high while configuration is not done)
+  sky130_fd_sc_hd__inv_1 INV		( // cfg_done_b = ~CFG_DONE
+	  				.A(CFG_DONE),
+					.Y(cfg_done_b)
+					);
+  sky130_fd_sc_hd__or3_1 OR3 		( // Output path: SOC_OUT follows FPGA_OUT only when FPGA_IO_DIR is low and CFG_DONE is high
+	  				.A(FPGA_IO_DIR),
+                                        .B(FPGA_OUT),
+                                        .C(cfg_done_b),
+                                        .X(SOC_OUT)
+                                        );
+  sky130_fd_sc_hd__and2_1 AND2 		( // Input path: FPGA_IN follows SOC_IN only when FPGA_IO_DIR is high, else low
+	  				.A(FPGA_IO_DIR),
+                                        .B(SOC_IN),
+                                        .X(FPGA_IN)
+                                        );
+
+
+endmodule
+
--- a/HDL/common/ql_iso_io_logic.v
+++ b/HDL/common/ql_iso_io_logic.v
@ -0,0 +1,70 @@
+`timescale 1ns/1ps
+
+//-----------------------------------------------------
+// Function    : An embedded I/O with
+//               - An I/O isolation signal to set 
+//                 the I/O in input mode. This is to avoid
+//                 any unexpected output signals to damage
+//                 circuits outside the FPGA due to configurable
+//                 memories are not properly initialized
+//                 This feature may not be needed if the configurable
+//                 memory cell has a built-in set/reset functionality
+//               - Internal protection circuitry to ensure
+//                 clean signals at all the SOC I/O ports
+//                 This is to avoid 
+//                   - output any random signal
+//                     when the I/O is in input mode, also avoid
+//                   - driven by any random signal
+//                     when the I/O is output mode
+//
+// Note: This cell is built with Standard Cells from HD library
+//       It is already technology mapped and can be directly used
+//       for physical design
+//-----------------------------------------------------
+module IO (
+  input SOC_IN,   // Input to drive the inpad signal
+  output SOC_OUT, // Output the outpad signal
+  output FPGA_IN, // Input data to FPGA
+  input FPGA_OUT, // Output data from FPGA
+  input FPGA_IO_DIR, // Direction control: high = input mode, low = output mode
+  input CFG_DONE, // Configuration-done flag: while low, FPGA_OUT is gated off from SOC_OUT
+  input IO_ISOL_N // Active-low isolation: when low, FPGA_IN is forced low and the SOC_OUT driver is disabled
+);
+  // NOTE(review): comments below assume the usual sky130_fd_sc_hd functions (inv/nor2/nand2/and3, einvn: Z=~A while TE_B is low)
+  wire cfg_done_b; // Inverted CFG_DONE (high while configuration is not done)
+  wire io_isol; // Inverted IO_ISOL_N (high while the I/O is isolated)
+  wire f2a_o_gate; // Output enable: high only when FPGA_IO_DIR is low and CFG_DONE is high
+  wire f2a_o_int; // Inverted gated output data; inverted again by EINVN_OUT
+  sky130_fd_sc_hd__inv_1 INV_CFG_DONE	( // cfg_done_b = ~CFG_DONE
+	  				.A(CFG_DONE),
+					.Y(cfg_done_b)
+					);
+  sky130_fd_sc_hd__inv_1 INV_ISOL_N     ( // io_isol = ~IO_ISOL_N
+                                        .A(IO_ISOL_N),
+                                        .Y(io_isol)
+                                        );
+  // output path
+  sky130_fd_sc_hd__nor2_1 NOR2 		( // f2a_o_gate = ~(FPGA_IO_DIR | cfg_done_b)
+	  				.A(FPGA_IO_DIR),
+                                        .B(cfg_done_b),
+                                        .Y(f2a_o_gate)
+                                        );
+  sky130_fd_sc_hd__nand2_1 NAND2 	( // f2a_o_int = ~(FPGA_OUT & f2a_o_gate)
+	  				.A(FPGA_OUT),
+                                        .B(f2a_o_gate),
+                                        .Y(f2a_o_int)
+                                        );
+  sky130_fd_sc_hd__einvn_4 EINVN_OUT	( // Tri-state inverter: drives SOC_OUT = ~f2a_o_int while io_isol is low
+	  				.A(f2a_o_int),
+                                        .TE_B(io_isol),
+                                        .Z(SOC_OUT)
+                                        );
+  // input path 
+  sky130_fd_sc_hd__and3_1 AND3 		( // FPGA_IN = SOC_IN & FPGA_IO_DIR & IO_ISOL_N
+                                        .A(SOC_IN),
+	  				.B(FPGA_IO_DIR),
+                                        .C(IO_ISOL_N),
+                                        .X(FPGA_IN)
+                                        );
+endmodule
+