Merge branch 'master' of https://github.com/lnis-uofu/OpenFPGA into vtr_upgrade

2022-09-16 16:47:21 -07:00 · 2022-09-16 16:47:21 -07:00 · 373566416c
parent e98d022d3a 30988d7072
commit 373566416c
30 changed files with 1780 additions and 305 deletions
--- a/VERSION.md
+++ b/VERSION.md
@ -1 +1 @@
-1.1.489
+1.1.525
--- a/docs/source/manual/openfpga_shell/openfpga_commands/fpga_bitstream_commands.rst
+++ b/docs/source/manual/openfpga_shell/openfpga_commands/fpga_bitstream_commands.rst
@ -25,6 +25,14 @@ Repack's functionality are in the following aspects:
  
  .. warning:: Design constraints are designed to help repacker to identify which clock net to be mapped to which pin, so that multi-clock benchmarks can be correctly implemented, in the case that VPR may not have sufficient vision on clock net mapping. **Try not to use design constraints to remap any other types of nets!!!**

+  .. option:: --ignore_global_nets_on_pins
+
+    Specify the pins of a ``pb_type`` on which the mapping results of global nets should be ignored. For example, ``--ignore_global_nets_on_pins clb.I[0:11]``. Once specified, the mapping results on these pins are ignored for all the global nets, such as clock, reset *etc.* Routing traces will be appended to the other pins to which the same global nets are mapped.
+  
+  .. note:: This option is designed for global nets which are applied to both data path and global networks. For example, a reset signal is mapped to both a LUT input and the reset pin of a FF. We suggest not using this option for other purposes!
+
+  .. warning:: Users must specify the size/width of the pin. Currently, OpenFPGA cannot infer the pin size from the architecture!!!
+     
  .. option:: --verbose 
  
    Show verbose log
--- a/openfpga/src/base/openfpga_bitstream_command.cpp
+++ b/openfpga/src/base/openfpga_bitstream_command.cpp
@ -21,9 +21,15 @@ ShellCommandId add_openfpga_repack_command(openfpga::Shell<OpenfpgaContext>& she
                                           const ShellCommandClassId& cmd_class_id,
                                           const std::vector<ShellCommandId>& dependent_cmds) {
  Command shell_cmd("repack");
+
  /* Add an option '--design_constraints' */
  CommandOptionId opt_design_constraints = shell_cmd.add_option("design_constraints", false, "file path to the design constraints");
  shell_cmd.set_option_require_value(opt_design_constraints, openfpga::OPT_STRING);
+
+  /* Add an option '--ignore_global_nets_on_pins' */
+  CommandOptionId opt_ignore_global_nets = shell_cmd.add_option("ignore_global_nets_on_pins", false, "Specify the pins where global nets will be ignored. Routing traces are merged to other pins");
+  shell_cmd.set_option_require_value(opt_ignore_global_nets, openfpga::OPT_STRING);
+
  /* Add an option '--verbose' */
  shell_cmd.add_option("verbose", false, "Enable verbose output");
  
--- a/openfpga/src/base/openfpga_repack.cpp
+++ b/openfpga/src/base/openfpga_repack.cpp
@ -30,6 +30,7 @@ int repack(OpenfpgaContext& openfpga_ctx,
           const Command& cmd, const CommandContext& cmd_context) {

  CommandOptionId opt_design_constraints = cmd.option("design_constraints");
+  CommandOptionId opt_ignore_global_nets = cmd.option("ignore_global_nets_on_pins");
  CommandOptionId opt_verbose = cmd.option("verbose");

  /* Load design constraints from file */
@ -40,22 +41,32 @@ int repack(OpenfpgaContext& openfpga_ctx,
    repack_design_constraints = read_xml_repack_design_constraints(dc_fname.c_str());
  }

+  /* Setup repacker options */
+  RepackOption options;
+  options.set_design_constraints(repack_design_constraints);
+  options.set_ignore_global_nets_on_pins(cmd_context.option_value(cmd, opt_ignore_global_nets));
+  options.set_verbose_output(cmd_context.option_enable(cmd, opt_verbose));
+
+  if (!options.valid()) {
+    VTR_LOG("Detected errors when parsing options!\n");
+    return CMD_EXEC_FATAL_ERROR;
+  }
+
  pack_physical_pbs(g_vpr_ctx.device(),
                    g_vpr_ctx.atom(),
                    g_vpr_ctx.clustering(),
                    openfpga_ctx.mutable_vpr_device_annotation(),
                    openfpga_ctx.mutable_vpr_clustering_annotation(),
                    openfpga_ctx.vpr_bitstream_annotation(),
-                    repack_design_constraints,
                    openfpga_ctx.arch().circuit_lib,
-                    cmd_context.option_enable(cmd, opt_verbose));
+                    options);

  build_physical_lut_truth_tables(openfpga_ctx.mutable_vpr_clustering_annotation(),
                                  g_vpr_ctx.atom(),
                                  g_vpr_ctx.clustering(),
                                  openfpga_ctx.vpr_device_annotation(),
                                  openfpga_ctx.arch().circuit_lib,
-                                  cmd_context.option_enable(cmd, opt_verbose));
+                                  options.verbose_output());

  /* TODO: should identify the error code from internal function execution */
  return CMD_EXEC_SUCCESS;
--- a/openfpga/src/fabric/build_fabric_io_location_map.cpp
+++ b/openfpga/src/fabric/build_fabric_io_location_map.cpp
@ -28,6 +28,8 @@ namespace openfpga {
 /********************************************************************
 * Find all the GPIO ports in the grid module
 * and cache their port/pin index in the top-level module
+ *
+ * .. note:: The I/O sequence(indexing) is already determined in the io_children() list of top-level module. Here we just build a fast lookup from (x, y, z) coordinate to the actual indices
 *******************************************************************/
 IoLocationMap build_fabric_io_location_map(const ModuleManager& module_manager,
                                           const DeviceGrid& grids) {
@ -37,59 +39,54 @@ IoLocationMap build_fabric_io_location_map(const ModuleManager& module_manager,

  std::map<std::string, size_t> io_counter;

-  /* Create the coordinate range for each side of FPGA fabric */
-  std::map<e_side, std::vector<vtr::Point<size_t>>> io_coordinates = generate_perimeter_grid_coordinates( grids);
+  std::string top_module_name = generate_fpga_top_module_name();
+  ModuleId top_module = module_manager.find_module(top_module_name);
+  VTR_ASSERT(true == module_manager.valid_module_id(top_module));
+
+  /* Walk through the I/O child list */
+  for (size_t ichild = 0; ichild < module_manager.io_children(top_module).size(); ++ichild) {
+    ModuleId child = module_manager.io_children(top_module)[ichild];
+    vtr::Point<int> coord = module_manager.io_child_coordinates(top_module)[ichild];

-  /* Walk through all the grids on the perimeter, which are I/O grids */
-  for (const e_side& io_side : FPGA_SIDES_CLOCKWISE) {
-    for (const vtr::Point<size_t>& io_coordinate : io_coordinates[io_side]) {
    /* Bypass EMPTY grid */
-      if (true == is_empty_type(grids[io_coordinate.x()][io_coordinate.y()].type)) {
+    if (true == is_empty_type(grids[coord.x()][coord.y()].type)) {
      continue;
    } 

    /* Skip width or height > 1 tiles (mostly heterogeneous blocks) */
-      if ( (0 < grids[io_coordinate.x()][io_coordinate.y()].width_offset)
-        || (0 < grids[io_coordinate.x()][io_coordinate.y()].height_offset)) {
+    if ( (0 < grids[coord.x()][coord.y()].width_offset)
+      || (0 < grids[coord.x()][coord.y()].height_offset)) {
      continue;
    }

-      t_physical_tile_type_ptr grid_type = grids[io_coordinate.x()][io_coordinate.y()].type;
-
-      /* Find the module name for this type of grid */
-      std::string grid_module_name_prefix(GRID_MODULE_NAME_PREFIX);
-      std::string grid_module_name = generate_grid_block_module_name(grid_module_name_prefix, std::string(grid_type->name), is_io_type(grid_type), io_side);
-      ModuleId grid_module = module_manager.find_module(grid_module_name);
-      VTR_ASSERT(true == module_manager.valid_module_id(grid_module));
+    VTR_ASSERT_SAFE(true == module_manager.valid_module_id(child));

    /* Find all the GPIO ports in the grid module */

    /* MUST DO: register in io location mapping!
     * I/O location mapping is a critical look-up for testbench generators
-       * As we add the I/O grid instances to top module by following order:
-       * TOP -> RIGHT -> BOTTOM -> LEFT
-       * The I/O index will increase in this way as well.
-       * This organization I/O indices is also consistent to the way 
-       * that GPIOs are wired in function connect_gpio_module()
-       *
-       * Note: if you change the GPIO function, you should update here as well!
     */
-      for (int z = 0; z < grids[io_coordinate.x()][io_coordinate.y()].type->capacity; ++z) {
+    if (size_t(grids[coord.x()][coord.y()].type->capacity) != module_manager.io_children(child).size()) {
+      VTR_LOG("%s[%ld][%ld] capacity: %d while io_child number is %d", grids[coord.x()][coord.y()].type->name, coord.x(), coord.y(), grids[coord.x()][coord.y()].type->capacity, module_manager.io_children(child).size());
+    }
+    VTR_ASSERT(size_t(grids[coord.x()][coord.y()].type->capacity) == module_manager.io_children(child).size());
+    for (size_t isubchild = 0; isubchild < module_manager.io_children(child).size(); ++isubchild) {
+      vtr::Point<int> subchild_coord = module_manager.io_child_coordinates(child)[isubchild];
+
      for (const ModuleManager::e_module_port_type& module_io_port_type : MODULE_IO_PORT_TYPES) {
-          for (const ModulePortId& gpio_port_id : module_manager.module_port_ids_by_type(grid_module, module_io_port_type)) {
+        for (const ModulePortId& gpio_port_id : module_manager.module_port_ids_by_type(child, module_io_port_type)) {
          /* Only care mappable I/O */
-            if (false == module_manager.port_is_mappable_io(grid_module, gpio_port_id)) {
+          if (false == module_manager.port_is_mappable_io(child, gpio_port_id)) {
            continue;
          }
-
-            const BasicPort& gpio_port = module_manager.module_port(grid_module, gpio_port_id);
+          const BasicPort& gpio_port = module_manager.module_port(child, gpio_port_id);

          auto curr_io_index = io_counter.find(gpio_port.get_name());
          /* Index always start from zero */
          if (curr_io_index == io_counter.end()) {
            io_counter[gpio_port.get_name()] = 0;
          }
-            io_location_map.set_io_index(io_coordinate.x(), io_coordinate.y(), z,
+          io_location_map.set_io_index(coord.x(), coord.y(), subchild_coord.x(),
                                       gpio_port.get_name(),
                                       io_counter[gpio_port.get_name()]);
          io_counter[gpio_port.get_name()]++;
@ -97,73 +94,8 @@ IoLocationMap build_fabric_io_location_map(const ModuleManager& module_manager,
      }
    }
  }
-  }
-
-  /* Walk through all the center grids, which may include I/O grids */
-  for (size_t ix = 1; ix < grids.width() - 1; ++ix) {
-    for (size_t iy = 1; iy < grids.height() - 1; ++iy) {
-      /* Bypass EMPTY grid */
-      if (true == is_empty_type(grids[ix][iy].type)) {
-        continue;
-      } 
-
-      /* Skip width or height > 1 tiles (mostly heterogeneous blocks) */
-      if ( (0 < grids[ix][iy].width_offset)
-        || (0 < grids[ix][iy].height_offset)) {
-        continue;
-      }
-
-      t_physical_tile_type_ptr grid_type = grids[ix][iy].type;
-
-      /* Find the module name for this type of grid */
-      std::string grid_module_name_prefix(GRID_MODULE_NAME_PREFIX);
-      std::string grid_module_name = generate_grid_block_module_name(grid_module_name_prefix, std::string(grid_type->name), is_io_type(grid_type), NUM_SIDES);
-      ModuleId grid_module = module_manager.find_module(grid_module_name);
-      VTR_ASSERT(true == module_manager.valid_module_id(grid_module));
-
-      /* Find all the GPIO ports in the grid module */
-
-      /* MUST DO: register in io location mapping!
-       * I/O location mapping is a critical look-up for testbench generators
-       * As we add the I/O grid instances to top module by following order:
-       * TOP -> RIGHT -> BOTTOM -> LEFT
-       * The I/O index will increase in this way as well.
-       * This organization I/O indices is also consistent to the way 
-       * that GPIOs are wired in function connect_gpio_module()
-       *
-       * Note: if you change the GPIO function, you should update here as well!
-       * FIXME: The codes should be adapt to support sub tiles!!!
-       */
-      for (int z = 0; z < grids[ix][iy].type->capacity; ++z) {
-        for (const ModuleManager::e_module_port_type& module_io_port_type : MODULE_IO_PORT_TYPES) {
-          for (const ModulePortId& gpio_port_id : module_manager.module_port_ids_by_type(grid_module, module_io_port_type)) {
-            /* Only care mappable I/O */
-            if (false == module_manager.port_is_mappable_io(grid_module, gpio_port_id)) {
-              continue;
-            }
-
-            const BasicPort& gpio_port = module_manager.module_port(grid_module, gpio_port_id);
-
-            auto curr_io_index = io_counter.find(gpio_port.get_name());
-            /* Index always start from zero */
-            if (curr_io_index == io_counter.end()) {
-              io_counter[gpio_port.get_name()] = 0;
-            }
-            io_location_map.set_io_index(ix, iy, z,
-                                         gpio_port.get_name(),
-                                         io_counter[gpio_port.get_name()]);
-            io_counter[gpio_port.get_name()]++;
-          }
-        }
-      }
-    }
-  }

  /* Check all the GPIO ports in the top-level module has been mapped */
-  std::string top_module_name = generate_fpga_top_module_name();
-  ModuleId top_module = module_manager.find_module(top_module_name);
-  VTR_ASSERT(true == module_manager.valid_module_id(top_module));
-
  for (const ModuleManager::e_module_port_type& module_io_port_type : MODULE_IO_PORT_TYPES) {
    for (const ModulePortId& gpio_port_id : module_manager.module_port_ids_by_type(top_module, module_io_port_type)) {
      /* Only care mappable I/O */
--- a/openfpga/src/fabric/build_grid_modules.cpp
+++ b/openfpga/src/fabric/build_grid_modules.cpp
@ -1026,7 +1026,9 @@ void build_physical_tile_module(ModuleManager& module_manager,

      /* Add all the sub modules */
      size_t pb_instance_id = module_manager.num_instance(grid_module, pb_module);
-      module_manager.add_child_module(grid_module, pb_module);
+      module_manager.add_child_module(grid_module, pb_module, false);
+      /* Add a custom I/O child with coordinate 'z' */
+      module_manager.add_io_child(grid_module, pb_module, pb_instance_id, vtr::Point<int>(iz, 0));

      /* Give the child module with a unique instance name */
      std::string instance_name = generate_physical_block_instance_name(lb_type->pb_graph_head->pb_type, iz);
--- a/openfpga/src/fabric/build_top_module.cpp
+++ b/openfpga/src/fabric/build_top_module.cpp
@ -50,7 +50,7 @@ size_t add_top_module_grid_instance(ModuleManager& module_manager,
  /* Record the instance id */
  size_t grid_instance = module_manager.num_instance(top_module, grid_module);
  /* Add the module to top_module */ 
-  module_manager.add_child_module(top_module, grid_module);
+  module_manager.add_child_module(top_module, grid_module, false);
  /* Set an unique name to the instance
   * Note: it is your risk to gurantee the name is unique!
   */
@ -102,33 +102,6 @@ vtr::Matrix<size_t> add_top_module_grid_instances(ModuleManager& module_manager,
  vtr::Matrix<size_t> grid_instance_ids({grids.width(), grids.height()}); 
  grid_instance_ids.fill(size_t(-1));

-  /* Instanciate core grids */
-  for (size_t ix = 1; ix < grids.width() - 1; ++ix) {
-    for (size_t iy = 1; iy < grids.height() - 1; ++iy) {
-      /* Bypass EMPTY grid */
-      if (true == is_empty_type(grids[ix][iy].type)) {
-        continue;
-      } 
-      /* Skip width or height > 1 tiles (mostly heterogeneous blocks) */
-      if ( (0 < grids[ix][iy].width_offset)
-        || (0 < grids[ix][iy].height_offset)) {
-        /* Find the root of this grid, the instance id should be valid. 
-         * We just copy it here
-         */
-        vtr::Point<size_t> root_grid_coord(ix - grids[ix][iy].width_offset,
-                                           iy - grids[ix][iy].height_offset);
-        VTR_ASSERT(size_t(-1) != grid_instance_ids[root_grid_coord.x()][root_grid_coord.y()]);
-        grid_instance_ids[ix][iy] = grid_instance_ids[root_grid_coord.x()][root_grid_coord.y()];
-        continue;
-      }
-      /* Add a grid module to top_module*/
-      vtr::Point<size_t> grid_coord(ix, iy);
-      grid_instance_ids[ix][iy] = add_top_module_grid_instance(module_manager, top_module,
-                                                               grids[ix][iy].type,
-                                                               NUM_SIDES, grid_coord);
-    }
-  }
-
  /* Instanciate I/O grids */
  /* Create the coordinate range for each side of FPGA fabric */
  std::map<e_side, std::vector<vtr::Point<size_t>>> io_coordinates = generate_perimeter_grid_coordinates( grids);
@ -157,6 +130,37 @@ vtr::Matrix<size_t> add_top_module_grid_instances(ModuleManager& module_manager,
    }
  }

+  /* Instanciate core grids
+   * IMPORTANT: sequence matters here, it impacts the I/O indexing.
+   * We should follow the same sequence as the build_io_location_map()! 
+   * If you change the sequence of walking through grids here, you should change it in the build_io_location_map()!
+   */
+  for (size_t ix = 1; ix < grids.width() - 1; ++ix) {
+    for (size_t iy = 1; iy < grids.height() - 1; ++iy) {
+      /* Bypass EMPTY grid */
+      if (true == is_empty_type(grids[ix][iy].type)) {
+        continue;
+      } 
+      /* Skip width or height > 1 tiles (mostly heterogeneous blocks) */
+      if ( (0 < grids[ix][iy].width_offset)
+        || (0 < grids[ix][iy].height_offset)) {
+        /* Find the root of this grid, the instance id should be valid. 
+         * We just copy it here
+         */
+        vtr::Point<size_t> root_grid_coord(ix - grids[ix][iy].width_offset,
+                                           iy - grids[ix][iy].height_offset);
+        VTR_ASSERT(size_t(-1) != grid_instance_ids[root_grid_coord.x()][root_grid_coord.y()]);
+        grid_instance_ids[ix][iy] = grid_instance_ids[root_grid_coord.x()][root_grid_coord.y()];
+        continue;
+      }
+      /* Add a grid module to top_module*/
+      vtr::Point<size_t> grid_coord(ix, iy);
+      grid_instance_ids[ix][iy] = add_top_module_grid_instance(module_manager, top_module,
+                                                               grids[ix][iy].type,
+                                                               NUM_SIDES, grid_coord);
+    }
+  }
+
  return grid_instance_ids;
 }

@ -201,7 +205,7 @@ vtr::Matrix<size_t> add_top_module_switch_block_instances(ModuleManager& module_
      /* Record the instance id */
      sb_instance_ids[rr_gsb.get_sb_x()][rr_gsb.get_sb_y()] = module_manager.num_instance(top_module, sb_module);
      /* Add the module to top_module */ 
-      module_manager.add_child_module(top_module, sb_module);
+      module_manager.add_child_module(top_module, sb_module, false);
      /* Set an unique name to the instance
       * Note: it is your risk to gurantee the name is unique!
       */
@ -257,7 +261,7 @@ vtr::Matrix<size_t> add_top_module_connection_block_instances(ModuleManager& mod
      /* Record the instance id */
      cb_instance_ids[rr_gsb.get_cb_x(cb_type)][rr_gsb.get_cb_y(cb_type)] = module_manager.num_instance(top_module, cb_module);
      /* Add the module to top_module */ 
-      module_manager.add_child_module(top_module, cb_module);
+      module_manager.add_child_module(top_module, cb_module, false);
      /* Set an unique name to the instance
       * Note: it is your risk to gurantee the name is unique!
       */
@ -271,6 +275,118 @@ vtr::Matrix<size_t> add_top_module_connection_block_instances(ModuleManager& mod
  return cb_instance_ids;
 }

+/********************************************************************
+ * Add the I/O children to the top-level module, which impacts the I/O indexing
+ * This is the default function to build the I/O sequence/indexing
+ *
+ * The I/O children are added in two passes:
+ * 1. The perimeter grids, side by side in the order of FPGA_SIDES_CLOCKWISE
+ * 2. The core grids, in a maze shape starting from the bottom-left corner (1, 1)
+ *    and ending at the center. Each ring walks along the left column, 
+ *    the row at ymax, the right column and finally the row at ymin
+ * 
+ *    +----------------------+
+ *    |+--------------------+|
+ *    ||+------------------+||
+ *    |||+----------------+|||
+ *    ||||+-------------->||||
+ *    ||||+---------------+|||
+ *    |||+-----------------+||
+ *    ||+-------------------+|
+ *    |+---------------------+
+ *    ^
+ *  io[0]                   
+ *
+ * EMPTY grids and the non-root locations of tiles whose width or height > 1 
+ * are skipped in both passes.
+ * Each core coordinate is visited exactly once, including the center tile of 
+ * a square core with an odd size.
+ *
+ * Arguments:
+ *   module_manager:    where the I/O children are registered
+ *   top_module:        the parent module which receives the I/O children
+ *   grids:             the device grid to walk through
+ *   grid_instance_ids: the instance id of the grid module at each (x, y), 
+ *                      which is passed on to add_io_child()
+ *******************************************************************/
+static 
+void add_top_module_io_children(ModuleManager& module_manager,
+                                const ModuleId& top_module,
+                                const DeviceGrid& grids,
+                                const vtr::Matrix<size_t>& grid_instance_ids) {
+  /* Register the grid at a given coordinate as an I/O child of the top module
+   * The border side is used to find the grid module: 
+   * the actual side for perimeter grids, NUM_SIDES for core grids
+   */
+  auto add_grid_io_child = [&](const vtr::Point<size_t>& coord, const e_side& border_side) {
+    /* Bypass EMPTY grid */
+    if (true == is_empty_type(grids[coord.x()][coord.y()].type)) {
+      return;
+    }
+    /* Skip width or height > 1 tiles (mostly heterogeneous blocks) */
+    if ( (0 < grids[coord.x()][coord.y()].width_offset)
+      || (0 < grids[coord.x()][coord.y()].height_offset)) {
+      return;
+    }
+    /* Find the module name for this type of grid */
+    t_physical_tile_type_ptr grid_type = grids[coord.x()][coord.y()].type;
+    std::string grid_module_name_prefix(GRID_MODULE_NAME_PREFIX);
+    std::string grid_module_name = generate_grid_block_module_name(grid_module_name_prefix, std::string(grid_type->name), is_io_type(grid_type), border_side);
+    ModuleId grid_module = module_manager.find_module(grid_module_name);
+    VTR_ASSERT(true == module_manager.valid_module_id(grid_module));
+    /* Add a I/O children to top_module*/
+    module_manager.add_io_child(top_module, grid_module, grid_instance_ids[coord.x()][coord.y()], vtr::Point<int>(coord.x(), coord.y()));
+  };
+
+  /* Create the coordinate range for the perimeter I/Os of FPGA fabric */
+  std::map<e_side, std::vector<vtr::Point<size_t>>> io_coordinates = generate_perimeter_grid_coordinates( grids);
+
+  for (const e_side& io_side : FPGA_SIDES_CLOCKWISE) {
+    for (const vtr::Point<size_t>& io_coord : io_coordinates[io_side]) {
+      add_grid_io_child(io_coord, io_side);
+    }
+  }
+
+  /* Walk through the center grids 
+   * There is no core when width or height is 2 or less. 
+   * The check also avoids an unsigned underflow when computing xmax/ymax
+   */
+  std::vector<vtr::Point<size_t>> coords;
+  if (grids.width() > 2 && grids.height() > 2) {
+    size_t xmin = 1;
+    size_t xmax = grids.width() - 2;
+    size_t ymin = 1;
+    size_t ymax = grids.height() - 2;
+    /* Peel off one complete ring per iteration */
+    while (xmin < xmax && ymin < ymax) { 
+      for (size_t iy = ymin; iy < ymax + 1; iy++) {
+        coords.push_back(vtr::Point<size_t>(xmin, iy));
+      }
+      for (size_t ix = xmin + 1; ix < xmax + 1; ix++) {
+        coords.push_back(vtr::Point<size_t>(ix, ymax));
+      }
+      for (size_t iy = ymax - 1; iy > ymin; iy--) {
+        coords.push_back(vtr::Point<size_t>(xmax, iy));
+      }
+      for (size_t ix = xmax; ix > xmin; ix--) {
+        coords.push_back(vtr::Point<size_t>(ix, ymin));
+      }
+      xmin++;
+      ymin++;
+      xmax--;
+      ymax--;
+    }
+
+    /* An odd core height or width leaves a single line which is not covered by any ring.
+     * ymin == ymax can only happen when the core height is odd,
+     * and xmin == xmax can only happen when the core width is odd.
+     * The two cases must be exclusive: when both hold, only the center tile is left
+     * and it should be added once, not once per direction
+     */
+    if (ymin == ymax) {
+      /* Add the missing horizontal line */
+      for (size_t ix = xmin; ix < xmax + 1; ix++) {
+        coords.push_back(vtr::Point<size_t>(ix, ymin));
+      }
+    } else if (xmin == xmax) {
+      /* Add the missing vertical line */
+      for (size_t iy = ymin; iy < ymax + 1; iy++) {
+        coords.push_back(vtr::Point<size_t>(xmin, iy));
+      }
+    }
+  }
+
+  /* Now walk through the coordinates */
+  for (const vtr::Point<size_t>& coord : coords) {
+    add_grid_io_child(coord, NUM_SIDES);
+  }
+}
+
 /********************************************************************
 * Print the top-level module for the FPGA fabric in Verilog format
 * This function will 
@ -323,6 +439,9 @@ int build_top_module(ModuleManager& module_manager,
  cb_instance_ids[CHANX] = add_top_module_connection_block_instances(module_manager, top_module, device_rr_gsb, CHANX, compact_routing_hierarchy);
  cb_instance_ids[CHANY] = add_top_module_connection_block_instances(module_manager, top_module, device_rr_gsb, CHANY, compact_routing_hierarchy);

+  /* Update I/O children list */
+  add_top_module_io_children(module_manager, top_module, grids, grid_instance_ids);
+
  /* Add nets when we need a complete fabric modeling,
   * which is required by downstream functions
   */
@ -350,8 +469,7 @@ int build_top_module(ModuleManager& module_manager,
  }

  /* Add GPIO ports from the sub-modules under this Verilog module 
-   * This is a much easier job after adding sub modules (instances), 
-   * we just need to find all the I/O ports from the child modules and build a list of it
+   * For top-level module, we follow a special sequencing for I/O modules. So we rebuild the I/O children list here
   */
  add_module_gpio_ports_from_child_modules(module_manager, top_module);

--- a/openfpga/src/fabric/build_top_module_directs.cpp
+++ b/openfpga/src/fabric/build_top_module_directs.cpp
@ -131,7 +131,7 @@ void add_module_nets_tile_direct_connection(ModuleManager& module_manager,

  /* Add a submodule of direct connection module to the top-level module */
  size_t direct_instance_id = module_manager.num_instance(top_module, direct_module);
-  module_manager.add_child_module(top_module, direct_module);
+  module_manager.add_child_module(top_module, direct_module, false);

  /* Create the 1st module net */
  ModuleNetId net_direct_src = module_manager.create_module_net(top_module); 
--- a/openfpga/src/fabric/build_top_module_memory.cpp
+++ b/openfpga/src/fabric/build_top_module_memory.cpp
@ -1055,7 +1055,7 @@ void add_top_module_nets_cmos_memory_bank_config_bus(ModuleManager& module_manag
    }
    VTR_ASSERT(ModuleId::INVALID() != bl_decoder_module);
    size_t curr_bl_decoder_instance_id = module_manager.num_instance(top_module, bl_decoder_module);
-    module_manager.add_child_module(top_module, bl_decoder_module);
+    module_manager.add_child_module(top_module, bl_decoder_module, false);

    /************************************************************** 
     * Add the WL decoder module 
@ -1083,7 +1083,7 @@ void add_top_module_nets_cmos_memory_bank_config_bus(ModuleManager& module_manag
    }
    VTR_ASSERT(ModuleId::INVALID() != wl_decoder_module);
    size_t curr_wl_decoder_instance_id = module_manager.num_instance(top_module, wl_decoder_module);
-    module_manager.add_child_module(top_module, wl_decoder_module);
+    module_manager.add_child_module(top_module, wl_decoder_module, false);

    /************************************************************** 
     * Add module nets from the top module to BL decoder's inputs
@ -1531,7 +1531,7 @@ void add_top_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& mod

  /* Instanciate the decoder module here */
  size_t decoder_instance = module_manager.num_instance(parent_module, decoder_module);
-  module_manager.add_child_module(parent_module, decoder_module);
+  module_manager.add_child_module(parent_module, decoder_module, false);

  /* Connect the enable (EN) port of memory modules under the parent module
   * to the frame decoder inputs
--- a/openfpga/src/fabric/build_top_module_memory_bank.cpp
+++ b/openfpga/src/fabric/build_top_module_memory_bank.cpp
@ -507,7 +507,7 @@ void add_top_module_nets_cmos_ql_memory_bank_bl_decoder_config_bus(ModuleManager
    }
    VTR_ASSERT(ModuleId::INVALID() != bl_decoder_module);
    size_t curr_bl_decoder_instance_id = module_manager.num_instance(top_module, bl_decoder_module);
-    module_manager.add_child_module(top_module, bl_decoder_module);
+    module_manager.add_child_module(top_module, bl_decoder_module, false);

    /************************************************************** 
     * Add module nets from the top module to BL decoder's inputs
@ -705,7 +705,7 @@ void add_top_module_nets_cmos_ql_memory_bank_wl_decoder_config_bus(ModuleManager
    }
    VTR_ASSERT(ModuleId::INVALID() != wl_decoder_module);
    size_t curr_wl_decoder_instance_id = module_manager.num_instance(top_module, wl_decoder_module);
-    module_manager.add_child_module(top_module, wl_decoder_module);
+    module_manager.add_child_module(top_module, wl_decoder_module, false);

    /************************************************************** 
     * Add module nets from the top module to WL decoder's inputs 
@ -1471,7 +1471,7 @@ void add_top_module_nets_cmos_ql_memory_bank_bl_shift_register_config_bus(Module
      VTR_ASSERT(sr_bank_module);

      size_t cur_inst = module_manager.num_instance(top_module, sr_bank_module);
-      module_manager.add_child_module(top_module, sr_bank_module);
+      module_manager.add_child_module(top_module, sr_bank_module, false);

      sr_banks.link_bl_shift_register_bank_to_module(config_region, sr_bank, sr_bank_module);
      sr_banks.link_bl_shift_register_bank_to_instance(config_region, sr_bank, cur_inst);
@ -1565,7 +1565,7 @@ void add_top_module_nets_cmos_ql_memory_bank_wl_shift_register_config_bus(Module
      VTR_ASSERT(sr_bank_module);

      size_t cur_inst = module_manager.num_instance(top_module, sr_bank_module);
-      module_manager.add_child_module(top_module, sr_bank_module);
+      module_manager.add_child_module(top_module, sr_bank_module, false);

      sr_banks.link_wl_shift_register_bank_to_module(config_region, sr_bank, sr_bank_module);
      sr_banks.link_wl_shift_register_bank_to_instance(config_region, sr_bank, cur_inst);
--- a/openfpga/src/fabric/module_manager.cpp
+++ b/openfpga/src/fabric/module_manager.cpp
@ -91,6 +91,29 @@ std::vector<vtr::Point<int>> ModuleManager::configurable_child_coordinates(const
  return configurable_child_coordinates_[parent_module];
 }

+/* Find all the I/O child modules under a parent module 
+ * The list stored for the parent module is returned as a copy.
+ * Callers index it in parallel with io_child_coordinates(), 
+ * i.e., the i-th child is paired with the i-th coordinate
+ */
+std::vector<ModuleId> ModuleManager::io_children(const ModuleId& parent_module) const {
+  /* Validate the module_id */
+  VTR_ASSERT(valid_module_id(parent_module));
+
+  return io_children_[parent_module];
+}
+
+/* Find all the instances of I/O child modules under a parent module 
+ * The list stored for the parent module is returned as a copy.
+ * NOTE(review): presumably these are the instance ids given to add_io_child(),
+ * in the same order as io_children() - to be confirmed against add_io_child()
+ */
+std::vector<size_t> ModuleManager::io_child_instances(const ModuleId& parent_module) const {
+  /* Validate the module_id */
+  VTR_ASSERT(valid_module_id(parent_module));
+
+  return io_child_instances_[parent_module];
+}
+
+std::vector<vtr::Point<int>> ModuleManager::io_child_coordinates(const ModuleId& parent_module) const {
+  /* Validate the module_id before looking up the coordinates of the I/O children under the parent module */
+  VTR_ASSERT(valid_module_id(parent_module));
+
+  /* The coordinate list is returned as a copy */
+  return io_child_coordinates_[parent_module];
+}
+
 /* Find the source ids of modules */
 ModuleManager::module_net_src_range ModuleManager::module_net_sources(const ModuleId& module, const ModuleNetId& net) const {
  /* Validate the module_id */
@ -562,6 +585,10 @@ ModuleId ModuleManager::add_module(const std::string& name) {
  config_region_ids_.emplace_back(); 
  config_region_children_.emplace_back(); 

+  io_children_.emplace_back();
+  io_child_instances_.emplace_back();
+  io_child_coordinates_.emplace_back();
+
  port_ids_.emplace_back();
  ports_.emplace_back();
  port_types_.emplace_back();
@ -680,7 +707,7 @@ void ModuleManager::set_port_preproc_flag(const ModuleId& module, const ModulePo
 }

 /* Add a child module to a parent module */
-void ModuleManager::add_child_module(const ModuleId& parent_module, const ModuleId& child_module) {
+void ModuleManager::add_child_module(const ModuleId& parent_module, const ModuleId& child_module, const bool& is_io_child) {
  /* Validate the id of both parent and child modules */
  VTR_ASSERT ( valid_module_id(parent_module) );
  VTR_ASSERT ( valid_module_id(child_module) );
@ -693,19 +720,27 @@ void ModuleManager::add_child_module(const ModuleId& parent_module, const Module
  }

  std::vector<ModuleId>::iterator child_it = std::find(children_[parent_module].begin(), children_[parent_module].end(), child_module);
+  int child_instance_id = -1;
  if (child_it == children_[parent_module].end()) {
    /* Update the child module of parent module */
    children_[parent_module].push_back(child_module);
    num_child_instances_[parent_module].push_back(1); /* By default give one */
+    child_instance_id = 0;
    /* Update the instance name list */
    child_instance_names_[parent_module].emplace_back();
    child_instance_names_[parent_module].back().emplace_back();
  } else {
    /* Increase the counter of instances */
+    child_instance_id = num_child_instances_[parent_module][child_it - children_[parent_module].begin()];
    num_child_instances_[parent_module][child_it - children_[parent_module].begin()]++;
    child_instance_names_[parent_module][child_it - children_[parent_module].begin()].emplace_back();
  }

+  /* Add to I/O child if needed */
+  if (is_io_child) {
+    add_io_child(parent_module, child_module, child_instance_id);
+  }
+
  /* Update fast look-up for nets */
  size_t instance_id = net_lookup_[parent_module][child_module].size();
  net_lookup_[parent_module][child_module].emplace_back();
@ -815,6 +850,36 @@ void ModuleManager::add_configurable_child_to_region(const ModuleId& parent_modu
  config_region_children_[parent_module][config_region].push_back(config_child_id);
 }

+/* Register an instance of a child module as an I/O child of a parent module.
+ * The child module id, its instance id and its coordinate are appended
+ * to three lists of the parent module, which are kept in the same order
+ */
+void ModuleManager::add_io_child(const ModuleId& parent_module, 
+                                 const ModuleId& child_module, 
+                                 const size_t& child_instance,
+                                 const vtr::Point<int> coord) {
+  /* Validate the id of both parent and child modules */
+  VTR_ASSERT ( valid_module_id(parent_module) );
+  VTR_ASSERT ( valid_module_id(child_module) );
+  /* Ensure that the instance id is in range */
+  VTR_ASSERT ( child_instance < num_instance(parent_module, child_module));
+
+  /* Append to the end: the position in the lists is the I/O sequence */
+  io_children_[parent_module].push_back(child_module);
+  io_child_instances_[parent_module].push_back(child_instance);
+  io_child_coordinates_[parent_module].push_back(coord);
+}
+
+/* Reserve memory for a number of I/O children under a parent module.
+ * Only capacity is reserved; no I/O child is added to the lists
+ */
+void ModuleManager::reserve_io_child(const ModuleId& parent_module,
+                                     const size_t& num_children) {
+  /* Validate the module_id */
+  VTR_ASSERT ( valid_module_id(parent_module) );
+  /* Do reserve when the number of children is larger than current size of lists */
+  if (num_children > io_children_[parent_module].size()) {
+    io_children_[parent_module].reserve(num_children);
+  }
+  if (num_children > io_child_instances_[parent_module].size()) {
+    io_child_instances_[parent_module].reserve(num_children);
+  }
+  if (num_children > io_child_coordinates_[parent_module].size()) {
+    io_child_coordinates_[parent_module].reserve(num_children);
+  }
+}
+
 void ModuleManager::reserve_module_nets(const ModuleId& module,
                                        const size_t& num_nets) {
  /* Validate the module id */
@ -1020,6 +1085,14 @@ void ModuleManager::clear_config_region(const ModuleId& parent_module) {
  config_region_children_[parent_module].clear();
 }

+/* Remove all the I/O children recorded under a parent module.
+ * The module, instance and coordinate lists are emptied together
+ */
+void ModuleManager::clear_io_children(const ModuleId& parent_module) {
+  /* Validate the module_id */
+  VTR_ASSERT(valid_module_id(parent_module));
+
+  io_children_[parent_module].clear();
+  io_child_instances_[parent_module].clear();
+  io_child_coordinates_[parent_module].clear();
+}
+
 /******************************************************************************
 * Private validators/invalidators
 ******************************************************************************/
--- a/openfpga/src/fabric/module_manager.h
+++ b/openfpga/src/fabric/module_manager.h
@ -151,6 +151,14 @@ class ModuleManager {
    std::vector<size_t> configurable_child_instances(const ModuleId& parent_module) const;
    /* Find the coordindate of a configurable child module under a parent module */
    std::vector<vtr::Point<int>> configurable_child_coordinates(const ModuleId& parent_module) const;
+
+    /* Find all the I/O child modules under a parent module */
+    std::vector<ModuleId> io_children(const ModuleId& parent_module) const;
+    /* Find all the instances of I/O child modules under a parent module */
+    std::vector<size_t> io_child_instances(const ModuleId& parent_module) const;
+    /* Find the coordinates of all the I/O child modules under a parent module */
+    std::vector<vtr::Point<int>> io_child_coordinates(const ModuleId& parent_module) const;
+
    /* Find the source ids of modules */
    module_net_src_range module_net_sources(const ModuleId& module, const ModuleNetId& net) const;
    /* Find the sink ids of modules */
@ -255,8 +263,13 @@ class ModuleManager {
    void set_port_is_register(const ModuleId& module, const std::string& port_name, const bool& is_register);
    /* Set the preprocessing flag for a port */
    void set_port_preproc_flag(const ModuleId& module, const ModulePortId& port, const std::string& preproc_flag);
-    /* Add a child module to a parent module */
-    void add_child_module(const ModuleId& parent_module, const ModuleId& child_module);
+    /** @brief Add a child module to a parent module.
+     *  By default, the child module is considered as an I/O child, and the list of I/O children of the parent module is updated accordingly
+     *  If not needed, just turn it off. Then you need to call the add_io_child() API to update the I/O child list
+     * 
+     *  .. note:: By default, we assume the I/O indexing to be the same as the sequence in which child modules are added to a parent. However, it may not be true all the time, especially for the top-level module, where customization is needed.  
+     */
+    void add_child_module(const ModuleId& parent_module, const ModuleId& child_module, const bool& is_io_child = true);
    /* Set the instance name of a child module */
    void set_child_instance_name(const ModuleId& parent_module, const ModuleId& child_module, const size_t& instance_id, const std::string& instance_name);
    /* Add a configurable child module to module
@ -266,9 +279,7 @@ class ModuleManager {
     * By default, it is an invalid coordinate 
     */
    void add_configurable_child(const ModuleId& module, const ModuleId& child_module, const size_t& child_instance, const vtr::Point<int> coord = vtr::Point<int>(-1, -1));
-    /* Reserved a number of configurable children
-     * for memory efficiency
-     */
+    /* Reserved a number of configurable children for memory efficiency */
    void reserve_configurable_child(const ModuleId& module, const size_t& num_children);

    /* Create a new configurable region under a module */
@ -283,10 +294,18 @@ class ModuleManager {
                                          const ModuleId& child_module,
                                          const size_t& child_instance,
                                          const size_t& config_child_id);
-
-    /* Reserved a number of module nets for a given module
-     * for memory efficiency
+    /** @brief Add an I/O child to a module
+     * This function also sets the coordinate of the I/O child
+     * The coordinate is used to build the I/O location map. So it is consistent with the VPR coordinate system
+     * By default, it is an invalid coordinate 
+     *
+     * .. note:: An I/O child does not necessarily have to be an I/O block. It just provides a sequence for other functions, e.g., connect_gpio_module(), to index the I/Os from each child module/instance.
     */
+    void add_io_child(const ModuleId& module, const ModuleId& child_module, const size_t& child_instance, const vtr::Point<int> coord = vtr::Point<int>(-1, -1));
+    /** @brief Reserved a number of I/O children for memory efficiency */
+    void reserve_io_child(const ModuleId& module, const size_t& num_children);
+
+    /* Reserved a number of module nets for a given module for memory efficiency */
    void reserve_module_nets(const ModuleId& module, const size_t& num_nets);

    /* Add a net to the connection graph of the module */ 
@ -295,9 +314,7 @@ class ModuleManager {
    void set_net_name(const ModuleId& module, const ModuleNetId& net,
                      const std::string& name);

-    /* Reserved a number of sources for a module net for a given module
-     * for memory efficiency
-     */
+    /* Reserved a number of sources for a module net for a given module for memory efficiency */
    void reserve_module_net_sources(const ModuleId& module, const ModuleNetId& net,
                                    const size_t& num_sources);

@ -306,9 +323,7 @@ class ModuleManager {
                                         const ModuleId& src_module, const size_t& instance_id,
                                         const ModulePortId& src_port, const size_t& src_pin);

-    /* Reserved a number of sinks for a module net for a given module
-     * for memory efficiency
-     */
+    /* Reserved a number of sinks for a module net for a given module for memory efficiency */
    void reserve_module_net_sinks(const ModuleId& module, const ModuleNetId& net,
                                  const size_t& num_sinks);

@ -330,6 +345,14 @@ class ModuleManager {
     * Do NOT use unless you know what you are doing!!!
     */
    void clear_config_region(const ModuleId& parent_module);
+
+    /* This is a strong function which will remove all the io children 
+     * under a given parent module
+     * It is mainly used by other functions which want to force an I/O sequence
+     * Do NOT use unless you know what you are doing!!!
+     */
+    void clear_io_children(const ModuleId& parent_module);
+
  public: /* Public validators/invalidators */
    bool valid_module_id(const ModuleId& module) const;
    bool valid_module_port_id(const ModuleId& module, const ModulePortId& port) const;
@ -371,6 +394,15 @@ class ModuleManager {
    vtr::vector<ModuleId, vtr::vector<ConfigRegionId, ConfigRegionId>> config_region_ids_; 
    vtr::vector<ModuleId, vtr::vector<ConfigRegionId, std::vector<size_t>>> config_region_children_;

+    /* I/O child modules are used to record the position of I/O modules in GPIO indexing
+     * The sequence of children in the list denotes which one is indexed in the GPIO first, etc. 
+     * Note that the sequence can be totally different from the children_ list
+     * This is really dependent how the I/O indexing is organized which should be made by users/designers 
+     */
+    vtr::vector<ModuleId, std::vector<ModuleId>> io_children_;
+    vtr::vector<ModuleId, std::vector<size_t>> io_child_instances_;
+    vtr::vector<ModuleId, std::vector<vtr::Point<int>>> io_child_coordinates_;
+
    /* Port-level data */
    vtr::vector<ModuleId, vtr::vector<ModulePortId, ModulePortId>> port_ids_;    /* List of ports for each Module */ 
    vtr::vector<ModuleId, vtr::vector<ModulePortId, BasicPort>> ports_;    /* List of ports for each Module */ 
--- a/openfpga/src/repack/repack.cpp
+++ b/openfpga/src/repack/repack.cpp
@ -388,10 +388,11 @@ void add_lb_router_nets(LbRouter& lb_router,
                        const VprDeviceAnnotation& device_annotation,
                        const ClusteringContext& clustering_ctx,
                        const VprClusteringAnnotation& clustering_annotation,
-                        const RepackDesignConstraints& design_constraints,
                        const ClusterBlockId& block_id,
-                        const bool& verbose) {
+                        const RepackOption& options) {
  size_t net_counter = 0;
+  bool verbose = options.verbose_output();
+  RepackDesignConstraints design_constraints = options.design_constraints();

  /* Two spots to find source nodes for each nets
   *  - nets that appear in the inputs of a clustered block
@ -437,6 +438,54 @@ void add_lb_router_nets(LbRouter& lb_router,
    pb_pin_mapped_nets[pb_pin] = atom_net_id;
  }

+  /* Cache the sink nodes/routing traces for the global nets which are specified to be ignored on given pins */
+  std::map<AtomNetId, std::vector<LbRRNodeId>> ignored_global_net_sinks;
+  std::map<AtomNetId, bool> ignored_atom_nets;
+  for (int j = 0; j < lb_type->pb_type->num_pins; j++) {
+    /* Get the source pb_graph pin and find the rr_node in logical block routing resource graph */
+    const t_pb_graph_pin* source_pb_pin = get_pb_graph_node_pin_from_block_pin(block_id, j);
+    VTR_ASSERT(source_pb_pin->parent_node == pb->pb_graph_node);
+
+    /* Bypass output pins */
+    if (OUT_PORT == source_pb_pin->port->type) {
+      continue;
+    }
+
+    /* Find the net mapped to this pin in clustering results*/
+    ClusterNetId cluster_net_id = clustering_ctx.clb_nlist.block_net(block_id, j);
+    /* Get the actual net id because it may be renamed during routing */
+    if (true == clustering_annotation.is_net_renamed(block_id, j)) {
+      cluster_net_id = clustering_annotation.net(block_id, j);
+    }
+
+    /* Bypass unmapped pins */
+    if (ClusterNetId::INVALID() == cluster_net_id) {
+      continue;
+    }
+
+    /* Only for global net which should be ignored, cache the sink nodes */
+    BasicPort curr_pin(std::string(source_pb_pin->port->name), source_pb_pin->pin_number, source_pb_pin->pin_number);
+    if ( (clustering_ctx.clb_nlist.net_is_ignored(cluster_net_id)) 
+      && (clustering_ctx.clb_nlist.net_is_global(cluster_net_id)) 
+      && (options.is_pin_ignore_global_nets(std::string(lb_type->pb_type->name), curr_pin))) {
+      /* Find the net mapped to this pin in clustering results*/
+      AtomNetId atom_net_id = pb_pin_mapped_nets[source_pb_pin];
+
+      std::vector<int> pb_route_indices = find_pb_route_by_atom_net(pb, source_pb_pin, atom_net_id);
+      VTR_ASSERT(1 == pb_route_indices.size());
+      int pb_route_index = pb_route_indices[0];
+      t_pb_graph_pin* packing_source_pb_pin = get_pb_graph_node_pin_from_block_pin(block_id, pb_route_index);
+      VTR_ASSERT(nullptr != packing_source_pb_pin);
+
+      /* Find all the sink pins in the pb_route, we walk through the input pins and find the pin  */
+      std::vector<t_pb_graph_pin*> sink_pb_graph_pins = find_routed_pb_graph_pins_atom_net(pb, source_pb_pin, packing_source_pb_pin, atom_net_id, device_annotation, pb_pin_mapped_nets, pb_graph_pin_lookup_from_index);
+      std::vector<LbRRNodeId> sink_lb_rr_nodes = find_lb_net_physical_sink_lb_rr_nodes(lb_rr_graph, sink_pb_graph_pins, device_annotation);
+      VTR_ASSERT(sink_lb_rr_nodes.size() == sink_pb_graph_pins.size());
+      ignored_global_net_sinks[atom_net_id].insert(ignored_global_net_sinks[atom_net_id].end(), sink_lb_rr_nodes.begin(), sink_lb_rr_nodes.end());
+      ignored_atom_nets[atom_net_id] = true;
+    }
+  }
+
  /* Cache all the source nodes and sinks node for each net
   * net_terminal[net][0] is the list of source nodes 
   * net_terminal[net][1] is the list of sink nodes 
@ -460,6 +509,12 @@ void add_lb_router_nets(LbRouter& lb_router,
    /* Find the net mapped to this pin in clustering results*/
    AtomNetId atom_net_id = pb_pin_mapped_nets[source_pb_pin];

+    BasicPort curr_pin(std::string(source_pb_pin->port->name), source_pb_pin->pin_number, source_pb_pin->pin_number);
+    if ( (ignored_atom_nets[atom_net_id]) 
+      && (options.is_pin_ignore_global_nets(std::string(lb_type->pb_type->name), curr_pin))) {
+      continue;
+    }
+
    /* Check if the net information is constrained or not */
    std::string constrained_net_name = design_constraints.find_constrained_pin_net(std::string(lb_type->pb_type->name), BasicPort(std::string(source_pb_pin->port->name), source_pb_pin->pin_number, source_pb_pin->pin_number));

@ -573,6 +628,10 @@ void add_lb_router_nets(LbRouter& lb_router,
               sink_pb_pin->to_string().c_str());
    }

+    /* Append sink nodes from ignored global net cache */
+    sink_lb_rr_nodes.insert(sink_lb_rr_nodes.end(), ignored_global_net_sinks[atom_net_id_to_route].begin(), ignored_global_net_sinks[atom_net_id_to_route].end());
+    VTR_LOGV(verbose, "Append %ld sinks from the routing traces of ignored global nets\n", ignored_global_net_sinks[atom_net_id_to_route].size());
+
    /* Add the net */
    add_lb_router_net_to_route(lb_router, lb_rr_graph,
                               std::vector<LbRRNodeId>(1, source_lb_rr_node),
@ -671,13 +730,13 @@ void repack_cluster(const AtomContext& atom_ctx,
                    const VprDeviceAnnotation& device_annotation,
                    VprClusteringAnnotation& clustering_annotation,
                    const VprBitstreamAnnotation& bitstream_annotation,
-                    const RepackDesignConstraints& design_constraints,
                    const ClusterBlockId& block_id,
-                    const bool& verbose) {
+                    const RepackOption& options) {
  /* Get the pb graph that current clustered block is mapped to */
  t_logical_block_type_ptr lb_type = clustering_ctx.clb_nlist.block_type(block_id);
  t_pb_graph_node* pb_graph_head = lb_type->pb_graph_head;
  VTR_ASSERT(nullptr != pb_graph_head);
+  bool verbose = options.verbose_output();

  /* We should get a non-empty graph */
  const LbRRGraph& lb_rr_graph = device_annotation.physical_lb_rr_graph(pb_graph_head);
@ -693,8 +752,7 @@ void repack_cluster(const AtomContext& atom_ctx,
  /* Add nets to be routed with source and terminals */
  add_lb_router_nets(lb_router, lb_type, lb_rr_graph, atom_ctx, device_annotation,
                     clustering_ctx, const_cast<const VprClusteringAnnotation&>(clustering_annotation),
-                     design_constraints,
-                     block_id, verbose);
+                     block_id, options);

  /* Initialize the modes to expand routing trees with the physical modes in device annotation
   * This is a must-do before running the routeri in the purpose of repacking!!!
@ -740,8 +798,7 @@ void repack_clusters(const AtomContext& atom_ctx,
                     const VprDeviceAnnotation& device_annotation,
                     VprClusteringAnnotation& clustering_annotation,
                     const VprBitstreamAnnotation& bitstream_annotation,
-                     const RepackDesignConstraints& design_constraints,
-                     const bool& verbose) {
+                     const RepackOption& options) {
  vtr::ScopedStartFinishTimer timer("Repack clustered blocks to physical implementation of logical tile");

  for (auto blk_id : clustering_ctx.clb_nlist.blocks()) {
@ -749,8 +806,8 @@ void repack_clusters(const AtomContext& atom_ctx,
                   device_annotation,
                   clustering_annotation, 
                   bitstream_annotation,
-                   design_constraints,
-                   blk_id, verbose);
+                   blk_id,
+                   options);
  }
 }

@ -808,22 +865,20 @@ void pack_physical_pbs(const DeviceContext& device_ctx,
                       VprDeviceAnnotation& device_annotation,
                       VprClusteringAnnotation& clustering_annotation,
                       const VprBitstreamAnnotation& bitstream_annotation,
-                       const RepackDesignConstraints& design_constraints,
                       const CircuitLibrary& circuit_lib,
-                       const bool& verbose) {
+                       const RepackOption& options) {

  /* build the routing resource graph for each logical tile */
  build_physical_lb_rr_graphs(device_ctx,
                              device_annotation,
-                              verbose);
+                              options.verbose_output());

  /* Call the LbRouter to re-pack each clustered block to physical implementation */ 
  repack_clusters(atom_ctx, clustering_ctx, 
                  const_cast<const VprDeviceAnnotation&>(device_annotation),
                  clustering_annotation, 
                  bitstream_annotation,
-                  design_constraints,
-                  verbose);
+                  options);

  /* Annnotate wire LUTs that are ONLY created by repacker!!!
   * This is a MUST RUN!
@ -833,7 +888,7 @@ void pack_physical_pbs(const DeviceContext& device_ctx,
                                                  clustering_ctx,
                                                  device_annotation,
                                                  circuit_lib,
-                                                  verbose);
+                                                  options.verbose_output());
 }

 } /* end namespace openfpga */
--- a/openfpga/src/repack/repack.h
+++ b/openfpga/src/repack/repack.h
@ -9,8 +9,8 @@
 #include "vpr_clustering_annotation.h"
 #include "vpr_routing_annotation.h"
 #include "vpr_bitstream_annotation.h"
-#include "repack_design_constraints.h"
 #include "circuit_library.h"
+#include "repack_option.h"

 /********************************************************************
 * Function declaration
@ -25,9 +25,8 @@ void pack_physical_pbs(const DeviceContext& device_ctx,
                       VprDeviceAnnotation& device_annotation,
                       VprClusteringAnnotation& clustering_annotation,
                       const VprBitstreamAnnotation& bitstream_annotation,
-                       const RepackDesignConstraints& design_constraints,
                       const CircuitLibrary& circuit_lib,
-                       const bool& verbose);
+                       const RepackOption& options);

 } /* end namespace openfpga */

--- a/openfpga/src/repack/repack_option.cpp
+++ b/openfpga/src/repack/repack_option.cpp
@ -0,0 +1,127 @@
+/******************************************************************************
+ * Member functions for data structure RepackOption
+ ******************************************************************************/
+#include <map>
+#include <array>
+#include "vtr_assert.h"
+#include "vtr_log.h"
+
+#include "repack_option.h"
+#include "openfpga_tokenizer.h"
+#include "openfpga_port_parser.h"
+
+/* begin namespace openfpga */
+namespace openfpga {
+
+/**************************************************
+ * Public Constructors
+ *************************************************/
+/* Default options: quiet output and no parsing error recorded */
+RepackOption::RepackOption()
+  : verbose_output_(false)
+  , num_parse_errors_(0) {
+}
+
+/**************************************************
+ * Public Accessors 
+ *************************************************/
+/* Return the design constraints stored in the options (returned by value, i.e., a copy) */
+RepackDesignConstraints RepackOption::design_constraints() const {
+  return design_constraints_;
+}
+
+/**
+ * @brief Identify if global nets should be ignored on a pin of a pb_type
+ * @param pb_type_name Name of the pb_type under which the pin is looked up
+ * @param pin The pin (or pin range) to be checked
+ * @return true when one of the ports registered for the pb_type is mergeable()
+ *         with the pin and contained() it; false otherwise, including the case
+ *         where nothing is registered for the pb_type
+ */
+bool RepackOption::is_pin_ignore_global_nets(const std::string& pb_type_name, const BasicPort& pin) const {
+  auto result = ignore_global_nets_on_pins_.find(pb_type_name);
+  if (result == ignore_global_nets_on_pins_.end()) {
+    /* Not found, return false */
+    return false;
+  }
+  /* This query is issued for each pin of each block during repacking:
+   * walk through the list by reference rather than copying every port
+   */
+  for (const BasicPort& existing_port : result->second) {
+    if (existing_port.mergeable(pin) && existing_port.contained(pin)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/* Return true when verbose output is enabled */
+bool RepackOption::verbose_output() const {
+  return verbose_output_;
+}
+
+/******************************************************************************
+ * Public Mutators and Validators
+ ******************************************************************************/
+/* Store a copy of the given design constraints in the options */
+void RepackOption::set_design_constraints(const RepackDesignConstraints& design_constraints) {
+  design_constraints_ = design_constraints;
+}
+
+/**
+ * @brief Parse the list of pins on which global nets should be ignored
+ * @param content A comma-separated list of pin definitions, each in the form
+ *                <pb_type_name>.<pin_name>[int:int], e.g., clb.I[0:11]
+ *
+ * Each valid pin definition is appended to the ignore list of its pb_type,
+ * unless it is already contained by a pin registered for the same pb_type.
+ * In that case, a warning is thrown and the definition is dropped.
+ * Invalid definitions are reported and counted; use valid() to check the result.
+ * The error counter is reset at the beginning of each call.
+ */
+void RepackOption::set_ignore_global_nets_on_pins(const std::string& content) {
+  num_parse_errors_ = 0;
+  /* Split the content using a tokenizer */
+  StringToken tokenizer(content);
+  std::vector<std::string> tokens = tokenizer.split(',');
+
+  /* Parse each token */
+  for (const std::string& token : tokens) {
+    /* Extract the pb_type name and port name */
+    StringToken pin_tokenizer(token);
+    std::vector<std::string> pin_info = pin_tokenizer.split('.');
+    /* Expect two contents, otherwise error out.
+     * The token comes from users: never use it as a format string
+     */
+    if (pin_info.size() != 2) {
+      VTR_LOG_ERROR("Invalid content '%s' to skip, expect <pb_type_name>.<pin>\n", token.c_str());
+      num_parse_errors_++;
+      continue;
+    }
+    const std::string& pb_type_name = pin_info[0];
+    PortParser port_parser(pin_info[1]);
+    BasicPort curr_port = port_parser.port();
+    if (!curr_port.is_valid()) {
+      VTR_LOG_ERROR("Invalid pin definition '%s', expect <pb_type_name>.<pin_name>[int:int]\n", token.c_str());
+      num_parse_errors_++;
+      continue;
+    }
+
+    /* Get the ignore list of the pb_type; an empty list is created on first use */
+    std::vector<BasicPort>& existing_ports = ignore_global_nets_on_pins_[pb_type_name];
+
+    /* Check if the port is already covered by any port in the ignore list:
+     * - If already contained, do nothing but throw a warning (only once)
+     * - Otherwise, add it as a new entry of the list
+     */
+    bool included_by_existing_port = false;
+    for (const BasicPort& existing_port : existing_ports) {
+      if (existing_port.mergeable(curr_port) && existing_port.contained(curr_port)) {
+        included_by_existing_port = true;
+        break;
+      }
+    }
+    if (included_by_existing_port) {
+      VTR_LOG_WARN("Pin definition '%s' is already included by other pin\n", token.c_str());
+      continue;
+    }
+    existing_ports.push_back(curr_port);
+  }
+}
+
+/* Enable or disable verbose output */
+void RepackOption::set_verbose_output(const bool& enabled) {
+  verbose_output_ = enabled;
+}
+
+/* The options are valid only when no parsing error has been counted */
+bool RepackOption::valid() const {
+  return 0 == num_parse_errors_;
+}
+
+} /* end namespace openfpga */
--- a/openfpga/src/repack/repack_option.h
+++ b/openfpga/src/repack/repack_option.h
@ -0,0 +1,52 @@
+#ifndef REPACK_OPTION_H
+#define REPACK_OPTION_H
+
+/********************************************************************
+ * Include header files required by the data structure definition
+ *******************************************************************/
+#include <map>
+#include <string>
+#include <vector>
+#include "repack_design_constraints.h"
+
+/* Begin namespace openfpga */
+namespace openfpga {
+
+/********************************************************************
+ * Options for the repacker
+ *******************************************************************/
+class RepackOption {
+  public: /* Public constructor */
+    /* Set default options */
+    RepackOption();
+  public: /* Public accessors */
+    /* Return a copy of the design constraints */
+    RepackDesignConstraints design_constraints() const;
+    /* Identify if a pin should ignore all the global nets */
+    bool is_pin_ignore_global_nets(const std::string& pb_type_name, const BasicPort& pin) const;
+    bool verbose_output() const;
+  public: /* Public mutators */
+    void set_design_constraints(const RepackDesignConstraints& design_constraints);
+    /* Parse a comma-separated list of pins, each in the form <pb_type_name>.<pin_name>[int:int] */
+    void set_ignore_global_nets_on_pins(const std::string& content); 
+    void set_verbose_output(const bool& enabled);
+  public: /* Public validators */
+    /* Check if the following internal data is valid or not:
+     * - no parsing errors
+     */
+    bool valid() const;
+  private: /* Internal Data */
+    RepackDesignConstraints design_constraints_;
+    /* The pins on which global nets should be ignored: [pb_type_name][0..num_ports]
+     * Each pin definition is kept as its own entry; ranges are not merged.
+     * For example: 
+     * - clb.I[0:1], clb.I[5:6] -> ["clb"][BasicPort(I, 0, 1), BasicPort(I, 5, 6)]
+     * - clb.I[0:1], clb.I[2:6] -> ["clb"][BasicPort(I, 0, 1), BasicPort(I, 2, 6)]
+     */
+    std::map<std::string, std::vector<BasicPort>> ignore_global_nets_on_pins_;
+
+    bool verbose_output_;
+
+    /* Number of errors counted when parsing the pin list; non-zero means the options are invalid */
+    int num_parse_errors_;
+};
+
+} /* End namespace openfpga*/
+
+#endif
--- a/openfpga/src/utils/module_manager_utils.cpp
+++ b/openfpga/src/utils/module_manager_utils.cpp
@ -1326,7 +1326,7 @@ void add_module_nets_cmos_memory_frame_decoder_config_bus(ModuleManager& module_

  /* Instanciate the decoder module here */
  VTR_ASSERT(0 == module_manager.num_instance(parent_module, decoder_module));
-  module_manager.add_child_module(parent_module, decoder_module);
+  module_manager.add_child_module(parent_module, decoder_module, false);

  /* Connect the enable (EN) port of memory modules under the parent module
   * to the frame decoder inputs
@ -1802,10 +1802,9 @@ void add_module_io_ports_from_child_modules(ModuleManager& module_manager,
  std::vector<BasicPort> gpio_ports_to_add;
  std::vector<bool> mappable_gpio_ports;

-  /* Iterate over the child modules */
-  for (const ModuleId& child : module_manager.child_modules(module_id)) {
-    /* Iterate over the child instances */
-    for (size_t i = 0; i < module_manager.num_instance(module_id, child); ++i) {
+  /* Iterate over the child modules and instances */
+  for (size_t i = 0; i < module_manager.io_children(module_id).size(); ++i) {
+    ModuleId child = module_manager.io_children(module_id)[i];
    /* Find all the global ports, whose port type is special */
    for (const ModulePortId& gpio_port_id : module_manager.module_port_ids_by_type(child, module_port_type)) {
      const BasicPort& gpio_port = module_manager.module_port(child, gpio_port_id);
@ -1837,7 +1836,6 @@ void add_module_io_ports_from_child_modules(ModuleManager& module_manager,
      }
    }
  } 
-  } 

  /* Record the port id for each type of GPIO port */
  std::vector<ModulePortId> gpio_port_ids;
@ -1854,9 +1852,9 @@ void add_module_io_ports_from_child_modules(ModuleManager& module_manager,
  /* Set up a counter for each type of GPIO port */
  std::vector<size_t> gpio_port_lsb(gpio_ports_to_add.size(), 0);
  /* Add module nets to connect the GPIOs of the module to the GPIOs of the sub module */
-  for (const ModuleId& child : module_manager.child_modules(module_id)) {
-    /* Iterate over the child instances */
-    for (const size_t& child_instance : module_manager.child_module_instances(module_id, child)) {
+  for (size_t i = 0; i < module_manager.io_children(module_id).size(); ++i) {
+    ModuleId child = module_manager.io_children(module_id)[i];
+    size_t child_instance = module_manager.io_child_instances(module_id)[i];
    /* Find all the global ports, whose port type is special */
    for (ModulePortId child_gpio_port_id : module_manager.module_port_ids_by_type(child, module_port_type)) {
      BasicPort child_gpio_port = module_manager.module_port(child, child_gpio_port_id);
@ -1892,11 +1890,9 @@ void add_module_io_ports_from_child_modules(ModuleManager& module_manager,
      }
    }
  }
-  }

  /* Check: all the lsb should now match the size of each GPIO port */
  for (size_t iport = 0; iport < gpio_ports_to_add.size(); ++iport) {
-    if (gpio_ports_to_add[iport].get_width() != gpio_port_lsb[iport]) 
    VTR_ASSERT(gpio_ports_to_add[iport].get_width() == gpio_port_lsb[iport]);
  }
 }
--- a/openfpga_flow/benchmarks/micro_benchmark/rst_on_lut/rst_on_lut.v
+++ b/openfpga_flow/benchmarks/micro_benchmark/rst_on_lut/rst_on_lut.v
@ -0,0 +1,26 @@
+/////////////////////////////////////////
+//  Functionality: A register driven by a combinational logic with reset signal
+//  Author:        Xifan Tang
+////////////////////////////////////////
+`timescale 1ns / 1ps
+
+module rst_on_lut(a, b, q, out, clk, rst);
+
+input wire rst;
+input wire clk;
+input wire a;
+input wire b;
+output reg q;
+output wire out;
+
+always @(posedge rst or posedge clk) begin
+  if (rst) begin
+    q <= 0;
+  end else begin
+    q <= a;
+  end
+end
+
+assign out = b & ~rst;
+
+endmodule
--- a/openfpga_flow/openfpga_shell_scripts/ignore_global_nets_on_pins_example_script.openfpga
+++ b/openfpga_flow/openfpga_shell_scripts/ignore_global_nets_on_pins_example_script.openfpga
@ -0,0 +1,76 @@
+# Run VPR for the benchmark design
+#--write_rr_graph example_rr_graph.xml
+vpr ${VPR_ARCH_FILE} ${VPR_TESTBENCH_BLIF} --clock_modeling ideal
+
+# Read OpenFPGA architecture definition
+read_openfpga_arch -f ${OPENFPGA_ARCH_FILE}
+
+# Read OpenFPGA simulation settings
+read_openfpga_simulation_setting -f ${OPENFPGA_SIM_SETTING_FILE}
+
+# Annotate the OpenFPGA architecture to VPR data base
+# to debug use --verbose options
+link_openfpga_arch --sort_gsb_chan_node_in_edges
+
+# Check and correct any naming conflicts in the BLIF netlist
+check_netlist_naming_conflict --fix --report ./netlist_renaming.xml
+
+# Apply fix-up to clustering nets based on routing results
+pb_pin_fixup --verbose
+
+# Apply fix-up to Look-Up Table truth tables based on packing results
+lut_truth_table_fixup
+
+# Build the module graph
+#  - Enabled compression on routing architecture modules
+#  - Enable pin duplication on grid modules
+build_fabric --compress_routing #--verbose
+
+# Write the fabric hierarchy of module graph to a file
+# This is used by hierarchical PnR flows
+write_fabric_hierarchy --file ./fabric_hierarchy.txt
+
+# Repack the netlist to physical pbs
+# This must be done before bitstream generator and testbench generation
+# Strongly recommend it is done after all the fix-up have been applied
+repack --ignore_global_nets_on_pins clb.I[0:11] #--verbose
+
+# Build the bitstream
+#  - Output the fabric-independent bitstream to a file
+build_architecture_bitstream --verbose --write_file fabric_independent_bitstream.xml
+
+# Build fabric-dependent bitstream
+build_fabric_bitstream --verbose
+
+# Write fabric-dependent bitstream
+write_fabric_bitstream --file fabric_bitstream.bit --format plain_text
+
+# Write the Verilog netlist for FPGA fabric
+#  - Enable the use of explicit port mapping in Verilog netlist
+write_fabric_verilog --file ./SRC --explicit_port_mapping --include_timing --print_user_defined_template --verbose
+
+# Write the Verilog testbench for FPGA fabric
+#  - We suggest the use of same output directory as fabric Verilog netlists
+#  - Must specify the reference benchmark file if you want to output any testbenches
+#  - Enable top-level testbench which is a full verification including programming circuit and core logic of FPGA
+#  - Enable pre-configured top-level testbench which is a fast verification skipping programming phase
+#  - Simulation ini file is optional and is needed only when you need to interface different HDL simulators using openfpga flow-run scripts
+write_full_testbench --file ./SRC --reference_benchmark_file_path ${REFERENCE_VERILOG_TESTBENCH} --include_signal_init --pin_constraints_file ${OPENFPGA_PIN_CONSTRAINTS_FILE} --bitstream fabric_bitstream.bit
+write_preconfigured_fabric_wrapper --embed_bitstream iverilog --file ./SRC  --pin_constraints_file ${OPENFPGA_PIN_CONSTRAINTS_FILE}
+write_preconfigured_testbench --file ./SRC --reference_benchmark_file_path ${REFERENCE_VERILOG_TESTBENCH} --pin_constraints_file ${OPENFPGA_PIN_CONSTRAINTS_FILE} 
+
+# Write the SDC files for PnR backend
+#  - Turn on every option here
+write_pnr_sdc --file ./SDC
+
+# Write SDC to disable timing for configure ports
+write_sdc_disable_timing_configure_ports --file ./SDC/disable_configure_ports.sdc
+
+# Write the SDC to run timing analysis for a mapped FPGA fabric
+write_analysis_sdc --file ./SDC_analysis
+
+# Finish and exit OpenFPGA
+exit
+
+# Note :
+# To run verification at the end of the flow, keep the generated sources in the ./SRC directory
--- a/openfpga_flow/regression_test_scripts/basic_reg_test.sh
+++ b/openfpga_flow/regression_test_scripts/basic_reg_test.sh
@ -129,6 +129,11 @@ echo -e "Testing K4N5 with pattern based local routing";
 run-task basic_tests/k4_series/k4n5_pattern_local_routing $@
 echo -e "Testing K4N4 with custom I/O location syntax";
 run-task basic_tests/k4_series/k4n4_custom_io_loc $@
+run-task basic_tests/k4_series/k4n4_custom_io_loc_center $@
+run-task basic_tests/k4_series/k4n4_custom_io_loc_center_height_odd $@
+run-task basic_tests/k4_series/k4n4_custom_io_loc_center_width_odd $@
+echo -e "Testing K4N4 with a local routing where reset can driven LUT inputs";
+run-task basic_tests/k4_series/k4n4_rstOnLut $@

 echo -e "Testing different tile organizations";
 echo -e "Testing tiles with pins only on top and left sides";
--- a/openfpga_flow/scripts/io_sequence_visualizer.py
+++ b/openfpga_flow/scripts/io_sequence_visualizer.py
@ -0,0 +1,109 @@
+"""
+=========================================
+Represents IO Sequence in OpenFPGA Engine
+=========================================
+
+This example demonstrates the ``OpenFPGA_Arch`` class which parses the
+`VPR` and `OpenFPGA` Architecture file and provides logical information.
+
+.. image:: ../../../examples/OpenFPGA_basic/_sample_io_sequence.svg
+   :width: 60%
+   :align: center
+
+Author: Ganesh Gore
+
+"""
+import math
+import svgwrite
+from svgwrite.container import Group
+
+
+def draw_connections(width, height, connections):
+    """
+    Draw connection sequence
+    """
+    dwg = svgwrite.Drawing()
+
+    DRAW_WIDTH = (width + 2) * SCALE
+    DRAW_HEIGHT = (height + 2) * SCALE
+    # set user coordinate space
+    dwg.viewbox(width=DRAW_WIDTH, height=DRAW_HEIGHT, miny=-1 * DRAW_HEIGHT)
+
+    dwg_main = Group(id="Main", transform="scale(1,-1)")
+    dwg.add(dwg_main)
+
+    for w in range(1, width + 2):
+        dwg_main.add(
+            dwg.line(
+                (w * SCALE, SCALE), (w * SCALE, (height + 1) * SCALE), stroke="red"
+            )
+        )
+
+    for h in range(1, height + 2):
+        dwg_main.add(
+            dwg.line((SCALE, h * SCALE), ((width + 1) * SCALE, h * SCALE), stroke="red")
+        )
+
+    path = "M "
+    for point in connections:
+        path += " %d %d " % ((point[0] + 0.5) * SCALE, (point[1] + 0.5) * SCALE)
+
+    dwg_main.add(dwg.path(path, stroke="blue", fill="none", stroke_width="2px"))
+    dwg.saveas("_sample_io_sequence.svg", pretty=True)
+
+
+SCALE = 20
+FPGA_WIDTH = 40
+FPGA_HEIGHT = 15
+
+W = max(FPGA_WIDTH, FPGA_HEIGHT)
+W2 = math.floor(W / 2) + 1
+
+connections = []
+xmin, xmax = 1, FPGA_WIDTH
+ymin, ymax = 1, FPGA_HEIGHT
+
+while (xmin < xmax) and (ymin < ymax):
+    print(xmin, ymin, end=" -> ")
+    print(xmax, ymax)
+
+    x = xmin
+    for y in range(ymin, ymax + 1):
+        connections.append((x, y))
+    y = ymax
+    for x in range(xmin, xmax + 1):
+        connections.append((x, y))
+
+    x = xmax
+    for y in range(ymin, ymax + 1)[::-1]:
+        connections.append((x, y))
+
+    y = ymin
+    for x in range(xmin, xmax + 1)[::-1][:-1]:
+        connections.append((x, y))
+
+    xmin += 1
+    ymin += 1
+    xmax -= 1
+    ymax -= 1
+
+
+if FPGA_HEIGHT % 2 == 1:  # if height is odd
+    if ymin == ymax:  # if touching vertically
+        y = ymin
+        for x in range(xmin, xmax + 1):
+            connections.append((x, y))
+
+
+if FPGA_WIDTH % 2 == 1:  # if width is odd
+    if xmin == xmax:  # if touching horizontally
+        x = xmin
+        for y in range(ymin, ymax + 1):
+            connections.append((x, y))
+
+# print(connections)
+if connections:
+    draw_connections(FPGA_WIDTH, FPGA_HEIGHT, connections)
+else:
+    # Dummy draw
+    draw_connections(FPGA_WIDTH, FPGA_HEIGHT, [(1, 1)])
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc/config/task.conf
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc/config/task.conf
@ -19,7 +19,7 @@ fpga_flow=vpr_blif
 openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/write_full_testbench_example_script.openfpga
 openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_cc_openfpga.xml
 openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
-openfpga_vpr_device_layout=--device 2x2
+openfpga_vpr_device_layout=--device 4x4
 openfpga_fast_configuration=

 [ARCHITECTURES]
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center/config/task.conf
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center/config/task.conf
@ -0,0 +1,37 @@
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# Configuration file for running experiments
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
+# Each job execute fpga_flow script on combination of architecture & benchmark
+# timeout_each_job is timeout for each job
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+
+[GENERAL]
+run_engine=openfpga_shell
+power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
+power_analysis = true
+spice_output=false
+verilog_output=true
+timeout_each_job = 20*60
+fpga_flow=vpr_blif
+
+[OpenFPGA_SHELL]
+openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/write_full_testbench_example_script.openfpga
+openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_cc_openfpga.xml
+openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
+openfpga_vpr_device_layout=--device 4x4_io_center
+openfpga_fast_configuration=
+
+[ARCHITECTURES]
+arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_customIoLoc_40nm.xml
+
+[BENCHMARKS]
+bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.blif
+
+[SYNTHESIS_PARAM]
+bench0_top = and2
+bench0_act = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.act
+bench0_verilog = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
+
+[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
+end_flow_with_test=
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center_height_odd/config/task.conf
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center_height_odd/config/task.conf
@ -0,0 +1,37 @@
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# Configuration file for running experiments
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
+# Each job execute fpga_flow script on combination of architecture & benchmark
+# timeout_each_job is timeout for each job
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+
+[GENERAL]
+run_engine=openfpga_shell
+power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
+power_analysis = true
+spice_output=false
+verilog_output=true
+timeout_each_job = 20*60
+fpga_flow=vpr_blif
+
+[OpenFPGA_SHELL]
+openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/write_full_testbench_example_script.openfpga
+openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_cc_openfpga.xml
+openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
+openfpga_vpr_device_layout=--device 4x3_io_center
+openfpga_fast_configuration=
+
+[ARCHITECTURES]
+arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_customIoLoc_40nm.xml
+
+[BENCHMARKS]
+bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.blif
+
+[SYNTHESIS_PARAM]
+bench0_top = and2
+bench0_act = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.act
+bench0_verilog = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
+
+[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
+end_flow_with_test=
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center_width_odd/config/task.conf
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_custom_io_loc_center_width_odd/config/task.conf
@ -0,0 +1,37 @@
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# Configuration file for running experiments
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
+# Each job execute fpga_flow script on combination of architecture & benchmark
+# timeout_each_job is timeout for each job
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+
+[GENERAL]
+run_engine=openfpga_shell
+power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
+power_analysis = true
+spice_output=false
+verilog_output=true
+timeout_each_job = 20*60
+fpga_flow=vpr_blif
+
+[OpenFPGA_SHELL]
+openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/write_full_testbench_example_script.openfpga
+openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_N4_40nm_cc_openfpga.xml
+openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/auto_sim_openfpga.xml
+openfpga_vpr_device_layout=--device 3x4_io_center
+openfpga_fast_configuration=
+
+[ARCHITECTURES]
+arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_N4_tileable_customIoLoc_40nm.xml
+
+[BENCHMARKS]
+bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.blif
+
+[SYNTHESIS_PARAM]
+bench0_top = and2
+bench0_act = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.act
+bench0_verilog = ${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/and2/and2.v
+
+[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
+end_flow_with_test=
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_rstOnLut/config/pin_constraints_reset.xml
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_rstOnLut/config/pin_constraints_reset.xml
@ -0,0 +1,7 @@
+<pin_constraints>
+  <!-- For a given .blif file, we want to assign 
+       - the reset signal to the op_reset[0] port of the FPGA fabric
+    -->
+  <set_io pin="op_reset[0]" net="rst"/>
+</pin_constraints>
+
--- a/openfpga_flow/tasks/basic_tests/k4_series/k4n4_rstOnLut/config/task.conf
+++ b/openfpga_flow/tasks/basic_tests/k4_series/k4n4_rstOnLut/config/task.conf
@ -0,0 +1,42 @@
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# Configuration file for running experiments
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+# timeout_each_job : FPGA Task script splits fpga flow into multiple jobs
+# Each job execute fpga_flow script on combination of architecture & benchmark
+# timeout_each_job is timeout for each job
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+
+[GENERAL]
+run_engine=openfpga_shell
+power_tech_file = ${PATH:OPENFPGA_PATH}/openfpga_flow/tech/PTM_45nm/45nm.xml
+power_analysis = false
+spice_output=false
+verilog_output=true
+timeout_each_job = 3*60
+fpga_flow=yosys_vpr
+
+[OpenFPGA_SHELL]
+openfpga_shell_template=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_shell_scripts/ignore_global_nets_on_pins_example_script.openfpga
+openfpga_arch_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_arch/k4_frac_N4_fracff_40nm_cc_openfpga.xml
+openfpga_sim_setting_file=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_simulation_settings/fixed_sim_openfpga.xml
+
+[ARCHITECTURES]
+arch0=${PATH:OPENFPGA_PATH}/openfpga_flow/vpr_arch/k4_frac_N4_tileable_fracff_rstOnLut_40nm.xml
+
+[BENCHMARKS]
+bench0=${PATH:OPENFPGA_PATH}/openfpga_flow/benchmarks/micro_benchmark/rst_on_lut/rst_on_lut.v
+
+[SYNTHESIS_PARAM]
+# Yosys script parameters
+bench_yosys_cell_sim_verilog_common=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_yosys_techlib/openfpga_dff_sim.v
+bench_yosys_dff_map_verilog_common=${PATH:OPENFPGA_PATH}/openfpga_flow/openfpga_yosys_techlib/openfpga_dff_map.v
+bench_read_verilog_options_common = -nolatches
+bench_yosys_common=${PATH:OPENFPGA_PATH}/openfpga_flow/misc/ys_tmpl_yosys_vpr_dff_flow.ys
+bench_yosys_rewrite_common=${PATH:OPENFPGA_PATH}/openfpga_flow/misc/ys_tmpl_yosys_vpr_flow_with_rewrite.ys;${PATH:OPENFPGA_PATH}/openfpga_flow/misc/ys_tmpl_rewrite_flow.ys
+
+bench0_top = rst_on_lut
+bench0_openfpga_pin_constraints_file = ${PATH:TASK_DIR}/config/pin_constraints_reset.xml
+
+[SCRIPT_PARAM_MIN_ROUTE_CHAN_WIDTH]
+end_flow_with_test=
+vpr_fpga_verilog_formal_verification_top_netlist=
--- a/openfpga_flow/vpr_arch/README.md
+++ b/openfpga_flow/vpr_arch/README.md
@ -22,6 +22,7 @@ Please reveal the following architecture features in the names to help quickly s
 - reduced\_io: If I/Os only appear a certain or multiple sides of FPGAs 
 - registerable\_io: If I/Os are registerable (can be either combinational or sequential)
 - CustomIoLoc: Use OpenFPGA's extended custom I/O location syntax
+- rstOnLut: The reset signal of CLB can feed LUT inputs through a local routing architecture
 - <feature\_size>: The technology node which the delay numbers are extracted from.
 - TileOrgz<Type>: How tile is organized. 
  * Top-left (Tl): the pins of a tile are placed on the top side and left side only
--- a/openfpga_flow/vpr_arch/k4_N4_tileable_customIoLoc_40nm.xml
+++ b/openfpga_flow/vpr_arch/k4_N4_tileable_customIoLoc_40nm.xml
@ -43,7 +43,10 @@
      <output name="inpad" num_pins="1"/>
      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
      <pinlocations pattern="custom">
-        <loc side="bottom">io_top[0:3].inpad io_top[0:7].outpad io_top[4:7].inpad</loc>
+        <loc side="top">io_top[0:1].inpad io_top[0:1].outpad</loc>
+        <loc side="right">io_top[3:2].inpad</loc>
+        <loc side="bottom">io_top[4:5].inpad io_top[2:4].outpad</loc>
+        <loc side="left">io_top[6:7].inpad io_top[5:7].outpad</loc>
      </pinlocations>
    </tile>
    <tile name="io_bottom" capacity="6" area="0">
@ -54,18 +57,10 @@
      <output name="inpad" num_pins="1"/>
      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
      <pinlocations pattern="custom">
-        <loc side="top">io_bottom[0:1].outpad io_bottom[0:3].inpad io_bottom[2:5].outpad io_bottom[4:5].inpad</loc>
-      </pinlocations>
-    </tile>
-    <tile name="io_left" capacity="4" area="0">
-      <equivalent_sites>
-        <site pb_type="io"/>
-      </equivalent_sites>
-      <input name="outpad" num_pins="1"/>
-      <output name="inpad" num_pins="1"/>
-      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
-      <pinlocations pattern="custom">
-        <loc side="right">io_left.inpad io_left.outpad</loc>
+        <loc side="top">io_bottom[0:1].outpad</loc>
+        <loc side="right">io_bottom[2:2].outpad io_bottom[0:3].inpad</loc>
+        <loc side="bottom">io_bottom[3:3].outpad</loc>
+        <loc side="left">io_bottom[4:5].outpad io_bottom[4:5].inpad</loc>
      </pinlocations>
    </tile>
    <tile name="io_right" capacity="2" area="0">
@ -76,7 +71,10 @@
      <output name="inpad" num_pins="1"/>
      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
      <pinlocations pattern="custom">
-        <loc side="left">io_right[1:1].inpad io_right[1:0].outpad io_right[0:0].inpad</loc>
+        <loc side="top">io_right[0:0].inpad</loc>
+        <loc side="right">io_right[1:1].outpad</loc>
+        <loc side="bottom">io_right[0:0].outpad</loc>
+        <loc side="left">io_right[1:1].inpad</loc>
      </pinlocations>
    </tile>
    <tile name="clb" area="53894">
@ -94,61 +92,123 @@
  <!-- Physical descriptions begin -->
  <layout tileable="true">
    <auto_layout aspect_ratio="1.0">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring
+        Intentionally no I/Os on the left side, so as to check the correctness of I/O indexing in OpenFPGA
+        -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </auto_layout>
    <fixed_layout name="2x2" width="4" height="4">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
    <fixed_layout name="4x4" width="6" height="6">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
+    <fixed_layout name="4x4_io_center" width="6" height="6">
+      <!--Perimeter of 'clb' blocks, I/Os are placed in the center-->
+      <row type="clb" starty="H-2" priority="90"/>
+      <row type="clb" starty="1" priority="91"/>
+      <col type="clb" startx="W-2" priority="93"/>
+      <col type="clb" startx="1" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'io_top'-->
+      <fill type="io_top" priority="10"/>
+    </fixed_layout>
+    <fixed_layout name="4x3_io_center" width="6" height="5">
+      <!--Perimeter of 'clb' blocks, I/Os are placed in the center-->
+      <row type="clb" starty="H-2" priority="90"/>
+      <row type="clb" starty="1" priority="91"/>
+      <col type="clb" startx="W-2" priority="93"/>
+      <col type="clb" startx="1" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'io_top'-->
+      <fill type="io_top" priority="10"/>
+    </fixed_layout>
+    <fixed_layout name="3x4_io_center" width="5" height="6">
+      <!--Perimeter of 'clb' blocks, I/Os are placed in the center-->
+      <row type="clb" starty="H-2" priority="90"/>
+      <row type="clb" starty="1" priority="91"/>
+      <col type="clb" startx="W-2" priority="93"/>
+      <col type="clb" startx="1" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'io_top'-->
+      <fill type="io_top" priority="10"/>
+    </fixed_layout>
    <fixed_layout name="48x48" width="50" height="50">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
    <fixed_layout name="72x72" width="74" height="74">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
    </fixed_layout>
    <fixed_layout name="96x96" width="98" height="98">
-      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
-      <row type="io_top" starty="H-1" priority="100"/>
-      <row type="io_bottom" starty="0" priority="100"/>
-      <col type="io_left" startx="0" priority="100"/>
-      <col type="io_right" startx="W-1" priority="100"/>
+      <!--Perimeter of 'EMPTY' blocks, I/Os are placed on the inner ring -->
+      <row type="io_top" starty="H-2" priority="90"/>
+      <row type="io_bottom" starty="1" priority="91"/>
+      <col type="io_right" startx="W-2" priority="93"/>
+      <row type="EMPTY" starty="H-1" priority="101"/>
+      <row type="EMPTY" starty="0" priority="102"/>
+      <col type="EMPTY" startx="0" priority="103"/>
+      <col type="EMPTY" startx="W-1" priority="104"/>
      <corners type="EMPTY" priority="101"/>
      <!--Fill with 'clb'-->
      <fill type="clb" priority="10"/>
--- a/openfpga_flow/vpr_arch/k4_frac_N4_tileable_fracff_rstOnLut_40nm.xml
+++ b/openfpga_flow/vpr_arch/k4_frac_N4_tileable_fracff_rstOnLut_40nm.xml
@ -0,0 +1,627 @@
+<!-- 
+  Flagship Heterogeneous Architecture (No Carry Chains) for VTR 7.0.
+
+  - 40 nm technology
+  - General purpose logic block: 
+    K = 4, N = 4, fracturable 4 LUTs (can operate as one 4-LUT or two 3-LUTs with all 3 inputs shared) 
+    with optionally registered outputs
+  - Routing architecture: L = 4, fc_in = 0.15, Fc_out = 0.1
+
+  Authors: Xifan Tang
+-->
+<architecture>
+  <!-- 
+       ODIN II specific config begins 
+       Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+       ".model [type_of_block]") that this architecture supports.
+
+       Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+       already special structures in blif (.names, .input, .output, and .latch) 
+       that describe them.
+  -->
+  <models>
+    <!-- A virtual model for I/O to be used in the physical mode of io block -->
+    <model name="io">
+      <input_ports>
+        <port name="outpad"/>
+      </input_ports>
+      <output_ports>
+        <port name="inpad"/>
+      </output_ports>
+    </model>
+    <!-- A virtual model for the fracturable 4-input LUT to be used in the physical mode of fle -->
+    <model name="frac_lut4">
+      <input_ports>
+        <port name="in"/>
+      </input_ports>
+      <output_ports>
+        <port name="lut3_out"/>
+        <port name="lut4_out"/>
+      </output_ports>
+    </model>
+    <!-- A virtual model for a D-type flip-flop without reset, used by the dff mode of the ff pb_type -->
+    <model name="dff">
+      <input_ports>
+        <port name="D" clock="C"/>
+        <port name="C" is_clock="1"/>
+      </input_ports>
+      <output_ports>
+        <port name="Q" clock="C"/>
+      </output_ports>
+    </model>
+    <!-- A virtual model for a D-type flip-flop with a reset port (R), used by the physical ff and by the dffr mode of the ff pb_type -->
+    <model name="dffr">
+      <input_ports>
+        <port name="D" clock="C"/>
+        <port name="R" clock="C"/>
+        <port name="C" is_clock="1"/>
+      </input_ports>
+      <output_ports>
+        <port name="Q" clock="C"/>
+      </output_ports>
+    </model>
+    <!-- A virtual model for a D-type flip-flop with a reset port (RN), used by the dffrn mode of the ff pb_type -->
+    <model name="dffrn">
+      <input_ports>
+        <port name="D" clock="C"/>
+        <port name="RN" clock="C"/>
+        <port name="C" is_clock="1"/>
+      </input_ports>
+      <output_ports>
+        <port name="Q" clock="C"/>
+      </output_ports>
+    </model>
+  </models>
+  <tiles>
+    <!-- Do NOT add clock pins to I/O here!!! VPR does not build clock network in the way that OpenFPGA can support
+         If you need to register the I/O, define clocks in the circuit models
+         These clocks can be handled in back-end
+     -->
+    <tile name="io" capacity="8" area="0">
+      <equivalent_sites>
+        <site pb_type="io"/>
+      </equivalent_sites>
+      <input name="outpad" num_pins="1"/>
+      <output name="inpad" num_pins="1"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+      <pinlocations pattern="custom">
+        <loc side="left">io.outpad io.inpad</loc>
+        <loc side="top">io.outpad io.inpad</loc>
+        <loc side="right">io.outpad io.inpad</loc>
+        <loc side="bottom">io.outpad io.inpad</loc>
+      </pinlocations>
+    </tile>
+    <tile name="clb" area="53894">
+      <equivalent_sites>
+        <site pb_type="clb"/>
+      </equivalent_sites>
+      <input name="I" num_pins="12" equivalent="full"/>
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <output name="O" num_pins="8" equivalent="none"/>
+      <clock name="clk" num_pins="1"/>
+      <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+        <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+        <fc_override port_name="reset" fc_type="frac" fc_val="0"/>
+      </fc>
+      <pinlocations pattern="spread"/>
+    </tile>
+  </tiles>
+  <!-- ODIN II specific config ends -->
+  <!-- Physical descriptions begin -->
+  <layout tileable="true">
+    <auto_layout aspect_ratio="1.0">
+      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
+      <perimeter type="io" priority="100"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'clb'-->
+      <fill type="clb" priority="10"/>
+    </auto_layout>
+    <fixed_layout name="2x2" width="4" height="4">
+      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
+      <perimeter type="io" priority="100"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'clb'-->
+      <fill type="clb" priority="10"/>
+    </fixed_layout>
+    <fixed_layout name="4x4" width="6" height="6">
+      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
+      <perimeter type="io" priority="100"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'clb'-->
+      <fill type="clb" priority="10"/>
+    </fixed_layout>
+    <fixed_layout name="48x48" width="50" height="50">
+      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
+      <perimeter type="io" priority="100"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'clb'-->
+      <fill type="clb" priority="10"/>
+    </fixed_layout>
+  </layout>
+  <device>
+    <!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM 
+			     models. We are modifying the delay values however, to include metal C and R, which allows more architecture
+			     experimentation. We are also modifying the relative resistance of PMOS to be 1.8x that of NMOS
+			     (vs. Ian's 3x) as 1.8x lines up with Jeff G's data from a 45 nm process (and is more typical of 
+			     45 nm in general). I'm upping the Rmin_nmos from Ian's just over 6k to nearly 9k, and dropping 
+			     RminW_pmos from 18k to 16k to hit this 1.8x ratio, while keeping the delays of buffers approximately
+			     lined up with Stratix IV. 
+			     We are using Jeff G.'s capacitance data for 45 nm (in tech/ptm_45nm).
+			     Jeff's tables list C in for transistors with widths in multiples of the minimum feature size (45 nm).
+			     The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply drive strength sizes in this file
+	                     by 2.5x when looking up in Jeff's tables.
+			     The delay values are lined up with Stratix IV, which has an architecture similar to this
+			     proposed FPGA, and which is also 40 nm 
+			     C_ipin_cblock: input capacitance of a track buffer, which VPR assumes is a single-stage
+			     4x minimum drive strength buffer. -->
+    <sizing R_minW_nmos="8926" R_minW_pmos="16067"/>
+    <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing)
+     	  area; set to 0 since we explicitly set the area of all blocks currently in this architecture file.
+	  -->
+    <area grid_logic_tile_area="0"/>
+    <chan_width_distr>
+      <x distr="uniform" peak="1.000000"/>
+      <y distr="uniform" peak="1.000000"/>
+    </chan_width_distr>
+    <switch_block type="wilton" fs="3" sub_type="subset" sub_fs="3"/>
+    <connection_block input_switch_name="ipin_cblock"/>
+  </device>
+  <switchlist>
+    <!-- VB: the mux_trans_size and buf_size data below is in minimum width transistor *areas*, assuming the purple
+	       book area formula. This means the mux transistors are about 5x minimum drive strength.
+	       We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large 
+	       mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. We assume
+	       the n and p transistors in the first stage are equal-sized to lower the buffer trip point, since it's fed
+	       by a pass transistor mux. We can then reverse engineer the buffer second stage to hit the specified 
+	       buf_size (really buffer area) - 16.2x minimum drive nmos and 1.8*16.2 = 29.2x minimum drive.
+	       I then took the data from Jeff G.'s PTM modeling of 45 nm to get the Cin (gate of first stage) and Cout 
+	       (diff of second stage) listed below.  Jeff's models are in tech/ptm_45nm, and are in min feature multiples.
+	       The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply the drive strength sizes above by 
+	       2.5x when looking up in Jeff's tables.
+	       Finally, we choose a switch delay (58 ps) that leads to length 4 wires having a delay equal to that of SIV of 126 ps.
+	       This also leads to the switch being 46% of the total wire delay, which is reasonable. -->
+    <switch type="mux" name="0" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
+    <!-- switch ipin_cblock resistance corresponds to a 4x minimum drive strength buffer (R_minW_nmos / 4) -->
+    <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="7.247000e-11" mux_trans_size="1.222260" buf_size="auto"/>
+  </switchlist>
+  <segmentlist>
+    <!-- VB & JL: using ITRS metal stack data, 96 nm half pitch wires, which are intermediate metal width/space.
+			     With the 96 nm half pitch, such wires would take 60 um of height, vs. a 90 um high (approximated as square) Stratix IV tile so this seems
+			     reasonable. Using a tile length of 90 um, corresponding to the length of a Stratix IV tile if it were square. -->
+    <!-- GIVE a specific name for the segment! OpenFPGA appreciates that! -->
+    <segment name="L4" freq="1.000000" length="4" type="unidir" Rmetal="101" Cmetal="22.5e-15">
+      <mux name="0"/>
+      <sb type="pattern">1 1 1 1 1</sb>
+      <cb type="pattern">1 1 1 1</cb>
+    </segment>
+  </segmentlist>
+  <complexblocklist>
+    <!-- Define I/O pads begin -->
+    <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA -->
+    <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+    <pb_type name="io">
+      <input name="outpad" num_pins="1"/>
+      <output name="inpad" num_pins="1"/>
+      <!-- Do NOT add clock pins to I/O here!!! VPR does not build clock network in the way that OpenFPGA can support
+           If you need to register the I/O, define clocks in the circuit models
+           These clocks can be handled in back-end
+       -->
+      <!-- This mode denotes the physical implementation of an I/O.
+           It is not packable (disable_packing is set); it is mainly used for fabric Verilog generation.
+        -->
+      <mode name="physical" disable_packing="true">
+        <pb_type name="iopad" blif_model=".subckt io" num_pb="1">
+          <input name="outpad" num_pins="1"/>
+          <output name="inpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="outpad" input="io.outpad" output="iopad.outpad">
+            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="iopad.outpad"/>
+          </direct>
+          <direct name="inpad" input="iopad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" in_port="iopad.inpad" out_port="io.inpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+
+      <!-- IOs can operate as either inputs or outputs.
+	     Delays below come from Ian Kuon. They are small, so they should be interpreted as
+	     the delays to and from registers in the I/O (generally I/Os are registered
+	     today, and that is when you timing-analyze them).
+	     -->
+      <mode name="inpad">
+        <pb_type name="inpad" blif_model=".input" num_pb="1">
+          <output name="inpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="inpad" input="inpad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="outpad">
+        <pb_type name="outpad" blif_model=".output" num_pb="1">
+          <input name="outpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="outpad" input="io.outpad" output="outpad.outpad">
+            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin is driven by 10% of the tracks in a channel -->
+      <!-- IOs go on the periphery of the FPGA, for consistency, 
+          make it physically equivalent on all sides so that only one definition of I/Os is needed.
+          If I do not make a physically equivalent definition, then I need to define 4 different I/Os, one for each side of the FPGA
+        -->
+      <!-- Place I/Os on the sides of the FPGA -->
+      <power method="ignore"/>
+    </pb_type>
+    <!-- Define I/O pads ends -->
+    <!-- Define general purpose logic block (CLB) begin -->
+    <!--- Area calculation: Total Stratix IV tile area is about 8100 um^2, and a minimum width transistor 
+	   area is 60 L^2 yields a tile area of 84375 MWTAs.
+	   Routing at W=300 is 30481 MWTAs, leaving us with a total of 53000 MWTAs for logic block area 
+	   This means that only 37% of our area is in the general routing, and 63% is inside the logic
+	   block. Note that the crossbar / local interconnect is considered part of the logic block
+	   area in this analysis. That is a lower proportion of routing area than most academics
+	   assume, but note that the total routing area really includes the crossbar, which would push
+	   routing area up significantly, we estimate into the ~70% range. 
+	   -->
+    <pb_type name="clb">
+      <input name="I" num_pins="12" equivalent="full"/>
+      <input name="reset" num_pins="1"/>
+      <output name="O" num_pins="8" equivalent="none"/>
+      <clock name="clk" num_pins="1"/>
+      <!-- Describe fracturable logic element.
+             Each fracturable logic element has a 4-LUT that can alternatively operate as two 3-LUTs with shared inputs.
+             The outputs of the fracturable logic element can be optionally registered
+        -->
+      <pb_type name="fle" num_pb="4">
+        <input name="in" num_pins="4"/>
+        <input name="reset" num_pins="1"/>
+        <output name="out" num_pins="2"/>
+        <clock name="clk" num_pins="1"/>
+        <!-- Physical mode definition begin (physical implementation of the fle) -->
+        <mode name="physical" disable_packing="true">
+          <pb_type name="fabric" num_pb="1">
+            <input name="in" num_pins="4"/>
+            <input name="reset" num_pins="1"/>
+            <output name="out" num_pins="2"/>
+            <clock name="clk" num_pins="1"/>
+            <pb_type name="frac_logic" num_pb="1">
+              <input name="in" num_pins="4"/>
+              <output name="out" num_pins="2"/>
+              <!-- Define LUT -->
+              <pb_type name="frac_lut4" blif_model=".subckt frac_lut4" num_pb="1">
+                <input name="in" num_pins="4"/>
+                <output name="lut3_out" num_pins="2"/>
+                <output name="lut4_out" num_pins="1"/>
+              </pb_type>
+              <interconnect>
+                <direct name="direct1" input="frac_logic.in" output="frac_lut4.in"/>
+                <direct name="direct2" input="frac_lut4.lut3_out[1]" output="frac_logic.out[1]"/>
+                <!-- Xifan Tang: I use out[0] because the output of lut4 in lut4 mode is wired to the out[0] -->
+                <mux name="mux1" input="frac_lut4.lut4_out frac_lut4.lut3_out[0]" output="frac_logic.out[0]"/>
+              </interconnect>
+            </pb_type>
+            <!-- Define flip-flop -->
+            <pb_type name="ff" blif_model=".subckt dffr" num_pb="2">
+              <input name="D" num_pins="1" port_class="D"/>
+              <input name="R" num_pins="1"/>
+              <output name="Q" num_pins="1" port_class="Q"/>
+              <clock name="C" num_pins="1" port_class="clock"/>
+              <T_setup value="66e-12" port="ff.D" clock="C"/>
+              <T_setup value="66e-12" port="ff.R" clock="C"/>
+              <T_clock_to_Q max="124e-12" port="ff.Q" clock="C"/>
+            </pb_type>
+            <interconnect>
+              <direct name="direct1" input="fabric.in" output="frac_logic.in"/>
+              <direct name="direct2" input="frac_logic.out[1:0]" output="ff[1:0].D"/>
+              <complete name="direct3" input="fabric.clk" output="ff[1:0].C"/>
+              <complete name="direct4" input="fabric.reset" output="ff[1:0].R"/>
+              <mux name="mux1" input="ff[0].Q frac_logic.out[0]" output="fabric.out[0]">
+                <!-- LUT to output is faster than FF to output on a Stratix IV -->
+                <delay_constant max="25e-12" in_port="frac_logic.out[0]" out_port="fabric.out[0]"/>
+                <delay_constant max="45e-12" in_port="ff[0].Q" out_port="fabric.out[0]"/>
+              </mux>
+              <mux name="mux2" input="ff[1].Q frac_logic.out[1]" output="fabric.out[1]">
+                <!-- LUT to output is faster than FF to output on a Stratix IV -->
+                <delay_constant max="25e-12" in_port="frac_logic.out[1]" out_port="fabric.out[1]"/>
+                <delay_constant max="45e-12" in_port="ff[1].Q" out_port="fabric.out[1]"/>
+              </mux>
+            </interconnect>
+          </pb_type>
+          <interconnect>
+            <direct name="direct1" input="fle.in" output="fabric.in"/>
+            <direct name="direct2" input="fabric.out" output="fle.out"/>
+            <direct name="direct3" input="fle.clk" output="fabric.clk"/>
+            <direct name="direct4" input="fle.reset" output="fabric.reset"/>
+          </interconnect>
+        </mode>
+        <!-- Physical mode definition end (physical implementation of the fle) -->
+        <!-- Dual 3-LUT mode definition begin -->
+        <mode name="n2_lut3">
+          <pb_type name="lut3inter" num_pb="1">
+            <input name="in" num_pins="3"/>
+            <input name="reset" num_pins="1"/>
+            <output name="out" num_pins="2"/>
+            <clock name="clk" num_pins="1"/>
+            <pb_type name="ble3" num_pb="2">
+              <input name="in" num_pins="3"/>
+              <input name="reset" num_pins="1"/>
+              <output name="out" num_pins="1"/>
+              <clock name="clk" num_pins="1"/>
+              <!-- Define the LUT -->
+              <pb_type name="lut3" blif_model=".names" num_pb="1" class="lut">
+                <input name="in" num_pins="3" port_class="lut_in"/>
+                <output name="out" num_pins="1" port_class="lut_out"/>
+                <!-- LUT timing using delay matrix -->
+                <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                           we instead take the average of these numbers to get more stable results
+                      82e-12
+                      173e-12
+                      261e-12
+                      263e-12
+                      398e-12
+                      -->
+                <delay_matrix type="max" in_port="lut3.in" out_port="lut3.out">
+                  235e-12
+                  235e-12
+                  235e-12
+                </delay_matrix>
+              </pb_type>
+              <!-- Define the flip-flop -->
+              <pb_type name="ff" num_pb="1">
+                <input name="D" num_pins="1"/>
+                <input name="R" num_pins="1"/>
+                <output name="Q" num_pins="1"/>
+                <clock name="C" num_pins="1"/>
+                <mode name="latch">
+                  <pb_type name="latch" blif_model=".latch" num_pb="1">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="latch.D" clock="clk"/>
+                    <T_clock_to_Q max="124e-12" port="latch.Q" clock="clk"/>
+                  </pb_type> 
+                  <interconnect>
+                    <direct name="direct1" input="ff.D" output="latch.D"/>
+                    <direct name="direct2" input="ff.C" output="latch.clk"/>
+                    <direct name="direct3" input="latch.Q" output="ff.Q"/>
+                  </interconnect>
+                </mode>
+                <mode name="dff">
+                  <pb_type name="dff" blif_model=".subckt dff" num_pb="1">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="C" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="dff.D" clock="C"/>
+                    <T_clock_to_Q max="124e-12" port="dff.Q" clock="C"/>
+                  </pb_type> 
+                  <interconnect>
+                    <direct name="direct1" input="ff.D" output="dff.D"/>
+                    <direct name="direct2" input="ff.C" output="dff.C"/>
+                    <direct name="direct3" input="dff.Q" output="ff.Q"/>
+                  </interconnect>
+                </mode>
+                <mode name="dffr">
+                  <pb_type name="dffr" blif_model=".subckt dffr" num_pb="1">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <input name="R" num_pins="1"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="C" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="dffr.D" clock="C"/>
+                    <T_setup value="66e-12" port="dffr.R" clock="C"/>
+                    <T_clock_to_Q max="124e-12" port="dffr.Q" clock="C"/>
+                  </pb_type> 
+                  <interconnect>
+                    <direct name="direct1" input="ff.D" output="dffr.D"/>
+                    <direct name="direct2" input="ff.C" output="dffr.C"/>
+                    <direct name="direct3" input="ff.R" output="dffr.R"/>
+                    <direct name="direct4" input="dffr.Q" output="ff.Q"/>
+                  </interconnect>
+                </mode>
+                <mode name="dffrn">
+                  <pb_type name="dffrn" blif_model=".subckt dffrn" num_pb="1">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <input name="RN" num_pins="1"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="C" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="dffrn.D" clock="C"/>
+                    <T_setup value="66e-12" port="dffrn.RN" clock="C"/>
+                    <T_clock_to_Q max="124e-12" port="dffrn.Q" clock="C"/>
+                  </pb_type> 
+                  <interconnect>
+                    <direct name="direct1" input="ff.D" output="dffrn.D"/>
+                    <direct name="direct2" input="ff.C" output="dffrn.C"/>
+                    <direct name="direct3" input="ff.R" output="dffrn.RN"/>
+                    <direct name="direct4" input="dffrn.Q" output="ff.Q"/>
+                  </interconnect>
+                </mode>
+              </pb_type> 
+              <interconnect>
+                <direct name="direct1" input="ble3.in[2:0]" output="lut3[0:0].in[2:0]"/>
+                <direct name="direct2" input="lut3[0:0].out" output="ff[0:0].D">
+                  <!-- Advanced user option that tells CAD tool to find LUT+FF pairs in netlist -->
+                  <pack_pattern name="ble3" in_port="lut3[0:0].out" out_port="ff[0:0].D"/>
+                </direct>
+                <direct name="direct3" input="ble3.clk" output="ff[0:0].C"/>
+                <direct name="direct4" input="ble3.reset" output="ff[0:0].R"/>
+                <mux name="mux1" input="ff[0:0].Q lut3.out[0:0]" output="ble3.out[0:0]">
+                  <!-- LUT to output is faster than FF to output on a Stratix IV -->
+                  <delay_constant max="25e-12" in_port="lut3.out[0:0]" out_port="ble3.out[0:0]"/>
+                  <delay_constant max="45e-12" in_port="ff[0:0].Q" out_port="ble3.out[0:0]"/>
+                </mux>
+              </interconnect>
+            </pb_type>
+            <interconnect>
+              <direct name="direct1" input="lut3inter.in" output="ble3[0:0].in"/>
+              <direct name="direct2" input="lut3inter.in" output="ble3[1:1].in"/>
+              <direct name="direct3" input="ble3[1:0].out" output="lut3inter.out"/>
+              <complete name="complete1" input="lut3inter.clk" output="ble3[1:0].clk"/>
+              <complete name="complete2" input="lut3inter.reset" output="ble3[1:0].reset"/>
+            </interconnect>
+          </pb_type>
+          <interconnect>
+            <direct name="direct1" input="fle.in[2:0]" output="lut3inter.in"/>
+            <direct name="direct2" input="lut3inter.out" output="fle.out"/>
+            <direct name="direct3" input="fle.clk" output="lut3inter.clk"/>
+            <direct name="direct4" input="fle.reset" output="lut3inter.reset"/>
+          </interconnect>
+        </mode>
+        <!-- Dual 3-LUT mode definition end -->
+        <!-- 4-LUT mode definition begin -->
+        <mode name="n1_lut4">
+          <!-- Define 4-LUT mode -->
+          <pb_type name="ble4" num_pb="1">
+            <input name="in" num_pins="4"/>
+            <input name="reset" num_pins="1"/>
+            <output name="out" num_pins="1"/>
+            <clock name="clk" num_pins="1"/>
+            <!-- Define LUT -->
+            <pb_type name="lut4" blif_model=".names" num_pb="1" class="lut">
+              <input name="in" num_pins="4" port_class="lut_in"/>
+              <output name="out" num_pins="1" port_class="lut_out"/>
+              <!-- LUT timing using delay matrix -->
+              <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                       we instead take the average of these numbers to get more stable results
+                  82e-12
+                  173e-12
+                  261e-12
+                  263e-12
+                  398e-12
+                  397e-12
+                  -->
+              <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                261e-12
+                261e-12
+                261e-12
+                261e-12
+              </delay_matrix>
+            </pb_type>
+            <!-- Define the flip-flop -->
+            <pb_type name="ff" num_pb="1">
+              <input name="D" num_pins="1"/>
+              <input name="R" num_pins="1"/>
+              <output name="Q" num_pins="1"/>
+              <clock name="C" num_pins="1"/>
+              <mode name="latch">
+                <pb_type name="latch" blif_model=".latch" num_pb="1">
+                  <input name="D" num_pins="1" port_class="D"/>
+                  <output name="Q" num_pins="1" port_class="Q"/>
+                  <clock name="clk" num_pins="1" port_class="clock"/>
+                  <T_setup value="66e-12" port="latch.D" clock="clk"/>
+                  <T_clock_to_Q max="124e-12" port="latch.Q" clock="clk"/>
+                </pb_type> 
+                <interconnect>
+                  <direct name="direct1" input="ff.D" output="latch.D"/>
+                  <direct name="direct2" input="ff.C" output="latch.clk"/>
+                  <direct name="direct3" input="latch.Q" output="ff.Q"/>
+                </interconnect>
+              </mode>
+              <mode name="dff">
+                <pb_type name="dff" blif_model=".subckt dff" num_pb="1">
+                  <input name="D" num_pins="1" port_class="D"/>
+                  <output name="Q" num_pins="1" port_class="Q"/>
+                  <clock name="C" num_pins="1" port_class="clock"/>
+                  <T_setup value="66e-12" port="dff.D" clock="C"/>
+                  <T_clock_to_Q max="124e-12" port="dff.Q" clock="C"/>
+                </pb_type> 
+                <interconnect>
+                  <direct name="direct1" input="ff.D" output="dff.D"/>
+                  <direct name="direct2" input="ff.C" output="dff.C"/>
+                  <direct name="direct3" input="dff.Q" output="ff.Q"/>
+                </interconnect>
+              </mode>
+              <mode name="dffr">
+                <pb_type name="dffr" blif_model=".subckt dffr" num_pb="1">
+                  <input name="D" num_pins="1" port_class="D"/>
+                  <input name="R" num_pins="1"/>
+                  <output name="Q" num_pins="1" port_class="Q"/>
+                  <clock name="C" num_pins="1" port_class="clock"/>
+                  <T_setup value="66e-12" port="dffr.D" clock="C"/>
+                  <T_setup value="66e-12" port="dffr.R" clock="C"/>
+                  <T_clock_to_Q max="124e-12" port="dffr.Q" clock="C"/>
+                </pb_type> 
+                <interconnect>
+                  <direct name="direct1" input="ff.D" output="dffr.D"/>
+                  <direct name="direct2" input="ff.C" output="dffr.C"/>
+                  <direct name="direct3" input="ff.R" output="dffr.R"/>
+                  <direct name="direct4" input="dffr.Q" output="ff.Q"/>
+                </interconnect>
+              </mode>
+              <mode name="dffrn">
+                <pb_type name="dffrn" blif_model=".subckt dffrn" num_pb="1">
+                  <input name="D" num_pins="1" port_class="D"/>
+                  <input name="RN" num_pins="1"/>
+                  <output name="Q" num_pins="1" port_class="Q"/>
+                  <clock name="C" num_pins="1" port_class="clock"/>
+                  <T_setup value="66e-12" port="dffrn.D" clock="C"/>
+                  <T_setup value="66e-12" port="dffrn.RN" clock="C"/>
+                  <T_clock_to_Q max="124e-12" port="dffrn.Q" clock="C"/>
+                </pb_type> 
+                <interconnect>
+                  <direct name="direct1" input="ff.D" output="dffrn.D"/>
+                  <direct name="direct2" input="ff.C" output="dffrn.C"/>
+                  <direct name="direct3" input="ff.R" output="dffrn.RN"/>
+                  <direct name="direct4" input="dffrn.Q" output="ff.Q"/>
+                </interconnect>
+              </mode>
+            </pb_type> 
+            <interconnect>
+              <direct name="direct1" input="ble4.in" output="lut4[0:0].in"/>
+              <direct name="direct2" input="lut4.out" output="ff.D">
+                <!-- Advanced user option that tells CAD tool to find LUT+FF pairs in netlist -->
+                <pack_pattern name="ble4" in_port="lut4.out" out_port="ff.D"/>
+              </direct>
+              <direct name="direct3" input="ble4.clk" output="ff.C"/>
+              <direct name="direct4" input="ble4.reset" output="ff.R"/>
+              <mux name="mux1" input="ff.Q lut4.out" output="ble4.out">
+                <!-- LUT to output is faster than FF to output on a Stratix IV -->
+                <delay_constant max="25e-12" in_port="lut4.out" out_port="ble4.out"/>
+                <delay_constant max="45e-12" in_port="ff.Q" out_port="ble4.out"/>
+              </mux>
+            </interconnect>
+          </pb_type>
+          <interconnect>
+            <direct name="direct1" input="fle.in" output="ble4.in"/>
+            <direct name="direct2" input="ble4.out" output="fle.out[0:0]"/>
+            <direct name="direct3" input="fle.clk" output="ble4.clk"/>
+            <direct name="direct4" input="fle.reset" output="ble4.reset"/>
+          </interconnect>
+        </mode>
+        <!-- 4-LUT mode definition end -->
+      </pb_type>
+      <interconnect>
+        <!-- We use a full crossbar to get logical equivalence at inputs of CLB 
+		     The delays below come from Stratix IV. The delay through a connection block
+		     input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps 
+		     delay on the connection block input mux (modeled by Ian Kuon), so the remaining
+		     delay within the crossbar is 95 ps. 
+		     The delay of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT.
+		     Since all our LUT outputs go to a BLE output, and have a delay of 
+		     25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback
+		     to get the part that should be marked on the crossbar.	 -->
+        <complete name="crossbar" input="clb.I fle[3:0].out clb.reset" output="fle[3:0].in">
+          <delay_constant max="95e-12" in_port="clb.I clb.reset" out_port="fle[3:0].in"/>
+          <delay_constant max="75e-12" in_port="fle[3:0].out" out_port="fle[3:0].in"/>
+        </complete>
+        <complete name="clks" input="clb.clk" output="fle[3:0].clk">
+        </complete>
+        <complete name="resets" input="clb.reset" output="fle[3:0].reset">
+        </complete>
+        <!-- This way of specifying direct connection to clb outputs is important because this architecture uses automatic spreading of opins.  
+               By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs, 
+               then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
+               naive specification).
+          -->
+        <direct name="clbouts1" input="fle[3:0].out[0:0]" output="clb.O[3:0]"/>
+        <direct name="clbouts2" input="fle[3:0].out[1:1]" output="clb.O[7:4]"/>
+      </interconnect>
+      <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin is driven by 10% of the tracks in a channel -->
+      <!-- Place this general purpose logic block in any unspecified column -->
+    </pb_type>
+    <!-- Define general purpose logic block (CLB) ends -->
+  </complexblocklist>
+</architecture>
 @ -1 +1 @@
 .1.489
 .1.525