#include #include #include #include #include "util.h" #include "physical_types.h" #include "vpr_types.h" #include "globals.h" #include "route_export.h" #include "route_common.h" #include "cluster_legality.h" #include "cluster_placement.h" #include "rr_graph.h" static t_chunk rr_mem_ch = {NULL, 0, NULL}; /*static struct s_linked_vptr *rr_mem_chunk_list_head = NULL; static int chunk_bytes_avail = 0; static char *chunk_next_avail_mem = NULL;*/ static struct s_trace **best_routing; /* nets_in_cluster: array of all nets contained in the cluster */ static int *nets_in_cluster; /* [0..num_nets_in_cluster-1] */ static int num_nets_in_cluster; static int saved_num_nets_in_cluster; static int curr_cluster_index; static int ext_input_rr_node_index, ext_output_rr_node_index, ext_clock_rr_node_index, max_ext_index; static int **saved_net_rr_terminals; static float pres_fac; /********************* Subroutines local to this module *********************/ static boolean is_net_in_cluster(INP int inet); static void add_net_rr_terminal_cluster(int iblk_net, t_pb_graph_node * primitive, int ilogical_block, t_model_ports * model_port, int ipin); static boolean breadth_first_route_net_cluster(int inet); static void breadth_first_expand_trace_segment_cluster( struct s_trace *start_ptr, int remaining_connections_to_sink); static void breadth_first_expand_neighbours_cluster(int inode, float pcost, int inet, boolean first_time); static void breadth_first_add_source_to_heap_cluster(int inet); static void alloc_net_rr_terminals_cluster(void); static void mark_ends_cluster(int inet); static float rr_node_intrinsic_cost(int inode); /************************ Subroutine definitions ****************************/ static boolean is_net_in_cluster(INP int inet) { int i; for (i = 0; i < num_nets_in_cluster; i++) { if (nets_in_cluster[i] == inet) { return TRUE; } } return FALSE; } /* load rr_node for source and sinks of net if exists, return FALSE otherwise */ /* Todo: Note this is an inefficient way to determine port, better to use a lookup, worry about this if runtime becomes an issue */ static void add_net_rr_terminal_cluster(int iblk_net, t_pb_graph_node * primitive, int ilogical_block, t_model_ports * model_port, int ipin) { /* Ensure at most one external input/clock source and one external output sink for net */ int i, net_pin; t_port *prim_port; const t_pb_type *pb_type; boolean found; int input_port; int output_port; int clock_port; input_port = output_port = clock_port = 0; pb_type = primitive->pb_type; prim_port = NULL; assert(pb_type->num_modes == 0); found = FALSE; /* TODO: This is inelegant design, I should change the primitive ports in pb_type to be input, output, or clock instead of this lookup */ for (i = 0; i < pb_type->num_ports && !found; i++) { prim_port = &pb_type->ports[i]; if (pb_type->ports[i].model_port == model_port) { found = TRUE; } else { if (prim_port->is_clock) { clock_port++; assert(prim_port->type == IN_PORT); } else if (prim_port->type == IN_PORT) { input_port++; } else if (prim_port->type == OUT_PORT) { output_port++; } else { assert(0); } } } assert(found); assert(ipin < prim_port->num_pins); net_pin = OPEN; if (prim_port->is_clock) { for (i = 1; i <= vpack_net[iblk_net].num_sinks; i++) { if (vpack_net[iblk_net].node_block[i] == ilogical_block && vpack_net[iblk_net].node_block_port[i] == model_port->index && vpack_net[iblk_net].node_block_pin[i] == ipin) { net_pin = i; break; } } assert(net_pin != OPEN); assert(rr_node[primitive->clock_pins[clock_port][ipin].pin_count_in_cluster].num_edges == 1); net_rr_terminals[iblk_net][net_pin] = rr_node[primitive->clock_pins[clock_port][ipin].pin_count_in_cluster].edges[0]; } else if (prim_port->type == IN_PORT) { for (i = 1; i <= vpack_net[iblk_net].num_sinks; i++) { if (vpack_net[iblk_net].node_block[i] == ilogical_block && vpack_net[iblk_net].node_block_port[i] == model_port->index && vpack_net[iblk_net].node_block_pin[i] == ipin) { net_pin = i; break; } } assert(net_pin != OPEN); assert(rr_node[primitive->input_pins[input_port][ipin].pin_count_in_cluster].num_edges == 1); net_rr_terminals[iblk_net][net_pin] = rr_node[primitive->input_pins[input_port][ipin].pin_count_in_cluster].edges[0]; } else if (prim_port->type == OUT_PORT) { i = 0; if (vpack_net[iblk_net].node_block[i] == ilogical_block && vpack_net[iblk_net].node_block_port[i] == model_port->index && vpack_net[iblk_net].node_block_pin[i] == ipin) { net_pin = i; } assert(net_pin != OPEN); net_rr_terminals[iblk_net][net_pin] = primitive->output_pins[output_port][ipin].pin_count_in_cluster; } else { assert(0); } } void reload_ext_net_rr_terminal_cluster(void) { int i, j, net_index; boolean has_ext_sink, has_ext_source; int curr_ext_output, curr_ext_input, curr_ext_clock; curr_ext_input = ext_input_rr_node_index; curr_ext_output = ext_output_rr_node_index; curr_ext_clock = ext_clock_rr_node_index; for (i = 0; i < num_nets_in_cluster; i++) { net_index = nets_in_cluster[i]; has_ext_sink = FALSE; has_ext_source = (boolean) (logical_block[vpack_net[net_index].node_block[0]].clb_index != curr_cluster_index); if (has_ext_source) { /* Instantiate a source of this net */ if (vpack_net[net_index].is_global) { net_rr_terminals[net_index][0] = curr_ext_clock; curr_ext_clock++; } else { net_rr_terminals[net_index][0] = curr_ext_input; curr_ext_input++; } } for (j = 1; j <= vpack_net[net_index].num_sinks; j++) { if (logical_block[vpack_net[net_index].node_block[j]].clb_index != curr_cluster_index) { if (has_ext_sink || has_ext_source) { /* Only need one node driving external routing, either this cluster drives external routing or another cluster does it */ net_rr_terminals[net_index][j] = OPEN; } else { /* External sink, only need to route once, externally routing will take care of the rest */ net_rr_terminals[net_index][j] = curr_ext_output; curr_ext_output++; has_ext_sink = TRUE; } } } if (curr_ext_input > ext_output_rr_node_index || curr_ext_output > ext_clock_rr_node_index || curr_ext_clock > max_ext_index) { /* failed, not enough pins of proper type, overran index */ assert(0); } } } void alloc_and_load_cluster_legality_checker(void) { best_routing = (struct s_trace **) my_calloc(num_logical_nets, sizeof(struct s_trace *)); nets_in_cluster = (int *) my_malloc(num_logical_nets * sizeof(int)); num_nets_in_cluster = 0; num_nets = num_logical_nets; /* inside a cluster, I do not consider rr_indexed_data cost, set to 1 since other costs are multiplied by it */ num_rr_indexed_data = 1; rr_indexed_data = (t_rr_indexed_data *) my_calloc(1, sizeof(t_rr_indexed_data)); rr_indexed_data[0].base_cost = 1; /* alloc routing structures */ alloc_route_static_structs(); alloc_net_rr_terminals_cluster(); } void free_cluster_legality_checker(void) { int inet; free(best_routing); free(rr_indexed_data); free_rr_node_route_structs(); free_route_structs(); free_trace_structs(); free_chunk_memory(&rr_mem_ch); for (inet = 0; inet < num_logical_nets; inet++) { free(saved_net_rr_terminals[inet]); } free(net_rr_terminals); free(nets_in_cluster); free(saved_net_rr_terminals); } void alloc_and_load_rr_graph_for_pb_graph_node( INP t_pb_graph_node *pb_graph_node, INP const t_arch* arch, int mode) { int i, j, k, index; boolean is_primitive; is_primitive = (boolean) (pb_graph_node->pb_type->num_modes == 0); for (i = 0; i < pb_graph_node->num_input_ports; i++) { for (j = 0; j < pb_graph_node->num_input_pins[i]; j++) { index = pb_graph_node->input_pins[i][j].pin_count_in_cluster; rr_node[index].pb_graph_pin = &pb_graph_node->input_pins[i][j]; rr_node[index].fan_in = pb_graph_node->input_pins[i][j].num_input_edges; rr_node[index].num_edges = pb_graph_node->input_pins[i][j].num_output_edges; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5 + ((float)j/(float)pb_graph_node->num_input_pins[i])/(float)10; /* need to normalize better than 5 and 10, bias router to use earlier inputs pins */ rr_node[index].edges = (int *) my_malloc( rr_node[index].num_edges * sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(short)); rr_node[index].net_num = OPEN; rr_node[index].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[index].prev_node = OPEN; rr_node[index].prev_edge = OPEN; if (mode == 0) { /* default mode is the first mode */ rr_node[index].capacity = 1; } else { rr_node[index].capacity = 0; } for (k = 0; k < pb_graph_node->input_pins[i][j].num_output_edges; k++) { /* TODO: Intention was to do bus-based implementation here */ rr_node[index].edges[k] = pb_graph_node->input_pins[i][j].output_edges[k]->output_pins[0]->pin_count_in_cluster; rr_node[index].switches[k] = arch->num_switches - 1; /* last switch in arch switch properties is a delayless switch */ assert( pb_graph_node->input_pins[i][j].output_edges[k]->num_output_pins == 1); } rr_node[index].type = INTRA_CLUSTER_EDGE; if (is_primitive) { /* This is a terminating pin, add SINK node */ assert(rr_node[index].num_edges == 0); rr_node[index].num_edges = 1; rr_node[index].edges = (int *) my_calloc(rr_node[index].num_edges, sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(short)); rr_node[index].edges[0] = num_rr_nodes; /* Create SINK node */ rr_node[num_rr_nodes].pb_graph_pin = NULL; rr_node[num_rr_nodes].fan_in = 1; rr_node[num_rr_nodes].num_edges = 0; rr_node[num_rr_nodes].pack_intrinsic_cost = 1; rr_node[num_rr_nodes].edges = NULL; rr_node[num_rr_nodes].switches = NULL; rr_node[num_rr_nodes].net_num = OPEN; rr_node[num_rr_nodes].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[num_rr_nodes].prev_node = OPEN; rr_node[num_rr_nodes].prev_edge = OPEN; rr_node[num_rr_nodes].capacity = 1; rr_node[num_rr_nodes].type = SINK; num_rr_nodes++; if(pb_graph_node->pb_type->class_type == LUT_CLASS) { /* LUTs are special, they have logical equivalence at inputs, logical equivalence is represented by a single high capacity sink instead of multiple single capacity sinks */ rr_node[num_rr_nodes - 1].capacity = pb_graph_node->num_input_pins[i]; if(j != 0) { num_rr_nodes--; rr_node[index].edges[0] = num_rr_nodes - 1; } } } } } for (i = 0; i < pb_graph_node->num_output_ports; i++) { for (j = 0; j < pb_graph_node->num_output_pins[i]; j++) { index = pb_graph_node->output_pins[i][j].pin_count_in_cluster; rr_node[index].pb_graph_pin = &pb_graph_node->output_pins[i][j]; rr_node[index].fan_in = pb_graph_node->output_pins[i][j].num_input_edges; rr_node[index].num_edges = pb_graph_node->output_pins[i][j].num_output_edges; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5; /* need to normalize better than 5 */ rr_node[index].edges = (int *) my_malloc( rr_node[index].num_edges * sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(short)); rr_node[index].net_num = OPEN; rr_node[index].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[index].prev_node = OPEN; rr_node[index].prev_edge = OPEN; if (mode == 0) { /* Default mode is the first mode */ rr_node[index].capacity = 1; } else { rr_node[index].capacity = 0; } for (k = 0; k < pb_graph_node->output_pins[i][j].num_output_edges; k++) { /* TODO: Intention was to do bus-based implementation here */ rr_node[index].edges[k] = pb_graph_node->output_pins[i][j].output_edges[k]->output_pins[0]->pin_count_in_cluster; rr_node[index].switches[k] = arch->num_switches - 1; assert( pb_graph_node->output_pins[i][j].output_edges[k]->num_output_pins == 1); } rr_node[index].type = INTRA_CLUSTER_EDGE; if (is_primitive) { rr_node[index].type = SOURCE; } } } for (i = 0; i < pb_graph_node->num_clock_ports; i++) { for (j = 0; j < pb_graph_node->num_clock_pins[i]; j++) { index = pb_graph_node->clock_pins[i][j].pin_count_in_cluster; rr_node[index].pb_graph_pin = &pb_graph_node->clock_pins[i][j]; rr_node[index].fan_in = pb_graph_node->clock_pins[i][j].num_input_edges; rr_node[index].num_edges = pb_graph_node->clock_pins[i][j].num_output_edges; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5; /* need to normalize better than 5 */ rr_node[index].edges = (int *) my_malloc( rr_node[index].num_edges * sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(short)); rr_node[index].net_num = OPEN; rr_node[index].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[index].prev_node = OPEN; rr_node[index].prev_edge = OPEN; if (mode == 0) { /* default mode is the first mode (useful for routing */ rr_node[index].capacity = 1; } else { rr_node[index].capacity = 0; } for (k = 0; k < pb_graph_node->clock_pins[i][j].num_output_edges; k++) { /* TODO: Intention was to do bus-based implementation here */ rr_node[index].edges[k] = pb_graph_node->clock_pins[i][j].output_edges[k]->output_pins[0]->pin_count_in_cluster; rr_node[index].switches[k] = arch->num_switches - 1; assert( pb_graph_node->clock_pins[i][j].output_edges[k]->num_output_pins == 1); } rr_node[index].type = INTRA_CLUSTER_EDGE; if (is_primitive) { /* This is a terminating pin, add SINK node */ assert(rr_node[index].num_edges == 0); rr_node[index].num_edges = 1; rr_node[index].edges = (int *) my_calloc(rr_node[index].num_edges, sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(short)); rr_node[index].edges[0] = num_rr_nodes; /* Create SINK node */ rr_node[num_rr_nodes].pb_graph_pin = NULL; rr_node[num_rr_nodes].fan_in = 1; rr_node[num_rr_nodes].num_edges = 0; rr_node[num_rr_nodes].pack_intrinsic_cost = 1; rr_node[num_rr_nodes].edges = NULL; rr_node[num_rr_nodes].switches = NULL; rr_node[num_rr_nodes].net_num = OPEN; rr_node[num_rr_nodes].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[num_rr_nodes].prev_node = OPEN; rr_node[num_rr_nodes].prev_edge = OPEN; rr_node[num_rr_nodes].capacity = 1; rr_node[num_rr_nodes].type = SINK; num_rr_nodes++; } } } for (i = 0; i < pb_graph_node->pb_type->num_modes; i++) { /* Xifan Tang: we DO NOT build the rr_graph for those modes are disabled in packing */ /* if (TRUE == pb_graph_node->pb_type->modes[i].disabled_in_packing) { continue; } */ for (j = 0; j < pb_graph_node->pb_type->modes[i].num_pb_type_children; j++) { for (k = 0; k < pb_graph_node->pb_type->modes[i].pb_type_children[j].num_pb; k++) { alloc_and_load_rr_graph_for_pb_graph_node( &pb_graph_node->child_pb_graph_nodes[i][j][k], arch, i); } } } } void alloc_and_load_legalizer_for_cluster(INP t_block* clb, INP int clb_index, INP const t_arch *arch) { /** * Structure: Model external routing and internal routing * * 1. Model external routing * num input pins == num external sources for input pins, fully connect them to input pins (simulates external routing) * num output pins == num external sinks for output pins, fully connect them to output pins (simulates external routing) * num clock pins == num external sources for clock pins, fully connect them to clock pins (simulates external routing) * 2. Model internal routing * */ /* make each rr_node one correspond with pin and correspond with pin's index pin_count_in_cluster */ int i, j, k, m, index, pb_graph_rr_index; int count_pins; t_pb_type * pb_type; t_pb_graph_node *pb_graph_node; int ipin; /* Create rr_graph */ pb_type = clb->type->pb_type; pb_graph_node = clb->type->pb_graph_head; num_rr_nodes = pb_graph_node->total_pb_pins + pb_type->num_input_pins + pb_type->num_output_pins + pb_type->num_clock_pins; /* allocate memory for rr_node resources + additional memory for any additional sources/sinks, 2x is an overallocation but guarantees that there will be enough sources/sinks available */ rr_node = (t_rr_node *) my_calloc(num_rr_nodes * 2, sizeof(t_rr_node)); clb->pb->rr_graph = rr_node; alloc_and_load_rr_graph_for_pb_graph_node(pb_graph_node, arch, 0); curr_cluster_index = clb_index; /* Alloc and load rr_graph external sources and sinks */ ext_input_rr_node_index = pb_graph_node->total_pb_pins; ext_output_rr_node_index = pb_type->num_input_pins + pb_graph_node->total_pb_pins; ext_clock_rr_node_index = pb_type->num_input_pins + pb_type->num_output_pins + pb_graph_node->total_pb_pins; max_ext_index = pb_type->num_input_pins + pb_type->num_output_pins + pb_type->num_clock_pins + pb_graph_node->total_pb_pins; for (i = 0; i < pb_type->num_input_pins; i++) { index = i + pb_graph_node->total_pb_pins; rr_node[index].type = SOURCE; rr_node[index].fan_in = 0; rr_node[index].num_edges = pb_type->num_input_pins; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5; /* need to normalize better than 5 */ rr_node[index].edges = (int *) my_malloc( rr_node[index].num_edges * sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(int)); rr_node[index].capacity = 1; } for (i = 0; i < pb_type->num_output_pins; i++) { index = i + pb_type->num_input_pins + pb_graph_node->total_pb_pins; rr_node[index].type = SINK; rr_node[index].fan_in = pb_type->num_output_pins; rr_node[index].num_edges = 0; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5; /* need to normalize better than 5 */ rr_node[index].capacity = 1; } for (i = 0; i < pb_type->num_clock_pins; i++) { index = i + pb_type->num_input_pins + pb_type->num_output_pins + pb_graph_node->total_pb_pins; rr_node[index].type = SOURCE; rr_node[index].fan_in = 0; rr_node[index].num_edges = pb_type->num_clock_pins; rr_node[index].pack_intrinsic_cost = 1 + (float) rr_node[index].num_edges / 5; /* need to normalize better than 5 */ rr_node[index].edges = (int *) my_malloc( rr_node[index].num_edges * sizeof(int)); rr_node[index].switches = (short *) my_calloc(rr_node[index].num_edges, sizeof(int)); rr_node[index].capacity = 1; } ipin = 0; for (i = 0; i < pb_graph_node->num_input_ports; i++) { for (j = 0; j < pb_graph_node->num_input_pins[i]; j++) { pb_graph_rr_index = pb_graph_node->input_pins[i][j].pin_count_in_cluster; for (k = 0; k < pb_type->num_input_pins; k++) { index = k + pb_graph_node->total_pb_pins; rr_node[index].edges[ipin] = pb_graph_rr_index; rr_node[index].switches[ipin] = arch->num_switches - 1; } rr_node[pb_graph_rr_index].pack_intrinsic_cost = MAX_SHORT; /* using an input pin should be made costly */ ipin++; } } /* Must attach output pins to input pins because if a connection cannot fit using intra-cluster routing, it can also use external routing */ for (i = 0; i < pb_graph_node->num_output_ports; i++) { for (j = 0; j < pb_graph_node->num_output_pins[i]; j++) { count_pins = pb_graph_node->output_pins[i][j].num_output_edges + pb_type->num_output_pins + pb_type->num_input_pins; pb_graph_rr_index = pb_graph_node->output_pins[i][j].pin_count_in_cluster; rr_node[pb_graph_rr_index].edges = (int *) my_realloc( rr_node[pb_graph_rr_index].edges, (count_pins) * sizeof(int)); rr_node[pb_graph_rr_index].switches = (short *) my_realloc( rr_node[pb_graph_rr_index].switches, (count_pins) * sizeof(int)); ipin = 0; for (k = 0; k < pb_graph_node->num_input_ports; k++) { for (m = 0; m < pb_graph_node->num_input_pins[k]; m++) { index = pb_graph_node->input_pins[k][m].pin_count_in_cluster; rr_node[pb_graph_rr_index].edges[ipin + pb_graph_node->output_pins[i][j].num_output_edges] = index; rr_node[pb_graph_rr_index].switches[ipin + pb_graph_node->output_pins[i][j].num_output_edges] = arch->num_switches - 1; ipin++; } } for (k = 0; k < pb_type->num_output_pins; k++) { index = k + pb_type->num_input_pins + pb_graph_node->total_pb_pins; rr_node[pb_graph_rr_index].edges[k + pb_type->num_input_pins + pb_graph_node->output_pins[i][j].num_output_edges] = index; rr_node[pb_graph_rr_index].switches[k + pb_type->num_input_pins + pb_graph_node->output_pins[i][j].num_output_edges] = arch->num_switches - 1; } rr_node[pb_graph_rr_index].num_edges += pb_type->num_output_pins + pb_type->num_input_pins; rr_node[pb_graph_rr_index].pack_intrinsic_cost = 1 + (float) rr_node[pb_graph_rr_index].num_edges / 5; /* need to normalize better than 5 */ } } ipin = 0; for (i = 0; i < pb_graph_node->num_clock_ports; i++) { for (j = 0; j < pb_graph_node->num_clock_pins[i]; j++) { for (k = 0; k < pb_type->num_clock_pins; k++) { index = k + pb_type->num_input_pins + pb_type->num_output_pins + pb_graph_node->total_pb_pins; pb_graph_rr_index = pb_graph_node->clock_pins[i][j].pin_count_in_cluster; rr_node[index].edges[ipin] = pb_graph_rr_index; rr_node[index].switches[ipin] = arch->num_switches - 1; } ipin++; } } alloc_and_load_rr_node_route_structs(); num_nets_in_cluster = 0; } void free_legalizer_for_cluster(INP t_block* clb, boolean free_local_rr_graph) { int i; free_rr_node_route_structs(); if(free_local_rr_graph == TRUE) { for (i = 0; i < num_rr_nodes; i++) { if (clb->pb->rr_graph[i].edges != NULL) { free(clb->pb->rr_graph[i].edges); } if (clb->pb->rr_graph[i].switches != NULL) { free(clb->pb->rr_graph[i].switches); } } free(clb->pb->rr_graph); } } void reset_legalizer_for_cluster(t_block *clb) { int i; for (i = 0; i < num_nets_in_cluster; i++) { free_traceback(nets_in_cluster[i]); trace_head[nets_in_cluster[i]] = best_routing[nets_in_cluster[i]]; free_traceback(nets_in_cluster[i]); best_routing[nets_in_cluster[i]] = NULL; } free_rr_node_route_structs(); num_nets_in_cluster = 0; saved_num_nets_in_cluster = 0; } /** * * internal_nets: index of nets to route [0..num_internal_nets - 1] */ boolean try_breadth_first_route_cluster(void) { /* Iterated maze router ala Pathfinder Negotiated Congestion algorithm, * * (FPGA 95 p. 111). Returns TRUE if it can route this FPGA, FALSE if * * it can't. */ /* For different modes, when a mode is turned on, I set the max occupancy of all rr_nodes in the mode to 1 and all others to 0 */ /* TODO: There is a bug for route-throughs where edges in route-throughs do not get turned off because the rr_edge is in a particular mode but the two rr_nodes are outside */ boolean success, is_routable; int itry, inet, net_index; struct s_router_opts router_opts; /* Xifan TANG: Count runtime for routing in packing stage */ clock_t begin, end; begin = clock(); /* Usually the first iteration uses a very small (or 0) pres_fac to find * * the shortest path and get a congestion map. For fast compiles, I set * * pres_fac high even for the first iteration. */ /* sets up a fast breadth-first router */ router_opts.first_iter_pres_fac = 10; router_opts.max_router_iterations = 20; router_opts.initial_pres_fac = 10; router_opts.pres_fac_mult = 2; router_opts.acc_fac = 1; reset_rr_node_route_structs(); /* Clear all prior rr_graph history */ pres_fac = router_opts.first_iter_pres_fac; for (itry = 1; itry <= router_opts.max_router_iterations; itry++) { for (inet = 0; inet < num_nets_in_cluster; inet++) { net_index = nets_in_cluster[inet]; pathfinder_update_one_cost(trace_head[net_index], -1, pres_fac); is_routable = breadth_first_route_net_cluster(net_index); /* Impossible to route? (disconnected rr_graph) */ if (!is_routable) { /* TODO: Inelegant, can be more intelligent */ vpr_printf(TIO_MESSAGE_INFO, "Failed routing net %s\n", vpack_net[net_index].name); vpr_printf(TIO_MESSAGE_INFO, "Routing failed. Disconnected rr_graph.\n"); return FALSE; } pathfinder_update_one_cost(trace_head[net_index], 1, pres_fac); } success = feasible_routing(); if (success) { /* End of packing routing */ end = clock(); /* accumulate the runtime for pack routing */ #ifdef CLOCKS_PER_SEC pack_route_time += (float)(end - begin)/ CLOCKS_PER_SEC; #else pack_route_time += (float)(end - begin)/ CLK_PER_SEC; #endif /* vpr_printf(TIO_MESSAGE_INFO, "Updated: Packing routing took %g seconds\n", pack_route_time); */ return (TRUE); } if (itry == 1) pres_fac = router_opts.initial_pres_fac; else pres_fac *= router_opts.pres_fac_mult; pres_fac = std::min(pres_fac, static_cast(HUGE_POSITIVE_FLOAT / 1e5)); pathfinder_update_cost(pres_fac, router_opts.acc_fac); } /* End of packing routing */ end = clock(); /* accumulate the runtime for pack routing */ #ifdef CLOCKS_PER_SEC pack_route_time += (float)(end - begin)/ CLOCKS_PER_SEC; #else pack_route_time += (float)(end - begin)/ CLK_PER_SEC; #endif /* vpr_printf(TIO_MESSAGE_INFO, "Updated: Packing routing took %g seconds\n", pack_route_time); */ return (FALSE); } static boolean breadth_first_route_net_cluster(int inet) { /* Uses a maze routing (Dijkstra's) algorithm to route a net. The net * * begins at the net output, and expands outward until it hits a target * * pin. The algorithm is then restarted with the entire first wire segment * * included as part of the source this time. For an n-pin net, the maze * * router is invoked n-1 times to complete all the connections. Inet is * * the index of the net to be routed. * * If this routine finds that a net *cannot* be connected (due to a complete * * lack of potential paths, rather than congestion), it returns FALSE, as * * routing is impossible on this architecture. Otherwise it returns TRUE. */ int i, inode, prev_node, remaining_connections_to_sink; float pcost, new_pcost; struct s_heap *current; struct s_trace *tptr; boolean first_time; free_traceback(inet); breadth_first_add_source_to_heap_cluster(inet); mark_ends_cluster(inet); tptr = NULL; remaining_connections_to_sink = 0; for (i = 1; i <= vpack_net[inet].num_sinks; i++) { /* Need n-1 wires to connect n pins */ /* Do not connect open terminals */ if (net_rr_terminals[inet][i] == OPEN) continue; /* Expand and begin routing */ breadth_first_expand_trace_segment_cluster(tptr, remaining_connections_to_sink); current = get_heap_head(); if (current == NULL) { /* Infeasible routing. No possible path for net. */ reset_path_costs(); /* Clean up before leaving. */ return (FALSE); } inode = current->index; while (rr_node_route_inf[inode].target_flag == 0) { pcost = rr_node_route_inf[inode].path_cost; new_pcost = current->cost; if (pcost > new_pcost) { /* New path is lowest cost. */ rr_node_route_inf[inode].path_cost = new_pcost; prev_node = current->u.prev_node; rr_node_route_inf[inode].prev_node = prev_node; rr_node_route_inf[inode].prev_edge = current->prev_edge; first_time = FALSE; if (pcost > 0.99 * HUGE_POSITIVE_FLOAT) /* First time touched. */{ add_to_mod_list(&rr_node_route_inf[inode].path_cost); first_time = TRUE; } breadth_first_expand_neighbours_cluster(inode, new_pcost, inet, first_time); } free_heap_data(current); current = get_heap_head(); if (current == NULL) { /* Impossible routing. No path for net. */ reset_path_costs(); return (FALSE); } inode = current->index; } rr_node_route_inf[inode].target_flag--; /* Connected to this SINK. */ remaining_connections_to_sink = rr_node_route_inf[inode].target_flag; tptr = update_traceback(current, inet); free_heap_data(current); } empty_heap(); reset_path_costs(); return (TRUE); } static void breadth_first_expand_trace_segment_cluster( struct s_trace *start_ptr, int remaining_connections_to_sink) { /* Adds all the rr_nodes in the traceback segment starting at tptr (and * * continuing to the end of the traceback) to the heap with a cost of zero. * * This allows expansion to begin from the existing wiring. The * * remaining_connections_to_sink value is 0 if the route segment ending * * at this location is the last one to connect to the SINK ending the route * * segment. This is the usual case. If it is not the last connection this * * net must make to this SINK, I have a hack to ensure the next connection * * to this SINK goes through a different IPIN. Without this hack, the * * router would always put all the connections from this net to this SINK * * through the same IPIN. With LUTs or cluster-based logic blocks, you * * should never have a net connecting to two logically-equivalent pins on * * the same logic block, so the hack will never execute. If your logic * * block is an and-gate, however, nets might connect to two and-inputs on * * the same logic block, and since the and-inputs are logically-equivalent, * * this means two connections to the same SINK. */ struct s_trace *tptr, *next_ptr; int inode, sink_node, last_ipin_node; tptr = start_ptr; if (remaining_connections_to_sink == 0) { /* Usual case. */ while (tptr != NULL) { node_to_heap(tptr->index, 0., NO_PREVIOUS, NO_PREVIOUS, OPEN, OPEN); tptr = tptr->next; } } else { /* This case never executes for most logic blocks. */ /* Weird case. Lots of hacks. The cleanest way to do this would be to empty * * the heap, update the congestion due to the partially-completed route, put * * the whole route so far (excluding IPINs and SINKs) on the heap with cost * * 0., and expand till you hit the next SINK. That would be slow, so I * * do some hacks to enable incremental wavefront expansion instead. */ if (tptr == NULL) return; /* No route yet */ next_ptr = tptr->next; last_ipin_node = OPEN; /* Stops compiler from complaining. */ /* Can't put last SINK on heap with NO_PREVIOUS, etc, since that won't let * * us reach it again. Instead, leave the last traceback element (SINK) off * * the heap. */ while (next_ptr != NULL) { inode = tptr->index; node_to_heap(inode, 0., NO_PREVIOUS, NO_PREVIOUS, OPEN, OPEN); if (rr_node[inode].type == INTRA_CLUSTER_EDGE) { if(rr_node[inode].pb_graph_pin != NULL && rr_node[inode].pb_graph_pin->num_output_edges == 0) { last_ipin_node = inode; } } tptr = next_ptr; next_ptr = tptr->next; } /* This will stop the IPIN node used to get to this SINK from being * * reexpanded for the remainder of this net's routing. This will make us * * hook up more IPINs to this SINK (which is what we want). If IPIN * * doglegs are allowed in the graph, we won't be able to use this IPIN to * * do a dogleg, since it won't be re-expanded. Shouldn't be a big problem. */ assert(last_ipin_node != OPEN); rr_node_route_inf[last_ipin_node].path_cost = -HUGE_POSITIVE_FLOAT; /* Also need to mark the SINK as having high cost, so another connection can * * be made to it. */ sink_node = tptr->index; rr_node_route_inf[sink_node].path_cost = HUGE_POSITIVE_FLOAT; /* Finally, I need to remove any pending connections to this SINK via the * * IPIN I just used (since they would result in congestion). Scan through * * the heap to do this. */ invalidate_heap_entries(sink_node, last_ipin_node); } } static void breadth_first_expand_neighbours_cluster(int inode, float pcost, int inet, boolean first_time) { /* Puts all the rr_nodes adjacent to inode on the heap. rr_nodes outside * * the expanded bounding box specified in route_bb are not added to the * * heap. pcost is the path_cost to get to inode. */ int iconn, to_node, num_edges; float tot_cost; num_edges = rr_node[inode].num_edges; for (iconn = 0; iconn < num_edges; iconn++) { to_node = rr_node[inode].edges[iconn]; /* Xifan Tang: SHOULD BE FIXED THOROUGHLY!!! * Here, I just bypass all the edges that belongs a mode that is disabled in packing */ if ( (NULL != rr_node[to_node].pb_graph_pin) && (NULL != rr_node[to_node].pb_graph_pin->parent_node->pb_type->parent_mode) && (TRUE == rr_node[to_node].pb_graph_pin->parent_node->pb_type->parent_mode->disabled_in_packing)) { continue; } /*if (first_time) { */ tot_cost = pcost + get_rr_cong_cost(to_node) * rr_node_intrinsic_cost(to_node); /* } else { tot_cost = pcost + get_rr_cong_cost(to_node); }*/ node_to_heap(to_node, tot_cost, inode, iconn, OPEN, OPEN); } } static void breadth_first_add_source_to_heap_cluster(int inet) { /* Adds the SOURCE of this net to the heap. Used to start a net's routing. */ int inode; float cost; inode = net_rr_terminals[inet][0]; /* SOURCE */ cost = get_rr_cong_cost(inode); node_to_heap(inode, cost, NO_PREVIOUS, NO_PREVIOUS, OPEN, OPEN); } static void mark_ends_cluster(int inet) { /* Mark all the SINKs of this net as targets by setting their target flags * * to the number of times the net must connect to each SINK. Note that * * this number can occassionally be greater than 1 -- think of connecting * * the same net to two inputs of an and-gate (and-gate inputs are logically * * equivalent, so both will connect to the same SINK). */ int ipin, inode; for (ipin = 1; ipin <= vpack_net[inet].num_sinks; ipin++) { inode = net_rr_terminals[inet][ipin]; if (inode == OPEN) continue; rr_node_route_inf[inode].target_flag++; assert(rr_node_route_inf[inode].target_flag > 0 && rr_node_route_inf[inode].target_flag <= rr_node[inode].capacity); } } static void alloc_net_rr_terminals_cluster(void) { int inet; net_rr_terminals = (int **) my_malloc(num_logical_nets * sizeof(int *)); saved_net_rr_terminals = (int **) my_malloc( num_logical_nets * sizeof(int *)); saved_num_nets_in_cluster = 0; for (inet = 0; inet < num_logical_nets; inet++) { net_rr_terminals[inet] = (int *) my_chunk_malloc( (vpack_net[inet].num_sinks + 1) * sizeof(int), &rr_mem_ch); saved_net_rr_terminals[inet] = (int *) my_malloc( (vpack_net[inet].num_sinks + 1) * sizeof(int)); } } void setup_intracluster_routing_for_molecule(INP t_pack_molecule *molecule, INP t_pb_graph_node **primitive_list) { /* Allocates and loads the net_rr_terminals data structure. For each net * * it stores the rr_node index of the SOURCE of the net and all the SINKs * * of the net. [0..num_logical_nets-1][0..num_pins-1]. */ int i; for (i = 0; i < get_array_size_of_molecule(molecule); i++) { if (molecule->logical_block_ptrs[i] != NULL) { setup_intracluster_routing_for_logical_block( molecule->logical_block_ptrs[i]->index, primitive_list[i]); } } reload_ext_net_rr_terminal_cluster(); } void setup_intracluster_routing_for_logical_block(INP int iblock, INP t_pb_graph_node *primitive) { /* Allocates and loads the net_rr_terminals data structure. For each net * * it stores the rr_node index of the SOURCE of the net and all the SINKs * * of the net. [0..num_logical_nets-1][0..num_pins-1]. */ int ipin, iblk_net; t_model_ports *port; assert(primitive->pb_type->num_modes == 0); /* check if primitive */ assert(logical_block[iblock].clb_index != NO_CLUSTER); /* check if primitive and block is open */ /* check if block type matches primitive type */ if (logical_block[iblock].model != primitive->pb_type->model) { /* End early, model is incompatible */ assert(0); } /* for each net of logical block, check if it is in cluster, if not add it */ /* also check if pins on primitive can fit logical block */ port = logical_block[iblock].model->inputs; while (port) { for (ipin = 0; ipin < port->size; ipin++) { if (port->is_clock) { assert(port->size == 1); iblk_net = logical_block[iblock].clock_net; } else { iblk_net = logical_block[iblock].input_nets[port->index][ipin]; } if (iblk_net == OPEN) { continue; } if (!is_net_in_cluster(iblk_net)) { nets_in_cluster[num_nets_in_cluster] = iblk_net; num_nets_in_cluster++; } add_net_rr_terminal_cluster(iblk_net, primitive, iblock, port, ipin); } port = port->next; } port = logical_block[iblock].model->outputs; while (port) { for (ipin = 0; ipin < port->size; ipin++) { iblk_net = logical_block[iblock].output_nets[port->index][ipin]; if (iblk_net == OPEN) { continue; } if (!is_net_in_cluster(iblk_net)) { nets_in_cluster[num_nets_in_cluster] = iblk_net; num_nets_in_cluster++; } add_net_rr_terminal_cluster(iblk_net, primitive, iblock, port, ipin); } port = port->next; } } void save_and_reset_routing_cluster(void) { /* This routing frees any routing currently held in best routing, * * then copies over the current routing (held in trace_head), and * * finally sets trace_head and trace_tail to all NULLs so that the * * connection to the saved routing is broken. This is necessary so * * that the next iteration of the router does not free the saved * * routing elements. Also, the routing path costs and net_rr_terminals is stripped from the * existing rr_graph so that the saved routing does not affect the graph */ int inet, i, j; struct s_trace *tempptr; saved_num_nets_in_cluster = num_nets_in_cluster; for (i = 0; i < num_nets_in_cluster; i++) { inet = nets_in_cluster[i]; for (j = 0; j <= vpack_net[inet].num_sinks; j++) { saved_net_rr_terminals[inet][j] = net_rr_terminals[inet][j]; } /* Free any previously saved routing. It is no longer best. */ /* Also Save a pointer to the current routing in best_routing. */ pathfinder_update_one_cost(trace_head[inet], -1, pres_fac); tempptr = trace_head[inet]; trace_head[inet] = best_routing[inet]; free_traceback(inet); best_routing[inet] = tempptr; /* Set the current (working) routing to NULL so the current trace * * elements won't be reused by the memory allocator. */ trace_head[inet] = NULL; trace_tail[inet] = NULL; } } void restore_routing_cluster(void) { /* Deallocates any current routing in trace_head, and replaces it with * * the routing in best_routing. Best_routing is set to NULL to show that * * it no longer points to a valid routing. NOTE: trace_tail is not * * restored -- it is set to all NULLs since it is only used in * * update_traceback. If you need trace_tail restored, modify this * * routine. Also restores the locally used opin data. */ int inet, i, j; for (i = 0; i < num_nets_in_cluster; i++) { inet = nets_in_cluster[i]; pathfinder_update_one_cost(trace_head[inet], -1, pres_fac); /* Free any current routing. */ free_traceback(inet); /* Set the current routing to the saved one. */ trace_head[inet] = best_routing[inet]; best_routing[inet] = NULL; /* No stored routing. */ /* restore net terminals */ for (j = 0; j <= vpack_net[inet].num_sinks; j++) { net_rr_terminals[inet][j] = saved_net_rr_terminals[inet][j]; } /* restore old routing */ pathfinder_update_one_cost(trace_head[inet], 1, pres_fac); } num_nets_in_cluster = saved_num_nets_in_cluster; } void save_cluster_solution(void) { /* This routine updates the occupancy and pres_cost of the rr_nodes that are * * affected by the portion of the routing of one net that starts at * * route_segment_start. If route_segment_start is trace_head[inet], the * * cost of all the nodes in the routing of net inet are updated. If * * add_or_sub is -1 the net (or net portion) is ripped up, if it is 1 the * * net is added to the routing. The size of pres_fac determines how severly * * oversubscribed rr_nodes are penalized. */ int i, j, net_index; struct s_trace *tptr, *prev; int inode; for (i = 0; i < max_ext_index; i++) { rr_node[i].net_num = OPEN; rr_node[i].vpack_net_num = OPEN; /* Xifan TANG: ensure a clear initialization */ rr_node[i].prev_edge = OPEN; rr_node[i].prev_node = OPEN; } for (i = 0; i < num_nets_in_cluster; i++) { prev = NULL; net_index = nets_in_cluster[i]; tptr = trace_head[net_index]; if (tptr == NULL) /* No routing yet. */ return; for (;;) { inode = tptr->index; rr_node[inode].net_num = net_index; if (prev != NULL) { rr_node[inode].prev_node = prev->index; for (j = 0; j < rr_node[prev->index].num_edges; j++) { if (rr_node[prev->index].edges[j] == inode) { rr_node[inode].prev_edge = j; break; } } assert(j != rr_node[prev->index].num_edges); } else { rr_node[inode].prev_node = OPEN; rr_node[inode].prev_edge = OPEN; } if (rr_node[inode].type == SINK) { tptr = tptr->next; /* Skip next segment. */ if (tptr == NULL) break; } prev = tptr; tptr = tptr->next; } /* End while loop -- did an entire traceback. */ } } boolean is_pin_open(int i) { return (boolean) (rr_node[i].occ == 0); } static float rr_node_intrinsic_cost(int inode) { /* This is a tie breaker to avoid using nodes with more edges whenever possible */ float value; value = rr_node[inode].pack_intrinsic_cost; return value; } /* turns on mode for a pb by setting capacity of its rr_nodes to 1 */ void set_pb_graph_mode(t_pb_graph_node *pb_graph_node, int mode, int isOn) { int i, j, index; int i_pb_type, i_pb_inst; const t_pb_type *pb_type; pb_type = pb_graph_node->pb_type; for (i_pb_type = 0; i_pb_type < pb_type->modes[mode].num_pb_type_children; i_pb_type++) { for (i_pb_inst = 0; i_pb_inst < pb_type->modes[mode].pb_type_children[i_pb_type].num_pb; i_pb_inst++) { for (i = 0; i < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_input_ports; i++) { for (j = 0; j < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_input_pins[i]; j++) { index = pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].input_pins[i][j].pin_count_in_cluster; rr_node[index].capacity = isOn; } } for (i = 0; i < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_output_ports; i++) { for (j = 0; j < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_output_pins[i]; j++) { index = pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].output_pins[i][j].pin_count_in_cluster; rr_node[index].capacity = isOn; } } for (i = 0; i < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_clock_ports; i++) { for (j = 0; j < pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].num_clock_pins[i]; j++) { index = pb_graph_node->child_pb_graph_nodes[mode][i_pb_type][i_pb_inst].clock_pins[i][j].pin_count_in_cluster; rr_node[index].capacity = isOn; } } } } } /* If this is done post place and route, use the cb pins determined by place-and-route rather than letting the legalizer freely determine */ void force_post_place_route_cb_input_pins(int iblock) { int i, j, k, ipin, net_index, ext_net; int pin_offset; boolean has_ext_source, success; int curr_ext_input, curr_ext_clock; t_pb_graph_node *pb_graph_node; pb_graph_node = block[iblock].pb->pb_graph_node; pin_offset = block[iblock].z * (pb_graph_node->pb_type->num_input_pins + pb_graph_node->pb_type->num_output_pins + pb_graph_node->pb_type->num_clock_pins); curr_ext_input = ext_input_rr_node_index; curr_ext_clock = ext_clock_rr_node_index; for (i = 0; i < num_nets_in_cluster; i++) { net_index = nets_in_cluster[i]; has_ext_source = (boolean) (logical_block[vpack_net[net_index].node_block[0]].clb_index != curr_cluster_index); if(has_ext_source) { ext_net = vpack_to_clb_net_mapping[net_index]; assert(ext_net != OPEN); if (vpack_net[net_index].is_global) { free(rr_node[curr_ext_clock].edges); rr_node[curr_ext_clock].edges = NULL; rr_node[curr_ext_clock].num_edges = 0; success = FALSE; ipin = 0; /* force intra-cluster net to use pins from ext route */ for(j = 0; j < pb_graph_node->num_clock_ports; j++) { for(k = 0; k < pb_graph_node->num_clock_pins[j]; k++) { if(ext_net == block[iblock].nets[ipin + pb_graph_node->pb_type->num_input_pins + pb_graph_node->pb_type->num_output_pins + pin_offset]) { success = TRUE; rr_node[curr_ext_clock].num_edges++; rr_node[curr_ext_clock].edges = (int*)my_realloc(rr_node[curr_ext_clock].edges, rr_node[curr_ext_clock].num_edges * sizeof(int)); rr_node[curr_ext_clock].edges[rr_node[curr_ext_clock].num_edges - 1] = pb_graph_node->clock_pins[j][k].pin_count_in_cluster; } ipin++; } } assert(success); curr_ext_clock++; } else { free(rr_node[curr_ext_input].edges); rr_node[curr_ext_input].edges = NULL; rr_node[curr_ext_input].num_edges = 0; success = FALSE; ipin = 0; /* force intra-cluster net to use pins from ext route */ for(j = 0; j < pb_graph_node->num_input_ports; j++) { for(k = 0; k < pb_graph_node->num_input_pins[j]; k++) { if(ext_net == block[iblock].nets[ipin + pin_offset]) { success = TRUE; rr_node[curr_ext_input].num_edges++; rr_node[curr_ext_input].edges = (int*)my_realloc(rr_node[curr_ext_input].edges, rr_node[curr_ext_input].num_edges * sizeof(int)); rr_node[curr_ext_input].edges[rr_node[curr_ext_input].num_edges - 1] = pb_graph_node->input_pins[j][k].pin_count_in_cluster; } ipin++; } } curr_ext_input++; assert(success); } } } }