2604 lines
112 KiB
C++
2604 lines
112 KiB
C++
#include <cstdio>
|
|
#include <cmath>
|
|
#include <memory>
|
|
#include <fstream>
|
|
|
|
#include "vtr_assert.h"
|
|
#include "vtr_log.h"
|
|
#include "vtr_util.h"
|
|
#include "vtr_random.h"
|
|
#include "vtr_geometry.h"
|
|
|
|
#include "vpr_types.h"
|
|
#include "vpr_error.h"
|
|
#include "vpr_utils.h"
|
|
|
|
#include "globals.h"
|
|
#include "place.h"
|
|
#include "read_place.h"
|
|
#include "draw.h"
|
|
#include "place_and_route.h"
|
|
#include "net_delay.h"
|
|
#include "timing_place_lookup.h"
|
|
#include "timing_place.h"
|
|
#include "read_xml_arch_file.h"
|
|
#include "echo_files.h"
|
|
#include "vpr_utils.h"
|
|
#include "place_macro.h"
|
|
#include "histogram.h"
|
|
#include "place_util.h"
|
|
#include "initial_placement.h"
|
|
#include "place_delay_model.h"
|
|
#include "move_transactions.h"
|
|
#include "move_utils.h"
|
|
|
|
#include "uniform_move_generator.h"
|
|
|
|
#include "PlacementDelayCalculator.h"
|
|
#include "VprTimingGraphResolver.h"
|
|
#include "timing_util.h"
|
|
#include "timing_info.h"
|
|
#include "tatum/echo_writer.hpp"
|
|
#include "tatum/TimingReporter.hpp"
|
|
|
|
using std::max;
|
|
using std::min;
|
|
|
|
/************** Types and defines local to place.c ***************************/
|
|
|
|
/* Cut off for incremental bounding box updates. *
|
|
* 4 is fastest -- I checked. */
|
|
/* To turn off incremental bounding box updates, set this to a huge value */
|
|
#define SMALL_NET 4
|
|
|
|
/* This defines the error tolerance for floating points variables used in *
|
|
* cost computation. 0.01 means that there is a 1% error tolerance. */
|
|
#define ERROR_TOL .01
|
|
|
|
/* This defines the maximum number of swap attempts before invoking the *
|
|
* once-in-a-while placement legality check as well as floating point *
|
|
* variables round-offs check. */
|
|
#define MAX_MOVES_BEFORE_RECOMPUTE 500000
|
|
|
|
/* Flags for the states of the bounding box. *
|
|
* Stored as char for memory efficiency. */
|
|
#define NOT_UPDATED_YET 'N'
|
|
#define UPDATED_ONCE 'U'
|
|
#define GOT_FROM_SCRATCH 'S'
|
|
|
|
/* For comp_cost. NORMAL means use the method that generates updateable *
|
|
* bounding boxes for speed. CHECK means compute all bounding boxes from *
|
|
* scratch using a very simple routine to allow checks of the other *
|
|
* costs. */
|
|
enum e_cost_methods {
|
|
NORMAL,
|
|
CHECK
|
|
};
|
|
|
|
struct t_placer_statistics {
|
|
double av_cost, av_bb_cost, av_timing_cost,
|
|
sum_of_squares;
|
|
int success_sum;
|
|
};
|
|
|
|
struct t_placer_costs {
|
|
//Although we do nost cost calculations with float's we
|
|
//use doubles for the accumulated costs to avoid round-off,
|
|
//particularly on large designs where the magnitude of a single
|
|
//move's delta cost is small compared to the overall cost.
|
|
double cost;
|
|
double bb_cost;
|
|
double timing_cost;
|
|
};
|
|
|
|
struct t_placer_prev_inverse_costs {
|
|
double bb_cost;
|
|
double timing_cost;
|
|
};
|
|
|
|
constexpr float INVALID_DELAY = std::numeric_limits<float>::quiet_NaN();
|
|
|
|
constexpr double MAX_INV_TIMING_COST = 1.e9;
|
|
/* Stops inverse timing cost from going to infinity with very lax timing constraints,
|
|
* which avoids multiplying by a gigantic prev_inverse.timing_cost when auto-normalizing.
|
|
* The exact value of this cost has relatively little impact, but should not be
|
|
* large enough to be on the order of timing costs for normal constraints. */
|
|
|
|
/********************** Variables local to place.c ***************************/
|
|
|
|
/* Cost of a net, and a temporary cost of a net used during move assessment. */
|
|
static vtr::vector<ClusterNetId, double> net_cost, temp_net_cost;
|
|
|
|
/* [0...cluster_ctx.clb_nlist.nets().size()-1] *
|
|
* A flag array to indicate whether the specific bounding box has been updated *
|
|
* in this particular swap or not. If it has been updated before, the code *
|
|
* must use the updated data, instead of the out-of-date data passed into the *
|
|
* subroutine, particularly used in try_swap(). The value NOT_UPDATED_YET *
|
|
* indicates that the net has not been updated before, UPDATED_ONCE indicated *
|
|
* that the net has been updated once, if it is going to be updated again, the *
|
|
* values from the previous update must be used. GOT_FROM_SCRATCH is only *
|
|
* applicable for nets larger than SMALL_NETS and it indicates that the *
|
|
* particular bounding box cannot be updated incrementally before, hence the *
|
|
* bounding box is got from scratch, so the bounding box would definitely be *
|
|
* right, DO NOT update again. */
|
|
static vtr::vector<ClusterNetId, char> bb_updated_before;
|
|
|
|
/* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. What is the value of the timing */
|
|
/* driven portion of the cost function. These arrays will be set to */
|
|
/* (criticality * delay) for each point to point connection. */
|
|
|
|
static vtr::vector<ClusterNetId, double*> point_to_point_timing_cost;
|
|
static vtr::vector<ClusterNetId, double*> temp_point_to_point_timing_cost;
|
|
|
|
/* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. What is the value of the delay */
|
|
/* for each connection in the circuit */
|
|
static vtr::vector<ClusterNetId, float*> point_to_point_delay;
|
|
static vtr::vector<ClusterNetId, float*> temp_point_to_point_delay;
|
|
|
|
/* [0..cluster_ctx.clb_nlist.blocks().size()-1][0..pins_per_clb-1]. Indicates which pin on the net */
|
|
/* this block corresponds to, this is only required during timing-driven */
|
|
/* placement. It is used to allow us to update individual connections on */
|
|
/* each net */
|
|
static vtr::vector<ClusterBlockId, std::vector<int>> net_pin_indices;
|
|
|
|
/* [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates and the number of *
|
|
* blocks on each of a net's bounding box (to allow efficient updates), *
|
|
* respectively. */
|
|
|
|
static vtr::vector<ClusterNetId, t_bb> bb_coords, bb_num_on_edges;
|
|
|
|
/* The arrays below are used to precompute the inverse of the average *
|
|
* number of tracks per channel between [subhigh] and [sublow]. Access *
|
|
* them as chan?_place_cost_fac[subhigh][sublow]. They are used to *
|
|
* speed up the computation of the cost function that takes the length *
|
|
* of the net bounding box in each dimension, divided by the average *
|
|
* number of tracks in that direction; for other cost functions they *
|
|
* will never be used. *
|
|
*/
|
|
static float** chanx_place_cost_fac; //[0...device_ctx.grid.width()-2]
|
|
static float** chany_place_cost_fac; //[0...device_ctx.grid.height()-2]
|
|
|
|
/* The following arrays are used by the try_swap function for speed. */
|
|
/* [0...cluster_ctx.clb_nlist.nets().size()-1] */
|
|
static vtr::vector<ClusterNetId, t_bb> ts_bb_coord_new, ts_bb_edge_new;
|
|
static std::vector<ClusterNetId> ts_nets_to_update;
|
|
|
|
/* These file-scoped variables keep track of the number of swaps *
|
|
* rejected, accepted or aborted. The total number of swap attempts *
|
|
* is the sum of the three number. */
|
|
static int num_swap_rejected = 0;
|
|
static int num_swap_accepted = 0;
|
|
static int num_swap_aborted = 0;
|
|
static int num_ts_called = 0;
|
|
|
|
/* Expected crossing counts for nets with different #'s of pins. From *
|
|
* ICCAD 94 pp. 690 - 695 (with linear interpolation applied by me). *
|
|
* Multiplied to bounding box of a net to better estimate wire length *
|
|
* for higher fanout nets. Each entry is the correction factor for the *
|
|
* fanout index-1 */
|
|
static const float cross_count[50] = {/* [0..49] */ 1.0, 1.0, 1.0, 1.0828, 1.1536, 1.2206, 1.2823, 1.3385, 1.3991, 1.4493, 1.4974,
|
|
1.5455, 1.5937, 1.6418, 1.6899, 1.7304, 1.7709, 1.8114, 1.8519, 1.8924,
|
|
1.9288, 1.9652, 2.0015, 2.0379, 2.0743, 2.1061, 2.1379, 2.1698, 2.2016,
|
|
2.2334, 2.2646, 2.2958, 2.3271, 2.3583, 2.3895, 2.4187, 2.4479, 2.4772,
|
|
2.5064, 2.5356, 2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148,
|
|
2.7410, 2.7671, 2.7933};
|
|
|
|
std::unique_ptr<FILE, decltype(&vtr::fclose)> f_move_stats_file(nullptr, vtr::fclose);
|
|
|
|
#ifdef VTR_ENABLE_DEBUG_LOGGING
|
|
|
|
# define LOG_MOVE_STATS_HEADER() \
|
|
do { \
|
|
if (f_move_stats_file) { \
|
|
fprintf(f_move_stats_file.get(), \
|
|
"temp,from_blk,to_blk,from_type,to_type," \
|
|
"blk_count," \
|
|
"delta_cost,delta_bb_cost,delta_td_cost," \
|
|
"outcome,reason\n"); \
|
|
} \
|
|
} while (false)
|
|
|
|
# define LOG_MOVE_STATS_PROPOSED(t, affected_blocks) \
|
|
do { \
|
|
if (f_move_stats_file) { \
|
|
auto& place_ctx = g_vpr_ctx.placement(); \
|
|
auto& cluster_ctx = g_vpr_ctx.clustering(); \
|
|
ClusterBlockId b_from = affected_blocks.moved_blocks[0].block_num; \
|
|
\
|
|
t_pl_loc to = affected_blocks.moved_blocks[0].new_loc; \
|
|
ClusterBlockId b_to = place_ctx.grid_blocks[to.x][to.y].blocks[to.z]; \
|
|
\
|
|
t_logical_block_type_ptr from_type = cluster_ctx.clb_nlist.block_type(b_from); \
|
|
t_logical_block_type_ptr to_type = nullptr; \
|
|
if (b_to) { \
|
|
to_type = cluster_ctx.clb_nlist.block_type(b_to); \
|
|
} \
|
|
\
|
|
fprintf(f_move_stats_file.get(), \
|
|
"%g," \
|
|
"%d,%d," \
|
|
"%s,%s," \
|
|
"%d,", \
|
|
t, \
|
|
int(size_t(b_from)), int(size_t(b_to)), \
|
|
from_type->name, (to_type ? to_type->name : "EMPTY"), \
|
|
affected_blocks.num_moved_blocks); \
|
|
} \
|
|
} while (false)
|
|
|
|
# define LOG_MOVE_STATS_OUTCOME(delta_cost, delta_bb_cost, delta_td_cost, \
|
|
outcome, reason) \
|
|
do { \
|
|
if (f_move_stats_file) { \
|
|
fprintf(f_move_stats_file.get(), \
|
|
"%g,%g,%g," \
|
|
"%s,%s\n", \
|
|
delta_cost, delta_bb_cost, delta_td_cost, \
|
|
outcome, reason); \
|
|
} \
|
|
} while (false)
|
|
|
|
#else
|
|
|
|
# define LOG_MOVE_STATS_HEADER() \
|
|
do { \
|
|
fprintf(f_move_stats_file.get(), \
|
|
"VTR_ENABLE_DEBUG_LOGGING disabled " \
|
|
"-- No move stats recorded\n"); \
|
|
} while (false)
|
|
|
|
# define LOG_MOVE_STATS_PROPOSED(t, blocks_affected) \
|
|
do { \
|
|
} while (false)
|
|
|
|
# define LOG_MOVE_STATS_OUTCOME(delta_cost, delta_bb_cost, delta_td_cost, \
|
|
outcome, reason) \
|
|
do { \
|
|
} while (false)
|
|
|
|
#endif
|
|
|
|
/********************* Static subroutines local to place.c *******************/
|
|
#ifdef VERBOSE
|
|
static void print_clb_placement(const char* fname);
|
|
#endif
|
|
|
|
static void alloc_and_load_placement_structs(float place_cost_exp,
|
|
const t_placer_opts& placer_opts,
|
|
t_direct_inf* directs,
|
|
int num_directs);
|
|
|
|
static void alloc_and_load_net_pin_indices();
|
|
|
|
static void alloc_and_load_try_swap_structs();
|
|
|
|
static void free_placement_structs(const t_placer_opts& placer_opts);
|
|
|
|
static void alloc_and_load_for_fast_cost_update(float place_cost_exp);
|
|
|
|
static void free_fast_cost_update();
|
|
|
|
static double comp_bb_cost(e_cost_methods method);
|
|
|
|
static void update_move_nets(int num_nets_affected);
|
|
static void reset_move_nets(int num_nets_affected);
|
|
|
|
static e_move_result try_swap(float t,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
float rlim,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
const PlaceDelayModel* delay_model,
|
|
float rlim_escape_fraction,
|
|
enum e_place_algorithm place_algorithm,
|
|
float timing_tradeoff);
|
|
|
|
static void check_place(const t_placer_costs& costs,
|
|
const PlaceDelayModel* delay_model,
|
|
enum e_place_algorithm place_algorithm);
|
|
|
|
static int check_placement_costs(const t_placer_costs& costs,
|
|
const PlaceDelayModel* delay_model,
|
|
enum e_place_algorithm place_algorithm);
|
|
static int check_placement_consistency();
|
|
static int check_block_placement_consistency();
|
|
static int check_macro_placement_consistency();
|
|
|
|
static float starting_t(t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
t_annealing_sched annealing_sched,
|
|
int max_moves,
|
|
float rlim,
|
|
const PlaceDelayModel* delay_model,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
const t_placer_opts& placer_opts);
|
|
|
|
static void update_t(float* t, float rlim, float success_rat, t_annealing_sched annealing_sched);
|
|
|
|
static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid);
|
|
|
|
static int exit_crit(float t, float cost, t_annealing_sched annealing_sched);
|
|
|
|
static int count_connections();
|
|
|
|
static double get_std_dev(int n, double sum_x_squared, double av_x);
|
|
|
|
static double recompute_bb_cost();
|
|
|
|
static float comp_td_point_to_point_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin);
|
|
|
|
static void comp_td_point_to_point_delays(const PlaceDelayModel* delay_model);
|
|
|
|
static void update_td_cost(const t_pl_blocks_to_be_moved& blocks_affected);
|
|
|
|
static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected);
|
|
|
|
static void comp_td_costs(const PlaceDelayModel* delay_model, double* timing_cost);
|
|
|
|
static e_move_result assess_swap(double delta_c, double t);
|
|
|
|
static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new);
|
|
|
|
static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new, int xold, int yold, int xnew, int ynew);
|
|
|
|
static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm,
|
|
const t_pl_blocks_to_be_moved& blocks_affected,
|
|
const PlaceDelayModel* delay_model,
|
|
double& bb_delta_c,
|
|
double& timing_delta_c);
|
|
|
|
static void record_affected_net(const ClusterNetId net, int& num_affected_nets);
|
|
|
|
static void update_net_bb(const ClusterNetId net,
|
|
const t_pl_blocks_to_be_moved& blocks_affected,
|
|
int iblk,
|
|
const ClusterBlockId blk,
|
|
const ClusterPinId blk_pin);
|
|
static void update_td_delta_costs(const PlaceDelayModel* delay_model, const t_pl_blocks_to_be_moved& blocks_affected, const ClusterNetId net, const ClusterPinId pin, double& delta_timing_cost);
|
|
|
|
static double get_net_cost(ClusterNetId net_id, t_bb* bb_ptr);
|
|
|
|
static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_edges);
|
|
|
|
static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr);
|
|
|
|
static void free_try_swap_arrays();
|
|
|
|
static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
int num_connections,
|
|
float crit_exponent,
|
|
int* outer_crit_iter_count,
|
|
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
|
|
const PlaceDelayModel* delay_model,
|
|
SetupTimingInfo& timing_info);
|
|
|
|
static void placement_inner_loop(float t,
|
|
float rlim,
|
|
const t_placer_opts& placer_opts,
|
|
int move_lim,
|
|
float crit_exponent,
|
|
int inner_recompute_limit,
|
|
t_placer_statistics* stats,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
int* moves_since_cost_recompute,
|
|
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
|
|
const PlaceDelayModel* delay_model,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
SetupTimingInfo& timing_info);
|
|
|
|
static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, t_placer_costs* costs);
|
|
|
|
static void calc_placer_stats(t_placer_statistics& stats, float& success_rat, double& std_dev, const t_placer_costs& costs, const int move_lim);
|
|
|
|
static void generate_post_place_timing_reports(const t_placer_opts& placer_opts,
|
|
const t_analysis_opts& analysis_opts,
|
|
const SetupTimingInfo& timing_info,
|
|
const PlacementDelayCalculator& delay_calc);
|
|
|
|
static void print_place_status_header();
|
|
static void print_place_status(const float t,
|
|
const float oldt,
|
|
const t_placer_statistics& stats,
|
|
const float cpd,
|
|
const float sTNS,
|
|
const float sWNS,
|
|
const float acc_rate,
|
|
const float std_dev,
|
|
const float rlim,
|
|
const float crit_exponent,
|
|
size_t tot_moves);
|
|
static void print_resources_utilization();
|
|
|
|
/*****************************************************************************/
|
|
void try_place(const t_placer_opts& placer_opts,
|
|
t_annealing_sched annealing_sched,
|
|
const t_router_opts& router_opts,
|
|
const t_analysis_opts& analysis_opts,
|
|
t_chan_width_dist chan_width_dist,
|
|
t_det_routing_arch* det_routing_arch,
|
|
std::vector<t_segment_inf>& segment_inf,
|
|
t_direct_inf* directs,
|
|
int num_directs) {
|
|
/* Does almost all the work of placing a circuit. Width_fac gives the *
|
|
* width of the widest channel. Place_cost_exp says what exponent the *
|
|
* width should be taken to when calculating costs. This allows a *
|
|
* greater bias for anisotropic architectures. */
|
|
|
|
int tot_iter, move_lim, moves_since_cost_recompute, width_fac, num_connections,
|
|
outer_crit_iter_count, inner_recompute_limit;
|
|
float t, success_rat, rlim,
|
|
oldt = 0, crit_exponent,
|
|
first_rlim, final_rlim, inverse_delta_rlim;
|
|
|
|
t_placer_costs costs;
|
|
t_placer_prev_inverse_costs prev_inverse_costs;
|
|
|
|
tatum::TimingPathInfo critical_path;
|
|
float sTNS = NAN;
|
|
float sWNS = NAN;
|
|
|
|
double std_dev;
|
|
char msg[vtr::bufsize];
|
|
t_placer_statistics stats;
|
|
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
std::shared_ptr<SetupTimingInfo> timing_info;
|
|
std::shared_ptr<PlacementDelayCalculator> placement_delay_calc;
|
|
std::unique_ptr<PlaceDelayModel> place_delay_model;
|
|
std::unique_ptr<MoveGenerator> move_generator;
|
|
|
|
t_pl_blocks_to_be_moved blocks_affected(cluster_ctx.clb_nlist.blocks().size());
|
|
|
|
/* Allocated here because it goes into timing critical code where each memory allocation is expensive */
|
|
IntraLbPbPinLookup pb_gpin_lookup(device_ctx.logical_block_types);
|
|
|
|
/* init file scope variables */
|
|
num_swap_rejected = 0;
|
|
num_swap_accepted = 0;
|
|
num_swap_aborted = 0;
|
|
num_ts_called = 0;
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
|
|
|| placer_opts.enable_timing_computations) {
|
|
/*do this before the initial placement to avoid messing up the initial placement */
|
|
place_delay_model = alloc_lookups_and_criticalities(chan_width_dist, placer_opts, router_opts, det_routing_arch, segment_inf, directs, num_directs);
|
|
|
|
if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) {
|
|
place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL));
|
|
}
|
|
}
|
|
|
|
move_generator = std::make_unique<UniformMoveGenerator>();
|
|
|
|
width_fac = placer_opts.place_chan_width;
|
|
|
|
init_chan(width_fac, chan_width_dist);
|
|
|
|
alloc_and_load_placement_structs(placer_opts.place_cost_exp, placer_opts,
|
|
directs, num_directs);
|
|
|
|
initial_placement(placer_opts.pad_loc_type, placer_opts.pad_loc_file.c_str());
|
|
|
|
// Update physical pin values
|
|
for (auto block_id : cluster_ctx.clb_nlist.blocks()) {
|
|
place_sync_external_block_connections(block_id);
|
|
}
|
|
|
|
init_draw_coords((float)width_fac);
|
|
//Enables fast look-up of atom pins connect to CLB pins
|
|
ClusteredPinAtomPinsLookup netlist_pin_lookup(cluster_ctx.clb_nlist, pb_gpin_lookup);
|
|
|
|
/* Gets initial cost and loads bounding boxes. */
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE || placer_opts.enable_timing_computations) {
|
|
costs.bb_cost = comp_bb_cost(NORMAL);
|
|
|
|
crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */
|
|
|
|
num_connections = count_connections();
|
|
VTR_LOG("\n");
|
|
VTR_LOG("There are %d point to point connections in this circuit.\n", num_connections);
|
|
VTR_LOG("\n");
|
|
|
|
//Update the point-to-point delays from the initial placement
|
|
comp_td_point_to_point_delays(place_delay_model.get());
|
|
|
|
/*
|
|
* Initialize timing analysis
|
|
*/
|
|
auto& atom_ctx = g_vpr_ctx.atom();
|
|
placement_delay_calc = std::make_shared<PlacementDelayCalculator>(atom_ctx.nlist, atom_ctx.lookup, point_to_point_delay);
|
|
placement_delay_calc->set_tsu_margin_relative(placer_opts.tsu_rel_margin);
|
|
placement_delay_calc->set_tsu_margin_absolute(placer_opts.tsu_abs_margin);
|
|
timing_info = make_setup_timing_info(placement_delay_calc);
|
|
|
|
timing_info->update();
|
|
timing_info->set_warn_unconstrained(false); //Don't warn again about unconstrained nodes again during placement
|
|
|
|
//Initial slack estimates
|
|
load_criticalities(*timing_info, crit_exponent, netlist_pin_lookup);
|
|
|
|
critical_path = timing_info->least_slack_critical_path();
|
|
|
|
//Write out the initial timing echo file
|
|
if (isEchoFileEnabled(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH)) {
|
|
auto& timing_ctx = g_vpr_ctx.timing();
|
|
|
|
tatum::write_echo(getEchoFileName(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH),
|
|
*timing_ctx.graph, *timing_ctx.constraints, *placement_delay_calc, timing_info->analyzer());
|
|
}
|
|
|
|
/*now we can properly compute costs */
|
|
comp_td_costs(place_delay_model.get(), &costs.timing_cost); /*also updates values in point_to_point_delay */
|
|
|
|
outer_crit_iter_count = 1;
|
|
|
|
prev_inverse_costs.timing_cost = 1 / costs.timing_cost;
|
|
prev_inverse_costs.bb_cost = 1 / costs.bb_cost;
|
|
costs.cost = 1; /*our new cost function uses normalized values of */
|
|
/*bb_cost and timing_cost, the value of cost will be reset */
|
|
/*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */
|
|
} else { /*BOUNDING_BOX_PLACE */
|
|
costs.cost = costs.bb_cost = comp_bb_cost(NORMAL);
|
|
costs.timing_cost = 0;
|
|
outer_crit_iter_count = 0;
|
|
num_connections = 0;
|
|
crit_exponent = 0;
|
|
|
|
prev_inverse_costs.timing_cost = 0; /*inverses not used */
|
|
prev_inverse_costs.bb_cost = 0;
|
|
}
|
|
|
|
//Sanity check that initial placement is legal
|
|
check_place(costs, place_delay_model.get(), placer_opts.place_algorithm);
|
|
|
|
//Initial pacement statistics
|
|
VTR_LOG("Initial placement cost: %g bb_cost: %g td_cost: %g\n",
|
|
costs.cost, costs.bb_cost, costs.timing_cost);
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
VTR_LOG("Initial placement estimated Critical Path Delay (CPD): %g ns\n",
|
|
1e9 * critical_path.delay());
|
|
VTR_LOG("Initial placement estimated setup Total Negative Slack (sTNS): %g ns\n",
|
|
1e9 * timing_info->setup_total_negative_slack());
|
|
VTR_LOG("Initial placement estimated setup Worst Negative Slack (sWNS): %g ns\n",
|
|
1e9 * timing_info->setup_worst_negative_slack());
|
|
VTR_LOG("\n");
|
|
|
|
VTR_LOG("Initial placement estimated setup slack histogram:\n");
|
|
print_histogram(create_setup_slack_histogram(*timing_info->setup_analyzer()));
|
|
}
|
|
size_t num_macro_members = 0;
|
|
for (auto& macro : g_vpr_ctx.placement().pl_macros) {
|
|
num_macro_members += macro.members.size();
|
|
}
|
|
VTR_LOG("Placement contains %zu placement macros involving %zu blocks (average macro size %f)\n", g_vpr_ctx.placement().pl_macros.size(), num_macro_members, float(num_macro_members) / g_vpr_ctx.placement().pl_macros.size());
|
|
VTR_LOG("\n");
|
|
|
|
//Table header
|
|
print_place_status_header();
|
|
|
|
sprintf(msg, "Initial Placement. Cost: %g BB Cost: %g TD Cost %g \t Channel Factor: %d",
|
|
costs.cost, costs.bb_cost, costs.timing_cost, width_fac);
|
|
//Draw the initial placement
|
|
update_screen(ScreenUpdatePriority::MAJOR, msg, PLACEMENT, timing_info);
|
|
move_lim = (int)(annealing_sched.inner_num * pow(cluster_ctx.clb_nlist.blocks().size(), 1.3333));
|
|
|
|
/* Sometimes I want to run the router with a random placement. Avoid *
|
|
* using 0 moves to stop division by 0 and 0 length vector problems, *
|
|
* by setting move_lim to 1 (which is still too small to do any *
|
|
* significant optimization). */
|
|
if (move_lim <= 0)
|
|
move_lim = 1;
|
|
|
|
if (placer_opts.inner_loop_recompute_divider != 0) {
|
|
inner_recompute_limit = (int)(0.5 + (float)move_lim / (float)placer_opts.inner_loop_recompute_divider);
|
|
} else {
|
|
/*don't do an inner recompute */
|
|
inner_recompute_limit = move_lim + 1;
|
|
}
|
|
|
|
rlim = (float)max(device_ctx.grid.width() - 1, device_ctx.grid.height() - 1);
|
|
|
|
first_rlim = rlim; /*used in timing-driven placement for exponent computation */
|
|
final_rlim = 1;
|
|
inverse_delta_rlim = 1 / (first_rlim - final_rlim);
|
|
|
|
t = starting_t(&costs, &prev_inverse_costs,
|
|
annealing_sched, move_lim, rlim,
|
|
place_delay_model.get(),
|
|
*move_generator,
|
|
blocks_affected,
|
|
placer_opts);
|
|
|
|
if (!placer_opts.move_stats_file.empty()) {
|
|
f_move_stats_file = std::unique_ptr<FILE, decltype(&vtr::fclose)>(vtr::fopen(placer_opts.move_stats_file.c_str(), "w"), vtr::fclose);
|
|
LOG_MOVE_STATS_HEADER();
|
|
}
|
|
|
|
tot_iter = 0;
|
|
moves_since_cost_recompute = 0;
|
|
int num_temps = 0;
|
|
|
|
/* Outer loop of the simmulated annealing begins */
|
|
while (exit_crit(t, costs.cost, annealing_sched) == 0) {
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
costs.cost = 1;
|
|
}
|
|
|
|
outer_loop_recompute_criticalities(placer_opts, &costs, &prev_inverse_costs,
|
|
num_connections,
|
|
crit_exponent,
|
|
&outer_crit_iter_count,
|
|
netlist_pin_lookup,
|
|
place_delay_model.get(),
|
|
*timing_info);
|
|
|
|
placement_inner_loop(t, rlim, placer_opts,
|
|
move_lim, crit_exponent, inner_recompute_limit, &stats,
|
|
&costs,
|
|
&prev_inverse_costs,
|
|
&moves_since_cost_recompute,
|
|
netlist_pin_lookup,
|
|
place_delay_model.get(),
|
|
*move_generator,
|
|
blocks_affected,
|
|
*timing_info);
|
|
|
|
tot_iter += move_lim;
|
|
|
|
calc_placer_stats(stats, success_rat, std_dev, costs, move_lim);
|
|
|
|
oldt = t; /* for finding and printing alpha. */
|
|
update_t(&t, rlim, success_rat, annealing_sched);
|
|
++num_temps;
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
critical_path = timing_info->least_slack_critical_path();
|
|
sTNS = timing_info->setup_total_negative_slack();
|
|
sWNS = timing_info->setup_worst_negative_slack();
|
|
}
|
|
|
|
print_place_status(t, oldt,
|
|
stats,
|
|
critical_path.delay(), sTNS, sWNS,
|
|
success_rat, std_dev, rlim, crit_exponent, tot_iter);
|
|
|
|
sprintf(msg, "Cost: %g BB Cost %g TD Cost %g Temperature: %g",
|
|
costs.cost, costs.bb_cost, costs.timing_cost, t);
|
|
update_screen(ScreenUpdatePriority::MINOR, msg, PLACEMENT, timing_info);
|
|
update_rlim(&rlim, success_rat, device_ctx.grid);
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
crit_exponent = (1 - (rlim - final_rlim) * inverse_delta_rlim)
|
|
* (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first)
|
|
+ placer_opts.td_place_exp_first;
|
|
}
|
|
|
|
#ifdef VERBOSE
|
|
if (getEchoEnabled()) {
|
|
print_clb_placement("first_iteration_clb_placement.echo");
|
|
}
|
|
#endif
|
|
}
|
|
/* Outer loop of the simmulated annealing ends */
|
|
|
|
outer_loop_recompute_criticalities(placer_opts, &costs,
|
|
&prev_inverse_costs,
|
|
num_connections,
|
|
crit_exponent,
|
|
&outer_crit_iter_count,
|
|
netlist_pin_lookup,
|
|
place_delay_model.get(),
|
|
*timing_info);
|
|
|
|
t = 0; /* freeze out */
|
|
|
|
/* Run inner loop again with temperature = 0 so as to accept only swaps
|
|
* which reduce the cost of the placement */
|
|
placement_inner_loop(t, rlim, placer_opts,
|
|
move_lim, crit_exponent, inner_recompute_limit, &stats,
|
|
&costs,
|
|
&prev_inverse_costs,
|
|
&moves_since_cost_recompute,
|
|
netlist_pin_lookup,
|
|
place_delay_model.get(),
|
|
*move_generator,
|
|
blocks_affected,
|
|
*timing_info);
|
|
|
|
tot_iter += move_lim;
|
|
++num_temps;
|
|
|
|
calc_placer_stats(stats, success_rat, std_dev, costs, move_lim);
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
critical_path = timing_info->least_slack_critical_path();
|
|
sTNS = timing_info->setup_total_negative_slack();
|
|
sWNS = timing_info->setup_worst_negative_slack();
|
|
}
|
|
|
|
print_place_status(t, oldt, stats,
|
|
critical_path.delay(), sTNS, sWNS,
|
|
success_rat, std_dev, rlim, crit_exponent, tot_iter);
|
|
|
|
// TODO:
|
|
// 1. add some subroutine hierarchy! Too big!
|
|
|
|
#ifdef VERBOSE
|
|
if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_END_CLB_PLACEMENT)) {
|
|
print_clb_placement(getEchoFileName(E_ECHO_END_CLB_PLACEMENT));
|
|
}
|
|
#endif
|
|
|
|
check_place(costs, place_delay_model.get(), placer_opts.place_algorithm);
|
|
|
|
//Some stats
|
|
VTR_LOG("\n");
|
|
VTR_LOG("Swaps called: %d\n", num_ts_called);
|
|
|
|
if (placer_opts.enable_timing_computations
|
|
&& placer_opts.place_algorithm == BOUNDING_BOX_PLACE) {
|
|
/*need this done since the timing data has not been kept up to date*
|
|
*in bounding_box mode */
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++)
|
|
set_timing_place_crit(net_id, ipin, 0); /*dummy crit values */
|
|
}
|
|
comp_td_costs(place_delay_model.get(), &costs.timing_cost); /*computes point_to_point_delay */
|
|
}
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
|
|
|| placer_opts.enable_timing_computations) {
|
|
//Final timing estimate
|
|
VTR_ASSERT(timing_info);
|
|
|
|
timing_info->update(); //Tatum
|
|
critical_path = timing_info->least_slack_critical_path();
|
|
|
|
if (isEchoFileEnabled(E_ECHO_FINAL_PLACEMENT_TIMING_GRAPH)) {
|
|
auto& timing_ctx = g_vpr_ctx.timing();
|
|
|
|
tatum::write_echo(getEchoFileName(E_ECHO_FINAL_PLACEMENT_TIMING_GRAPH),
|
|
*timing_ctx.graph, *timing_ctx.constraints, *placement_delay_calc, timing_info->analyzer());
|
|
}
|
|
|
|
generate_post_place_timing_reports(placer_opts,
|
|
analysis_opts,
|
|
*timing_info,
|
|
*placement_delay_calc);
|
|
|
|
/* Print critical path delay. */
|
|
VTR_LOG("\n");
|
|
VTR_LOG("Placement estimated critical path delay: %g ns",
|
|
1e9 * critical_path.delay());
|
|
VTR_LOG("\n");
|
|
VTR_LOG("Placement estimated setup Total Negative Slack (sTNS): %g ns\n",
|
|
1e9 * timing_info->setup_total_negative_slack());
|
|
VTR_LOG("Placement estimated setup Worst Negative Slack (sWNS): %g ns\n",
|
|
1e9 * timing_info->setup_worst_negative_slack());
|
|
VTR_LOG("\n");
|
|
|
|
VTR_LOG("Placement estimated setup slack histogram:\n");
|
|
print_histogram(create_setup_slack_histogram(*timing_info->setup_analyzer()));
|
|
VTR_LOG("\n");
|
|
}
|
|
|
|
sprintf(msg, "Placement. Cost: %g bb_cost: %g td_cost: %g Channel Factor: %d",
|
|
costs.cost, costs.bb_cost, costs.timing_cost, width_fac);
|
|
VTR_LOG("Placement cost: %g, bb_cost: %g, td_cost: %g, \n",
|
|
costs.cost, costs.bb_cost, costs.timing_cost);
|
|
update_screen(ScreenUpdatePriority::MAJOR, msg, PLACEMENT, timing_info);
|
|
// Print out swap statistics
|
|
size_t total_swap_attempts = num_swap_rejected + num_swap_accepted + num_swap_aborted;
|
|
VTR_ASSERT(total_swap_attempts > 0);
|
|
|
|
print_resources_utilization();
|
|
|
|
size_t num_swap_print_digits = ceil(log10(total_swap_attempts));
|
|
float reject_rate = (float)num_swap_rejected / total_swap_attempts;
|
|
float accept_rate = (float)num_swap_accepted / total_swap_attempts;
|
|
float abort_rate = (float)num_swap_aborted / total_swap_attempts;
|
|
VTR_LOG("Placement number of temperatures: %d\n", num_temps);
|
|
VTR_LOG("Placement total # of swap attempts: %*d\n", num_swap_print_digits, total_swap_attempts);
|
|
VTR_LOG("\tSwaps accepted: %*d (%4.1f %%)\n", num_swap_print_digits, num_swap_accepted, 100 * accept_rate);
|
|
VTR_LOG("\tSwaps rejected: %*d (%4.1f %%)\n", num_swap_print_digits, num_swap_rejected, 100 * reject_rate);
|
|
VTR_LOG("\tSwaps aborted : %*d (%4.1f %%)\n", num_swap_print_digits, num_swap_aborted, 100 * abort_rate);
|
|
|
|
report_aborted_moves();
|
|
|
|
free_placement_structs(placer_opts);
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
|
|
|| placer_opts.enable_timing_computations) {
|
|
free_lookups_and_criticalities();
|
|
}
|
|
|
|
free_try_swap_arrays();
|
|
}
|
|
|
|
/* Function to recompute the criticalities before the inner loop of the annealing */
|
|
static void outer_loop_recompute_criticalities(const t_placer_opts& placer_opts,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
int num_connections,
|
|
float crit_exponent,
|
|
int* outer_crit_iter_count,
|
|
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
|
|
const PlaceDelayModel* delay_model,
|
|
SetupTimingInfo& timing_info) {
|
|
if (placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE)
|
|
return;
|
|
|
|
/*at each temperature change we update these values to be used */
|
|
/*for normalizing the tradeoff between timing and wirelength (bb) */
|
|
if (*outer_crit_iter_count >= placer_opts.recompute_crit_iter
|
|
|| placer_opts.inner_loop_recompute_divider != 0) {
|
|
#ifdef VERBOSE
|
|
VTR_LOG("Outer loop recompute criticalities\n");
|
|
#endif
|
|
num_connections = std::max(num_connections, 1); //Avoid division by zero
|
|
VTR_ASSERT(num_connections > 0);
|
|
|
|
//Per-temperature timing update
|
|
timing_info.update();
|
|
load_criticalities(timing_info, crit_exponent, netlist_pin_lookup);
|
|
|
|
/*recompute costs from scratch, based on new criticalities */
|
|
comp_td_costs(delay_model, &costs->timing_cost);
|
|
*outer_crit_iter_count = 0;
|
|
}
|
|
(*outer_crit_iter_count)++;
|
|
|
|
/*at each temperature change we update these values to be used */
|
|
/*for normalizing the tradeoff between timing and wirelength (bb) */
|
|
prev_inverse_costs->bb_cost = 1 / costs->bb_cost;
|
|
/*Prevent inverse timing cost from going to infinity */
|
|
prev_inverse_costs->timing_cost = min(1 / costs->timing_cost, MAX_INV_TIMING_COST);
|
|
}
|
|
|
|
/* Function which contains the inner loop of the simulated annealing */
|
|
static void placement_inner_loop(float t,
|
|
float rlim,
|
|
const t_placer_opts& placer_opts,
|
|
int move_lim,
|
|
float crit_exponent,
|
|
int inner_recompute_limit,
|
|
t_placer_statistics* stats,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
int* moves_since_cost_recompute,
|
|
const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
|
|
const PlaceDelayModel* delay_model,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
SetupTimingInfo& timing_info) {
|
|
int inner_crit_iter_count, inner_iter;
|
|
|
|
stats->av_cost = 0.;
|
|
stats->av_bb_cost = 0.;
|
|
stats->av_timing_cost = 0.;
|
|
stats->sum_of_squares = 0.;
|
|
stats->success_sum = 0;
|
|
|
|
inner_crit_iter_count = 1;
|
|
|
|
/* Inner loop begins */
|
|
for (inner_iter = 0; inner_iter < move_lim; inner_iter++) {
|
|
e_move_result swap_result = try_swap(t, costs, prev_inverse_costs, rlim,
|
|
move_generator,
|
|
blocks_affected,
|
|
delay_model,
|
|
placer_opts.rlim_escape_fraction,
|
|
placer_opts.place_algorithm,
|
|
placer_opts.timing_tradeoff);
|
|
|
|
if (swap_result == ACCEPTED) {
|
|
/* Move was accepted. Update statistics that are useful for the annealing schedule. */
|
|
stats->success_sum++;
|
|
stats->av_cost += costs->cost;
|
|
stats->av_bb_cost += costs->bb_cost;
|
|
stats->av_timing_cost += costs->timing_cost;
|
|
stats->sum_of_squares += (costs->cost) * (costs->cost);
|
|
num_swap_accepted++;
|
|
} else if (swap_result == ABORTED) {
|
|
num_swap_aborted++;
|
|
} else { // swap_result == REJECTED
|
|
num_swap_rejected++;
|
|
}
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
/* Do we want to re-timing analyze the circuit to get updated slack and criticality values?
|
|
* We do this only once in a while, since it is expensive.
|
|
*/
|
|
if (inner_crit_iter_count >= inner_recompute_limit
|
|
&& inner_iter != move_lim - 1) { /*on last iteration don't recompute */
|
|
|
|
inner_crit_iter_count = 0;
|
|
#ifdef VERBOSE
|
|
VTR_LOG("Inner loop recompute criticalities\n");
|
|
#endif
|
|
/* Using the delays in net_delay, do a timing analysis to update slacks and
|
|
* criticalities; then update the timing cost since it will change.
|
|
*/
|
|
//Inner loop timing update
|
|
timing_info.update();
|
|
load_criticalities(timing_info, crit_exponent, netlist_pin_lookup);
|
|
|
|
comp_td_costs(delay_model, &costs->timing_cost);
|
|
}
|
|
inner_crit_iter_count++;
|
|
}
|
|
#ifdef VERBOSE
|
|
VTR_LOG("t = %g cost = %g bb_cost = %g timing_cost = %g move = %d\n",
|
|
t, costs->cost, costs->bb_cost, costs->timing_cost, inner_iter);
|
|
if (fabs((costs->bb_cost) - comp_bb_cost(CHECK)) > (costs->bb_cost) * ERROR_TOL)
|
|
VPR_ERROR(VPR_ERROR_PLACE,
|
|
"fabs((*bb_cost) - comp_bb_cost(CHECK)) > (*bb_cost) * ERROR_TOL");
|
|
#endif
|
|
|
|
/* Lines below prevent too much round-off error from accumulating
|
|
* in the cost over many iterations (due to incremental updates).
|
|
* This round-off can lead to error checks failing because the cost
|
|
* is different from what you get when you recompute from scratch.
|
|
*/
|
|
++(*moves_since_cost_recompute);
|
|
if (*moves_since_cost_recompute > MAX_MOVES_BEFORE_RECOMPUTE) {
|
|
recompute_costs_from_scratch(placer_opts, delay_model, costs);
|
|
*moves_since_cost_recompute = 0;
|
|
}
|
|
}
|
|
/* Inner loop ends */
|
|
}
|
|
|
|
static void recompute_costs_from_scratch(const t_placer_opts& placer_opts, const PlaceDelayModel* delay_model, t_placer_costs* costs) {
|
|
double new_bb_cost = recompute_bb_cost();
|
|
if (fabs(new_bb_cost - costs->bb_cost) > costs->bb_cost * ERROR_TOL) {
|
|
std::string msg = vtr::string_fmt("in recompute_costs_from_scratch: new_bb_cost = %g, old bb_cost = %g\n",
|
|
new_bb_cost, costs->bb_cost);
|
|
VPR_ERROR(VPR_ERROR_PLACE, msg.c_str());
|
|
}
|
|
costs->bb_cost = new_bb_cost;
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
double new_timing_cost = 0.;
|
|
comp_td_costs(delay_model, &new_timing_cost);
|
|
if (fabs(new_timing_cost - costs->timing_cost) > costs->timing_cost * ERROR_TOL) {
|
|
std::string msg = vtr::string_fmt("in recompute_costs_from_scratch: new_timing_cost = %g, old timing_cost = %g, ERROR_TOL = %g\n",
|
|
new_timing_cost, costs->timing_cost, ERROR_TOL);
|
|
VPR_ERROR(VPR_ERROR_PLACE, msg.c_str());
|
|
}
|
|
costs->timing_cost = new_timing_cost;
|
|
} else {
|
|
VTR_ASSERT(placer_opts.place_algorithm == BOUNDING_BOX_PLACE);
|
|
|
|
costs->cost = new_bb_cost;
|
|
}
|
|
}
|
|
|
|
/*only count non-global connections */
|
|
static int count_connections() {
|
|
int count = 0;
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
if (cluster_ctx.clb_nlist.net_is_ignored(net_id))
|
|
continue;
|
|
|
|
count += cluster_ctx.clb_nlist.net_sinks(net_id).size();
|
|
}
|
|
|
|
return (count);
|
|
}
|
|
|
|
static double get_std_dev(int n, double sum_x_squared, double av_x) {
|
|
/* Returns the standard deviation of data set x. There are n sample points, *
|
|
* sum_x_squared is the summation over n of x^2 and av_x is the average x. *
|
|
* All operations are done in double precision, since round off error can be *
|
|
* a problem in the initial temp. std_dev calculation for big circuits. */
|
|
|
|
double std_dev;
|
|
|
|
if (n <= 1)
|
|
std_dev = 0.;
|
|
else
|
|
std_dev = (sum_x_squared - n * av_x * av_x) / (double)(n - 1);
|
|
|
|
if (std_dev > 0.) /* Very small variances sometimes round negative */
|
|
std_dev = sqrt(std_dev);
|
|
else
|
|
std_dev = 0.;
|
|
|
|
return (std_dev);
|
|
}
|
|
|
|
static void update_rlim(float* rlim, float success_rat, const DeviceGrid& grid) {
|
|
/* Update the range limited to keep acceptance prob. near 0.44. Use *
|
|
* a floating point rlim to allow gradual transitions at low temps. */
|
|
|
|
float upper_lim;
|
|
|
|
*rlim = (*rlim) * (1. - 0.44 + success_rat);
|
|
upper_lim = max(grid.width() - 1, grid.height() - 1);
|
|
*rlim = min(*rlim, upper_lim);
|
|
*rlim = max(*rlim, (float)1.);
|
|
}
|
|
|
|
/* Update the temperature according to the annealing schedule selected. */
|
|
static void update_t(float* t, float rlim, float success_rat, t_annealing_sched annealing_sched) {
|
|
/* float fac; */
|
|
|
|
if (annealing_sched.type == USER_SCHED) {
|
|
*t = annealing_sched.alpha_t * (*t);
|
|
} else { /* AUTO_SCHED */
|
|
if (success_rat > 0.96) {
|
|
*t = (*t) * 0.5;
|
|
} else if (success_rat > 0.8) {
|
|
*t = (*t) * 0.9;
|
|
} else if (success_rat > 0.15 || rlim > 1.) {
|
|
*t = (*t) * 0.95;
|
|
} else {
|
|
*t = (*t) * 0.8;
|
|
}
|
|
}
|
|
}
|
|
|
|
static int exit_crit(float t, float cost, t_annealing_sched annealing_sched) {
|
|
/* Return 1 when the exit criterion is met. */
|
|
|
|
if (annealing_sched.type == USER_SCHED) {
|
|
if (t < annealing_sched.exit_t) {
|
|
return (1);
|
|
} else {
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
/* Automatic annealing schedule */
|
|
float t_exit = 0.005 * cost / cluster_ctx.clb_nlist.nets().size();
|
|
|
|
if (t < t_exit) {
|
|
return (1);
|
|
} else if (std::isnan(t_exit)) {
|
|
//May get nan if there are no nets
|
|
return (1);
|
|
} else {
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
static float starting_t(t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
t_annealing_sched annealing_sched,
|
|
int max_moves,
|
|
float rlim,
|
|
const PlaceDelayModel* delay_model,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
const t_placer_opts& placer_opts) {
|
|
/* Finds the starting temperature (hot condition). */
|
|
|
|
int i, num_accepted, move_lim;
|
|
double std_dev, av, sum_of_squares; /* Double important to avoid round off */
|
|
|
|
if (annealing_sched.type == USER_SCHED)
|
|
return (annealing_sched.init_t);
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
move_lim = min(max_moves, (int)cluster_ctx.clb_nlist.blocks().size());
|
|
|
|
num_accepted = 0;
|
|
av = 0.;
|
|
sum_of_squares = 0.;
|
|
|
|
/* Try one move per block. Set t high so essentially all accepted. */
|
|
|
|
for (i = 0; i < move_lim; i++) {
|
|
e_move_result swap_result = try_swap(HUGE_POSITIVE_FLOAT, costs, prev_inverse_costs, rlim,
|
|
move_generator,
|
|
blocks_affected,
|
|
delay_model,
|
|
placer_opts.rlim_escape_fraction,
|
|
placer_opts.place_algorithm,
|
|
placer_opts.timing_tradeoff);
|
|
|
|
if (swap_result == ACCEPTED) {
|
|
num_accepted++;
|
|
av += costs->cost;
|
|
sum_of_squares += costs->cost * costs->cost;
|
|
num_swap_accepted++;
|
|
} else if (swap_result == ABORTED) {
|
|
num_swap_aborted++;
|
|
} else {
|
|
num_swap_rejected++;
|
|
}
|
|
}
|
|
|
|
if (num_accepted != 0)
|
|
av /= num_accepted;
|
|
else
|
|
av = 0.;
|
|
|
|
std_dev = get_std_dev(num_accepted, sum_of_squares, av);
|
|
|
|
if (num_accepted != move_lim) {
|
|
VTR_LOG_WARN("Starting t: %d of %d configurations accepted.\n", num_accepted, move_lim);
|
|
}
|
|
|
|
#ifdef VERBOSE
|
|
VTR_LOG("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. * std_dev);
|
|
#endif
|
|
|
|
/* Set the initial temperature to 20 times the standard of deviation */
|
|
/* so that the initial temperature adjusts according to the circuit */
|
|
return (20. * std_dev);
|
|
}
|
|
|
|
static void update_move_nets(int num_nets_affected) {
|
|
/* update net cost functions and reset flags. */
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
for (int inet_affected = 0; inet_affected < num_nets_affected; inet_affected++) {
|
|
ClusterNetId net_id = ts_nets_to_update[inet_affected];
|
|
|
|
bb_coords[net_id] = ts_bb_coord_new[net_id];
|
|
if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET)
|
|
bb_num_on_edges[net_id] = ts_bb_edge_new[net_id];
|
|
|
|
net_cost[net_id] = temp_net_cost[net_id];
|
|
|
|
/* negative temp_net_cost value is acting as a flag. */
|
|
temp_net_cost[net_id] = -1;
|
|
bb_updated_before[net_id] = NOT_UPDATED_YET;
|
|
}
|
|
}
|
|
|
|
static void reset_move_nets(int num_nets_affected) {
|
|
/* Reset the net cost function flags first. */
|
|
for (int inet_affected = 0; inet_affected < num_nets_affected; inet_affected++) {
|
|
ClusterNetId net_id = ts_nets_to_update[inet_affected];
|
|
temp_net_cost[net_id] = -1;
|
|
bb_updated_before[net_id] = NOT_UPDATED_YET;
|
|
}
|
|
}
|
|
|
|
static e_move_result try_swap(float t,
|
|
t_placer_costs* costs,
|
|
t_placer_prev_inverse_costs* prev_inverse_costs,
|
|
float rlim,
|
|
MoveGenerator& move_generator,
|
|
t_pl_blocks_to_be_moved& blocks_affected,
|
|
const PlaceDelayModel* delay_model,
|
|
float rlim_escape_fraction,
|
|
enum e_place_algorithm place_algorithm,
|
|
float timing_tradeoff) {
|
|
/* Picks some block and moves it to another spot. If this spot is *
|
|
* occupied, switch the blocks. Assess the change in cost function. *
|
|
* rlim is the range limiter. *
|
|
* Returns whether the swap is accepted, rejected or aborted. *
|
|
* Passes back the new value of the cost functions. */
|
|
|
|
num_ts_called++;
|
|
|
|
MoveOutcomeStats move_outcome_stats;
|
|
|
|
/* I'm using negative values of temp_net_cost as a flag, so DO NOT *
|
|
* use cost functions that can go negative. */
|
|
|
|
double delta_c = 0; /* Change in cost due to this swap. */
|
|
double bb_delta_c = 0;
|
|
double timing_delta_c = 0;
|
|
|
|
//Allow some fraction of moves to not be restricted by rlim,
|
|
//in the hopes of better escaping local minima
|
|
if (rlim_escape_fraction > 0. && vtr::frand() < rlim_escape_fraction) {
|
|
rlim = std::numeric_limits<float>::infinity();
|
|
}
|
|
|
|
//Generate a new move (perturbation) used to explore the space of possible placements
|
|
e_create_move create_move_outcome = move_generator.propose_move(blocks_affected, rlim);
|
|
|
|
LOG_MOVE_STATS_PROPOSED(t, blocks_affected);
|
|
|
|
e_move_result move_outcome = ABORTED;
|
|
|
|
if (create_move_outcome == e_create_move::ABORT) {
|
|
//Proposed move is not legal -- give up on this move
|
|
clear_move_blocks(blocks_affected);
|
|
|
|
LOG_MOVE_STATS_OUTCOME(std::numeric_limits<float>::quiet_NaN(),
|
|
std::numeric_limits<float>::quiet_NaN(),
|
|
std::numeric_limits<float>::quiet_NaN(),
|
|
"ABORTED", "illegal move");
|
|
|
|
move_outcome = ABORTED;
|
|
} else {
|
|
VTR_ASSERT(create_move_outcome == e_create_move::VALID);
|
|
|
|
/*
|
|
* To make evaluating the move simpler (e.g. calculating changed bounding box),
|
|
* we first move the blocks to thier new locations (apply the move to
|
|
* place_ctx.block_locs) and then computed the change in cost. If the move is
|
|
* accepted, the inverse look-up in place_ctx.grid_blocks is updated (committing
|
|
* the move). If the move is rejected the blocks are returned to their original
|
|
* positions (reverting place_ctx.block_locs to its original state).
|
|
*
|
|
* Note that the inverse look-up place_ctx.grid_blocks is only updated
|
|
* after move acceptance is determined, and so should not be used when
|
|
* evaluating a move.
|
|
*/
|
|
|
|
//Update the block positions
|
|
apply_move_blocks(blocks_affected);
|
|
|
|
// Find all the nets affected by this swap and update their costs
|
|
int num_nets_affected = find_affected_nets_and_update_costs(place_algorithm, blocks_affected, delay_model, bb_delta_c, timing_delta_c);
|
|
if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
/*in this case we redefine delta_c as a combination of timing and bb. *
|
|
*additionally, we normalize all values, therefore delta_c is in *
|
|
*relation to 1*/
|
|
|
|
delta_c = (1 - timing_tradeoff) * bb_delta_c * prev_inverse_costs->bb_cost
|
|
+ timing_tradeoff * timing_delta_c * prev_inverse_costs->timing_cost;
|
|
} else {
|
|
delta_c = bb_delta_c;
|
|
}
|
|
|
|
/* 1 -> move accepted, 0 -> rejected. */
|
|
move_outcome = assess_swap(delta_c, t);
|
|
|
|
if (move_outcome == ACCEPTED) {
|
|
costs->cost += delta_c;
|
|
costs->bb_cost += bb_delta_c;
|
|
|
|
if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
/*update the point_to_point_timing_cost and point_to_point_delay
|
|
* values from the temporary values */
|
|
costs->timing_cost += timing_delta_c;
|
|
|
|
update_td_cost(blocks_affected);
|
|
}
|
|
|
|
/* update net cost functions and reset flags. */
|
|
update_move_nets(num_nets_affected);
|
|
|
|
/* Update clb data structures since we kept the move. */
|
|
commit_move_blocks(blocks_affected);
|
|
|
|
} else { /* Move was rejected. */
|
|
/* Reset the net cost function flags first. */
|
|
reset_move_nets(num_nets_affected);
|
|
|
|
/* Restore the place_ctx.block_locs data structures to their state before the move. */
|
|
revert_move_blocks(blocks_affected);
|
|
}
|
|
|
|
move_outcome_stats.delta_cost_norm = delta_c;
|
|
move_outcome_stats.delta_bb_cost_norm = bb_delta_c * prev_inverse_costs->bb_cost;
|
|
move_outcome_stats.delta_timing_cost_norm = timing_delta_c * prev_inverse_costs->timing_cost;
|
|
|
|
move_outcome_stats.delta_bb_cost_abs = bb_delta_c;
|
|
move_outcome_stats.delta_timing_cost_abs = timing_delta_c;
|
|
|
|
LOG_MOVE_STATS_OUTCOME(delta_c, bb_delta_c, timing_delta_c,
|
|
(move_outcome ? "ACCEPTED" : "REJECTED"), "");
|
|
}
|
|
|
|
move_outcome_stats.outcome = move_outcome;
|
|
|
|
move_generator.process_outcome(move_outcome_stats);
|
|
|
|
clear_move_blocks(blocks_affected);
|
|
|
|
//VTR_ASSERT(check_macro_placement_consistency() == 0);
|
|
#if 0
|
|
//Check that each accepted swap yields a valid placement
|
|
check_place(*costs, delay_model, place_algorithm);
|
|
#endif
|
|
|
|
return (move_outcome);
|
|
}
|
|
|
|
//Puts all the nets changed by the current swap into nets_to_update,
|
|
//and updates their bounding box.
|
|
//
|
|
//Returns the number of affected nets.
|
|
static int find_affected_nets_and_update_costs(e_place_algorithm place_algorithm,
|
|
const t_pl_blocks_to_be_moved& blocks_affected,
|
|
const PlaceDelayModel* delay_model,
|
|
double& bb_delta_c,
|
|
double& timing_delta_c) {
|
|
VTR_ASSERT_SAFE(bb_delta_c == 0.);
|
|
VTR_ASSERT_SAFE(timing_delta_c == 0.);
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
int num_affected_nets = 0;
|
|
|
|
//Go through all the blocks moved
|
|
for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
|
|
ClusterBlockId blk = blocks_affected.moved_blocks[iblk].block_num;
|
|
|
|
//Go through all the pins in the moved block
|
|
for (ClusterPinId blk_pin : cluster_ctx.clb_nlist.block_pins(blk)) {
|
|
ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(blk_pin);
|
|
VTR_ASSERT_SAFE_MSG(net_id, "Only valid nets should be found in compressed netlist block pins");
|
|
|
|
if (cluster_ctx.clb_nlist.net_is_ignored(net_id))
|
|
continue; //TODO: do we require anyting special here for global nets. "Global nets are assumed to span the whole chip, and do not effect costs"
|
|
|
|
//Record effected nets
|
|
record_affected_net(net_id, num_affected_nets);
|
|
|
|
//Update the net bounding boxes
|
|
//
|
|
//Do not update the net cost here since it should only be updated
|
|
//once per net, not once per pin.
|
|
update_net_bb(net_id, blocks_affected, iblk, blk, blk_pin);
|
|
|
|
if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
//Determine the change in timing costs if required
|
|
update_td_delta_costs(delay_model, blocks_affected, net_id, blk_pin, timing_delta_c);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Now update the bounding box costs (since the net bounding boxes are up-to-date).
|
|
* The cost is only updated once per net.
|
|
*/
|
|
for (int inet_affected = 0; inet_affected < num_affected_nets; inet_affected++) {
|
|
ClusterNetId net_id = ts_nets_to_update[inet_affected];
|
|
|
|
temp_net_cost[net_id] = get_net_cost(net_id, &ts_bb_coord_new[net_id]);
|
|
bb_delta_c += temp_net_cost[net_id] - net_cost[net_id];
|
|
}
|
|
|
|
return num_affected_nets;
|
|
}
|
|
|
|
static void record_affected_net(const ClusterNetId net, int& num_affected_nets) {
|
|
//Record effected nets
|
|
if (temp_net_cost[net] < 0.) {
|
|
//Net not marked yet.
|
|
ts_nets_to_update[num_affected_nets] = net;
|
|
num_affected_nets++;
|
|
|
|
//Flag to say we've marked this net.
|
|
temp_net_cost[net] = 1.;
|
|
}
|
|
}
|
|
|
|
static void update_net_bb(const ClusterNetId net,
|
|
const t_pl_blocks_to_be_moved& blocks_affected,
|
|
int iblk,
|
|
const ClusterBlockId blk,
|
|
const ClusterPinId blk_pin) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
if (cluster_ctx.clb_nlist.net_sinks(net).size() < SMALL_NET) {
|
|
//For small nets brute-force bounding box update is faster
|
|
|
|
if (bb_updated_before[net] == NOT_UPDATED_YET) { //Only once per-net
|
|
get_non_updateable_bb(net, &ts_bb_coord_new[net]);
|
|
}
|
|
} else {
|
|
//For large nets, update bounding box incrementally
|
|
int iblk_pin = tile_pin_index(blk_pin);
|
|
|
|
t_physical_tile_type_ptr blk_type = physical_tile_type(blk);
|
|
int pin_width_offset = blk_type->pin_width_offset[iblk_pin];
|
|
int pin_height_offset = blk_type->pin_height_offset[iblk_pin];
|
|
|
|
//Incremental bounding box update
|
|
update_bb(net, &ts_bb_coord_new[net],
|
|
&ts_bb_edge_new[net],
|
|
blocks_affected.moved_blocks[iblk].old_loc.x + pin_width_offset,
|
|
blocks_affected.moved_blocks[iblk].old_loc.y + pin_height_offset,
|
|
blocks_affected.moved_blocks[iblk].new_loc.x + pin_width_offset,
|
|
blocks_affected.moved_blocks[iblk].new_loc.y + pin_height_offset);
|
|
}
|
|
}
|
|
|
|
static void update_td_delta_costs(const PlaceDelayModel* delay_model, const t_pl_blocks_to_be_moved& blocks_affected, const ClusterNetId net, const ClusterPinId pin, double& delta_timing_cost) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
if (cluster_ctx.clb_nlist.pin_type(pin) == PinType::DRIVER) {
|
|
//This pin is a net driver on a moved block.
|
|
//Re-compute all point to point connections for this net.
|
|
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size(); ipin++) {
|
|
float temp_delay = comp_td_point_to_point_delay(delay_model, net, ipin);
|
|
temp_point_to_point_delay[net][ipin] = temp_delay;
|
|
|
|
temp_point_to_point_timing_cost[net][ipin] = get_timing_place_crit(net, ipin) * temp_delay;
|
|
delta_timing_cost += temp_point_to_point_timing_cost[net][ipin] - point_to_point_timing_cost[net][ipin];
|
|
}
|
|
} else {
|
|
//This pin is a net sink on a moved block
|
|
VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin) == PinType::SINK);
|
|
|
|
//If this net is being driven by a moved block, we do not
|
|
//need to compute the change in the timing cost (here) since it will
|
|
//be computed by the net's driver pin (since the driver block moved).
|
|
//
|
|
//Computing it here would double count the change, and mess up the
|
|
//delta_timing_cost value.
|
|
if (!driven_by_moved_block(net, blocks_affected)) {
|
|
int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin);
|
|
|
|
float temp_delay = comp_td_point_to_point_delay(delay_model, net, net_pin);
|
|
temp_point_to_point_delay[net][net_pin] = temp_delay;
|
|
|
|
temp_point_to_point_timing_cost[net][net_pin] = get_timing_place_crit(net, net_pin) * temp_delay;
|
|
delta_timing_cost += temp_point_to_point_timing_cost[net][net_pin] - point_to_point_timing_cost[net][net_pin];
|
|
}
|
|
}
|
|
}
|
|
|
|
static e_move_result assess_swap(double delta_c, double t) {
|
|
/* Returns: 1 -> move accepted, 0 -> rejected. */
|
|
if (delta_c <= 0) {
|
|
return ACCEPTED;
|
|
}
|
|
|
|
if (t == 0.) {
|
|
return REJECTED;
|
|
}
|
|
|
|
float fnum = vtr::frand();
|
|
float prob_fac = std::exp(-delta_c / t);
|
|
if (prob_fac > fnum) {
|
|
return ACCEPTED;
|
|
}
|
|
|
|
return REJECTED;
|
|
}
|
|
|
|
static double recompute_bb_cost() {
|
|
/* Recomputes the cost to eliminate roundoff that may have accrued. *
|
|
* This routine does as little work as possible to compute this new *
|
|
* cost. */
|
|
|
|
double cost = 0;
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */
|
|
if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */
|
|
/* Bounding boxes don't have to be recomputed; they're correct. */
|
|
cost += net_cost[net_id];
|
|
}
|
|
}
|
|
|
|
return (cost);
|
|
}
|
|
|
|
/*returns the delay of one point to point connection */
|
|
static float comp_td_point_to_point_delay(const PlaceDelayModel* delay_model, ClusterNetId net_id, int ipin) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
|
|
float delay_source_to_sink = 0.;
|
|
|
|
if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
|
|
//Only estimate delay for signals routed through the inter-block
|
|
//routing network. TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay."
|
|
|
|
ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
|
|
ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
|
|
|
|
ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
|
|
ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
|
|
|
|
int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
|
|
int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
|
|
|
|
int source_x = place_ctx.block_locs[source_block].loc.x;
|
|
int source_y = place_ctx.block_locs[source_block].loc.y;
|
|
int sink_x = place_ctx.block_locs[sink_block].loc.x;
|
|
int sink_y = place_ctx.block_locs[sink_block].loc.y;
|
|
|
|
/* Note: This heuristic only considers delta_x and delta_y, a much better heuristic
|
|
* would be to to create a more comprehensive lookup table.
|
|
*
|
|
* In particular this aproach does not accurately capture the effect of fast
|
|
* carry-chain connections.
|
|
*/
|
|
delay_source_to_sink = delay_model->delay(source_x,
|
|
source_y,
|
|
source_block_ipin,
|
|
sink_x,
|
|
sink_y,
|
|
sink_block_ipin);
|
|
if (delay_source_to_sink < 0) {
|
|
VPR_ERROR(VPR_ERROR_PLACE,
|
|
"in comp_td_point_to_point_delay: Bad delay_source_to_sink value %g from %s (at %d,%d) to %s (at %d,%d)\n"
|
|
"in comp_td_point_to_point_delay: Delay is less than 0\n",
|
|
block_type_pin_index_to_name(physical_tile_type(source_block), source_block_ipin).c_str(),
|
|
source_x, source_y,
|
|
block_type_pin_index_to_name(physical_tile_type(sink_block), sink_block_ipin).c_str(),
|
|
sink_x, sink_y,
|
|
delay_source_to_sink);
|
|
}
|
|
}
|
|
|
|
return (delay_source_to_sink);
|
|
}
|
|
|
|
//Recompute all point to point delays, updating point_to_point_delay
|
|
static void comp_td_point_to_point_delays(const PlaceDelayModel* delay_model) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) {
|
|
point_to_point_delay[net_id][ipin] = comp_td_point_to_point_delay(delay_model, net_id, ipin);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Update the point_to_point_timing_cost values from the temporary *
|
|
* values for all connections that have changed. */
|
|
static void update_td_cost(const t_pl_blocks_to_be_moved& blocks_affected) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
/* Go through all the blocks moved. */
|
|
for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
|
|
ClusterBlockId bnum = blocks_affected.moved_blocks[iblk].block_num;
|
|
for (ClusterPinId pin_id : cluster_ctx.clb_nlist.block_pins(bnum)) {
|
|
ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(pin_id);
|
|
|
|
if (cluster_ctx.clb_nlist.net_is_ignored(net_id))
|
|
continue;
|
|
|
|
if (cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::DRIVER) {
|
|
//This net is being driven by a moved block, recompute
|
|
//all point to point connections on this net.
|
|
for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) {
|
|
point_to_point_delay[net_id][ipin] = temp_point_to_point_delay[net_id][ipin];
|
|
temp_point_to_point_delay[net_id][ipin] = INVALID_DELAY;
|
|
point_to_point_timing_cost[net_id][ipin] = temp_point_to_point_timing_cost[net_id][ipin];
|
|
temp_point_to_point_timing_cost[net_id][ipin] = INVALID_DELAY;
|
|
}
|
|
} else {
|
|
//This pin is a net sink on a moved block
|
|
VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin_id) == PinType::SINK);
|
|
|
|
/* The following "if" prevents the value from being updated twice. */
|
|
if (!driven_by_moved_block(net_id, blocks_affected)) {
|
|
int net_pin = cluster_ctx.clb_nlist.pin_net_index(pin_id);
|
|
|
|
point_to_point_delay[net_id][net_pin] = temp_point_to_point_delay[net_id][net_pin];
|
|
temp_point_to_point_delay[net_id][net_pin] = INVALID_DELAY;
|
|
point_to_point_timing_cost[net_id][net_pin] = temp_point_to_point_timing_cost[net_id][net_pin];
|
|
temp_point_to_point_timing_cost[net_id][net_pin] = INVALID_DELAY;
|
|
}
|
|
}
|
|
} /* Finished going through all the pins in the moved block */
|
|
} /* Finished going through all the blocks moved */
|
|
}
|
|
|
|
static bool driven_by_moved_block(const ClusterNetId net, const t_pl_blocks_to_be_moved& blocks_affected) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
ClusterBlockId net_driver_block = cluster_ctx.clb_nlist.net_driver_block(net);
|
|
for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
|
|
if (net_driver_block == blocks_affected.moved_blocks[iblk].block_num) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void comp_td_costs(const PlaceDelayModel* delay_model, double* timing_cost) {
|
|
/* Computes the cost (from scratch) from the delays and criticalities *
|
|
* of all point to point connections, we define the timing cost of *
|
|
* each connection as criticality*delay. */
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
double new_timing_cost = 0.;
|
|
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* For each net ... */
|
|
|
|
if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */
|
|
continue;
|
|
}
|
|
|
|
for (unsigned ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) {
|
|
float conn_delay = comp_td_point_to_point_delay(delay_model, net_id, ipin);
|
|
float conn_timing_cost = conn_delay * get_timing_place_crit(net_id, ipin);
|
|
|
|
point_to_point_delay[net_id][ipin] = conn_delay;
|
|
temp_point_to_point_delay[net_id][ipin] = INVALID_DELAY;
|
|
|
|
point_to_point_timing_cost[net_id][ipin] = conn_timing_cost;
|
|
temp_point_to_point_timing_cost[net_id][ipin] = INVALID_DELAY;
|
|
new_timing_cost += conn_timing_cost;
|
|
}
|
|
}
|
|
|
|
/* Make sure timing cost does not go above MIN_TIMING_COST. */
|
|
*timing_cost = new_timing_cost;
|
|
}
|
|
|
|
/* Finds the cost from scratch. Done only when the placement *
|
|
* has been radically changed (i.e. after initial placement). *
|
|
* Otherwise find the cost change incrementally. If method *
|
|
* check is NORMAL, we find bounding boxes that are updateable *
|
|
* for the larger nets. If method is CHECK, all bounding boxes *
|
|
* are found via the non_updateable_bb routine, to provide a *
|
|
* cost which can be used to check the correctness of the *
|
|
* other routine. */
|
|
static double comp_bb_cost(e_cost_methods method) {
|
|
double cost = 0;
|
|
double expected_wirelength = 0.0;
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */
|
|
if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) { /* Do only if not ignored. */
|
|
/* Small nets don't use incremental updating on their bounding boxes, *
|
|
* so they can use a fast bounding box calculator. */
|
|
if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET && method == NORMAL) {
|
|
get_bb_from_scratch(net_id, &bb_coords[net_id],
|
|
&bb_num_on_edges[net_id]);
|
|
} else {
|
|
get_non_updateable_bb(net_id, &bb_coords[net_id]);
|
|
}
|
|
|
|
net_cost[net_id] = get_net_cost(net_id, &bb_coords[net_id]);
|
|
cost += net_cost[net_id];
|
|
if (method == CHECK)
|
|
expected_wirelength += get_net_wirelength_estimate(net_id, &bb_coords[net_id]);
|
|
}
|
|
}
|
|
|
|
if (method == CHECK) {
|
|
VTR_LOG("\n");
|
|
VTR_LOG("BB estimate of min-dist (placement) wire length: %.0f\n", expected_wirelength);
|
|
}
|
|
return cost;
|
|
}
|
|
|
|
/* Frees the major structures needed by the placer (and not needed *
|
|
* elsewhere). */
|
|
static void free_placement_structs(const t_placer_opts& placer_opts) {
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
free_fast_cost_update();
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
|
|
|| placer_opts.enable_timing_computations) {
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
/*add one to the address since it is indexed from 1 not 0 */
|
|
point_to_point_timing_cost[net_id]++;
|
|
free(point_to_point_timing_cost[net_id]);
|
|
|
|
temp_point_to_point_timing_cost[net_id]++;
|
|
free(temp_point_to_point_timing_cost[net_id]);
|
|
|
|
point_to_point_delay[net_id]++;
|
|
free(point_to_point_delay[net_id]);
|
|
|
|
temp_point_to_point_delay[net_id]++;
|
|
free(temp_point_to_point_delay[net_id]);
|
|
}
|
|
|
|
point_to_point_timing_cost.clear();
|
|
point_to_point_delay.clear();
|
|
temp_point_to_point_timing_cost.clear();
|
|
temp_point_to_point_delay.clear();
|
|
|
|
net_pin_indices.clear();
|
|
}
|
|
|
|
free_placement_macros_structs();
|
|
|
|
/* Frees up all the data structure used in vpr_utils. */
|
|
free_blk_pin_from_port_pin();
|
|
}
|
|
|
|
/* Allocates the major structures needed only by the placer, primarily for *
|
|
* computing costs quickly and such. */
|
|
static void alloc_and_load_placement_structs(float place_cost_exp,
|
|
const t_placer_opts& placer_opts,
|
|
t_direct_inf* directs,
|
|
int num_directs) {
|
|
int max_pins_per_clb;
|
|
unsigned int ipin;
|
|
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.mutable_placement();
|
|
|
|
size_t num_nets = cluster_ctx.clb_nlist.nets().size();
|
|
|
|
init_placement_context();
|
|
|
|
max_pins_per_clb = 0;
|
|
for (const auto& type : device_ctx.physical_tile_types) {
|
|
max_pins_per_clb = max(max_pins_per_clb, type.num_pins);
|
|
}
|
|
|
|
if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
|
|
|| placer_opts.enable_timing_computations) {
|
|
/* Allocate structures associated with timing driven placement */
|
|
/* [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] */
|
|
point_to_point_delay.resize(num_nets);
|
|
temp_point_to_point_delay.resize(num_nets);
|
|
|
|
point_to_point_timing_cost.resize(num_nets);
|
|
temp_point_to_point_timing_cost.resize(num_nets);
|
|
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
size_t num_sinks = cluster_ctx.clb_nlist.net_sinks(net_id).size();
|
|
/* In the following, subract one so index starts at *
|
|
* 1 instead of 0 */
|
|
point_to_point_delay[net_id] = (float*)vtr::malloc(num_sinks * sizeof(float));
|
|
point_to_point_delay[net_id]--;
|
|
|
|
temp_point_to_point_delay[net_id] = (float*)vtr::malloc(num_sinks * sizeof(float));
|
|
temp_point_to_point_delay[net_id]--;
|
|
|
|
point_to_point_timing_cost[net_id] = (double*)vtr::malloc(num_sinks * sizeof(double));
|
|
point_to_point_timing_cost[net_id]--;
|
|
|
|
temp_point_to_point_timing_cost[net_id] = (double*)vtr::malloc(num_sinks * sizeof(double));
|
|
temp_point_to_point_timing_cost[net_id]--;
|
|
}
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
for (ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ipin++) {
|
|
point_to_point_delay[net_id][ipin] = 0;
|
|
temp_point_to_point_delay[net_id][ipin] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
net_cost.resize(num_nets, -1.);
|
|
temp_net_cost.resize(num_nets, -1.);
|
|
bb_coords.resize(num_nets, t_bb());
|
|
bb_num_on_edges.resize(num_nets, t_bb());
|
|
|
|
/* Used to store costs for moves not yet made and to indicate when a net's *
|
|
* cost has been recomputed. temp_net_cost[inet] < 0 means net's cost hasn't *
|
|
* been recomputed. */
|
|
bb_updated_before.resize(num_nets, NOT_UPDATED_YET);
|
|
|
|
alloc_and_load_for_fast_cost_update(place_cost_exp);
|
|
|
|
alloc_and_load_net_pin_indices();
|
|
|
|
alloc_and_load_try_swap_structs();
|
|
|
|
place_ctx.pl_macros = alloc_and_load_placement_macros(directs, num_directs);
|
|
}
|
|
|
|
/* Allocates and loads net_pin_indices array, this array allows us to quickly *
|
|
* find what pin on the net a block pin corresponds to. Returns the pointer *
|
|
* to the 2D net_pin_indices array. */
|
|
static void alloc_and_load_net_pin_indices() {
|
|
unsigned int netpin;
|
|
int max_pins_per_clb = 0;
|
|
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
/* Compute required size. */
|
|
for (const auto& type : device_ctx.physical_tile_types)
|
|
max_pins_per_clb = max(max_pins_per_clb, type.num_pins);
|
|
|
|
/* Allocate for maximum size. */
|
|
net_pin_indices.resize(cluster_ctx.clb_nlist.blocks().size());
|
|
|
|
for (auto blk_id : cluster_ctx.clb_nlist.blocks())
|
|
net_pin_indices[blk_id].resize(max_pins_per_clb);
|
|
|
|
/* Load the values */
|
|
for (auto net_id : cluster_ctx.clb_nlist.nets()) {
|
|
if (cluster_ctx.clb_nlist.net_is_ignored(net_id))
|
|
continue;
|
|
netpin = 0;
|
|
for (auto pin_id : cluster_ctx.clb_nlist.net_pins(net_id)) {
|
|
int pin_index = cluster_ctx.clb_nlist.pin_logical_index(pin_id);
|
|
ClusterBlockId block_id = cluster_ctx.clb_nlist.pin_block(pin_id);
|
|
net_pin_indices[block_id][pin_index] = netpin;
|
|
netpin++;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void alloc_and_load_try_swap_structs() {
|
|
/* Allocate the local bb_coordinate storage, etc. only once. */
|
|
/* Allocate with size cluster_ctx.clb_nlist.nets().size() for any number of nets affected. */
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
size_t num_nets = cluster_ctx.clb_nlist.nets().size();
|
|
|
|
ts_bb_coord_new.resize(num_nets, t_bb());
|
|
ts_bb_edge_new.resize(num_nets, t_bb());
|
|
ts_nets_to_update.resize(num_nets, ClusterNetId::INVALID());
|
|
|
|
auto& place_ctx = g_vpr_ctx.mutable_placement();
|
|
place_ctx.compressed_block_grids = create_compressed_block_grids();
|
|
}
|
|
|
|
/* This routine finds the bounding box of each net from scratch (i.e. *
|
|
* from only the block location information). It updates both the *
|
|
* coordinate and number of pins on each edge information. It *
|
|
* should only be called when the bounding box information is not valid. */
|
|
static void get_bb_from_scratch(ClusterNetId net_id, t_bb* coords, t_bb* num_on_edges) {
|
|
int pnum, x, y, xmin, xmax, ymin, ymax;
|
|
int xmin_edge, xmax_edge, ymin_edge, ymax_edge;
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
auto& grid = device_ctx.grid;
|
|
|
|
ClusterBlockId bnum = cluster_ctx.clb_nlist.net_driver_block(net_id);
|
|
pnum = net_pin_to_tile_pin_index(net_id, 0);
|
|
VTR_ASSERT(pnum >= 0);
|
|
x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
|
|
y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
|
|
|
|
x = max(min<int>(x, grid.width() - 2), 1);
|
|
y = max(min<int>(y, grid.height() - 2), 1);
|
|
|
|
xmin = x;
|
|
ymin = y;
|
|
xmax = x;
|
|
ymax = y;
|
|
xmin_edge = 1;
|
|
ymin_edge = 1;
|
|
xmax_edge = 1;
|
|
ymax_edge = 1;
|
|
|
|
for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) {
|
|
bnum = cluster_ctx.clb_nlist.pin_block(pin_id);
|
|
pnum = tile_pin_index(pin_id);
|
|
x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
|
|
y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
|
|
|
|
/* Code below counts IO blocks as being within the 1..grid.width()-2, 1..grid.height()-2 clb array. *
|
|
* This is because channels do not go out of the 0..grid.width()-2, 0..grid.height()-2 range, and *
|
|
* I always take all channels impinging on the bounding box to be within *
|
|
* that bounding box. Hence, this "movement" of IO blocks does not affect *
|
|
* the which channels are included within the bounding box, and it *
|
|
* simplifies the code a lot. */
|
|
|
|
x = max(min<int>(x, grid.width() - 2), 1); //-2 for no perim channels
|
|
y = max(min<int>(y, grid.height() - 2), 1); //-2 for no perim channels
|
|
|
|
if (x == xmin) {
|
|
xmin_edge++;
|
|
}
|
|
if (x == xmax) { /* Recall that xmin could equal xmax -- don't use else */
|
|
xmax_edge++;
|
|
} else if (x < xmin) {
|
|
xmin = x;
|
|
xmin_edge = 1;
|
|
} else if (x > xmax) {
|
|
xmax = x;
|
|
xmax_edge = 1;
|
|
}
|
|
|
|
if (y == ymin) {
|
|
ymin_edge++;
|
|
}
|
|
if (y == ymax) {
|
|
ymax_edge++;
|
|
} else if (y < ymin) {
|
|
ymin = y;
|
|
ymin_edge = 1;
|
|
} else if (y > ymax) {
|
|
ymax = y;
|
|
ymax_edge = 1;
|
|
}
|
|
}
|
|
|
|
/* Copy the coordinates and number on edges information into the proper *
|
|
* structures. */
|
|
coords->xmin = xmin;
|
|
coords->xmax = xmax;
|
|
coords->ymin = ymin;
|
|
coords->ymax = ymax;
|
|
|
|
num_on_edges->xmin = xmin_edge;
|
|
num_on_edges->xmax = xmax_edge;
|
|
num_on_edges->ymin = ymin_edge;
|
|
num_on_edges->ymax = ymax_edge;
|
|
}
|
|
|
|
static double get_net_wirelength_estimate(ClusterNetId net_id, t_bb* bbptr) {
|
|
/* WMF: Finds the estimate of wirelength due to one net by looking at *
|
|
* its coordinate bounding box. */
|
|
|
|
double ncost, crossing;
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
/* Get the expected "crossing count" of a net, based on its number *
|
|
* of pins. Extrapolate for very large nets. */
|
|
|
|
if (((cluster_ctx.clb_nlist.net_pins(net_id).size()) > 50)
|
|
&& ((cluster_ctx.clb_nlist.net_pins(net_id).size()) < 85)) {
|
|
crossing = 2.7933 + 0.02616 * ((cluster_ctx.clb_nlist.net_pins(net_id).size()) - 50);
|
|
} else if ((cluster_ctx.clb_nlist.net_pins(net_id).size()) >= 85) {
|
|
crossing = 2.7933 + 0.011 * (cluster_ctx.clb_nlist.net_pins(net_id).size())
|
|
- 0.0000018 * (cluster_ctx.clb_nlist.net_pins(net_id).size())
|
|
* (cluster_ctx.clb_nlist.net_pins(net_id).size());
|
|
} else {
|
|
crossing = cross_count[cluster_ctx.clb_nlist.net_pins(net_id).size() - 1];
|
|
}
|
|
|
|
/* Could insert a check for xmin == xmax. In that case, assume *
|
|
* connection will be made with no bends and hence no x-cost. *
|
|
* Same thing for y-cost. */
|
|
|
|
/* Cost = wire length along channel * cross_count / average *
|
|
* channel capacity. Do this for x, then y direction and add. */
|
|
|
|
ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing;
|
|
|
|
ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing;
|
|
|
|
return (ncost);
|
|
}
|
|
|
|
static double get_net_cost(ClusterNetId net_id, t_bb* bbptr) {
|
|
/* Finds the cost due to one net by looking at its coordinate bounding *
|
|
* box. */
|
|
|
|
double ncost, crossing;
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
|
|
/* Get the expected "crossing count" of a net, based on its number *
|
|
* of pins. Extrapolate for very large nets. */
|
|
|
|
if ((cluster_ctx.clb_nlist.net_pins(net_id).size()) > 50) {
|
|
crossing = 2.7933 + 0.02616 * ((cluster_ctx.clb_nlist.net_pins(net_id).size()) - 50);
|
|
/* crossing = 3.0; Old value */
|
|
} else {
|
|
crossing = cross_count[(cluster_ctx.clb_nlist.net_pins(net_id).size()) - 1];
|
|
}
|
|
|
|
/* Could insert a check for xmin == xmax. In that case, assume *
|
|
* connection will be made with no bends and hence no x-cost. *
|
|
* Same thing for y-cost. */
|
|
|
|
/* Cost = wire length along channel * cross_count / average *
|
|
* channel capacity. Do this for x, then y direction and add. */
|
|
|
|
ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing
|
|
* chanx_place_cost_fac[bbptr->ymax][bbptr->ymin - 1];
|
|
|
|
ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing
|
|
* chany_place_cost_fac[bbptr->xmax][bbptr->xmin - 1];
|
|
|
|
return (ncost);
|
|
}
|
|
|
|
/* Finds the bounding box of a net and stores its coordinates in the *
|
|
* bb_coord_new data structure. This routine should only be called *
|
|
* for small nets, since it does not determine enough information for *
|
|
* the bounding box to be updated incrementally later. *
|
|
* Currently assumes channels on both sides of the CLBs forming the *
|
|
* edges of the bounding box can be used. Essentially, I am assuming *
|
|
* the pins always lie on the outside of the bounding box. */
|
|
static void get_non_updateable_bb(ClusterNetId net_id, t_bb* bb_coord_new) {
|
|
//TODO: account for multiple physical pin instances per logical pin
|
|
|
|
int xmax, ymax, xmin, ymin, x, y;
|
|
int pnum;
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
ClusterBlockId bnum = cluster_ctx.clb_nlist.net_driver_block(net_id);
|
|
pnum = net_pin_to_tile_pin_index(net_id, 0);
|
|
x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
|
|
y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
|
|
|
|
xmin = x;
|
|
ymin = y;
|
|
xmax = x;
|
|
ymax = y;
|
|
|
|
for (auto pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) {
|
|
bnum = cluster_ctx.clb_nlist.pin_block(pin_id);
|
|
pnum = tile_pin_index(pin_id);
|
|
x = place_ctx.block_locs[bnum].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum];
|
|
y = place_ctx.block_locs[bnum].loc.y + physical_tile_type(bnum)->pin_height_offset[pnum];
|
|
|
|
if (x < xmin) {
|
|
xmin = x;
|
|
} else if (x > xmax) {
|
|
xmax = x;
|
|
}
|
|
|
|
if (y < ymin) {
|
|
ymin = y;
|
|
} else if (y > ymax) {
|
|
ymax = y;
|
|
}
|
|
}
|
|
|
|
/* Now I've found the coordinates of the bounding box. There are no *
|
|
* channels beyond device_ctx.grid.width()-2 and *
|
|
* device_ctx.grid.height() - 2, so I want to clip to that. As well,*
|
|
* since I'll always include the channel immediately below and the *
|
|
* channel immediately to the left of the bounding box, I want to *
|
|
* clip to 1 in both directions as well (since minimum channel index *
|
|
* is 0). See route_common.cpp for a channel diagram. */
|
|
|
|
bb_coord_new->xmin = max(min<int>(xmin, device_ctx.grid.width() - 2), 1); //-2 for no perim channels
|
|
bb_coord_new->ymin = max(min<int>(ymin, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
|
|
bb_coord_new->xmax = max(min<int>(xmax, device_ctx.grid.width() - 2), 1); //-2 for no perim channels
|
|
bb_coord_new->ymax = max(min<int>(ymax, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
|
|
}
|
|
|
|
static void update_bb(ClusterNetId net_id, t_bb* bb_coord_new, t_bb* bb_edge_new, int xold, int yold, int xnew, int ynew) {
|
|
/* Updates the bounding box of a net by storing its coordinates in *
|
|
* the bb_coord_new data structure and the number of blocks on each *
|
|
* edge in the bb_edge_new data structure. This routine should only *
|
|
* be called for large nets, since it has some overhead relative to *
|
|
* just doing a brute force bounding box calculation. The bounding *
|
|
* box coordinate and edge information for inet must be valid before *
|
|
* this routine is called. *
|
|
* Currently assumes channels on both sides of the CLBs forming the *
|
|
* edges of the bounding box can be used. Essentially, I am assuming *
|
|
* the pins always lie on the outside of the bounding box. *
|
|
* The x and y coordinates are the pin's x and y coordinates. */
|
|
/* IO blocks are considered to be one cell in for simplicity. */
|
|
//TODO: account for multiple physical pin instances per logical pin
|
|
|
|
t_bb *curr_bb_edge, *curr_bb_coord;
|
|
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
xnew = max(min<int>(xnew, device_ctx.grid.width() - 2), 1); //-2 for no perim channels
|
|
ynew = max(min<int>(ynew, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
|
|
xold = max(min<int>(xold, device_ctx.grid.width() - 2), 1); //-2 for no perim channels
|
|
yold = max(min<int>(yold, device_ctx.grid.height() - 2), 1); //-2 for no perim channels
|
|
|
|
/* Check if the net had been updated before. */
|
|
if (bb_updated_before[net_id] == GOT_FROM_SCRATCH) {
|
|
/* The net had been updated from scratch, DO NOT update again! */
|
|
return;
|
|
} else if (bb_updated_before[net_id] == NOT_UPDATED_YET) {
|
|
/* The net had NOT been updated before, could use the old values */
|
|
curr_bb_coord = &bb_coords[net_id];
|
|
curr_bb_edge = &bb_num_on_edges[net_id];
|
|
bb_updated_before[net_id] = UPDATED_ONCE;
|
|
} else {
|
|
/* The net had been updated before, must use the new values */
|
|
curr_bb_coord = bb_coord_new;
|
|
curr_bb_edge = bb_edge_new;
|
|
}
|
|
|
|
/* Check if I can update the bounding box incrementally. */
|
|
|
|
if (xnew < xold) { /* Move to left. */
|
|
|
|
/* Update the xmax fields for coordinates and number of edges first. */
|
|
|
|
if (xold == curr_bb_coord->xmax) { /* Old position at xmax. */
|
|
if (curr_bb_edge->xmax == 1) {
|
|
get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new);
|
|
bb_updated_before[net_id] = GOT_FROM_SCRATCH;
|
|
return;
|
|
} else {
|
|
bb_edge_new->xmax = curr_bb_edge->xmax - 1;
|
|
bb_coord_new->xmax = curr_bb_coord->xmax;
|
|
}
|
|
} else { /* Move to left, old postion was not at xmax. */
|
|
bb_coord_new->xmax = curr_bb_coord->xmax;
|
|
bb_edge_new->xmax = curr_bb_edge->xmax;
|
|
}
|
|
|
|
/* Now do the xmin fields for coordinates and number of edges. */
|
|
|
|
if (xnew < curr_bb_coord->xmin) { /* Moved past xmin */
|
|
bb_coord_new->xmin = xnew;
|
|
bb_edge_new->xmin = 1;
|
|
} else if (xnew == curr_bb_coord->xmin) { /* Moved to xmin */
|
|
bb_coord_new->xmin = xnew;
|
|
bb_edge_new->xmin = curr_bb_edge->xmin + 1;
|
|
} else { /* Xmin unchanged. */
|
|
bb_coord_new->xmin = curr_bb_coord->xmin;
|
|
bb_edge_new->xmin = curr_bb_edge->xmin;
|
|
}
|
|
/* End of move to left case. */
|
|
|
|
} else if (xnew > xold) { /* Move to right. */
|
|
|
|
/* Update the xmin fields for coordinates and number of edges first. */
|
|
|
|
if (xold == curr_bb_coord->xmin) { /* Old position at xmin. */
|
|
if (curr_bb_edge->xmin == 1) {
|
|
get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new);
|
|
bb_updated_before[net_id] = GOT_FROM_SCRATCH;
|
|
return;
|
|
} else {
|
|
bb_edge_new->xmin = curr_bb_edge->xmin - 1;
|
|
bb_coord_new->xmin = curr_bb_coord->xmin;
|
|
}
|
|
} else { /* Move to right, old position was not at xmin. */
|
|
bb_coord_new->xmin = curr_bb_coord->xmin;
|
|
bb_edge_new->xmin = curr_bb_edge->xmin;
|
|
}
|
|
|
|
/* Now do the xmax fields for coordinates and number of edges. */
|
|
|
|
if (xnew > curr_bb_coord->xmax) { /* Moved past xmax. */
|
|
bb_coord_new->xmax = xnew;
|
|
bb_edge_new->xmax = 1;
|
|
} else if (xnew == curr_bb_coord->xmax) { /* Moved to xmax */
|
|
bb_coord_new->xmax = xnew;
|
|
bb_edge_new->xmax = curr_bb_edge->xmax + 1;
|
|
} else { /* Xmax unchanged. */
|
|
bb_coord_new->xmax = curr_bb_coord->xmax;
|
|
bb_edge_new->xmax = curr_bb_edge->xmax;
|
|
}
|
|
/* End of move to right case. */
|
|
|
|
} else { /* xnew == xold -- no x motion. */
|
|
bb_coord_new->xmin = curr_bb_coord->xmin;
|
|
bb_coord_new->xmax = curr_bb_coord->xmax;
|
|
bb_edge_new->xmin = curr_bb_edge->xmin;
|
|
bb_edge_new->xmax = curr_bb_edge->xmax;
|
|
}
|
|
|
|
/* Now account for the y-direction motion. */
|
|
|
|
if (ynew < yold) { /* Move down. */
|
|
|
|
/* Update the ymax fields for coordinates and number of edges first. */
|
|
|
|
if (yold == curr_bb_coord->ymax) { /* Old position at ymax. */
|
|
if (curr_bb_edge->ymax == 1) {
|
|
get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new);
|
|
bb_updated_before[net_id] = GOT_FROM_SCRATCH;
|
|
return;
|
|
} else {
|
|
bb_edge_new->ymax = curr_bb_edge->ymax - 1;
|
|
bb_coord_new->ymax = curr_bb_coord->ymax;
|
|
}
|
|
} else { /* Move down, old postion was not at ymax. */
|
|
bb_coord_new->ymax = curr_bb_coord->ymax;
|
|
bb_edge_new->ymax = curr_bb_edge->ymax;
|
|
}
|
|
|
|
/* Now do the ymin fields for coordinates and number of edges. */
|
|
|
|
if (ynew < curr_bb_coord->ymin) { /* Moved past ymin */
|
|
bb_coord_new->ymin = ynew;
|
|
bb_edge_new->ymin = 1;
|
|
} else if (ynew == curr_bb_coord->ymin) { /* Moved to ymin */
|
|
bb_coord_new->ymin = ynew;
|
|
bb_edge_new->ymin = curr_bb_edge->ymin + 1;
|
|
} else { /* ymin unchanged. */
|
|
bb_coord_new->ymin = curr_bb_coord->ymin;
|
|
bb_edge_new->ymin = curr_bb_edge->ymin;
|
|
}
|
|
/* End of move down case. */
|
|
|
|
} else if (ynew > yold) { /* Moved up. */
|
|
|
|
/* Update the ymin fields for coordinates and number of edges first. */
|
|
|
|
if (yold == curr_bb_coord->ymin) { /* Old position at ymin. */
|
|
if (curr_bb_edge->ymin == 1) {
|
|
get_bb_from_scratch(net_id, bb_coord_new, bb_edge_new);
|
|
bb_updated_before[net_id] = GOT_FROM_SCRATCH;
|
|
return;
|
|
} else {
|
|
bb_edge_new->ymin = curr_bb_edge->ymin - 1;
|
|
bb_coord_new->ymin = curr_bb_coord->ymin;
|
|
}
|
|
} else { /* Moved up, old position was not at ymin. */
|
|
bb_coord_new->ymin = curr_bb_coord->ymin;
|
|
bb_edge_new->ymin = curr_bb_edge->ymin;
|
|
}
|
|
|
|
/* Now do the ymax fields for coordinates and number of edges. */
|
|
|
|
if (ynew > curr_bb_coord->ymax) { /* Moved past ymax. */
|
|
bb_coord_new->ymax = ynew;
|
|
bb_edge_new->ymax = 1;
|
|
} else if (ynew == curr_bb_coord->ymax) { /* Moved to ymax */
|
|
bb_coord_new->ymax = ynew;
|
|
bb_edge_new->ymax = curr_bb_edge->ymax + 1;
|
|
} else { /* ymax unchanged. */
|
|
bb_coord_new->ymax = curr_bb_coord->ymax;
|
|
bb_edge_new->ymax = curr_bb_edge->ymax;
|
|
}
|
|
/* End of move up case. */
|
|
|
|
} else { /* ynew == yold -- no y motion. */
|
|
bb_coord_new->ymin = curr_bb_coord->ymin;
|
|
bb_coord_new->ymax = curr_bb_coord->ymax;
|
|
bb_edge_new->ymin = curr_bb_edge->ymin;
|
|
bb_edge_new->ymax = curr_bb_edge->ymax;
|
|
}
|
|
|
|
if (bb_updated_before[net_id] == NOT_UPDATED_YET) {
|
|
bb_updated_before[net_id] = UPDATED_ONCE;
|
|
}
|
|
}
|
|
|
|
static void free_fast_cost_update() {
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
for (size_t i = 0; i < device_ctx.grid.height(); i++) {
|
|
free(chanx_place_cost_fac[i]);
|
|
}
|
|
free(chanx_place_cost_fac);
|
|
chanx_place_cost_fac = nullptr;
|
|
|
|
for (size_t i = 0; i < device_ctx.grid.width(); i++) {
|
|
free(chany_place_cost_fac[i]);
|
|
}
|
|
free(chany_place_cost_fac);
|
|
chany_place_cost_fac = nullptr;
|
|
}
|
|
|
|
static void alloc_and_load_for_fast_cost_update(float place_cost_exp) {
|
|
/* Allocates and loads the chanx_place_cost_fac and chany_place_cost_fac *
|
|
* arrays with the inverse of the average number of tracks per channel *
|
|
* between [subhigh] and [sublow]. This is only useful for the cost *
|
|
* function that takes the length of the net bounding box in each *
|
|
* dimension divided by the average number of tracks in that direction. *
|
|
* For other cost functions, you don't have to bother calling this *
|
|
* routine; when using the cost function described above, however, you *
|
|
* must always call this routine after you call init_chan and before *
|
|
* you do any placement cost determination. The place_cost_exp factor *
|
|
* specifies to what power the width of the channel should be taken -- *
|
|
* larger numbers make narrower channels more expensive. */
|
|
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
/* Access arrays below as chan?_place_cost_fac[subhigh][sublow]. Since *
|
|
* subhigh must be greater than or equal to sublow, we only need to *
|
|
* allocate storage for the lower half of a matrix. */
|
|
|
|
chanx_place_cost_fac = (float**)vtr::malloc((device_ctx.grid.height()) * sizeof(float*));
|
|
for (size_t i = 0; i < device_ctx.grid.height(); i++)
|
|
chanx_place_cost_fac[i] = (float*)vtr::malloc((i + 1) * sizeof(float));
|
|
|
|
chany_place_cost_fac = (float**)vtr::malloc((device_ctx.grid.width() + 1) * sizeof(float*));
|
|
for (size_t i = 0; i < device_ctx.grid.width(); i++)
|
|
chany_place_cost_fac[i] = (float*)vtr::malloc((i + 1) * sizeof(float));
|
|
|
|
/* First compute the number of tracks between channel high and channel *
|
|
* low, inclusive, in an efficient manner. */
|
|
|
|
chanx_place_cost_fac[0][0] = device_ctx.chan_width.x_list[0];
|
|
|
|
for (size_t high = 1; high < device_ctx.grid.height(); high++) {
|
|
chanx_place_cost_fac[high][high] = device_ctx.chan_width.x_list[high];
|
|
for (size_t low = 0; low < high; low++) {
|
|
chanx_place_cost_fac[high][low] = chanx_place_cost_fac[high - 1][low] + device_ctx.chan_width.x_list[high];
|
|
}
|
|
}
|
|
|
|
/* Now compute the inverse of the average number of tracks per channel *
|
|
* between high and low. The cost function divides by the average *
|
|
* number of tracks per channel, so by storing the inverse I convert *
|
|
* this to a faster multiplication. Take this final number to the *
|
|
* place_cost_exp power -- numbers other than one mean this is no *
|
|
* longer a simple "average number of tracks"; it is some power of *
|
|
* that, allowing greater penalization of narrow channels. */
|
|
|
|
for (size_t high = 0; high < device_ctx.grid.height(); high++)
|
|
for (size_t low = 0; low <= high; low++) {
|
|
chanx_place_cost_fac[high][low] = (high - low + 1.)
|
|
/ chanx_place_cost_fac[high][low];
|
|
chanx_place_cost_fac[high][low] = pow((double)chanx_place_cost_fac[high][low], (double)place_cost_exp);
|
|
}
|
|
|
|
/* Now do the same thing for the y-directed channels. First get the *
|
|
* number of tracks between channel high and channel low, inclusive. */
|
|
|
|
chany_place_cost_fac[0][0] = device_ctx.chan_width.y_list[0];
|
|
|
|
for (size_t high = 1; high < device_ctx.grid.width(); high++) {
|
|
chany_place_cost_fac[high][high] = device_ctx.chan_width.y_list[high];
|
|
for (size_t low = 0; low < high; low++) {
|
|
chany_place_cost_fac[high][low] = chany_place_cost_fac[high - 1][low] + device_ctx.chan_width.y_list[high];
|
|
}
|
|
}
|
|
|
|
/* Now compute the inverse of the average number of tracks per channel *
|
|
* between high and low. Take to specified power. */
|
|
|
|
for (size_t high = 0; high < device_ctx.grid.width(); high++)
|
|
for (size_t low = 0; low <= high; low++) {
|
|
chany_place_cost_fac[high][low] = (high - low + 1.)
|
|
/ chany_place_cost_fac[high][low];
|
|
chany_place_cost_fac[high][low] = pow((double)chany_place_cost_fac[high][low], (double)place_cost_exp);
|
|
}
|
|
}
|
|
|
|
static void check_place(const t_placer_costs& costs,
|
|
const PlaceDelayModel* delay_model,
|
|
enum e_place_algorithm place_algorithm) {
|
|
/* Checks that the placement has not confused our data structures. *
|
|
* i.e. the clb and block structures agree about the locations of *
|
|
* every block, blocks are in legal spots, etc. Also recomputes *
|
|
* the final placement cost from scratch and makes sure it is *
|
|
* within roundoff of what we think the cost is. */
|
|
|
|
int error = 0;
|
|
|
|
error += check_placement_consistency();
|
|
error += check_placement_costs(costs, delay_model, place_algorithm);
|
|
|
|
if (error == 0) {
|
|
VTR_LOG("\n");
|
|
VTR_LOG("Completed placement consistency check successfully.\n");
|
|
|
|
} else {
|
|
VPR_ERROR(VPR_ERROR_PLACE,
|
|
"\nCompleted placement consistency check, %d errors found.\n"
|
|
"Aborting program.\n",
|
|
error);
|
|
}
|
|
}
|
|
|
|
static int check_placement_costs(const t_placer_costs& costs,
|
|
const PlaceDelayModel* delay_model,
|
|
enum e_place_algorithm place_algorithm) {
|
|
int error = 0;
|
|
double bb_cost_check;
|
|
double timing_cost_check;
|
|
|
|
bb_cost_check = comp_bb_cost(CHECK);
|
|
if (fabs(bb_cost_check - costs.bb_cost) > costs.bb_cost * ERROR_TOL) {
|
|
VTR_LOG_ERROR("bb_cost_check: %g and bb_cost: %g differ in check_place.\n",
|
|
bb_cost_check, costs.bb_cost);
|
|
error++;
|
|
}
|
|
|
|
if (place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
|
|
comp_td_costs(delay_model, &timing_cost_check);
|
|
//VTR_LOG("timing_cost recomputed from scratch: %g\n", timing_cost_check);
|
|
if (fabs(timing_cost_check - costs.timing_cost) > costs.timing_cost * ERROR_TOL) {
|
|
VTR_LOG_ERROR("timing_cost_check: %g and timing_cost: %g differ in check_place.\n",
|
|
timing_cost_check, costs.timing_cost);
|
|
error++;
|
|
}
|
|
}
|
|
return error;
|
|
}
|
|
|
|
static int check_placement_consistency() {
|
|
return check_block_placement_consistency() + check_macro_placement_consistency();
|
|
}
|
|
|
|
static int check_block_placement_consistency() {
|
|
int error = 0;
|
|
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
vtr::vector<ClusterBlockId, int> bdone(cluster_ctx.clb_nlist.blocks().size(), 0);
|
|
|
|
/* Step through device grid and placement. Check it against blocks */
|
|
for (size_t i = 0; i < device_ctx.grid.width(); i++)
|
|
for (size_t j = 0; j < device_ctx.grid.height(); j++) {
|
|
if (place_ctx.grid_blocks[i][j].usage > device_ctx.grid[i][j].type->capacity) {
|
|
VTR_LOG_ERROR("Block at grid location (%zu,%zu) overused. Usage is %d.\n",
|
|
i, j, place_ctx.grid_blocks[i][j].usage);
|
|
error++;
|
|
}
|
|
int usage_check = 0;
|
|
for (int k = 0; k < device_ctx.grid[i][j].type->capacity; k++) {
|
|
auto bnum = place_ctx.grid_blocks[i][j].blocks[k];
|
|
if (EMPTY_BLOCK_ID == bnum || INVALID_BLOCK_ID == bnum)
|
|
continue;
|
|
|
|
if (physical_tile_type(bnum) != device_ctx.grid[i][j].type) {
|
|
VTR_LOG_ERROR("Block %zu type (%s) does not match grid location (%zu,%zu) type (%s).\n",
|
|
size_t(bnum), cluster_ctx.clb_nlist.block_type(bnum)->name, i, j, device_ctx.grid[i][j].type->name);
|
|
error++;
|
|
}
|
|
if ((place_ctx.block_locs[bnum].loc.x != int(i)) || (place_ctx.block_locs[bnum].loc.y != int(j))) {
|
|
VTR_LOG_ERROR("Block %zu's location is (%d,%d,%d) but found in grid at (%zu,%zu,%d).\n",
|
|
size_t(bnum), place_ctx.block_locs[bnum].loc.x, place_ctx.block_locs[bnum].loc.y, place_ctx.block_locs[bnum].loc.z,
|
|
i, j, k);
|
|
error++;
|
|
}
|
|
++usage_check;
|
|
bdone[bnum]++;
|
|
}
|
|
if (usage_check != place_ctx.grid_blocks[i][j].usage) {
|
|
VTR_LOG_ERROR("Location (%zu,%zu) usage is %d, but has actual usage %d.\n",
|
|
i, j, place_ctx.grid_blocks[i][j].usage, usage_check);
|
|
error++;
|
|
}
|
|
}
|
|
|
|
/* Check that every block exists in the device_ctx.grid and cluster_ctx.blocks arrays somewhere. */
|
|
for (auto blk_id : cluster_ctx.clb_nlist.blocks())
|
|
if (bdone[blk_id] != 1) {
|
|
VTR_LOG_ERROR("Block %zu listed %d times in data structures.\n",
|
|
size_t(blk_id), bdone[blk_id]);
|
|
error++;
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
int check_macro_placement_consistency() {
|
|
int error = 0;
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
|
|
auto& pl_macros = place_ctx.pl_macros;
|
|
|
|
/* Check the pl_macro placement are legal - blocks are in the proper relative position. */
|
|
for (size_t imacro = 0; imacro < place_ctx.pl_macros.size(); imacro++) {
|
|
auto head_iblk = pl_macros[imacro].members[0].blk_index;
|
|
|
|
for (size_t imember = 0; imember < pl_macros[imacro].members.size(); imember++) {
|
|
auto member_iblk = pl_macros[imacro].members[imember].blk_index;
|
|
|
|
// Compute the suppossed member's x,y,z location
|
|
t_pl_loc member_pos = place_ctx.block_locs[head_iblk].loc + pl_macros[imacro].members[imember].offset;
|
|
|
|
// Check the place_ctx.block_locs data structure first
|
|
if (place_ctx.block_locs[member_iblk].loc != member_pos) {
|
|
VTR_LOG_ERROR("Block %zu in pl_macro #%zu is not placed in the proper orientation.\n",
|
|
size_t(member_iblk), imacro);
|
|
error++;
|
|
}
|
|
|
|
// Then check the place_ctx.grid data structure
|
|
if (place_ctx.grid_blocks[member_pos.x][member_pos.y].blocks[member_pos.z] != member_iblk) {
|
|
VTR_LOG_ERROR("Block %zu in pl_macro #%zu is not placed in the proper orientation.\n",
|
|
size_t(member_iblk), imacro);
|
|
error++;
|
|
}
|
|
} // Finish going through all the members
|
|
} // Finish going through all the macros
|
|
return error;
|
|
}
|
|
|
|
#ifdef VERBOSE
|
|
static void print_clb_placement(const char* fname) {
|
|
/* Prints out the clb placements to a file. */
|
|
FILE* fp;
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
|
|
fp = vtr::fopen(fname, "w");
|
|
fprintf(fp, "Complex block placements:\n\n");
|
|
|
|
fprintf(fp, "Block #\tName\t(X, Y, Z).\n");
|
|
for (auto i : cluster_ctx.clb_nlist.blocks()) {
|
|
fprintf(fp, "#%d\t%s\t(%d, %d, %d).\n", i, cluster_ctx.clb_nlist.block_name(i), place_ctx.block_locs[i].x, place_ctx.block_locs[i].y, place_ctx.block_locs[i].z);
|
|
}
|
|
|
|
fclose(fp);
|
|
}
|
|
#endif
|
|
|
|
static void free_try_swap_arrays() {
|
|
g_vpr_ctx.mutable_placement().compressed_block_grids.clear();
|
|
}
|
|
|
|
static void calc_placer_stats(t_placer_statistics& stats, float& success_rat, double& std_dev, const t_placer_costs& costs, const int move_lim) {
|
|
success_rat = ((float)stats.success_sum) / move_lim;
|
|
if (stats.success_sum == 0) {
|
|
stats.av_cost = costs.cost;
|
|
stats.av_bb_cost = costs.bb_cost;
|
|
stats.av_timing_cost = costs.timing_cost;
|
|
} else {
|
|
stats.av_cost /= stats.success_sum;
|
|
stats.av_bb_cost /= stats.success_sum;
|
|
stats.av_timing_cost /= stats.success_sum;
|
|
}
|
|
|
|
std_dev = get_std_dev(stats.success_sum, stats.sum_of_squares, stats.av_cost);
|
|
}
|
|
|
|
static void generate_post_place_timing_reports(const t_placer_opts& placer_opts,
|
|
const t_analysis_opts& analysis_opts,
|
|
const SetupTimingInfo& timing_info,
|
|
const PlacementDelayCalculator& delay_calc) {
|
|
auto& timing_ctx = g_vpr_ctx.timing();
|
|
auto& atom_ctx = g_vpr_ctx.atom();
|
|
|
|
VprTimingGraphResolver resolver(atom_ctx.nlist, atom_ctx.lookup, *timing_ctx.graph, delay_calc);
|
|
resolver.set_detail_level(analysis_opts.timing_report_detail);
|
|
|
|
tatum::TimingReporter timing_reporter(resolver, *timing_ctx.graph, *timing_ctx.constraints);
|
|
|
|
timing_reporter.report_timing_setup(placer_opts.post_place_timing_report_file, *timing_info.setup_analyzer(), analysis_opts.timing_report_npaths);
|
|
}
|
|
|
|
#if 0
|
|
static void update_screen_debug();
|
|
|
|
//Performs a major (i.e. interactive) placement screen update.
|
|
//This function with no arguments is useful for calling from a debugger to
|
|
//look at the intermediate implemetnation state.
|
|
static void update_screen_debug() {
|
|
update_screen(ScreenUpdatePriority::MAJOR, "DEBUG", PLACEMENT, nullptr);
|
|
}
|
|
#endif
|
|
|
|
static void print_place_status_header() {
|
|
VTR_LOG("------- ------- ---------- ---------- ------- ---------- -------- ------- ------- ------ -------- --------- ------\n");
|
|
VTR_LOG(" T Av Cost Av BB Cost Av TD Cost CPD sTNS sWNS Ac Rate Std Dev R lim Crit Exp Tot Moves Alpha\n");
|
|
VTR_LOG("------- ------- ---------- ---------- ------- ---------- -------- ------- ------- ------ -------- --------- ------\n");
|
|
}
|
|
|
|
static void print_place_status(const float t,
|
|
const float oldt,
|
|
const t_placer_statistics& stats,
|
|
const float cpd,
|
|
const float sTNS,
|
|
const float sWNS,
|
|
const float acc_rate,
|
|
const float std_dev,
|
|
const float rlim,
|
|
const float crit_exponent,
|
|
size_t tot_moves) {
|
|
VTR_LOG(
|
|
"%7.1e "
|
|
"%7.3f %10.2f %-10.5g "
|
|
"%7.3f % 10.3g % 8.3f "
|
|
"%7.3f %7.4f %6.1f %8.2f",
|
|
oldt,
|
|
stats.av_cost, stats.av_bb_cost, stats.av_timing_cost,
|
|
1e9 * cpd, 1e9 * sTNS, 1e9 * sWNS,
|
|
acc_rate, std_dev, rlim, crit_exponent);
|
|
|
|
pretty_print_uint(" ", tot_moves, 10, 3);
|
|
|
|
VTR_LOG(" %6.3f\n", t / oldt);
|
|
fflush(stdout);
|
|
}
|
|
|
|
static void print_resources_utilization() {
|
|
auto& place_ctx = g_vpr_ctx.placement();
|
|
auto& cluster_ctx = g_vpr_ctx.clustering();
|
|
auto& device_ctx = g_vpr_ctx.device();
|
|
|
|
int max_block_name = 0;
|
|
int max_tile_name = 0;
|
|
|
|
//Record the resource requirement
|
|
std::map<t_logical_block_type_ptr, size_t> num_type_instances;
|
|
std::map<t_logical_block_type_ptr, std::map<t_physical_tile_type_ptr, size_t>> num_placed_instances;
|
|
for (auto blk_id : cluster_ctx.clb_nlist.blocks()) {
|
|
auto block_loc = place_ctx.block_locs[blk_id];
|
|
auto loc = block_loc.loc;
|
|
|
|
auto physical_tile = device_ctx.grid[loc.x][loc.y].type;
|
|
auto logical_block = cluster_ctx.clb_nlist.block_type(blk_id);
|
|
|
|
num_type_instances[logical_block]++;
|
|
num_placed_instances[logical_block][physical_tile]++;
|
|
|
|
max_block_name = std::max<int>(max_block_name, strlen(logical_block->name));
|
|
max_tile_name = std::max<int>(max_tile_name, strlen(physical_tile->name));
|
|
}
|
|
|
|
VTR_LOG("\n");
|
|
VTR_LOG("Placement resource usage:\n");
|
|
for (auto logical_block : num_type_instances) {
|
|
for (auto physical_tile : num_placed_instances[logical_block.first]) {
|
|
VTR_LOG(" %-*s implemented as %-*s: %d\n", max_block_name, logical_block.first->name, max_tile_name, physical_tile.first->name, physical_tile.second);
|
|
}
|
|
}
|
|
VTR_LOG("\n");
|
|
}
|