Import Coloquinte as a Coriolis tool (merge from github/coloquinte).

2015-04-08 17:13:50 +02:00 · 2015-04-08 17:13:50 +02:00 · cdfdccb19d
parent 1625b16d4c 5d2845b80a
commit cdfdccb19d
33 changed files with 45037 additions and 39 deletions
--- a/bootstrap/build.conf
+++ b/bootstrap/build.conf
@ -5,11 +5,8 @@

 projectdir = 'coriolis-2.x'

-projects = [ { 'name'      :   "importeds"
-             , 'tools'     : [ "Coloquinte" ]
-             , 'repository': 'https://github.com/alnurn/Coloquinte' }
-
-           , { 'name'      :   "coriolis"
+projects = [ 
+             { 'name'      :   "coriolis"
             , 'tools'     : [ "bootstrap"
                             , "vlsisapd"
                             , "hurricane"
@ -17,6 +14,7 @@ projects = [ { 'name'      :   "importeds"
                            #, "nimbus"
                            #, "metis"
                            #, "mauka"
+                             , "coloquinte"
                             , "etesian"
                             , "knik"
                             , "katabatic"
--- a/coloquinte/CMakeLists.txt
+++ b/coloquinte/CMakeLists.txt
@ -0,0 +1,36 @@
+# -*- explicit-buffer-name: "CMakeLists.txt<coloquinte>" -*-
+
+ set(CMAKE_LEGACY_CYGWIN_WIN32 0)
+ project(COLOQUINTE)
+
+ option(BUILD_DOC      "Build the documentation (doxygen)" OFF)
+ 
+ cmake_minimum_required(VERSION 2.8.9)
+ # Make the CMake modules found under CORIOLIS_TOP visible before looking for Bootstrap.
+ list(INSERT CMAKE_MODULE_PATH 0 "${DESTDIR}$ENV{CORIOLIS_TOP}/share/cmake/Modules/")
+ find_package(Bootstrap  REQUIRED)
+ setup_project_paths(CORIOLIS)
+ setup_qt()
+ 
+ set_cmake_policies()
+ set_lib_link_mode()
+ setup_boost(program_options python regex)
+ 
+ find_package(LibXml2            REQUIRED)
+ find_package(PythonLibs         REQUIRED)
+ find_package(PythonSitePackages REQUIRED)
+ find_package(VLSISAPD           REQUIRED)
+ find_package(LEFDEF             REQUIRED)
+ find_package(HURRICANE          REQUIRED)
+ find_package(CORIOLIS           REQUIRED)
+ find_package(Libexecinfo        REQUIRED)
+ 
+ add_subdirectory(src)
+ add_subdirectory(cmake_modules)
+ # Doxygen is only looked for on request; the doc subdirectory itself is still disabled.
+ if(BUILD_DOC)
+   find_package(Doxygen)
+#  if(DOXYGEN_FOUND)
+#    add_subdirectory(doc)
+#  endif()
+ endif()
--- a/coloquinte/cmake_modules/CMakeLists.txt
+++ b/coloquinte/cmake_modules/CMakeLists.txt
@ -0,0 +1,2 @@
+# Install the Coloquinte find-module next to the other CMake modules.
+install ( FILES  FindCOLOQUINTE.cmake  DESTINATION share/cmake/Modules ) 
--- a/coloquinte/cmake_modules/FindCOLOQUINTE.cmake
+++ b/coloquinte/cmake_modules/FindCOLOQUINTE.cmake
@ -0,0 +1,37 @@
+# - Find the Coloquinte includes and libraries.
+# The following variables are set if Coloquinte is found.  If COLOQUINTE is not
+# found, COLOQUINTE_FOUND is set to false.
+#  COLOQUINTE_FOUND       - True when the Coloquinte include directory is found.
+#  COLOQUINTE_INCLUDE_DIR - the path to where the Coloquinte include files are.
+#  COLOQUINTE_LIBRARIES   - The path to where the Coloquinte library files are.
+
+
+SET(COLOQUINTE_INCLUDE_PATH_DESCRIPTION "directory containing the Coloquinte include files. E.g /usr/local/include/coriolis2 or /asim/coriolis/include/coriolis2")
+SET(COLOQUINTE_LIBRARY_PATH_DESCRIPTION "directory containing the Coloquinte library")
+
+SET(COLOQUINTE_DIR_MESSAGE "Set the COLOQUINTE_INCLUDE_DIR cmake cache entry to the ${COLOQUINTE_INCLUDE_PATH_DESCRIPTION}")
+
+# don't even bother under WIN32
+IF(UNIX)
+  #
+  # Look for an installation.
+  #
+  FIND_PATH(COLOQUINTE_INCLUDE_PATH NAMES coloquinte/netlist.hxx PATHS
+    # Look in other places.
+    ${CORIOLIS_DIR_SEARCH}
+    # The headers are installed under include/coriolis2/coloquinte by
+    # coloquinte/src/CMakeLists.txt; include/coriolis is kept as a fallback.
+    PATH_SUFFIXES include/coriolis2 include/coriolis
+    # Help the user find it if we cannot.
+    DOC "The ${COLOQUINTE_INCLUDE_PATH_DESCRIPTION}"
+  )
+
+  FIND_LIBRARY(COLOQUINTE_LIBRARY_PATH
+    NAMES coloquinte
+    PATHS ${CORIOLIS_DIR_SEARCH}
+    PATH_SUFFIXES lib${LIB_SUFFIX}
+    # Help the user find it if we cannot.
+    DOC "The ${COLOQUINTE_LIBRARY_PATH_DESCRIPTION}"
+  )
+
+  SET_LIBRARIES_PATH(COLOQUINTE COLOQUINTE)
+  HURRICANE_CHECK_LIBRARIES(COLOQUINTE)
+
+ENDIF(UNIX)
--- a/coloquinte/src/CMakeLists.txt
+++ b/coloquinte/src/CMakeLists.txt
@ -0,0 +1,42 @@
+# -*- explicit-buffer-name: "CMakeLists.txt<Coloquinte/src>" -*-
+
+include_directories( ${COLOQUINTE_SOURCE_DIR}/src
+                     ${CORIOLIS_INCLUDE_DIR} 
+                     ${HURRICANE_INCLUDE_DIR} 
+                     ${CONFIGURATION_INCLUDE_DIR} 
+                   )
+# Public headers of the library (installed at the end of this file).
+set ( includes       coloquinte/circuit.hxx
+                     coloquinte/circuit_helper.hxx
+                     coloquinte/common.hxx
+                     coloquinte/netlist.hxx
+                     coloquinte/solvers.hxx
+                     coloquinte/rough_legalizers.hxx
+                     coloquinte/legalizer.hxx
+                     coloquinte/detailed.hxx
+                     coloquinte/topologies.hxx
+                     coloquinte/optimization_subproblems.hxx
+                     coloquinte/piecewise_linear.hxx
+    )	           
+set ( cpps           circuit.cxx
+                     checkers.cxx
+                     rough_legalizers.cxx
+                     solvers.cxx
+                     optimization_subproblems.cxx
+                     piecewise_linear.cxx
+                     orientation.cxx
+                     detailed.cxx
+                     cell_swapping.cxx
+                     MCF_opt.cxx
+                     row_opt.cxx
+                     topologies.cxx
+                     lookup_table.cxx
+                     legalizer.cxx
+    )
+# Library target built from the sources listed in cpps.
+         add_library ( coloquinte       ${cpps} )
+set_target_properties( coloquinte       PROPERTIES VERSION 1.0 SOVERSION 1 )
+# Installation of the library and of the public headers.
+install( TARGETS coloquinte  DESTINATION lib${LIB_SUFFIX} )
+install( FILES ${includes}   DESTINATION include/coriolis2/coloquinte ) 
+
--- a/coloquinte/src/MCF_opt.cxx
+++ b/coloquinte/src/MCF_opt.cxx
@ -0,0 +1,147 @@
+
+#include "coloquinte/detailed.hxx"
+#include "coloquinte/circuit_helper.hxx"
+
+#include <lemon/smart_graph.h>
+#include <lemon/network_simplex.h>
+
+#include <cassert>
+
+namespace coloquinte{
+namespace dp{
+
+// Optimizes the x positions of the X-movable cells at fixed topology, i.e. under the
+// neighbour constraints stored in pl, in order to reduce the wirelength (the nets are
+// weighted by their weight).
+//
+// circuit: the netlist (cells, nets and pins).
+// pl:      the detailed placement; pl.plt_.positions_[c].x_ is rewritten for every X-movable cell.
+//
+// The problem is written as a minimum cost flow and solved with the Lemon network simplex;
+// abort() is called if the solver does not report an optimal solution.
+void optimize_on_topology_HPWL(netlist const & circuit, detailed_placement & pl){
+    // Solves a minimum cost flow problem to optimize the placement at fixed topology
+    // Concretely, it means aligning the pins to minimize the wirelength
+    // It uses the Lemon network simplex solver from the Coin-OR initiative, which should scale well up to hundreds of thousands of cells
+
+    using namespace lemon;
+    DIGRAPH_TYPEDEFS(SmartDigraph);
+    // Create a graph with the cells and bounds of the nets as node
+    SmartDigraph g;
+
+    // Only the X-movable cells get a node of their own
+    std::vector<Node> cell_nodes(circuit.cell_cnt());
+    for(index_t i=0; i<circuit.cell_cnt(); ++i){
+        if((circuit.get_cell(i).attributes & XMovable) != 0)
+            cell_nodes[i] = g.addNode();
+    }
+    // Two nodes (lower and upper bound) for each non-empty net
+    std::vector<Node> Lnet_nodes(circuit.net_cnt()), Unet_nodes(circuit.net_cnt());
+    for(index_t i=0; i<circuit.net_cnt(); ++i){
+        if(circuit.get_net(i).pin_cnt > 0){
+            Lnet_nodes[i] = g.addNode();
+            Unet_nodes[i] = g.addNode();
+        }
+    }
+
+    // A single reference node for every constraint against a fixed position
+    Node fixed = g.addNode();
+
+    typedef std::pair<SmartDigraph::Arc, int_t> arc_pair;
+    typedef std::pair<SmartDigraph::Node, int_t> node_pair;
+    // The arcs corresponding to constraints of the original problem
+    std::vector<arc_pair> constraint_arcs;
+
+    // Now we add every positional constraint, which becomes an arc in the min-cost flow problem
+    for(index_t i=0; i<circuit.cell_cnt(); ++i){ // The cells
+        for(index_t l = pl.neighbours_limits_[i]; l < pl.neighbours_limits_[i+1]; ++l){
+            index_t oi = pl.neighbours_[l].second;
+            if(oi == null_ind) continue;
+            
+            if((circuit.get_cell(i).attributes & XMovable) != 0 and (circuit.get_cell(oi).attributes & XMovable) != 0){
+                // Two movable cells: OK
+                auto A = g.addArc(cell_nodes[oi], cell_nodes[i]);
+                constraint_arcs.push_back(arc_pair(A, -circuit.get_cell(i).size.x_));
+            }
+            else if((circuit.get_cell( i).attributes & XMovable) != 0){
+                // The cell i is movable and constrained on the right by the fixed cell oi
+                auto A = g.addArc(fixed, cell_nodes[i]);
+                constraint_arcs.push_back(arc_pair(A, pl.plt_.positions_[oi].x_ - circuit.get_cell(i).size.x_));
+            }
+            else if((circuit.get_cell(oi).attributes & XMovable) != 0){
+                // The cell oi is movable and constrained on the left by the fixed cell i
+                auto A = g.addArc(cell_nodes[oi], fixed);
+                constraint_arcs.push_back(arc_pair(A, -pl.plt_.positions_[i].x_ - circuit.get_cell(i).size.x_));
+            }
+        }
+    }
+
+    
+    for(index_t r=0; r<pl.row_cnt(); ++r){ // The left boundary of each row
+        index_t lc = pl.row_first_cells_[r];
+        if(lc != null_ind and (circuit.get_cell(lc).attributes & XMovable) != 0){
+            auto Al = g.addArc(cell_nodes[lc], fixed);
+            constraint_arcs.push_back(arc_pair(Al, -pl.min_x_));
+        }
+    }
+    for(index_t r=0; r<pl.row_cnt(); ++r){ // And the right boundary of each row
+        index_t rc = pl.row_last_cells_[r];
+        if(rc != null_ind and (circuit.get_cell(rc).attributes & XMovable) != 0){
+            auto Ar = g.addArc(fixed, cell_nodes[rc]);
+            constraint_arcs.push_back(arc_pair(Ar, pl.max_x_ - circuit.get_cell(rc).size.x_));
+        }
+    }
+    
+
+    // And every pin of every net: arcs too
+    for(index_t n=0; n<circuit.net_cnt(); ++n){
+        for(auto p : circuit.get_net(n)){
+            index_t c = p.cell_ind;
+            int_t pin_offs = (pl.plt_.orientations_[c].x_ ? p.offset.x_ : circuit.get_cell(c).size.x_ - p.offset.x_); // Offset to the beginning of the cell
+            if((circuit.get_cell(c).attributes & XMovable) != 0){
+                Arc Al = g.addArc(cell_nodes[c], Lnet_nodes[n]);
+                constraint_arcs.push_back(arc_pair(Al, pin_offs));
+                Arc Ar = g.addArc(Unet_nodes[n], cell_nodes[c]);
+                constraint_arcs.push_back(arc_pair(Ar, -pin_offs));
+            }
+            else{ // Fixed offset
+                auto Al = g.addArc(fixed, Lnet_nodes[n]);
+                constraint_arcs.push_back(arc_pair(Al, pl.plt_.positions_[c].x_ + pin_offs));
+                auto Ar = g.addArc(Unet_nodes[n], fixed);
+                constraint_arcs.push_back(arc_pair(Ar, - pl.plt_.positions_[c].x_ - pin_offs));
+            }
+        }
+    }
+
+    // Then the supplies: the upper bound node of each non-empty net supplies the net's weight, its lower bound node demands it
+    std::vector<node_pair> net_supplies;
+    for(index_t n=0; n<circuit.net_cnt(); ++n){
+        if(circuit.get_net(n).pin_cnt > 0){
+            net_supplies.push_back(node_pair(Unet_nodes[n],  circuit.get_net(n).weight));
+            net_supplies.push_back(node_pair(Lnet_nodes[n], -circuit.get_net(n).weight));
+        }
+    }
+
+    // Create the maps holding the cost of the arcs and the supply of the nodes
+    // No upper (capacity) map is given to the solver: the arcs are left uncapacitated
+    IntArcMap cost(g, 0);
+    IntNodeMap supply(g, 0);
+
+    for(arc_pair A : constraint_arcs){
+        cost[A.first] = A.second;
+    }
+
+    for(node_pair N : net_supplies){
+        supply[N.first] = N.second;
+    }
+
+    // Then we (hope the solver can) solve it
+    NetworkSimplex<SmartDigraph> ns(g);
+    ns.supplyMap(supply).costMap(cost);
+    auto res = ns.run();
+    if(res != ns.OPTIMAL){
+        abort();
+    }
+    
+    // And we get the new positions as the dual values of the current solution (compared to the fixed pin) 
+    for(index_t c=0; c<circuit.cell_cnt(); ++c){ // The cells
+        if((circuit.get_cell(c).attributes & XMovable) != 0){
+            pl.plt_.positions_[c].x_ = ns.potential(cell_nodes[c]) - ns.potential(fixed);
+        }
+    }
+    pl.selfcheck();
+}
+
+} // namespace dp
+} // namespace coloquinte
+
+
--- a/coloquinte/src/cell_swapping.cxx
+++ b/coloquinte/src/cell_swapping.cxx
@ -0,0 +1,185 @@
+
+#include "coloquinte/detailed.hxx"
+#include "coloquinte/circuit_helper.hxx"
+
+#include <functional>
+
+namespace coloquinte{
+namespace dp{
+
+namespace{
+
+// Attempts to exchange the places of the cells c1 and c2 (both single-row, X and Y movable).
+// Each cell is centered in the interval currently available to the other one, and the move is
+// kept only if get_nets_cost reports a strictly lower cost on the nets touching the two cells.
+// When try_flip is set and both cells are XFlippable, the four x orientation pairs are tried.
+// Returns true if the swap was kept; otherwise positions and orientations are restored.
+inline bool try_swap(netlist const & circuit, detailed_placement & pl, index_t c1, index_t c2, bool try_flip,
+std::function<std::int64_t(netlist const &, detailed_placement const &, std::vector<index_t> const &)> get_nets_cost){
+    assert(pl.cell_height(c1) == 1 and pl.cell_height(c2) == 1);
+    assert( (circuit.get_cell(c1).attributes & XMovable) != 0 and (circuit.get_cell(c1).attributes & YMovable) != 0);
+    assert( (circuit.get_cell(c2).attributes & XMovable) != 0 and (circuit.get_cell(c2).attributes & YMovable) != 0);
+
+    auto const bounds1 = pl.get_limit_positions(circuit, c1);
+    auto const bounds2 = pl.get_limit_positions(circuit, c2);
+
+    // Range of positions each cell may take once moved to the other cell's place
+    int_t const lo1 = bounds2.first;
+    int_t const lo2 = bounds1.first;
+    int_t const hi1 = bounds2.second - circuit.get_cell(c1).size.x_;
+    int_t const hi2 = bounds1.second - circuit.get_cell(c2).size.x_;
+
+    // We just cannot swap those two cells without pushing anything
+    if(hi1 < lo1 or hi2 < lo2) return false;
+
+    // Every net with a pin on one of the two cells, listed once
+    std::vector<index_t> nets;
+    for(netlist::pin_t pin : circuit.get_cell(c1)){
+        nets.push_back(pin.net_ind);
+    }
+    for(netlist::pin_t pin : circuit.get_cell(c2)){
+        nets.push_back(pin.net_ind);
+    }
+    std::sort(nets.begin(), nets.end());
+    nets.erase(std::unique(nets.begin(), nets.end()), nets.end());
+
+    // Cost to beat: the one of the current placement
+    std::int64_t best_cost = get_nets_cost(circuit, pl, nets);
+
+    // Snapshot used to undo a rejected move
+    point<int_t> const saved_pos1 = pl.plt_.positions_[c1];
+    point<int_t> const saved_pos2 = pl.plt_.positions_[c2];
+    point<bool> const saved_orient1 = pl.plt_.orientations_[c1];
+    point<bool> const saved_orient2 = pl.plt_.orientations_[c2];
+    auto const undo = [&](){
+        pl.plt_.positions_[c1] = saved_pos1;
+        pl.plt_.positions_[c2] = saved_pos2;
+        pl.plt_.orientations_[c1] = saved_orient1;
+        pl.plt_.orientations_[c2] = saved_orient2;
+    };
+
+    // Warning: won't work if the two cells don't have the same height
+    pl.plt_.positions_[c1].x_ = (lo1 + hi1) / 2;
+    pl.plt_.positions_[c2].x_ = (lo2 + hi2) / 2;
+    pl.plt_.positions_[c1].y_ = saved_pos2.y_;
+    pl.plt_.positions_[c2].y_ = saved_pos1.y_;
+
+    // For standard cell placement, we want all the rows to be aligned in the same way
+    bool const both_y_flippable = (circuit.get_cell(c1).attributes & YFlippable) != 0
+                              and (circuit.get_cell(c2).attributes & YFlippable) != 0;
+    if(both_y_flippable){
+        std::swap(pl.plt_.orientations_[c1].y_, pl.plt_.orientations_[c2].y_);
+    }
+
+    bool const explore_flips = try_flip
+                           and (circuit.get_cell(c1).attributes & XFlippable) != 0
+                           and (circuit.get_cell(c2).attributes & XFlippable) != 0;
+
+    if(not explore_flips){
+        // Single candidate: the swapped positions with the current x orientations
+        if(get_nets_cost(circuit, pl, nets) < best_cost){
+            pl.swap_standard_cell_topologies(c1, c2);
+            return true;
+        }
+        // Reset the old values since we didn't swap anything
+        undo();
+        return false;
+    }
+
+    // Four candidates: every combination of x orientations for the two cells
+    index_t const nb_candidates = 4;
+    index_t chosen = nb_candidates;
+    for(index_t k=0; k<nb_candidates; ++k){
+        pl.plt_.orientations_[c1].x_ = k % 2;
+        pl.plt_.orientations_[c2].x_ = k / 2;
+        std::int64_t const candidate_cost = get_nets_cost(circuit, pl, nets);
+        if(candidate_cost < best_cost){
+            best_cost = candidate_cost;
+            chosen = k;
+        }
+    }
+
+    if(chosen == nb_candidates){
+        // None of the orientations improved on the initial placement
+        undo();
+        return false;
+    }
+
+    // One of the orientations with the new positions was better: we keep the swap
+    pl.swap_standard_cell_topologies(c1, c2);
+    pl.plt_.orientations_[c1].x_ = chosen % 2;
+    pl.plt_.orientations_[c2].x_ = chosen / 2;
+    return true;
+
+    // A better solution would be
+    // Check the cost on y depending on the position (extremely simple: two positions for each cell)
+    // Check the cost on x depending on the position: piecewise linear and relatively complex
+    //      * Get all external pins
+    //      * Get all nets involving only one of the cells: piecewise linear cost for each of them
+    //      * For nets involving the two cells, we have an additional cost
+}
+
+// Tries to swap every X-movable standard cell of a row with X-movable standard cells of the following rows.
+//
+// row_extent:    number of rows after the current one that are examined.
+// cell_extent:   bounds the window of candidate cells examined on the other row, on both sides of the cell.
+// try_flip:      forwarded to try_swap.
+// get_nets_cost: cost of a set of nets for a given placement, forwarded to try_swap.
+inline void generic_swaps_global(netlist const & circuit, detailed_placement & pl, index_t row_extent, index_t cell_extent, bool try_flip,
+std::function<std::int64_t(netlist const &, detailed_placement const &, std::vector<index_t> const &)> get_nets_cost){
+    for(index_t main_row = 0; main_row < pl.row_cnt(); ++main_row){
+
+        for(index_t other_row = main_row+1; other_row <= std::min(pl.row_cnt()-1, main_row+row_extent) ; ++other_row){
+
+            index_t first_oc = pl.get_first_standard_cell_on_row(other_row); // The first candidate cell to be examined
+            for(index_t c = pl.get_first_standard_cell_on_row(main_row); c != null_ind; c = pl.get_next_standard_cell_on_row(c, main_row)){
+                assert(pl.cell_rows_[c] == main_row);
+                if( (circuit.get_cell(c).attributes & XMovable) == 0) continue; // Don't touch fixed cells
+
+                // Number of cells after/before the end of the cell
+                index_t nb_after  = 0;
+                index_t nb_before = 0;
+                int_t pos_low = pl.plt_.positions_[c].x_ -   circuit.get_cell(c).size.x_,
+                      pos_hgh = pl.plt_.positions_[c].x_ + 2*circuit.get_cell(c).size.x_;
+                // The scan stops once more than cell_extent candidates lie after the cell:
+                // nb_after counts cells, so it is bounded by cell_extent (like nb_before below) and not by the row count
+                for(index_t oc=first_oc; oc != null_ind and nb_after <= cell_extent; oc = pl.get_next_standard_cell_on_row(oc, other_row)){
+                    assert(pl.cell_rows_[oc] == other_row);
+                    if( (circuit.get_cell(oc).attributes & XMovable) == 0) continue; // Don't touch fixed cells
+
+                    // Count the cells which should trigger stop or shouldn't be used at the next iteration
+                    if(pl.plt_.positions_[oc].x_ >= pos_hgh) ++nb_after;
+                    if(pl.plt_.positions_[oc].x_ + circuit.get_cell(oc).size.x_ <= pos_low) ++ nb_before;
+
+                    if(try_swap(circuit, pl, c, oc, try_flip, get_nets_cost)){
+                        // The cells changed rows: keep c on main_row and oc on other_row
+                        std::swap(c, oc);
+                        if(c == first_oc) first_oc = oc;
+                    }
+                }
+                // Drop the candidates that are too far before the cell for the next iteration
+                while(nb_before > cell_extent){
+                    nb_before--;
+                    first_oc = pl.get_next_standard_cell_on_row(first_oc, other_row);
+                }
+            }
+        }
+    }
+    pl.selfcheck();
+}
+
+} // End anonymous namespace
+
+// Global swap pass where the cost of the involved nets is the sum of their HPWL lengths
+// (nets with at most one pin are ignored).
+void swaps_global_HPWL(netlist const & circuit, detailed_placement & pl, index_t row_extent, index_t cell_extent, bool try_flip){
+    auto const hpwl_cost = [](netlist const & nl, detailed_placement const & placement, std::vector<index_t> const & nets) -> std::int64_t{
+        std::int64_t total = 0;
+        for(index_t net : nets){
+            if(nl.get_net(net).pin_cnt > 1){
+                total += get_HPWL_length(nl, placement.plt_, net);
+            }
+        }
+        return total;
+    };
+    generic_swaps_global(circuit, pl, row_extent, cell_extent, try_flip, hpwl_cost);
+}
+
+// Global swap pass where the cost of the involved nets is the sum of their RSMT lengths
+// (nets with at most one pin are ignored).
+void swaps_global_RSMT(netlist const & circuit, detailed_placement & pl, index_t row_extent, index_t cell_extent, bool try_flip){
+    auto const rsmt_cost = [](netlist const & nl, detailed_placement const & placement, std::vector<index_t> const & nets) -> std::int64_t{
+        std::int64_t total = 0;
+        for(index_t net : nets){
+            if(nl.get_net(net).pin_cnt > 1){
+                total += get_RSMT_length(nl, placement.plt_, net);
+            }
+        }
+        return total;
+    };
+    generic_swaps_global(circuit, pl, row_extent, cell_extent, try_flip, rsmt_cost);
+}
+
+} // namespace dp
+} // namespace coloquinte
+
+
+
+
--- a/coloquinte/src/checkers.cxx
+++ b/coloquinte/src/checkers.cxx
@ -0,0 +1,96 @@
+
+#include "coloquinte/circuit.hxx"
+
+#include <map>
+
+namespace coloquinte{
+
+// Asserts that the internal arrays of the netlist have consistent sizes
+// and that every pin offset is finite.
+void netlist::selfcheck() const{
+    // Per-cell arrays (the limits array has one extra entry)
+    index_t const nb_cells = cell_areas_.size();
+    assert(nb_cells+1 == cell_limits_.size());
+    assert(nb_cells == cell_sizes_.size());
+    assert(nb_cells == cell_attributes_.size());
+    assert(nb_cells == cell_internal_mapping_.size());
+
+    // Per-net arrays (the limits array has one extra entry)
+    index_t const nb_nets = net_weights_.size();
+    assert(nb_nets+1 == net_limits_.size());
+    assert(nb_nets == net_internal_mapping_.size());
+
+    // Per-pin arrays
+    index_t const nb_pins = pin_offsets_.size();
+    assert(nb_pins == cell_indexes_.size());
+    assert(nb_pins == pin_indexes_.size());
+    assert(nb_pins == net_indexes_.size());
+
+    for(auto const & offset : pin_offsets_){
+        assert(std::isfinite(offset.x_) and std::isfinite(offset.y_));
+    }
+}
+
+// Kept for compatibility reasons: this check does nothing
+void placement_t::selfcheck() const{
+}
+
+// Verifies, through assertions, that a placement is legal:
+//   * every cell that is X or Y movable lies within surface (fixed cells are not checked),
+//   * no two cells with a non-empty box overlap.
+// All the checks are assert-based, so nothing is reported when assertions are compiled out.
+void verify_placement_legality(netlist const & circuit, placement_t const & pl, box<int_t> surface){
+    std::vector<box<int_t> > cells;
+    cells.reserve(circuit.cell_cnt());
+    for(index_t i=0; i<circuit.cell_cnt(); ++i){
+        auto S = circuit.get_cell(i).size;
+        cells.push_back(box<int_t>(pl.positions_[i], pl.positions_[i] + S));
+
+        // Verify that they are within the placement surface; doesn't take fixed macros into account
+        if( (circuit.get_cell(i).attributes & XMovable) != 0 or (circuit.get_cell(i).attributes & YMovable) != 0){
+            assert(cells[i].in(surface));
+        }
+    }
+
+    // Simple sweepline algorithm to verify that there is no overlap
+    struct event{
+        int_t x_min, x_max, y;
+        index_t cell;
+        bool removal;
+        bool operator<(event const & o) const{
+            return y < o.y
+            or (y == o.y and removal and not o.removal); // Remove before inserting
+        }
+    };
+
+    // One insertion event at the bottom and one removal event at the top of each non-empty cell
+    std::vector<event> all_events;
+    for(index_t i=0; i<circuit.cell_cnt(); ++i){
+        event b, e;
+        b.cell = i; e.cell = i;
+        b.x_min = cells[i].x_min_; e.x_min = cells[i].x_min_;
+        b.x_max = cells[i].x_max_; e.x_max = cells[i].x_max_;
+        b.y = cells[i].y_min_; b.removal = false;
+        e.y = cells[i].y_max_; e.removal = true;
+        if(b.x_max > b.x_min and e.y != b.y){
+            all_events.push_back(b);
+            all_events.push_back(e);
+        }
+    }
+
+    std::sort(all_events.begin(), all_events.end());
+
+    // Indexed by beginning of interval, with end of interval and cell within
+    std::map<int_t, std::pair<int_t, index_t> > active_rectangles;
+
+    for(event E : all_events){
+        if(E.removal){
+            auto it = active_rectangles.find(E.x_min);
+            assert(it != active_rectangles.end());
+            // Without assertions, an overlapping rectangle with the same x_min was never inserted
+            // and its removal finds nothing: erasing end() would be undefined behaviour
+            if(it != active_rectangles.end()){
+                active_rectangles.erase(it);
+            }
+        }
+        else{ // Find anything that intersects with E; if not, add it
+            auto it = active_rectangles.lower_bound(E.x_min); // First interval after
+            if(it != active_rectangles.end()){
+                assert(it->first >= E.x_max); //Intersection between E.cell and it->second.second
+            }
+            if(it != active_rectangles.begin()){
+                --it;
+                assert(it->second.first <= E.x_min); //Intersection between E.cell and it->second.second
+            }
+            active_rectangles.insert(std::pair<int_t, std::pair<int_t, index_t> >(E.x_min, std::pair<int_t, index_t>(E.x_max, E.cell)));
+        }
+    }
+
+}
+
+} // namespace coloquinte
--- a/coloquinte/src/circuit.cxx
+++ b/coloquinte/src/circuit.cxx
@ -0,0 +1,407 @@
+
+#include "coloquinte/circuit_helper.hxx"
+#include "coloquinte/circuit.hxx"
+
+namespace coloquinte{
+
+// HPWL of a single net: difference between the positions of the extreme pins on x,
+// plus the same difference on y. Nets with at most one pin have a length of 0.
+std::int64_t get_HPWL_length(netlist const & circuit, placement_t const & pl, index_t net_ind){
+    if(circuit.get_net(net_ind).pin_cnt < 2) return 0;
+
+    auto pins = get_pins_1D(circuit, pl, net_ind);
+    auto const x_extremes = std::minmax_element(pins.x_.begin(), pins.x_.end());
+    auto const y_extremes = std::minmax_element(pins.y_.begin(), pins.y_.end());
+    auto const x_span = x_extremes.second->pos - x_extremes.first->pos;
+    auto const y_span = y_extremes.second->pos - y_extremes.first->pos;
+    return x_span + y_span;
+}
+
+// Length of a single net as computed by RSMT_length on the positions of its pins.
+// Nets with at most one pin have a length of 0.
+std::int64_t get_RSMT_length(netlist const & circuit, placement_t const & pl, index_t net_ind){
+    if(circuit.get_net(net_ind).pin_cnt < 2) return 0;
+
+    auto pins = get_pins_2D(circuit, pl, net_ind);
+    std::vector<point<int_t> > locations;
+    for(pin_2D const & pin : pins){
+        locations.push_back(pin.pos);
+    }
+    return RSMT_length(locations, 8);
+}
+
+namespace gp{
+
+// Adds a force between two pins to the linear system L.
+// Two movable pins are linked together; when only one is movable it is linked
+// to the position of the other one; nothing is added when neither is movable.
+void add_force(pin_1D const p1, pin_1D const p2, linear_system & L, float_t force){
+    if(p1.movable and p2.movable){
+        L.add_force(force, p1.cell_ind, p2.cell_ind, p1.offs, p2.offs);
+        return;
+    }
+    if(p1.movable){
+        // p2 acts as a fixed anchor
+        L.add_fixed_force(force, p1.cell_ind, p2.pos, p1.offs);
+        return;
+    }
+    if(p2.movable){
+        // p1 acts as a fixed anchor
+        L.add_fixed_force(force, p2.cell_ind, p1.pos, p2.offs);
+    }
+}
+
+// Adds a force of magnitude scale divided by the distance between the two pins,
+// the distance being bounded from below by tol.
+void add_force(pin_1D const p1, pin_1D const p2, linear_system & L, float_t tol, float_t scale){
+    float_t const distance = static_cast<float_t>(std::abs(p2.pos-p1.pos));
+    add_force(p1, p2, L, scale/std::max(tol, distance));
+}
+
+// Creates the x and y linear systems with one variable per cell and no net force.
+// A cell that is not movable on an axis, or that has no pin on a net with more than one pin,
+// gets a diagonal entry of 1 and its current position as right-hand side on that axis.
+point<linear_system> empty_linear_systems(netlist const & circuit, placement_t const & pl){
+    point<linear_system> systems(linear_system(circuit.cell_cnt()), linear_system(circuit.cell_cnt()));
+
+    // True when the cell has at least one pin on a net with more than one pin
+    auto const has_true_net = [&circuit](index_t cell) -> bool{
+        for(auto pin : circuit.get_cell(cell)){
+            if(circuit.get_net(pin.net_ind).pin_cnt > 1) return true;
+        }
+        return false;
+    };
+
+    for(index_t cell=0; cell<circuit.cell_cnt(); ++cell){
+        bool const unconnected = not has_true_net(cell);
+
+        if(unconnected or (circuit.get_cell(cell).attributes & XMovable) == 0){
+            systems.x_.add_triplet(cell, cell, 1.0f);
+            systems.x_.add_doublet(cell, pl.positions_[cell].x_);
+        }
+        if(unconnected or (circuit.get_cell(cell).attributes & YMovable) == 0){
+            systems.y_.add_triplet(cell, cell, 1.0f);
+            systems.y_.add_doublet(cell, pl.positions_[cell].y_);
+        }
+    }
+
+    return systems;
+}
+
+namespace{ // Anonymous namespace for helper functions
+
+// Net model linking every pin to the minimum and to the maximum pin of the net,
+// each force being scaled by 1/(pin count - 1). Nets with fewer than two pins add nothing.
+void get_HPWLF(std::vector<pin_1D> const & pins, linear_system & L, float_t tol){
+    if(pins.size() < 2) return;
+
+    auto const lowest  = std::min_element(pins.begin(), pins.end());
+    auto const highest = std::max_element(pins.begin(), pins.end());
+    float_t const scale = 1.0f/(pins.size()-1);
+
+    for(auto cur = pins.begin(); cur != pins.end(); ++cur){
+        // Just comparing the iterator is poorer due to redundancies in the benchmarks!
+        if(cur == lowest) continue;
+
+        add_force(*cur, *lowest, L, tol, scale);
+        // Hopefully only one connexion between the min and max pins
+        if(cur != highest){
+            add_force(*cur, *highest, L, tol, scale);
+        }
+    }
+}
+
+// Net model working on the sorted pins: every pin is linked to the pin two places further,
+// and the two extreme pins are also linked to their direct neighbour. All forces are scaled by 0.5.
+void get_HPWLR(std::vector<pin_1D> const & pins, linear_system & L, float_t tol){
+    std::vector<pin_1D> ordered(pins);
+    std::sort(ordered.begin(), ordered.end());
+    std::size_t const count = ordered.size();
+
+    // Links between pins two places away
+    for(index_t far=2; far<count; ++far){
+        add_force(ordered[far-2], ordered[far], L, tol, 0.5f);
+    }
+
+    // The extreme pins are connected with their direct neighbour too
+    if(count >= 2){
+        add_force(ordered[0], ordered[1], L, tol, 0.5f);
+        add_force(ordered[count-1], ordered[count-2], L, tol, 0.5f);
+    }
+}
+
+// Star net model: every pin is linked to an additional movable variable, star_index,
+// with a force of 1/(pin count). The tol parameter is not used by this model.
+void get_star(std::vector<pin_1D> const & pins, linear_system & L, float_t tol, index_t star_index){
+    if(pins.size() >= 2){
+        // The central pin of the star is the same for every pin of the net
+        pin_1D const center = pin_1D(star_index, 0, 0, true);
+        for(pin_1D leaf : pins){
+            add_force(leaf, center, L, 1.0/pins.size());
+        }
+        return;
+    }
+
+    // The net is empty, but we still populate the diagonal to avoid divide by zeros
+    L.add_triplet(star_index, star_index, 1.0f);
+}
+
+// Clique net model: every pair of pins of the net is linked, each force being scaled by 1/(pin count - 1).
+void get_clique(std::vector<pin_1D> const & pins, linear_system & L, float_t tol){
+    std::size_t const count = pins.size();
+    for(index_t first=0; first+1<count; ++first){
+        for(index_t second=first+1; second<count; ++second){
+            add_force(pins[first], pins[second], L, tol, 1.0f/(pins.size()-1));
+        }
+    }
+}
+
+} // End anonymous namespace
+
+// Builds the x and y linear systems of the HPWLF net model.
+// Only the nets whose pin count is in [min_s, max_s) are modelled; tol is forwarded to the model.
+point<linear_system> get_HPWLF_linear_system (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    for(index_t net=0; net<circuit.net_cnt(); ++net){
+        // Skip the nets outside of the requested pin count range
+        index_t const net_size = circuit.get_net(net).pin_cnt;
+        bool const selected = net_size >= min_s and net_size < max_s;
+        if(not selected) continue;
+
+        auto pins = get_pins_1D(circuit, pl, net);
+        get_HPWLF(pins.x_, systems.x_, tol);
+        get_HPWLF(pins.y_, systems.y_, tol);
+    }
+    return systems;
+}
+
+// Builds the x and y linear systems of the HPWLR net model.
+// Only the nets whose pin count is in [min_s, max_s) are modelled; tol is forwarded to the model.
+point<linear_system> get_HPWLR_linear_system (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    for(index_t net=0; net<circuit.net_cnt(); ++net){
+        // Skip the nets outside of the requested pin count range
+        index_t const net_size = circuit.get_net(net).pin_cnt;
+        bool const selected = net_size >= min_s and net_size < max_s;
+        if(not selected) continue;
+
+        auto pins = get_pins_1D(circuit, pl, net);
+        get_HPWLR(pins.x_, systems.x_, tol);
+        get_HPWLR(pins.y_, systems.y_, tol);
+    }
+    return systems;
+}
+
+// Builds the x and y linear systems of the star net model.
+// One extra variable per net is appended after the cell variables; the nets whose pin count is
+// outside of [min_s, max_s) only get a 1 on the diagonal of their extra variable.
+point<linear_system> get_star_linear_system  (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    systems.x_.add_variables(circuit.net_cnt());
+    systems.y_.add_variables(circuit.net_cnt());
+    for(index_t net=0; net<circuit.net_cnt(); ++net){
+        // Index of the star's central pin in the linear system
+        index_t const star = net+circuit.cell_cnt();
+
+        index_t const net_size = circuit.get_net(net).pin_cnt;
+        bool const selected = net_size >= min_s and net_size < max_s;
+        if(selected){
+            auto pins = get_pins_1D(circuit, pl, net);
+            get_star(pins.x_, systems.x_, tol, star);
+            get_star(pins.y_, systems.y_, tol, star);
+        }
+        else{
+            // Put a one in the intermediate variable in order to avoid non-invertible matrices
+            systems.x_.add_triplet(star, star, 1.0f);
+            systems.y_.add_triplet(star, star, 1.0f);
+        }
+    }
+    return systems;
+}
+
+// Builds the x and y linear systems of the clique net model.
+// Only the nets whose pin count is in [min_s, max_s) are modelled; tol is forwarded to the model.
+point<linear_system> get_clique_linear_system (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    for(index_t net=0; net<circuit.net_cnt(); ++net){
+        // Skip the nets outside of the requested pin count range
+        index_t const net_size = circuit.get_net(net).pin_cnt;
+        bool const selected = net_size >= min_s and net_size < max_s;
+        if(not selected) continue;
+
+        auto pins = get_pins_1D(circuit, pl, net);
+        get_clique(pins.x_, systems.x_, tol);
+        get_clique(pins.y_, systems.y_, tol);
+    }
+    return systems;
+}
+
+// Builds the x and y linear systems where each net is modelled by the edges returned by get_MST_topology.
+// Only the nets with more than one pin and a pin count in [min_s, max_s) are modelled;
+// every edge adds a force on both axes.
+point<linear_system> get_MST_linear_system(netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    for(index_t net=0; net<circuit.net_cnt(); ++net){
+        // Skip the nets outside of the requested pin count range and the trivial ones
+        index_t const net_size = circuit.get_net(net).pin_cnt;
+        bool const selected = net_size >= min_s and net_size < max_s and net_size > 1;
+        if(not selected) continue;
+
+        auto pins = get_pins_2D(circuit, pl, net);
+        std::vector<point<int_t> > locations;
+        for(pin_2D const & pin : pins){
+            locations.push_back(pin.pos);
+        }
+
+        auto const tree = get_MST_topology(locations);
+        for(auto edge : tree){
+            add_force(pins[edge.first].x(), pins[edge.second].x(), systems.x_, tol, 1.0f);
+            add_force(pins[edge.first].y(), pins[edge.second].y(), systems.y_, tol, 1.0f);
+        }
+    }
+    return systems;
+}
+
+// Builds the x and y linear systems of the rectilinear Steiner tree net model.
+//
+// For every net with more than one pin and a pin count in [min_s, max_s), the
+// topology is computed on the current pin positions. It provides separate
+// edge lists for the two axes; each edge adds a force on its own axis only.
+point<linear_system> get_RSMT_linear_system(netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s){
+    // Second argument of get_RSMT_topology (declared as exactitude_limit)
+    index_t const exactitude_limit = 8;
+
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    index_t const nbr_nets = circuit.net_cnt();
+
+    for(index_t net=0; net<nbr_nets; ++net){
+        index_t const nbr_pins = circuit.get_net(net).pin_cnt;
+        bool const modelled = nbr_pins > 1 and nbr_pins >= min_s and nbr_pins < max_s;
+        if(not modelled) continue;
+
+        auto pins = get_pins_2D(circuit, pl, net);
+
+        std::vector<point<int_t> > positions;
+        positions.reserve(pins.size());
+        for(pin_2D const & cur : pins){
+            positions.push_back(cur.pos);
+        }
+
+        auto const topology = get_RSMT_topology(positions, exactitude_limit);
+        for(auto const & edge : topology.x_){
+            add_force(pins[edge.first].x(), pins[edge.second].x(), systems.x_, tol, 1.0f);
+        }
+        for(auto const & edge : topology.y_){
+            add_force(pins[edge.first].y(), pins[edge.second].y(), systems.y_, tol, 1.0f);
+        }
+    }
+    return systems;
+}
+
+
+// Total wirelength of the placement: sum of get_HPWL_length over all nets.
+std::int64_t get_HPWL_wirelength(netlist const & circuit, placement_t const & pl){
+    std::int64_t total = 0;
+    index_t const nbr_nets = circuit.net_cnt();
+    for(index_t net = 0; net != nbr_nets; ++net){
+        total += get_HPWL_length(circuit, pl, net);
+    }
+    return total;
+}
+
+// Total wirelength of the placement: sum of MST_length over the pin positions of every net.
+// NOTE(review): the previous comment stated that very small nets (<= 3 pins) are
+// measured with the HPWL instead; no such special case is made here, so it would
+// have to be handled inside MST_length - confirm.
+std::int64_t get_MST_wirelength(netlist const & circuit, placement_t const & pl){
+    std::int64_t total = 0;
+    index_t const nbr_nets = circuit.net_cnt();
+    for(index_t net = 0; net != nbr_nets; ++net){
+        auto pins = get_pins_2D(circuit, pl, net);
+
+        std::vector<point<int_t> > positions;
+        positions.reserve(pins.size());
+        for(pin_2D const & cur : pins){
+            positions.push_back(cur.pos);
+        }
+
+        total += MST_length(positions);
+    }
+    return total;
+}
+
+// Total wirelength of the placement: sum of get_RSMT_length over all nets.
+std::int64_t get_RSMT_wirelength(netlist const & circuit, placement_t const & pl){
+    std::int64_t total = 0;
+    index_t const nbr_nets = circuit.net_cnt();
+    for(index_t net = 0; net != nbr_nets; ++net){
+        total += get_RSMT_length(circuit, pl, net);
+    }
+    return total;
+}
+
+// Solves the x and y systems with solve_CG and writes the result in the placement.
+//
+// The current cell positions are given to the solver as the initial guess.
+// Only the coordinates of cells movable along the corresponding axis are
+// updated; the solution is converted to int_t with a static_cast (truncation).
+// The two systems are solved in two OpenMP sections when OpenMP is enabled.
+void solve_linear_system(netlist const & circuit, placement_t & pl, point<linear_system> & L, index_t nbr_iter){
+    index_t const nbr_cells = pl.cell_cnt();
+
+    std::vector<float_t> x_start(nbr_cells), y_start(nbr_cells);
+    std::vector<float_t> x_result, y_result;
+
+    assert(L.x_.internal_size() == x_start.size());
+    assert(L.y_.internal_size() == y_start.size());
+
+    for(index_t c=0; c<nbr_cells; ++c){
+        point<int_t> const & current = pl.positions_[c];
+        x_start[c] = static_cast<float_t>(current.x_);
+        y_start[c] = static_cast<float_t>(current.y_);
+    }
+
+    // The two axes are independent: one section each
+    #pragma omp parallel sections num_threads(2)
+    {
+    #pragma omp section
+    x_result = L.x_.solve_CG(x_start, nbr_iter);
+    #pragma omp section
+    y_result = L.y_.solve_CG(y_start, nbr_iter);
+    }
+
+    for(index_t c=0; c<nbr_cells; ++c){
+        auto const attributes = circuit.get_cell(c).attributes;
+        bool const x_movable = (attributes & XMovable) != 0;
+        bool const y_movable = (attributes & YMovable) != 0;
+
+        if(x_movable){
+            assert(std::isfinite(x_result[c]));
+            pl.positions_[c].x_ = static_cast<int_t>(x_result[c]);
+        }
+        if(y_movable){
+            assert(std::isfinite(y_result[c]));
+            pl.positions_[c].y_ = static_cast<int_t>(y_result[c]);
+        }
+    }
+}
+
+// Intended to be used by pulling forces to adapt the forces to the cell's areas
+//
+// Returns, for every cell, its area divided by the average cell area, so that
+// the returned scales have a mean of one.
+// When the total area is not positive there is no meaningful average (the
+// division would yield inf/NaN scales that end up in the anchor weights); every
+// cell then gets the neutral scale 1.
+std::vector<float_t> get_area_scales(netlist const & circuit){
+    index_t const nbr_cells = circuit.cell_cnt();
+    std::vector<float_t> ret(nbr_cells);
+    capacity_t int_tot_area = 0;
+    for(index_t i=0; i<nbr_cells; ++i){
+        capacity_t A = circuit.get_cell(i).area;
+        ret[i] = static_cast<float_t>(A);
+        int_tot_area += A;
+    }
+
+    if(int_tot_area <= 0){
+        // Also covers the empty circuit, for which ret is empty anyway
+        ret.assign(nbr_cells, 1.0f);
+        return ret;
+    }
+
+    float_t inv_average_area = nbr_cells / static_cast<float_t>(int_tot_area);
+    for(index_t i=0; i<nbr_cells; ++i){
+        ret[i] *= inv_average_area;
+    }
+    return ret;
+}
+
+// Builds linear systems that anchor every cell to its position in pl.
+// The anchor strength of a cell is (1 / typical_distance) times its area scale
+// (see get_area_scales).
+point<linear_system> get_pulling_forces (netlist const & circuit, placement_t const & pl, float_t typical_distance){
+    point<linear_system> systems = empty_linear_systems(circuit, pl);
+    float_t const base_force = 1.0f / typical_distance;
+    std::vector<float_t> const area_scales = get_area_scales(circuit);
+
+    index_t const nbr_cells = pl.cell_cnt();
+    for(index_t c=0; c<nbr_cells; ++c){
+        float_t const strength = base_force * area_scales[c];
+        point<int_t> const & target = pl.positions_[c];
+
+        systems.x_.add_anchor(strength, c, target.x_);
+        systems.y_.add_anchor(strength, c, target.y_);
+    }
+
+    return systems;
+}
+
+// Builds linear systems that anchor every cell to its position in UB_pl.
+//
+// On each axis the anchor strength is
+//     force * area_scale / max(|UB position - LB position|, min_distance)
+// so that cells which are further from their target are pulled less strongly;
+// min_distance bounds the strength for cells that are already close.
+point<linear_system> get_linear_pulling_forces (netlist const & circuit, placement_t const & UB_pl, placement_t const & LB_pl, float_t force, float_t min_distance){
+    point<linear_system> systems = empty_linear_systems(circuit, UB_pl);
+    assert(LB_pl.cell_cnt() == UB_pl.cell_cnt());
+    std::vector<float_t> const area_scales = get_area_scales(circuit);
+
+    // Anchor strength for cell c given its two coordinates on one axis
+    auto const strength = [&](index_t c, int_t upper, int_t lower) -> float_t {
+        float_t const gap = static_cast<float_t>(std::abs(upper - lower));
+        return force * area_scales[c] / (std::max(gap, min_distance));
+    };
+
+    index_t const nbr_cells = LB_pl.cell_cnt();
+    for(index_t c=0; c<nbr_cells; ++c){
+        point<int_t> const & target = UB_pl.positions_[c];
+        point<int_t> const & origin = LB_pl.positions_[c];
+
+        systems.x_.add_anchor(strength(c, target.x_, origin.x_), c, target.x_);
+        systems.y_.add_anchor(strength(c, target.y_, origin.y_), c, target.y_);
+    }
+
+    return systems;
+}
+
+// Creates the rough legalizer of a placement on the given surface.
+// Simply forwards to region_distribution::uniform_density_distribution.
+region_distribution get_rough_legalizer(netlist const & circuit, placement_t const & pl, box<int_t> surface){
+    return region_distribution::uniform_density_distribution(surface, circuit, pl);
+}
+
+// Writes the positions exported by the rough legalizer into the placement.
+// Half of the cell's size is subtracted from each exported position before it
+// is stored (presumably the legalizer exports cell centers while the placement
+// stores a corner - confirm against region_distribution).
+void get_rough_legalization(netlist const & circuit, placement_t & pl, region_distribution const & legalizer){
+    auto spread_cells = legalizer.export_spread_positions_linear();
+    for(auto const & spread : spread_cells){
+        auto const cell_ind = spread.index_in_placement_;
+        point<float_t> const half_size = 0.5f * static_cast<point<float_t> >(circuit.get_cell(cell_ind).size);
+        pl.positions_[cell_ind] = static_cast<point<int_t> >(spread.pos_ - half_size);
+    }
+}
+
+// Area-weighted mean of the Manhattan distance between the positions of each
+// cell in the two placements.
+// Cells that are not movable along an axis are expected to have the same
+// coordinate on this axis in both placements (checked by assertion).
+float_t get_mean_linear_disruption(netlist const & circuit, placement_t const & LB_pl, placement_t const & UB_pl){
+    float_t weighted_distance = 0.0;
+    float_t area_sum = 0.0;
+
+    index_t const nbr_cells = circuit.cell_cnt();
+    for(index_t c=0; c<nbr_cells; ++c){
+        auto const cell = circuit.get_cell(c);
+        point<int_t> const delta = LB_pl.positions_[c] - UB_pl.positions_[c];
+
+        assert((cell.attributes & XMovable) != 0 or delta.x_ == 0);
+        assert((cell.attributes & YMovable) != 0 or delta.y_ == 0);
+
+        float_t const cell_area = static_cast<float_t>(cell.area);
+        weighted_distance += cell_area * (std::abs(delta.x_) + std::abs(delta.y_));
+        area_sum += cell_area;
+    }
+    return weighted_distance / area_sum;
+}
+
+// Square root of the area-weighted mean of the squared Manhattan distance
+// between the positions of each cell in the two placements.
+// Cells that are not movable along an axis are expected to have the same
+// coordinate on this axis in both placements (checked by assertion).
+float_t get_mean_quadratic_disruption(netlist const & circuit, placement_t const & LB_pl, placement_t const & UB_pl){
+    float_t weighted_squares = 0.0;
+    float_t area_sum = 0.0;
+
+    index_t const nbr_cells = circuit.cell_cnt();
+    for(index_t c=0; c<nbr_cells; ++c){
+        auto const cell = circuit.get_cell(c);
+        point<int_t> const delta = LB_pl.positions_[c] - UB_pl.positions_[c];
+
+        assert((cell.attributes & XMovable) != 0 or delta.x_ == 0);
+        assert((cell.attributes & YMovable) != 0 or delta.y_ == 0);
+
+        float_t const cell_area = static_cast<float_t>(cell.area);
+        float_t const distance = (std::abs(delta.x_) + std::abs(delta.y_));
+        weighted_squares += cell_area * distance * distance;
+        area_sum += cell_area;
+    }
+    return std::sqrt(weighted_squares / area_sum);
+}
+
+} // namespace gp
+} // namespace coloquinte
+
+
--- a/coloquinte/src/coloquinte/circuit.hxx
+++ b/coloquinte/src/coloquinte/circuit.hxx
@ -0,0 +1,59 @@
+
+#ifndef COLOQUINTE_GP_CIRCUIT
+#define COLOQUINTE_GP_CIRCUIT
+
+#include "common.hxx"
+#include "solvers.hxx"
+#include "netlist.hxx"
+#include "rough_legalizers.hxx"
+
+#include <vector>
+#include <cassert>
+
+namespace coloquinte{
+
+// Defined elsewhere; NOTE(review): presumably checks that the placement is legal within surface - confirm
+void verify_placement_legality(netlist const & circuit, placement_t const & pl, box<int_t> surface);
+
+// Global placement: construction and resolution of the linear systems, cost evaluation
+namespace gp{
+
+// Starting point of every system built below (one system per axis)
+point<linear_system> empty_linear_systems(netlist const & circuit, placement_t const & pl);
+
+// Net models stuff
+// Each function returns one linear system per axis, built from the current placement.
+// The star, clique, MST and RSMT models only handle the nets whose pin count is in
+// [min_s, max_s); the MST and RSMT models additionally skip nets with at most one pin.
+// The star model adds one variable per net to the systems.
+point<linear_system> get_HPWLF_linear_system  (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+point<linear_system> get_HPWLR_linear_system  (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+point<linear_system> get_star_linear_system   (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+point<linear_system> get_clique_linear_system (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+point<linear_system> get_MST_linear_system    (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+point<linear_system> get_RSMT_linear_system   (netlist const & circuit, placement_t const & pl, float_t tol, index_t min_s, index_t max_s);
+
+// Additional forces
+// Anchors pulling every cell towards its position in pl (resp. UB_pl), scaled by the cell areas.
+// For the linear version the strength is force * area_scale / max(|UB - LB|, min_distance) on each axis.
+point<linear_system> get_pulling_forces (netlist const & circuit, placement_t const & pl, float_t typical_distance);
+point<linear_system> get_linear_pulling_forces (netlist const & circuit, placement_t const & UB_pl, placement_t const & LB_pl, float_t force, float_t min_distance);
+
+// Solve the final linear system
+// Uses the positions in pl as the initial guess and only updates the coordinates of movable cells
+void solve_linear_system(netlist const & circuit, placement_t & pl, point<linear_system> & L, index_t nbr_iter);
+
+// Cost-related stuff, whether wirelength or disruption
+// Wirelengths are summed over all the nets
+std::int64_t get_HPWL_wirelength (netlist const & circuit, placement_t const & pl);
+std::int64_t get_MST_wirelength  (netlist const & circuit, placement_t const & pl);
+std::int64_t get_RSMT_wirelength (netlist const & circuit, placement_t const & pl);
+
+// Area-weighted mean (resp. root mean square) of the Manhattan distance between the two placements
+float_t get_mean_linear_disruption(netlist const & circuit, placement_t const & LB_pl, placement_t const & UB_pl);
+float_t get_mean_quadratic_disruption(netlist const & circuit, placement_t const & LB_pl, placement_t const & UB_pl);
+
+// Legalizer-related stuff
+// get_rough_legalizer builds the region_distribution; get_rough_legalization writes its exported positions in pl
+region_distribution get_rough_legalizer(netlist const & circuit, placement_t const & pl, box<int_t> surface);
+void get_rough_legalization(netlist const & circuit, placement_t & pl, region_distribution const & legalizer);
+
+// Cell orientation optimization
+// Defined elsewhere; they modify the placement in place
+void optimize_x_orientations(netlist const & circuit, placement_t & pl);
+void optimize_y_orientations(netlist const & circuit, placement_t & pl);
+void optimize_exact_orientations(netlist const & circuit, placement_t & pl);
+//void spread_orientations(netlist const & circuit, placement_t & pl);
+
+
+} // namespace gp
+} // namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/circuit_helper.hxx
+++ b/coloquinte/src/coloquinte/circuit_helper.hxx
@ -0,0 +1,90 @@
+
+#ifndef COLOQUINTE_GP_HELPERCIRCUIT
+#define COLOQUINTE_GP_HELPERCIRCUIT
+
+#include "common.hxx"
+#include "netlist.hxx"
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+#include <vector>
+
+namespace coloquinte{
+
+// A pin of a net seen along a single axis
+struct pin_1D{
+    index_t cell_ind; // Cell the pin belongs to
+    int_t   pos;      // Coordinate of the pin: cell position plus offs
+    int_t   offs;     // Offset of the pin relative to the cell position
+    bool    movable;  // Whether the cell can move along this axis
+
+    pin_1D(index_t c, int_t p, int_t o, bool m)
+        : cell_ind(c)
+        , pos(p)
+        , offs(o)
+        , movable(m)
+    {}
+
+    // Pins are ordered by coordinate only
+    bool operator<(pin_1D const o) const{
+        return pos < o.pos;
+    }
+};
+// A pin of a net with its two coordinates
+struct pin_2D{
+    index_t      cell_ind; // Cell the pin belongs to
+    point<int_t> pos;      // Position of the pin: cell position plus offs
+    point<int_t> offs;     // Offset of the pin relative to the cell position
+    bool         movable;  // Single flag shared by the two projections below
+
+    pin_2D(index_t c, point<int_t> p, point<int_t> o, bool m)
+        : cell_ind(c)
+        , pos(p)
+        , offs(o)
+        , movable(m)
+    {}
+
+    // Projections of the pin on each axis
+    pin_1D x() const{
+        return pin_1D(cell_ind, pos.x_, offs.x_, movable);
+    }
+    pin_1D y() const{
+        return pin_1D(cell_ind, pos.y_, offs.y_, movable);
+    }
+};
+
+// Manhattan distance between the positions of two pins
+inline int_t dist(pin_2D const a, pin_2D const b){
+    int_t const dx = a.pos.x_ - b.pos.x_;
+    int_t const dy = a.pos.y_ - b.pos.y_;
+    return std::abs(dx) + std::abs(dy);
+}
+
+// Returns the pins of net net_ind with their positions in the placement pl.
+//
+// On each axis the pin offset is used as is when the cell's orientation flag is
+// set, and mirrored with respect to the cell size (size - offset) otherwise.
+// A pin is reported movable only if its cell is movable along both axes.
+//
+// The positions, sizes and offsets are integers and the orientations booleans:
+// finiteness checks on them are always true, so none is made here.
+inline std::vector<pin_2D>         get_pins_2D(netlist const & circuit, placement_t const & pl, index_t net_ind){
+    std::vector<pin_2D> ret;
+    auto net = circuit.get_net(net_ind);
+    ret.reserve(net.pin_cnt);
+    for(auto p : net){
+        auto const cell = circuit.get_cell(p.cell_ind);
+        point<bool> const & orient = pl.orientations_[p.cell_ind];
+
+        point<int_t> offs;
+            offs.x_ = orient.x_ ? p.offset.x_ : cell.size.x_ - p.offset.x_;
+            offs.y_ = orient.y_ ? p.offset.y_ : cell.size.y_ - p.offset.y_;
+        point<int_t> pos  = offs + pl.positions_[p.cell_ind];
+
+        bool movable = (cell.attributes & XMovable) != 0 and (cell.attributes & YMovable) != 0;
+        ret.push_back(pin_2D(p.cell_ind, pos, offs, movable));
+    }
+    return ret;
+}
+
+// Returns the pins of net net_ind with their positions in the placement pl,
+// as two separate lists: one for the x axis and one for the y axis.
+//
+// On each axis the pin offset is used as is when the cell's orientation flag is
+// set, and mirrored with respect to the cell size (size - offset) otherwise.
+// The movable flag of each pin only reflects the movability along its own axis.
+//
+// The positions, sizes and offsets are integers and the orientations booleans:
+// finiteness checks on them are always true, so none is made here.
+inline point<std::vector<pin_1D> > get_pins_1D(netlist const & circuit, placement_t const & pl, index_t net_ind){
+    point<std::vector<pin_1D> > ret;
+    auto net = circuit.get_net(net_ind);
+    ret.x_.reserve(net.pin_cnt);
+    ret.y_.reserve(net.pin_cnt);
+    for(auto p : net){
+        auto const cell = circuit.get_cell(p.cell_ind);
+        point<bool> const & orient = pl.orientations_[p.cell_ind];
+
+        point<int_t> offs;
+            offs.x_ = orient.x_ ? p.offset.x_ : cell.size.x_ - p.offset.x_;
+            offs.y_ = orient.y_ ? p.offset.y_ : cell.size.y_ - p.offset.y_;
+        point<int_t> pos  = offs + pl.positions_[p.cell_ind];
+
+        bool x_movable = (cell.attributes & XMovable) != 0;
+        bool y_movable = (cell.attributes & YMovable) != 0;
+
+        ret.x_.push_back(pin_1D(p.cell_ind, pos.x_, offs.x_, x_movable));
+        ret.y_.push_back(pin_1D(p.cell_ind, pos.y_, offs.y_, y_movable));
+    }
+    return ret;
+}
+
+// Lengths of a single net, either from explicit pin positions or from a net of a placed circuit
+// (defined elsewhere)
+std::int64_t MST_length(std::vector<point<int_t> > const & pins);
+std::int64_t RSMT_length(std::vector<point<int_t> > const & pins, index_t exactitude_limit);
+std::int64_t get_HPWL_length(netlist const & circuit, placement_t const & pl, index_t net_ind);
+std::int64_t get_RSMT_length(netlist const & circuit, placement_t const & pl, index_t net_ind);
+
+// Tree topologies: each edge is a pair of indexes in the pins vector.
+// The RSMT version returns separate edge lists for the x and y axes.
+std::vector<std::pair<index_t, index_t> > get_MST_topology(std::vector<point<int_t> > const & pins);
+point<std::vector<std::pair<index_t, index_t> > > get_RSMT_topology(std::vector<point<int_t> > const & pins, index_t exactitude_limit);
+
+} // namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/common.hxx
+++ b/coloquinte/src/coloquinte/common.hxx
@ -0,0 +1,110 @@
+
+#ifndef COLOQUINTE_GP_COMMON
+#define COLOQUINTE_GP_COMMON
+
+#include <cstdint>
+#include <algorithm>
+
+namespace coloquinte{
+
+// Basic scalar types used throughout the library
+using float_t    = float;          // Floating point values
+using int_t      = std::int32_t;   // Coordinates and sizes
+using index_t    = std::uint32_t;  // Indexes and counts
+using capacity_t = std::int64_t;   // Areas (wide enough for the product of two int_t)
+using mask_t     = std::uint32_t;  // Bit masks of cell attributes
+
+using ext_object = std::uint64_t;
+
+enum PlacementType{
+    Optimist  = 0,
+    Pessimist = 1
+};
+
+// Bit flags combined in a mask_t; they are tested with a bitwise and
+enum Movability{
+    XMovable   = 1     ,
+    YMovable   = 1 << 1,
+    XFlippable = 1 << 2,
+    YFlippable = 1 << 3,
+    SoftMacro  = 1 << 4
+};
+
+// A pair of values, one per axis.
+template<typename T>
+struct point{
+    T x_, y_;
+
+    // Value-initializes both components (zero for arithmetic types), so that a
+    // default-constructed point never holds indeterminate values
+    point() : x_(), y_() {}
+    point(T x, T y): x_(x), y_(y){}
+
+    // Component-wise conversion to a point of another type
+    template<typename S>
+    operator point<S>() const{
+        return point<S>(static_cast<S>(x_), static_cast<S>(y_));
+    }
+
+    // Component-wise addition; returns the modified point to allow chaining
+    point<T> & operator+=(point<T> const o){
+        x_ += o.x_;
+        y_ += o.y_;
+        return *this;
+    }
+};
+
+// Component-wise sum
+template<typename T>
+point<T> operator+(point<T> const a, point<T> const b){
+    T const x = a.x_ + b.x_;
+    T const y = a.y_ + b.y_;
+    return point<T>(x, y);
+}
+// Component-wise difference
+template<typename T>
+point<T> operator-(point<T> const a, point<T> const b){
+    T const x = a.x_ - b.x_;
+    T const y = a.y_ - b.y_;
+    return point<T>(x, y);
+}
+// Multiplication of both components by a scalar
+template<typename T>
+point<T> operator*(T lambda, point<T> const p){
+    T const x = lambda * p.x_;
+    T const y = lambda * p.y_;
+    return point<T>(x, y);
+}
+// Component-wise product
+template<typename T>
+point<T> operator*(point<T> const a, point<T> const b){
+    T const x = a.x_ * b.x_;
+    T const y = a.y_ * b.y_;
+    return point<T>(x, y);
+}
+
+// An axis-aligned rectangle given by its bounds on each axis.
+template<typename T>
+struct box{
+    T x_min_, x_max_, y_min_, y_max_;
+
+    // Value-initializes the bounds (zero for arithmetic types), so that a
+    // default-constructed box never holds indeterminate values
+    box() : x_min_(), x_max_(), y_min_(), y_max_() {}
+    box(T x_mn, T x_mx, T y_mn, T y_mx) : x_min_(x_mn), x_max_(x_mx), y_min_(y_mn), y_max_(y_mx){}
+    box(point<T> mn, point<T> mx) : x_min_(mn.x_), x_max_(mx.x_), y_min_(mn.y_), y_max_(mx.y_){}
+
+    // True if this box is included in o; touching bounds are accepted
+    bool in(box<T> const o) const{
+        return x_max_   <= o.x_max_
+            && y_max_   <= o.y_max_
+            && x_min_   >= o.x_min_
+            && y_min_   >= o.y_min_;
+    }
+    // True if the boxes overlap on both axes; boxes that only touch do not intersect
+    bool intersects(box<T> const o) const{
+        return x_min_   < o.x_max_
+            && y_min_   < o.y_max_
+            && o.x_min_ < x_max_
+            && o.y_min_ < y_max_;
+    }
+    // Largest minima and smallest maxima; the result is empty() when the boxes do not intersect
+    box<T> intersection(box<T> const o) const{
+        return box<T>(
+            std::max(x_min_, o.x_min_),
+            std::min(x_max_, o.x_max_),
+            std::max(y_min_, o.y_min_),
+            std::min(y_max_, o.y_max_)
+        );
+    }
+    // Width and height of the box
+    point<T> dimensions() const{
+        return point<T>(x_max_-x_min_, y_max_-y_min_);
+    }
+    // True if the width or the height is not positive
+    bool empty() const{
+        point<T> const dims = dimensions();
+        return dims.x_ <= 0 or dims.y_ <= 0;
+    }
+
+    // Bound-wise conversion to a box of another type
+    template<typename S>
+    operator box<S>() const{
+        return box<S>(static_cast<S>(x_min_), static_cast<S>(x_max_), static_cast<S>(y_min_), static_cast<S>(y_max_));
+    }
+};
+
+// One orientation flag per axis; pin offsets are mirrored on an axis when its flag is false
+using orientation_t = point<bool>;
+
+} // Namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/detailed.hxx
+++ b/coloquinte/src/coloquinte/detailed.hxx
@ -0,0 +1,91 @@
+
+#ifndef COLOQUINTE_DETAILED
+#define COLOQUINTE_DETAILED
+
+#include "common.hxx"
+#include "netlist.hxx"
+
+#include <vector>
+#include <limits>
+
+namespace coloquinte{
+namespace dp{
+
+// Largest index_t value; NOTE(review): presumably used as the "no cell" marker - confirm against the implementation
+const index_t null_ind = std::numeric_limits<index_t>::max();
+
+// A placement on rows together with the topology of the cells on these rows
+struct detailed_placement{
+    // All position and orientation stuff
+    placement_t plt_;
+
+    // For each cell, the first row it occupies; a cell occupies cell_height(c) consecutive rows
+    std::vector<index_t> cell_rows_;
+
+    // The placement region
+    int_t min_x_, max_x_;
+    int_t y_origin_;
+    int_t row_height_;
+
+    // Encode the topological state of the circuit: which cells are near each other
+    // Makes extracting part of the circuit or optimizing positions at fixed topology easy
+    // NOTE(review): "columns" below presumably means rows, since the entries of a cell are indexed by row (see neighbour_index)
+    std::vector<std::pair<index_t, index_t> > neighbours_; // The cells before and after on each row; cells spanning multiple columns use several positions
+    // In order to get the neighbours in the detailed placement:
+    // the entries of cell c in neighbours_ are in [neighbours_limits_[c], neighbours_limits_[c+1])
+    std::vector<index_t> neighbours_limits_;
+
+    std::vector<index_t> row_first_cells_, row_last_cells_; // For each row, which cells are on the boundaries
+
+    // Tests the coherency between positions, widths and topological representation
+    void selfcheck() const;
+
+    detailed_placement(
+            placement_t pl,
+            std::vector<index_t> placement_rows,
+            std::vector<index_t> cell_heights,
+            std::vector<std::vector<index_t> > rows,
+            int_t min_x, int_t max_x,
+            int_t y_origin,
+            index_t nbr_rows, int_t row_height
+        );
+
+    // Number of rows occupied by cell c
+    index_t cell_height(index_t c) const{ return neighbours_limits_[c+1] - neighbours_limits_[c]; }
+    index_t cell_cnt() const{ return cell_rows_.size(); }
+    index_t row_cnt()  const{ return row_first_cells_.size(); }
+    // Index in neighbours_ of the entry of cell c for row r
+    index_t neighbour_index(index_t c, index_t r) const{
+        // Unsigned arithmetic: the assertion also fails when r is before the first row of the cell
+        assert(r - cell_rows_[c] < cell_height(c));
+        return neighbours_limits_[c] + r - cell_rows_[c];
+    }
+
+    void swap_standard_cell_topologies(index_t c1, index_t c2);
+    std::pair<int_t, int_t> get_limit_positions(netlist const & circuit, index_t c) const;
+
+    // Traversal of the cells of a row
+    index_t get_first_cell_on_row(index_t r);
+    index_t get_next_cell_on_row(index_t c, index_t r);
+    index_t get_prev_cell_on_row(index_t c, index_t r);
+
+    index_t get_first_standard_cell_on_row(index_t r);
+    index_t get_next_standard_cell_on_row(index_t c, index_t r);
+
+    void reorder_standard_cells(std::vector<index_t> const old_order, std::vector<index_t> const new_order);
+    void reorder_cells(std::vector<index_t> const old_order, std::vector<index_t> const new_order, index_t row);
+};
+
+// Optimization passes on a detailed placement, all defined elsewhere; they modify pl in place.
+// Each pass comes in several variants according to the suffix of its name (HPWL/RSMT, convex/noncvx).
+// NOTE(review): the semantics of row_extent, cell_extent and range are not visible here - see the implementations.
+void swaps_global_HPWL(netlist const & circuit, detailed_placement & pl, index_t row_extent, index_t cell_extent, bool try_flip = false);
+void swaps_global_RSMT(netlist const & circuit, detailed_placement & pl, index_t row_extent, index_t cell_extent, bool try_flip = false);
+
+void swaps_row_convex_HPWL(netlist const & circuit, detailed_placement & pl, index_t range);
+void swaps_row_convex_RSMT(netlist const & circuit, detailed_placement & pl, index_t range);
+void swaps_row_noncvx_HPWL(netlist const & circuit, detailed_placement & pl, index_t range);
+void swaps_row_noncvx_RSMT(netlist const & circuit, detailed_placement & pl, index_t range);
+
+void OSRP_convex_HPWL(netlist const & circuit, detailed_placement & pl);
+void OSRP_convex_RSMT(netlist const & circuit, detailed_placement & pl);
+void OSRP_noncvx_HPWL(netlist const & circuit, detailed_placement & pl);
+void OSRP_noncvx_RSMT(netlist const & circuit, detailed_placement & pl);
+
+void optimize_on_topology_HPWL(netlist const & circuit, detailed_placement & pl);
+
+void row_compatible_orientation(netlist const & circuit, detailed_placement & pl, bool first_row_orient);
+
+} // namespace dp
+} // namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/legalizer.hxx
+++ b/coloquinte/src/coloquinte/legalizer.hxx
@ -0,0 +1,12 @@
+
+#ifndef COLOQUINTE_LEGALIZER
+#define COLOQUINTE_LEGALIZER
+
+#include "circuit.hxx"
+#include "detailed.hxx"
+
+namespace coloquinte{
+namespace dp{
+
+// Builds a detailed placement from the placement pl, on the given surface and with rows of height row_height
+// (defined elsewhere)
+detailed_placement legalize(netlist const & circuit, placement_t const & pl, box<int_t> surface, int_t row_height);
+// Exports the detailed placement dpl to the placement pl (defined elsewhere)
+void get_result(netlist const & circuit, detailed_placement const & dpl, placement_t & pl);
+
+} // namespace dp
+} // namespace coloquinte
+
+#endif
--- a/coloquinte/src/coloquinte/netlist.hxx
+++ b/coloquinte/src/coloquinte/netlist.hxx
@ -0,0 +1,244 @@
+
+#ifndef COLOQUINTE_NETLIST
+#define COLOQUINTE_NETLIST
+
+#include "common.hxx"
+#include <vector>
+#include <cassert>
+
+
+namespace coloquinte{
+
+// Structures for construction and circuit_loader
+
+// A pin as given at construction time: its offset and the cell and net it connects
+struct temporary_pin{
+    point<int_t> offset;
+    index_t cell_ind, net_ind;
+
+    temporary_pin(){}
+    temporary_pin(point<int_t> offs, index_t c, index_t n)
+        : offset(offs)
+        , cell_ind(c)
+        , net_ind(n)
+    {}
+};
+
+// A cell as given at construction time; its area is the product of its two dimensions
+struct temporary_cell{
+    point<int_t> size;
+    capacity_t area;
+    mask_t attributes;
+    index_t list_index;
+
+    temporary_cell(){}
+    temporary_cell(point<int_t> s, mask_t attr, index_t ind)
+        : size(s)
+        , area(static_cast<capacity_t>(s.x_) * static_cast<capacity_t>(s.y_)) // Computed on the wide type
+        , attributes(attr)
+        , list_index(ind)
+    {}
+};
+
+// A net as given at construction time
+struct temporary_net{
+    int_t weight;
+    index_t list_index;
+
+    temporary_net(){}
+    // Beware of the argument order: the index comes first, then the weight
+    temporary_net(index_t ind, int_t wght)
+        : weight(wght)
+        , list_index(ind)
+    {}
+};
+
+
+// Main class
+//
+// Stores the cells, the nets and the pins connecting them in a compact form.
+// The pins are stored sorted by net; a second index gives the pins of each cell.
+// Cells and nets are accessed through lightweight views (internal_cell and
+// internal_net) which can be iterated to enumerate their pins.
+class netlist{
+    std::vector<int_t>       net_weights_;
+
+    std::vector<capacity_t>    cell_areas_;
+    std::vector<point<int_t> > cell_sizes_;
+    std::vector<mask_t>        cell_attributes_;
+
+    // Mapping of the order given at construction time to the internal representation
+    std::vector<index_t>       cell_internal_mapping_;
+    std::vector<index_t>       net_internal_mapping_;
+
+    // Optimized sparse storage for nets:
+    // the pins of net n are at the indexes [net_limits_[n], net_limits_[n+1])
+    std::vector<index_t>         net_limits_;
+    std::vector<index_t>         cell_indexes_;
+    std::vector<point<int_t> > pin_offsets_;
+
+    // Sparse storage from cell to net membership:
+    // the entries of cell c are at the indexes [cell_limits_[c], cell_limits_[c+1]);
+    // pin_indexes_ gives the index of the pin in the net storage above
+    std::vector<index_t>         cell_limits_;
+    std::vector<index_t>         net_indexes_;
+    std::vector<index_t>         pin_indexes_;
+
+    public:
+    netlist(std::vector<temporary_cell> cells, std::vector<temporary_net> nets, std::vector<temporary_pin> all_pins);
+    netlist(){}
+
+    void selfcheck() const;
+
+    // A pin as returned by the iterators
+    struct pin_t{
+        point<int_t> offset;
+        index_t cell_ind, net_ind;
+        pin_t(point<int_t> offs, index_t c, index_t n) : offset(offs), cell_ind(c), net_ind(n){}
+    };
+
+    // Iterates on the pins of a net
+    class net_pin_iterator{
+        index_t pin_ind, net_ind;
+        netlist const & N;
+
+        public:
+        pin_t operator*() const{
+            return pin_t(N.pin_offsets_[pin_ind], N.cell_indexes_[pin_ind], net_ind);
+        }
+        net_pin_iterator & operator++(){
+            pin_ind++;
+            return *this;
+        }
+        // Only the pin index is compared: iterators are expected to come from the same net
+        bool operator!=(net_pin_iterator const o) const{
+            return pin_ind != o.pin_ind;
+        }
+
+        net_pin_iterator(index_t net_index, index_t pin_index, netlist const & orig) : pin_ind(pin_index), net_ind(net_index), N(orig){}
+    };
+
+    // Iterates on the pins of a cell
+    class cell_pin_iterator{
+        index_t pin_ind, cell_ind;
+        netlist const & N;
+
+        public:
+        pin_t operator*() const{
+            return pin_t(N.pin_offsets_[N.pin_indexes_[pin_ind]], cell_ind, N.net_indexes_[pin_ind]);
+        }
+        cell_pin_iterator & operator++(){
+            pin_ind++;
+            return *this;
+        }
+        // Only the pin index is compared: iterators are expected to come from the same cell
+        bool operator!=(cell_pin_iterator const o) const{
+            return pin_ind != o.pin_ind;
+        }
+
+        cell_pin_iterator(index_t cell_index, index_t pin_index, netlist const & orig) : pin_ind(pin_index), cell_ind(cell_index), N(orig){}
+    };
+
+    // View of a cell: a copy of its characteristics and a reference to the netlist.
+    // It must not outlive the netlist it was obtained from.
+    struct internal_cell{
+        point<int_t> size;
+        capacity_t area;
+        mask_t attributes;
+        netlist const & N;
+        index_t index;
+        index_t pin_cnt;
+
+        internal_cell(index_t ind, netlist const & orig) :
+            size(orig.cell_sizes_[ind]),
+            area(orig.cell_areas_[ind]),
+            attributes(orig.cell_attributes_[ind]),
+            N(orig),
+            index(ind),
+            pin_cnt(N.cell_limits_[ind+1] - N.cell_limits_[ind])
+            {}
+
+        // Const since they only read the netlist: a const view can be iterated too
+        cell_pin_iterator begin() const{ return cell_pin_iterator(index, N.cell_limits_[index], N); }
+        cell_pin_iterator end() const{ return cell_pin_iterator(index, N.cell_limits_[index+1], N); }
+    };
+
+    // View of a net: a copy of its characteristics and a reference to the netlist.
+    // It must not outlive the netlist it was obtained from.
+    struct internal_net{
+        int_t weight;
+        netlist const & N;
+        index_t index;
+        index_t pin_cnt;
+
+        internal_net(index_t ind, netlist const & orig) :
+            weight(orig.net_weights_[ind]),
+            N(orig),
+            index(ind),
+            pin_cnt(N.net_limits_[ind+1] - N.net_limits_[ind])
+            {}
+
+        // Const since they only read the netlist: a const view can be iterated too
+        net_pin_iterator begin() const{ return net_pin_iterator(index, N.net_limits_[index], N); }
+        net_pin_iterator end() const{ return net_pin_iterator(index, N.net_limits_[index+1], N); }
+    };
+
+    internal_cell get_cell(index_t ind) const{
+        return internal_cell(ind, *this);
+    }
+    internal_net  get_net(index_t ind) const{
+        return internal_net(ind, *this);
+    }
+
+    index_t cell_cnt() const{ return cell_internal_mapping_.size(); }
+    index_t net_cnt()  const{ return net_internal_mapping_.size(); }
+    index_t pin_cnt()  const{ return pin_offsets_.size(); }
+
+    // Internal index of a cell or net from its index at construction time
+    index_t get_cell_ind(index_t external_ind) const{ return cell_internal_mapping_[external_ind]; }
+    index_t get_net_ind(index_t external_ind) const{ return net_internal_mapping_[external_ind]; }
+
+};
+
+// Builds the sparse representation of the circuit from the construction-time lists.
+//
+// cells and nets are taken in the given order (the internal mappings are the
+// identity and the list_index fields are not read). The cell_ind and net_ind of
+// every pin must be valid indexes in these lists: a pin referring to a missing
+// cell or net would not be stored and would leave the limits inconsistent.
+//
+// The pins are first sorted by net to fill the net storage, then sorted by cell
+// to fill the cell storage; pin_index remembers the position of each pin in the
+// net storage between the two passes.
+inline netlist::netlist(std::vector<temporary_cell> cells, std::vector<temporary_net> nets, std::vector<temporary_pin> all_pins){
+    struct extended_pin : public temporary_pin{
+        index_t pin_index;
+        // pin_index gets its actual value during the pass on the nets
+        extended_pin(temporary_pin const p) : temporary_pin(p), pin_index(0){}
+    };
+    std::vector<extended_pin> pins;
+    pins.reserve(all_pins.size());
+    for(temporary_pin const & p : all_pins){
+        assert(p.cell_ind < cells.size() and p.net_ind < nets.size());
+        pins.push_back(extended_pin(p));
+    }
+
+    cell_limits_.resize(cells.size()+1);
+    net_limits_.resize(nets.size()+1);
+
+    net_weights_.resize(nets.size());
+
+    cell_areas_.resize(cells.size());
+    cell_sizes_.resize(cells.size());
+    cell_attributes_.resize(cells.size());
+
+    cell_internal_mapping_.resize(cells.size());
+    net_internal_mapping_.resize(nets.size());
+
+    cell_indexes_.resize(pins.size());
+    pin_offsets_.resize(pins.size());
+    net_indexes_.resize(pins.size());
+    pin_indexes_.resize(pins.size());
+
+    for(index_t i=0; i<nets.size(); ++i){
+        net_internal_mapping_[i] = i;
+    }
+    for(index_t i=0; i<cells.size(); ++i){
+        cell_internal_mapping_[i] = i;
+    }
+
+    // Pass on the nets: the pins of each net become contiguous
+    std::sort(pins.begin(), pins.end(), [](extended_pin const a, extended_pin const b){ return a.net_ind < b.net_ind; });
+    for(index_t n=0, p=0; n<nets.size(); ++n){
+        net_weights_[n] = nets[n].weight;
+
+        net_limits_[n] = p;
+        while(p<pins.size() && pins[p].net_ind == n){
+            cell_indexes_[p] = pins[p].cell_ind;
+            pin_offsets_[p]  = pins[p].offset;
+            pins[p].pin_index = p;
+            ++p;
+        }
+    }
+    net_limits_.back() = pins.size();
+
+    // Pass on the cells: for each cell, the nets it belongs to and the corresponding pins
+    std::sort(pins.begin(), pins.end(), [](extended_pin const a, extended_pin const b){ return a.cell_ind < b.cell_ind; });
+
+    for(index_t c=0, p=0; c<cells.size(); ++c){
+        cell_areas_[c] = cells[c].area;
+        cell_attributes_[c] = cells[c].attributes;
+        cell_sizes_[c] = cells[c].size;
+
+        cell_limits_[c] = p;
+        while(p<pins.size() && pins[p].cell_ind == c){
+            net_indexes_[p] = pins[p].net_ind;
+            pin_indexes_[p] = pins[p].pin_index;
+            ++p;
+        }
+    }
+    cell_limits_.back() = pins.size();
+}
+
+// Position and orientation of every cell, indexed by cell
+struct placement_t{
+    std::vector<point<int_t> > positions_;
+    std::vector<point<bool> > orientations_;
+
+    // Number of cells; the two vectors are expected to have the same size
+    index_t cell_cnt() const{
+        auto const nbr_positions = positions_.size();
+        assert(nbr_positions == orientations_.size());
+        return nbr_positions;
+    }
+
+    void selfcheck() const;
+};
+
+} // namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/optimization_subproblems.hxx
+++ b/coloquinte/src/coloquinte/optimization_subproblems.hxx
@ -0,0 +1,158 @@
+
+#ifndef COLOQUINTE_GP_OPTSUBPROBLEMS 
+#define COLOQUINTE_GP_OPTSUBPROBLEMS 
+
+#include "common.hxx"
+
+#include <queue>
+#include <vector>
+#include <cassert>
+
+namespace coloquinte{
+
+typedef std::pair<int_t, capacity_t> t1D_elt; // An int_t paired with a capacity_t; presumably a 1D position and the amount available/required there -- confirm in the implementation
+
+std::vector<capacity_t>  transport_1D(std::vector<t1D_elt> sources, std::vector<t1D_elt> sinks); // Declared only; both vectors are taken by value. NOTE(review): the meaning of the returned capacities is not visible from this header
+std::vector<std::vector<capacity_t> > transport_convex(std::vector<capacity_t> const & capacities, std::vector<capacity_t> const & demands, std::vector<std::vector<float_t> > const & costs); // Declared only; returns a matrix of capacity_t, presumably the amount moved for each (capacity, demand) pair -- confirm index order in the implementation
+std::vector<std::vector<capacity_t> > transport_generic(std::vector<capacity_t> const & capacities, std::vector<capacity_t> const & demands, std::vector<std::vector<float_t> > const & costs); // Same signature as transport_convex; presumably the variant without a convexity assumption on costs -- confirm
+
+template<typename T>
+struct legalizable_task{ // A cell to push on a row: its width, the position it aims for and an index identifying it
+    T width; // Width of the cell
+    T target_pos; // Position the cell would like to get; also the sort key of operator<
+    index_t ind; // Index stored with the task and handed back by OSRP_leg::get_placement()
+    legalizable_task(T w, T p, index_t i) : width(w), target_pos(p), ind(i){} // Note the argument order: width, target position, index
+    bool operator<(legalizable_task<T> const o) const{ return target_pos < o.target_pos; } // Orders tasks by increasing target position
+};
+
+// A class to obtain the optimal positions minimizing total weighted displacement along a row
+// It is an ordered single row problem/fixed order single machine scheduling problem, solved by the clumping/specialized cascading descent algorithm
+// The cost is linear in the distance to the target position, weighted by the width of the cells
+template<typename T>
+class OSRP_leg{
+    // A breakpoint met while scanning the row: get_displacement adds `weight` to its slope when it passes `absolute_pos`
+    struct OSRP_bound{
+        T absolute_pos; // Will be the target absolute position of the cell
+        T weight;       // Will be the width of the cell
+    
+        bool operator<(OSRP_bound const o) const{ return absolute_pos < o.absolute_pos; } // In the priority queue, top() is therefore the bound with the largest absolute_pos
+        OSRP_bound(T w, T abs_pos) : absolute_pos(abs_pos), weight(w) {} // Note the argument order: weight first, then position
+    };
+
+    T begin, end; // Limits of the row
+
+    std::vector<index_t> cells;            // The indexes in the circuit
+    std::vector<T>   constraining_pos; // Where the cells have been pushed and constrain the positions of preceding cells
+    std::vector<T>   prev_width;       // Cumulative width of the cells: calculates the absolute position of new cells; always holds cells.size()+1 entries, the first being 0
+
+    std::priority_queue<OSRP_bound> bounds;
+
+    // Get the cost of pushing a cell on the row; the object is modified only when update is true
+    T get_displacement(legalizable_task<T> const newly_pushed, bool update);
+
+    public:
+    T current_width() const{ return prev_width.back(); } // Total width of the cells pushed so far
+    T remaining_space() const{ return end - begin - current_width(); } // Row length not yet taken by pushed cells
+    T last_available_pos() const{ return constraining_pos.back() + current_width(); } // Precondition: at least one cell has been pushed (constraining_pos is empty before that)
+
+    T get_cost(legalizable_task<T> const task){ return get_displacement(task, false); } // Query only: the row is left as it was
+    void push(legalizable_task<T> const task){ get_displacement(task, true); } // Appends the task at the end of the row
+
+    // Initialize
+    OSRP_leg(T b, T e) : begin(b), end(e), prev_width(1, 0) {}
+    // Default construction gives an empty row of zero length. The limits are value-initialized and
+    // prev_width gets its leading 0 so that current_width() and remaining_space() are well defined,
+    // exactly as after the two-argument constructor.
+    OSRP_leg() : begin(), end(), prev_width(1, 0) {}
+
+    typedef std::pair<index_t, T> result_t; // (index of the cell, position of the cell on the row)
+
+    // Get the resulting placement
+    std::vector<result_t> get_placement() const;
+};
+
+struct cell_bound{ // A (cell, position, slope) triple handed to the single-row placers declared below
+    index_t c; // Order of the cell (see the constructor's first parameter); the sort key of operator<
+    int_t pos; // Position attached to the bound
+    int_t slope; // Slope attached to the bound; presumably the cost slope change at pos -- confirm in the implementation
+    bool operator<(cell_bound const o) const{ return c < o.c; } // Orders bounds by cell only; pos and slope are ignored
+    cell_bound(index_t order, int_t p, int_t s) : c(order), pos(p), slope(s) {}
+};
+
+bool place_convex_single_row(std::vector<int_t> const & widths, std::vector<std::pair<int_t, int_t> > const & ranges, std::vector<cell_bound> bounds, std::vector<int_t> const & const_slopes, std::vector<int_t> & positions); // Declared only; bounds is taken by value, positions is the output. NOTE(review): the meaning of the bool result is not visible here -- presumably success/feasibility
+bool place_noncvx_single_row(std::vector<int_t> const & widths, std::vector<std::pair<int_t, int_t> > const & ranges, std::vector<int> const & flippables, std::vector<cell_bound> bounds, std::vector<int_t> const & const_slopes, std::vector<int_t> & positions, std::vector<int> & flippings); // Declared only; same as above plus flippables (input) and flippings (output), presumably per-cell flip permissions and decisions -- confirm
+
+template<typename T>
+inline T OSRP_leg<T>::get_displacement(legalizable_task<T> const newly_pushed, bool update){ // Returns the cost of appending newly_pushed to the row; the row is modified only when update is true
+    T target_abs_pos = newly_pushed.target_pos - current_width(); // "Absolute" position: the target minus the cumulative width of the cells already pushed
+    T width = newly_pushed.width;
+    T slope = - width; // Starts at minus the width of the new cell; the weight of every bound passed is added to it
+
+    T cur_pos  = end; // The scan starts from the end of the row
+    T cur_cost = 0; // Cost accumulated for the cells already on the row
+
+    std::vector<OSRP_bound> passed_bounds; // Bounds popped during a query (update == false), pushed back before returning
+
+    while( not bounds.empty() and // bounds.top() is the remaining bound with the largest absolute_pos
+        ((slope < 0 and bounds.top().absolute_pos > target_abs_pos) // Not reached equilibrium
+        or bounds.top().absolute_pos > end - current_width() - width) // Still not a legal position
+        ){
+        T old_pos = cur_pos;
+        cur_pos = bounds.top().absolute_pos;
+        cur_cost += (old_pos - cur_pos) * (slope + width); // The additional cost for the other cells encountered
+        slope += bounds.top().weight;
+
+        // Remember which bounds we encountered in order to reset the object to its initial state
+        if(not update)
+            passed_bounds.push_back(bounds.top());
+        bounds.pop();
+    }
+
+    T final_abs_pos = std::min(end - current_width() - width, // Always before the end and after the beginning
+                            std::max(begin, slope >= 0 ? cur_pos : target_abs_pos) // but did we stop before reaching the target position? 
+                                );
+
+    cur_cost += (cur_pos - final_abs_pos) * (slope + width); // The additional cost for the other cells encountered
+
+    if(std::numeric_limits<T>::is_integer){ // The bounds are asserted only for integer T
+        assert(final_abs_pos >= begin);
+        assert(final_abs_pos <= end - current_width() - width);
+    }
+
+    if(update){
+        prev_width.push_back(width + current_width()); // current_width() is still the width before this cell at this point
+        cells.push_back(newly_pushed.ind);
+        constraining_pos.push_back(final_abs_pos);
+        if(slope > 0){ // Remaining capacity of an encountered bound
+            bounds.push(OSRP_bound(slope, cur_pos));
+        }
+        // The new bound, minus what it absorbs of the remaining slope
+        if(target_abs_pos > begin){
+            bounds.push(OSRP_bound(2*width + std::min(slope, static_cast<T>(0) ), target_abs_pos));
+        }
+    }
+    else{
+        for(OSRP_bound b : passed_bounds){ // Query only: restore every bound popped above
+            bounds.push(b);
+        }
+    }
+
+    return cur_cost + width * std::abs(final_abs_pos - target_abs_pos); // Add the cost of the new cell
+}
+
+template<typename T>
+inline std::vector<std::pair<index_t, T> > OSRP_leg<T>::get_placement() const{ // Returns one (cell index, position on the row) pair per pushed cell, in push order
+    auto final_abs_pos = constraining_pos; // Work on a copy: the object itself is not modified
+    std::partial_sum(final_abs_pos.rbegin(), final_abs_pos.rend(), final_abs_pos.rbegin(), [](T a, T b)->T{ return std::min(a,b); }); // Running minimum from the back: each entry becomes the minimum of itself and all later entries. NOTE(review): std::partial_sum comes from <numeric>, which this header does not include directly
+
+    std::vector<result_t> ret(cells.size());
+    for(index_t i=0; i<cells.size(); ++i){
+        ret[i] = result_t(cells[i], final_abs_pos[i] + prev_width[i]); // Back from the absolute position: add the cumulative width of the preceding cells
+
+        if(std::numeric_limits<T>::is_integer){ // The bounds are asserted only for integer T
+            assert(final_abs_pos[i] >= begin);
+            assert(final_abs_pos[i] + prev_width[i+1] <= end); // prev_width has cells.size()+1 entries, so i+1 is in range
+        }
+    }
+    return ret;
+}
+
+}
+#endif
+
--- a/coloquinte/src/coloquinte/piecewise_linear.hxx
+++ b/coloquinte/src/coloquinte/piecewise_linear.hxx
@ -0,0 +1,29 @@
+
+
+#include "common.hxx"
+
+#include <vector>
+
+namespace coloquinte{
+
+typedef std::pair<int_t, int_t> p_v; // Presumably a (point, value) pair, as the name of point_values below suggests -- confirm in the implementation
+
+struct piecewise_linear_function{ // Function over int_t stored as a list of p_v pairs; every operation below is only declared here
+    std::vector<p_v> point_values; // The pairs describing the function; NOTE(review): ordering/interpolation conventions are not visible in this header
+
+    static piecewise_linear_function minimum(piecewise_linear_function const & a, piecewise_linear_function const & b); // Builds a new function from a and b; presumably their pointwise minimum -- confirm
+    piecewise_linear_function previous_min_of_sum(piecewise_linear_function const & o, int_t added_cell_width) const; // Returns a new function; semantics defined by the implementation (not visible here)
+    piecewise_linear_function previous_min() const; // Returns a new function; semantics defined by the implementation (not visible here)
+
+    int_t value_at(int_t pos) const; // Presumably evaluates the function at pos -- confirm
+    int_t last_before(int_t pos) const; // NOTE(review): meaning of the returned int_t is not visible here
+
+    void add_monotone(int_t slope, int_t offset); // Modifies the function in place
+    void add_bislope(int_t s_l, int_t s_r, int_t pos); // Modifies the function in place; presumably adds slope s_l left of pos and s_r right of it -- confirm
+
+    piecewise_linear_function(){} // Leaves point_values empty
+    piecewise_linear_function(int_t min_def, int_t max_def); // Presumably builds a function defined on [min_def, max_def] -- confirm
+};
+
+} // End namespace coloquinte
+
--- a/coloquinte/src/coloquinte/rough_legalizers.hxx
+++ b/coloquinte/src/coloquinte/rough_legalizers.hxx
@ -0,0 +1,252 @@
+
+#ifndef COLOQUINTE_GP_ROUGH_LEGALIZER
+#define COLOQUINTE_GP_ROUGH_LEGALIZER
+
+#include "common.hxx"
+#include "netlist.hxx"
+
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <functional>
+
+/*
+ * A simple class to perform approximate legalization with extreme efficiency
+ * 
+ * To be called during global placement or before an exact legalization
+ *
+ */
+
+namespace coloquinte{
+namespace gp{
+
+class region_distribution{
+    /*
+     * Coordinates are mostly float but obstacles and areas are integers for correctness
+     */
+
+    public:
+    struct movable_cell{
+        capacity_t demand_; // == area; No FP!!!
+        point<float_t> pos_;  // Target position, determining the cost to allocate it
+        // int_t x_size, y_size; // May split cells
+        index_t index_in_placement_; // Index of the cell in the placement it was taken from
+
+        movable_cell(); // Leaves every member uninitialized (see the inline definition after the class)
+        movable_cell(capacity_t demand, point<float_t> p, index_t ind);
+    };
+
+    // Specifies a maximum density of movable cells per usable area
+    // Representing either a macroblock or a routing congestion
+    struct density_limit{
+        box<int_t> box_; // Area the limit applies to
+        float_t density_; // from 0.0 for a macro to 1.0 if it does nothing
+    };
+
+    private:
+
+    struct region;
+    
+    struct cell_ref{ // The part of one cell that is allocated to one region
+        capacity_t allocated_capacity_; // Amount of the cell's demand held by the region
+        point<float_t> pos_; // Position used by region::distance()
+        index_t index_in_list_; // Presumably an index into cell_list_ -- confirm in the implementation
+
+        cell_ref(){} // Leaves every member uninitialized
+        cell_ref(capacity_t demand, point<float_t> p, index_t ind) : allocated_capacity_(demand), pos_(p), index_in_list_(ind){}
+        friend region;
+    };
+    
+    struct region{
+        public:
+        // Data members
+        capacity_t capacity_; // ==area; No floating point!!! 
+        point<float_t> pos_; // Position used by distance()
+    
+        std::vector<cell_ref> cell_references_; // The (fractions of) cells currently allocated to this region
+
+        // Constructors
+        region(){} // Necessary if we want to resize vectors 
+        region(capacity_t cap, point<float_t> pos, std::vector<cell_ref> cells);
+
+        // Helper functions for bipartitioning
+        private:
+        static void distribute_new_cells(region & a, region & b, std::vector<cell_ref> cells); // Called by the other two to do the dirty work
+        public:
+        void distribute_cells(region & a, region & b) const;    // Distribute the cells from one region to two
+        static void redistribute_cells(region & a, region & b); // Optimizes the distribution between two regions
+
+        // Helper functions for multipartitioning
+        private:
+        static void distribute_new_cells(std::vector<std::reference_wrapper<region_distribution::region> > regions, std::vector<cell_ref> cells);
+        public:
+        void distribute_cells(std::vector<std::reference_wrapper<region_distribution::region> > regions) const;
+        static void redistribute_cells(std::vector<std::reference_wrapper<region_distribution::region> > regions);
+
+        // Helper functions for 1D transportation
+        public:
+        static void distribute_new_cells(std::vector<std::reference_wrapper<region_distribution::region> > regions, std::vector<cell_ref> cells, std::function<float_t (point<float_t>)> coord); // coord maps a position to a float_t; presumably the coordinate along which the cells are spread -- confirm
+        static void redistribute_cells(std::vector<std::reference_wrapper<region_distribution::region> > & regions, std::function<float_t (point<float_t>)> coord); // Unlike the overloads above, regions is taken by reference here
+
+        public:
+        void uniquify_references();
+        void selfcheck() const;
+
+        // Accessors
+        capacity_t capacity() const; // capacity_
+        capacity_t allocated_capacity() const; // Sum of allocated_capacity_ over cell_references_
+        capacity_t unused_capacity() const; // capacity() - allocated_capacity()
+        index_t cell_cnt() const; // Number of entries in cell_references_
+
+        float_t distance(cell_ref const & C) const; // Manhattan distance between pos_ and C.pos_
+        float_t cost() const;
+    };
+
+    private:
+    // Members
+    index_t x_regions_cnt_, y_regions_cnt_; // Size of the grid of regions
+    
+    std::vector<movable_cell> cell_list_;
+    std::vector<region> placement_regions_; // x_regions_cnt_ * y_regions_cnt_ regions, indexed by y * x_regions_cnt_ + x (see get_region)
+
+    box<int_t> placement_area_;
+    std::vector<density_limit> density_map_;
+    const capacity_t full_density_mul; // Multiplier giving the grain for fractional areas for the surface
+          capacity_t cell_density_mul; // And for the cells
+    float_t density_scaling_factor_;
+    
+    private:
+    // Helper functions
+    region & get_region(index_t x_coord, index_t y_coord);
+    region const & get_region(index_t x_coord, index_t y_coord) const;
+    box<int_t> get_box(index_t x, index_t y, index_t x_cnt, index_t y_cnt) const;
+
+    static void sort_uniquify(std::vector<cell_ref> & cell_references);
+    static void just_uniquify(std::vector<cell_ref> & cell_references);
+
+    // Prepare regions with the right positions and capacities; different levels of nesting are compatible
+    std::vector<region> prepare_regions(index_t x_cnt, index_t y_cnt) const;
+
+    public:
+    
+    inline index_t x_regions_cnt() const;
+    inline index_t y_regions_cnt() const;
+    inline index_t regions_cnt()   const; // x_regions_cnt() * y_regions_cnt()
+    
+    inline index_t cell_cnt() const; // Number of entries in cell_list_
+    inline index_t fractional_cell_cnt() const; // Sum of cell_cnt() over all the regions
+    
+    /*
+     * Two types of export
+     *    Region center             : upper bound of legalization cost
+     *    1D quadratic optimization : lower bound of legalization cost
+     */
+
+    std::vector<movable_cell> export_positions() const;
+    std::vector<movable_cell> export_spread_positions_quadratic() const;
+    std::vector<movable_cell> export_spread_positions_linear() const;
+
+    // The cost as seen by the partitioning algorithms (but not the export)
+    float_t cost() const;
+
+    /*
+     * Further partitions
+     */
+    
+    void x_bipartition();
+    void y_bipartition();
+    void x_resize(index_t sz);
+    void y_resize(index_t sz);
+    void multipartition(index_t x_width, index_t y_width);
+    void multipartition(index_t width){ multipartition(width, width); } // Same width in both directions
+    
+    /*
+     * Optimization functions
+     */
+
+    // Bipartitioning: only two regions are considered at a time
+    void redo_adjacent_bipartitions();
+    void redo_diagonal_bipartitions();
+    void redo_bipartitions();
+
+    // Line partitioning: optimal on coordinate axis with Manhattan distance (Euclidean distance could use it in any direction)
+    void redo_line_partitions();
+
+    // Multipartitioning: several regions considered, slow runtimes
+    void redo_diag_partitions(index_t len);
+    void redo_multipartitions(index_t x_width, index_t y_width);
+    void redo_multipartitions(index_t width){ redo_multipartitions(width, width); } // Same width in both directions
+
+    // Try to remove duplicate fractional cells
+    void fractions_minimization();
+
+    // Verify
+    void selfcheck() const;
+
+    private:
+    region_distribution(box<int_t> placement_area, netlist const & circuit, placement_t const & pl, std::vector<density_limit> const & density_map, bool full_density); // Private: objects are obtained through the two static factories below
+
+    public:
+    /*
+     * Obtain a region_distribution from a placement
+     *
+     *     Full density: the object tries to pack the cells as much as possible while still respecting the density limits
+     *     Uniform density: not only are the density limits respected, the allocated capacities are proportional to the allowed densities
+     *
+     */
+
+    static region_distribution full_density_distribution(box<int_t> placement_area, netlist const & circuit, placement_t const & pl, std::vector<density_limit> const & density_map = std::vector<density_limit>());
+    static region_distribution uniform_density_distribution(box<int_t> placement_area, netlist const & circuit, placement_t const & pl, std::vector<density_limit> const & density_map = std::vector<density_limit>());
+
+    void update(netlist const & circuit, placement_t const & pl);
+};
+
+inline region_distribution::movable_cell::movable_cell(){} // Leaves every member uninitialized
+inline region_distribution::movable_cell::movable_cell(capacity_t demand, point<float_t> p, index_t ind) : demand_(demand), pos_(p), index_in_placement_(ind){}
+
+inline index_t region_distribution::x_regions_cnt() const { return x_regions_cnt_; }
+inline index_t region_distribution::y_regions_cnt() const { return y_regions_cnt_; }
+inline index_t region_distribution::regions_cnt()   const { index_t ret = x_regions_cnt() * y_regions_cnt(); assert(placement_regions_.size() == ret); return ret; } // The grid dimensions must match the number of stored regions
+inline region_distribution::region & region_distribution::get_region(index_t x_coord, index_t y_coord){ // Regions are stored row by row: index = y * x_regions_cnt() + x; no bounds check
+    return placement_regions_[y_coord * x_regions_cnt() + x_coord];
+}
+inline region_distribution::region const & region_distribution::get_region(index_t x_coord, index_t y_coord) const{ // Const overload of the accessor above
+    return placement_regions_[y_coord * x_regions_cnt() + x_coord];
+}
+
+inline index_t region_distribution::cell_cnt() const{ return cell_list_.size(); }
+inline index_t region_distribution::fractional_cell_cnt() const{
+    // Adds up cell_cnt() over every region, i.e. the total number of cell references held
+    index_t total = 0;
+    for(auto it = placement_regions_.begin(); it != placement_regions_.end(); ++it){
+        total += it->cell_cnt();
+    }
+    return total;
+}
+
+
+// Total capacity of the region
+inline capacity_t region_distribution::region::capacity() const{ return capacity_; }
+// Capacity not yet taken by the cell references
+inline capacity_t region_distribution::region::unused_capacity() const{ return capacity() - allocated_capacity(); }
+// Capacity taken by the cell references: the sum of their allocated_capacity_
+inline capacity_t region_distribution::region::allocated_capacity() const{
+    capacity_t sum = 0;
+    for(auto it = cell_references_.begin(); it != cell_references_.end(); ++it){
+        sum += it->allocated_capacity_;
+    }
+    return sum;
+}
+// Number of cell references held by the region
+inline index_t region_distribution::region::cell_cnt() const{ return cell_references_.size(); }
+
+inline float_t region_distribution::region::distance(region_distribution::cell_ref const & C) const{ // Manhattan (L1) distance between the region's pos_ and the cell reference's pos_
+    return std::abs(pos_.x_ - C.pos_.x_) + std::abs(pos_.y_ - C.pos_.y_);
+    /* Disabled variant: distance to a surface_ box (zero inside it) with a small quadratic penalty; region has no surface_ member in this header
+    float_t manhattan = std::max(static_cast<float_t>(0.0), std::max(C.pos_.x_ - surface_.x_max_, surface_.x_min_ - C.pos_.x_))
+                      + std::max(static_cast<float_t>(0.0), std::max(C.pos_.y_ - surface_.y_max_, surface_.y_min_ - C.pos_.y_));
+    return manhattan * (1.0 + manhattan * 0.0001);
+    */
+}
+
+
+} // Namespace gp
+} // Namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/coloquinte/solvers.hxx
+++ b/coloquinte/src/coloquinte/solvers.hxx
@ -0,0 +1,88 @@
+
+#ifndef COLOQUINE_GP_SOLVERS
+#define COLOQUINE_GP_SOLVERS
+
+#include "common.hxx"
+
+#include <vector>
+
+namespace coloquinte{
+namespace gp{
+
+struct matrix_doublet{ // A (column, value) pair; presumably one entry of a sparse matrix row -- confirm in the solver implementation
+    index_t c_; // Column index; the sort key of operator<
+    float val_; // NOTE(review): plain float here, while matrix_triplet stores a float_t -- confirm this is intended
+    bool operator<(matrix_doublet const o) const{ return c_ < o.c_; } // Orders by column only
+    matrix_doublet(){} // Leaves both members uninitialized
+    matrix_doublet(index_t c, float v) : c_(c), val_(v){}
+};
+
+// A (row, column, value) entry of the matrix of a linear_system
+struct matrix_triplet{
+    index_t r_, c_; // Row and column indexes
+    float_t val_;   // Coefficient stored at (r_, c_)
+    matrix_triplet(index_t ri, index_t ci, float_t v) : r_(ri), c_(ci), val_(v){}
+    // Lexicographic order on (row, column); the value is ignored.
+    // Const-qualified, like the other comparison operators of the library, so that it can be used on const triplets.
+    bool operator<(matrix_triplet const o) const{ return r_ < o.r_ || (r_ == o.r_ && c_ < o.c_); }
+};
+
+class linear_system{ // Accumulates matrix entries (matrix_) and a vector (target_), then solves with solve_CG
+    std::vector<matrix_triplet> matrix_; // Entries in insertion order; the same (row, col) may appear several times
+    std::vector<float_t> target_; // One value per variable; presumably the right-hand side of the system -- confirm in solve_CG
+    index_t internal_size_; // Set by the constructors and never changed in this header, even by add_variables
+    
+    public:
+    void add_triplet(index_t row, index_t col, float_t val){ matrix_.push_back(matrix_triplet(row, col, val)); } // Appends an entry; nothing is merged here
+
+    linear_system operator+(linear_system const & o) const;
+
+    void add_doublet(index_t row, float_t val){ // Adds val to the target of one row; row is not bounds-checked
+        target_[row] += val;
+    }
+
+    void add_force( // Couples the variables c1 and c2 with weight force
+        float_t force,
+        index_t c1,    index_t c2,
+        float_t offs1, float_t offs2
+    ){
+        add_triplet(c1, c1, force); // +force on both diagonal entries
+        add_triplet(c2, c2, force);
+        add_triplet(c1, c2, -force); // -force on both off-diagonal entries
+        add_triplet(c2, c1, -force);
+        add_doublet(c1, force * (offs2-offs1)); // Opposite contributions to the two targets
+        add_doublet(c2, force * (offs1-offs2));
+    }
+
+    void add_fixed_force( // Couples the variable c to the constant fixed_pos with weight force
+        float_t force,
+        index_t c,
+        float_t fixed_pos,
+        float_t offs
+    ){
+        add_triplet(c, c, force);
+        add_doublet(c, force * (fixed_pos-offs));
+    }
+
+    void add_anchor( // Same as add_fixed_force with a zero offset
+        float_t scale,
+        index_t c,
+        float_t pos
+    ){
+        add_triplet(c, c, scale);
+        add_doublet(c, scale*pos);
+    }
+
+    linear_system(index_t s) : target_(s, 0.0), internal_size_(s){} // s variables, all of them counted in internal_size_
+    linear_system(index_t s, index_t i) : target_(s, 0.0), internal_size_(i){} // s variables, with internal_size_ given separately
+
+    index_t size() const{ return target_.size(); } // Number of variables
+    index_t internal_size() const{ return internal_size_; }
+    void add_variables(index_t cnt){ target_.resize(target_.size() + cnt, 0.0); } // Appends cnt variables with a zero target
+
+    std::vector<float_t> solve_CG(std::vector<float_t> guess, index_t nbr_iter); // Declared only; presumably a conjugate gradient solve started from guess and limited to nbr_iter iterations -- confirm
+};
+
+} // namespace gp
+} // namespace coloquinte
+
+#endif
+
+
--- a/coloquinte/src/coloquinte/topologies.hxx
+++ b/coloquinte/src/coloquinte/topologies.hxx
@ -0,0 +1,35 @@
+
+#include "common.hxx"
+
+#include <array>
+
+#ifndef COLOQUINTE_TOPOLOGIES
+#define COLOQUINTE_TOPOLOGIES
+
+namespace coloquinte{
+namespace steiner_lookup{
+
+template<int pin_cnt>
+struct Hconnectivity{ // One packed connection table for nets of pin_cnt pins; both member functions are only declared here
+    // The edges and the couple of pins connected to the extreme ones are represented by one char each
+    // The first 4 bits represent the first pin minus one, the next 4 bits the second pin minus one
+    std::uint8_t connexions[pin_cnt-3]; // pin_cnt-3 packed edges; NOTE(review): std::uint8_t needs <cstdint>, which this header does not include directly
+    std::uint8_t extremes; // The packed couple of pins connected to the extreme ones
+
+    int_t get_wirelength(std::array<point<int_t>, pin_cnt> const sorted_points) const; // The points are taken by value; presumably they must be sorted as the parameter name says -- confirm the sort key
+    std::array<std::pair<index_t, index_t>, pin_cnt-1> get_x_topology(std::array<point<int_t>, pin_cnt> const sorted_points) const; // Returns pin_cnt-1 pairs of indexes, presumably the edges of the tree -- confirm
+};
+
+extern std::array<Hconnectivity<4>, 2> const topologies_4; // Lookup tables, one per pin count from 4 to 10; defined elsewhere
+extern std::array<Hconnectivity<5>, 6> const topologies_5;
+extern std::array<Hconnectivity<6>, 23> const topologies_6;
+extern std::array<Hconnectivity<7>, 111> const topologies_7;
+extern std::array<Hconnectivity<8>, 642> const topologies_8;
+extern std::array<Hconnectivity<9>, 4334> const topologies_9;
+extern std::array<Hconnectivity<10>, 33510> const topologies_10; // The largest table: 33510 entries
+
+}
+}
+
+#endif
+
--- a/coloquinte/src/coloquinte/union_find.hxx
+++ b/coloquinte/src/coloquinte/union_find.hxx
@ -0,0 +1,47 @@
+
+#ifndef COLOQUINTE_UNION_FIND
+#define COLOQUINTE_UNION_FIND
+
+#include "common.hxx"
+
+#include <vector>
+
+namespace coloquinte{
+
+// Disjoint sets over the indexes 0 .. size()-1, each set being identified by one representative element
+class union_find{
+	std::vector<index_t> connex_representants; // Parent of each element; a representative is its own parent
+
+	public:
+	index_t size() const { return connex_representants.size(); }
+
+	// Merges the set containing a with the set containing b.
+	// The link must go from one representative to the other: linking the representative of a
+	// directly to b creates a cycle (and an endless recursion in find) as soon as a and b
+	// already belong to the same set and b is not its representative.
+	void merge(index_t a, index_t b){
+		index_t const root_a = find(a);
+		index_t const root_b = find(b);
+		connex_representants[root_a] = root_b; // No-op when both are already in the same set
+	}
+
+	// Returns the representative of the set containing ind.
+	// Every element visited on the way is relinked directly to the representative (path compression).
+	index_t find(index_t ind){
+		if(connex_representants[ind] != ind){
+			connex_representants[ind] = find(connex_representants[ind]);
+		}
+		return connex_representants[ind];
+	}
+
+	// Creates s singleton sets: every element is its own representative
+	union_find(index_t s) : connex_representants(s){
+		for(index_t i=0; i<size(); ++i){
+			connex_representants[i] = i;
+		}
+	}
+	
+    // True when all the elements belong to a single set (also true for 0 or 1 element)
+    bool is_connex(){
+        bool connex = true;
+        for(index_t i=0; i+1<size(); ++i){
+            connex = connex && (find(i) == find(i+1));
+        }
+        return connex;
+    }
+	
+};
+
+} // End namespace coloquinte
+
+#endif
+
--- a/coloquinte/src/detailed.cxx
+++ b/coloquinte/src/detailed.cxx
@ -0,0 +1,260 @@
+
+#include "coloquinte/detailed.hxx"
+#include "coloquinte/circuit_helper.hxx"
+
+#include <cassert>
+
+namespace coloquinte{
+namespace dp{
+
+detailed_placement::detailed_placement( // Builds the per-row neighbour links from the ordered list of cells of each row
+        placement_t pl,
+        std::vector<index_t> placement_rows, // For each cell, the row stored in cell_rows_
+        std::vector<index_t> cell_heights, // For each cell, the number of rows it spans
+        std::vector<std::vector<index_t> > rows, // For each row, its cells in order
+        int_t min_x, int_t max_x,
+        int_t y_origin,
+        index_t nbr_rows, int_t row_height // row_height is only checked to be positive in this constructor
+    )
+    :
+        plt_(pl),
+        cell_rows_(placement_rows),
+        min_x_(min_x), max_x_(max_x),
+        y_origin_(y_origin)
+    {
+
+    assert(row_height > 0);
+    assert(min_x < max_x);
+    assert(rows.size() == nbr_rows);
+
+    neighbours_limits_.push_back(0); // Prefix sums of the heights: cell c owns the entries [limits[c], limits[c+1]) of neighbours_
+    for(index_t h : cell_heights){
+        neighbours_limits_.push_back(neighbours_limits_.back() + h);
+    }
+
+    neighbours_ .resize(neighbours_limits_.back(), std::pair<index_t, index_t>(null_ind, null_ind) ); // (previous, next) cell for each row of each cell; null_ind until linked
+
+    row_first_cells_ .resize(nbr_rows, null_ind); // null_ind is kept for the empty rows
+    row_last_cells_  .resize(nbr_rows, null_ind);
+
+    std::vector<bool> explored(neighbours_limits_.back(), false); // One flag per entry of neighbours_
+    // Now we extract the dependencies
+    for(index_t r=0; r<rows.size(); ++r){
+
+        if(not rows[r].empty()){
+            row_first_cells_[r] = rows[r].front();
+            row_last_cells_[r]  = rows[r].back();
+        }
+
+        for(index_t c : rows[r]){
+            // Has this row of the cell already been visited?
+            assert(not explored[neighbour_index(c, r)]);
+            explored[neighbour_index(c, r)] = true;
+        }
+
+        for(index_t i=0; i+1<rows[r].size(); ++i){ // Link every pair of consecutive cells of the row
+            index_t c1 = rows[r][i], c2 = rows[r][i+1];
+
+            // Save in the internal format
+            neighbours_[neighbour_index(c1, r)].second = c2;
+            neighbours_[neighbour_index(c2, r)].first  = c1;
+
+            // The positions are correct
+        }
+    }
+
+    // Every level of every cell must have been visited
+    for(bool o : explored)
+        assert(o);
+
+    // Verify that we haven't made any obvious mistake
+    selfcheck();
+}
+
+void detailed_placement::selfcheck() const{
+    // Checks that the neighbour links and the recorded row extremities describe the same rows
+    assert(row_first_cells_.size() == row_last_cells_.size());
+
+    for(index_t c=0; c<cell_cnt(); ++c){
+        for(index_t level=0; level<cell_height(c); ++level){
+            // The x coordinates are not checked against min_x_ and max_x_ for now,
+            // since the positions of the obstacles are not modified
+
+            // The cell must fit within the rows
+            assert(cell_rows_[c] + cell_height(c) <= row_cnt());
+
+            auto const & links = neighbours_[level + neighbours_limits_[c]];
+            index_t const row = cell_rows_[c] + level;
+
+            if(links.first == null_ind){
+                // No predecessor: the cell is the beginning of the row
+                assert(row_first_cells_[row] == c);
+            }
+            else{
+                // The predecessor must point back to this cell
+                assert(neighbours_[neighbour_index(links.first, row)].second == c);
+            }
+
+            if(links.second == null_ind){
+                // No successor: the cell is the end of the row
+                assert(row_last_cells_[row] == c);
+            }
+            else{
+                // The successor must point back to this cell
+                assert(neighbours_[neighbour_index(links.second, row)].first == c);
+            }
+        }
+    }
+}
+
+void detailed_placement::swap_standard_cell_topologies(index_t c1, index_t c2){ // Exchanges the places of two cells of height 1 in the row structure; the positions in plt_ are not touched here
+    assert(cell_height(c1) == cell_height(c2));
+    assert(cell_height(c1) == 1 and cell_height(c2) == 1);
+
+    index_t row_c1 = cell_rows_[c1],
+            row_c2 = cell_rows_[c2];
+
+    index_t b_c1 = neighbours_[neighbours_limits_[c1]].first; // b_: the cell before, a_: the cell after (null_ind at a row extremity)
+    index_t b_c2 = neighbours_[neighbours_limits_[c2]].first;
+    index_t a_c1 = neighbours_[neighbours_limits_[c1]].second;
+    index_t a_c2 = neighbours_[neighbours_limits_[c2]].second;
+
+    // Two cases: they were adjacent or they were not
+    // Luckily updating in the neighbours first then swapping the recorded neighbours works in both cases for standard cells
+
+    // Update the pointers in the cells' neighbours
+    if(b_c1 != null_ind) neighbours_[neighbour_index(b_c1, row_c1)].second = c2;
+    else row_first_cells_[row_c1] = c2; // c1 was the first cell of its row
+    if(b_c2 != null_ind) neighbours_[neighbour_index(b_c2, row_c2)].second = c1;
+    else row_first_cells_[row_c2] = c1; // c2 was the first cell of its row
+
+    if(a_c1 != null_ind) neighbours_[neighbour_index(a_c1, row_c1)].first  = c2;
+    else row_last_cells_[row_c1] = c2; // c1 was the last cell of its row
+    if(a_c2 != null_ind) neighbours_[neighbour_index(a_c2, row_c2)].first  = c1;
+    else row_last_cells_[row_c2] = c1; // c2 was the last cell of its row
+
+    // Swap the properties in both cells
+    std::swap(neighbours_[neighbours_limits_[c1]], neighbours_[neighbours_limits_[c2]]); // Must come after the updates above (see the comment on the two cases)
+    std::swap(cell_rows_[c1], cell_rows_[c2]);
+}
+
+std::pair<int_t, int_t> detailed_placement::get_limit_positions(netlist const & circuit, index_t c) const{
+    // Start from the whole [min_x_, max_x_] span and tighten it with the neighbours
+    // found on every row occupied by the cell
+    int_t lower = min_x_;
+    int_t upper = max_x_;
+    for(index_t l=neighbours_limits_[c]; l<neighbours_limits_[c+1]; ++l){
+        index_t const before = neighbours_[l].first;
+        index_t const after  = neighbours_[l].second;
+
+        if(before != null_ind){
+            // Position of the preceding cell plus its x size
+            lower = std::max(lower, plt_.positions_[before].x_ + circuit.get_cell(before).size.x_);
+        }
+        if(after != null_ind){
+            // Position of the following cell
+            upper = std::min(upper, plt_.positions_[after].x_);
+        }
+    }
+    return std::pair<int_t, int_t>(lower, upper);
+}
+
+index_t detailed_placement::get_first_cell_on_row(index_t r){ // First cell recorded for row r; null_ind when the row is empty
+    return row_first_cells_[r];
+}
+
+index_t detailed_placement::get_first_standard_cell_on_row(index_t r){
+    // Walk the row from its first cell, skipping every cell whose height is not 1
+    index_t cur = get_first_cell_on_row(r);
+    for(;;){
+        if(cur == null_ind or cell_height(cur) == 1) break;
+        index_t const following = get_next_cell_on_row(cur, r);
+        assert(cur != following);
+        cur = following;
+    }
+    // A cell of height 1 reached through row r must be recorded on that row
+    assert(cur == null_ind or cell_rows_[cur] == r);
+    return cur;
+}
+
+index_t detailed_placement::get_next_cell_on_row(index_t c, index_t r){ // Cell following c on row r; null_ind when c is the last one
+    return neighbours_[neighbour_index(c, r)].second;
+}
+index_t detailed_placement::get_prev_cell_on_row(index_t c, index_t r){ // Cell preceding c on row r; null_ind when c is the first one
+    return neighbours_[neighbour_index(c, r)].first;
+}
+
+index_t detailed_placement::get_next_standard_cell_on_row(index_t c, index_t r){
+    // Advance at least once, then keep going while the cell reached has a height different from 1
+    for(;;){
+        index_t const following = get_next_cell_on_row(c, r);
+        assert(c != following);
+        c = following;
+        if(c == null_ind or cell_height(c) == 1) break;
+    }
+    // A cell of height 1 reached through row r must be recorded on that row
+    assert(c == null_ind or cell_rows_[c] == r);
+    return c;
+}
+
+void detailed_placement::reorder_cells(std::vector<index_t> const old_order, std::vector<index_t> const new_order, index_t r){
+    assert(old_order.size() == new_order.size());
+    assert(not old_order.empty());
+
+    // Cells surrounding the reordered span on row r (null_ind at a row extremity),
+    // read from the old order before any link is rewritten
+    index_t const left_of_span  = get_prev_cell_on_row(old_order.front(), r);
+    index_t const right_of_span = get_next_cell_on_row(old_order.back(),  r);
+
+    // Chain the cells in their new order
+    for(index_t i=0; i<new_order.size(); ++i){
+        auto & links = neighbours_[neighbour_index(new_order[i], r)];
+        links.first  = (i > 0)                  ? new_order[i-1] : left_of_span;
+        links.second = (i+1 < new_order.size()) ? new_order[i+1] : right_of_span;
+    }
+
+    // Reconnect the left side: either the preceding cell or the beginning of the row
+    if(left_of_span == null_ind){
+        row_first_cells_[r] = new_order.front();
+    }
+    else{
+        neighbours_[neighbour_index(left_of_span, r)].second = new_order.front();
+    }
+
+    // Reconnect the right side: either the following cell or the end of the row
+    if(right_of_span == null_ind){
+        row_last_cells_[r] = new_order.back();
+    }
+    else{
+        neighbours_[neighbour_index(right_of_span, r)].first = new_order.back();
+    }
+}
+
+void detailed_placement::reorder_standard_cells(std::vector<index_t> const old_order, std::vector<index_t> const new_order){
+    assert(old_order.size() == new_order.size());
+    assert(not old_order.empty());
+
+    // Cells surrounding the reordered span (null_ind at a row extremity),
+    // read from the old order before any link is rewritten
+    index_t const left_of_span  = neighbours_[neighbours_limits_[old_order.front()]].first;
+    index_t const right_of_span = neighbours_[neighbours_limits_[old_order.back() ]].second;
+
+    // The row is taken from the first cell; every other cell is checked against it
+    index_t const row = cell_rows_[new_order.front()];
+
+    // Chain the cells in their new order
+    for(index_t i=0; i<new_order.size(); ++i){
+        index_t const cur = new_order[i];
+        assert(cell_height(cur) == 1);
+        assert(cell_rows_[cur] == row);
+
+        auto & links = neighbours_[neighbours_limits_[cur]];
+        links.first  = (i > 0)                  ? new_order[i-1] : left_of_span;
+        links.second = (i+1 < new_order.size()) ? new_order[i+1] : right_of_span;
+    }
+
+    // Reconnect the left side: either the preceding cell or the beginning of the row
+    if(left_of_span == null_ind){
+        row_first_cells_[row] = new_order.front();
+    }
+    else{
+        neighbours_[neighbour_index(left_of_span, row)].second = new_order.front();
+    }
+
+    // Reconnect the right side: either the following cell or the end of the row
+    if(right_of_span == null_ind){
+        row_last_cells_[row] = new_order.back();
+    }
+    else{
+        neighbours_[neighbour_index(right_of_span, row)].first = new_order.back();
+    }
+}
+
+void row_compatible_orientation(netlist const & circuit, detailed_placement & pl, bool first_row_orient){
+    // Sets the y orientation of every y-flippable cell of height 1 from the parity of its row:
+    // even rows get first_row_orient, odd rows get the opposite
+    for(index_t c=0; c<circuit.cell_cnt(); ++c){
+        bool const y_flippable = (circuit.get_cell(c).attributes & YFlippable) != 0;
+        if(not y_flippable or pl.cell_height(c) != 1) continue;
+
+        bool const odd_row = (pl.cell_rows_[c] % 2) != 0;
+        pl.plt_.orientations_[c].y_ = (odd_row != first_row_orient);
+    }
+}
+
+} // namespace dp
+} // namespace coloquinte
+
+
--- a/coloquinte/src/legalizer.cxx
+++ b/coloquinte/src/legalizer.cxx
@ -0,0 +1,446 @@
+
+#include "coloquinte/legalizer.hxx"
+#include "coloquinte/optimization_subproblems.hxx"
+
+#include <algorithm>
+#include <cmath>
+#include <queue>
+
+namespace coloquinte{
+namespace dp{
+
+// Copies the detailed placement back into gpl, one coordinate at a time: a position or
+// orientation component is only overwritten when the matching attribute flag
+// (XMovable, YMovable, XFlippable, YFlippable) is set on the cell.
+void get_result(netlist const & circuit, detailed_placement const & dpl, placement_t & gpl){
+    for(index_t cell = 0; cell < circuit.cell_cnt(); ++cell){
+        auto const attrs = circuit.get_cell(cell).attributes;
+        auto const & legal_pos    = dpl.plt_.positions_[cell];
+        auto const & legal_orient = dpl.plt_.orientations_[cell];
+
+        if( (attrs & XMovable) != 0 ){
+            gpl.positions_[cell].x_ = legal_pos.x_;
+        }
+        if( (attrs & YMovable) != 0 ){
+            gpl.positions_[cell].y_ = legal_pos.y_;
+        }
+
+        if( (attrs & XFlippable) != 0 ){
+            gpl.orientations_[cell].x_ = legal_orient.x_;
+        }
+        if( (attrs & YFlippable) != 0 ){
+            gpl.orientations_[cell].y_ = legal_orient.y_;
+        }
+    }
+}
+
+// A movable cell waiting to be legalized
+struct cell_to_leg{
+    int_t x_pos, y_pos;     // Target position
+    index_t original_cell;  // Index of the cell, carried through to the results
+    int_t width;
+    index_t nbr_rows;       // Number of rows spanned by the cell
+
+    cell_to_leg(int_t target_x, int_t target_y, index_t cell_index, int_t cell_width, index_t row_span)
+    : x_pos(target_x), y_pos(target_y),
+    original_cell(cell_index),
+    width(cell_width),
+    nbr_rows(row_span)
+    {}
+
+    // Orders the cells by increasing target x
+    bool operator<(cell_to_leg const other) const{ return other.x_pos > x_pos; }
+
+    // The same cell seen as a task for the single-row problem: (width, target x, index)
+    legalizable_task<int_t> task() const{
+        return legalizable_task<int_t>(width, x_pos, original_cell);
+    }
+};
+
+// The horizontal extent [min_x, max_x] occupied in one row by the fixed cell cell_ind
+struct fixed_cell_interval{
+    int_t min_x, max_x;
+    index_t cell_ind;
+
+    fixed_cell_interval(int_t lo, int_t hi, index_t cell) : min_x(lo), max_x(hi), cell_ind(cell){}
+
+    // Reversed comparison: sorting puts the largest min_x first, so that back() is the leftmost interval
+    bool operator<(fixed_cell_interval const other) const{ return other.min_x < min_x; }
+};
+
+// Result of the legalization for one cell: its x position, its (first) row and its index
+struct cell_leg_properties{
+    int_t   x_pos;
+    index_t row_pos;
+    index_t ind;
+
+    // Value-initialize everything so that a default-constructed object never holds indeterminate values
+    cell_leg_properties() : x_pos(0), row_pos(0), ind(0) {}
+    // The row is an index: take it as index_t like the member it initializes (it used to go through a signed int_t)
+    cell_leg_properties(int_t x, index_t r, index_t i) : x_pos(x), row_pos(r), ind(i){}
+};
+
+// Greedy best-fit legalization: the cells are handled by increasing target x and each one is
+// put at the cheapest free position found, without ever moving a cell that is already placed.
+//
+//   obstacles     for each row, the fixed intervals, sorted so that back() is the leftmost one
+//                 (taken by value: the intervals are consumed during the legalization)
+//   cells         the cells to legalize (taken by value: sorted here by target x)
+//   rows          output; for each row, the indices of the placed and fixed cells in the order they are pushed
+//   x_min, x_max  horizontal range available in every row
+//   y_orig        y coordinate of row 0; row r is at y_orig + r * row_height
+//   nbr_rows      number of rows
+//
+// Returns one cell_leg_properties (x position, first row, cell index) per legalized cell.
+// Throws std::runtime_error if a cell spans more rows than available or if no free position is found.
+std::vector<cell_leg_properties> simple_legalize(
+        std::vector<std::vector<fixed_cell_interval> > obstacles, std::vector<cell_to_leg> cells,
+        std::vector<std::vector<index_t> > & rows,
+        int_t x_min, int_t x_max, int_t y_orig,
+        int_t row_height, index_t nbr_rows
+    ){
+
+    std::vector<int_t> first_available_position(nbr_rows, x_min);
+    rows.resize(nbr_rows);
+
+    // Sort the cells by x position
+    std::sort(cells.begin(), cells.end());
+
+    std::vector<cell_leg_properties> ret;
+
+    for(cell_to_leg C : cells){
+        // Dumb, quick and dirty best-fit legalization
+        bool found_location = false;
+
+        // Properties of the current best solution
+        int_t best_x=0;
+        int_t best_cost=0;
+        index_t best_row=0;
+
+        // Helper function: evaluates putting the cell with its first row at r and keeps it if it is the best so far
+        // NOTE(review): the x displacement is not weighted by the width here while additional_cost is; confirm this is intended
+        auto check_row_cost = [&](index_t r, cell_to_leg const cell, int_t additional_cost){
+            // Find where to put the cell in these rows
+            // Simple method: get a range where we can put the cell
+
+            assert(r + cell.nbr_rows <= nbr_rows);
+            assert(additional_cost >= 0);
+
+            // First position where we can put it
+            int_t cur_pos = *std::max_element(first_available_position.begin() + r, first_available_position.begin() + r + cell.nbr_rows);
+            int_t max_lim = x_max - cell.width;
+            int_t interval_lim;
+            do{
+                interval_lim = max_lim;
+                // For each row, test if obstacles prevent us from putting a cell here
+                // Until we find a correct position or are beyond the maximum position
+                for(index_t i = 0; i<cell.nbr_rows; ++i){
+                    // Find the first obstacle which is after this position
+                    // TODO: use lower/upper bound
+                    auto it=obstacles[r+i].rbegin();
+                    for(; it != obstacles[r+i].rend() && it->max_x <= cur_pos; ++it){
+                    }
+                    if(it != obstacles[r+i].rend()){ // There is an obstacle on the right
+                        assert(it->min_x < it->max_x);
+                        int_t cur_lim = it->min_x - cell.width; // Where the obstacle constrains us
+                        interval_lim = std::min(cur_lim, interval_lim); // Constraint
+                        if(cur_lim < cur_pos){ // If this particular obstacle constrained us so that it is not possible to make it here, we increment the position
+                            cur_pos = std::max(it->max_x, cur_pos);
+                        }
+                    }
+                }
+                // Do it again until we find a solution
+                // TODO: continue until we can't find a better solution (currently sticks before the first obstacle if there is enough whitespace)
+            }while(interval_lim < cur_pos and interval_lim < max_lim and cur_pos < max_lim); // Not admissible and we encountered an obstacle and there is still hope
+
+            if(interval_lim >= cur_pos){ // An admissible solution is found (and if cell.x_pos is between cur_pos and interval_lim it is optimal)
+                int_t row_best_x = std::min(interval_lim, std::max(cur_pos, cell.x_pos));
+                int_t row_cost_x = std::abs(row_best_x - cell.x_pos);
+                if(not found_location or row_cost_x + additional_cost < best_cost){
+                    found_location = true;
+                    best_cost = row_cost_x + additional_cost;
+                    best_x = row_best_x;
+                    best_row = r;
+                }
+            }
+        };
+
+        // The row where we would prefer the cell to go
+        if(C.nbr_rows > nbr_rows) throw std::runtime_error("Impossible to legalize a cell spanning more rows than are available\n");
+        index_t central_row = std::min( (index_t) std::max( (C.y_pos - y_orig) / row_height, 0), nbr_rows-C.nbr_rows);
+
+        // Try every possible row from the best one, until we can't improve the cost
+        for(index_t row_dist = 0;
+            (central_row + row_dist < nbr_rows or central_row >= row_dist)
+            and (not found_location or (int_t) row_dist * row_height * C.width < (int_t) row_height + best_cost);
+            ++row_dist
+        ){
+            // Rows with a larger index. The last admissible first row is nbr_rows - C.nbr_rows, included:
+            // the previous strict comparison never tried it unless it was the central row itself.
+            // row_dist == 0 is left to the branch below so that the central row is evaluated only once.
+            if(row_dist > 0 and central_row + row_dist + C.nbr_rows <= nbr_rows){
+                int_t add_cost = C.width * std::abs(static_cast<int_t>(central_row + row_dist) * static_cast<int_t>(row_height) + y_orig - C.y_pos);
+                check_row_cost(central_row + row_dist, C, add_cost);
+            }
+            // Rows with a smaller index (and the central row itself)
+            if(central_row >= row_dist){
+                int_t add_cost = C.width * std::abs(static_cast<int_t>(central_row - row_dist) * static_cast<int_t>(row_height) + y_orig - C.y_pos);
+                check_row_cost(central_row - row_dist, C, add_cost);
+            }
+        }
+
+        if(not found_location){ // We didn't find any whitespace to put the cell in
+            throw std::runtime_error("Didn't manage to pack a cell due to dumb algorithm\n");
+        }
+        else{
+            assert(best_x + C.width <= x_max and best_x >= x_min);
+            // Update the occupied rows
+            for(index_t r = best_row; r < best_row + C.nbr_rows; ++r){
+                // Include the obstacles
+                while(not obstacles[r].empty()
+                    and obstacles[r].back().max_x <= best_x){
+                    rows[r].push_back(obstacles[r].back().cell_ind);
+                    obstacles[r].pop_back();
+                }
+                assert(obstacles[r].empty() or obstacles[r].back().min_x >= best_x + C.width);
+
+                rows[r].push_back(C.original_cell);
+                first_available_position[r] = best_x + C.width;
+            }
+
+            ret.push_back(cell_leg_properties(best_x, best_row, C.original_cell));
+        }
+    }
+
+    // Finally, push the remaining fixed cells
+    for(index_t r=0; r<nbr_rows; ++r){
+        while(not obstacles[r].empty()){
+            rows[r].push_back(obstacles[r].back().cell_ind);
+            obstacles[r].pop_back();
+        }
+    }
+
+    return ret;
+}
+
+// A better legalization function which is able to push already legalized cells
+//
+//   obstacles     for each row, the fixed intervals, sorted so that back() is the leftmost one
+//                 (taken by value: the intervals are consumed during the legalization)
+//   cells         the cells to legalize (taken by value: sorted here by target x)
+//   rows          output; for each row, the indices of the placed and fixed cells in the order they are pushed
+//   x_min, x_max  horizontal range available in every row
+//   y_orig        y coordinate of row 0; row r is at y_orig + r * row_height
+//   nbr_rows      number of rows
+//
+// Returns one cell_leg_properties (x position, row, cell index) per legalized cell.
+// Throws std::runtime_error if a cell spans more than one row (not supported here) or more rows
+// than available, or if no position is found for a cell.
+std::vector<cell_leg_properties> good_legalize(
+        std::vector<std::vector<fixed_cell_interval> > obstacles, std::vector<cell_to_leg> cells,
+        std::vector<std::vector<index_t> > & rows,
+        int_t x_min, int_t x_max, int_t y_orig,
+        int_t row_height, index_t nbr_rows
+    ){
+
+    // Two possibilities:
+    //    * Single OSRP (group of movable cells) at the current end of the row of standard cells
+    //    * Multiple OSRPs, between each pair of obstacles
+    //          -> allows pushing cells past obstacles
+    //          -> tricky with multiple standard cell heights
+    // Therefore I chose single OSRP, which gets cleared and pushed to the final state whenever
+    //    * we encounter a multiple-rows cell
+    //    * a new standard cell gets past an obstacle
+
+    // The current group of standard cells on the right of the row
+    std::vector<OSRP_leg<int_t> > single_row_problems(nbr_rows);
+    for(index_t r=0; r<nbr_rows; ++r){
+        single_row_problems[r] = OSRP_leg<int_t>(x_min, obstacles[r].empty() ? x_max : obstacles[r].back().min_x);
+    }
+    rows.resize(nbr_rows);
+
+    // Sort the cells by x position
+    std::sort(cells.begin(), cells.end());
+
+    std::vector<cell_leg_properties> ret;
+
+    for(cell_to_leg C : cells){
+        // Dumb, quick and dirty best-fit legalization
+        bool found_location = false;
+
+        // Properties of the current best solution
+        int_t best_cost=0;
+        index_t best_row=0;
+        index_t obstacles_passed = 0;
+
+        // Helper function: evaluates putting the cell in row r and keeps it if it is the best so far
+        auto check_row_cost = [&](index_t r, cell_to_leg const cell, int_t additional_cost){
+            // Find where to put the cell in these rows
+            // Check if we can put it in the current ranges and at what cost; if not or if the optimal position is beyond an obstacle, try after this obstacle too
+
+            assert(cell.nbr_rows > 0);
+            assert(r + cell.nbr_rows <= nbr_rows);
+            assert(additional_cost >= 0);
+
+            // Where can we put a standard cell if we allow to move the cells?
+            if(cell.nbr_rows == 1){
+                int_t cur_cost = 0;
+
+                // Can we simply add it to the single row problem?
+                bool found_here = single_row_problems[r].remaining_space() >= cell.width;
+                // A count of obstacles: same unsigned type as obstacles_passed and obstacles_to_throw
+                index_t loc_obstacles_passed = 0;
+                if(found_here){
+                    // Check the cost of pushing it here with possible displacement
+                    cur_cost = single_row_problems[r].get_cost(cell.task()); // Don't update the row
+                }
+
+                // Other positions where we can put it, without moving other cells this time
+                if(not found_here or cur_cost > 0){
+                    index_t obstacles_to_throw = 0;
+                    auto it = obstacles[r].rbegin();
+                    while(it != obstacles[r].rend()){
+                        ++ obstacles_to_throw;
+                        auto prev_it = it++;
+                        // The free region lies between the obstacle just passed and the next one (or the end of the row)
+                        int_t region_end = it != obstacles[r].rend() ? it->min_x : x_max;
+                        if(region_end >= prev_it->max_x + cell.width){
+                            int_t loc_x = std::min(region_end - cell.width, std::max(prev_it->max_x, cell.x_pos));
+                            int_t loc_cost = cell.width * std::abs(cell.x_pos - loc_x);
+                            if(not found_here or cur_cost > loc_cost){
+                                found_here = true;
+                                cur_cost = loc_cost;
+                                loc_obstacles_passed = obstacles_to_throw;
+                            }
+                        }
+                    }
+                }
+                if(found_here and (not found_location or cur_cost + additional_cost < best_cost)){
+                    found_location = true;
+                    //std::cout << "Found with displacement cost " << cur_cost << " and total cost " << cur_cost + additional_cost << std::endl;
+                    best_cost = cur_cost + additional_cost;
+                    best_row = r;
+                    obstacles_passed = loc_obstacles_passed;
+                    assert(loc_obstacles_passed == 0 or not obstacles[r].empty());
+                }
+            }
+            else{
+                // Cells spanning several rows would need fixed locations, which is not implemented
+                throw std::runtime_error("Legalization of cells spanning several rows (macros) is not supported\n");
+            }
+        };
+
+        // The row where we would prefer the cell to go
+        if(C.nbr_rows > nbr_rows) throw std::runtime_error("Impossible to legalize a cell spanning more rows than are available\n");
+        index_t central_row = std::min( (index_t) std::max( (C.y_pos - y_orig) / row_height, 0), nbr_rows-C.nbr_rows);
+
+        // Try every possible row from the best one, until we can't improve the cost
+        for(index_t row_dist = 0;
+            (central_row + row_dist < nbr_rows or central_row >= row_dist)
+            and (not found_location or (int_t) row_dist * row_height * C.width < (int_t) row_height + best_cost);
+            ++row_dist
+        ){
+            // Rows with a larger index. The last admissible first row is nbr_rows - C.nbr_rows, included:
+            // the previous strict comparison never tried it unless it was the central row itself.
+            // row_dist == 0 is left to the branch below so that the central row is evaluated only once.
+            if(row_dist > 0 and central_row + row_dist + C.nbr_rows <= nbr_rows){
+                int_t add_cost = C.width * std::abs(static_cast<int_t>(central_row + row_dist) * static_cast<int_t>(row_height) + y_orig - C.y_pos);
+                check_row_cost(central_row + row_dist, C, add_cost);
+            }
+            // Rows with a smaller index (and the central row itself)
+            if(central_row >= row_dist){
+                int_t add_cost = C.width * std::abs(static_cast<int_t>(central_row - row_dist) * static_cast<int_t>(row_height) + y_orig - C.y_pos);
+                check_row_cost(central_row - row_dist, C, add_cost);
+            }
+        }
+
+        if(not found_location){ // We didn't find any whitespace to put the cell in
+            throw std::runtime_error("Didn't manage to pack a cell: leave more whitespace and avoid macros near the right side\n");
+        }
+        else{
+            //std::cout << "Cell " << C.original_cell << " of width " << C.width << " targetting row " << central_row << " and position " << C.x_pos << " put at row " << best_row << " with displacement " << best_cost / C.width << " with " << obstacles_passed << " obstacles passed" << std::endl;
+            // If the cell spans multiple rows, it becomes fixed
+            // In this case or if the cell goes after an obstacle, push everything before the cell to the fixed state
+
+            if(C.nbr_rows == 1){
+                if(obstacles_passed == 0){ // Ok; just update the old single row problem
+                    single_row_problems[best_row].push(C.task()); // Push it to the row
+                }
+                else{
+                    assert(obstacles_passed > 0);
+                    // Empty the single row problem
+                    for(auto p : single_row_problems[best_row].get_placement()){
+                        rows[best_row].push_back(p.first);
+                        ret.push_back(cell_leg_properties(p.second, best_row, p.first));
+                    }
+                    // Find where to put it
+                    int_t region_begin = x_min;
+                    for(index_t i=0; i<obstacles_passed; ++i){
+                        assert(not obstacles[best_row].empty());
+                        region_begin = obstacles[best_row].back().max_x;
+                        rows[best_row].push_back(obstacles[best_row].back().cell_ind);
+                        obstacles[best_row].pop_back();
+                    }
+                    int_t region_end = obstacles[best_row].empty() ? x_max : obstacles[best_row].back().min_x;
+                    single_row_problems[best_row] = OSRP_leg<int_t>(region_begin, region_end);
+                    assert(region_end - region_begin >= C.width);
+                    single_row_problems[best_row].push(C.task()); // Push this only cell to the single row problem
+                }
+            }
+            else{
+                throw std::runtime_error("Legalization of cells spanning several rows (macros) is not supported\n");
+            }
+        }
+    }
+
+    for(index_t r=0; r<nbr_rows; ++r){
+        // Finally, push the remaining standard cells in the row
+        for(auto p : single_row_problems[r].get_placement()){
+            rows[r].push_back(p.first);
+            ret.push_back(cell_leg_properties(p.second, r, p.first));
+        }
+        // And the fixed cells
+        while(not obstacles[r].empty()){
+            rows[r].push_back(obstacles[r].back().cell_ind);
+            obstacles[r].pop_back();
+        }
+    }
+
+    // rows was already resized to nbr_rows before the main loop
+    return ret;
+}
+
+
+// Legalizes the placement pl on the rows of height row_height that fit in surface.
+//
+// Cells that are both XMovable and YMovable are legalized with good_legalize; every other cell is
+// considered fixed and becomes an obstacle in each row it overlaps.
+// Returns the resulting detailed_placement.
+// Throws std::runtime_error if row_height is not positive, if two fixed cells overlap in a row, or
+// if good_legalize fails.
+detailed_placement legalize(netlist const & circuit, placement_t const & pl, box<int_t> surface, int_t row_height){
+    if(row_height <= 0) throw std::runtime_error("The rows' height should be positive\n");
+
+    index_t nbr_rows = (surface.y_max_ - surface.y_min_) / row_height;
+    // The position of the ith row is surface.y_min_ + i * row_height
+
+    std::vector<std::vector<fixed_cell_interval> > row_occupation(nbr_rows);
+    std::vector<cell_to_leg> cells;
+
+    placement_t new_placement = pl;
+    std::vector<index_t> placement_rows(circuit.cell_cnt());
+    std::vector<index_t> cell_heights(circuit.cell_cnt());
+
+    for(index_t i=0; i<circuit.cell_cnt(); ++i){
+        auto cur = circuit.get_cell(i);
+        // Assumes fixed if not both XMovable and YMovable
+        if( (cur.attributes & XMovable) != 0 && (cur.attributes & YMovable) != 0){
+            // Just truncate the position we target
+            point<int_t> target_pos = pl.positions_[i];
+            index_t cur_cell_rows = (cur.size.y_ + row_height -1) / row_height;
+            cells.push_back(cell_to_leg(target_pos.x_, target_pos.y_, i, cur.size.x_, cur_cell_rows));
+            cell_heights[i] = cur_cell_rows;
+        }
+        else{
+            // In each row, we put the index of the fixed cell and the range that is already occupied
+            int_t low_x_pos  = pl.positions_[i].x_,
+                  hgh_x_pos  = pl.positions_[i].x_ + cur.size.x_,
+                  low_y_pos  = pl.positions_[i].y_,
+                  hgh_y_pos  = pl.positions_[i].y_ + cur.size.y_;
+
+            new_placement.positions_[i] = point<int_t>(low_x_pos, low_y_pos);
+            if(hgh_y_pos <= surface.y_min_ or low_y_pos >= surface.y_max_ or hgh_x_pos <= surface.x_min_ or low_x_pos >= surface.x_max_){
+                // Completely outside of the surface: not an obstacle
+                placement_rows[i] = null_ind;
+                cell_heights[i] = 0;
+            }
+            else{
+                assert(low_x_pos < hgh_x_pos and low_y_pos < hgh_y_pos);
+
+                // Clip the cell to the surface
+                int_t rnd_hgh_x_pos = std::min(surface.x_max_, hgh_x_pos);
+                int_t rnd_hgh_y_pos = std::min(surface.y_max_, hgh_y_pos);
+                int_t rnd_low_x_pos = std::max(surface.x_min_, low_x_pos);
+                int_t rnd_low_y_pos = std::max(surface.y_min_, low_y_pos);
+
+                index_t first_row = (rnd_low_y_pos - surface.y_min_) / row_height;
+                index_t last_row = (index_t) (rnd_hgh_y_pos - surface.y_min_ + row_height - 1) / row_height; // Exclusive: if the cell spans the next row, i.e. pos % row_height >= 0, include it too
+
+                // The height of the surface is not necessarily a multiple of row_height: a cell reaching the
+                // leftover strip above the last row would get last_row == nbr_rows + 1 and index past the end
+                // of row_occupation, so only keep the rows that actually exist
+                last_row = std::min(last_row, nbr_rows);
+
+                if(first_row >= last_row){
+                    // The cell only touches the leftover strip: it overlaps no row, handle it like an outside cell
+                    placement_rows[i] = null_ind;
+                    cell_heights[i] = 0;
+                }
+                else{
+                    placement_rows[i] = first_row;
+                    cell_heights[i] = last_row - first_row;
+                    for(index_t r=first_row; r<last_row; ++r){
+                        row_occupation[r].push_back(fixed_cell_interval(rnd_low_x_pos, rnd_hgh_x_pos, i));
+                    }
+                }
+            }
+        }
+    }
+
+    for(std::vector<fixed_cell_interval> & L : row_occupation){
+        std::sort(L.begin(), L.end()); // Sorts from last to first, so that we may use pop_back()
+        // Doesn't collapse them yet, which may make for bigger complexities
+        for(index_t i=0; i+1<L.size(); ++i){
+            // L[i+1] is the interval just on the left of L[i]
+            if(L[i].min_x < L[i+1].max_x)
+                throw std::runtime_error("Sorry, I don't handle overlapping fixed cells yet\n");
+        }
+    }
+
+    std::vector<std::vector<index_t> > cells_by_rows;
+
+    auto final_cells = good_legalize(row_occupation, cells, cells_by_rows,
+        surface.x_min_, surface.x_max_, surface.y_min_,
+        row_height, nbr_rows
+    );
+
+    // Write the legalized positions back: x from the legalizer, y from the row
+    for(cell_leg_properties C : final_cells){
+        new_placement.positions_[C.ind] = point<int_t>(C.x_pos, static_cast<int_t>(C.row_pos) * row_height + surface.y_min_);
+        placement_rows[C.ind] = C.row_pos;
+    }
+
+    return detailed_placement(
+        new_placement,
+        placement_rows,
+        cell_heights,
+        cells_by_rows,
+        surface.x_min_, surface.x_max_,
+        surface.y_min_,
+        nbr_rows, row_height
+    );
+}
+
+} // namespace dp
+} // namespace coloquinte
+
--- a/coloquinte/src/lookup_table.cxx
+++ b/coloquinte/src/lookup_table.cxx
--- a/coloquinte/src/optimization_subproblems.cxx
+++ b/coloquinte/src/optimization_subproblems.cxx
@ -0,0 +1,513 @@
+
+#include "coloquinte/optimization_subproblems.hxx"
+
+#include <stdexcept>
+
+namespace coloquinte{
+
+// One-dimensional transportation between sources and sinks.
+//
+// For both sources and sinks, .first is used as a position and .second as a quantity (demand of a
+// source, capacity of a sink). The total capacity must be at least the total demand (asserted).
+// NOTE(review): the region indices only ever increase while iterating over the sources, so both
+// vectors are presumably expected to be sorted by position - confirm against the callers.
+//
+// Returns one value per source: the constraining position computed for it plus the cumulated demand
+// of the preceding sources, i.e. presumably where the source begins on the cumulated capacity axis.
+std::vector<capacity_t>  transport_1D(std::vector<t1D_elt> sources, std::vector<t1D_elt> sinks){
+    /* Description of the algorithm:
+     *
+     *    For each cell, put it in its optimal region or the last region where a cell is if there is no space in it
+     *    Push all changes in the derivative of the cost function to a priority queue; those changes occur
+     *          when evicting the preceding cell from a region (most such changes are 0 and not considered, hence the complexity)
+     *          when moving to a non-full region
+     *    While the new cell overlaps with a new region, get the new slope (derivative) at this point
+     *    and push all preceding cell until this region is freed or the slope becomes 0 (in which case the new region is now occupied)
+     */
+
+    // A position where the slope of the cost function changes by slope_diff
+    struct bound{
+        capacity_t pos;
+        int_t slope_diff;
+        // Ordered by position: the top of the priority queue is the bound with the largest pos
+        bool operator<(bound const o) const{ return pos < o.pos; }
+    };
+
+    std::priority_queue<bound> bounds;
+    std::vector<capacity_t> constraining_pos;
+    // Prefix sums: prev_cap[j] is the capacity of the sinks before j, prev_dem[i] the demand of the sources before i
+    std::vector<capacity_t> prev_cap(1, 0), prev_dem(1, 0);
+    for(auto const s : sinks){
+        prev_cap.push_back(s.second + prev_cap.back());
+    }
+    for(auto const s : sources){
+        prev_dem.push_back(s.second + prev_dem.back());
+    }
+    // The sinks have enough capacity to hold the whole demand
+    assert(prev_cap.back() >= prev_dem.back());
+
+    const capacity_t min_abs_pos = 0, max_abs_pos = prev_cap.back() - prev_dem.back();
+    assert(min_abs_pos <= max_abs_pos);
+
+    // Registers a slope change s at position p; bounds at or below min_abs_pos are dropped
+    auto push_bound = [&](capacity_t p, int_t s){
+        assert(s >= 0);
+        if(p > min_abs_pos){
+            bound B;
+            B.pos = p;
+            B.slope_diff = s;
+            bounds.push(B);
+        }
+    }; 
+
+    // Distance to the right - distance to the left
+    auto get_slope = [&](index_t src, index_t boundary){
+        assert(boundary+1 < sinks.size());
+        assert(src < sources.size());
+        return std::abs(sources[src].first - sinks[boundary+1].first) - std::abs(sources[src].first - sinks[boundary].first);
+    };
+
+    capacity_t cur_abs_pos = min_abs_pos;
+    // opt_r: sink closest to the current source; next_r: first sink strictly after it;
+    // first_free_r: first sink that may still receive something
+    index_t opt_r=0, next_r=0, first_free_r=0;
+
+    for(index_t i=0; i<sources.size(); ++i){
+        // Update the optimal region
+        while(opt_r+1 < sinks.size() and (sinks[opt_r].first + sinks[opt_r+1].first)/2 < sources[i].first){
+            ++opt_r;
+        }
+        // Update the next region
+        index_t prev_next_r = next_r;
+        while(next_r < sinks.size() and sinks[next_r].first <= sources[i].first){
+            ++next_r;
+        }
+
+        index_t dest_reg = std::max(first_free_r, opt_r);
+        assert(dest_reg < sinks.size());
+
+        if(i>0){
+            // Push bounds due to changing the source crossing the boundary j/j+1
+            // Linear amortized complexity accross all sources (next_r grows)
+            // get_slope(i-1, j) - get_slope(i, j) == 0 if j >= next_r
+            // get_slope(i-1, j) - get_slope(i, j) == 0 if j < prev_next_r-1
+
+            for(index_t j=std::max(prev_next_r,1u)-1; j<std::min(first_free_r, next_r+1); ++j){
+                assert(get_slope(i,j) <= get_slope(i-1,j));
+                push_bound(prev_cap[j+1] - prev_dem[i], get_slope(i-1, j) - get_slope(i,j));
+            }
+        }
+        // Add the bounds due to crossing the boundaries alone
+        for(index_t j=first_free_r; j<opt_r; ++j){
+            assert(get_slope(i,j) <= 0);
+            push_bound(prev_cap[j+1] - prev_dem[i], -get_slope(i, j));
+        }
+
+        first_free_r = std::max(first_free_r, opt_r);
+        capacity_t this_abs_pos = std::max(cur_abs_pos, prev_cap[first_free_r] - prev_dem[i]); // Just after the previous cell or at the beginning of the destination region
+
+        while(first_free_r+1 < sinks.size() and this_abs_pos > std::max(prev_cap[first_free_r+1] - prev_dem[i+1], min_abs_pos)){ // Absolute position that wouldn't make the cell fit in the region, and we are not in the last region yet
+            capacity_t end_pos = std::max(prev_cap[first_free_r+1] - prev_dem[i+1], min_abs_pos);
+
+            int_t add_slope = get_slope(i, first_free_r);
+            int_t slope = add_slope;
+
+            // Consume the bounds beyond end_pos, from the largest position down, while the slope stays non-negative
+            while(not bounds.empty() and slope >= 0 and bounds.top().pos > end_pos){
+                this_abs_pos = bounds.top().pos;
+                slope -= bounds.top().slope_diff;
+                bounds.pop();
+            }
+            if(slope >= 0){ // We still push: the cell completely escapes the region
+                this_abs_pos = end_pos;
+                push_bound(end_pos, add_slope-slope);
+            }
+            else{ // Ok, absorbed the whole slope: push what remains and we still occupy the next region
+                push_bound(this_abs_pos, -slope);
+                ++first_free_r;
+            }
+        }
+        cur_abs_pos = this_abs_pos;
+        constraining_pos.push_back(this_abs_pos);
+    }
+
+    assert(constraining_pos.size() == sources.size());
+    if(not constraining_pos.empty()){
+        // Calculate the final constraining_pos
+        constraining_pos.back() = std::min(max_abs_pos, constraining_pos.back());
+    }
+
+    // Running minimum from the last source to the first: no position may exceed a later one
+    std::partial_sum(constraining_pos.rbegin(), constraining_pos.rend(), constraining_pos.rbegin(), [](capacity_t a, capacity_t b)->capacity_t{ return std::min(a, b); });
+
+    // Add back the demand of the preceding sources to each position
+    for(index_t i=0; i<constraining_pos.size(); ++i){
+        constraining_pos[i] += prev_dem[i];
+    }
+
+    return constraining_pos;
+}
+
+namespace{ // Anonymous namespace to hide the transportation structures
+
+// State of a transportation problem between sources and regions that is built incrementally:
+// sources are added one at a time with add_source and the resulting allocations are read back
+// with get_allocations.
+class current_allocation{
+    // Marks the absence of a parent region or of a source
+    static const index_t null_ind = std::numeric_limits<index_t>::max();
+
+    // Internal data structures
+
+    // Priority queue element to determine the source to be used between regions
+    struct movable_source{
+        index_t source;
+        float_t cost;
+        // Reversed on cost: the top of a priority queue is the source with the lowest cost
+        bool operator<(movable_source const o) const{
+               return cost > o.cost // Sorted by cost
+            || (cost == o.cost && source < o.source); // And by index to limit the number of fractional elements between two regions
+        }
+        movable_source(index_t s, float_t c) : source(s), cost(c) {}
+    };
+
+    // Member data
+
+    // The current state
+    std::vector<std::vector<capacity_t>  > sr_allocations; // For each region, for each source, the capacity allocated by the region
+    std::vector<std::vector<float_t> >     sr_costs;       // The costs from a region to a source
+    std::vector<capacity_t>                s_demands;      // The demands of the sources
+    std::vector<capacity_t>                r_capacities;   // The remaining capacities of the regions
+
+    // Shortest path data
+    std::vector<float_t>                   r_costs;        // The costs of allocating to a region
+    std::vector<index_t>                   r_parents;      // The parents of the regions i.e. the regions where we push sources first (or null_ind)
+    std::vector<index_t>                   r_sources;      // The source involved in these edges
+    std::vector<capacity_t>                arc_capacities; // The capacities of the edges to the parents, or of the region if no parent
+
+    // Best edges data
+    std::vector<std::vector<std::priority_queue<movable_source> > > best_interregions_costs; // What is the best source to move to go from region k1 to region k2?
+    index_t dijkstra_cnt; // Value reported by get_iterations_cnt()
+
+
+    // Helper functions
+
+    // Number of regions
+    index_t region_cnt() const{
+        assert(sr_costs.size() == sr_allocations.size());
+        return sr_costs.size();
+    }
+
+    // Update the edge between two regions
+    void update_edge(index_t r1, index_t r2);
+    // Add a source to all heaps of a region; returns if we need to update a path
+    bool add_source_to_heaps(index_t r, index_t source);
+    // Initialize the heaps of a region
+    void create_heaps(index_t reg);
+
+    // Run the shortest path algorithm to update the cost of each region
+    void dijkstra_update();
+
+    // Update the edge and returns if we need to rerun Dijkstra
+    bool push_edge(index_t reg, capacity_t flow);
+    // Updates a full path when pushing an element; returns if we need to rerun Dijkstra
+    bool push_path(index_t pushed_reg, capacity_t demanded, capacity_t & flow);
+
+    public:
+    // Add a new source to the transportation problem; should be done in decreasing order of demand to keep low complexity
+    void add_source(index_t elt_ind);
+
+    // caps:    the capacity of each region (at least one region is required)
+    // demands: the demand of each source
+    // costs:   for each region, the cost to each source (as many entries as regions)
+    // Every region starts with a cost of 0, no parent, no source and its whole capacity available,
+    // and the shortest paths are computed once.
+    current_allocation(std::vector<capacity_t> caps, std::vector<capacity_t> demands, std::vector<std::vector<float_t> > costs)
+        :
+        sr_allocations(caps.size()),
+        sr_costs(costs),
+        s_demands(demands),
+        r_capacities(caps),
+        r_costs(caps.size(), 0.0),
+        r_parents(caps.size(), null_ind),
+        r_sources(caps.size(), null_ind),
+        arc_capacities(caps),
+        best_interregions_costs(caps.size(), std::vector<std::priority_queue<movable_source> >(caps.size())),
+        dijkstra_cnt(0)
+        {
+            assert(caps.size() > 0);
+            assert(costs.size() == caps.size());
+            dijkstra_update();
+        }
+
+    // Copy of the current allocations: for each region, for each source, the allocated capacity
+    std::vector<std::vector<capacity_t> > get_allocations() const{ return sr_allocations; }
+    index_t get_iterations_cnt() const { return dijkstra_cnt; }
+};
+
+// Relaxes the edge from region r1 to region r2: if moving the cheapest source still allocated in r1
+// to r2 gives a lower cost than the current cost of r1, r2 becomes the parent of r1.
+void current_allocation::update_edge(index_t r1, index_t r2){
+    // Discard the heap entries of sources that are no longer allocated in r1
+    while(not best_interregions_costs[r1][r2].empty() and sr_allocations[r1][best_interregions_costs[r1][r2].top().source] == 0){
+        best_interregions_costs[r1][r2].pop();
+    }
+
+    if(not best_interregions_costs[r1][r2].empty()){
+        // There is an edge
+        movable_source cur = best_interregions_costs[r1][r2].top();
+        float_t new_cost = r_costs[r2] + cur.cost;
+        if(new_cost < r_costs[r1]){
+            // Store the cost of the whole path through r2, i.e. the value that was just compared:
+            // storing the edge cost alone dropped the cost of r2
+            r_costs[r1] = new_cost;
+            r_sources[r1] = cur.source;
+            r_parents[r1] = r2;
+            arc_capacities[r1] = sr_allocations[r1][cur.source];
+        }
+    }
+}
+
+// Adds the source to the heap from region r to every other region.
+// Returns true if the source ended up on top of at least one of these heaps, i.e. if the best
+// edge leaving r may have changed.
+bool current_allocation::add_source_to_heaps(index_t r, index_t source){
+    bool need_rerun = false;
+    for(index_t i=0; i<region_cnt(); ++i){
+        if(i == r) continue;
+        auto & heap = best_interregions_costs[r][i];
+        heap.push(
+            movable_source(source,
+                sr_costs[i][source] - sr_costs[r][source]
+            )
+        );
+        // Discard the entries of sources that are no longer allocated in r. Like update_edge, stop when
+        // the heap is empty: calling top() on an empty priority queue is undefined behaviour
+        while(not heap.empty() and sr_allocations[r][heap.top().source] == 0){
+            heap.pop();
+        }
+        need_rerun = (not heap.empty() and heap.top().source == source) or need_rerun;
+    }
+    return need_rerun;
+}
+
+// Rebuilds from scratch the heaps going from region reg to every region, using only the
+// sources that currently have a positive allocation in reg.
+void current_allocation::create_heaps(index_t reg){
+    index_t const nbr_regions = region_cnt();
+
+    // For each destination region, the candidate moves out of reg
+    std::vector<std::vector<movable_source> > candidates(nbr_regions);
+    for(index_t src = 0; src < sr_allocations[reg].size(); ++src){
+        if(not (sr_allocations[reg][src] > 0)) continue; // Source not present in this region
+
+        for(index_t dest = 0; dest < nbr_regions; ++dest){
+            if(dest != reg){
+                float_t const move_cost = sr_costs[dest][src] - sr_costs[reg][src];
+                candidates[dest].push_back(movable_source(src, move_cost));
+            }
+        }
+    }
+
+    // Heapify each candidate list; the heap from reg to itself is reset to empty
+    for(index_t dest = 0; dest < nbr_regions; ++dest){
+        std::vector<movable_source> const & moves = candidates[dest];
+        best_interregions_costs[reg][dest] = std::priority_queue<movable_source>(moves.begin(), moves.end());
+    }
+}
+
+// Moves `flow` units of the source recorded for reg from reg to its parent region.
+// Returns true when the path may have been modified so that Dijkstra must be rerun.
+bool current_allocation::push_edge(index_t reg, capacity_t flow){
+    index_t const moved = r_sources[reg];
+    index_t const dest  = r_parents[reg];
+
+    // Remember whether the destination already held part of this source before the move
+    bool const was_in_dest = sr_allocations[dest][moved] > 0;
+
+    // Transfer the flow from one region to the other
+    sr_allocations[reg ][moved] -= flow;
+    sr_allocations[dest][moved] += flow;
+
+    assert(sr_allocations[reg][moved] >= 0); // The pushed source was indeed present in the region
+    assert(r_capacities[reg] == 0);          // The region is full, hence the push
+    assert(flow <= arc_capacities[reg]);     // No more flow than the arc could carry
+
+    // Remaining capacity of the arc, valid as long as Dijkstra is not rerun
+    arc_capacities[reg] = sr_allocations[reg][moved];
+
+    // The source may have disappeared from reg: the edge is gone
+    if(arc_capacities[reg] == 0){
+        return true;
+    }
+    // Same edge, same cost, non-zero capacity and no new source in a full region: the path still holds
+    if(was_in_dest or r_capacities[dest] != 0){
+        return false;
+    }
+    // A new source entered a full region: rerun only if it changed the top of a heap
+    return add_source_to_heaps(dest, moved);
+}
+
+// Recomputes r_costs, r_parents, r_sources and arc_capacities for every region.
+// Regions with remaining capacity are the roots (cost 0, no parent); the other
+// regions are then visited by increasing cost, and each visit relaxes the edges
+// from the still-unvisited regions towards the visited one through update_edge.
+void current_allocation::dijkstra_update(){
+    // Simple case of the regions with remaining capacity
+    std::vector<int> visited(region_cnt(), 0);
+    index_t visited_cnt = 0;
+    for(index_t i=0; i<region_cnt(); ++i){
+        r_sources[i] = null_ind;
+        r_parents[i] = null_ind;
+        if(r_capacities[i] > 0){
+            r_costs[i] = 0.0;
+            arc_capacities[i] = r_capacities[i];
+
+            visited[i] = 1;
+            ++visited_cnt;
+        }
+        else{
+            // Full region: unreachable until an edge towards a visited region is found
+            r_costs[i] = std::numeric_limits<float_t>::infinity();
+            arc_capacities[i] = 0;
+        }
+    }
+    // if(visited_cnt <= 0) throw std::runtime_error("Capacity problem: no region has been marked as reachable\n");
+    if(visited_cnt == region_cnt()){ return; }
+    // Get the costs for every non-visited region
+    for(index_t i=0; i<region_cnt(); ++i) if(visited[i] == 0){ // For every region that is not visited yet
+        for(index_t j=0; j<region_cnt(); ++j) if(visited[j] == 1){ // For every already visited region
+            // Get the best interregion cost
+            update_edge(i,j);
+        }
+    }
+    while(visited_cnt < region_cnt()){
+        // Find the region with the lowest cost to visit; mark it visited
+        index_t best_reg = null_ind;
+        float_t best_cost = std::numeric_limits<float_t>::infinity();
+        for(index_t i=0; i<region_cnt(); ++i) if(visited[i] == 0){ // For every region that is not visited yet
+            if(r_costs[i] < best_cost){
+                best_cost = r_costs[i];
+                best_reg  = i;
+            }
+        }
+        if(best_reg == null_ind) break; // Some regions are unreachable, typically because they have zero capacity at the beginning
+        visited[best_reg] = 1;
+        ++visited_cnt;
+        // Update the cost for every unvisited region
+        for(index_t i=0; i<region_cnt(); ++i) if(visited[i] == 0){ // For every region that is not visited yet
+            update_edge(i, best_reg);
+        }
+    }
+}
+
+// Sends flow along the path that starts at pushed_reg and follows r_parents
+// up to a region without parent.
+//   pushed_reg: first region of the path
+//   demanded:   upper bound on the flow to send
+//   flow:       output, the flow actually sent (minimum of demanded and of the arc capacities on the path)
+// Returns true when costs, parents, sources and arc capacities must be
+// recomputed by dijkstra_update() before the next push.
+bool current_allocation::push_path(index_t pushed_reg, capacity_t demanded, capacity_t & flow){
+    // Get the final flow sent, which is smaller than the capacities on the path
+    flow = demanded;
+    for(index_t reg = pushed_reg; reg != null_ind; reg = r_parents[reg]){
+        flow = std::min(flow, arc_capacities[reg]);
+    }
+
+    bool rerun_dijkstra = false;
+    // Update the path between the regions
+    index_t reg = pushed_reg;
+    for(; r_parents[reg] != null_ind; reg = r_parents[reg]){
+        assert(r_capacities[reg] == 0);
+        rerun_dijkstra = push_edge(reg, flow) or rerun_dijkstra;
+    }
+
+    // reg is now the last region of the path, the only one with free capacity
+    assert(r_capacities[reg] > 0);
+    assert(arc_capacities[reg] == r_capacities[reg]);
+    assert(r_capacities[reg] >= flow);
+
+    // Update the capacities at the end
+    r_capacities[reg] -= flow;
+    arc_capacities[reg] -= flow;
+
+    // The last region on the path is the one that satisfies the demand
+    if(r_capacities[reg] == 0){ // If we just consumed the available capacity, it becomes useful to move sources off this region: build the heap
+        create_heaps(reg);
+        rerun_dijkstra = true;
+    }
+
+    // NOTE(review): this is only checked in debug builds and after the updates above
+    assert(flow > 0);
+
+    // If an edge changes cost or a region is full,
+    // we need to update the costs, parents, sources and arc_capacities using a Dijkstra
+    // but later
+    return rerun_dijkstra;
+}
+
+// Allocates the whole demand s_demands[elt_ind] of source elt_ind to the regions.
+// Repeatedly picks the region minimizing r_costs[reg] + sr_costs[reg][elt_ind],
+// pushes flow along its path, and reruns Dijkstra whenever a push asked for it.
+// Throws std::runtime_error when no region has a finite total cost.
+// NOTE(review): one allocation slot is appended per region on each call, so this
+// presumably expects sources to be added in increasing index order — confirm against callers.
+void current_allocation::add_source(index_t elt_ind){ //capacity_t demand, std::vector<float_t> const & costs){
+    for(index_t i=0; i<region_cnt(); ++i){
+        sr_allocations[i].push_back(0);
+    }
+
+    bool need_rerun = false;
+    capacity_t demand = s_demands[elt_ind];
+
+    while(demand > 0){
+        // In case we modified the structures earlier
+        if(need_rerun){
+            dijkstra_update();
+            need_rerun = false;
+        }
+
+        // Counts the iterations of this loop (reported by get_iterations_cnt)
+        ++ dijkstra_cnt;
+        index_t best_reg = null_ind;
+        float_t best_cost = std::numeric_limits<float_t>::infinity();
+        for(index_t reg=0; reg<region_cnt(); ++reg){
+            // Find the region which gets the source
+            if(r_costs[reg] + sr_costs[reg][elt_ind] < best_cost){
+                best_reg = reg;
+                best_cost = r_costs[reg] + sr_costs[reg][elt_ind];
+            }
+        }
+        if(best_reg == null_ind){ throw std::runtime_error("No reachable region found\n"); }
+
+        capacity_t flow = 0;
+        // Tells us whether we need to update the data structures
+        need_rerun = push_path(best_reg, demand, flow);
+        demand -= flow;
+
+        // Lazily store the change
+        sr_allocations[best_reg][elt_ind] += flow;
+    }
+
+    // Make the new source movable out of every full region it has been allocated to
+    for(index_t i=0; i<region_cnt(); ++i){
+        if(r_capacities[i] == 0 and sr_allocations[i][elt_ind] > 0){
+            need_rerun = add_source_to_heaps(i, elt_ind) or need_rerun;
+        }
+    }
+    // We leave a clean set with correct paths for the next iteration
+    if(need_rerun)
+        dijkstra_update();
+}
+
+} // End anonymous namespace
+
+// Solves the transportation problem by adding the sources one by one, in index order,
+// to an incremental allocation; returns the resulting allocations as given by get_allocations().
+std::vector<std::vector<capacity_t> > transport_generic(std::vector<capacity_t> const & capacities, std::vector<capacity_t> const & demands, std::vector<std::vector<float_t> > const & costs){
+    current_allocation solver(capacities, demands, costs);
+
+    index_t src = 0;
+    while(src < demands.size()){
+        solver.add_source(src);
+        ++src;
+    }
+
+    return solver.get_allocations();
+}
+
+// Computes positions for the cells of one row, processed in index order.
+//   widths:       width of each cell
+//   ranges:       per cell, (first, second) limits used as lower limit for the cell's start and upper limit for its end
+//   bounds:       breakpoints attached to cells (fields c, pos, slope); taken by value because it is sorted here
+//   const_slopes: initial slope used for each cell
+//   positions:    output, resized to widths.size() on success
+// Returns false, leaving positions untouched, as soon as a cell's limits cannot be satisfied.
+// NOTE(review): looks like a minimization of a convex piecewise linear cost whose slope
+// changes are given by bounds — confirm against the header documentation.
+bool place_convex_single_row(std::vector<int_t> const & widths, std::vector<std::pair<int_t, int_t> > const & ranges, std::vector<cell_bound> bounds, std::vector<int_t> const & const_slopes, std::vector<int_t> & positions){
+    std::sort(bounds.begin(), bounds.end());
+
+    // Breakpoint expressed relative to the start of the row (cell position minus the widths before it)
+    struct bound{
+        int_t abs_pos;
+        int_t slope_diff;
+
+        bool operator<(bound const o) const{ return abs_pos < o.abs_pos; }
+        bound(int_t p, int_t s) : abs_pos(p), slope_diff(s) {}
+    };
+    // Max-heap: the breakpoint with the largest abs_pos is on top
+    std::priority_queue<bound> prio_queue;
+
+    // prev_widths[i] is the sum of the widths of the cells before i
+    std::vector<int_t> prev_widths(widths.size()+1, 0);
+    std::partial_sum(widths.begin(), widths.end(), std::next(prev_widths.begin()));
+
+    std::vector<int_t> constraining_pos(widths.size());
+
+    int_t lower_lim = std::numeric_limits<int_t>::min();
+    for(index_t i=0, j=0; i<widths.size(); ++i){
+        int_t old_width = prev_widths[i];
+        int_t new_width = prev_widths[i+1];
+
+        // The lower limit accumulates over the cells already seen; the upper limit is the current cell's only
+        lower_lim = std::max(ranges[i].first - old_width, lower_lim);
+        int_t upper_lim = ranges[i].second - new_width;
+
+        // Push the breakpoints of cell i (bounds is sorted, j only moves forward)
+        for(; j<bounds.size() and bounds[j].c == i; ++j){
+            prio_queue.push(bound(bounds[j].pos - old_width, bounds[j].slope));
+        }
+
+        if(upper_lim < lower_lim){ // Infeasible
+            return false;
+        }
+        int_t cur_slope = const_slopes[i];
+        int_t cur_pos = upper_lim;
+
+        // Pop breakpoints while the slope is still positive or the breakpoint lies beyond the upper limit
+        while(not prio_queue.empty() and (cur_slope > 0 or prio_queue.top().abs_pos > upper_lim)){
+            cur_slope -= prio_queue.top().slope_diff;
+            cur_pos = prio_queue.top().abs_pos;
+            prio_queue.pop();
+        }
+        // Clamp the selected position to [lower_lim, upper_lim]
+        int_t final_abs_pos = std::max(std::min(cur_pos, upper_lim), lower_lim);
+        constraining_pos[i] = final_abs_pos;
+        if(cur_slope < 0){
+            // Keep the remaining (negative) slope as a single breakpoint at the selected position
+            prio_queue.push(bound(final_abs_pos, -cur_slope));
+        }
+    }
+
+    // Suffix minimum: positions[i] becomes the smallest constraining position among cells i and after,
+    // then the widths of the previous cells are added back
+    positions.resize(constraining_pos.size());
+    std::partial_sum(constraining_pos.rbegin(), constraining_pos.rend(), positions.rbegin(), [](int_t a, int_t b)->int_t{ return std::min(a,b); });
+    for(index_t i=0; i<positions.size(); ++i){
+        positions[i] += prev_widths[i];
+    }
+    return true;
+}
+
+// Places one row of cells without flipping any of them: every entry of flippings
+// is set to 0 and the positions come from place_convex_single_row, whose return
+// value is forwarded. The flippables argument is currently not used.
+bool place_noncvx_single_row(std::vector<int_t> const & widths, std::vector<std::pair<int_t, int_t> > const & ranges, std::vector<int> const & flippables, std::vector<cell_bound> bounds, std::vector<int_t> const & const_slopes, std::vector<int_t> & positions, std::vector<int> & flippings){
+    // positions is an output that is only resized by the call below, so its size on
+    // entry is arbitrary: size the flippings on the number of cells instead
+    flippings.assign(widths.size(), 0);
+    return place_convex_single_row(widths, ranges, bounds, const_slopes, positions);
+}
+
+} // Namespace coloquinte
+
--- a/coloquinte/src/orientation.cxx
+++ b/coloquinte/src/orientation.cxx
@ -0,0 +1,166 @@
+#include "coloquinte/circuit_helper.hxx"
+
+#include <stack>
+#include <functional>
+#include <algorithm>
+
+namespace coloquinte{
+namespace gp{
+
+namespace{
+// Sentinel index: the largest value representable by index_t
+index_t const null_ind = std::numeric_limits<index_t>::max();
+
+// Chooses, along one axis, the orientation of every cell whose attributes contain FLIPPABLE.
+//   i_coor:    extracts the coordinate of the considered axis from an integer point
+//   b_coor:    gives access to the orientation flag of the considered axis
+//   FLIPPABLE: attribute mask selecting the cells that may be flipped
+// For each candidate cell the summed extent of its nets is computed for both
+// orientations and the strictly cheaper one is kept. When a cell is flipped, the
+// flippable cells recorded as extreme elements of its nets are examined again.
+inline void opt_orient(netlist const & circuit, placement_t & pl, std::function<int_t (point<int_t>)> i_coor, std::function<bool & (point<bool> &)> b_coor,mask_t FLIPPABLE){
+    // Work stack of the cells still to examine
+    std::stack<index_t> opt_cells;
+    for(index_t cell_ind = 0; cell_ind < circuit.cell_cnt(); ++cell_ind){
+        if( (circuit.get_cell(cell_ind).attributes & FLIPPABLE) != 0)
+            opt_cells.push(cell_ind);
+    }
+    while(not opt_cells.empty()){
+        index_t cell_ind = opt_cells.top(); opt_cells.pop();
+        assert((circuit.get_cell(cell_ind).attributes & FLIPPABLE) != 0);
+
+        // What is the current orientation?
+        bool old_orientation = b_coor(pl.orientations_[cell_ind]);
+        int_t pos = i_coor(pl.positions_[cell_ind]);
+        int_t size = i_coor(circuit.get_cell(cell_ind).size);
+
+        // Check both orientations of the cell
+        std::vector<index_t> involved_nets;
+        for(netlist::pin_t p : circuit.get_cell(cell_ind)){
+            involved_nets.push_back(p.net_ind);
+        }
+        // Deal with cells with multiple pins in one net (uniquify)
+        std::sort(involved_nets.begin(), involved_nets.end());
+        involved_nets.resize(std::distance(involved_nets.begin(), std::unique(involved_nets.begin(), involved_nets.end())));
+
+        // p_cost: cost with the orientation flag set to true; n_cost: cost with the flag set to false
+        std::int64_t p_cost = 0, n_cost = 0;
+        std::vector<index_t> extreme_elements;
+        for(index_t n : involved_nets){
+            // Pins of the net on other cells (at their current position) and offsets of this cell's own pins
+            std::vector<pin_1D> other_pins;
+            std::vector<int_t> offsets;
+            for(auto p : circuit.get_net(n)){
+                if(p.cell_ind != cell_ind){
+                    // A true flag places the pin at its offset, a false one at size - offset
+                    other_pins.push_back(pin_1D(
+                        p.cell_ind,
+                        i_coor(pl.positions_[p.cell_ind])
+                + (b_coor(pl.orientations_[p.cell_ind]) ? i_coor(p.offset) : i_coor(circuit.get_cell(p.cell_ind).size) - i_coor(p.offset)),
+                        0, // Don't care about the offset
+                        (circuit.get_cell(p.cell_ind).attributes & FLIPPABLE) != 0)
+                    );
+                }
+                else{
+                    offsets.push_back(i_coor(p.offset));
+                }
+            }
+            assert(offsets.size() > 0);
+            if(other_pins.size() > 0){ // Else the orientation of the cell doesn't change anything
+                auto minmaxC = std::minmax_element(other_pins.begin(), other_pins.end());
+                auto minmaxO = std::minmax_element(offsets.begin(), offsets.end());
+                // Extent of the net (max pin position - min pin position) for each orientation of this cell
+                p_cost += std::max(pos + *minmaxO.second, minmaxC.second->pos) - std::min(pos + *minmaxO.first, minmaxC.first->pos);
+                n_cost += std::max(pos + size - *minmaxO.first, minmaxC.second->pos) - std::min(pos + size - *minmaxO.second, minmaxC.first->pos);
+
+                int_t min_pin_pos = std::min(pos + *minmaxO.second, pos + size - *minmaxO.first),
+                      max_pin_pos = std::max(pos + *minmaxO.second, pos + size - *minmaxO.first);
+
+                // Do the extreme elements change between the two positions?
+                if(minmaxC.second->movable
+              and (minmaxC.second->pos < max_pin_pos)
+              and (minmaxC.second->pos > min_pin_pos) ){
+                    extreme_elements.push_back(minmaxC.second->cell_ind);
+                }
+                if(minmaxC.first->movable
+              and (minmaxC.first->pos < max_pin_pos)
+              and (minmaxC.first->pos > min_pin_pos) ){ 
+                    extreme_elements.push_back(minmaxC.first->cell_ind);
+                }
+            }
+        }
+
+        // On a tie the current orientation is kept
+        if(p_cost < n_cost)
+            b_coor(pl.orientations_[cell_ind]) = true;
+        if(p_cost > n_cost)
+            b_coor(pl.orientations_[cell_ind]) = false;
+
+        // If we changed the orientation, check the extreme pins which changed and try their cells again
+        if(b_coor(pl.orientations_[cell_ind]) != old_orientation){
+            std::sort(extreme_elements.begin(), extreme_elements.end());
+            extreme_elements.resize(std::distance(extreme_elements.begin(), std::unique(extreme_elements.begin(), extreme_elements.end())));
+            for(index_t extreme_cell : extreme_elements){
+                if( (circuit.get_cell(extreme_cell).attributes & FLIPPABLE) != 0)
+                    opt_cells.push(extreme_cell);
+            }
+        }
+    }
+}
+/*
+inline void spread_orient(netlist const & circuit, placement_t & pl, std::function<float_t & (point<float_t> &)> coor, mask_t FLIPPABLE){
+    std::vector<float_t> weights(circuit.cell_cnt(), 0.0);
+    for(index_t n=0; n<circuit.net_cnt(); ++n){
+        float_t min_pos=INF, max_pos=-INF;
+        float_t min_offs=INF, max_offs=-INF;
+        index_t min_ind=null_ind, max_ind=null_ind;
+        for(netlist::pin_t p : circuit.get_net(n)){
+            if( (circuit.get_cell(p.cell_ind).attributes & FLIPPABLE) != 0){
+                float_t pos = coor(pl.positions_[p.cell_ind]);
+                if(pos < min_pos){
+                    min_pos = pos;
+                    min_ind = p.cell_ind;
+                    min_offs = coor(p.offset);
+                }
+                if(pos > max_pos){
+                    max_pos = pos;
+                    max_ind = p.cell_ind;
+                    max_offs = coor(p.offset);
+                }
+            }
+            else{
+                float_t pos = coor(pl.positions_[p.cell_ind]) + coor(pl.orientations_[p.cell_ind]) * coor(p.offset);
+                if(pos < min_pos){
+                    min_pos = pos;
+                    min_ind = null_ind;
+                }
+                if(pos > max_pos){
+                    max_pos = pos;
+                    max_ind = null_ind;
+                }
+            }
+        }
+
+        float_t net_weight = circuit.get_net(n).weight;
+
+        if(min_ind != null_ind) weights[min_ind] += net_weight * min_offs;
+        if(max_ind != null_ind) weights[max_ind] -= net_weight * max_offs;
+    }
+
+    for(index_t c=0; c<circuit.cell_cnt(); ++c){
+        coor(pl.orientations_[c]) = (weights[c] >= 0.0) ? 1.0 : -1.0;
+    }
+}
+*/
+} // End anonymous namespace
+
+// Optimizes the orientation of the XFlippable cells along the x axis
+void optimize_x_orientations(netlist const & circuit, placement_t & pl){
+    auto const x_of_point = [](point<int_t> p) -> int_t { return p.x_; };
+    auto const x_of_flags = [](point<bool> & p) -> bool & { return p.x_; };
+    opt_orient(circuit, pl, x_of_point, x_of_flags, XFlippable);
+}
+// Optimizes the orientation of the YFlippable cells along the y axis
+void optimize_y_orientations(netlist const & circuit, placement_t & pl){
+    auto const y_of_point = [](point<int_t> p) -> int_t { return p.y_; };
+    auto const y_of_flags = [](point<bool> & p) -> bool & { return p.y_; };
+    opt_orient(circuit, pl, y_of_point, y_of_flags, YFlippable);
+}
+
+// Optimizes the cell orientations axis by axis: first the x orientations, then
+// the y orientations. Each axis is processed exactly once, so the x result is
+// not revisited after the y pass.
+void optimize_exact_orientations(netlist const & circuit, placement_t & pl){
+    optimize_x_orientations(circuit, pl);
+    optimize_y_orientations(circuit, pl);
+}
+
+/*
+void spread_orientations(netlist const & circuit, placement_t & pl){
+    spread_orient(circuit, pl, [](point<float_t> & p) -> float_t & { return p.x_; }, XFlippable);
+    spread_orient(circuit, pl, [](point<float_t> & p) -> float_t & { return p.y_; }, YFlippable);
+}
+*/
+
+} // namespace gp
+} // namespace coloquinte
+
+
--- a/coloquinte/src/piecewise_linear.cxx
+++ b/coloquinte/src/piecewise_linear.cxx
@ -0,0 +1,258 @@
+
+#include "coloquinte/piecewise_linear.hxx"
+
+#include <cassert>
+
+namespace coloquinte{
+
+namespace{
+
+// Segment of a piecewise linear function between two breakpoints:
+// f is the left endpoint and s the right one, each a (position, value) pair.
+struct pl_edge{
+    p_v f, s;
+
+    // Appends to lf the integer position(s) around the crossing of edges a and b,
+    // provided they lie strictly inside the range shared by both edges.
+    // Nothing is appended for parallel or vertical edges.
+    static void push_intersections(pl_edge a, pl_edge b, piecewise_linear_function & lf){
+        // Strict, because it makes everything easier
+        //assert(a.f.first < b.s.first and a.s.first > b.f.first);
+        //assert(a.f.first < a.s.first and b.f.first < b.s.first);
+        assert(a.f.first <= b.s.first and a.s.first >= b.f.first);
+        assert(a.f.first <= a.s.first and b.f.first <= b.s.first);
+
+        // ra = (a.s.second - a.f.second) / (a.s.first - a.f.first)
+        // xintersect = (yb - ya - xb * rb + xa * ra) / (ra - rb)
+        // Everything is computed in double so that no intermediate integer difference can overflow
+        double ra = (static_cast<double>(a.s.second) - a.f.second) / (static_cast<double>(a.s.first) - a.f.first);
+        double rb = (static_cast<double>(b.s.second) - b.f.second) / (static_cast<double>(b.s.first) - b.f.first);
+
+        double xintersect = (static_cast<double>(b.f.second) - a.f.second - rb * b.f.first + ra * a.f.first) / (ra - rb);
+
+        int_t const lo = std::max(a.f.first, b.f.first);
+        int_t const hi = std::min(a.s.first, b.s.first);
+        // Rejects crossings outside the shared open range, and also NaN and infinities
+        // (parallel or vertical edges), for which every comparison below is false.
+        // This must happen before the conversion to an integer, which is undefined
+        // for values that do not fit. A crossing at position 0 is a valid one.
+        if(not (xintersect > lo and xintersect < hi)) return;
+
+        // Round downwards, so that pos and pos + 1 bracket the crossing for negative positions too
+        double const floored = std::floor(xintersect);
+        int_t const pos = static_cast<int_t>(floored);
+        if(floored == xintersect){ // Exact integer intersection
+            lf.point_values.push_back(p_v(pos, a.value_at(pos)));
+        }
+        else{ // Non exact intersection: create two integers since I don't want to mess with floating point
+            int_t pos1 = pos;
+            int_t pos2 = pos + 1;
+            // Value_at is only an approximation, but it shouldn't be too bad
+            if(pos1 > lo and pos1 < hi)
+                lf.point_values.push_back(p_v(pos1, std::min(a.value_at(pos1), b.value_at(pos1))));
+            if(pos2 > lo and pos2 < hi)
+                lf.point_values.push_back(p_v(pos2, std::min(a.value_at(pos2), b.value_at(pos2))));
+        }
+    }
+
+    // Value of the edge at pos, by linear interpolation; the integer division truncates toward zero
+    int_t value_at(int_t pos) const{
+        assert(pos >= f.first and pos <= s.first and s.first > f.first);
+        return (static_cast<std::int64_t>(f.second) * (s.first - pos) + static_cast<std::int64_t>(s.second) * (pos - f.first)) / (s.first - f.first);
+    }
+    // Position at which the edge takes the value val, by linear interpolation; the integer division truncates toward zero
+    int_t pos_at(int_t val) const{
+        assert(val <= std::max(f.second, s.second) and val >= std::min(f.second, s.second));
+        assert(f.second != s.second);
+        return (static_cast<std::int64_t>(f.first) * (s.second - val) + static_cast<std::int64_t>(s.first) * (val - f.second)) / (s.second - f.second);
+    }
+
+    // True when the edge passes strictly above the point o; o must lie strictly between the two endpoints
+    bool above(p_v const o) const{
+        int_t pos = o.first;
+        assert(pos > f.first and pos < s.first);
+        // Both sides are evaluated in 64 bits so that they overflow neither separately nor differently
+        return (static_cast<std::int64_t>(f.second) * (s.first - pos) + static_cast<std::int64_t>(s.second) * (pos - f.first)) > static_cast<std::int64_t>(o.second) * (s.first - f.first);
+    }
+
+    pl_edge(p_v a, p_v b) : f(a), s(b) {}
+};
+} // End anonymous namespace
+
+// Adds a linear function of the given slope to every breakpoint value.
+// The added term is zero at the position `offset` past the first breakpoint.
+void piecewise_linear_function::add_monotone(int_t slope, int_t offset){
+    for(auto it = point_values.begin(); it != point_values.end(); ++it){
+        // Only the values are modified, so the first breakpoint's position stays the reference throughout
+        it->second += slope * (it->first - point_values.front().first - offset);
+    }
+}
+
+// Adds to the function a two-slope term that is zero at pos: s_l * (x - pos)
+// on the left of pos and s_r * (x - pos) on its right. When pos falls strictly
+// between two existing breakpoints, a breakpoint is first created at pos.
+void piecewise_linear_function::add_bislope(int_t s_l, int_t s_r, int_t pos){
+    // First breakpoint located at pos or after it
+    auto next_point = std::lower_bound(point_values.begin(), point_values.end(), pos, [](p_v o, int_t v){ return o.first < v; });
+
+    bool const has_next     = next_point != point_values.end();
+    bool const has_previous = next_point != point_values.begin();
+    if(has_next and has_previous and next_point->first != pos){
+        assert(next_point->first > pos);
+        // Interpolate the current value at pos on the edge that contains it
+        pl_edge const enclosing(*std::prev(next_point), *next_point);
+        point_values.insert(next_point, p_v(pos, enclosing.value_at(pos)));
+    }
+
+    for(auto & breakpoint : point_values){
+        if(breakpoint.first == pos) continue; // The added term is zero at pos
+        int_t const slope = (breakpoint.first > pos) ? s_r : s_l;
+        breakpoint.second += slope * (breakpoint.first - pos);
+    }
+}
+
+// Builds a function with two breakpoints, at min_def and max_def, both of value 0
+piecewise_linear_function::piecewise_linear_function(int_t min_def, int_t max_def){
+    point_values.push_back(p_v(min_def, 0));
+    point_values.push_back(p_v(max_def, 0));
+}
+
+// Returns a function built from the breakpoints that set a new strict minimum
+// when scanning from left to right. When the minimum is beaten in the middle of
+// an edge, the position where the edge reaches the previous minimum is added too.
+// The function must have at least one breakpoint.
+piecewise_linear_function piecewise_linear_function::previous_min() const{
+
+    piecewise_linear_function ret;
+
+    assert(not point_values.empty());
+
+    // The first breakpoint is always kept
+    auto it = point_values.begin();
+    ret.point_values.push_back(*it);
+    ++it;
+    // Use the previous minimum to detect when we find something smaller
+    for(; it != point_values.end(); ++it){
+        int_t cur_min = ret.point_values.back().second;
+        assert(it->first >= ret.point_values.back().first);
+        if(it->second < cur_min){
+            if(std::prev(it)->first != ret.point_values.back().first){ // May be equal, in which case we don't need to push anything new
+                // Position on the edge ending at *it where the value equals the previous minimum
+                int_t pos = pl_edge(*std::prev(it), *it).pos_at(cur_min);
+                if(pos != ret.point_values.back().first and pos != it->first){
+                    ret.point_values.push_back(p_v(pos, cur_min));
+                }
+            }
+            ret.point_values.push_back(*it);
+        }
+    }
+    return ret;
+}
+
+// Sums a with this function translated by `shift` (a breakpoint at x of this
+// function is used at x + shift), then returns previous_min() of the result.
+// Both breakpoint lists are merged in increasing position order; the walk stops
+// once all the breakpoints of a have been consumed.
+piecewise_linear_function piecewise_linear_function::previous_min_of_sum(piecewise_linear_function const & a, int_t shift) const{
+    piecewise_linear_function ret;
+
+    // Go to the correct definition
+    // b_* iterate over this function, a_* over the argument
+    auto b_begin = point_values.begin(), a_begin = a.point_values.begin();
+    auto b_it = b_begin, a_it = a_begin;
+    auto b_end = point_values.end(), a_end = a.point_values.end();
+
+    while(a_it != a_end){
+        if(b_it == b_end or a_it->first < b_it->first+shift){ // Ok, create an edge and calculate the value
+            // Breakpoint of a comes first; it is skipped when it lies before the first breakpoint of this function
+            if(b_it != b_begin){
+                int_t value;
+                if(b_it != b_end){
+                    pl_edge b_edge(*std::prev(b_it), *b_it);
+                    value = b_edge.value_at(a_it->first-shift);
+                }
+                else{
+                    // Past the last breakpoint of this function: its last value is used
+                    value = point_values.back().second;
+                }
+                ret.point_values.push_back(p_v(a_it->first, a_it->second + value));
+            }
+            ++a_it;
+        }
+        else if(a_it->first > b_it->first+shift){
+            // Breakpoint of this function comes first; it is skipped when it lies before the first breakpoint of a
+            if(a_it != a_begin){
+                pl_edge a_edge(*std::prev(a_it), *a_it);
+                int_t value = a_edge.value_at(b_it->first+shift);
+                ret.point_values.push_back(p_v(b_it->first+shift, b_it->second + value));
+            }
+            ++b_it;
+        }
+        else{ // if(a_it->first == b_it->first+shift){
+            ret.point_values.push_back(p_v(a_it->first, a_it->second + b_it->second));
+            ++a_it;
+            ++b_it;
+        }
+    }
+
+    return ret.previous_min();
+}
+
+
+// Looks for the last breakpoint located at pos or before it.
+// Returns pos itself when pos lies strictly between that breakpoint and the
+// next one, and the position of the breakpoint otherwise.
+// pos must not lie before the first breakpoint.
+int_t piecewise_linear_function::last_before(int_t pos) const{
+    auto it = point_values.rbegin();
+    while(it != point_values.rend()){
+        if(it->first <= pos){
+            if(it != point_values.rbegin() and std::prev(it)->first > pos){ // On a negative slope
+                return pos;
+            }
+            else{
+                return it->first; // First point or not mapped to a negative slope in the original function
+            }
+        }
+        ++it;
+    }
+    assert(false); // We should have found it if the bound was correct
+    // Without assertions the function would otherwise end without returning a
+    // value, which is undefined behaviour: hand back the queried position instead
+    return pos;
+}
+
+// Value of the function at pos: the value of the breakpoint located at pos if
+// there is one, else the interpolation on the edge that contains pos.
+// pos must lie between the first and the last breakpoint (both included).
+int_t piecewise_linear_function::value_at(int_t pos) const{
+    // First position bigger or equal than pos
+    auto it = std::lower_bound(point_values.begin(), point_values.end(), pos, [](p_v o, int_t v){ return o.first < v; });
+    // The iterator is dereferenced just below: catch a position past the last breakpoint
+    assert(it != point_values.end());
+    if(pos != it->first){
+        assert(it != point_values.begin());
+        return pl_edge(*std::prev(it), *it).value_at(pos);
+    }
+    else{
+        return it->second;
+    }
+}
+
+// Builds the pointwise minimum of a and b.
+// Both functions must start and end at the same positions and have at least
+// two breakpoints each. The edges of both functions are walked simultaneously:
+// crossings are added by pl_edge::push_intersections, and an edge endpoint is
+// kept when the edge of the other function passes strictly above it.
+piecewise_linear_function piecewise_linear_function::minimum(piecewise_linear_function const & a, piecewise_linear_function const & b){
+    assert(a.point_values.front().first == b.point_values.front().first);
+    assert(a.point_values.back().first == b.point_values.back().first);
+
+    piecewise_linear_function ret;
+    auto a_it = a.point_values.begin(), b_it = b.point_values.begin();
+    auto a_end = a.point_values.end(), b_end = b.point_values.end();
+
+    ret.point_values.push_back(p_v(a_it->first, std::min(a_it->second, b_it->second)));
+
+    assert(std::next(a_it) != a_end and std::next(b_it) != b_end);
+    while(std::next(a_it) != a_end and std::next(b_it) != b_end){
+        pl_edge a_edge(*a_it, *std::next(a_it)), b_edge(*b_it, *std::next(b_it));
+        // Three cases: one of them always below, or both intersect
+        // Both intersect: we push the values when intersecting
+        pl_edge::push_intersections(a_edge, b_edge, ret);
+
+        // In any case, we push the value of the one below if it finishes, and increment the iterator
+        if(a_edge.s.first < b_edge.s.first){
+            ++a_it;
+            if(b_edge.above(a_edge.s)){ // We push a_edge.s
+                ret.point_values.push_back(a_edge.s);
+            }
+        }
+        else if(a_edge.s.first > b_edge.s.first){
+            ++b_it;
+            if(a_edge.above(b_edge.s)){ // We push b_edge.s
+                ret.point_values.push_back(b_edge.s);
+            }
+        }
+        else{
+            // Both edges end at the same position: keep the lower of the two values
+            ret.point_values.push_back(p_v(a_edge.s.first, std::min(a_edge.s.second, b_edge.s.second)));
+            ++a_it;
+            ++b_it;
+        }
+    }
+    return ret;
+}
+
+} // End namespace coloquinte
+
--- a/coloquinte/src/rough_legalizers.cxx
+++ b/coloquinte/src/rough_legalizers.cxx
--- a/coloquinte/src/row_opt.cxx
+++ b/coloquinte/src/row_opt.cxx
@ -0,0 +1,599 @@
+
+#include "coloquinte/detailed.hxx"
+#include "coloquinte/circuit_helper.hxx"
+
+#include "coloquinte/optimization_subproblems.hxx"
+#include "coloquinte/union_find.hxx"
+#include "coloquinte/piecewise_linear.hxx"
+
+#include <cassert>
+
+#include <iostream>
+
+namespace coloquinte{
+namespace dp{
+
+namespace{
+
+// Interval [min, max] of coordinates that can be widened to cover more values.
+struct minmax{
+    int_t min, max;
+
+    // Does not initialize the bounds: assign them before reading
+    minmax(){}
+    minmax(int_t f, int_t s) : min(f), max(s){}
+
+    // Widen the interval so that it also covers the interval o
+    void merge(minmax const o){
+        if(o.min < min) min = o.min;
+        if(max < o.max) max = o.max;
+    }
+    // Widen the interval so that it also covers the single value o
+    void merge(int_t const o){
+        if(o < min) min = o;
+        if(max < o) max = o;
+    }
+};
+
+// Associates a cell index with its rank in a sequence of cells.
+// Ordering only looks at the cell index, so a sorted array of order_gettr can be
+// searched by cell index with std::lower_bound.
+struct order_gettr{
+    index_t cell_ind, seq_order;
+
+    order_gettr(index_t c, index_t i)
+      : cell_ind(c)
+      , seq_order(i)
+    {}
+
+    // Compare with another entry
+    bool operator<(order_gettr const o) const{ return o.cell_ind > cell_ind; }
+    // Compare with a bare cell index
+    bool operator<(index_t const o) const{ return o > cell_ind; }
+};
+
+// Returns one order_gettr per entry of `cells`, holding the cell index and its
+// position in `cells`, sorted by cell index.
+std::vector<order_gettr> get_sorted_ordered_cells(std::vector<index_t> const & cells){
+    std::vector<order_gettr> ret;
+    ret.reserve(cells.size()); // The final size is known: allocate once
+    for(index_t i=0; i<cells.size(); ++i){
+        ret.emplace_back(cells[i], i);
+    }
+    std::sort(ret.begin(), ret.end());
+    return ret;
+}
+
+// Returns the sorted indexes, without duplicates, of every net that has a pin on
+// one of the given cells.
+std::vector<index_t> get_unique_nets(netlist const & circuit, std::vector<index_t> const & cells){
+    std::vector<index_t> net_inds;
+    for(index_t const cur_cell : cells){
+        for(netlist::pin_t cur_pin : circuit.get_cell(cur_cell)){
+            net_inds.push_back(cur_pin.net_ind);
+        }
+    }
+
+    // Sort, then drop the duplicates that are now adjacent
+    std::sort(net_inds.begin(), net_inds.end());
+    auto const unique_end = std::unique(net_inds.begin(), net_inds.end());
+    net_inds.erase(unique_end, net_inds.end());
+    return net_inds;
+}
+
+// Horizontal (x axis) model of the nets seen by a group of cells.
+// Cells are designated by their index inside the group, not by their index in
+// the circuit. The pins of net n on the cells of the group are
+// pins[net_limits[n]] ... pins[net_limits[n+1]-1], with at most one entry per
+// cell; the pins that do not move are summarized by nets[n].ext_pins.
+// cell_widths is not filled by this struct but is read by get_cost(pos, flip),
+// indexed like the cells of the group.
+struct Hnet_group{
+    // All the pins of one net on one cell, as an interval of offsets
+    struct Hpin{
+        index_t cell_index; // Not indexes in the circuit!!! Rather in the internal algorithm
+        minmax offset;
+        bool operator<(Hpin const o) const{ return cell_index < o.cell_index; }
+    };
+    struct Hnet{
+        bool has_ext_pins; // True once a fixed pin has been merged in ext_pins
+        minmax ext_pins;   // Interval covered by the fixed pins; only meaningful if has_ext_pins
+        int_t weight;
+
+        Hnet(){
+            has_ext_pins = false;
+            // Empty interval, same sentinels as in get_cost: the first merged
+            // position replaces both bounds, even when it is negative
+            ext_pins = minmax(std::numeric_limits<int_t>::max(), std::numeric_limits<int_t>::min());
+            weight = 1;
+        }
+    };
+
+    std::vector<index_t> net_limits;
+    std::vector<Hnet> nets;
+
+    std::vector<Hpin> pins;
+    std::vector<int_t> cell_widths;
+
+    Hnet_group(){
+        net_limits.push_back(0);
+    }
+
+    // Adds a net made of the given pins with the given weight.
+    // Movable pins are grouped by cell (p.cell_ind is used as the cell index in
+    // the group); the positions of the other pins are merged into ext_pins.
+    // A net without any movable pin is not added at all.
+    void add_net(std::vector<pin_1D> const & added_pins, int_t weight){
+        Hnet cur_net;
+        cur_net.weight = weight;
+        std::vector<Hpin> new_pins;
+        for(auto const & p : added_pins){
+            if(p.movable){
+                Hpin new_pin;
+                new_pin.cell_index = p.cell_ind;
+                new_pin.offset = minmax(p.offs, p.offs);
+                new_pins.push_back(new_pin);
+            }
+            else{
+                cur_net.has_ext_pins = true;
+                cur_net.ext_pins.merge(p.pos);
+            }
+        }
+
+        if(new_pins.empty()){ // Possible when generating from a Steiner topology
+            return;
+        }
+
+        // Uniquify just in case there are several pins on the net on a single cell:
+        // after sorting, new_pins[last] accumulates the offsets of the current cell
+        std::sort(new_pins.begin(), new_pins.end());
+        index_t last=0;
+        for(index_t i=1; i<new_pins.size(); ++i){
+            if(new_pins[i].cell_index == new_pins[last].cell_index){
+                new_pins[last].offset.merge(new_pins[i].offset);
+            }
+            else{
+                ++last;
+                new_pins[last] = new_pins[i];
+            }
+        }
+        new_pins.resize(last+1);
+
+        nets.push_back(cur_net);
+        net_limits.push_back(net_limits.back() + new_pins.size());
+        pins.insert(pins.end(), new_pins.begin(), new_pins.end());
+    }
+
+    // Sum over the nets of weight * (max pin position - min pin position), for
+    // unflipped cells. pos is indexed by cell index in the group.
+    std::int64_t get_cost(std::vector<int_t> const & pos) const{
+        std::int64_t cost=0;
+        for(index_t n=0; n<nets.size(); ++n){
+            Hnet const & cur_net = nets[n];
+
+            minmax mm(std::numeric_limits<int_t>::max(), std::numeric_limits<int_t>::min());
+            if(cur_net.has_ext_pins){
+                mm = cur_net.ext_pins;
+            }
+
+            assert(net_limits[n+1] > net_limits[n]);
+            for(index_t p=net_limits[n]; p<net_limits[n+1]; ++p){
+                Hpin const & cur_pin = pins[p];
+                int_t cur_pos = pos[cur_pin.cell_index];
+                mm.merge( minmax(cur_pos + cur_pin.offset.min, cur_pos + cur_pin.offset.max) );
+            }
+            cost += static_cast<std::int64_t>(cur_net.weight) * (mm.max - mm.min);
+        }
+        return cost;
+    }
+
+    // Same cost, where a non-zero flip[c] mirrors the pin offsets of cell c
+    // within its width (offset o becomes cell_widths[c] - o).
+    // pos and flip are indexed by cell index in the group.
+    std::int64_t get_cost(std::vector<int_t> const & pos, std::vector<int> const & flip) const{
+        std::int64_t cost=0;
+        for(index_t n=0; n<nets.size(); ++n){
+            Hnet const & cur_net = nets[n];
+
+            minmax mm(std::numeric_limits<int_t>::max(), std::numeric_limits<int_t>::min());
+            if(cur_net.has_ext_pins){
+                mm = cur_net.ext_pins;
+            }
+
+            assert(net_limits[n+1] > net_limits[n]);
+            for(index_t p=net_limits[n]; p<net_limits[n+1]; ++p){
+                Hpin const & cur_pin = pins[p];
+                int_t cur_pos = pos[cur_pin.cell_index];
+                bool flipped  = flip[cur_pin.cell_index];
+                int_t wdth    = cell_widths[cur_pin.cell_index];
+                mm.merge( flipped ?
+                    minmax(cur_pos + wdth - cur_pin.offset.max, cur_pos + wdth - cur_pin.offset.min)
+                  : minmax(cur_pos + cur_pin.offset.min, cur_pos + cur_pin.offset.max)
+                );
+            }
+            cost += static_cast<std::int64_t>(cur_net.weight) * (mm.max - mm.min);
+        }
+        return cost;
+    }
+
+};
+
+// Builds the Hnet_group of the given cells with one net per circuit net.
+// Pins on a cell of the group are renumbered with the position of the cell in
+// `cells`; pins on any other cell are marked as not movable.
+Hnet_group get_B2B_netgroup(netlist const & circuit, detailed_placement const & pl, std::vector<index_t> const & cells){
+
+    std::vector<order_gettr> const sorted_cells = get_sorted_ordered_cells(cells);
+    std::vector<index_t> const net_inds = get_unique_nets(circuit, cells);
+
+    Hnet_group group;
+    for(index_t i=0; i<cells.size(); ++i){
+        group.cell_widths.push_back(circuit.get_cell(cells[i]).size.x_);
+    }
+
+    for(index_t const cur_net : net_inds){
+        std::vector<pin_1D> net_pins = get_pins_1D(circuit, pl.plt_, cur_net).x_;
+        for(pin_1D & cur_pin : net_pins){
+            // Binary search of the pin's cell among the cells of the group
+            auto const found = std::lower_bound(sorted_cells.begin(), sorted_cells.end(), cur_pin.cell_ind);
+            if(found == sorted_cells.end() or found->cell_ind != cur_pin.cell_ind){
+                // Found a pin which remains fixed for this round
+                cur_pin.movable = false;
+            }
+            else{
+                cur_pin.cell_ind = found->seq_order;
+            }
+        }
+        group.add_net(net_pins, circuit.get_net(cur_net).weight);
+    }
+
+    return group;
+}
+
+// Builds the Hnet_group of the given cells from the horizontal part of a
+// rectilinear Steiner topology of each net: a circuit net may be split in
+// several subnets, all of them with the weight of the circuit net.
+Hnet_group get_RSMT_netgroup(netlist const & circuit, detailed_placement const & pl, std::vector<index_t> const & cells){
+
+    std::vector<order_gettr> const sorted_cells = get_sorted_ordered_cells(cells);
+    std::vector<index_t> const net_inds = get_unique_nets(circuit, cells);
+
+    Hnet_group group;
+    for(index_t i=0; i<cells.size(); ++i){
+        group.cell_widths.push_back(circuit.get_cell(cells[i]).size.x_);
+    }
+
+    for(index_t const cur_net : net_inds){
+        auto net_pins = get_pins_2D(circuit, pl.plt_, cur_net);
+
+        // Renumber the pins on the cells of the group, fix the others, and
+        // gather the pin locations for the topology
+        std::vector<point<int_t> > locations;
+        for(auto & cur_pin : net_pins){
+            auto const found = std::lower_bound(sorted_cells.begin(), sorted_cells.end(), cur_pin.cell_ind);
+            if(found == sorted_cells.end() or found->cell_ind != cur_pin.cell_ind){
+                cur_pin.movable = false;
+            }
+            else{
+                cur_pin.cell_ind = found->seq_order;
+            }
+            locations.push_back(cur_pin.pos);
+        }
+        auto const topology = get_RSMT_topology(locations, 8).x_;
+
+        // In the horizontal topology, we transform the parts of the tree that are on the row into HPWL subnets
+        // Two pins sharing an edge are in the same subnet if one of them is on the row: use union-find
+        union_find components(net_pins.size());
+        for(auto const & edge : topology){
+            if(net_pins[edge.first].movable or net_pins[edge.second].movable){
+                components.merge(edge.first, edge.second);
+            }
+        }
+
+        // Gather the horizontal view of the pins of each component under its representative
+        std::vector<std::vector<pin_1D> > subnets(net_pins.size());
+        for(index_t i=0; i<net_pins.size(); ++i){
+            subnets[components.find(i)].push_back(net_pins[i].x());
+        }
+
+        int_t const weight = circuit.get_net(cur_net).weight;
+        for(auto const & subnet : subnets){
+            if(subnet.empty()) continue;
+            group.add_net(subnet, weight);
+        }
+    }
+
+    return group;
+}
+
+// Optimizes an ordered sequence of standard cells on the same row, returns the cost and the corresponding positions
+// permutation[c] is the rank in the row of the cell of index c in the Hnet_group;
+// cell_ranges is indexed like the cells of the Hnet_group.
+// positions is filled by place_convex_single_row. If that call reports an
+// infeasible problem, the largest std::int64_t is returned instead of a cost.
+inline std::int64_t optimize_convex_sequence(Hnet_group const & nets, std::vector<index_t> const & permutation, std::vector<int_t> & positions, std::vector<std::pair<int_t, int_t> > const & cell_ranges){
+    auto const cell_cnt = permutation.size();
+
+    // Widths and ranges of the cells, indexed by rank in the row
+    std::vector<int_t> row_widths(cell_cnt);
+    std::vector<std::pair<int_t, int_t> > row_ranges(cell_cnt);
+    for(index_t c=0; c<cell_cnt; ++c){
+        index_t const rank = permutation[c];
+        row_widths[rank] = nets.cell_widths[c];
+        row_ranges[rank] = cell_ranges[c];
+    }
+
+    std::vector<cell_bound> bounds;
+    std::vector<int_t> right_slopes(cell_cnt, 0);
+    for(index_t n=0; n<nets.nets.size(); ++n){
+        assert(nets.net_limits[n+1] > nets.net_limits[n]);
+        auto const & net = nets.nets[n];
+
+        // First and last cell of the net in the row, with the matching pin offset
+        index_t leftmost=std::numeric_limits<index_t>::max(), rightmost=0;
+        int_t leftmost_offs=0, rightmost_offs=0;
+        for(index_t p=nets.net_limits[n]; p<nets.net_limits[n+1]; ++p){
+            auto const & pin = nets.pins[p];
+            // Permutation: index in the Hnet_group to index in the row
+            index_t const rank = permutation[pin.cell_index];
+            if(rank < leftmost){
+                leftmost = rank;
+                leftmost_offs = pin.offset.min;
+            }
+            if(rank >= rightmost){
+                rightmost = rank;
+                rightmost_offs = pin.offset.max;
+            }
+        }
+
+        right_slopes[rightmost] += net.weight;
+        if(net.has_ext_pins){
+            bounds.push_back(cell_bound(leftmost, net.ext_pins.min - leftmost_offs, net.weight));
+            bounds.push_back(cell_bound(rightmost, net.ext_pins.max - rightmost_offs, net.weight));
+        }
+        else{
+            right_slopes[leftmost] -= net.weight;
+        }
+    }
+
+    if(not place_convex_single_row(row_widths, row_ranges, bounds, right_slopes, positions)){
+        return std::numeric_limits<std::int64_t>::max(); // Infeasible: return a very big cost
+    }
+
+    // The cost is computed from positions indexed by cell index in the Hnet_group
+    auto group_positions = positions;
+    for(index_t c=0; c<cell_cnt; ++c){
+        group_positions[c] = positions[permutation[c]];
+    }
+    return nets.get_cost(group_positions);
+}
+
+// TODO: take modified order relative to the obstacles into account
+// Optimizes the positions and the flippings of an ordered sequence of cells on
+// the same row.
+//   nets          the nets of the cells; its cells are indexed "in the group"
+//   permutation   permutation[c] is the rank in the row of the cell c of the group
+//   positions     output, indexed by rank in the row
+//   flippings     output, indexed by rank in the row: 1 if the cell is flipped, else 0
+//   flippability  indexed like the cells of the group: non-zero if the cell may be flipped
+//   cell_ranges   indexed like the cells of the group: (first, second) limits of
+//                 the cell, the width being removed from `second` below
+// Returns the cost given by nets.get_cost for the result, or the largest
+// std::int64_t if the ranges cannot be satisfied; positions and flippings are
+// left untouched in that case.
+inline std::int64_t optimize_noncvx_sequence(Hnet_group const & nets, std::vector<index_t> const & permutation, std::vector<int_t> & positions, std::vector<int> & flippings, std::vector<int> const & flippability, std::vector<std::pair<int_t, int_t> > const & cell_ranges){
+    // Get the widths of the cells in row order
+    std::vector<int_t> loc_widths(permutation.size());
+    std::vector<int> loc_flipps(permutation.size());
+    std::vector<std::pair<int_t, int_t> > loc_ranges(permutation.size());
+    for(index_t i=0; i<permutation.size(); ++i){
+         loc_widths[permutation[i]] = nets.cell_widths[i];
+         loc_ranges[permutation[i]] = cell_ranges[i];
+         loc_flipps[permutation[i]] = flippability[i];
+    }
+
+    // Tighten the ranges: a cell cannot begin before the end of the cells on its left...
+    int_t min_limit = std::numeric_limits<int_t>::min();
+    for(index_t i=0; i<loc_ranges.size(); ++i){
+        min_limit = std::max(loc_ranges[i].first, min_limit);
+        loc_ranges[i].first = min_limit;
+        min_limit += loc_widths[i];
+    }
+    // ...and must leave enough room for itself and the cells on its right
+    int_t max_limit = std::numeric_limits<int_t>::max();
+    for(index_t i=loc_ranges.size(); i>0; --i){
+        max_limit = std::min(loc_ranges[i-1].second, max_limit);
+        max_limit -= loc_widths[i-1];
+        loc_ranges[i-1].second = max_limit;
+    }
+
+    for(index_t i=0; i<loc_ranges.size(); ++i){
+        if(loc_ranges[i].first > loc_ranges[i].second){
+            return std::numeric_limits<std::int64_t>::max(); // Infeasible: return a very big cost
+        }
+    }
+
+    // One cost function per cell and per orientation, defined on the range of the cell
+    std::vector<piecewise_linear_function> unflipped_cost_functions, flipped_cost_functions;
+    for(index_t i=0; i<loc_ranges.size(); ++i){
+        auto cur = piecewise_linear_function(loc_ranges[i].first, loc_ranges[i].second);
+        unflipped_cost_functions.push_back(cur);
+        flipped_cost_functions.push_back(cur);
+    }
+
+    // Each net only contributes to the cost functions of its first and last cell in the row
+    for(index_t n=0; n<nets.nets.size(); ++n){
+        index_t fst_c=std::numeric_limits<index_t>::max(), lst_c=0;
+        int_t fst_pin_offs_mn=0, lst_pin_offs_mn=0,
+              fst_pin_offs_mx=0, lst_pin_offs_mx=0;
+
+        assert(nets.net_limits[n+1] > nets.net_limits[n]);
+        auto const & cur_net = nets.nets[n];
+        for(index_t p=nets.net_limits[n]; p<nets.net_limits[n+1]; ++p){
+            // Permutation: index in the Hnet_group to index in the row
+            index_t cur_cell = permutation[nets.pins[p].cell_index];
+            if(cur_cell < fst_c){
+                fst_c = cur_cell;
+                fst_pin_offs_mn = nets.pins[p].offset.min;
+                fst_pin_offs_mx = nets.pins[p].offset.max;
+            }
+            if(cur_cell >= lst_c){
+                lst_c = cur_cell;
+                lst_pin_offs_mn = nets.pins[p].offset.min;
+                lst_pin_offs_mx = nets.pins[p].offset.max;
+            }
+        }
+        // For a flipped cell, a pin at offset o is seen at offset width - o
+        if(cur_net.has_ext_pins){
+            unflipped_cost_functions[fst_c].add_bislope(-cur_net.weight, 0, cur_net.ext_pins.min - fst_pin_offs_mn);
+            unflipped_cost_functions[lst_c].add_bislope(0,  cur_net.weight, cur_net.ext_pins.max - lst_pin_offs_mx);
+            flipped_cost_functions[fst_c].add_bislope(-cur_net.weight, 0, cur_net.ext_pins.min - loc_widths[fst_c] + fst_pin_offs_mx);
+            flipped_cost_functions[lst_c].add_bislope(0,  cur_net.weight, cur_net.ext_pins.max - loc_widths[lst_c] + lst_pin_offs_mn);
+        }
+        else{
+            unflipped_cost_functions[fst_c].add_monotone(-cur_net.weight, -fst_pin_offs_mn);
+            unflipped_cost_functions[lst_c].add_monotone( cur_net.weight, -lst_pin_offs_mx);
+            flipped_cost_functions[fst_c].add_monotone(-cur_net.weight, fst_pin_offs_mx - loc_widths[fst_c] );
+            flipped_cost_functions[lst_c].add_monotone( cur_net.weight, lst_pin_offs_mn - loc_widths[lst_c] );
+        }
+    }
+
+    // Left to right accumulation of the costs, keeping for each cell the better orientation.
+    // NOTE(review): previous_min and previous_min_of_sum are declared outside this
+    // chunk; the assert loop below expects their results to be non-increasing.
+    std::vector<piecewise_linear_function> prev_mins, merged_costs;
+    for(index_t i=0; i<loc_ranges.size(); ++i){
+        merged_costs.push_back(loc_flipps[i] ?
+            piecewise_linear_function::minimum(unflipped_cost_functions[i], flipped_cost_functions[i])
+          : unflipped_cost_functions[i]
+        );
+
+        if(i>0){
+            prev_mins.push_back(prev_mins.back().previous_min_of_sum(merged_costs.back(), loc_widths[i-1]));
+        }
+        else{
+            prev_mins.push_back(merged_costs.back().previous_min());
+        }
+    }
+
+    for(auto const & M : prev_mins){
+        for(index_t i=0; i+1<M.point_values.size(); ++i){
+            assert(M.point_values[i].second >= M.point_values[i+1].second);
+        }
+    }
+
+    // The caller may reuse the output vectors between calls: resize alone would
+    // keep the flip flags of a previous call, so every flag is reset to 0 here
+    flippings.assign(cell_ranges.size(), 0);
+    positions.resize(cell_ranges.size(), 0);
+
+    // Right to left: place each cell, leaving room for its width before the next one
+    int_t pos = std::numeric_limits<int_t>::max();
+    for(index_t i=loc_ranges.size(); i>0; --i){
+        // Find the best position and flipping for each cell
+        pos = prev_mins[i-1].last_before(std::min(pos - loc_widths[i-1], loc_ranges[i-1].second) );
+        positions[i-1] = pos;
+
+        if(loc_flipps[i-1] and flipped_cost_functions[i-1].value_at(pos) < unflipped_cost_functions[i-1].value_at(pos)){
+            flippings[i-1] = 1;
+        }
+    }
+
+    for(index_t i=0; i<loc_ranges.size(); ++i){
+        assert(positions[i] >= loc_ranges[i].first);
+        assert(positions[i] <= loc_ranges[i].second);
+    }
+    for(index_t i=0; i+1<loc_ranges.size(); ++i){
+        assert(positions[i] + loc_widths[i] <= positions[i+1]);
+    }
+
+    // The cost is computed from values indexed by cell index in the Hnet_group
+    auto permuted_positions = positions;
+    auto permuted_flippings = flippings;
+    for(index_t i=0; i<permutation.size(); ++i){
+        permuted_positions[i] = positions[permutation[i]];
+        permuted_flippings[i] = flippings[permutation[i]];
+    }
+
+    return nets.get_cost(permuted_positions, permuted_flippings);
+}
+
+// Returns, for each of the given cells, the interval of the row it has to stay in.
+// `cells` must be non-empty and, as asserted, ordered from left to right without overlap.
+// A cell that is not XMovable or whose cell_height is not 1 is restricted to its
+// current span; any other cell gets the lower limit of the first cell and the
+// upper limit of the last one.
+std::vector<std::pair<int_t, int_t> > get_cell_ranges(netlist const & circuit, detailed_placement const & pl, std::vector<index_t> const & cells){
+    typedef std::pair<int_t, int_t> range_t;
+    std::vector<range_t> lims;
+
+    for(index_t i=1; i<cells.size(); ++i){
+        assert(pl.plt_.positions_[cells[i-1]].x_ + circuit.get_cell(cells[i-1]).size.x_ <= pl.plt_.positions_[cells[i]].x_);
+    }
+
+    // Extreme limits, except macros are allowed to be beyond the limit of the placement area
+    int_t const lower_lim = pl.get_limit_positions(circuit, cells.front()).first;
+    int_t const upper_lim = pl.get_limit_positions(circuit, cells.back()).second;
+
+    for(index_t const cur_cell : cells){
+        auto const attr = circuit.get_cell(cur_cell).attributes;
+        int_t const pos = pl.plt_.positions_[cur_cell].x_;
+
+        bool const stays_in_place = (attr & XMovable) == 0 or pl.cell_height(cur_cell) != 1;
+        if(stays_in_place){
+            lims.push_back(range_t(pos, pos + circuit.get_cell(cur_cell).size.x_));
+            continue;
+        }
+
+        assert(pos >= lower_lim);
+        assert(pos + circuit.get_cell(cur_cell).size.x_ <= upper_lim);
+        lims.push_back(range_t(lower_lim, upper_lim));
+    }
+
+    return lims;
+}
+
+// Optimizes every row of the placement with the order of its cells left unchanged.
+//   NON_CONVEX  also chooses the x orientation of the cells (optimize_noncvx_sequence)
+//               instead of only their positions (optimize_convex_sequence)
+//   RSMT        builds the nets with get_RSMT_netgroup instead of get_B2B_netgroup
+// A row for which the optimization reports an infeasible problem is left as it is.
+template<bool NON_CONVEX, bool RSMT>
+void OSRP_generic(netlist const & circuit, detailed_placement & pl){
+    for(index_t r=0; r<pl.row_cnt(); ++r){
+        // Complete optimization on a row, comprising possible obstacles
+
+        std::vector<index_t> cells;
+        std::vector<int> flippability;
+
+        // Get the movable cells, if we can flip them, and the obstacles on the row
+        for(index_t OSRP_cell = pl.get_first_cell_on_row(r); OSRP_cell != null_ind; OSRP_cell = pl.get_next_cell_on_row(OSRP_cell, r)){
+            auto attr = circuit.get_cell(OSRP_cell).attributes;
+            cells.push_back(OSRP_cell);
+            flippability.push_back( (attr & XFlippable) != 0 ? 1 : 0);
+        }
+
+        if(cells.empty()) continue;
+
+        std::vector<std::pair<int_t, int_t> > lims = get_cell_ranges(circuit, pl, cells); // Limit positions for each cell
+
+        Hnet_group nets = RSMT ?
+            get_RSMT_netgroup(circuit, pl, cells)
+         :  get_B2B_netgroup(circuit, pl, cells);
+
+        std::vector<index_t> no_permutation(cells.size());
+        for(index_t i=0; i<cells.size(); ++i) no_permutation[i] = i;
+
+        std::vector<int_t> final_positions;
+        std::vector<int> flipped;
+        std::int64_t const cost = NON_CONVEX ?
+            optimize_noncvx_sequence(nets, no_permutation, final_positions, flipped, flippability, lims)
+          : optimize_convex_sequence(nets, no_permutation, final_positions, lims);
+
+        // Both optimizers return the largest cost for an infeasible row; the
+        // non convex one then leaves its output vectors empty, so they must
+        // not be indexed
+        if(cost == std::numeric_limits<std::int64_t>::max()) continue;
+
+        // Update the positions and orientations
+        for(index_t i=0; i<cells.size(); ++i){
+            if(NON_CONVEX){
+                bool old_orient = pl.plt_.orientations_[cells[i]].x_;
+                pl.plt_.orientations_[cells[i]].x_ = flipped[i] ? not old_orient : old_orient;
+            }
+            pl.plt_.positions_[cells[i]].x_ = final_positions[i];
+        }
+    } // Iteration on the rows
+
+    pl.selfcheck();
+}
+
+// Reorders the cells of each row inside sliding windows of at most `range`
+// consecutive cells: every permutation of a window is evaluated and the best
+// one is applied. The next window begins at the middle cell (index range/2) of
+// the current window after reordering, so consecutive windows overlap.
+//   NON_CONVEX  also chooses the x orientation of the cells
+//   RSMT        builds the nets with get_RSMT_netgroup instead of get_B2B_netgroup
+// range must be at least 2 (asserted).
+template<bool NON_CONVEX, bool RSMT>
+void swaps_row_generic(netlist const & circuit, detailed_placement & pl, index_t range){
+    assert(range >= 2);
+
+    for(index_t r=0; r<pl.row_cnt(); ++r){
+        index_t OSRP_cell = pl.get_first_cell_on_row(r);
+
+        while(OSRP_cell != null_ind){
+            std::vector<index_t> cells;
+            std::vector<int> flippables;
+
+            // Take up to `range` cells; OSRP_cell ends on the cell after the window
+            for(index_t nbr_cells=0;
+                    OSRP_cell != null_ind
+                and nbr_cells < range;
+                OSRP_cell = pl.get_next_cell_on_row(OSRP_cell, r), ++nbr_cells
+            ){
+                cells.push_back(OSRP_cell);
+                flippables.push_back( (circuit.get_cell(OSRP_cell).attributes & XFlippable) != 0);
+            }
+
+            if(not cells.empty()){
+                std::vector<std::pair<int_t, int_t> > lims = get_cell_ranges(circuit, pl, cells); // Limit positions for each cell
+
+                Hnet_group nets = RSMT ?
+                    get_RSMT_netgroup(circuit, pl, cells)
+                 :  get_B2B_netgroup(circuit, pl, cells);
+
+                std::int64_t best_cost = std::numeric_limits<std::int64_t>::max();
+                std::vector<int_t> positions(cells.size());
+                std::vector<int>   flippings(cells.size());
+                std::vector<int_t> best_positions(cells.size());
+                std::vector<int>   best_flippings(cells.size());
+
+                std::vector<index_t> permutation(cells.size());
+                for(index_t i=0; i<cells.size(); ++i) permutation[i] = i;
+                std::vector<index_t> best_permutation;
+
+                // Check every possible permutation of the cells
+                do{
+                    // The flags are indexed by rank in the row: those of the
+                    // previous permutation must not leak into this evaluation
+                    if(NON_CONVEX) flippings.assign(cells.size(), 0);
+
+                    std::int64_t cur_cost = NON_CONVEX ?
+                        optimize_noncvx_sequence(nets, permutation, positions, flippings, flippables, lims) :
+                        optimize_convex_sequence(nets, permutation, positions, lims);
+                    if(cur_cost <= best_cost){
+                        best_cost = cur_cost;
+                        best_permutation = permutation;
+                        best_flippings = flippings;
+                        best_positions = positions;
+                    }
+                }while(std::next_permutation(permutation.begin(), permutation.end()));
+
+                // The current order is legal, so at least one permutation is feasible;
+                // check it before the result is written to the placement
+                assert(best_cost < std::numeric_limits<std::int64_t>::max());
+
+                std::vector<index_t> new_cell_order(cells.size());
+                // Update the positions and the topology
+                for(index_t i=0; i<cells.size(); ++i){
+                    index_t r_ind = best_permutation[i]; // In the row from in the Hnet_group
+                    new_cell_order[r_ind] = cells[i];
+                    pl.plt_.positions_[cells[i]].x_ = best_positions[r_ind];
+                    if(NON_CONVEX){
+                        bool old_orient = pl.plt_.orientations_[cells[i]].x_;
+                        pl.plt_.orientations_[cells[i]].x_ = best_flippings[r_ind] ? not old_orient : old_orient;
+                    }
+                }
+
+                pl.reorder_cells(cells, new_cell_order, r);
+                cells = new_cell_order;
+            }
+
+            // Cells remain on the row: restart from the middle of the reordered window
+            if(OSRP_cell != null_ind){
+                assert(cells.size() == range);
+                OSRP_cell = cells[range/2];
+            }
+        } // Iteration on the entire row
+    } // Iteration on the rows
+
+    pl.selfcheck();
+}
+} // End anonymous namespace
+
+// Entry points: each one instantiates OSRP_generic or swaps_row_generic with
+// the template arguments <NON_CONVEX, RSMT> matching its name
+// (noncvx -> NON_CONVEX = true, RSMT -> RSMT = true).
+void OSRP_convex_HPWL(netlist const & circuit, detailed_placement & pl){
+    OSRP_generic<false, false>(circuit, pl);
+}
+void OSRP_convex_RSMT(netlist const & circuit, detailed_placement & pl){
+    OSRP_generic<false, true>(circuit, pl);
+}
+void OSRP_noncvx_HPWL(netlist const & circuit, detailed_placement & pl){
+    OSRP_generic<true, false>(circuit, pl);
+}
+void OSRP_noncvx_RSMT(netlist const & circuit, detailed_placement & pl){
+    OSRP_generic<true, true>(circuit, pl);
+}
+void swaps_row_convex_HPWL(netlist const & circuit, detailed_placement & pl, index_t range){
+    swaps_row_generic<false, false>(circuit, pl, range);
+}
+void swaps_row_convex_RSMT(netlist const & circuit, detailed_placement & pl, index_t range){
+    swaps_row_generic<false, true>(circuit, pl, range);
+}
+void swaps_row_noncvx_HPWL(netlist const & circuit, detailed_placement & pl, index_t range){
+    swaps_row_generic<true, false>(circuit, pl, range);
+}
+void swaps_row_noncvx_RSMT(netlist const & circuit, detailed_placement & pl, index_t range){
+    swaps_row_generic<true, true>(circuit, pl, range);
+}
+
+} // namespace dp
+} // namespace coloquinte
+
+
--- a/coloquinte/src/solvers.cxx
+++ b/coloquinte/src/solvers.cxx
@ -0,0 +1,382 @@
+
+#include "coloquinte/solvers.hxx"
+
+#include <cassert>
+#include <stdexcept>
+
+namespace coloquinte{
+namespace gp{
+
+// Sum of two systems that share their first internal_size() variables.
+// The shared part of the targets is added; the remaining variables of *this
+// keep their indexes and those of o are shifted after them. The matrix
+// triplets of o are renumbered the same way and appended to those of *this.
+// Throws std::runtime_error if the two internal sizes differ.
+linear_system linear_system::operator+(linear_system const & o) const{
+    if(internal_size() != o.internal_size()){ throw std::runtime_error("Mismatched system sizes"); }
+
+    // Shift applied to the indexes of o that are not shared
+    auto const shift = target_.size() - internal_size();
+    linear_system ret(target_.size() + o.target_.size() - internal_size(), internal_size());
+
+    ret.matrix_ = matrix_;
+    for(matrix_triplet shifted : o.matrix_){
+        if(shifted.c_ >= internal_size()){
+            shifted.c_ += shift;
+        }
+        if(shifted.r_ >= internal_size()){
+            shifted.r_ += shift;
+        }
+        ret.matrix_.push_back(shifted);
+    }
+
+    // Shared variables: both targets are added
+    for(index_t i=0; i<internal_size(); ++i){
+        ret.target_[i] = target_[i] + o.target_[i];
+    }
+    // Other variables of *this, at the same index
+    for(index_t i=internal_size(); i<target_.size(); ++i){
+        ret.target_[i] = target_[i];
+    }
+    // Other variables of o, after those of *this
+    for(index_t i=internal_size(); i<o.target_.size(); ++i){
+        ret.target_[i + shift] = o.target_[i];
+    }
+
+    return ret;
+}
+
+
+// The classical compressed sparse row storage
+// As asserted by the constructor, there is one value per column index, one
+// diag entry per row and one more row limit than rows.
+struct csr_matrix{
+    std::vector<std::uint32_t> row_limits;
+    std::vector<std::uint32_t> col_indexes;
+    std::vector<float> values;
+    std::vector<float> diag;
+
+    csr_matrix(std::vector<std::uint32_t> const & row_l, std::vector<std::uint32_t> const & col_i, std::vector<float> const & vals, std::vector<float> const D)
+      : row_limits(row_l)
+      , col_indexes(col_i)
+      , values(vals)
+      , diag(D)
+    {
+        assert(values.size() == col_indexes.size());
+        assert(diag.size()+1 == row_limits.size());
+    }
+
+    std::vector<float> mul(std::vector<float> const & x) const;
+    std::vector<float> solve_CG(std::vector<float> const & goal, std::vector<float> guess, std::uint32_t min_iter, std::uint32_t max_iter, float tol) const;
+};
+
+// A matrix with successive rows padded to the same length and accessed column-major; hopefully a little better
+// Rows come in groups of unroll_len. As asserted by the constructor, diag holds
+// unroll_len entries per group, row_limits has one more entry than groups, and
+// values and col_indexes hold unroll_len * row_limits.back() entries each.
+template<std::uint32_t unroll_len>
+struct ellpack_matrix{
+    std::vector<std::uint32_t> row_limits;
+    std::vector<std::uint32_t> col_indexes;
+    std::vector<float> values;
+    std::vector<float> diag;
+
+    ellpack_matrix(std::vector<std::uint32_t> const & row_l, std::vector<std::uint32_t> const & col_i, std::vector<float> const & vals, std::vector<float> const D)
+      : row_limits(row_l)
+      , col_indexes(col_i)
+      , values(vals)
+      , diag(D)
+    {
+        assert(values.size() == col_indexes.size());
+        assert(diag.size() % unroll_len == 0);
+        assert((row_limits.size()-1) * unroll_len == diag.size() );
+        assert(row_limits.back() * unroll_len == values.size());
+        assert(values.size() % unroll_len == 0);
+        assert(col_indexes.size() % unroll_len == 0);
+    }
+
+    std::vector<float> mul(std::vector<float> const & x) const;
+    std::vector<float> solve_CG(std::vector<float> goal, std::vector<float> guess, std::uint32_t min_iter, std::uint32_t max_iter, float tol) const;
+};
+
+// The proxy matrix for compressed sparse storage
+// Built from triplets, it keeps (column, value) doublets grouped by row and
+// can be converted to a csr_matrix or an ellpack_matrix.
+class doublet_matrix{
+    private:
+    // Row i is stored in doublets[row_limits[i]] ... doublets[row_limits[i+1]-1]
+    std::vector<std::uint32_t> row_limits;
+    std::vector<matrix_doublet> doublets;
+    // Number of rows
+    std::uint32_t size;
+
+    // Common part of the conversions: fills, for each row, its number of
+    // elements outside the diagonal, those elements and its diagonal value
+    void get_compressed(std::vector<std::uint32_t> & limits, std::vector<matrix_doublet> & elements, std::vector<float> & diag) const;
+
+    public:
+    doublet_matrix(std::vector<matrix_triplet> const & triplets, std::uint32_t size);
+
+    csr_matrix get_compressed_matrix() const;
+
+    template<std::uint32_t unroll_len>
+    ellpack_matrix<unroll_len> get_ellpack_matrix() const;
+};
+
+// Groups the triplets by row with a counting sort; the triplets of a row keep
+// their relative order. Every row index must be lower than n.
+doublet_matrix::doublet_matrix(std::vector<matrix_triplet> const & triplets, std::uint32_t n) : size(n){
+    row_limits.resize(size+1, 0);
+
+    // Step 1: row_limits[r+1] counts the triplets of row r
+    for(matrix_triplet const & t : triplets){
+        ++row_limits[t.r_+1];
+    }
+
+    // Step 2: row_limits[r+1] becomes the position of the beginning of row r
+    std::uint32_t running_total=0;
+    for(std::uint32_t i=1; i<n+1; ++i){
+        std::uint32_t const row_size = row_limits[i];
+        row_limits[i] = running_total;
+        running_total += row_size;
+    }
+    assert(running_total == triplets.size());
+
+    // Now we know the size and can allocate storage for the indices and values
+    doublets.resize(running_total);
+
+    // Step 3: each triplet goes to the next free slot of its row, so that
+    // row_limits[r+1] ends up as the end position of row r
+    for(matrix_triplet const & t : triplets){
+        std::uint32_t & next_free = row_limits[t.r_+1];
+        doublets[next_free] = matrix_doublet(t.c_, t.val_);
+        ++next_free;
+    }
+}
+
+// Compresses each row: the doublets are sorted by column, those sharing a
+// column are summed, and the diagonal is separated from the other elements.
+//   sizes     resized to one entry per row: number of elements of the row outside the diagonal
+//   elements  the elements outside the diagonal are appended to it, row after row
+//   diag      resized to one entry per row and set to the diagonal value of
+//             each row that has one (entries added by the resize are 0)
+// A row without any doublet has a size of 0 and adds nothing to elements.
+void doublet_matrix::get_compressed(std::vector<std::uint32_t> & sizes, std::vector<matrix_doublet> & elements, std::vector<float> & diag) const{
+    assert(size+1 == row_limits.size());
+    sizes.resize(size);
+    diag.resize(size, 0.0);
+    std::vector<matrix_doublet> tmp_doublets = doublets;
+
+    for(std::uint32_t i=0; i<size; ++i){
+        std::uint32_t const row_begin = row_limits[i];
+        std::uint32_t const row_end   = row_limits[i+1];
+
+        // Empty row: tmp_doublets[row_begin] would belong to the next row, or be out of bounds
+        if(row_begin == row_end){
+            sizes[i] = 0;
+            continue;
+        }
+
+        // Sort the elements in the row
+        std::sort(tmp_doublets.begin() + row_begin, tmp_doublets.begin() + row_end);
+
+        // Compress them and extract the diagonal
+        std::uint32_t l=0;
+        // Stores a completed doublet either as the diagonal or as an element of the row
+        auto store = [&](matrix_doublet const & d){
+            if(i != d.c_){
+                elements.push_back(d);
+                ++l;
+            }
+            else{
+                diag[i] = d.val_;
+            }
+        };
+
+        matrix_doublet cur(tmp_doublets[row_begin]);
+        for(std::uint32_t j=row_begin+1; j<row_end; ++j){
+            if(tmp_doublets[j].c_ == cur.c_){
+                cur.val_ += tmp_doublets[j].val_;
+            }
+            else{
+                store(cur);
+                cur = tmp_doublets[j];
+            }
+        }
+        store(cur);
+        sizes[i] = l;
+    }
+}
+
+// Converts the matrix to compressed sparse row storage, with the diagonal
+// kept apart from the other elements.
+csr_matrix doublet_matrix::get_compressed_matrix() const{
+    std::vector<matrix_doublet> elements;
+    std::vector<std::uint32_t> row_sizes;
+    std::vector<float> diagonal;
+    get_compressed(row_sizes, elements, diagonal);
+
+    // Get the limits of each row: prefix sums of the row sizes
+    std::vector<std::uint32_t> limits(row_limits.size());
+    limits[0] = 0;
+    for(std::uint32_t row=0; row<size; ++row){
+        limits[row+1] = limits[row] + row_sizes[row];
+    }
+
+    // Store the doublets to the sparse storage: columns and values in separate arrays
+    std::vector<std::uint32_t> columns;
+    std::vector<float> vals;
+    columns.reserve(elements.size());
+    vals.reserve(elements.size());
+    for(matrix_doublet const & element : elements){
+        columns.push_back(element.c_);
+        vals.push_back(element.val_);
+    }
+
+    return csr_matrix(limits, columns, vals, diagonal);
+}
+
+// Build the ELLPACK-like form of the matrix: rows are handled in groups of unroll_len,
+// every row of a group is padded with zero entries up to the widest row of the group,
+// and the entries of a group are interleaved (slot-major, row-minor).
+// The row count is rounded up to a multiple of unroll_len; the added rows are empty
+// and receive a diagonal of 1.0.
+template<std::uint32_t unroll_len>
+ellpack_matrix<unroll_len> doublet_matrix::get_ellpack_matrix() const{
+    std::vector<matrix_doublet> entries;
+    std::vector<std::uint32_t> row_sizes;
+    std::vector<float> diagonal;
+    get_compressed(row_sizes, entries, diagonal);
+
+    // Number of row groups, rounded up
+    std::uint32_t group_cnt = diagonal.size() / unroll_len;
+    if(diagonal.size() % unroll_len != 0){
+        ++group_cnt;
+    }
+    row_sizes.resize(unroll_len * group_cnt, 0);
+    diagonal.resize(unroll_len * group_cnt, 1.0);
+
+    // Each group is as wide as its widest row
+    std::vector<std::uint32_t> limits(group_cnt+1);
+    limits[0] = 0;
+    for(std::uint32_t g=0; g<group_cnt; ++g){
+        auto const first = row_sizes.begin() + unroll_len*g;
+        limits[g+1] = limits[g] + *std::max_element(first, first + unroll_len);
+    }
+
+    std::vector<std::uint32_t> columns(unroll_len * limits.back());
+    std::vector<float> coefs(unroll_len * limits.back());
+
+    std::uint32_t next = 0; // Next compressed entry to store
+    for(std::uint32_t g=0; g<group_cnt; ++g){
+        std::uint32_t const begin = limits[g];
+        std::uint32_t const width = limits[g+1] - begin;
+        for(std::uint32_t lane=0; lane<unroll_len; ++lane){ // For every line of the group
+            std::uint32_t const filled = row_sizes[unroll_len*g + lane];
+            for(std::uint32_t slot=0; slot<width; ++slot){
+                std::uint32_t const pos = unroll_len * (begin+slot) + lane;
+                if(slot < filled){ // A non-zero value
+                    columns[pos] = entries[next].c_;
+                    coefs[pos] = entries[next].val_;
+                    ++next;
+                }
+                else{ // A padding zero
+                    columns[pos] = 0;
+                    coefs[pos] = 0;
+                }
+            }
+        }
+    }
+
+    return ellpack_matrix<unroll_len>(limits, columns, coefs, diagonal);
+}
+
+// Matrix-vector product: the diagonal term first, then the off-diagonal entries of
+// each row in storage order. x must have one value per row.
+std::vector<float> csr_matrix::mul(std::vector<float> const & x) const{
+    assert(x.size() == diag.size());
+    std::vector<float> res(x.size());
+    for(std::uint32_t row=0; row<diag.size(); ++row){
+        float & out = res[row];
+        out = diag[row] * x[row];
+        std::uint32_t const row_end = row_limits[row+1];
+        for(std::uint32_t pos=row_limits[row]; pos<row_end; ++pos){
+            out += values[pos] * x[col_indexes[pos]];
+        }
+    }
+    return res;
+}
+
+// Matrix-vector product on the interleaved storage: unroll_len rows are processed
+// together, each with its own accumulator. x must have one value per (padded) row.
+template<std::uint32_t unroll_len>
+std::vector<float> ellpack_matrix<unroll_len>::mul(std::vector<float> const & x) const{
+    assert(x.size() % unroll_len == 0);
+    assert(x.size() == diag.size());
+    std::vector<float> res(x.size());
+    for(std::uint32_t group=0; group+1<row_limits.size(); ++group){
+        std::uint32_t const first_row = unroll_len*group;
+        float acc[unroll_len];
+        // Diagonal contribution
+        for(std::uint32_t lane=0; lane<unroll_len; ++lane){
+            acc[lane] = diag[first_row+lane] * x[first_row+lane];
+        }
+        // Off-diagonal contributions, one slot of every row at a time
+        for(std::uint32_t slot=row_limits[group]; slot<row_limits[group+1]; ++slot){
+            std::uint32_t const base = unroll_len*slot;
+            for(std::uint32_t lane=0; lane<unroll_len; ++lane){
+                acc[lane] += values[base+lane] * x[col_indexes[base+lane]];
+            }
+        }
+        for(std::uint32_t lane=0; lane<unroll_len; ++lane){
+            res[first_row+lane] = acc[lane];
+        }
+    }
+    return res;
+}
+
+// Dot product of two vectors of the same size.
+// The bulk of the vectors is accumulated in unroll_len independent partial sums (element
+// i goes to partial sum i % unroll_len), which are then added in order; the elements
+// that do not fill a complete group of unroll_len are added last.
+// All indices are unsigned: no signed/unsigned comparison and no truncation of the size.
+template<std::uint32_t unroll_len>
+float dot_prod(std::vector<float> const & a, std::vector<float> const & b){
+    assert(a.size() == b.size());
+    std::size_t const n = a.size();
+    std::size_t const blocked = unroll_len * (n / unroll_len); // Elements covered by complete groups
+
+    float vals[unroll_len];
+    for(std::uint32_t j=0; j<unroll_len; ++j) vals[j] = 0.0f;
+    for(std::size_t base=0; base<blocked; base += unroll_len){
+        for(std::uint32_t j=0; j<unroll_len; ++j){
+            vals[j] += a[base + j] * b[base + j];
+        }
+    }
+
+    float res = 0.0f;
+    for(std::uint32_t j=0; j<unroll_len; ++j) res += vals[j];
+    for(std::size_t i=blocked; i<n; ++i){
+        res += a[i] * b[i];
+    }
+    return res;
+}
+
+// Conjugate gradient preconditioned by the inverse of the diagonal.
+//   goal       right-hand side
+//   x          initial guess, refined and returned
+//   min_iter   the tolerance test is ignored before this many iterations
+//   max_iter   hard limit on the number of iterations
+//   tol_ratio  stop once dot(r, z) has dropped to tol_ratio times its initial value
+// The iteration also stops as soon as one of the scalars driving the update is
+// non-finite or not larger than the smallest positive normal value.
+std::vector<float> csr_matrix::solve_CG(std::vector<float> const & goal, std::vector<float> x, std::uint32_t min_iter, std::uint32_t max_iter, float tol_ratio) const{
+    std::uint32_t const n = diag.size();
+    assert(goal.size() == n);
+    assert(x.size() == n);
+
+    std::vector<float> residual = mul(x);
+    std::vector<float> inv_diag(n), scaled(n), direction(n), image;
+    for(std::uint32_t i=0; i<n; ++i){
+        residual[i] = goal[i] - residual[i];
+        inv_diag[i] = 1.0/diag[i];
+        assert(std::isfinite(inv_diag[i]));
+        scaled[i] = inv_diag[i] * residual[i];
+        direction[i] = scaled[i];
+    }
+
+    float rz = dot_prod<16>(residual, scaled);
+    assert(std::isfinite(rz));
+    float_t const epsilon = std::numeric_limits<float_t>::min();
+    float const initial_rz = rz;
+
+    for(std::uint32_t iter=0; iter < max_iter; ++iter){
+        image = mul(direction);
+
+        float_t const curvature = dot_prod<16>(direction, image);
+        float_t const step = rz / curvature;
+
+        // Give up when the update would not be meaningful
+        bool const usable =
+                std::isfinite(rz) and std::isfinite(step) and std::isfinite(curvature)
+            and rz > epsilon and step > epsilon and curvature > epsilon;
+        if(not usable){
+            break;
+        }
+
+        // Move along the search direction and refresh the residuals
+        for(std::uint32_t i=0; i<n; ++i){
+            x[i] += step * direction[i];
+            residual[i] -= step * image[i];
+            scaled[i] = inv_diag[i] * residual[i];
+        }
+        float const next_rz = dot_prod<16>(residual, scaled);
+
+        bool const converged = next_rz <= tol_ratio * initial_rz;
+        if(iter >= min_iter && converged){
+            break;
+        }
+
+        // New search direction
+        float const ratio = next_rz / rz;
+        rz = next_rz;
+        for(std::uint32_t i=0; i<n; ++i){
+            direction[i] = scaled[i] + ratio * direction[i];
+        }
+    }
+
+    return x;
+}
+
+// Conjugate gradient preconditioned by the inverse of the diagonal, on the padded storage.
+//   goal       right-hand side (padded with zeroes up to the padded row count)
+//   x          initial guess (padded the same way); the result is cut back to its original size
+//   min_iter   the tolerance test is ignored before this many iterations
+//   max_iter   hard limit on the number of iterations
+//   tol_ratio  stop once dot(r, z) has dropped to tol_ratio times its initial value
+// Like the CSR solver, the iteration stops as soon as one of the scalars driving the
+// update is non-finite or not larger than the smallest positive normal value: without
+// this, an exactly converged system yields 0/0 and fills the result with NaN.
+template<std::uint32_t unroll_len>
+std::vector<float> ellpack_matrix<unroll_len>::solve_CG(std::vector<float> goal, std::vector<float> x, std::uint32_t min_iter, std::uint32_t max_iter, float tol_ratio) const{
+    std::uint32_t n = diag.size();
+    std::uint32_t old_n = x.size();
+    assert(goal.size() == x.size());
+    x.resize(diag.size(), 0.0);
+    goal.resize(diag.size(), 0.0);
+
+    std::vector<float> r, p(n), z(n), mul_res, preconditioner(n);
+    r = mul(x);
+    for(uint32_t i=0; i<n; ++i){
+        r[i] = goal[i] - r[i];
+        preconditioner[i] = 1.0/diag[i];
+        z[i] = preconditioner[i] * r[i];
+        p[i] = z[i];
+    }
+
+    float cross_norm = dot_prod<unroll_len>(r, z);
+    float const epsilon = std::numeric_limits<float>::min();
+    float start_norm = cross_norm;
+    for(uint32_t k=0; k < max_iter; ++k){
+        mul_res = mul(p);
+        float pr_prod = dot_prod<unroll_len>(p, mul_res);
+        float alpha = cross_norm / pr_prod;
+
+        // Numerical breakdown or exact convergence: keep the current iterate
+        if(
+            not std::isfinite(cross_norm) or not std::isfinite(alpha) or not std::isfinite(pr_prod)
+            or cross_norm <= epsilon or alpha <= epsilon or pr_prod <= epsilon
+            ){
+            break;
+        }
+
+        // Update the result
+        for(uint32_t i=0; i<n; ++i){
+            x[i] = x[i] + alpha * p[i];
+            r[i] = r[i] - alpha * mul_res[i];
+            z[i] = preconditioner[i] * r[i];
+        }
+        float new_cross_norm = dot_prod<unroll_len>(r, z);
+
+        // Update the scaled residual and the search direction
+        if(k >= min_iter && new_cross_norm <= tol_ratio * start_norm){
+            break;
+        }
+        float beta = new_cross_norm / cross_norm;
+        cross_norm = new_cross_norm;
+        for(uint32_t i=0; i<n; ++i)
+            p[i] = z[i] + beta * p[i];
+    }
+    // Drop the padding rows
+    x.resize(old_n);
+    return x;
+}
+
+// Solve the system with exactly nbr_iter conjugate gradient iterations (the minimum and
+// maximum iteration counts are both nbr_iter and the tolerance is 0), starting from guess.
+// The guess is padded with zeroes up to the size of the target; the solution is resized
+// to internal_size() before being returned.
+std::vector<float_t> linear_system::solve_CG(std::vector<float_t> guess, index_t nbr_iter){
+    // The ELLPACK storage (get_ellpack_matrix<16>) is an alternative to the CSR storage used here
+    csr_matrix const mat = doublet_matrix(matrix_, size()).get_compressed_matrix();
+    guess.resize(target_.size(), 0.0);
+    std::vector<float> solution = mat.solve_CG(target_, guess, nbr_iter, nbr_iter, 0.0);
+    solution.resize(internal_size());
+    return solution;
+}
+
+}
+}
+
+
+
--- a/coloquinte/src/topologies.cxx
+++ b/coloquinte/src/topologies.cxx
@ -0,0 +1,513 @@
+
+#include "coloquinte/topologies.hxx"
+#include "coloquinte/circuit_helper.hxx"
+#include "coloquinte/union_find.hxx"
+
+#include <algorithm>
+#include <cassert>
+#include <set>
+#include <functional>
+
+namespace coloquinte{
+using edge_t = std::pair<index_t, index_t>;
+
+namespace{
+// Interval [min, max] on one axis.
+// The default constructor leaves both bounds uninitialized.
+struct minmax_t{
+    int_t min, max;
+
+    minmax_t() {}
+    minmax_t(int_t lo, int_t hi) : min(lo), max(hi) {}
+
+    // Extend the interval just enough to touch the interval o: the lower bound only
+    // goes down to o.max and the upper bound only goes up to o.min. This is NOT the
+    // union of the two intervals; an interval that already intersects o is unchanged.
+    void merge(minmax_t const o){
+        if(o.max < min) min = o.max;
+        if(o.min > max) max = o.min;
+    }
+
+    // Extend the interval so that it contains the coordinate p
+    void merge(int_t const p){
+        if(p < min) min = p;
+        if(p > max) max = p;
+    }
+};
+}
+
+namespace steiner_lookup{
+
+// Wirelength of this connectivity for the given points (expected sorted by x, see the name
+// of the parameter). Every point but the first and the last owns a y interval, grown to
+// reach whatever is connected to it; the cost is the sum of the x distances of the
+// connexions plus the sum of the lengths of these intervals.
+// In extremes and in every connexion, the low 4 bits hold the inner index of the source
+// and the high 4 bits the inner index of the destination (inner index k is point k+1).
+template<int pin_cnt>
+int_t Hconnectivity<pin_cnt>::get_wirelength(std::array<point<int_t>, pin_cnt> const sorted_points) const{
+    // One y interval per inner point, reduced to the point itself
+    std::array<minmax_t, pin_cnt-2> spans;
+    for(index_t inner=0; inner<pin_cnt-2; ++inner){
+        int_t const y = sorted_points[inner+1].y_;
+        spans[inner] = minmax_t(y, y);
+    }
+
+    // Connexions of the first and of the last point
+    std::uint8_t const first_con = extremes & 15u;
+    std::uint8_t const last_con  = extremes >> 4;
+    spans[first_con].merge(sorted_points.front().y_);
+    spans[last_con].merge(sorted_points.back().y_);
+
+    int_t cost = sorted_points.back().x_ - sorted_points.front().x_ + sorted_points[first_con+1].x_ - sorted_points[last_con+1].x_;
+
+    // Connexions between inner points: grow the destination interval, pay the x distance
+    for(std::uint8_t const E : connexions){
+        std::uint8_t const from = E & 15u;
+        std::uint8_t const to   = E >> 4;
+        spans[to].merge(spans[from]);
+        cost += std::abs(sorted_points[to+1].x_ - sorted_points[from+1].x_);
+    }
+
+    // Vertical part
+    for(minmax_t const & span : spans){
+        cost += (span.max - span.min);
+    }
+    return cost;
+}
+
+// Edges of this connectivity, as pairs of positions in the sorted point array: first the
+// connexion of point 0, then the connexion of the last point, then the inner connexions
+// in storage order (source in the low 4 bits, destination in the high 4 bits, both
+// shifted by one to skip point 0). The coordinates of the points are not used.
+template<int pin_cnt>
+std::array<edge_t, pin_cnt-1> Hconnectivity<pin_cnt>::get_x_topology(std::array<point<int_t>, pin_cnt> const sorted_points) const{
+    std::uint8_t const first_con = extremes & 15u;
+    std::uint8_t const last_con  = extremes >> 4;
+
+    std::array<edge_t, pin_cnt-1> edges;
+    edges[0] = edge_t(0, first_con+1);
+    edges[1] = edge_t(pin_cnt-1, last_con+1);
+    for(index_t k=0; k<pin_cnt-3; ++k){
+        std::uint8_t const packed = connexions[k];
+        std::uint8_t const from = packed & 15u;
+        std::uint8_t const to   = packed >> 4;
+        edges[k+2] = edge_t(from+1, to+1);
+    }
+    return edges;
+}
+} // End namespace steiner_lookup
+
+namespace {
+
+// Smallest wirelength over every connectivity of the lookup table, for the first n pins
+// (expected sorted by x). pins must hold at least n points.
+template<int n, int array_size>
+int_t get_wirelength_from_sorted(std::vector<point<int_t> > const & pins, std::array<steiner_lookup::Hconnectivity<n>, array_size> const & lookups){
+    std::array<point<int_t>, n> sorted;
+    std::copy_n(pins.begin(), n, sorted.begin());
+
+    int_t best = std::numeric_limits<int_t>::max();
+    for(steiner_lookup::Hconnectivity<n> const & candidate : lookups){
+        int_t const length = candidate.get_wirelength(sorted);
+        if(length < best){
+            best = length;
+        }
+    }
+    return best;
+}
+
+// Wirelength of a horizontal topology: each point owns a y interval, initially reduced to
+// itself; the edges are processed in order and each one grows the interval of its second
+// point until it reaches the interval of its first point. The result is the sum of the
+// x distances of the edges plus the sum of the lengths of the intervals.
+std::int64_t get_wirelength_from_topo(std::vector<point<int_t> > const & points, std::vector<std::pair<index_t, index_t> > Htopo){
+    std::vector<minmax_t> spans;
+    spans.reserve(points.size());
+    for(point<int_t> const & pt : points){
+        spans.push_back(minmax_t(pt.y_, pt.y_));
+    }
+
+    std::int64_t cost = 0;
+    for(edge_t const & E : Htopo){
+        spans[E.second].merge(spans[E.first]);
+        cost += std::abs(points[E.first].x_ - points[E.second].x_);
+    }
+    for(minmax_t const & span : spans){
+        cost += (span.max - span.min);
+    }
+    return cost;
+}
+
+// A point that remembers its position in the container it was taken from, so that the
+// original numbering can be recovered after a sort.
+// The default constructor does not set index.
+struct indexed_pt : point<int_t>{
+    index_t index;
+
+    indexed_pt(){}
+    indexed_pt(point<int_t> coords, index_t origin) : point<int_t>(coords), index(origin) {}
+};
+
+// Edges of the cheapest connectivity of the lookup table for the first n pins (expected
+// sorted by x); on equal costs the first connectivity of the table wins.
+// The edges are pairs of positions in pins.
+template<int n, int array_size>
+std::vector<std::pair<index_t, index_t> > get_topology_from_sorted(std::vector<point<int_t> > const & pins, std::array<steiner_lookup::Hconnectivity<n>, array_size> const & lookups){
+    std::array<point<int_t>, n> sorted;
+    std::copy_n(pins.begin(), n, sorted.begin());
+
+    index_t const none = std::numeric_limits<index_t>::max();
+    index_t best = none;
+    int_t best_cost = std::numeric_limits<int_t>::max();
+    for(index_t candidate=0; candidate<array_size; ++candidate){
+        int_t const candidate_cost = lookups[candidate].get_wirelength(sorted);
+        if(candidate_cost < best_cost){
+            best_cost = candidate_cost;
+            best = candidate;
+        }
+    }
+    assert(best != none);
+
+    auto const chosen = lookups[best].get_x_topology(sorted);
+    std::vector<std::pair<index_t, index_t> > result;
+    result.assign(chosen.begin(), chosen.end());
+    return result;
+}
+
+// Build a second list of edges over the same pins, driven by the edges of Htopo taken in order.
+//   pins   the points; the indices used in Htopo are positions in this vector
+//   Htopo  ordered edges (first, second); must not be empty, since the final step reads Htopo.back()
+// Returns edges expressed with positions in pins.
+//
+// Internally every pin is replaced by its rank in the y ordering, and each pin owns a
+// chain of ranks: min_y_pin[p] is the head of the chain of p, max_y_pin[p] the rank
+// recorded as its upper end, and nxt_y_pin[r] the rank following r in its chain
+// (null_ind when there is none). Each chain initially holds the pin alone.
+// NOTE(review): the chains appear to be kept in increasing rank order - confirm.
+std::vector<edge_t> get_vertical_topology(std::vector<point<int_t> > pins, std::vector<edge_t> const & Htopo){
+    index_t const null_ind = std::numeric_limits<index_t>::max();
+
+    // Attach to each point its position in pins, then order the points by y
+    std::vector<indexed_pt> ipoints(pins.size());
+    for(index_t i=0; i<pins.size(); ++i){
+        ipoints[i] = indexed_pt(pins[i], i);
+    }
+
+    std::sort(ipoints.begin(), ipoints.end(), [](indexed_pt a , indexed_pt b){return a.y_ < b.y_; });
+
+    // First pin with y ordering
+    std::vector<index_t> min_y_pin(pins.size());
+    for(index_t i=0; i<ipoints.size(); ++i){
+        min_y_pin[ipoints[i].index] = i;
+    }
+    std::vector<index_t> max_y_pin = min_y_pin;
+
+
+    std::vector<index_t> nxt_y_pin(pins.size(), null_ind);
+    std::vector<edge_t> ret;
+    for(auto const E : Htopo){
+        // Assuming a correctly ordered horizontal topology where the first node of the edge is never visited again
+        index_t f=E.first, s=E.second;
+        index_t first_yf=min_y_pin[f], first_ys=min_y_pin[s];
+
+        // Push the edges from the first and insert one of its elements in the second's linked structure
+        // (this branch handles the case where the rank ranges of the two chains do not overlap)
+        if(max_y_pin[f] < min_y_pin[s] or max_y_pin[s] < min_y_pin[f]){
+            // Output the links of the chain of the first node
+            for(index_t yf=first_yf; nxt_y_pin[yf] != null_ind; yf = nxt_y_pin[yf]){
+                ret.push_back(edge_t(yf, nxt_y_pin[yf]));
+            }
+
+            if(max_y_pin[f] < min_y_pin[s]){
+                // First chain entirely below: its upper end becomes the new head of the second chain
+                nxt_y_pin[max_y_pin[f]] = min_y_pin[s];
+                min_y_pin[s] = max_y_pin[f];
+            }
+            else if(max_y_pin[s] < min_y_pin[f]){
+                // First chain entirely above: its head is appended to the second chain and cut from its own successors
+                nxt_y_pin[max_y_pin[s]] = min_y_pin[f];
+                max_y_pin[s] = min_y_pin[f];
+                nxt_y_pin[min_y_pin[f]] = null_ind;
+            }
+            else{
+                // Cannot happen: the enclosing condition guarantees one of the two cases above
+                abort();
+            }
+        }
+        else{ // Need to choose a pin with two connexions because there will be no L route
+            // One pin from the second is in the middle of the first
+            if(max_y_pin[f] > max_y_pin[s]){
+                index_t middle_pin = max_y_pin[s];
+                index_t yf=first_yf;
+                // Make the first connexions
+                for(; nxt_y_pin[yf] < middle_pin; yf = nxt_y_pin[yf]){
+                    ret.push_back(edge_t(yf, nxt_y_pin[yf]));
+                }
+                // Make the two connexions with the new pin
+                ret.push_back(edge_t(yf, middle_pin));
+                yf = nxt_y_pin[yf];
+                ret.push_back(edge_t(yf, middle_pin));
+                // Finish the connexions
+                for(; nxt_y_pin[yf] != null_ind; yf = nxt_y_pin[yf]){
+                    ret.push_back(edge_t(yf, nxt_y_pin[yf]));
+                }
+                // The chain of the second node is left untouched in this case
+            }
+            // One pin from the first is in the middle of the second
+            else{
+                // Output the links of the chain of the first node
+                for(index_t yf=first_yf; nxt_y_pin[yf] != null_ind; yf = nxt_y_pin[yf]){
+                    ret.push_back(edge_t(yf, nxt_y_pin[yf]));
+                }
+                index_t middle_pin = max_y_pin[f];
+                // Find the place where we can insert this pin
+                // (null_ind is the largest index_t value, so the scan also stops at the end of the chain)
+                index_t ys=first_ys;
+                for(; nxt_y_pin[ys] < middle_pin; ys = nxt_y_pin[ys]);
+                nxt_y_pin[middle_pin] = nxt_y_pin[ys];
+                nxt_y_pin[ys] = middle_pin;
+            }
+        }
+    }
+    // The last visited gives the remaining connexions to push
+    for(index_t yf=min_y_pin[Htopo.back().second]; nxt_y_pin[yf] != null_ind; yf = nxt_y_pin[yf]){
+        ret.push_back(edge_t(yf, nxt_y_pin[yf]));
+    }
+    
+    // Back to the original ordering
+    for(auto & E : ret){
+        E.first = ipoints[E.first].index;
+        E.second = ipoints[E.second].index;
+    }
+    return ret;
+}
+
+// Append candidate neighbour edges to edges, as pairs of positions in pins.
+// The points are swept by increasing x + y. Two sets of already swept points are
+// maintained, one ordered by decreasing x and one by decreasing y. For each swept point,
+// the set elements found from the lower bound onwards that satisfy the x - y
+// (resp. y - x) test are connected to it and removed from the set, then the point
+// itself is inserted.
+// Both sets compare a single coordinate, so a point is not inserted when an element
+// with the same x (resp. y) is still present.
+inline void northeast_octant_neighbours(std::vector<point<int_t> > pins, std::vector<std::pair<index_t, index_t> > & edges){
+
+    // Keep the position in pins next to the coordinates
+    std::vector<indexed_pt> point_list;
+    for(index_t i=0; i<pins.size(); ++i){
+        point_list.push_back(indexed_pt(pins[i], i));
+    }
+
+    // Sweep order: increasing x + y
+    std::sort(point_list.begin(), point_list.end(),
+                [](indexed_pt const a, indexed_pt const b){ return a.x_ + a.y_ < b.x_ + b.y_; }
+              );
+
+    // Decreasing order of x and y; multiset not necessary because no two elements have same coordinate
+    std::set<indexed_pt, std::function<bool (indexed_pt const, indexed_pt const)> >
+                      active_upper_octant([](indexed_pt const a, indexed_pt const b)->bool{return a.x_ > b.x_;}),
+                      active_lower_octant([](indexed_pt const a, indexed_pt const b)->bool{return a.y_ > b.y_;});
+
+    for(indexed_pt const current : point_list){
+        { // North to north-east region
+            auto first_it = active_upper_octant.lower_bound(current); // Largest x with x <= current.x
+            auto it = first_it;
+            // Connect and consume every active point that passes the x - y test
+            for(; it != active_upper_octant.end() && it->x_ - it->y_ >= current.x_ - current.y_; ++it){
+                edges.push_back(std::pair<index_t, index_t>(current.index, it->index));
+            }
+            if(first_it != active_upper_octant.end()){ active_upper_octant.erase(first_it, it); }
+            active_upper_octant.insert(it, current); // Hint to insert the element since it is the correct position
+        } // End region
+        { // North-east to east region
+            auto first_it = active_lower_octant.lower_bound(current); // Largest y with y <= current.y
+            auto it = first_it;
+            // Connect and consume every active point that passes the y - x test
+            for(; it != active_lower_octant.end() && it->y_ - it->x_ >= current.y_ - current.x_; ++it){
+                edges.push_back(std::pair<index_t, index_t>(current.index, it->index));
+            }
+            if(first_it != active_lower_octant.end()){ active_lower_octant.erase(first_it, it); }
+            active_lower_octant.insert(it, current); // Hint to insert the element since it is the correct position
+        } // End region
+    }
+}
+
+// Same search as northeast_octant_neighbours, for the south-east quadrant: the y
+// coordinates of the (copied) pins are negated so that the north-east code can be reused.
+inline void southeast_octant_neighbours(std::vector<point<int_t> > pins, std::vector<std::pair<index_t, index_t> > & edges){
+    for(index_t i=0; i<pins.size(); ++i){
+        pins[i].y_ = - pins[i].y_;
+    }
+    northeast_octant_neighbours(pins, edges);
+}
+
+// Horizontal topology of a net of at most 10 pins (expected sorted by x): a chain for
+// 2 and 3 pins, the cheapest entry of the lookup table of the matching size from 4 to
+// 10 pins. The edges are pairs of positions in pins.
+std::vector<std::pair<index_t, index_t> > get_small_horizontal_topology_from_sorted(std::vector<point<int_t> > const & pins){
+    assert(pins.size() <= 10);
+
+    // Stays empty for 0 and 1 pin (11 and more are protected by the assertion)
+    std::vector<edge_t> topo;
+    switch(pins.size()){
+        case 2:
+            topo.push_back(edge_t(0, 1));
+            break;
+        case 3:
+            topo.push_back(edge_t(0, 1));
+            topo.push_back(edge_t(1, 2));
+            break;
+        case 4:
+            topo = get_topology_from_sorted<4, 2>(pins, steiner_lookup::topologies_4);
+            break;
+        case 5:
+            topo = get_topology_from_sorted<5, 6>(pins, steiner_lookup::topologies_5);
+            break;
+        case 6:
+            topo = get_topology_from_sorted<6, 23>(pins, steiner_lookup::topologies_6);
+            break;
+        case 7:
+            topo = get_topology_from_sorted<7, 111>(pins, steiner_lookup::topologies_7);
+            break;
+        case 8:
+            topo = get_topology_from_sorted<8, 642>(pins, steiner_lookup::topologies_8);
+            break;
+        case 9:
+            topo = get_topology_from_sorted<9, 4334>(pins, steiner_lookup::topologies_9);
+            break;
+        case 10:
+            topo = get_topology_from_sorted<10, 33510>(pins, steiner_lookup::topologies_10);
+            break;
+        default:
+            break;
+    }
+    return topo;
+}
+
+// Get an ordering of the edges that is compatible with the processing functions
+// The nodes are taken to be numbered from 0 to topo.size() (one node more than edges).
+// Nodes are visited starting from those with a single neighbour; a node is scheduled once
+// all its neighbours but one have been visited. Each edge is output when its first
+// endpoint is visited, oriented from the visited node to the one not visited yet.
+std::vector<edge_t> get_tree_topo_sort(std::vector<edge_t> const & topo){
+    std::vector<edge_t> sorted_topo;
+    // Adjacency lists
+    std::vector<std::vector<index_t> > neighbours(topo.size()+1);
+    for(edge_t const E : topo){
+        neighbours[E.first].push_back(E.second);
+        neighbours[E.second].push_back(E.first);
+    }
+    // Stack of the nodes ready to be visited, seeded with the nodes of degree 1
+    std::vector<index_t> to_visit;
+    // Per node: number of neighbours minus the number of neighbours visited so far
+    std::vector<int_t> nbr_unvisited(topo.size()+1);
+    for(index_t i=0; i<=topo.size(); ++i){
+        nbr_unvisited[i] = neighbours[i].size();
+        assert(topo.size() == 0 or nbr_unvisited[i] >= 1);
+        if(nbr_unvisited[i] == 1)
+            to_visit.push_back(i);
+    }
+    std::vector<int> visited(topo.size()+1, 0);
+    while(not to_visit.empty()){
+        index_t f = to_visit.back();
+        assert(visited[f] == 0);
+        visited[f] = 1;
+        to_visit.pop_back();
+        for(index_t s : neighbours[f]){
+            // The counter is decremented for visited neighbours too
+            --nbr_unvisited[s];
+            if(visited[s] == 0){ // It is not a node we already visited
+                sorted_topo.push_back(edge_t(f, s));
+            }
+            if(nbr_unvisited[s] == 1){
+                to_visit.push_back(s);
+            }
+        }
+    }
+    // Every edge must have been output exactly once
+    assert(sorted_topo.size() == topo.size());
+    return sorted_topo;
+}
+
+// Horizontal topology for the nets that are too big for the lookup tables: start from a
+// spanning tree, derive a vertical topology from it, then derive the horizontal topology
+// back from the vertical one by running the same code on swapped coordinates.
+// exactitude_limit is currently not used.
+std::vector<edge_t> get_big_horizontal_topology_from_sorted(std::vector<point<int_t> > const & pins, index_t exactitude_limit){
+    // TODO: perform local optimizations on the topology using exact Steiner tree algorithms
+
+    // Remove horizontal suboptimalities i.e. when the connexions to the left and right are unbalanced
+    // Reuse existing code by translation to vertical topology
+    std::vector<edge_t> const spanning_sorted = get_tree_topo_sort(get_MST_topology(pins));
+    std::vector<edge_t> const vertical = get_tree_topo_sort(get_vertical_topology(pins, spanning_sorted));
+
+    // Same pins with x and y exchanged
+    std::vector<point<int_t> > transposed(pins);
+    for(index_t i=0; i<transposed.size(); ++i){
+        std::swap(transposed[i].x_, transposed[i].y_);
+    }
+
+    // Sort the tree so that it is usable when building an RSMT
+    return get_tree_topo_sort(get_vertical_topology(transposed, vertical));
+}
+
+} // End anonymous namespace
+
+// Minimum spanning tree of the pins for the Manhattan distance.
+// Returns pins.size()-1 edges as pairs of positions in pins (no edge for 0 or 1 pin).
+// Nets of up to 3 pins are solved directly; bigger nets run Kruskal's algorithm on the
+// candidate edges given by the octant neighbour searches.
+std::vector<std::pair<index_t, index_t> > get_MST_topology(std::vector<point<int_t> > const & pins){
+
+    std::vector<edge_t> edges;
+
+    if(pins.size() <= 3){
+        if(pins.size() == 2){
+            edges.push_back(edge_t(0, 1));
+        }
+        else if(pins.size() == 3){
+            auto D = [](point<int_t> a, point<int_t> b){ return std::abs(a.x_ - b.x_) + std::abs(a.y_ - b.y_); };
+            // dists[i] is the length of the edge ((i+1) % 3, (i+2) % 3), the one that does not touch pin i
+            std::array<int_t, 3> const dists{{D(pins[1], pins[2]), D(pins[2], pins[0]), D(pins[0], pins[1])}};
+            // Keep every edge of the triangle but the longest one
+            index_t mx = std::max_element(dists.begin(), dists.end()) - dists.begin();
+            for(index_t i=0; i<3; ++i){
+                if(i != mx)
+                    edges.push_back(edge_t((i+1) % 3, (i+2) % 3));
+            }
+        }
+        return edges;
+    }
+
+    // Candidate edges
+    northeast_octant_neighbours(pins, edges);
+    southeast_octant_neighbours(pins, edges);
+
+    std::vector<edge_t> returned_edges;
+
+    auto edge_length = [&](edge_t E){
+        point<int_t> p1 = pins[E.first],
+                     p2 = pins[E.second];
+        return std::abs(p1.x_ - p2.x_) + std::abs(p1.y_ - p2.y_);
+    };
+    // Perform Kruskal to get the tree
+    std::sort(edges.begin(), edges.end(), [&](edge_t a, edge_t b){ return edge_length(a) < edge_length(b); });
+
+    union_find merger(pins.size());
+
+    // Shortest edges first; an edge is kept when it joins two components; stop once the tree is complete
+    for(index_t i=0; i<edges.size() && returned_edges.size()+1 < pins.size(); ++i){
+        edge_t E = edges[i];
+        if(merger.find(E.first) != merger.find(E.second)){
+            merger.merge(E.first, E.second);
+            assert(merger.find(E.first) == merger.find(E.second));
+            returned_edges.push_back(E);
+        }
+    }
+    assert(returned_edges.size() + 1 == pins.size());
+    assert(merger.is_connex());
+    return returned_edges;
+}
+
+// Total Manhattan length of the minimum spanning tree of the pins.
+std::int64_t MST_length(std::vector<point<int_t> > const & pins){
+    std::vector<edge_t> const tree = get_MST_topology(pins);
+    std::int64_t total = 0;
+    for(index_t i=0; i<tree.size(); ++i){
+        point<int_t> const & a = pins[tree[i].first];
+        point<int_t> const & b = pins[tree[i].second];
+        total += std::abs(a.x_ - b.x_);
+        total += std::abs(a.y_ - b.y_);
+    }
+    return total;
+}
+
+// Length of a rectilinear Steiner tree over the pins.
+//   0 or 1 pin            0
+//   2 pins                Manhattan distance
+//   3 pins                half perimeter of the bounding box
+//   4..exactitude_limit   cheapest entry of the lookup table of the matching size
+//   more                  length of the topology built by get_big_horizontal_topology_from_sorted
+// exactitude_limit must be between 3 and 10.
+std::int64_t RSMT_length(std::vector<point<int_t> > const & pins, index_t exactitude_limit){
+    assert(exactitude_limit <= 10 and exactitude_limit >= 3);
+
+    auto const pin_cnt = pins.size();
+    if(pin_cnt <= 1){
+        return 0;
+    }
+    if(pin_cnt == 2){
+        return std::abs(pins[0].x_ - pins[1].x_) + std::abs(pins[0].y_ - pins[1].y_);
+    }
+    if(pin_cnt == 3){
+        auto const by_x = [](point<int_t> a, point<int_t> b){ return a.x_ < b.x_; };
+        auto const by_y = [](point<int_t> a, point<int_t> b){ return a.y_ < b.y_; };
+        auto const x_range = std::minmax_element(pins.begin(), pins.end(), by_x);
+        auto const y_range = std::minmax_element(pins.begin(), pins.end(), by_y);
+        return (x_range.second->x_ - x_range.first->x_) + (y_range.second->y_ - y_range.first->y_);
+    }
+
+    // Every remaining method works on the pins sorted by x
+    std::vector<point<int_t> > points(pins);
+    std::sort(points.begin(), points.end(), [](point<int_t> a , point<int_t> b){return a.x_ < b.x_; });
+
+    if(pin_cnt > exactitude_limit){ // Need to create the full topology, then calculate the length back
+        auto const horizontal_topology = get_big_horizontal_topology_from_sorted(points, exactitude_limit);
+        return get_wirelength_from_topo(points, horizontal_topology);
+    }
+
+    switch(pin_cnt){
+        case 4:
+            return get_wirelength_from_sorted<4, 2>(points, steiner_lookup::topologies_4);
+        case 5:
+            return get_wirelength_from_sorted<5, 6>(points, steiner_lookup::topologies_5);
+        case 6:
+            return get_wirelength_from_sorted<6, 23>(points, steiner_lookup::topologies_6);
+        case 7:
+            return get_wirelength_from_sorted<7, 111>(points, steiner_lookup::topologies_7);
+        case 8:
+            return get_wirelength_from_sorted<8, 642>(points, steiner_lookup::topologies_8);
+        case 9:
+            return get_wirelength_from_sorted<9, 4334>(points, steiner_lookup::topologies_9);
+        case 10:
+            return get_wirelength_from_sorted<10, 33510>(points, steiner_lookup::topologies_10);
+        default:
+            abort();
+    }
+}
+
+// Horizontal and vertical topologies of a rectilinear Steiner tree over the pins, returned
+// as a point whose first member is the horizontal topology and second member the vertical
+// one. Every edge is a pair of positions in pins.
+// Nets of up to exactitude_limit pins use the lookup tables, bigger ones use
+// get_big_horizontal_topology_from_sorted. exactitude_limit must be between 3 and 10.
+point<std::vector<std::pair<index_t, index_t> > > get_RSMT_topology(std::vector<point<int_t> > const & pins, index_t exactitude_limit){
+
+    assert(exactitude_limit <= 10 and exactitude_limit >= 3);
+
+    // For 3 pins and fewer, the topology is very simple
+    if(pins.size() <= 2){
+        if(pins.size() == 2){
+            auto ret = std::vector<edge_t>(1, edge_t(0, 1));
+            return point<std::vector<edge_t> >(ret, ret);
+        }
+        else{
+            return point<std::vector<edge_t> >();
+        }
+    }
+    else if(pins.size() == 3){
+        // Chain the pins in x order for the horizontal topology, in y order for the vertical one
+        std::vector<indexed_pt> ipoints(pins.size());
+        for(index_t i=0; i<pins.size(); ++i){
+            ipoints[i] = indexed_pt(pins[i], i);
+        }
+        auto xpoints=ipoints;
+        std::sort(xpoints.begin(), xpoints.end(), [](indexed_pt a , indexed_pt b){return a.x_ < b.x_; });
+        auto ypoints=ipoints;
+        std::sort(ypoints.begin(), ypoints.end(), [](indexed_pt a , indexed_pt b){return a.y_ < b.y_; });
+        
+        return point<std::vector<edge_t> >{{{xpoints[0].index, xpoints[1].index}, {xpoints[1].index, xpoints[2].index}}, {{ypoints[0].index, ypoints[1].index}, {ypoints[1].index, ypoints[2].index}}};
+    }
+    else{
+        std::vector<edge_t> horizontal_topology;
+
+        // Sort the pins by x coordinate
+        std::vector<indexed_pt> ipoints(pins.size());
+        for(index_t i=0; i<pins.size(); ++i){
+            ipoints[i] = indexed_pt(pins[i], i);
+        }
+        std::sort(ipoints.begin(), ipoints.end(), [](indexed_pt a , indexed_pt b){return a.x_ < b.x_; });
+        std::vector<point<int_t> > sorted_pins(pins.size());
+        for(index_t i=0; i<pins.size(); ++i){
+            sorted_pins[i] = ipoints[i];
+        }
+
+        // Get the topology for this ordering
+        if(pins.size() <= exactitude_limit){
+            horizontal_topology = get_small_horizontal_topology_from_sorted(sorted_pins);
+        }
+        else{
+            horizontal_topology = get_big_horizontal_topology_from_sorted(sorted_pins, exactitude_limit);
+        }
+
+        // Back to the original ordering
+        for(auto & E : horizontal_topology){
+            E.first = ipoints[E.first].index;
+            E.second = ipoints[E.second].index;
+        }
+
+        // The horizontal topology now uses positions in pins, so the vertical topology
+        // must be derived from pins as well (not from sorted_pins): get_vertical_topology
+        // reads the edge indices as positions in the point vector it receives and
+        // returns its edges in that same numbering
+        return point<std::vector<edge_t> >(horizontal_topology, get_vertical_topology(pins, horizontal_topology));
+    }
+}
+
+} // Namespace coloquinte
+
--- a/etesian/CMakeLists.txt
+++ b/etesian/CMakeLists.txt
@ -8,7 +8,6 @@
 cmake_minimum_required(VERSION 2.8.9)

 list(INSERT CMAKE_MODULE_PATH 0 "${DESTDIR}$ENV{CORIOLIS_TOP}/share/cmake/Modules/")
- find_package(Coloquinte REQUIRED)
 find_package(Bootstrap  REQUIRED)
 setup_project_paths(CORIOLIS)
 setup_qt()
@ -24,6 +23,7 @@
 find_package(LEFDEF             REQUIRED)
 find_package(HURRICANE          REQUIRED)
 find_package(CORIOLIS           REQUIRED)
+ find_package(COLOQUINTE         REQUIRED)
 find_package(Libexecinfo        REQUIRED)
 
 add_subdirectory(src)
--- a/etesian/src/EtesianEngine.cpp
+++ b/etesian/src/EtesianEngine.cpp
@ -18,10 +18,8 @@
 #include <sstream>
 #include <fstream>
 #include <iomanip>
-#if HAVE_COLOQUINTE
 #include "coloquinte/circuit.hxx"
 #include "coloquinte/legalizer.hxx"
-#endif
 #include "vlsisapd/configuration/Configuration.h"
 #include "vlsisapd/utilities/Dots.h"
 #include "hurricane/DebugSession.h"
@ -58,7 +56,6 @@ namespace {



-#if HAVE_COLOQUINTE
  //inline bool  isNan( const float_t& f ) { return (f != f); }


@ -113,22 +110,6 @@ namespace {
  }


-#if 0
-  Coloquinte::cell::pin::pin_dir  extractDirection ( const RoutingPad* rp )
-  {
-    switch ( rp->_getEntityAsComponent()->getNet()->getDirection() ) {
-      case Net::Direction::IN:       return Coloquinte::cell::pin::I;
-      default:
-      case Net::Direction::OUT:
-      case Net::Direction::TRISTATE: return Coloquinte::cell::pin::O;
-      case Net::Direction::INOUT:    return Coloquinte::cell::pin::B;
-    }
-
-    return Coloquinte::cell::pin::O;
-  }
-#endif
-
-
  Point  extractRpOffset ( const RoutingPad* rp )
  {
    Cell*      masterCell = rp->getOccurrence().getMasterCell();
@ -175,8 +156,6 @@ namespace {

    return Transformation( tx, ty, orient );
  }
-#endif
-

 } // Anonymous namespace.

@ -451,7 +430,6 @@ namespace Etesian {

  void  EtesianEngine::toColoquinte ()
  {
-#if HAVE_COLOQUINTE
    cmess1 << "  o  Converting <" << getCell()->getName() << "> into Coloquinte." << endl;

    resetPlacement();
@ -594,12 +572,10 @@ namespace Etesian {
    _placementUB = _placementLB;
  //cerr << "Coloquinte cell height: " << _circuit.get_cell(0).size.y_ << endl;

-#endif  // HAVE_COLOQUINTE
  }

  void  EtesianEngine::place ()
  {
-#if HAVE_COLOQUINTE
    using namespace coloquinte::gp;
    using namespace coloquinte::dp;

@ -823,15 +799,11 @@ namespace Etesian {
      ( "     - RMST", DbU::getValueString( (DbU::Unit)get_RSMT_wirelength(_circuit,_placementUB )*getPitch() ) ) << endl;

    _placed = true;
-#else
-    cerr << Warning("Coloquinte library wasn't found, Etesian is disabled.") << endl;
-#endif
  }


  void  EtesianEngine::_progressReport1 ( time_t startTime, string label ) const
  {
-#if HAVE_COLOQUINTE
    size_t w      = label.size();
    string indent ( w, ' ' );
    if (not w) {
@ -850,13 +822,11 @@ namespace Etesian {
           <<  "  Linear Disrupt.:" << coloquinte::gp::get_mean_linear_disruption   ( _circuit, _placementLB, _placementUB )
           <<   " Quad Disrupt.:"   << coloquinte::gp::get_mean_quadratic_disruption( _circuit, _placementLB, _placementUB )
           << endl;
-#endif
  }


  void  EtesianEngine::_progressReport2 ( time_t startTime, string label ) const
  {
-#if HAVE_COLOQUINTE
    size_t w      = label.size();
    string indent ( w, ' ' );
    if (not w) {
@ -871,13 +841,11 @@ namespace Etesian {
           << " HPWL:" << coloquinte::gp::get_HPWL_wirelength( _circuit, _placementLB )
           << " RMST:" << coloquinte::gp::get_RSMT_wirelength( _circuit, _placementLB )
           << endl;
-#endif
  }


  void  EtesianEngine::_updatePlacement ( const coloquinte::placement_t& placement )
  {
-#if HAVE_COLOQUINTE
    UpdateSession::open();

    forEach ( Occurrence, ioccurrence, getCell()->getLeafInstanceOccurrences() )
@ -914,7 +882,6 @@ namespace Etesian {
    UpdateSession::close();

    if (_cellWidget) _cellWidget->refresh();
-#endif
  }
				`@ -0,0 +1,2 @@`

				`install ( FILES FindCOLOQUINTE.cmake DESTINATION share/cmake/Modules )`