Improved SRAM output multiplexer, using NAND/NOR.

* New: In cumulus/plugins/sram_256x32.py, build the output mux using a NAND2/NOR2 binary tree instead of mux2/mux3. This uses more, but much smaller, cells. The wirelength reduction relative to Yosys goes from 4% to 15% for the non-folded variant. The tree is specially placed to minimize wire length. * New: In cumulus/plugins/sram.py, extend StdCellConf to convert names across library flavors (FlexLib_TSMC_C180, FlexLib_Sky130 and generic SxLib).
2022-10-17 17:18:49 +02:00 · 2022-10-17 17:18:49 +02:00 · 9594476ab6
parent d294a770c4
commit 9594476ab6
2 changed files with 316 additions and 114 deletions
--- a/cumulus/src/plugins/sram/sram.py
+++ b/cumulus/src/plugins/sram/sram.py
@ -26,8 +26,8 @@ from   helpers.io      import ErrorMessage, WarningMessage
 from   helpers.overlay import UpdateSession
 from   helpers         import trace, l, u, n
 import plugins
-from   Hurricane import Breakpoint, DbU, Box, Net, Cell, Instance, \
-                        Transformation, PythonAttributes
+from   Hurricane import DataBase, Breakpoint, DbU, Box, Net, Cell, \
+                        Instance, Transformation, PythonAttributes
 import CRL
 from   Foehn     import FoehnEngine, DagExtension
 from   plugins.chip.configuration import GaugeConf
@ -49,6 +49,12 @@ class StdCellConf ( object ):
    reDataIn  = re.compile( r'^i[0-9]?' )
    reDataOut = re.compile( r'^n?q' )

+    def __init__ ( self ):
+        self.techName = DataBase.getDB().getTechnology().getName()
+
+    def __repr__ ( self ):
+        return '<StdCellConf "{}">'.format( self.techName )
+
    def isRegister ( self, cell ):
        """Returns True if the cell is a register."""
        m = StdCellConf.reDFF.match( cell.getName() )
@ -85,6 +91,16 @@ class StdCellConf ( object ):
        """Returns True if the net is a data flow (i.e. not a control)."""
        return self.isDataIn(net) or self.isDataOut(net)

+    def getStdCellName ( self, name ):
+        if self.techName == 'Sky130':
+            if name == 'na2_x1': name = 'nand2_x0'
+            if name == 'no2_x1': name = 'nor2_x0'
+            if name == 'no3_x1': name = 'nor3_x0'
+        return name
+
+    def getStdCell ( self, name ):
+        return af.getCell( self.getStdCellName(name), CRL.Catalog.State.Views )
+

 # --------------------------------------------------------------------
 # Class :  Bus.
@ -186,6 +202,9 @@ class Column ( object ):
        Connect a bus to the column. ``busName`` is the name of the master net
        in the reference cell of the column.
        """
+        if not busName in self.busPlugs:
+            raise ErrorMessage( 1, 'Column.setBusNet(): {} has no bus named "{}".' \
+                                   .format( self.tag, busName ))
        busPlug = self.busPlugs[ busName ]
        if busPlug[0].getNet() and busPlug[0].getNet() != busNet[0]:
            print( Warning( 'Column.setBusNet(): Overrode {} {} -> {} with {}' \
@ -318,11 +337,12 @@ class ColGroup ( object ):
        Initialize an *empty* column group. Sub-group or columns must be
        added afterwards.
        """
-        self.tag    = tag
-        self.parent = None
-        self.order  = None
-        self.depth  = 0
-        self.childs = []
+        self.tag        = tag
+        self.parent     = None
+        self.order      = None
+        self.depth      = 0
+        self.childs     = []
+        self.isReversed = False

    def __iter__ ( self ):
        return ColGroupIterator( self )
@ -358,11 +378,25 @@ class ColGroup ( object ):
            busWidth = max( busWidth, child.busWidth )
        return busWidth

-    def group ( self, child ):
+    def group ( self, newChild, after=None, before=None ):
        """ Add a new child to the group. """
-        self.childs.append( child )
-        child.parent = self
-        self.depth = max( self.depth, child.depth+1 )
+        inserted = False
+        if after is not None:
+            for i in range(len(self.childs)):
+                if self.childs[i] == after:
+                    self.childs.insert( i+1, newChild )
+                    inserted = True
+                    break
+        if before is not None:
+            for i in range(len(self.childs)):
+                if self.childs[i] == before:
+                    self.childs.insert( i, newChild )
+                    inserted = True
+                    break
+        if not inserted:
+            self.childs.append( newChild )
+        newChild.parent = self
+        self.depth = max( self.depth, newChild.depth+1 )

    def unGroup ( self, child=None ):
        """ Remove a child from the group (the child is *not* deleted). """
@ -403,6 +437,7 @@ class ColGroup ( object ):
        for child in self.childs:
            child.reverse()
        self.childs.reverse()
+        self.isReversed = not self.isReversed

    def place ( self ):
        """ Place childs/colums from left to rigth. """
@ -572,7 +607,7 @@ class FoldState ( object ):
        else:
            self.direction = BaseSRAM.TO_RIGHT
            self.x         = self.xmin
-        self.irow += self.sram.rootGroup.busWidth + 1
+        self.irow += self.sram.rootGroup.busWidth + 2
        self.fold += 1

    def addWidth ( self, width ):
@ -622,12 +657,16 @@ class BaseSRAM ( object ):
        The overall relative placement is organized as follow : ::

            +---------+-------------------------------------------+
-            |         |            headers[1] (1 row)             |
+            |         |            headers[4] (1 row)             |
+            |         +-------------------------------------------+
+            |         |            headers[3] (1 row)             |
            |         +-------------------------------------------+
            |         |                                           |
            |         |       Column area, fold 1 (N rows)        |
            |         |                                           |
            | decoder +-------------------------------------------+
+            |         |            headers[1] (1 row)             |
+            |         +-------------------------------------------+
            |         |            headers[0] (1 row)             |
            |         +-------------------------------------------+
            |         |                                           |
@ -646,7 +685,7 @@ class BaseSRAM ( object ):
        self.busses      = {}
        self.decoder     = None
        self.toHeaders   = []
-        self.headers     = [ HeaderRow( self ) for row in range(fold) ]
+        self.headers     = [ HeaderRow( self ) for row in range(fold*2) ]

    @property
    def fold ( self ):
@ -656,6 +695,12 @@ class BaseSRAM ( object ):
        if column.tag in self.foldTags:
            self.foldState.forceFold()

+    def getBus ( self, fmt ):
+        """ Find a bus by its formatting string. """
+        if fmt in self.busses:
+            return self.busses[ fmt ]
+        return None
+
    def getNet ( self, name, create=True ):
        """
        Find a Net by name. If it doesn't exists and ``create`` is set to ``True``,
@ -694,7 +739,9 @@ class BaseSRAM ( object ):
                             ,  'q' : 'net_output_X' } )
        """
        masterCell = af.getCell( masterName, CRL.Catalog.State.Views )
-        inst       = Instance.create( self.cell, instName, masterCell )
+        if not masterCell:
+            raise ErrorMessage( 1, 'BaseSRAM.addInstance(): Cannot find cell "{}".'.format( masterName ))
+        inst = Instance.create( self.cell, instName, masterCell )
        for masterNetName, netName in netMapNames.items():
            masterNet = masterCell.getNet( masterNetName )
            net       = self.getNet( netName )
@ -796,18 +843,18 @@ class BaseSRAM ( object ):
            bb = Box()
            bb.merge( self.decoder.place( 0 ) )
            bb.merge( self.rootGroup.place() )
-            for inst, refInst in self.toHeaders:
-                self.headers[ refInst.fold ].addInstanceAt( inst, refInst )
+            for inst, refInst, headerRow in self.toHeaders:
+                self.headers[ refInst.fold*2 + headerRow ].addInstanceAt( inst, refInst )
            for i in range(len(self.headers)):
                trace( 610, ',+', 'Place row header {} {}\n'.format( i, self.headers[i].row ))
-                if i % 2:
+                if i//2 % 2:
                    xstart    = bb.getXMax()
                    direction = BaseSRAM.TO_LEFT
                else:
                    xstart    = self.decoder.width
                    direction = BaseSRAM.TO_RIGHT
                bb.merge( self.headers[i].place( xstart
-                                               , self.rootGroup.busWidth*(i + 1) + i
+                                               , self.rootGroup.busWidth*(i//2 + 1) + i
                                               , direction ))
                trace( 610, '-,' )
            self.cell.setAbutmentBox( bb )
--- a/cumulus/src/plugins/sram/sram_256x32.py
+++ b/cumulus/src/plugins/sram/sram_256x32.py
@ -73,31 +73,50 @@ Provisional results

 .. note:: All length are in micro-meters.

-+--------------+-----------------------------+-----------------------------+
-| Kind         | Generator                   | Yosys                       |
-+==============+=============================+=============================+
-| # Gates      | 23209      (-25.4%)         | 32121                       |
-+--------------+-----------------------------+-----------------------------+
-|                                  1 Fold                                  |
-+--------------+-----------------------------+-----------------------------+
-| Area         | 7182 x 330  (-5.5%)         | 7380 x 340                  |
-+--------------+-----------------------------+-----------------------------+
-| Wirelength   | 1841036     (-4.3%)         | 1924153                     |
-+--------------+-----------------------------+-----------------------------+
-|                                  2 Fold                                  |
-+--------------+-----------------------------+-----------------------------+
-| Area         | 3599 x 660  (-5.3%)         | 3690 x 680                  |
-+--------------+-----------------------------+-----------------------------+
-| Wirelength   | 1670455     (-6.3%)         | 1782558                     |
-+--------------+-----------------------------+-----------------------------+
-|                                  4 Fold                                  |
-+--------------+-----------------------------+-----------------------------+
-| Area         | 1812 x 1320 (-4.6%)         | 1900 x 1320                 |
-+--------------+-----------------------------+-----------------------------+
-| Wirelength   | 1699810     (-1.5%)         | 1726436                     |
-+--------------+-----------------------------+-----------------------------+
+--------+--------------+-----------------------------+-----------------------------+
+| Arch   | Kind         | Generator                   | Yosys                       |
+========+==============+=============================+=============================+
+|  Mux   | # Gates      | 23209      (-25.4%)         | 32121                       |
+--------+--------------+-----------------------------+                             |
+|  Nao   | # Gates      | 34637      (+7.8%)          |                             |
+--------+--------------+-----------------------------+-----------------------------+
+|                                       1 Fold                                      |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 7182 x 330  (-5.5%)         | 7380 x 340                  |
+|  Mux   +--------------+-----------------------------+-----------------------------+
+|        | Wirelength   | 1841036     (-4.3%)         | 1924153                     |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 6680 x 340  (-9.5%)         |                             |
+|  Nao   +--------------+-----------------------------+                             |
+|        | Wirelength   | 1637781     (-14.9%)        |                             |
+--------+--------------+-----------------------------+-----------------------------+
+|                                       2 Fold                                      |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 3599 x 660  (-5.3%)         | 3690 x 680                  |
+|  Mux   +--------------+-----------------------------+-----------------------------+
+|        | Wirelength   | 1670455     (-6.3%)         | 1782558                     |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 3350 x 680  (-9.2%)         |                             |
+|  Nao   +--------------+-----------------------------+                             |
+|        | Wirelength   | 1548358     (-13.1%)        |                             |
+--------+--------------+-----------------------------+-----------------------------+
+|                                       4 Fold                                      |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 1812 x 1320 (-4.6%)         | 1900 x 1320                 |
+|  Mux   +--------------+-----------------------------+-----------------------------+
+|        | Wirelength   | 1699810     (-1.5%)         | 1726436                     |
+--------+--------------+-----------------------------+-----------------------------+
+|        | Area         | 1692 x 1360 (-8.2%)         |                             |
+|  Nao   +--------------+-----------------------------+                             |
+|        | Wirelength   | 1512107     (-12.4%)        |                             |
+--------+--------------+-----------------------------+-----------------------------+

-Conclusions that we can draw from those results are :
+
+The difference between the two implementations resides only in the *output*
+multiplexer: it is built either from 4-input muxes made of mux2+mux3, or from
+2-input multiplexers made of alternating layers of nand2+nor2.
+
+Conclusions for the mux2+mux3 implementation :

 1. The generator version uses subtantially less gates than the Yosys one.
   As the both SRAM uses the exact same number of SFFs, the difference is
@ -111,26 +130,42 @@ Conclusions that we can draw from those results are :

   In particular, to build the output multiplexer we only have mx2 and
   mx3 cells, which are large. The density of the SRAM could be much
-   increased if we did have nmx2 and nmx3. We could also try to synthesise
-   the tree using nandX and norX but we are short of time.
+   increased if we did have nmx2 and nmx3.

   Furthermore for the output multiplexers, as it is a controlled case,
   we may also uses three-state drivers cells (which have not been
   ported either).

+Conclusion for the nand2+nor2 implementation:
+
+1. This multiplexer gives a much more compact area and a noticeably shorter
+   wire length, at the cost of more cells (which is not an issue).
+
+2. The total wire length is extremely sensitive to the placement, which
+   in our case is just a column ordering. To optimize, the binary tree
+   (for the netlist) is not placed fully symmetrically but slightly
+   "askew".
+
+
 .. note:: Cell width in the SkyWater 130 port of FlexLib:

          ==============  =====
          Cell            Width
          ==============  =====
+          inv_x2          2
          mx2_x2          7
          mx3_x2          11
+          a3_x2           5
          nand2_x0        2
          nand3_x0        3
          nand4_x0        4
          nor2_x0         2
+          nor3_x0         3
+          sff1_x4         15
          ==============  =====

+          Different ways of implementing the output multiplexer :
+
          1. mx2_x2 + mx3_x2         = 18
          2. 9 * nand2_x0            = 18
          3. 4 * nand3_x0 + nand4_x0 = 16
@ -178,15 +213,18 @@ class SRAM_256x32 ( BaseSRAM ):
        if fold == 1:
            pass
        elif fold == 2:
-            self.foldTags = [ 'imux_addr0128' ]
+           #self.foldTags = [ 'imux_addr0128' ]
+            self.foldTags = [ 'imux_addr0192' ]
        elif fold == 4:
-            self.foldTags = [ 'omux_0_to_127', 'imux_addr0128', 'imux_addr0240' ]
-           #self.foldTags = [ 'imux_addr0064', 'imux_addr0128', 'imux_addr0192' ]
+           #self.foldTags = [ 'omux_0_to_127', 'imux_addr0128', 'imux_addr0240' ]
+            self.foldTags = [ 'imux_addr0096', 'imux_addr0192', 'imux_addr0160' ]
        else:
            raise ErrorMessage( 1, 'SRAM_256x32.__init__(): Unsupported fold {}, valid values are 1, 2, 4.'.format( fold ))
        self.cell    = af.createCell( 'spram_256x32' )
-        self.mx2Cell = af.getCell( 'mx2_x2', CRL.Catalog.State.Views )
-        self.mx3Cell = af.getCell( 'mx3_x2', CRL.Catalog.State.Views )
+        self.mx2Cell = self.confLib.getStdCell( 'mx2_x2' )
+        self.mx3Cell = self.confLib.getStdCell( 'mx3_x2' )
+        self.na2Cell = self.confLib.getStdCell( 'na2_x1' )
+        self.no2Cell = self.confLib.getStdCell( 'no2_x1' )
        with UpdateSession():
            self.buildInterface()
            self.decoder = ColBlock( self, 'decod', 33 )
@ -203,61 +241,149 @@ class SRAM_256x32 ( BaseSRAM ):
                                      , 'bit_addr{:04d}'.format( addr )
                                      , '_byte{byte}_{bbit}'
                                      , 32 ))
-                bus = Bus( self, 'imux_addr{:04d}_b_q({{}})'.format(addr), 32 )
+                bus = Bus( self, 'imux_addr{:04d}_q({{}})'.format(addr), 32 )
                bitGroup.childs[0].setBusNet( 'q', bus )
                bitGroup.childs[1].setBusNet( 'i', bus )
-                bus = Bus( self, 'bit_addr{:04d}_b_q({{}})'.format(addr), 32 )
+                bus = Bus( self, 'bit_addr{:04d}_q({{}})'.format(addr), 32 )
                bitGroup.childs[0].setBusNet( 'i0', bus )
                bitGroup.childs[1].setBusNet(  'q', bus )
                bus = Bus( self, 'di({})', 32 )
                bitGroup.childs[0].setBusNet( 'i1', bus )
                bitGroup.childs[1].setCmdNet( 'ck', self.getNet( 'clk' ))
-            omuxGroupsCurr = []
-            omuxGroupsNext = []
-            muxDepth       = 0
-            for i in range(256//4):
-                childs = []
-                for addr in range(i*4, (i+1)*4):
-                    tag = SRAM_256x32.BIT_GROUP_FMT.format( addr )
-                    childs.append( self.rootGroup.findChild( tag ))
-                    childs[-1].unGroup()
-                omuxGroupsCurr.append( self._doMux4( childs, muxDepth ))
-            while len(omuxGroupsCurr) >= 4:
-                trace( 610, '\tGrouping {} elements.\n'.format( len(omuxGroupsCurr )))
-                muxDepth += 1
-                for i in range(len(omuxGroupsCurr)//4):
-                    omuxGroupsNext.append( self._doMux4( omuxGroupsCurr[i*4:(i+1)*4], muxDepth ))
-                omuxGroupsCurr = omuxGroupsNext
-                omuxGroupsNext = []
-            for group in omuxGroupsCurr:
-                self.rootGroup.group( group )
+            self.buildDecoder()
+           #self._buildOutputMux_mx23()
+            self._buildOutputMux_nao23()
+           #inst = self.addInstance( 'inv_x2'
+           #                        , 'nrst_inv'
+           #                        , { 'i'  : 'rst'
+           #                          , 'nq' : 'n_rst'
+           #                          }
+           #                       )
+           #self.decoder.addInstance( 0, inst )
            inst = self.addInstance( 'inv_x2'
-                                    , 'nrst_inv'
-                                    , { 'i'  : 'rst'
-                                      , 'nq' : 'nrst'
+                                    , 'nce_inv'
+                                    , { 'i'  : 'ce'
+                                      , 'nq' : 'n_ce'
                                      }
                                   )
            self.decoder.addInstance( 0, inst )
            for child in self.rootGroup.childs[0].childs:
                if child.kind == Column.KIND_COLUMN:
-                    if child.insts[0].getMasterCell() != self.mx3Cell:
-                        continue
-                    rstCol = Column( self
-                                   , af.getCell( 'a2_x2', CRL.Catalog.State.Views )
-                                   , 'omux_nrst'
-                                   , '_byte{byte}_{bbit}'
-                                   , 32 )
-                    busOMux = Bus( self, child.tag+'_b_q({})', 32 )
-                    busDato = Bus( self, 'dato({})', 32 )
-                    child .setBusNet( 'q' , busOMux )
-                    rstCol.setBusNet( 'i0', busOMux )
-                    rstCol.setCmdNet( 'i1', self.getNet('nrst') )
-                    rstCol.setBusNet( 'q' , busDato )
-                    self.rootGroup.group( rstCol )
-            self.buildDecoder()
+                    if child.insts[0].getMasterCell() == self.mx3Cell:
+                        rstCol = Column( self
+                                       , af.getCell( 'a2_x2', CRL.Catalog.State.Views )
+                                       , 'omux_n_rst'
+                                       , '_byte{byte}_{bbit}'
+                                       , 32 )
+                        busOMux = Bus( self, child.tag+'_q({})', 32 )
+                        busDato = Bus( self, 'dato({})', 32 )
+                        child .setBusNet( 'q' , busOMux )
+                        rstCol.setBusNet( 'i0', busOMux )
+                        rstCol.setCmdNet( 'i1', self.getNet('n_rst') )
+                        rstCol.setBusNet( 'q' , busDato )
+                        self.rootGroup.group( rstCol )
+                        break
+            omuxRoot = self.rootGroup.findChild( 'omux_0_to_255' )
+            rstCol = Column( self
+                           , self.no2Cell
+                           , 'omux_rst'
+                           , '_byte{byte}_{bbit}'
+                           , 32 )
+            busOMux = Bus( self, 'omux_0_to_255_nq({})', 32 )
+            busDato = Bus( self, 'dato({})', 32 )
+            omuxRoot.setBusNet( 'nq', busOMux )
+            rstCol.setBusNet( 'i0', busOMux )
+            rstCol.setCmdNet( 'i1', self.getNet('rst') )
+            rstCol.setBusNet( 'nq', busDato )
+            omuxRoot.parent.group( rstCol, after=omuxRoot )
            af.saveCell( self.cell, CRL.Catalog.State.Logical )

-    def _doMux4 ( self, childs, muxDepth ):
+    def _buildOutputMux_nao23 ( self ):
+        """
+        Build the complete output mux based on successive layers of NAND2
+        then NOR2. More compact than the mux based version.
+
+        Use an "askew" tree to minimize wiring.
+        """
+        muxDepth = 0
+        levels   = [ [], ]
+        for addr in range(256):
+            oneHotName = 'rdecod_' + self._getDecodNetName( addr, 8 ).replace('_n_','_')
+            tag        = SRAM_256x32.BIT_GROUP_FMT.format( addr )
+            bitGroup   = self.rootGroup.findChild(tag)
+            bitGroup.unGroup()
+            tag = 'sel_' + tag[:-2]
+            nand2Col  = Column( self
+                              , self.na2Cell
+                              , tag
+                              , '_byte{byte}_{bbit}'
+                              , 32 )
+            nand2Col.setCmdNet( 'i0', self.getNet( oneHotName ))
+            busDff = self.getBus( 'bit_addr{:04d}_q({{}})'.format(addr) )
+            nand2Col.setBusNet( 'i1', busDff )
+            bitGroup.group( nand2Col )
+            levels[0].append( (bitGroup, nand2Col ) )
+        while len(levels[muxDepth]) > 1:
+            levels.append( [] )
+            childIndex = 1 if muxDepth else 2
+            for i in range(len( levels[muxDepth]) // 2 ):
+                naoCell     = self.no2Cell if muxDepth%2 else self.na2Cell
+                childs      = [ levels[muxDepth][i*2][0], levels[muxDepth][i*2+1][0] ]
+                leftRoot    = levels[muxDepth][i*2    ][1]
+                rightRoot   = levels[muxDepth][i*2 + 1][1]
+                tags        = [ childs[0].tag, childs[1].tag ]
+                naoTag      = SRAM_256x32._mergeOMuxTags( tags )
+                naoGroup    = ColGroup( naoTag+'_g' )
+                trace( 610, ',+', '\tSRAM_256x32._buildOutputmux() {} + {} -> {}\n' \
+                                  .format( tags[0], tags[1], naoTag ))
+                nao2Col  = Column( self
+                                 , naoCell
+                                 , naoTag
+                                 , '_byte{byte}_{bbit}'
+                                 , 32 )
+                naoGroup.group( childs[0] )
+                naoGroup.group( childs[1] )
+                bus0   = Bus( self, tags[0][:-2]+'_nq({})', 32 )
+                bus1   = Bus( self, tags[1][:-2]+'_nq({})', 32 )
+                busNao = Bus( self, naoTag+'_nq({})', 32 )
+                leftRoot .setBusNet( 'nq', bus0 )
+                rightRoot.setBusNet( 'nq', bus1 )
+                nao2Col.setBusNet( 'i0', bus0 )
+                nao2Col.setBusNet( 'i1', bus1 )
+                childs[1].reverse()
+                rightRoot.parent.group( nao2Col, before=rightRoot )
+                trace( 610, '\tInsert mux {} before {}\n'.format( nao2Col, rightRoot.parent ))
+                levels[muxDepth+1].append( (naoGroup, nao2Col) )
+                trace( 610, '-,' )
+            muxDepth += 1
+        self.rootGroup.group( levels[muxDepth][0][0] )
+
+    def _buildOutputMux_mx23 ( self ):
+        """
+        Build the complete output mux based on successive layers of mux4,
+        each mux4 being built upon a mux2 + mux3 (_doMux4_mx23).
+        """
+        omuxGroupsCurr = []
+        omuxGroupsNext = []
+        muxDepth       = 0
+        for i in range(256//4):
+            childs = []
+            for addr in range(i*4, (i+1)*4):
+                tag = SRAM_256x32.BIT_GROUP_FMT.format( addr )
+                childs.append( self.rootGroup.findChild( tag ))
+                childs[-1].unGroup()
+            omuxGroupsCurr.append( self._doMux4_mx23( childs, muxDepth ))
+        while len(omuxGroupsCurr) >= 4:
+            trace( 610, '\tGrouping {} elements.\n'.format( len(omuxGroupsCurr )))
+            muxDepth += 1
+            for i in range(len(omuxGroupsCurr)//4):
+                omuxGroupsNext.append( self._doMux4_mx23( omuxGroupsCurr[i*4:(i+1)*4], muxDepth ))
+            omuxGroupsCurr = omuxGroupsNext
+            omuxGroupsNext = []
+        for group in omuxGroupsCurr:
+            self.rootGroup.group( group )
+
+    def _doMux4_mx23 ( self, childs, muxDepth ):
        """
        Build a 4 entry mux. It uses a mux2 / mux3 combination.
        Returns a newly build group.
@ -276,7 +402,7 @@ class SRAM_256x32 ( BaseSRAM ):
        mux2Tag    = SRAM_256x32._mergeOMuxTags( tags[0:2] )
        mux3Tag    = SRAM_256x32._mergeOMuxTags( tags )
        muxGroup   = ColGroup( muxTag+'_g' )
-        trace( 610, ',+', '\tSRAM_256x32._doMux4() {} + {} -> {}\n' \
+        trace( 610, ',+', '\tSRAM_256x32._doMux4_mx23() {} + {} -> {}\n' \
                          .format( mux2Tag, mux3Tag, muxTag ))
        mux2Col  = Column( self
                         , self.mx2Cell
@ -297,11 +423,11 @@ class SRAM_256x32 ( BaseSRAM ):
        muxGroup.group( childs[2] )
        muxGroup.group( mux3Col )
        muxGroup.group( childs[3] )
-        bus0   = Bus( self, tags[0][:-2]+'_b_q({})', 32 )
-        bus1   = Bus( self, tags[1][:-2]+'_b_q({})', 32 )
-        bus2   = Bus( self, tags[2][:-2]+'_b_q({})', 32 )
-        bus3   = Bus( self, tags[3][:-2]+'_b_q({})', 32 )
-        busMx2 = Bus( self, mux2Tag+'_b_q({})', 32 )
+        bus0   = Bus( self, tags[0][:-2]+'_q({})', 32 )
+        bus1   = Bus( self, tags[1][:-2]+'_q({})', 32 )
+        bus2   = Bus( self, tags[2][:-2]+'_q({})', 32 )
+        bus3   = Bus( self, tags[3][:-2]+'_q({})', 32 )
+        busMx2 = Bus( self, mux2Tag+'_q({})', 32 )
        childs[0].childs[ childIndex ].setBusNet( 'q', bus0 )
        childs[1].childs[ childIndex ].setBusNet( 'q', bus1 )
        childs[2].childs[ childIndex ].setBusNet( 'q', bus2 )
@ -331,8 +457,10 @@ class SRAM_256x32 ( BaseSRAM ):
        addrs    = []
        for tag in tags:
            end = -2 if tag.endswith('_g') else 0
-            if tag.startswith('bit'):
+            if tag.startswith('bit_addr'):
                addrs.append( int( tag[8:end] ))
+            if tag.startswith('sel_bit_addr'):
+                addrs.append( int( tag[12:end] ))
            elif tag.startswith('omux'):
                m = vectorRe.match( tag )
                addrs += [ int(m.group('lsb')), int(m.group('msb')) ]
@ -397,14 +525,14 @@ class SRAM_256x32 ( BaseSRAM ):
        if addrWidth == 2:
            indexFirstBit = (oneHot >> addrWidth) * addrWidth
            valueAddr     =  oneHot % (1 << addrWidth)
-            trunkName     = self._getDecodNetName( oneHot, addrWidth )
-            instConf.append( 'a2_x2' )
-            instConf.append( 'decod_a2_{}'.format( trunkName ))
+            trunkName     = 'n_'+self._getDecodNetName( oneHot, addrWidth )
+            instConf.append( self.confLib.getStdCellName('na2_x1') )
+            instConf.append( 'decod_nand2_{}'.format( trunkName ))
            instConf.append( {} )
            for i in range(2):
                inv  = '' if (valueAddr & (1 << i)) else 'n_'
                instConf[2][ 'i{}'.format(i) ] = '{}addr({})'.format( inv, indexFirstBit+i )
-            instConf[2][ 'q' ] = 'decod_'+trunkName
+            instConf[2][ 'nq' ] = 'decod_'+trunkName
        elif addrWidth == 4 or addrWidth == 8:
            halfWidth = addrWidth>>1
            halfMask  = 0
@ -413,15 +541,16 @@ class SRAM_256x32 ( BaseSRAM ):
            indexFirstBit = (oneHot >> addrWidth) * addrWidth
            valueAddr     =  oneHot % (1 << addrWidth)
            trunkName     = self._getDecodNetName( oneHot, addrWidth )
-            instConf.append( 'a2_x2' )
-            instConf.append( 'decod_a2_{}'.format( trunkName ))
+            gate          = 'no2_x1' if addrWidth == 4 else 'na2_x1'
+            instConf.append( self.confLib.getStdCellName(gate) )
+            instConf.append( 'decod_{}_{}'.format( gate[:-3], trunkName ))
            instConf.append( {} )
            offset  = (oneHot >> addrWidth) << (halfWidth+1)
            oneHot0 = (oneHot & halfMask) + offset
-            instConf[2][ 'i0' ] = 'decod_'+self._getDecodNetName( oneHot0, halfWidth )
+            instConf[2][ 'i0' ] = 'decod_n_'+self._getDecodNetName( oneHot0, halfWidth )
            oneHot1 = ((oneHot >> halfWidth) & halfMask) + (1<<(halfWidth)) + offset
-            instConf[2][ 'i1' ] = 'decod_'+self._getDecodNetName( oneHot1, halfWidth )
-            instConf[2][ 'q'  ] = 'decod_'+trunkName
+            instConf[2][ 'i1' ] = 'decod_n_'+self._getDecodNetName( oneHot1, halfWidth )
+            instConf[2][ 'nq' ] = 'decod_n_'+trunkName
            trace( 610, '\t{:08b} {:3d}:{} + {:3d}:{} => {:3d}::{:08b}:{}\n' \
                        .format( halfMask
                               , oneHot0, self._getDecodNetName( oneHot0, halfWidth )
@ -449,6 +578,14 @@ class SRAM_256x32 ( BaseSRAM ):
                                   )
            self.decoder.addInstance( bit * 4, inst )
            self.connect( 'raddr_sff_{}'.format(bit), 'ck', 'clk' )
+        for we in range(4):
+            inst = self.addInstance( 'inv_x1'
+                                   , 'decod_n_we_{}'.format(we)
+                                   , {  'i' :   'we({})'.format(we)
+                                     , 'nq' : 'n_we({})'.format(we)
+                                     }
+                                   )
+            self.decoder.addInstance( we*4 + 1, inst )
        for bit in range(8):
            inst = self.addInstance( 'inv_x1'
                                   , 'decod_inv_{}'.format(bit)
@ -460,6 +597,7 @@ class SRAM_256x32 ( BaseSRAM ):
        for oneHot in range(16):
            trace( 610, '\t{}\n'.format( self._getDecodNetName(oneHot,2) ))
            instDatas = self._getDecodInstConf( oneHot, 2 )
+            print( instDatas )
            inst = self.addInstance( instDatas[0], instDatas[1], instDatas[2] )
            self.decoder.addInstance( oneHot*2 + 1, inst )
        for oneHot in range(32):
@ -473,21 +611,38 @@ class SRAM_256x32 ( BaseSRAM ):
            inst      = self.addInstance( instDatas[0], instDatas[1], instDatas[2] )
            dffCol    = self.rootGroup.findChild( bitTag )
            imuxCol   = self.rootGroup.findChild( imuxTag )
-            self.toHeaders.append(( inst, imuxCol.insts[0] ))
+            self.toHeaders.append(( inst, imuxCol.insts[0], 0 ))
            for we in range(4):
                cmdNetName = 'decod_addr{:04d}_we({})'.format( oneHot, we )
-                inst = self.addInstance( 'a3_x2'
-                                       , 'decod_a3_we_{}_{}'.format(we,oneHot)
-                                       , { 'i0' : instDatas[2]['q']
-                                         , 'i1' : 'ce'
-                                         , 'i2' : 'we({})'.format(we)
-                                         , 'q'  : cmdNetName
+                inst = self.addInstance( self.confLib.getStdCellName('no3_x1')
+                                       , 'decod_no3_we_{}_{}'.format(we,oneHot)
+                                       , { 'i0' : instDatas[2]['nq']
+                                         , 'i1' : 'n_ce'
+                                         , 'i2' : 'n_we({})'.format(we)
+                                         , 'nq' : cmdNetName
                                         }
                                       )
-                self.toHeaders.append(( inst, dffCol.insts[0] ))
+                self.toHeaders.append(( inst, imuxCol.insts[0], 0 ))
                for bit in range(8):
                    self.connect( 'imux_addr{:04d}_byte{byte}_{bbit}'.format( oneHot, byte=we, bbit=bit )
                                , 'cmd'
                                , cmdNetName
                                )
+            oneHotName = instDatas[2]['nq'].replace('_n_','_')
+            inst = self.addInstance( 'inv_x1'
+                                   , 'omux_onehot_inv_{:04d}'.format(oneHot)
+                                   , {  'i' : instDatas[2]['nq']
+                                     , 'nq' : oneHotName
+                                     }
+                                   )
+            self.toHeaders.append(( inst, imuxCol.insts[0], 0))
+            sffName = 'omux_onehot_dff_{:04d}'.format(oneHot)
+            inst = self.addInstance( 'sff1_x4'
+                                   , sffName
+                                   , { 'i' :     oneHotName
+                                     , 'q' : 'r'+oneHotName
+                                     }
+                                   )
+            self.connect( sffName, 'ck', 'clk' )
+            self.toHeaders.append(( inst, imuxCol.insts[0], 1 ))
        trace( 610, '-,' )