// This file describes the third of three pattern matcher setups that
//   form the `xilinx_dsp` pass described in xilinx_dsp.cc
// At a high level, it works as follows:
//   (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
//       (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
//       use the 'PCOUT' port
//   (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
//         (b) has its Z multiplexer output set to the 'C' port, which is
//         driven by the 'P' output of the previous DSP cell, and (c) has its
//         'PCIN' port unused
//   (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
//         previous DSP cell right-shifted by 17 bits
//   (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
//       if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
//       DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
//       have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
//       use its ACOUT port, then examine if an ACOUT -> ACIN cascade
//       opportunity exists by matching for a $dff-with-optional-clock-enable-
//       or-reset and checking that the 'D' input of this register is the same
//       as the 'A' input of the previous DSP
//   (4) Same as (3) but for BCOUT -> BCIN cascade
//   (5) Recursively go to (2.1) until no more matches possible, keeping track
//       of the longest possible chain found
//   (6) The longest chain is then divided into chunks of no more than
//       MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
//       height of a DSP column) with each DSP in each chunk being rewritten
//       to use [ABP]COUT -> [ABP]CIN cascading as appropriate
// Notes:
//  - Currently, [AB]COUT -> [AB]CIN cascades (3 or 4) are only considered
//    if a PCOUT -> PCIN cascade (2.1 or 2.2) is first identified; this need
//    not be the case --- [AB] cascades can exist independently of a P cascade
//    (though all three cascades must come from the same DSP). This situation
//    is not handled currently.
//  - In addition, [AB]COUT -> [AB]CIN cascades (3 or 4) are currently
//    conservative in that they examine the situation where (a) the previous
//    DSP has [AB]2REG or [AB]1REG enabled, (b) that the downstream DSP has no
//    registers enabled, and (c) that there exists only one additional register
//    between the upstream and downstream DSPs. This can certainly be relaxed
//    to identify situations ranging from (i) neither DSP uses any registers,
//    to (ii) upstream DSP has 2 registers, downstream DSP has 2 registers, and
//    there exists a further 2 registers between them. This remains a TODO
//    item.

pattern xilinx_dsp_cascade

// Helper that strips redundant sign/zero-extension bits from a signal
// (assigned in subpattern 'tail' once a candidate 'next' cell exists)
udata <std::function<SigSpec(const SigSpec&)>> unextend
// Each chain entry is (cell, P, AREG, BREG):
//   P    : -1 = no P cascade into this cell, 0 = plain, 17 = shifted by 17
//   AREG : -1 = no A cascade into this cell, otherwise the ACASCREG value
//   BREG : -1 = no B cascade into this cell, otherwise the BCASCREG value
udata <vector<std::tuple<Cell*,int,int,int>>> chain longest_chain
state <Cell*> next
state <SigSpec> clock
state <int> AREG BREG

// Variables used for subpatterns
state <SigSpec> argQ argD
state <bool> ffcepol ffrstpol
state <int> ffoffset
udata <SigSpec> dffD dffQ
udata <SigBit> dffclock
udata <Cell*> dff dffcemux dffrstmux
udata <bool> dffcepol dffrstpol

code
#define MAX_DSP_CASCADE 20
endcode

// (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
//     (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
//     use the 'PCOUT' port
match first
	select first->type.in(\DSP48E1)
	select port(first, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("000")
	select nusers(port(first, \PCOUT, SigSpec())) <= 1
endmatch

// (6) The longest chain is then divided into chunks of no more than
//     MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
//     height of a DSP column) with each DSP in each chunk being rewritten
//     to use [ABP]COUT -> [ABP]CIN cascading as appropriate
code
	longest_chain.clear();
	// Seed the chain with the head DSP; it has no incoming cascade
	chain.emplace_back(first, -1, -1, -1);
	subpattern(tail);
finally
	chain.pop_back();
	log_assert(chain.empty());
	if (GetSize(longest_chain) > 1) {
		// 'dsp' is always the upstream cell of the link being rewritten
		Cell *dsp = std::get<0>(longest_chain.front());

		Cell *dsp_pcin;
		int P, AREG, BREG;
		for (int i = 1; i < GetSize(longest_chain); i++) {
			std::tie(dsp_pcin,P,AREG,BREG) = longest_chain[i];

			// Every MAX_DSP_CASCADE-th link is deliberately left
			// uncascaded, which splits the chain into chunks
			if (i % MAX_DSP_CASCADE > 0) {
				if (P >= 0) {
					Wire *cascade = module->addWire(NEW_ID, 48);
					dsp_pcin->setPort(ID(C), Const(0, 48));
					dsp_pcin->setPort(ID(PCIN), cascade);
					dsp->setPort(ID(PCOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					// Rewrite OPMODE[6:4]: 3'b101 for the cascade
					// shifted by 17 bits, 3'b001 for the plain one
					SigSpec opmode = port(dsp_pcin, \OPMODE, Const(0, 7));
					if (P == 17)
						opmode[6] = State::S1;
					else if (P == 0)
						opmode[6] = State::S0;
					else log_abort();
					opmode[5] = State::S0;
					opmode[4] = State::S1;
					dsp_pcin->setPort(\OPMODE, opmode);

					log_debug("PCOUT -> PCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
				if (AREG >= 0) {
					Wire *cascade = module->addWire(NEW_ID, 30);
					dsp_pcin->setPort(ID(A), Const(0, 30));
					dsp_pcin->setPort(ID(ACIN), cascade);
					dsp->setPort(ID(ACOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					dsp->setParam(ID(ACASCREG), AREG);
					dsp_pcin->setParam(ID(A_INPUT), Const("CASCADE"));

					log_debug("ACOUT -> ACIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
				if (BREG >= 0) {
					Wire *cascade = module->addWire(NEW_ID, 18);
					dsp_pcin->setPort(ID(B), Const(0, 18));
					dsp_pcin->setPort(ID(BCIN), cascade);
					dsp->setPort(ID(BCOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					dsp->setParam(ID(BCASCREG), BREG);
					dsp_pcin->setParam(ID(B_INPUT), Const("CASCADE"));

					log_debug("BCOUT -> BCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
			}
			else {
				log_debug("  Blocking %s -> %s cascade (exceeds max: %d)\n", log_id(dsp), log_id(dsp_pcin), MAX_DSP_CASCADE);
			}

			dsp = dsp_pcin;
		}

		accept;
	}
endcode

// ------------------------------------------------------------------

subpattern tail
arg first
arg next

// (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
//       (b) has its Z multiplexer output set to the 'C' port, which is
//       driven by the 'P' output of the previous DSP cell, and (c) has its
//       'PCIN' port unused
match nextP
	select nextP->type.in(\DSP48E1)
	select !param(nextP, \CREG, State::S1).as_bool()
	select port(nextP, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011")
	select nusers(port(nextP, \C, SigSpec())) > 1
	select nusers(port(nextP, \PCIN, SigSpec())) == 0
	index <SigBit> port(nextP, \C)[0] === port(std::get<0>(chain.back()), \P)[0]
	semioptional
endmatch

// (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
//       previous DSP cell right-shifted by 17 bits
match nextP_shift17
	if !nextP
	select nextP_shift17->type.in(\DSP48E1)
	select !param(nextP_shift17, \CREG, State::S1).as_bool()
	select port(nextP_shift17, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011")
	select nusers(port(nextP_shift17, \C, SigSpec())) > 1
	select nusers(port(nextP_shift17, \PCIN, SigSpec())) == 0
	index <SigBit> port(nextP_shift17, \C)[0] === port(std::get<0>(chain.back()), \P)[17]
	semioptional
endmatch

code next
	// Prefer the unshifted match; fall back to the shifted one
	next = nextP;
	if (!nextP)
		next = nextP_shift17;
	if (next) {
		unextend = [](const SigSpec &sig) {
			int i;
			// Walk down from the MSB while adjacent bits are identical
			for (i = GetSize(sig)-1; i > 0; i--)
				if (sig[i] != sig[i-1])
					break;
			// Do not remove non-const sign bit
			if (sig[i].wire)
				++i;
			return sig.extract(0, i);
		};
	}
endcode

// (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
//     if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
//     DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
//     have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
//     use its ACOUT port, then examine if an ACOUT -> ACIN cascade
//     opportunity exists by matching for a $dff-with-optional-clock-enable-
//     or-reset and checking that the 'D' input of this register is the same
//     as the 'A' input of the previous DSP
// NOTE(review): the condition below requires AREG > 0 on *both* DSPs, which
//     does not agree with (b) above --- confirm which one is intended
code argQ clock AREG
	AREG = -1;
	if (next) {
		Cell *prev = std::get<0>(chain.back());
		// The ACIN check keeps this test in step with the BCIN check in (4):
		// never cascade into a DSP whose ACIN is already driven
		if (param(prev, \AREG, 2).as_int() > 0 &&
				param(next, \AREG, 2).as_int() > 0 &&
				param(next, \A_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
				port(next, \ACIN, SigSpec()).is_fully_zero() &&
				nusers(port(prev, \ACOUT, SigSpec())) <= 1) {
			argQ = unextend(port(next, \A));
			clock = port(prev, \CLK);
			subpattern(in_dffe);
			if (dff) {
				// The register's reset and enable must match the
				// previous DSP's RSTA and CEA2 (an absent mux is only
				// accepted when the corresponding port is constant 0)
				if (!dffrstmux && port(prev, \RSTA, State::S0) != State::S0)
					goto reject_AREG;
				if (dffrstmux && port(dffrstmux, \S) != port(prev, \RSTA, State::S0))
					goto reject_AREG;
				if (!dffcemux && port(prev, \CEA2, State::S0) != State::S0)
					goto reject_AREG;
				if (dffcemux && port(dffcemux, \S) != port(prev, \CEA2, State::S0))
					goto reject_AREG;
				if (dffD == unextend(port(prev, \A)))
					AREG = 1;
reject_AREG:			;
			}
		}
	}
endcode

// (4) Same as (3) but for BCOUT -> BCIN cascade
code argQ clock BREG
	BREG = -1;
	if (next) {
		Cell *prev = std::get<0>(chain.back());
		if (param(prev, \BREG, 2).as_int() > 0 &&
				param(next, \BREG, 2).as_int() > 0 &&
				param(next, \B_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
				port(next, \BCIN, SigSpec()).is_fully_zero() &&
				nusers(port(prev, \BCOUT, SigSpec())) <= 1) {
			argQ = unextend(port(next, \B));
			clock = port(prev, \CLK);
			subpattern(in_dffe);
			if (dff) {
				// Same reset/enable compatibility rules as in (3),
				// using RSTB and CEB2
				if (!dffrstmux && port(prev, \RSTB, State::S0) != State::S0)
					goto reject_BREG;
				if (dffrstmux && port(dffrstmux, \S) != port(prev, \RSTB, State::S0))
					goto reject_BREG;
				if (!dffcemux && port(prev, \CEB2, State::S0) != State::S0)
					goto reject_BREG;
				if (dffcemux && port(dffcemux, \S) != port(prev, \CEB2, State::S0))
					goto reject_BREG;
				if (dffD == unextend(port(prev, \B)))
					BREG = 1;
reject_BREG:			;
			}
		}
	}
endcode

// (5) Recursively go to (2.1) until no more matches possible, recording the
//     longest possible chain
code
	if (next) {
		chain.emplace_back(next, nextP_shift17 ? 17 : nextP ? 0 : -1, AREG, BREG);

		SigSpec sigC = unextend(port(next, \C));

		// NOTE(review): chain.back() now refers to 'next' (just pushed),
		//   so the 'P' compared against sigC below is next's own output
		//   --- confirm that this is the intended cell
		if (nextP_shift17) {
			if (GetSize(sigC)+17 <= GetSize(port(std::get<0>(chain.back()), \P)) &&
					port(std::get<0>(chain.back()), \P).extract(17, GetSize(sigC)) != sigC)
				subpattern(tail);
		}
		else {
			if (GetSize(sigC) <= GetSize(port(std::get<0>(chain.back()), \P)) &&
					port(std::get<0>(chain.back()), \P).extract(0, GetSize(sigC)) != sigC)
				subpattern(tail);

		}
	} else {
		// Dead end: remember this chain if it beats the best one so far
		if (GetSize(chain) > GetSize(longest_chain))
			longest_chain = chain;
	}
finally
	if (next)
		chain.pop_back();
endcode

// #######################

// Subpattern for matching against input registers, based on knowledge of the
//   'Q' input. Typically, identifying registers with clock-enable and reset
//   capability is a task that would be handled by other Yosys passes such as
//   dff2dffe, but since DSP inference happens much before this, these patterns
//   have to be manually identified.
// At a high level:
//   (1) Starting from a $dff cell that (partially or fully) drives the given
//       'Q' argument
//   (2) Match for a $mux cell implementing synchronous reset semantics ---
//       one that exclusively drives the 'D' input of the $dff, with one of its
//       $mux inputs being fully zero
//   (3) Match for a $mux cell implementing clock enable semantics --- one that
//       exclusively drives the 'D' input of the $dff (or the other input of
//       the reset $mux) and where one of this $mux's inputs is connected to
//       the 'Q' output of the $dff
//
// Inputs:  argQ (signal to trace back), clock (required clock, or an empty
//          SigBit to accept any clock)
// Outputs: dff (nullptr when nothing matched), dffD, dffclock, and the
//          optional dffrstmux/dffrstpol and dffcemux/dffcepol
subpattern in_dffe
arg argD argQ clock

code
	dff = nullptr;
	for (const auto &c : argQ.chunks()) {
		// Abandon matches when 'Q' is a constant
		if (!c.wire)
			reject;
		// Abandon matches when 'Q' has the keep attribute set
		if (c.wire->get_bool_attribute(\keep))
			reject;
		// Abandon matches when 'Q' has a non-zero init attribute set
		// (not supported by DSP48E1)
		Const init = c.wire->attributes.at(\init, Const());
		for (auto b : init.extract(c.offset, c.width))
			if (b != State::Sx && b != State::S0)
				reject;
	}
endcode

// (1) Starting from a $dff cell that (partially or fully) drives the given
//     'Q' argument
match ff
	select ff->type.in($dff)
	// DSP48E1 does not support clock inversion
	select param(ff, \CLK_POLARITY).as_bool()

	slice offset GetSize(port(ff, \D))
	index <SigBit> port(ff, \Q)[offset] === argQ[0]

	// Check that the rest of argQ is present
	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

	filter clock == SigBit() || port(ff, \CLK) == clock

	set ffoffset offset
endmatch

code argQ argD
	SigSpec Q = port(ff, \Q);
	dff = ff;
	dffclock = port(ff, \CLK);
	// Map the requested Q bits onto the matching D bits of the register
	dffD = argQ;
	argD = port(ff, \D);
	argQ = Q;
	dffD.replace(argQ, argD);
	// Only search for ffrstmux if dffD only
	//   has two (ff, ffrstmux) users
	if (nusers(dffD) > 2)
		argD = SigSpec();
endcode

// (2) Match for a $mux cell implementing synchronous reset semantics ---
//     exclusively drives the 'D' input of the $dff, with one of the $mux
//     inputs being fully zero
match ffrstmux
	if !argD.empty()
	select ffrstmux->type.in($mux)
	index <SigSpec> port(ffrstmux, \Y) === argD

	choice <IdString> BA {\B, \A}
	// DSP48E1 only supports reset to zero
	select port(ffrstmux, BA).is_fully_zero()

	define <bool> pol (BA == \B)
	set ffrstpol pol

	semioptional
endmatch

code argD
	if (ffrstmux) {
		dffrstmux = ffrstmux;
		dffrstpol = ffrstpol;
		// Continue the search from the non-zero (data) input of the mux
		argD = port(ffrstmux, ffrstpol ? \A : \B);
		dffD.replace(port(ffrstmux, \Y), argD);

		// Only search for ffcemux if argQ has at
		//   least 3 users (ff, ffrstmux and one further user) and
		//   dffD only has two (ff, ffrstmux)
		if (!(nusers(argQ) >= 3 && nusers(dffD) == 2))
			argD = SigSpec();
	}
	else
		dffrstmux = nullptr;
endcode

// (3) Match for a $mux cell implementing clock enable semantics --- one that
//     exclusively drives the 'D' input of the $dff (or the other input of
//     the reset $mux) and where one of this $mux's inputs is connected to
//     the 'Q' output of the $dff
match ffcemux
	if !argD.empty()
	select ffcemux->type.in($mux)
	index <SigSpec> port(ffcemux, \Y) === argD

	choice <IdString> AB {\A, \B}
	index <SigSpec> port(ffcemux, AB) === argQ
	define <bool> pol (AB == \A)
	set ffcepol pol

	semioptional
endmatch

code argD
	if (ffcemux) {
		dffcemux = ffcemux;
		dffcepol = ffcepol;
		// The mux input that is not fed back from Q carries the new data
		argD = port(ffcemux, ffcepol ? \B : \A);
		dffD.replace(port(ffcemux, \Y), argD);
	}
	else
		dffcemux = nullptr;
endcode