// This file describes the third of three pattern matcher setups that
//   forms the `xilinx_dsp` pass described in xilinx_dsp.cc
// At a high level, it works as follows:
//   (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
//       (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
//       use the 'PCOUT' port
//   (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
//         (b) has its Z multiplexer output set to the 'C' port, which is
//         driven by the 'P' output of the previous DSP cell, and (c) has its
//         'PCIN' port unused
//   (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
//         previous DSP cell right-shifted by 17 bits
//   (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
//       if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
//       DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
//       have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
//       use its ACOUT port, then examine if an ACOUT -> ACIN cascade
//       opportunity exists by matching for a $dff-with-optional-clock-enable-
//       or-reset and checking that the 'D' input of this register is the same
//       as the 'A' input of the previous DSP
//   (4) Same as (3) but for BCOUT -> BCIN cascade
//   (5) Recursively go to (2.1) until no more matches possible, keeping track
//       of the longest possible chain found
//   (6) The longest chain is then divided into chunks of no more than
//       MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
//       height of a DSP column) with each DSP in each chunk being rewritten
//       to use [ABP]COUT -> [ABP]CIN cascading as appropriate
// Notes:
//   - Currently, [AB]COUT -> [AB]CIN cascades (3 or 4) are only considered
//     if a PCOUT -> PCIN cascade (2.1 or 2.2) is first identified; this need
//     not be the case --- [AB] cascades can exist independently of a P cascade
//     (though all three cascades must come from the same DSP). This situation
//     is not handled currently.
//   - In addition, [AB]COUT -> [AB]CIN cascades (3 or 4) are currently
//     conservative in that they examine the situation where (a) the previous
//     DSP has [AB]2REG or [AB]1REG enabled, (b) that the downstream DSP has no
//     registers enabled, and (c) that there exists only one additional register
//     between the upstream and downstream DSPs. This can certainly be relaxed
//     to identify situations ranging from (i) neither DSP uses any registers,
//     to (ii) upstream DSP has 2 registers, downstream DSP has 2 registers, and
//     there exists a further 2 registers between them. This remains a TODO
//     item.

pattern xilinx_dsp_cascade

// udata: storage that the code blocks below manage by hand.
//   unextend      - helper assigned in the 'tail' subpattern; strips redundant
//                   sign/zero extension bits from the top of a signal
//   chain         - DSP cells on the path currently being explored, one tuple
//                   per cell: (cell, P shift [0 or 17, -1 for the first cell],
//                   AREG, BREG [-1 when no A/B cascade was found])
//   longest_chain - the longest 'chain' seen so far
udata <std::function<SigSpec(const SigSpec&)>> unextend
udata <vector<std::tuple<Cell*,int,int,int>>> chain longest_chain
// state: per-match variables.
//   next      - the DSP cell selected to extend the chain (nullptr if none)
//   clock     - clock that a register matched by 'in_dffe' must use
//   AREG/BREG - result of the A/B cascade checks for 'next'
state <Cell*> next
state <SigSpec> clock
state <int> AREG BREG

// Variables used for subpatterns
//   argQ/argD          - signal being traced through the register structure
//   ffcepol/ffrstpol   - polarity found by the enable/reset mux matches
//   ffoffset           - bit offset of argQ within the matched register
//   dff*               - results handed back by 'in_dffe' to its caller
state <SigSpec> argQ argD
state <bool> ffcepol ffrstpol
state <int> ffoffset
udata <SigSpec> dffD dffQ
udata <SigBit> dffclock
udata <Cell*> dff dffcemux dffrstmux
udata <bool> dffcepol dffrstpol

// Upper bound used when splitting the longest chain into chunks (see the
// 'i % MAX_DSP_CASCADE' test in the rewrite code below)
code
#define MAX_DSP_CASCADE 20
endcode

// (1) Starting from a DSP48* cell that (a) has the Z multiplexer
//     (controlled by OPMODE[3:2] for DSP48A*, by OPMODE[6:4] for DSP48E1)
//     set to zero and (b) doesn't already use the 'PCOUT' port
match first
	// Z multiplexer select bits must be all zero: OPMODE[3:2] on DSP48A/DSP48A1,
	// OPMODE[6:4] on DSP48E1 (a missing OPMODE port is treated as all zero)
	select (first->type.in(\DSP48A, \DSP48A1) && port(first, \OPMODE, Const(0, 8)).extract(2,2) == Const::from_string("00")) || (first->type.in(\DSP48E1) && port(first, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("000"))
	// PCOUT (empty if unconnected) may have at most one user
	select nusers(port(first, \PCOUT, SigSpec())) <= 1
endmatch

// (6) The longest chain is then divided into chunks of no more than
//     MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
//     height of a DSP column) with each DSP in each chunk being rewritten
//     to use [ABP]COUT -> [ABP]CIN cascading as appropriate
code
	// Seed the search: the chain starts at `first` (nothing cascades into it,
	// hence the -1 placeholders) and the tail subpattern explores extensions.
	longest_chain.clear();
	chain.emplace_back(first, -1, -1, -1);
	subpattern(tail);
finally
	// Executed after subpattern(tail) has returned: remove the seed entry and,
	// if the longest recorded chain links at least two DSPs, rewrite it.
	chain.pop_back();
	log_assert(chain.empty());
	if (GetSize(longest_chain) > 1) {
		// `dsp` drives the current link, `dsp_pcin` receives it.
		Cell *dsp = std::get<0>(longest_chain.front());

		Cell *dsp_pcin;
		int P, AREG, BREG;
		for (int i = 1; i < GetSize(longest_chain); i++) {
			std::tie(dsp_pcin,P,AREG,BREG) = longest_chain[i];

			// Links whose index is a multiple of MAX_DSP_CASCADE are left
			// untouched, which splits the chain into separate chunks.
			if (i % MAX_DSP_CASCADE > 0) {
				if (P >= 0) {
					// Replace the C connection by a dedicated 48 bit
					// PCOUT -> PCIN wire between the two cells.
					Wire *cascade = module->addWire(NEW_ID, 48);
					dsp_pcin->setPort(ID(C), Const(0, 48));
					dsp_pcin->setPort(ID(PCIN), cascade);
					dsp->setPort(ID(PCOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					// Reprogram the Z multiplexer bits of the receiver:
					// OPMODE[3:2] = 01 on DSP48A/DSP48A1, OPMODE[6:4] = 001
					// (P == 0) or 101 (P == 17) on DSP48E1.
					// NOTE(review): presumably these encodings select PCIN
					// (and PCIN >> 17); confirm against the DSP user guides.
					SigSpec opmode = port(dsp_pcin, \OPMODE, Const(0, 7));
					if (dsp->type.in(\DSP48A, \DSP48A1)) {
						log_assert(P == 0);
						opmode[3] = State::S0;
						opmode[2] = State::S1;
					}
					else if (dsp->type.in(\DSP48E1)) {
						if (P == 17)
							opmode[6] = State::S1;
						else if (P == 0)
							opmode[6] = State::S0;
						else log_abort();

						opmode[5] = State::S0;
						opmode[4] = State::S1;
					}
					dsp_pcin->setPort(\OPMODE, opmode);

					log_debug("PCOUT -> PCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
				if (AREG >= 0) {
					// Same for the A operand: 30 bit ACOUT -> ACIN wire, the
					// direct A input of the receiver is tied to zero.
					Wire *cascade = module->addWire(NEW_ID, 30);
					dsp_pcin->setPort(ID(A), Const(0, 30));
					dsp_pcin->setPort(ID(ACIN), cascade);
					dsp->setPort(ID(ACOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					// AREG (0 or 1, as computed in the tail subpattern) becomes
					// the ACASCREG parameter of the driving DSP48E1.
					if (dsp->type.in(\DSP48E1))
						dsp->setParam(ID(ACASCREG), AREG);
					dsp_pcin->setParam(ID(A_INPUT), Const("CASCADE"));

					log_debug("ACOUT -> ACIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
				if (BREG >= 0) {
					// B operand: 18 bit BCOUT wire; how the receiver is hooked
					// up depends on the DSP family.
					Wire *cascade = module->addWire(NEW_ID, 18);
					if (dsp->type.in(\DSP48A, \DSP48A1)) {
						// According to UG389 p9 [https://www.xilinx.com/support/documentation/user_guides/ug389.pdf]
						// "The DSP48A1 component uses this input when cascading
						//   BCOUT from an adjacent DSP48A1 slice. The tools then
						//   translate BCOUT cascading to the dedicated BCIN input
						//   and set the B_INPUT attribute for implementation."
						dsp_pcin->setPort(ID(B), cascade);
					}
					else {
						dsp_pcin->setPort(ID(B), Const(0, 18));
						dsp_pcin->setPort(ID(BCIN), cascade);
					}
					dsp->setPort(ID(BCOUT), cascade);
					add_siguser(cascade, dsp_pcin);
					add_siguser(cascade, dsp);

					// BCASCREG and B_INPUT are only written for DSP48E1; the
					// quote below explains why B_INPUT is not set on DSP48A1.
					if (dsp->type.in(\DSP48E1)) {
						dsp->setParam(ID(BCASCREG), BREG);
						// According to UG389 p13 [https://www.xilinx.com/support/documentation/user_guides/ug389.pdf]
						// "The attribute is only used by place and route tools and
						//   is not necessary for the users to set for synthesis. The
						//   attribute is determined by the connection to the B port
						//   of the DSP48A1 slice. If the B port is connected to the
						//   BCOUT of another DSP48A1 slice, then the tools automatically
						//   set the attribute to 'CASCADE', otherwise it is set to
						//   'DIRECT'".
						dsp_pcin->setParam(ID(B_INPUT), Const("CASCADE"));
					}

					log_debug("BCOUT -> BCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin));
				}
			}
			else {
				log_debug("  Blocking %s -> %s cascade (exceeds max: %d)\n", log_id(dsp), log_id(dsp_pcin), MAX_DSP_CASCADE);
			}

			// The receiver of this link is the driver of the next one.
			dsp = dsp_pcin;
		}

		accept;
	}
endcode

// ------------------------------------------------------------------

// Tries to extend `chain` by one DSP cell whose C input is fed from the P
// output of the cell at the end of `chain`, evaluates the optional A/B
// cascades for that cell, and recurses. The state variables named on the
// `arg` lines are the ones used to pass arguments into the subpattern.
subpattern tail
arg first
arg next

// (2.1) Match another DSP48* cell that (a) does not have the CREG enabled,
//       (b) has its Z multiplexer output set to the 'C' port, which is
//       driven by the 'P' output of the previous DSP cell, and (c) has its
//       'PCIN' port unused
match nextP
	// CREG must be disabled (a missing CREG parameter counts as enabled)
	select !param(nextP, \CREG, State::S1).as_bool()
	// Z multiplexer select bits: OPMODE[3:2] == 11 on DSP48A/DSP48A1,
	// OPMODE[6:4] == 011 on DSP48E1
	select (nextP->type.in(\DSP48A, \DSP48A1) && port(nextP, \OPMODE, Const(0, 8)).extract(2,2) == Const::from_string("11")) || (nextP->type.in(\DSP48E1) && port(nextP, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011"))
	// C must be connected to something besides this cell, PCIN must be unused
	select nusers(port(nextP, \C, SigSpec())) > 1
	select nusers(port(nextP, \PCIN, SigSpec())) == 0
	// Candidate lookup: bit 0 of C is bit 0 of the P output of the DSP
	// currently at the end of the chain
	index <SigBit> port(nextP, \C)[0] === port(std::get<0>(chain.back()), \P)[0]
	// The following code still runs (with nextP == nullptr) if nothing matches
	semioptional
endmatch

// (2.2) For DSP48E1 only, same as (2.1) but with the 'C' port driven
//       by the 'P' output of the previous DSP cell right-shifted by 17 bits
match nextP_shift17
	// Only attempted when the unshifted match above found nothing
	if !nextP
	// The shifted cascade is only looked for on DSP48E1
	select nextP_shift17->type.in(\DSP48E1)
	// Same CREG / OPMODE[6:4] / C / PCIN requirements as for nextP
	select !param(nextP_shift17, \CREG, State::S1).as_bool()
	select port(nextP_shift17, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011")
	select nusers(port(nextP_shift17, \C, SigSpec())) > 1
	select nusers(port(nextP_shift17, \PCIN, SigSpec())) == 0
	// Candidate lookup: bit 0 of C is bit 17 of the P output of the DSP
	// currently at the end of the chain
	index <SigBit> port(nextP_shift17, \C)[0] === port(std::get<0>(chain.back()), \P)[17]
	// The following code still runs (with nullptr) if nothing matches
	semioptional
endmatch

code next
	// Prefer the unshifted match; fall back to the shift-by-17 match.
	next = nextP ? nextP : nextP_shift17;
	if (next) {
		// A chain never mixes DSP cell types.
		if (next->type != first->type)
			reject;
		// Helper that drops redundant extension bits from the top of a signal.
		unextend = [](const SigSpec &sig) {
			// Walk down from the MSB while a bit merely repeats its neighbour
			int top = GetSize(sig)-1;
			while (top > 0 && sig[top] == sig[top-1])
				top--;
			// Do not remove non-const sign bit
			if (sig[top].wire)
				top++;
			return sig.extract(0, top);
		};
	}
endcode

// (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
//     if (a) this DSP48E1 does not already have an ACOUT -> ACIN cascade,
//     (b) the previous DSP does not already use its ACOUT port, then
//     examine if an ACOUT -> ACIN cascade opportunity exists if
//     (i) A ports are identical, or (ii) separated by a
//     $dff-with-optional-clock-enable-or-reset and checking that the 'D' input
//     of this register is the same as the 'A' input of the previous DSP
//     TODO: Check for two levels of flops, instead of just one
code argQ clock AREG
	// Decide whether `next` can receive its A operand through ACOUT -> ACIN
	// of the DSP at the end of the chain. Result in AREG:
	//   -1  no A cascade
	//    0  the previous DSP does not register A and both cells see the same A
	//    1  the A input of `next` is the A input of the previous DSP delayed by
	//       one register whose reset/enable agree with the previous DSP
	AREG = -1;
	if (next && next->type.in(\DSP48E1)) {
		Cell *prev = std::get<0>(chain.back());

		// `next` must not already be cascaded and ACOUT of `prev` must be free
		if (param(next, \A_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
				port(next, \ACIN, SigSpec()).is_fully_zero() &&
				nusers(port(prev, \ACOUT, SigSpec())) <= 1) {
			if (param(prev, \AREG, 2) == 0) {
				if (port(prev, \A) == port(next, \A))
					AREG = 0;
			}
			else {
				// Look for a register (same clock as `prev`) driving the A
				// input of `next`; in_dffe reports it through the dff* udata.
				argQ = unextend(port(next, \A));
				clock = port(prev, \CLK);
				subpattern(in_dffe);
				if (dff) {
					// No reset mux on the register: RSTA of `prev` must be low
					if (!dffrstmux && port(prev, \RSTA, State::S0) != State::S0)
						goto reject_AREG;
					// Reset mux present: its select must be the RSTA signal and
					// must be active high. An active-low select (dffrstpol
					// false) is not equivalent to RSTA even if the wires match.
					if (dffrstmux && (!dffrstpol || port(dffrstmux, \S) != port(prev, \RSTA, State::S0)))
						goto reject_AREG;
					// Pick the clock enable port to compare against, depending
					// on the AREG setting of `prev`
					IdString CEA;
					if (param(prev, \AREG, 2) == 1)
						CEA = \CEA2;
					else if (param(prev, \AREG, 2) == 2)
						CEA = \CEA1;
					else log_abort();
					// No enable mux on the register: the enable of `prev` must
					// be constant high
					if (!dffcemux && port(prev, CEA, State::S0) != State::S1)
						goto reject_AREG;
					// Enable mux present: same signal and active high only
					// (dffcepol false means the select holds the value when high)
					if (dffcemux && (!dffcepol || port(dffcemux, \S) != port(prev, CEA, State::S0)))
						goto reject_AREG;
					// Finally the register input must be the A input of `prev`
					if (dffD == unextend(port(prev, \A)))
						AREG = 1;
				}
			}
		}
reject_AREG:	;
	}
endcode

// (4) Same as (3) but for BCOUT -> BCIN cascade
code argQ clock BREG
	// Decide whether `next` can receive its B operand through BCOUT -> BCIN
	// of the DSP at the end of the chain. Result in BREG:
	//   -1  no B cascade
	//    0  the previous DSP does not register B and both cells see the same B
	//    1  the B input of `next` is the B input of the previous DSP delayed by
	//       one register whose reset/enable agree with the previous DSP
	BREG = -1;
	if (next) {
		Cell *prev = std::get<0>(chain.back());
		// `next` must not already be cascaded and BCOUT of `prev` must be free
		if (param(next, \B_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
				port(next, \BCIN, SigSpec()).is_fully_zero() &&
				nusers(port(prev, \BCOUT, SigSpec())) <= 1) {
			// Unregistered B on `prev`: B0REG and B1REG both zero on
			// DSP48A/DSP48A1, BREG zero on DSP48E1
			if ((next->type.in(\DSP48A, \DSP48A1) && param(prev, \B0REG, 0) == 0 && param(prev, \B1REG, 1) == 0) ||
				(next->type.in(\DSP48E1) && param(prev, \BREG, 2) == 0)) {
				if (port(prev, \B) == port(next, \B))
					BREG = 0;
			}
			else {
				// Look for a register (same clock as `prev`) driving the B
				// input of `next`; in_dffe reports it through the dff* udata.
				argQ = unextend(port(next, \B));
				clock = port(prev, \CLK);
				subpattern(in_dffe);
				if (dff) {
					// No reset mux on the register: RSTB of `prev` must be low
					if (!dffrstmux && port(prev, \RSTB, State::S0) != State::S0)
						goto reject_BREG;
					// Reset mux present: its select must be the RSTB signal and
					// must be active high. An active-low select (dffrstpol
					// false) is not equivalent to RSTB even if the wires match.
					if (dffrstmux && (!dffrstpol || port(dffrstmux, \S) != port(prev, \RSTB, State::S0)))
						goto reject_BREG;
					// Pick the clock enable port to compare against
					IdString CEB;
					if (next->type.in(\DSP48A, \DSP48A1))
						CEB = \CEB;
					else if (next->type.in(\DSP48E1)) {
						if (param(prev, \BREG, 2) == 1)
							CEB = \CEB2;
						else if (param(prev, \BREG, 2) == 2)
							CEB = \CEB1;
						else log_abort();
					}
					else log_abort();
					// No enable mux on the register: the enable of `prev` must
					// be constant high
					if (!dffcemux && port(prev, CEB, State::S0) != State::S1)
						goto reject_BREG;
					// Enable mux present: same signal and active high only
					// (dffcepol false means the select holds the value when high)
					if (dffcemux && (!dffcepol || port(dffcemux, \S) != port(prev, CEB, State::S0)))
						goto reject_BREG;
					// Finally the register input must be the B input of `prev`
					if (dffD == unextend(port(prev, \B))) {
						// On DSP48A/DSP48A1 the cascade is refused when B0REG
						// of `prev` is enabled
						if (next->type.in(\DSP48A, \DSP48A1) && param(prev, \B0REG, 0) != 0)
							goto reject_BREG;
						BREG = 1;
					}
				}
			}
		}
reject_BREG:	;
	}
endcode

// (5) Recursively go to (2.1) until no more matches possible, recording the
//     longest possible chain
code
	if (next) {
		// Append `next` together with how it is reached: P shift (17, 0, or -1)
		// and the A/B cascade results computed above.
		chain.emplace_back(next, nextP_shift17 ? 17 : nextP ? 0 : -1, AREG, BREG);

		SigSpec sigC = unextend(port(next, \C));

		// NOTE(review): chain.back() was just set to `next`, so the P port read
		// below belongs to `next` itself, not to the previous DSP. As written
		// the search only continues if (a) the unextended C fits into P (after
		// the optional 17 bit shift) and (b) differs from that slice of the P
		// output of `next`. If the check is skipped no chain containing `next`
		// is recorded on this path. Confirm whether a comparison against the
		// previous DSP was intended.
		if (nextP_shift17) {
			if (GetSize(sigC)+17 <= GetSize(port(std::get<0>(chain.back()), \P)) &&
					port(std::get<0>(chain.back()), \P).extract(17, GetSize(sigC)) != sigC)
				subpattern(tail);
		}
		else {
			if (GetSize(sigC) <= GetSize(port(std::get<0>(chain.back()), \P)) &&
					port(std::get<0>(chain.back()), \P).extract(0, GetSize(sigC)) != sigC)
				subpattern(tail);

		}
	} else {
		// End of the line: keep this chain if it beats the best one so far.
		if (GetSize(chain) > GetSize(longest_chain))
			longest_chain = chain;
	}
finally
	// Undo the emplace_back above so the caller sees its own chain again.
	if (next)
		chain.pop_back();
endcode

// #######################

// Subpattern for matching against input registers, based on knowledge of the
//   'Q' input. Typically, identifying registers with clock-enable and reset
//   capability would be a task handled by other Yosys passes such as
//   dff2dffe, but since DSP inference happens much before this, these patterns
//   have to be manually identified.
// At a high level:
//   (1) Starting from a $dff cell that (partially or fully) drives the given
//       'Q' argument
//   (2) Match for a $mux cell implementing synchronous reset semantics ---
//       one that exclusively drives the 'D' input of the $dff, with one of its
//       $mux inputs being fully zero
//   (3) Match for a $mux cell implementing clock enable semantics --- one that
//       exclusively drives the 'D' input of the $dff (or the other input of
//       the reset $mux) and where one of this $mux's inputs is connected to
//       the 'Q' output of the $dff
// Inputs: argQ (signal expected on the register output) and clock (required
//   clock; an empty signal accepts any clock). argD is also listed as an
//   argument, although the callers in this file only set argQ and clock.
// Results are returned through udata: dff (nullptr if nothing was found),
//   dffD, dffclock, dffrstmux/dffrstpol and dffcemux/dffcepol.
subpattern in_dffe
arg argD argQ clock

code
	// Start with "no register found" so that callers can test dff after any
	// of the rejects below.
	dff = nullptr;
	// Abandon matches when 'Q' is empty (e.g. a fully constant operand that
	// was reduced to nothing by unextend): there is no register to find and
	// the ff match below would index argQ[0]
	if (argQ.empty())
		reject;
	for (const auto &c : argQ.chunks()) {
		// Abandon matches when 'Q' is a constant
		if (!c.wire)
			reject;
		// Abandon matches when 'Q' has the keep attribute set
		if (c.wire->get_bool_attribute(\keep))
			reject;
		// Abandon matches when 'Q' has a non-zero init attribute set
		// (not supported by DSP48E1)
		Const init = c.wire->attributes.at(\init, Const());
		for (auto b : init.extract(c.offset, c.width))
			if (b != State::Sx && b != State::S0)
				reject;
	}
endcode

// (1) Starting from a $dff cell that (partially or fully) drives the given
//     'Q' argument
match ff
	select ff->type.in($dff)
	// DSP48E1 does not support clock inversion
	select param(ff, \CLK_POLARITY).as_bool()

	// Try every bit position of the register as the place where argQ starts
	slice offset GetSize(port(ff, \D))
	index <SigBit> port(ff, \Q)[offset] === argQ[0]

	// Check that the rest of argQ is present
	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

	// An empty clock argument accepts any clock, otherwise it must be the same
	filter clock == SigBit() || port(ff, \CLK) == clock

	// Remember where argQ sits inside the register
	set ffoffset offset
endmatch

code argQ argD
	// Report the register and its clock to the caller.
	dff = ff;
	dffclock = port(ff, \CLK);
	// dffD starts out as the requested Q bits; from here on argQ/argD refer
	// to the complete Q/D ports of the register, and the requested bits are
	// translated from Q to the corresponding D bits.
	dffD = argQ;
	argQ = port(ff, \Q);
	argD = port(ff, \D);
	dffD.replace(argQ, argD);
	// A reset mux is only worth looking for when dffD has no users
	//   besides (ff, ffrstmux); otherwise clear argD to skip the mux matches
	const bool hasOtherUsers = nusers(dffD) > 2;
	if (hasOtherUsers)
		argD = SigSpec();
endcode

// (2) Match for a $mux cell implementing synchronous reset semantics ---
//     exclusively drives the 'D' input of the $dff, with one of the $mux
//     inputs being fully zero
match ffrstmux
	// Skipped when the previous code block cleared argD
	if !argD.empty()
	select ffrstmux->type.in($mux)
	// The mux output must be exactly the D signal of the register
	index <SigSpec> port(ffrstmux, \Y) === argD

	// Try both data inputs as the one carrying the reset value
	choice <IdString> BA {\B, \A}
	// DSP48E1 only supports reset to zero
	select port(ffrstmux, BA).is_fully_zero()

	// Zero on B means the select is an active-high reset (pol == true),
	// zero on A means it is active-low
	define <bool> pol (BA == \B)
	set ffrstpol pol
	// The following code still runs (with nullptr) if no such mux exists
	semioptional
endmatch

code argD
	// Report the reset mux to the caller (nullptr when none was matched).
	dffrstmux = ffrstmux;
	if (dffrstmux) {
		dffrstpol = ffrstpol;
		// Step over the mux: continue with the data input that does not
		// carry the reset value.
		const IdString dataPort = ffrstpol ? \A : \B;
		argD = port(dffrstmux, dataPort);
		dffD.replace(port(dffrstmux, \Y), argD);

		// Only search for ffcemux if argQ has at
		//   least 3 users (ff, <upstream>, ffrstmux) and
		//   dffD only has two (ff, ffrstmux)
		const bool searchCeMux = nusers(argQ) >= 3 && nusers(dffD) == 2;
		if (!searchCeMux)
			argD = SigSpec();
	}
endcode

// (3) Match for a $mux cell implementing clock enable semantics --- one that
//     exclusively drives the 'D' input of the $dff (or the other input of
//     the reset $mux) and where one of this $mux's inputs is connected to
//     the 'Q' output of the $dff
match ffcemux
	// Skipped when an earlier code block cleared argD
	if !argD.empty()
	select ffcemux->type.in($mux)
	// The mux output must be exactly the current D signal
	index <SigSpec> port(ffcemux, \Y) === argD
	// Try both data inputs as the one fed back from the register output
	choice <IdString> AB {\A, \B}
	index <SigSpec> port(ffcemux, AB) === argQ
	// Feedback on A means the select is an active-high enable (pol == true),
	// feedback on B means it is active-low
	define <bool> pol (AB == \A)
	set ffcepol pol
	// The following code still runs (with nullptr) if no such mux exists
	semioptional
endmatch

code argD
	// Report the clock enable mux to the caller (nullptr when none matched).
	dffcemux = ffcemux;
	if (dffcemux) {
		dffcepol = ffcepol;
		// Step over the mux: continue with the data input that is not the
		// feedback from the register output.
		const IdString dataPort = ffcepol ? \B : \A;
		argD = port(dffcemux, dataPort);
		dffD.replace(port(dffcemux, \Y), argD);
	}
endcode