Merge remote-tracking branch 'origin/master' into xaig_dff

2019-10-08 13:03:06 -07:00 · 2019-10-08 13:03:06 -07:00 · 304e5f9ea4
parent 4f0818275f 3fb604c75d
commit 304e5f9ea4
14 changed files with 550 additions and 149 deletions
--- a/frontends/rpc/rpc_frontend.cc
+++ b/frontends/rpc/rpc_frontend.cc
@ -28,14 +28,13 @@
 #include <sys/wait.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+extern char **environ;
 #endif

 #include "libs/json11/json11.hpp"
 #include "libs/sha1/sha1.h"
 #include "kernel/yosys.h"

-extern char **environ;
-
 YOSYS_NAMESPACE_BEGIN

 #if defined(_WIN32)
@ -238,6 +237,11 @@ struct RpcModule : RTLIL::Module {

 #if defined(_WIN32)

+#if defined(_MSC_VER)
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
 struct HandleRpcServer : RpcServer {
 	HANDLE hsend, hrecv;

--- a/passes/equiv/equiv_opt.cc
+++ b/passes/equiv/equiv_opt.cc
@ -33,7 +33,7 @@ struct EquivOptPass:public ScriptPass
 		log("    equiv_opt [options] [command]\n");
 		log("\n");
 		log("This command uses temporal induction to check circuit equivalence before and\n");
-                log("after an optimization pass.\n");
+		log("after an optimization pass.\n");
 		log("\n");
 		log("    -run <from_label>:<to_label>\n");
 		log("        only run the commands between the labels (see below). an empty\n");
@ -50,6 +50,9 @@ struct EquivOptPass:public ScriptPass
 		log("    -multiclock\n");
 		log("        run clk2fflogic before equivalence checking.\n");
 		log("\n");
+		log("    -async2sync\n");
+		log("        run async2sync before equivalence checking.\n");
+		log("\n");
 		log("    -undef\n");
 		log("        enable modelling of undef states during equiv_induct.\n");
 		log("\n");
@ -59,7 +62,7 @@ struct EquivOptPass:public ScriptPass
 	}

 	std::string command, techmap_opts;
-	bool assert, undef, multiclock;
+	bool assert, undef, multiclock, async2sync;

 	void clear_flags() YS_OVERRIDE
 	{
@ -68,6 +71,7 @@ struct EquivOptPass:public ScriptPass
 		assert = false;
 		undef = false;
 		multiclock = false;
+		async2sync = false;
 	}

 	void execute(std::vector < std::string > args, RTLIL::Design * design) YS_OVERRIDE
@ -101,6 +105,10 @@ struct EquivOptPass:public ScriptPass
 				multiclock = true;
 				continue;
 			}
+			if (args[argidx] == "-async2sync") {
+				async2sync = true;
+				continue;
+			}
 			break;
 		}

@ -120,6 +128,9 @@ struct EquivOptPass:public ScriptPass
 		if (!design->full_selection())
 			log_cmd_error("This command only operates on fully selected designs!\n");

+		if (async2sync && multiclock)
+			log_cmd_error("The '-async2sync' and '-multiclock' options are mutually exclusive!\n");
+
 		log_header(design, "Executing EQUIV_OPT pass.\n");
 		log_push();

@ -157,8 +168,8 @@ struct EquivOptPass:public ScriptPass
 		if (check_label("prove")) {
 			if (multiclock || help_mode)
 				run("clk2fflogic", "(only with -multiclock)");
-			if (!multiclock || help_mode)
-				run("async2sync", "(only without -multiclock)");
+			if (async2sync || help_mode)
+				run("async2sync", " (only with -async2sync)");
 			run("equiv_make gold gate equiv");
 			if (help_mode)
 				run("equiv_induct [-undef] equiv");
--- a/passes/pmgen/README.md
+++ b/passes/pmgen/README.md
@ -190,7 +190,7 @@ create matches for different sections of a cell. For example:
        select pmux->type == $pmux
        slice idx GetSize(port(pmux, \S))
        index <SigBit> port(pmux, \S)[idx] === port(eq, \Y)
-	set pmux_slice idx
+        set pmux_slice idx
    endmatch

 The first argument to `slice` is the local variable name used to identify the
--- a/passes/pmgen/ice40_wrapcarry.pmg
+++ b/passes/pmgen/ice40_wrapcarry.pmg
@ -9,3 +9,7 @@ match lut
 	index <SigSpec> port(lut, \I1) === port(carry, \I0)
 	index <SigSpec> port(lut, \I2) === port(carry, \I1)
 endmatch
+
+code
+	accept;
+endcode
--- a/passes/pmgen/peepopt_dffmux.pmg
+++ b/passes/pmgen/peepopt_dffmux.pmg
@ -8,21 +8,23 @@ match dff
 	select GetSize(port(dff, \D)) > 1
 endmatch

+code sigD
+	sigD = port(dff, \D);
+endcode
+
 match rstmux
 	select rstmux->type == $mux
 	select GetSize(port(rstmux, \Y)) > 1
-	index <SigSpec> port(rstmux, \Y) === port(dff, \D)
+	index <SigSpec> port(rstmux, \Y) === sigD
 	choice <IdString> BA {\B, \A}
 	select port(rstmux, BA).is_fully_const()
 	set rstmuxBA BA
-	optional
+	semioptional
 endmatch

 code sigD
 	if (rstmux)
 		sigD = port(rstmux, rstmuxBA == \B ? \A : \B);
-	else
-		sigD = port(dff, \D);
 endcode

 match cemux
@ -32,66 +34,97 @@ match cemux
 	choice <IdString> AB {\A, \B}
 	index <SigSpec> port(cemux, AB) === port(dff, \Q)
 	set cemuxAB AB
+	semioptional
 endmatch

 code
-	SigSpec D = port(cemux, cemuxAB == \A ? \B : \A);
-	SigSpec Q = port(dff, \Q);
+	if (!cemux && !rstmux)
+		reject;
+endcode
+
+code
 	Const rst;
-	if (rstmux)
-		rst = port(rstmux, rstmuxBA).as_const();
-	int width = GetSize(D);
-
-	SigSpec &ceA = cemux->connections_.at(\A);
-	SigSpec &ceB = cemux->connections_.at(\B);
-	SigSpec &ceY = cemux->connections_.at(\Y);
-	SigSpec &dffD = dff->connections_.at(\D);
-	SigSpec &dffQ = dff->connections_.at(\Q);
-
-	if (D[width-1] == D[width-2]) {
-		did_something = true;
-
-		SigBit sign = D[width-1];
-		bool is_signed = sign.wire;
-		int i;
-		for (i = width-1; i >= 2; i--) {
-			if (!is_signed) {
-				module->connect(Q[i], sign);
-				if (D[i-1] != sign || (rst.size() && rst[i-1] != rst[width-1]))
-					break;
-			}
-			else {
-				module->connect(Q[i], Q[i-1]);
-				if (D[i-2] != sign || (rst.size() && rst[i-1] != rst[width-1]))
-					break;
-			}
-		}
-
-		ceA.remove(i, width-i);
-		ceB.remove(i, width-i);
-		ceY.remove(i, width-i);
-		cemux->fixup_parameters();
-		dffD.remove(i, width-i);
-		dffQ.remove(i, width-i);
-		dff->fixup_parameters();
-
-		log("dffcemux pattern in %s: dff=%s, cemux=%s; removed top %d bits.\n", log_id(module), log_id(dff), log_id(cemux), width-i);
-		accept;
+	SigSpec D;
+	if (cemux) {
+		D = port(cemux, cemuxAB == \A ? \B : \A);
+		if (rstmux)
+			rst = port(rstmux, rstmuxBA).as_const();
+		else
+			rst = Const(State::Sx, GetSize(D));
 	}
 	else {
+		log_assert(rstmux);
+		D = port(rstmux, rstmuxBA  == \B ? \A : \B);
+		rst = port(rstmux, rstmuxBA).as_const();
+	}
+	SigSpec Q = port(dff, \Q);
+	int width = GetSize(D);
+
+	SigSpec &dffD = dff->connections_.at(\D);
+	SigSpec &dffQ = dff->connections_.at(\Q);
+	Const init;
+	for (const auto &b : Q) {
+		auto it = b.wire->attributes.find(\init);
+		init.bits.push_back(it == b.wire->attributes.end() ? State::Sx : it->second[b.offset]);
+	}
+
+	auto cmpx = [=](State lhs, State rhs) {
+		if (lhs == State::Sx || rhs == State::Sx)
+			return true;
+		return lhs == rhs;
+	};
+
+	int i = width-1;
+	while (i > 1) {
+		log_dump(i, D[i], D[i-1], rst[i], rst[i-1], init[i], init[i-1]);
+		if (D[i] != D[i-1])
+			break;
+		if (!cmpx(rst[i], rst[i-1]))
+			break;
+		if (!cmpx(init[i], init[i-1]))
+			break;
+		if (!cmpx(rst[i], init[i]))
+			break;
+		module->connect(Q[i], Q[i-1]);
+		i--;
+	}
+	if (i < width-1) {
+		did_something = true;
+		if (cemux) {
+			SigSpec &ceA = cemux->connections_.at(\A);
+			SigSpec &ceB = cemux->connections_.at(\B);
+			SigSpec &ceY = cemux->connections_.at(\Y);
+			ceA.remove(i, width-1-i);
+			ceB.remove(i, width-1-i);
+			ceY.remove(i, width-1-i);
+			cemux->fixup_parameters();
+		}
+		if (rstmux) {
+			SigSpec &rstA = rstmux->connections_.at(\A);
+			SigSpec &rstB = rstmux->connections_.at(\B);
+			SigSpec &rstY = rstmux->connections_.at(\Y);
+			rstA.remove(i, width-1-i);
+			rstB.remove(i, width-1-i);
+			rstY.remove(i, width-1-i);
+			rstmux->fixup_parameters();
+		}
+		dffD.remove(i, width-1-i);
+		dffQ.remove(i, width-1-i);
+		dff->fixup_parameters();
+
+		log("dffcemux pattern in %s: dff=%s, cemux=%s, rstmux=%s; removed top %d bits.\n", log_id(module), log_id(dff), log_id(cemux, "n/a"), log_id(rstmux, "n/a"), width-1-i);
+		width = i+1;
+	}
+	if (cemux) {
+		SigSpec &ceA = cemux->connections_.at(\A);
+		SigSpec &ceB = cemux->connections_.at(\B);
+		SigSpec &ceY = cemux->connections_.at(\Y);
+
 		int count = 0;
 		for (int i = width-1; i >= 0; i--) {
 			if (D[i].wire)
 				continue;
-			Wire *w = Q[i].wire;
-			auto it = w->attributes.find(\init);
-			State init;
-			if (it != w->attributes.end())
-				init = it->second[Q[i].offset];
-			else
-				init = State::Sx;
-
-			if (init == State::Sx || init == D[i].data) {
+			if (cmpx(rst[i], D[i].data) && cmpx(init[i], D[i].data)) {
 				count++;
 				module->connect(Q[i], D[i]);
 				ceA.remove(i);
@ -105,9 +138,10 @@ code
 			did_something = true;
 			cemux->fixup_parameters();
 			dff->fixup_parameters();
-			log("dffcemux pattern in %s: dff=%s, cemux=%s; removed %d constant bits.\n", log_id(module), log_id(dff), log_id(cemux), count);
+			log("dffcemux pattern in %s: dff=%s, cemux=%s, rstmux=%s; removed %d constant bits.\n", log_id(module), log_id(dff), log_id(cemux), log_id(rstmux, "n/a"), count);
 		}
-
-		accept;
 	}
+
+	if (did_something)
+		accept;
 endcode
--- a/passes/pmgen/xilinx_dsp.cc
+++ b/passes/pmgen/xilinx_dsp.cc
@ -20,6 +20,7 @@

 #include "kernel/yosys.h"
 #include "kernel/sigtools.h"
+#include <deque>

 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
@ -608,8 +609,13 @@ struct XilinxDspPass : public Pass {
 		extra_args(args, argidx, design);

 		for (auto module : design->selected_modules()) {
+			// Experimental feature: pack $add/$sub cells with
+			//   (* use_dsp48="simd" *) into DSP48E1's using its
+			//   SIMD feature
 			xilinx_simd_pack(module, module->selected_cells());

+			// Match for all features ([ABDMP][12]?REG, pre-adder,
+			// post-adder, pattern detector, etc.) except for CREG
 			{
 				xilinx_dsp_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_pack(xilinx_dsp_pack);
@ -618,14 +624,17 @@ struct XilinxDspPass : public Pass {
 			//   is no guarantee that the cell ordering corresponds
 			//   to the "expected" case (i.e. the order in which
 			//   they appear in the source) thus the possiblity
-			//   existed that a register got packed as CREG into a
+			//   existed that a register got packed as a CREG into a
 			//   downstream DSP that should have otherwise been a
-			//   PREG of an upstream DSP that had not been pattern
-			//   matched yet
+			//   PREG of an upstream DSP that had not been visited
+			//   yet
 			{
 				xilinx_dsp_CREG_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_packC(xilinx_dsp_packC);
 			}
+			// Lastly, identify and utilise PCOUT -> PCIN,
+			//   ACOUT -> ACIN, and BCOUT -> BCIN dedicated cascade
+			//   chains
 			{
 				xilinx_dsp_cascade_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_cascade();
--- a/passes/pmgen/xilinx_dsp.pmg
+++ b/passes/pmgen/xilinx_dsp.pmg
@ -1,3 +1,57 @@
+// This file describes the main pattern matcher setup (of three total) that
+//   forms the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   ( 1) Starting from a DSP48E1 cell
+//   ( 2) Match the driver of the 'A' input to a possible $dff cell (ADREG)
+//        (attached to at most two $mux cells that implement clock-enable or
+//         reset functionality, using a subpattern discussed below)
+//        If ADREG matched, treat 'A' input as input of ADREG
+//   ( 3) Match the driver of the 'A' and 'D' inputs for a possible $add cell
+//       (pre-adder)
+//   ( 4) If pre-adder was present, find match 'A' input for A2REG
+//        If pre-adder was not present, move ADREG to A2REG
+//        If A2REG, then match 'A' input for A1REG
+//   ( 5) Match 'B' input for B2REG
+//        If B2REG, then match 'B' input for B1REG
+//   ( 6) Match 'D' input for DREG
+//   ( 7) Match 'P' output that exclusively drives an MREG
+//   ( 8) Match 'P' output that exclusively drives one of two inputs to an $add
+//        cell (post-adder).
+//        The other input to the adder is assumed to come in from the 'C' input
+//        (note: 'P' -> 'C' connections that exist for accumulators are
+//         recognised in xilinx_dsp.cc).
+//   ( 9) Match 'P' output that exclusively drives a PREG
+//   (10) If post-adder and PREG both present, match for a $mux cell driving
+//        the 'C' input, where one of the $mux's inputs is the PREG output.
+//        This indicates an accumulator situation, and one where a $mux exists
+//        to override the accumulated value:
+//             +--------------------------------+
+//             |   ____                         |
+//             +--|    \                        |
+//                |$mux|-+                      |
+//         'C' ---|____/ |                      |
+//                       | /-------\   +----+   |
+//            +----+     +-| post- |___|PREG|---+ 'P'
+//            |MREG|------ | adder |   +----+
+//            +----+       \-------/
+//   (11) If PREG present, match for a greater-than-or-equal $ge cell attached
+//        to the 'P' output where it is compared to a constant that is a
+//        power-of-2: e.g. `assign overflow = (PREG >= 2**40);`
+//        In this scenario, the pattern detector functionality of a DSP48E1 can
+//        be used to implement this function
+// Notes:
+//   - The intention of this pattern matcher is for it to be compatible with
+//     DSP48E1 cells inferred from multiply operations by Yosys, as well as for
+//     user instantiations that may already contain the cells being packed...
+//     (though the latter is currently untested)
+//   - Since the $dff-with-optional-clock-enable-or-reset-mux pattern is used
+//     for each *REG match, it has been factored out into two subpatterns:
+//     in_dffe and out_dffe located at the bottom of this file.
+//   - Matching for pattern detector features is currently incomplete. For
+//     example, matching for underflow as well as overflow detection is
+//     possible, as would auto-reset, enabling saturated arithmetic, detecting
+//     custom patterns, etc.
+
 pattern xilinx_dsp_pack

 state <SigBit> clock
@ -5,12 +59,11 @@ state <SigSpec> sigA sigB sigC sigD sigM sigP
 state <IdString> postAddAB postAddMuxAB
 state <bool> ffA1cepol ffA2cepol ffADcepol ffB1cepol ffB2cepol ffDcepol ffMcepol ffPcepol
 state <bool> ffArstpol ffADrstpol ffBrstpol ffDrstpol ffMrstpol ffPrstpol
-
 state <Cell*> ffAD ffADcemux ffADrstmux ffA1 ffA1cemux ffA1rstmux ffA2 ffA2cemux ffA2rstmux
 state <Cell*> ffB1 ffB1cemux ffB1rstmux ffB2 ffB2cemux ffB2rstmux
 state <Cell*> ffD ffDcemux ffDrstmux ffM ffMcemux ffMrstmux ffP ffPcemux ffPrstmux

-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@ -19,6 +72,7 @@ udata <SigBit> dffclock
 udata <Cell*> dff dffcemux dffrstmux
 udata <bool> dffcepol dffrstpol

+// (1) Starting from a DSP48E1 cell
 match dsp
 	select dsp->type.in(\DSP48E1)
 endmatch
@ -50,17 +104,21 @@ code sigA sigB sigC sigD sigM clock
 			sigM.append(P[i]);
 		}
 		log_assert(nusers(P.extract_end(i)) <= 1);
+		// This sigM could have no users if a downstream sink (e.g. $add) is
+		//   narrower than the $mul result, for example
+		if (sigM.empty())
+			reject;
 	}
 	else
 		sigM = P;
-	// This sigM could have no users if downstream $add
-	//   is narrower than $mul result, for example
-	if (sigM.empty())
-		reject;

 	clock = port(dsp, \CLK, SigBit());
 endcode

+// (2) Match the driver of the 'A' input to a possible $dff cell (ADREG)
+//     (attached to at most two $mux cells that implement clock-enable or
+//      reset functionality, using a subpattern discussed above)
+//     If matched, treat 'A' input as input of ADREG
 code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock
 	if (param(dsp, \ADREG).as_int() == 0) {
 		argQ = sigA;
@ -81,6 +139,8 @@ code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock
 	}
 endcode

+// (3) Match the driver of the 'A' and 'D' inputs for a possible $add cell
+//     (pre-adder)
 match preAdd
 	if sigD.empty() || sigD.is_fully_zero()
 	// Ensure that preAdder not already used
@ -106,11 +166,12 @@ code sigA sigD
 	if (preAdd) {
 		sigA = port(preAdd, \A);
 		sigD = port(preAdd, \B);
-		if (GetSize(sigA) < GetSize(sigD))
-			std::swap(sigA, sigD);
 	}
 endcode

+// (4) If pre-adder was present, find match 'A' input for A2REG
+//     If pre-adder was not present, move ADREG to A2REG
+//     Then match 'A' input for A1REG
 code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock ffA2 ffA2cemux ffA2rstmux ffA2cepol ffArstpol ffA1 ffA1cemux ffA1rstmux ffA1cepol
 	// Only search for ffA2 if there was a pre-adder
 	//   (otherwise ffA2 would have been matched as ffAD)
@ -173,6 +234,8 @@ ffA1_end:		;
 	}
 endcode

+// (5) Match 'B' input for B2REG
+//     If B2REG, then match 'B' input for B1REG
 code argQ ffB2 ffB2cemux ffB2rstmux ffB2cepol ffBrstpol sigB clock ffB1 ffB1cemux ffB1rstmux ffB1cepol
 	if (param(dsp, \BREG).as_int() == 0) {
 		argQ = sigB;
@ -222,6 +285,7 @@ ffB1_end:				;
 	}
 endcode

+// (6) Match 'D' input for DREG
 code argQ ffD ffDcemux ffDrstmux ffDcepol ffDrstpol sigD clock
 	if (param(dsp, \DREG).as_int() == 0) {
 		argQ = sigD;
@ -242,6 +306,7 @@ code argQ ffD ffDcemux ffDrstmux ffDcepol ffDrstpol sigD clock
 	}
 endcode

+// (7) Match 'P' output that exclusively drives an MREG
 code argD ffM ffMcemux ffMrstmux ffMcepol ffMrstpol sigM sigP clock
 	if (param(dsp, \MREG).as_int() == 0 && nusers(sigM) == 2) {
 		argD = sigM;
@ -263,6 +328,11 @@ code argD ffM ffMcemux ffMrstmux ffMcepol ffMrstpol sigM sigP clock
 	sigP = sigM;
 endcode

+// (8) Match 'P' output that exclusively drives one of two inputs to an $add
+//     cell (post-adder).
+//     The other input to the adder is assumed to come in from the 'C' input
+//     (note: 'P' -> 'C' connections that exist for accumulators are
+//      recognised in xilinx_dsp.cc).
 match postAdd
 	// Ensure that Z mux is not already used
 	if port(dsp, \OPMODE, SigSpec()).extract(4,3).is_fully_zero()
@ -291,6 +361,7 @@ code sigC sigP
 	}
 endcode

+// (9) Match 'P' output that exclusively drives a PREG
 code argD ffP ffPcemux ffPrstmux ffPcepol ffPrstpol sigP clock
 	if (param(dsp, \PREG).as_int() == 0) {
 		int users = 2;
@ -316,6 +387,19 @@ code argD ffP ffPcemux ffPrstmux ffPcepol ffPrstpol sigP clock
 	}
 endcode

+// (10) If post-adder and PREG both present, match for a $mux cell driving
+//      the 'C' input, where one of the $mux's inputs is the PREG output.
+//      This indicates an accumulator situation, and one where a $mux exists
+//      to override the accumulated value:
+//           +--------------------------------+
+//           |   ____                         |
+//           +--|    \                        |
+//              |$mux|-+                      |
+//       'C' ---|____/ |                      |
+//                     | /-------\   +----+   |
+//          +----+     +-| post- |___|PREG|---+ 'P'
+//          |MREG|------ | adder |   +----+
+//          +----+       \-------/
 match postAddMux
 	if postAdd
 	if ffP
@ -333,6 +417,11 @@ code sigC
 		sigC = port(postAddMux, postAddMuxAB == \A ? \B : \A);
 endcode

+// (11) If PREG present, match for a greater-than-or-equal $ge cell attached to
+//      the 'P' output where it is compared to a constant that is a power-of-2:
+//      e.g. `assign overflow = (PREG >= 2**40);`
+//      In this scenario, the pattern detector functionality of a DSP48E1 can
+//      be used to implement this function
 match overflow
 	if ffP
 	if param(dsp, \USE_PATTERN_DETECT, Const("NO_PATDET")).decode_string() == "NO_PATDET"
@ -351,22 +440,45 @@ endcode

 // #######################

+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock

 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
 	}
 endcode

+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@ -379,14 +491,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch

 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@ -398,9 +508,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode

+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     exclusively drives the 'D' input of the $dff, with one of the $mux
+//     inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@ -432,6 +544,10 @@ code argD
 		dffrstmux = nullptr;
 endcode

+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
@ -456,16 +572,32 @@ endcode

 // #######################

+// Subpattern for matching against output registers, based on knowledge of the
+//   'D' input.
+// At a high level:
+//   (1) Starting from an optional $mux cell that implements clock enable
+//       semantics --- one where the given 'D' argument (partially or fully)
+//       drives one of its two inputs
+//   (2) Starting from, or continuing onto, another optional $mux cell that
+//       implements synchronous reset semantics --- one where the given 'D'
+//       argument (or the clock enable $mux output) drives one of its two inputs
+//       and where the other input is fully zero
+//   (3) Match for a $dff cell (whose 'D' input is the 'D' argument, or the
+//       output of the previous clock enable or reset $mux cells)
 subpattern out_dffe
 arg argD argQ clock

 code
 	dff = nullptr;
 	for (auto c : argD.chunks())
+		// Abandon matches when 'D' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
 endcode

+// (1) Starting from an optional $mux cell that implements clock enable
+//     semantics --- one where the given 'D' argument (partially or fully)
+//     drives one of its two inputs
 match ffcemux
 	select ffcemux->type.in($mux)
 	// ffcemux output must have two users: ffcemux and ff.D
@ -504,6 +636,10 @@ code argD argQ
 	}
 endcode

+// (2) Starting from, or continuing onto, another optional $mux cell that
+//     implements synchronous reset semantics --- one where the given 'D'
+//     argument (or the clock enable $mux output) drives one of its two inputs
+//     and where the other input is fully zero
 match ffrstmux
 	select ffrstmux->type.in($mux)
 	// ffrstmux output must have two users: ffrstmux and ff.D
@ -542,6 +678,8 @@ code argD argQ
 	}
 endcode

+// (3) Match for a $dff cell (whose 'D' input is the 'D' argument, or the
+//     output of the previous clock enable or reset $mux cells)
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@ -558,32 +696,30 @@ match ff
 	// Check that FF.Q is connected to CE-mux
 	filter !ffcemux || port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch

 code argQ
-	if (ff) {
-		if (clock != SigBit() && port(ff, \CLK) != clock)
-			reject;
-
-		SigSpec D = port(ff, \D);
-		SigSpec Q = port(ff, \Q);
-		if (!ffcemux) {
-			argQ = argD;
-			argQ.replace(D, Q);
-		}
-
-		for (auto c : argQ.chunks()) {
-			Const init = c.wire->attributes.at(\init, State::Sx);
-			if (!init.is_fully_undef() && !init.is_fully_zero())
-				reject;
-		}
-
-		dff = ff;
-		dffQ = argQ;
-		dffclock = port(ff, \CLK);
+	SigSpec D = port(ff, \D);
+	SigSpec Q = port(ff, \Q);
+	if (!ffcemux) {
+		argQ = argD;
+		argQ.replace(D, Q);
 	}
-	// No enable/reset mux possible without flop
-	else if (dffcemux || dffrstmux)
-		reject;
+
+	// Abandon matches when 'Q' has a non-zero init attribute set
+	// (not supported by DSP48E1)
+	for (auto c : argQ.chunks()) {
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
+	}
+
+	dff = ff;
+	dffQ = argQ;
+	dffclock = port(ff, \CLK);
 endcode
--- a/passes/pmgen/xilinx_dsp_CREG.pmg
+++ b/passes/pmgen/xilinx_dsp_CREG.pmg
@ -1,3 +1,26 @@
+// This file describes the second of three pattern matcher setups that
+//   forms the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP48E1 cell that (a) doesn't have a CREG already,
+//       and (b) uses the 'C' port
+//   (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//       (attached to at most two $mux cells that implement clock-enable or
+//        reset functionality, using a subpattern discussed below)
+// Notes:
+//   - Running CREG packing after xilinx_dsp_pack is necessary since there is no
+//     guarantee that the cell ordering corresponds to the "expected" case (i.e.
+//     the order in which they appear in the source) thus the possibility existed
+//     that a register got packed as a CREG into a downstream DSP that should
+//     have otherwise been a PREG of an upstream DSP that had not been visited
+//     yet
+//   - The reason this is separated out from the xilinx_dsp.pmg file is
+//     for efficiency --- each *.pmg file creates a class of the same basename,
+//     which when constructed, creates a custom database tailored to the
+//     pattern(s) contained within. Since the pattern in this file must be
+//     executed after the pattern contained in xilinx_dsp.pmg, it is necessary
+//     to reconstruct this database. Separating the two patterns into
+//     independent files causes two smaller, more specific, databases.
+
 pattern xilinx_dsp_packC

 udata <std::function<SigSpec(const SigSpec&)>> unextend
@ -6,7 +29,7 @@ state <SigSpec> sigC sigP
 state <bool> ffCcepol ffCrstpol
 state <Cell*> ffC ffCcemux ffCrstmux

-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@ -15,13 +38,15 @@ udata <SigBit> dffclock
 udata <Cell*> dff dffcemux dffrstmux
 udata <bool> dffcepol dffrstpol

+// (1) Starting from a DSP48E1 cell that (a) doesn't have a CREG already,
+//     and (b) uses the 'C' port
 match dsp
 	select dsp->type.in(\DSP48E1)
 	select param(dsp, \CREG, 1).as_int() == 0
 	select nusers(port(dsp, \C, SigSpec())) > 1
 endmatch

-code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC sigP clock
+code sigC sigP clock
 	unextend = [](const SigSpec &sig) {
 		int i;
 		for (i = GetSize(sig)-1; i > 0; i--)
@ -48,11 +73,13 @@ code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC sigP clock
 	else
 		sigP = P;

-	if (sigC == sigP)
-		reject;
-
 	clock = port(dsp, \CLK, SigBit());
+endcode

+// (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//     (attached to at most two $mux cells that implement clock-enable or
+//      reset functionality, using the in_dffe subpattern)
+code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC clock
 	argQ = sigC;
 	subpattern(in_dffe);
 	if (dff) {
@ -77,22 +104,44 @@ endcode

 // #######################

+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock

 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		for (auto b : init.extract(c.offset, c.width))
+			if (b != State::Sx && b != State::S0)
+				reject;
 	}
 endcode

+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@ -105,14 +154,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch

 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@ -124,9 +171,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode

+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     exclusively drives the 'D' input of the $dff, with one of the $mux
+//     inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@ -158,6 +207,10 @@ code argD
 		dffrstmux = nullptr;
 endcode

+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
--- a/passes/pmgen/xilinx_dsp_cascade.pmg
+++ b/passes/pmgen/xilinx_dsp_cascade.pmg
@ -1,3 +1,46 @@
+// This file describes the third of three pattern matcher setups that
+//   forms the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
+//       (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
+//       use the 'PCOUT' port
+//   (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
+//         (b) has its Z multiplexer output set to the 'C' port, which is
+//         driven by the 'P' output of the previous DSP cell, and (c) has its
+//         'PCIN' port unused
+//   (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
+//         previous DSP cell right-shifted by 17 bits
+//   (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
+//       if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
+//       DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
+//       have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
+//       use its ACOUT port, then examine if an ACOUT -> ACIN cascade
+//       opportunity exists by matching for a $dff-with-optional-clock-enable-
+//       or-reset and checking that the 'D' input of this register is the same
+//       as the 'A' input of the previous DSP
+//   (4) Same as (3) but for BCOUT -> BCIN cascade
+//   (5) Recursively go to (2.1) until no more matches possible, keeping track
+//       of the longest possible chain found
+//   (6) The longest chain is then divided into chunks of no more than
+//       MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//       height of a DSP column) with each DSP in each chunk being rewritten
+//       to use [ABP]COUT -> [ABP]CIN cascading as appropriate
+// Notes:
+//   - Currently, [AB]COUT -> [AB]CIN cascades (3 or 4) are only considered
+//     if a PCOUT -> PCIN cascade (2.1 or 2.2) is first identified; this need
+//     not be the case --- [AB] cascades can exist independently of a P cascade
+//     (though all three cascades must come from the same DSP). This situation
+//     is not handled currently.
+//   - In addition, [AB]COUT -> [AB]CIN cascades (3 or 4) are currently
+//     conservative in that they examine the situation where (a) the previous
+//     DSP has [AB]2REG or [AB]1REG enabled, (b) that the downstream DSP has no
+//     registers enabled, and (c) that there exists only one additional register
+//     between the upstream and downstream DSPs. This can certainly be relaxed
+//     to identify situations ranging from (i) neither DSP uses any registers,
+//     to (ii) upstream DSP has 2 registers, downstream DSP has 2 registers, and
+//     there exists a further 2 registers between them. This remains a TODO
+//     item.
+
 pattern xilinx_dsp_cascade

 udata <std::function<SigSpec(const SigSpec&)>> unextend
@ -6,7 +49,7 @@ state <Cell*> next
 state <SigSpec> clock
 state <int> AREG BREG

-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@ -19,12 +62,19 @@ code
 #define MAX_DSP_CASCADE 20
 endcode

+// (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
+//     (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
+//     use the 'PCOUT' port
 match first
 	select first->type.in(\DSP48E1)
 	select port(first, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("000")
 	select nusers(port(first, \PCOUT, SigSpec())) <= 1
 endmatch

+// (6) The longest chain is then divided into chunks of no more than
+//     MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//     height of a DSP column) with each DSP in each chunk being rewritten
+//     to use [ABP]COUT -> [ABP]CIN cascading as appropriate
 code
 	longest_chain.clear();
 	chain.emplace_back(first, -1, -1, -1);
@ -106,6 +156,10 @@ subpattern tail
 arg first
 arg next

+// (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
+//       (b) has its Z multiplexer output set to the 'C' port, which is
+//       driven by the 'P' output of the previous DSP cell, and (c) has its
+//       'PCIN' port unused
 match nextP
 	select nextP->type.in(\DSP48E1)
 	select !param(nextP, \CREG, State::S1).as_bool()
@ -116,6 +170,8 @@ match nextP
 	semioptional
 endmatch

+// (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
+//       previous DSP cell right-shifted by 17 bits
 match nextP_shift17
 	if !nextP
 	select nextP_shift17->type.in(\DSP48E1)
@ -145,6 +201,14 @@ code next
 	}
 endcode

+// (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
+//     if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
+//     DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
+//     have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
+//     use its ACOUT port, then examine if an ACOUT -> ACIN cascade
+//     opportunity exists by matching for a $dff-with-optional-clock-enable-
+//     or-reset and checking that the 'D' input of this register is the same
+//     as the 'A' input of the previous DSP
 code argQ clock AREG
 	AREG = -1;
 	if (next) {
@ -152,7 +216,6 @@ code argQ clock AREG
 		if (param(prev, \AREG, 2).as_int() > 0 &&
 				param(next, \AREG, 2).as_int() > 0 &&
 				param(next, \A_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
-				port(next, \ACIN, SigSpec()).is_fully_zero() &&
 				nusers(port(prev, \ACOUT, SigSpec())) <= 1) {
 			argQ = unextend(port(next, \A));
 			clock = port(prev, \CLK);
@ -174,6 +237,7 @@ reject_AREG:			;
 	}
 endcode

+// (4) Same as (3) but for BCOUT -> BCIN cascade
 code argQ clock BREG
 	BREG = -1;
 	if (next) {
@ -203,13 +267,14 @@ reject_BREG:			;
 	}
 endcode

+// (5) Recursively go to (2.1) until no more matches possible, recording the
+//     longest possible chain
 code
 	if (next) {
 		chain.emplace_back(next, nextP_shift17 ? 17 : nextP ? 0 : -1, AREG, BREG);

 		SigSpec sigC = unextend(port(next, \C));

-		// TODO: Cannot use 'reject' since semioptional
 		if (nextP_shift17) {
 			if (GetSize(sigC)+17 <= GetSize(port(std::get<0>(chain.back()), \P)) &&
 					port(std::get<0>(chain.back()), \P).extract(17, GetSize(sigC)) != sigC)
@ -232,22 +297,44 @@ endcode

 // #######################

+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock

 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		for (auto b : init.extract(c.offset, c.width))
+			if (b != State::Sx && b != State::S0)
+				reject;
 	}
 endcode

+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@ -260,14 +347,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ

+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch

 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@ -279,9 +364,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode

+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     exclusively drives the 'D' input of the $dff, with one of the $mux
+//     inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@ -313,6 +400,10 @@ code argD
 		dffrstmux = nullptr;
 endcode

+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
--- a/techlibs/xilinx/synth_xilinx.cc
+++ b/techlibs/xilinx/synth_xilinx.cc
@ -340,13 +340,17 @@ struct SynthXilinxPass : public ScriptPass
 			run("techmap -map +/cmp2lut.v -D LUT_WIDTH=6");
 		}

-		if (check_label("map_dsp"), "(skip if '-nodsp')") {
+		if (check_label("map_dsp", "(skip if '-nodsp')")) {
 			if (!nodsp || help_mode) {
 				// NB: Xilinx multipliers are signed only
-				run("techmap -map +/mul2dsp.v -map +/xilinx/dsp_map.v -D DSP_A_MAXWIDTH=25 -D DSP_A_MAXWIDTH_PARTIAL=18 -D DSP_B_MAXWIDTH=18 "
-						"-D DSP_A_MINWIDTH=2 -D DSP_B_MINWIDTH=2 " // Blocks Nx1 multipliers
-						"-D DSP_Y_MINWIDTH=9 " // UG901 suggests small multiplies are those 4x4 and smaller
-						"-D DSP_SIGNEDONLY=1 -D DSP_NAME=$__MUL25X18");
+				run("techmap -map +/mul2dsp.v -map +/xilinx/dsp_map.v -D DSP_A_MAXWIDTH=25 "
+					"-D DSP_A_MAXWIDTH_PARTIAL=18 -D DSP_B_MAXWIDTH=18 "    // Partial multipliers are intentionally
+												// limited to 18x18 in order to take
+												// advantage of the (PCOUT << 17) -> PCIN
+												// dedicated cascade chain capability
+					"-D DSP_A_MINWIDTH=2 -D DSP_B_MINWIDTH=2 " // Blocks Nx1 multipliers
+					"-D DSP_Y_MINWIDTH=9 " // UG901 suggests small multiplies are those 4x4 and smaller
+					"-D DSP_SIGNEDONLY=1 -D DSP_NAME=$__MUL25X18");
 				run("select a:mul2dsp");
 				run("setattr -unset mul2dsp");
 				run("opt_expr -fine");
--- a/tests/ice40/latches.ys
+++ b/tests/ice40/latches.ys
@ -1,14 +1,11 @@
 read_verilog latches.v
-design -save read

 proc
-async2sync # converts latches to a 'sync' variant clocked by a 'super'-clock
 flatten
-synth_ice40
-equiv_opt -assert -map +/ice40/cells_sim.v synth_ice40 # equivalency check
-design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design)
+# Can't run any sort of equivalence check because latches are blown to LUTs
+#equiv_opt -async2sync -assert -map +/ice40/cells_sim.v synth_ice40 # equivalency check

-design -load read
+#design -load preopt
 synth_ice40
 cd top
 select -assert-count 4 t:SB_LUT4
--- a/tests/ice40/wrapcarry.ys
+++ b/tests/ice40/wrapcarry.ys
@ -0,0 +1,22 @@
+read_verilog <<EOT
+module top(input A, B, CI, output O, CO);
+	SB_CARRY carry (
+		.I0(A),
+		.I1(B),
+		.CI(CI),
+		.CO(CO)
+	);
+	SB_LUT4 #(
+		.LUT_INIT(16'b 0110_1001_1001_0110)
+	) adder (
+		.I0(1'b0),
+		.I1(A),
+		.I2(B),
+		.I3(1'b0),
+		.O(O)
+	);
+endmodule
+EOT
+
+ice40_wrapcarry
+select -assert-count 1 t:$__ICE40_CARRY_WRAPPER
--- a/tests/various/peepopt.ys
+++ b/tests/various/peepopt.ys
@ -131,8 +131,8 @@ EOT
 proc
 equiv_opt -assert peepopt
 design -load postopt
-select -assert-count 1 t:$dff r:WIDTH=5 %i
-select -assert-count 1 t:$mux r:WIDTH=5 %i
+select -assert-count 1 t:$dff r:WIDTH=4 %i
+select -assert-count 1 t:$mux r:WIDTH=4 %i
 select -assert-count 0 t:$dff t:$mux %% t:* %D

 ####################
@ -173,3 +173,41 @@ select -assert-count 1 t:$dff r:WIDTH=2 %i
 select -assert-count 2 t:$mux
 select -assert-count 2 t:$mux r:WIDTH=2 %i
 select -assert-count 0 t:$logic_not t:$dff t:$mux %% t:* %D
+
+####################
+
+design -reset
+read_verilog <<EOT
+module peepopt_dffmuxext_signed_rst_init(input clk, ce, rstn, input signed [1:0] i, output reg signed [3:0] o);
+    initial o <= 4'b0010;
+    always @(posedge clk) begin
+        if (ce) o <= i;
+        if (!rstn) o <= 4'b1111;
+    end
+endmodule
+EOT
+
+proc
+# NB: equiv_opt uses equiv_induct which covers
+#     only the induction half of temporal induction
+#     --- missing the base-case half
+#     This makes it akin to `sat -tempinduct-inductonly`
+#     instead of `sat -tempinduct-baseonly` or
+#     `sat -tempinduct` which is necessary for this
+#     testcase
+#equiv_opt -assert peepopt
+
+design -save gold
+peepopt
+wreduce
+design -stash gate
+design -import gold -as gold
+design -import gate -as gate
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -tempinduct -verify -prove-asserts -show-ports miter
+
+design -load gate
+select -assert-count 1 t:$dff r:WIDTH=4 %i
+select -assert-count 2 t:$mux
+select -assert-count 2 t:$mux r:WIDTH=4 %i
+select -assert-count 0 t:$logic_not t:$dff t:$mux %% t:* %D
--- a/tests/xilinx/latches.ys
+++ b/tests/xilinx/latches.ys
@ -2,9 +2,7 @@ read_verilog latches.v

 proc
 flatten
-equiv_opt -assert -run :prove -map +/xilinx/cells_sim.v synth_xilinx # equivalency check
-async2sync
-equiv_opt -assert -run prove: -map +/xilinx/cells_sim.v synth_xilinx # equivalency check
+equiv_opt -async2sync -assert -map +/xilinx/cells_sim.v synth_xilinx # equivalency check

 design -load preopt
 synth_xilinx