Merge pull request #4021 from povik/booth-wallace

Change `booth` architecture for improved delay, similar signed/unsigned results
2023-11-27 16:26:03 +01:00 · 2023-11-27 16:26:03 +01:00 · beaae79e73
parent 031ad38b5c de16cd253d
commit beaae79e73
3 changed files with 209 additions and 289 deletions
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@ -921,6 +921,9 @@ public:
 	RTLIL::SigSpec extract(int offset, int length = 1) const;
 	RTLIL::SigSpec extract_end(int offset) const { return extract(offset, width_ - offset); }

+	RTLIL::SigBit lsb() const { log_assert(width_); return (*this)[0]; } // bit at index 0; asserts the signal is non-empty
+	RTLIL::SigBit msb() const { log_assert(width_); return (*this)[width_ - 1]; } // bit at index width_-1; asserts the signal is non-empty
+
 	void append(const RTLIL::SigSpec &signal);
 	inline void append(Wire *wire) { append(RTLIL::SigSpec(wire)); }
 	inline void append(const RTLIL::SigChunk &chunk) { append(RTLIL::SigSpec(chunk)); }
--- a/passes/techmap/booth.cc
+++ b/passes/techmap/booth.cc
@ -66,6 +66,8 @@ struct BoothPassWorker {
 	RTLIL::Module *module;
 	SigMap sigmap;
 	int booth_counter;
+	bool lowpower = false;
+	bool mapped_cpa = false;

 	BoothPassWorker(RTLIL::Module *module) : module(module), sigmap(module) { booth_counter = 0; }

@ -184,6 +186,24 @@ struct BoothPassWorker {
 		cor_o = module->AndGate(NEW_ID_SUFFIX(name), pp1_nor_pp0, cori_i);
 	}

+	// Emit one full-adder cell per bit position i via mod->addFa(), wiring inputs sig_a[i], sig_b[i],
+	// sig_c[i] to outputs sig_x[i], sig_y[i]. Cells are named "<name>[<i>]"; all five signals must be equally wide.
+	void BuildBitwiseFa(Module *mod, const std::string &name, const SigSpec &sig_a, const SigSpec &sig_b,
+			    const SigSpec &sig_c, const SigSpec &sig_x, const SigSpec &sig_y, const std::string &src = "")
+	{
+		// We can't emit a single wide full-adder cell here since there would
+		// typically be feedback loops involving the cells' input and output
+		// ports, and Yosys doesn't cope well with those
+		const int width = sig_a.size();
+		log_assert(sig_b.size() == width);
+		log_assert(sig_c.size() == width);
+		log_assert(sig_x.size() == width);
+		log_assert(sig_y.size() == width);
+
+		for (int i = 0; i < width; i++)
+			mod->addFa(stringf("%s[%d]", name.c_str(), i), sig_a[i], sig_b[i], sig_c[i], sig_x[i], sig_y[i], src);
+	}
+
 	void run()
 	{
 		for (auto cell : module->selected_cells()) {
@ -258,17 +278,19 @@ struct BoothPassWorker {
 			}
 			log_assert(GetSize(Y) == required_op_size);

-			if (!is_signed) /* unsigned multiplier */
-				CreateBoothUMult(module,
+			if (!lowpower)
+				CreateBoothMult(module,
 					A, // multiplicand
 					B, // multiplier(scanned)
-						 Y  // result
+					Y, // result
+					is_signed
 				);
-			else /* signed multiplier */
-				CreateBoothSMult(module,
+			else
+				CreateBoothLowpowerMult(module,
 					A, // multiplicand
 					B, // multiplier(scanned)
-						 Y  // result (sized)
+					Y, // result
+					is_signed
 				);

 			module->remove(cell);
@ -276,25 +298,54 @@ struct BoothPassWorker {
 		}
 	}

+	// Reduce `summands` (each zero-extended to `width` bits) to two addends using layers of bitwise
+	// full adders. Returns the pair; with fewer than two summands the missing addends are `width` zero bits.
+	SigSig WallaceSum(int width, std::vector<SigSpec> summands)
+	{
+		for (auto &s : summands)
+			s.extend_u0(width);
+		while (summands.size() > 2) {
+			std::vector<SigSpec> new_summands;
+			int i;
+			// every full group of three rows is compressed into two rows
+			for (i = 0; i < (int) summands.size() - 2; i += 3) {
+				SigSpec x = module->addWire(NEW_ID, width);
+				SigSpec y = module->addWire(NEW_ID, width);
+				BuildBitwiseFa(module, NEW_ID.str(), summands[i], summands[i + 1],
+					       summands[i + 2], x, y);
+				new_summands.push_back(y);
+				// x is re-entered one bit position higher: its top bit is dropped, bit 0 becomes constant 0
+				new_summands.push_back({x.extract(0, width - 1), State::S0});
+			}
+			// rows left over (fewer than three) pass through to the next layer unchanged
+			new_summands.insert(new_summands.begin(), summands.begin() + i, summands.end());
+			std::swap(summands, new_summands);
+		}
+		// The fill constructor takes (State, width); with the arguments swapped the call would
+		// resolve to SigSpec(int val, int width) and yield a zero-width signal.
+		const SigSpec zero(State::S0, width);
+		if (summands.empty())
+			return SigSig(zero, zero);
+		return SigSig(summands[0], summands.size() == 1 ? zero : summands[1]);
+	}
+
 	/*
-	  Build Unsigned Multiplier.
+	  Build Multiplier.
 	  -------------------------
-	  Create a booth unsigned multiplier.
-	  Uses a generic booth multiplier with
-	  extra row of decoders and extended multiplier
+	  Uses a generic booth multiplier
 	*/

-	void CreateBoothUMult(RTLIL::Module *module,
+	void CreateBoothMult(RTLIL::Module *module,
 			      SigSpec X, // multiplicand
 			      SigSpec Y, // multiplier
-			      SigSpec Z)
+			      SigSpec Z,
+			      bool is_signed)
 	{ // result
-		int x_sz = X.size(), z_sz = Z.size();
+		int z_sz = Z.size();

 		SigSpec one_int, two_int, s_int, sb_int;
 		int encoder_count = 0;

-		BuildBoothUMultEncoders(Y, one_int, two_int, s_int, sb_int, module, encoder_count);
+		BuildBoothMultEncoders(Y, one_int, two_int, s_int, sb_int, module, encoder_count, is_signed);

 		// Build the decoder rows
 		// format of each Partial product to be passed to CSA
@ -308,43 +359,24 @@ struct BoothPassWorker {

 		// Row 0: special case 1. Format S/.S.S.C.Data
 		SigSpec ppij_row_0;
-		BuildBoothUMultDecoderRow0(module, X, s_int, sb_int, one_int, two_int, ppij_row_0);
+		BuildBoothMultDecoderRow0(module, X, s_int, sb_int, one_int, two_int, ppij_row_0, is_signed);

 		// data, shift, sign
 		ppij_int.push_back(std::make_tuple(ppij_row_0, 0, s_int[0]));

-		for (int i = 1; i < encoder_count - 2; i++) {
+		for (int i = 1; i < encoder_count; i++) {
 			// format 1,S.Data.shift = encoder_ix*2,sign = sb_int[i]
 			SigSpec ppij_row_n;

-			BuildBoothUMultDecoderRowN(module,
+			BuildBoothMultDecoderRowN(module,
 						   X, // multiplicand
 						   one_int[i], two_int[i], s_int[i], sb_int[i], ppij_row_n, i,
-						   false, // include sign
-						   false  // include constant
+						   is_signed
 			);
 			// data, shift, sign
 			ppij_int.push_back(std::make_tuple(ppij_row_n, i * 2, s_int[i]));
 		}

-		// Build second to last row
-		// format S/,Data + sign bit
-		SigSpec ppij_row_em1;
-		BuildBoothUMultDecoderRowN(module, X, one_int[encoder_count - 2], two_int[encoder_count - 2], s_int[encoder_count - 2],
-					   sb_int[encoder_count - 2], ppij_row_em1, encoder_count - 2,
-					   false, // include sign
-					   true	  // no constant
-		);
-		ppij_int.push_back(std::make_tuple(ppij_row_em1, (encoder_count - 2) * 2, s_int[encoder_count - 2]));
-		// Build last row
-		// format Data + sign bit
-		SigSpec ppij_row_e;
-		BuildBoothUMultDecoderRowN(module, X, one_int[encoder_count - 1], two_int[encoder_count - 1], s_int[encoder_count - 1],
-					   sb_int[encoder_count - 1], ppij_row_e, encoder_count - 1,
-					   true, // no sign
-					   true	 // no constant
-		);
-		ppij_int.push_back(std::make_tuple(ppij_row_e, (encoder_count - 1) * 2, s_int[encoder_count - 1]));

 		//  Debug dump out partial products
 		//  DebugDumpPP(ppij_int);
@ -358,35 +390,34 @@ struct BoothPassWorker {
 		for (int i = 0; i < encoder_count + 1; i++)
 			aligned_pp[i].extend_u0(z_sz);

-		AlignPP(x_sz, z_sz, ppij_int, aligned_pp);
+		AlignPP(z_sz, ppij_int, aligned_pp);

 		// Debug: dump out aligned partial products.
 		// Later on yosys will clean up unused constants
 		//  DebugDumpAlignPP(aligned_pp);

-		SigSpec s_vec;
-		SigSpec c_vec;
-		std::vector<std::vector<RTLIL::Cell *>> debug_csa_trees;
-
-		debug_csa_trees.resize(z_sz);
-
-		BuildCSATree(module, aligned_pp, s_vec, c_vec, debug_csa_trees);
+		SigSig wtree_sum = WallaceSum(z_sz, aligned_pp);

 		// Debug code: Dump out the csa trees
 		// DumpCSATrees(debug_csa_trees);
 		// Build the CPA to do the final accumulation.
-		BuildCPA(module, s_vec, c_vec, Z);
+		log_assert(wtree_sum.second[0] == State::S0);
+		if (mapped_cpa)
+			BuildCPA(module, wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z);
+		else
+			module->addAdd(NEW_ID, wtree_sum.first, {wtree_sum.second.extract_end(1), State::S0}, Z);
 	}

 	/*
 	  Build Row 0 of decoders
 	*/

-	void BuildBoothUMultDecoderRow0(RTLIL::Module *module,
+	void BuildBoothMultDecoderRow0(RTLIL::Module *module,
 					SigSpec X, // multiplicand
 					SigSpec s_int, SigSpec sb_int, SigSpec one_int,
-					SigSpec two_int, SigSpec &ppij_vec)
+					SigSpec two_int, SigSpec &ppij_vec, bool is_signed)
 	{
+		(void)sb_int;
 		(void)module;
 		int x_sz = GetSize(X);
 		SigBit ppij;
@ -399,21 +430,32 @@ struct BoothPassWorker {
 			ppij_vec.append(Bur4d_n(stringf("row0_dec_%d", i), X[i], X[i - 1],
 						one_int[0], two_int[0], s_int[0]));

+
 		// The redundant bit. Duplicate decoding of last bit.
-		ppij_vec.append(Bur4d_msb("row0_dec_msb", X[x_sz - 1], two_int[0], s_int[0]));
+		if (!is_signed) {
+			ppij_vec.append(Bur4d_msb("row0_dec_msb", X.msb(), two_int[0], s_int[0]));
+		} else {
+			ppij_vec.append(Bur4d_n("row0_dec_msb", X.msb(), X.msb(),
+										  one_int[0], two_int[0], s_int[0]));
+		}

 		// append the sign bits
-		ppij_vec.append(s_int[0]);
-		ppij_vec.append(s_int[0]);
-		ppij_vec.append(sb_int[0]);
+		if (is_signed) {
+			SigBit e = module->XorGate(NEW_ID, s_int[0], module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int[0], one_int[0])));
+			ppij_vec.append({module->NotGate(NEW_ID, e), e, e});
+		} else {
+			// append the sign bits
+			ppij_vec.append({module->NotGate(NEW_ID, s_int[0]), s_int[0], s_int[0]});
+		}
 	}

 	// Build a generic row of decoders.

-	void BuildBoothUMultDecoderRowN(RTLIL::Module *module,
+	void BuildBoothMultDecoderRowN(RTLIL::Module *module,
 					SigSpec X, // multiplicand
 					SigSpec one_int, SigSpec two_int, SigSpec s_int, SigSpec sb_int,
-					SigSpec &ppij_vec, int row_ix, bool no_sign, bool no_constant)
+					SigSpec &ppij_vec, int row_ix,
+					bool is_signed)
 	{
 		(void)module;
 		int x_sz = GetSize(X);
@ -426,15 +468,14 @@ struct BoothPassWorker {
 			ppij_vec.append(Bur4d_n(stringf("row_%d_dec_%d", row_ix, i), X[i], X[i - 1],
 				     		one_int, two_int, s_int));

-		// redundant bit
+		if (!is_signed) {			// redundant bit
 			ppij_vec.append(Bur4d_msb("row_dec_red", X[x_sz - 1], two_int, s_int));
+		} else {
+			ppij_vec.append(Bur4d_n(stringf("row_%d_dec_msb", row_ix), X[x_sz - 1], X[x_sz - 1],
+				     					one_int, two_int, s_int));
+		}

-		// sign bit
-		if (!no_sign) // if no sign is false then make a sign bit
-			ppij_vec.append(sb_int);
-
-		// constant bit
-		if (!no_constant) // if non constant is false make a constant bit
+		ppij_vec.append(!is_signed ? sb_int[0] : module->XorGate(NEW_ID, sb_int, module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int, one_int))));
 		ppij_vec.append(State::S1);
 	}

@ -591,7 +632,7 @@ struct BoothPassWorker {
 	  Pad out rows with zeros and left the opt pass clean them up.

 	*/
-	void AlignPP(int x_sz, int z_sz, std::vector<std::tuple<SigSpec, int, SigBit>> &ppij_int,
+	void AlignPP(int z_sz, std::vector<std::tuple<SigSpec, int, SigBit>> &ppij_int,
 		     std::vector<SigSpec> &aligned_pp)
 	{
 		unsigned aligned_pp_ix = aligned_pp.size() - 1;
@ -611,12 +652,10 @@ struct BoothPassWorker {
 		// in first column of the last partial product
 		// which is at index corresponding to size of multiplicand
 		{
+			int prior_row_idx = get<1>(ppij_int[aligned_pp_ix - 1]);
 			SigBit prior_row_sign = get<2>(ppij_int[aligned_pp_ix - 1]);
-			//if (prior_row_sign) {
-				log_assert(aligned_pp_ix < aligned_pp.size());
-				log_assert(x_sz - 1 < (int)(aligned_pp[aligned_pp_ix].size()));
-				aligned_pp[aligned_pp_ix][x_sz - 1] = prior_row_sign;
-			//}
+			if (prior_row_idx < z_sz)
+				aligned_pp[aligned_pp_ix][prior_row_idx] = prior_row_sign;
 		}

 		for (int row_ix = aligned_pp_ix - 1; row_ix >= 0; row_ix--) {
@ -813,12 +852,12 @@ struct BoothPassWorker {
 		}
 	}

-	void BuildBoothUMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int,
-				     SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, int &encoder_ix)
+	void BuildBoothMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int,
+				     SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, int &encoder_ix, bool is_signed)
 	{
 		int y_sz = GetSize(Y);

-		for (int y_ix = 0; y_ix < y_sz;) {
+		for (int y_ix = 0; y_ix < (!is_signed ? y_sz : y_sz - 1);) {
 			std::string enc_name = stringf("bur_enc_%d", encoder_ix);

 			two_int.append(module->addWire(NEW_ID_SUFFIX(stringf("two_int_%d", encoder_ix)), 1));
@ -844,7 +883,7 @@ struct BoothPassWorker {
 				bool need_padded_cell = false;

 				if (y_ix > y_sz - 1) {
-					y0 = State::S0;
+					y0 = is_signed ? Y.msb() : State::S0;
 					need_padded_cell = false;
 				} else {
 					y0 = Y[y_ix];
@ -853,7 +892,7 @@ struct BoothPassWorker {

 				if (y_ix > y_sz - 1) {
 					need_padded_cell = false;
-					y1 = State::S0;
+					y1 = is_signed ? Y.msb() : State::S0;
 				} else {
 					y1 = Y[y_ix];
 					y_ix++;
@ -861,10 +900,10 @@ struct BoothPassWorker {

 				if (y_ix > y_sz - 1) {
 					need_padded_cell = false;
-					y2 = State::S0;
+					y2 = is_signed ? Y.msb() : State::S0;
 				} else {
 					if (y_ix == y_sz - 1)
-						need_padded_cell = true;
+						need_padded_cell = !is_signed;
 					else
 						need_padded_cell = false;
 					y2 = Y[y_ix];
@ -902,12 +941,15 @@ struct BoothPassWorker {
 	}

 	/*
-	  Signed Multiplier
+	  Low-power Multiplier
 	*/
-	void CreateBoothSMult(RTLIL::Module *module, SigSpec X, SigSpec Y, SigSpec Z)
+	void CreateBoothLowpowerMult(RTLIL::Module *module, SigSpec X, SigSpec Y, SigSpec Z, bool is_signed)
 	{ // product
 		int x_sz = X.size(), y_sz = Y.size(), z_sz = Z.size();

+		if (!is_signed)
+			log_error("Low-power Booth architecture is only supported on signed multipliers.\n");
+
 		unsigned enc_count = (y_sz / 2) + (((y_sz % 2) != 0) ? 1 : 0);
 		int dec_count = x_sz + 1;

@ -1009,219 +1051,89 @@ struct BoothPassWorker {
 				  PPij[((encoder_ix - 1) * dec_count) + dec_count - 1], unused_op);
 		}

+		//
+		// instantiate the quadrant 1 cell. This is the upper right
+		// quadrant which can be realized using non-booth encoded logic.
+		//
+		SigBit pp0_o_int, pp1_o_int, nxj_o_int, q1_carry_out;
+
+		BuildBoothQ1("icb_booth_q1_",
+			     negi_n_int[0], // negi
+			     cori_n_int[0], // cori
+			     X[0], X[1], Y[0], Y[1],
+			     nxj_o_int, q1_carry_out, pp0_o_int, pp1_o_int);
+
+		module->connect(Z[0], pp0_o_int);
+		module->connect(Z[1], pp1_o_int);
+		module->connect(nxj[(0 * dec_count) + 2], nxj_o_int);
+
 		//
 		// sum up the partial products
 		//
-		int fa_el_ix = 0;
 		int fa_row_ix = 0;
-		// use 1 d arrays (2d cannot have variable sized indices)
-		SigSpec fa_sum_n(State::S0, fa_row_count * fa_count);
-		SigSpec fa_carry_n(State::S0, fa_row_count * fa_count);
+		std::vector<SigSpec> fa_sum;
+		std::vector<SigSpec> fa_carry;

 		for (fa_row_ix = 0; fa_row_ix < fa_row_count; fa_row_ix++) {
-			for (fa_el_ix = 0; fa_el_ix < fa_count; fa_el_ix++) {
-				fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix] =
-					module->addWire(NEW_ID_SUFFIX(stringf("fa_sum_n_%d_%d", fa_row_ix, fa_el_ix)), 1);
-				fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix] =
-					module->addWire(NEW_ID_SUFFIX(stringf("fa_carry_n_%d_%d", fa_row_ix, fa_el_ix)), 1);
-			}
+			fa_sum.push_back(module->addWire(NEW_ID_SUFFIX(stringf("fa_sum_%d", fa_row_ix)), fa_count));
+			fa_carry.push_back(module->addWire(NEW_ID_SUFFIX(stringf("fa_carry_%d", fa_row_ix)), fa_count));
 		}

 		// full adder creation
-		std::string bfa_name;
-		std::string exc_inv_name;
-		for (fa_row_ix = 0; fa_row_ix < fa_row_count; fa_row_ix++) {
-			for (fa_el_ix = 0; fa_el_ix < fa_count; fa_el_ix++) {
-				// base case: 1st row. Inputs from decoders
-				// Note in rest of tree inputs from prior addition and a decoder
-				if (fa_row_ix == 0) {
-					// beginning
-					// base case:
-					// first two cells: have B input hooked to 0.
-					if (fa_el_ix == 0) {
-						// quadrant 1: we hard code these using non-booth
-						fa_el_ix++;
-
-					}
-					// step case
-					else if (fa_el_ix >= 2 && fa_el_ix <= x_sz) {
-						// middle (2...x_sz cells)
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_step_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ PPij[(0 * dec_count) + fa_el_ix],
-							/* B */ PPij[(1 * dec_count) + fa_el_ix - 2],
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
+		// base case: 1st row: Inputs from decoders
+		// 1st row exception: two localized inverters due to sign extension structure
+		SigBit d08_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv1"), PPij[(0 * dec_count) + dec_count - 1]);
+		SigBit d18_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv2"), PPij[(1 * dec_count) + dec_count - 1]);
+		BuildBitwiseFa(module, NEW_ID_SUFFIX("fa_row_0").str(),
+			/* A */ {State::S0, d08_inv, PPij[(0 * dec_count) + x_sz], PPij.extract((0 * dec_count) + 2, x_sz - 1)},
+			/* B */ {State::S1, d18_inv, PPij.extract((1 * dec_count), x_sz)},
+			/* C */ fa_carry[0].extract(1, x_sz + 2),
+			/* X */ fa_carry[0].extract(2, x_sz + 2),
+			/* Y */ fa_sum[0].extract(2, x_sz + 2)
 		);
-					}
-					// end 3 cells: x_sz+1.2.3
-					//
-					else {
-						// fa_el_ix = x_sz+1
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_0_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ PPij[(0 * dec_count) + x_sz],
-							/* B */ PPij[(1 * dec_count) + fa_el_ix - 2],
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-
-						// exception:invert ppi
-						fa_el_ix++;
-						SigBit d08_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_0_exc_inv1_%d_%d_L", fa_row_ix, fa_el_ix)),
-										 PPij[(0 * dec_count) + dec_count - 1]);
-
-						SigBit d18_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_0_exc_inv2_%d_%d_L", fa_row_ix, fa_el_ix)),
-										 PPij[(1 * dec_count) + dec_count - 1]);
-
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_1_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ d08_inv,
-							/* B */ d18_inv,
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-
-						// sign extension
-						fa_el_ix++;
-
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_2_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ State::S0,
-							/* B */ State::S1,
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-					}
-				}
+		module->connect(fa_carry[0][1], q1_carry_out);

 		// step case: 2nd and rest of rows. (fa_row_ix == 1...n)
 		// special because these are driven by a decoder and prior fa.
-				else {
-					// beginning
-					if (fa_el_ix == 0) {
-						// first two cells: have B input hooked to 0.
-						// column is offset by row_ix*2
-
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_base_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + 2],
-							/* B */ State::S0,
-							/* C */ cori_n_int[fa_row_ix],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-						fa_el_ix++;
-
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_base_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + 3], // from prior full adder row
-							/* B */ State::S0,
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-
-					}
-
-					else if (fa_el_ix >= 2 && fa_el_ix <= x_sz + 1) {
-						// middle (2...x_sz+1 cells)
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_step_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + fa_el_ix + 2],
-							/* B */ PPij[(fa_row_ix + 1) * dec_count + fa_el_ix - 2],
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-					}
-
-					else if (fa_el_ix > x_sz + 1) {
+		for (fa_row_ix = 1; fa_row_ix < fa_row_count; fa_row_ix++) {
 			// end two bits: sign extension
-						SigBit d_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_%d_L", fa_row_ix, fa_el_ix)),
+			SigBit d_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_L", fa_row_ix)),
 						       PPij[((fa_row_ix + 1) * dec_count) + dec_count - 1]);

-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_se_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ fa_carry_n[((fa_row_ix - 1) * fa_count) + fa_count - 1],
-							/* B */ d_inv,
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
-						);
-						fa_el_ix++;
+			BuildBitwiseFa(module, NEW_ID_SUFFIX(stringf("fa_row_%d", fa_row_ix)).str(),
+				/* A */	{State::S0, fa_carry[fa_row_ix - 1][fa_count - 1], fa_sum[fa_row_ix - 1].extract(2, x_sz + 2)},
+				/* B */ {State::S1, d_inv, PPij.extract((fa_row_ix + 1) * dec_count, x_sz), State::S0, State::S0},

-						// sign extension
-						module->addFa(NEW_ID_SUFFIX(stringf("bfa_se_%d_%d_L", fa_row_ix, fa_el_ix)),
-							/* A */ State::S0,
-							/* B */ State::S1,
-							/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
-							/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
-							/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
+				/* C */ {fa_carry[fa_row_ix].extract(0, x_sz + 3), cori_n_int[fa_row_ix]},
+				/* X */ fa_carry[fa_row_ix],
+				/* Y */ fa_sum[fa_row_ix]
 			);
 		}
-				}
-			}
-		}

 		// instantiate the cpa
 		SigSpec cpa_carry;
+		if (z_sz > fa_row_count * 2)
+			cpa_carry = module->addWire(NEW_ID_SUFFIX("cpa_carry"), z_sz - fa_row_count * 2);

-		for (int cix = 0; cix < z_sz; cix++)
-			cpa_carry.append(module->addWire(NEW_ID_SUFFIX(stringf("cpa_carry_%d", cix)), 1));
-
-		for (int cpa_ix = 0; cpa_ix < z_sz; cpa_ix++) {
 		// The end case where we pass the last two summands
 		// from prior row directly to product output
 		// without using a cpa cell. This is always
 		// 0,1 index of prior fa row
-			if (cpa_ix <= fa_row_count * 2 - 1) {
+		for (int cpa_ix = 0; cpa_ix < fa_row_count * 2; cpa_ix += 2) {
 			int fa_row_ix = cpa_ix / 2;
+			module->connect(Z.extract(cpa_ix, 2), fa_sum[fa_row_ix].extract(0, 2));
+		}

-				module->addBufGate(NEW_ID_SUFFIX(stringf("pp_buf_%d_driven_by_fa_row_%d", cpa_ix, fa_row_ix)),
-						   fa_sum_n[(fa_row_ix * fa_count) + 0], Z[cpa_ix]);
-
-				cpa_ix++;
-				module->addBufGate(NEW_ID_SUFFIX(stringf("pp_buf_%d_driven_by_fa_row_%d", cpa_ix, fa_row_ix)),
-						   fa_sum_n[(fa_row_ix * fa_count) + 1], Z[cpa_ix]);
-			} else {
+		for (int cpa_ix = fa_row_count * 2; cpa_ix < z_sz; cpa_ix++) {
 			int offset = fa_row_count * 2;
-				bool base_case = cpa_ix - offset == 0 ? true : false;
 			std::string cpa_name = stringf("cpa_%d", cpa_ix - offset);

-				SigBit ci;
-				if (base_case)
-					ci = cori_n_int[enc_count - 1];
-				else
-					ci = cpa_carry[cpa_ix - offset - 1];
-
+			SigBit ci = (cpa_ix == offset) ? cori_n_int[enc_count - 1] : cpa_carry[cpa_ix - offset - 1];
 			SigBit op;
-				BuildHa(cpa_name, fa_sum_n[(fa_row_count - 1) * fa_count + cpa_ix - offset + 2], ci, op,
-					cpa_carry[cpa_ix - offset]);
+			BuildHa(cpa_name, fa_sum[fa_row_count - 1][cpa_ix - offset + 2], ci, op, cpa_carry[cpa_ix - offset]);
 			module->connect(Z[cpa_ix], op);
 		}
 	}
-
-		//
-		// instantiate the quadrant 1 cell. This is the upper right
-		// quadrant which can be realized using non-booth encoded logic.
-		//
-		std::string q1_name = "icb_booth_q1_";
-
-		SigBit pp0_o_int;
-		SigBit pp1_o_int;
-		SigBit nxj_o_int;
-		SigBit cor_o_int;
-
-		BuildBoothQ1(q1_name,
-			     negi_n_int[0], // negi
-			     cori_n_int[0], // cori
-
-			     X[0], X[1], Y[0], Y[1],
-
-			     nxj_o_int, cor_o_int, pp0_o_int, pp1_o_int);
-
-		module->connect(fa_sum_n[(0 * fa_count) + 0], pp0_o_int);
-		module->connect(fa_sum_n[(0 * fa_count) + 1], pp1_o_int);
-		module->connect(fa_carry_n[(0 * fa_count) + 1], cor_o_int);
-		module->connect(nxj[(0 * dec_count) + 2], nxj_o_int);
-	}
 };

 struct BoothPass : public Pass {
@ -1232,21 +1144,13 @@ struct BoothPass : public Pass {
 		log("\n");
 		log("    booth [selection]\n");
 		log("\n");
-		log("This pass replaces multiplier cells with an implementation based on the Booth\n");
-		log("algorithm. It operates on $mul cells whose width of operands is at least 4x4\n");
-		log("and whose width of result is at least 8. The detailed architecture is selected\n");
-		log("from two options based on the signedness of the operands to the $mul cell.\n");
+		log("This pass replaces multiplier cells with a radix-4 Booth-encoded implementation.\n");
+		log("It operates on $mul cells whose width of operands is at least 4x4 and whose\n");
+		log("width of result is at least 8.\n");
 		log("\n");
-		log("See the references below for the description of the architectures.\n");
-		log("\n");
-		log("Signed-multiplier architecture:\n");
-		log("Y. J. Chang, Y. C. Cheng, S. C. Liao and C. H. Hsiao, \"A Low Power Radix-4 Booth\n");
-		log("Multiplier With Pre-Encoded Mechanism,\" in IEEE Access, vol. 8, pp. 114842-114853,\n");
-		log("2020, doi: 10.1109/ACCESS.2020.3003684\n");
-		log("\n");
-		log("Unsigned-multiplier architecture:\n");
-		log("G. W. Bewick, \"Fast Multiplication: Algorithms and Implementations,\" PhD Thesis,\n");
-		log("Department of Electrical Engineering, Stanford University, 1994\n");
+		log("    -lowpower\n");
+		log("        use an alternative low-power architecture for the generated multiplier\n");
+		log("        (signed multipliers only)\n");
 		log("\n");
 	}
 	void execute(vector<string> args, RTLIL::Design *design) override
@ -1254,7 +1158,16 @@ struct BoothPass : public Pass {
 		log_header(design, "Executing BOOTH pass (map to Booth multipliers).\n");

 		size_t argidx;
+		bool mapped_cpa = false;
+		bool lowpower = false;
 		for (argidx = 1; argidx < args.size(); argidx++) {
+			if (args[argidx] == "-mapped_cpa")
+				// Have an undocumented option which helps with multiplier
+				// verification using specialized tools (AMulet2 in particular)
+				mapped_cpa = true;
+			else if (args[argidx] == "-lowpower")
+				lowpower = true;
+			else
 				break;
 		}
 		extra_args(args, argidx, design);
@ -1264,6 +1177,8 @@ struct BoothPass : public Pass {
 		for (auto mod : design->selected_modules()) {
 			if (!mod->has_processes_warn()) {
 				BoothPassWorker worker(mod);
+				worker.mapped_cpa = mapped_cpa;
+				worker.lowpower = lowpower;
 				worker.run();
 				total += worker.booth_counter;
 			}
--- a/techlibs/lattice/synth_lattice.cc
+++ b/techlibs/lattice/synth_lattice.cc
@ -362,6 +362,8 @@ struct SynthLatticePass : public ScriptPass
 				run("techmap -map +/mul2dsp.v -map +/lattice/dsp_map" + dsp_map + ".v -D DSP_A_MAXWIDTH=18 -D DSP_B_MAXWIDTH=18  -D DSP_A_MINWIDTH=2 -D DSP_B_MINWIDTH=2  -D DSP_NAME=$__MUL18X18", "(unless -nodsp)");
 				run("chtype -set $mul t:$__soft_mul", "(unless -nodsp)");
 			}
+			if (family == "xo3" || help_mode)
+				run("booth", "(only if '-family xo3')");
 			run("alumacc");
 			run("opt");
 			run("memory -nomap" + no_rw_check_opt);