Merge pull request #4021 from povik/booth-wallace

Change `booth` architecture for improved delay, similar signed/unsigned results
This commit is contained in:
N. Engelhardt 2023-11-27 16:26:03 +01:00 committed by GitHub
commit beaae79e73
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 209 additions and 289 deletions

View File

@ -921,6 +921,9 @@ public:
RTLIL::SigSpec extract(int offset, int length = 1) const;
RTLIL::SigSpec extract_end(int offset) const { return extract(offset, width_ - offset); }
RTLIL::SigBit lsb() const { log_assert(width_); return (*this)[0]; };
RTLIL::SigBit msb() const { log_assert(width_); return (*this)[width_ - 1]; };
void append(const RTLIL::SigSpec &signal);
inline void append(Wire *wire) { append(RTLIL::SigSpec(wire)); }
inline void append(const RTLIL::SigChunk &chunk) { append(RTLIL::SigSpec(chunk)); }

View File

@ -66,6 +66,8 @@ struct BoothPassWorker {
RTLIL::Module *module;
SigMap sigmap;
int booth_counter;
bool lowpower = false;
bool mapped_cpa = false;
BoothPassWorker(RTLIL::Module *module) : module(module), sigmap(module) { booth_counter = 0; }
@ -184,6 +186,24 @@ struct BoothPassWorker {
cor_o = module->AndGate(NEW_ID_SUFFIX(name), pp1_nor_pp0, cori_i);
}
void BuildBitwiseFa(Module *mod, std::string name, const SigSpec &sig_a, const SigSpec &sig_b,
const SigSpec &sig_c, const SigSpec &sig_x, const SigSpec &sig_y,
const std::string &src = "")
{
// We can't emit a single wide full-adder cell here since
// there would typically be feedback loops involving the cells'
// input and output ports, and Yosys doesn't cope well with
// those
log_assert(sig_a.size() == sig_b.size());
log_assert(sig_a.size() == sig_c.size());
log_assert(sig_a.size() == sig_x.size());
log_assert(sig_a.size() == sig_y.size());
for (int i = 0; i < sig_a.size(); i++)
mod->addFa(stringf("%s[%d]", name.c_str(), i), sig_a[i], sig_b[i],
sig_c[i], sig_x[i], sig_y[i], src);
}
void run()
{
for (auto cell : module->selected_cells()) {
@ -258,17 +278,19 @@ struct BoothPassWorker {
}
log_assert(GetSize(Y) == required_op_size);
if (!is_signed) /* unsigned multiplier */
CreateBoothUMult(module,
if (!lowpower)
CreateBoothMult(module,
A, // multiplicand
B, // multiplier(scanned)
Y // result
Y, // result
is_signed
);
else /* signed multiplier */
CreateBoothSMult(module,
else
CreateBoothLowpowerMult(module,
A, // multiplicand
B, // multiplier(scanned)
Y // result (sized)
Y, // result
is_signed
);
module->remove(cell);
@ -276,25 +298,54 @@ struct BoothPassWorker {
}
}
SigSig WallaceSum(int width, std::vector<SigSpec> summands)
{
for (auto &s : summands)
s.extend_u0(width);
while (summands.size() > 2) {
std::vector<SigSpec> new_summands;
int i;
for (i = 0; i < (int) summands.size() - 2; i += 3) {
SigSpec x = module->addWire(NEW_ID, width);
SigSpec y = module->addWire(NEW_ID, width);
BuildBitwiseFa(module, NEW_ID.str(), summands[i], summands[i + 1],
summands[i + 2], x, y);
new_summands.push_back(y);
new_summands.push_back({x.extract(0, width - 1), State::S0});
}
new_summands.insert(new_summands.begin(), summands.begin() + i, summands.end());
std::swap(summands, new_summands);
}
if (!summands.size())
return SigSig(SigSpec(width, State::S0), SigSpec(width, State::S0));
else if (summands.size() == 1)
return SigSig(summands[0], SigSpec(width, State::S0));
else
return SigSig(summands[0], summands[1]);
}
/*
Build Unsigned Multiplier.
Build Multiplier.
-------------------------
Create a booth unsigned multiplier.
Uses a generic booth multiplier with
extra row of decoders and extended multiplier
Uses a generic booth multiplier
*/
void CreateBoothUMult(RTLIL::Module *module,
void CreateBoothMult(RTLIL::Module *module,
SigSpec X, // multiplicand
SigSpec Y, // multiplier
SigSpec Z)
SigSpec Z,
bool is_signed)
{ // result
int x_sz = X.size(), z_sz = Z.size();
int z_sz = Z.size();
SigSpec one_int, two_int, s_int, sb_int;
int encoder_count = 0;
BuildBoothUMultEncoders(Y, one_int, two_int, s_int, sb_int, module, encoder_count);
BuildBoothMultEncoders(Y, one_int, two_int, s_int, sb_int, module, encoder_count, is_signed);
// Build the decoder rows
// format of each Partial product to be passed to CSA
@ -308,43 +359,24 @@ struct BoothPassWorker {
// Row 0: special case 1. Format S/.S.S.C.Data
SigSpec ppij_row_0;
BuildBoothUMultDecoderRow0(module, X, s_int, sb_int, one_int, two_int, ppij_row_0);
BuildBoothMultDecoderRow0(module, X, s_int, sb_int, one_int, two_int, ppij_row_0, is_signed);
// data, shift, sign
ppij_int.push_back(std::make_tuple(ppij_row_0, 0, s_int[0]));
for (int i = 1; i < encoder_count - 2; i++) {
for (int i = 1; i < encoder_count; i++) {
// format 1,S.Data.shift = encoder_ix*2,sign = sb_int[i]
SigSpec ppij_row_n;
BuildBoothUMultDecoderRowN(module,
BuildBoothMultDecoderRowN(module,
X, // multiplicand
one_int[i], two_int[i], s_int[i], sb_int[i], ppij_row_n, i,
false, // include sign
false // include constant
is_signed
);
// data, shift, sign
ppij_int.push_back(std::make_tuple(ppij_row_n, i * 2, s_int[i]));
}
// Build second to last row
// format S/,Data + sign bit
SigSpec ppij_row_em1;
BuildBoothUMultDecoderRowN(module, X, one_int[encoder_count - 2], two_int[encoder_count - 2], s_int[encoder_count - 2],
sb_int[encoder_count - 2], ppij_row_em1, encoder_count - 2,
false, // include sign
true // no constant
);
ppij_int.push_back(std::make_tuple(ppij_row_em1, (encoder_count - 2) * 2, s_int[encoder_count - 2]));
// Build last row
// format Data + sign bit
SigSpec ppij_row_e;
BuildBoothUMultDecoderRowN(module, X, one_int[encoder_count - 1], two_int[encoder_count - 1], s_int[encoder_count - 1],
sb_int[encoder_count - 1], ppij_row_e, encoder_count - 1,
true, // no sign
true // no constant
);
ppij_int.push_back(std::make_tuple(ppij_row_e, (encoder_count - 1) * 2, s_int[encoder_count - 1]));
// Debug dump out partial products
// DebugDumpPP(ppij_int);
@ -358,35 +390,34 @@ struct BoothPassWorker {
for (int i = 0; i < encoder_count + 1; i++)
aligned_pp[i].extend_u0(z_sz);
AlignPP(x_sz, z_sz, ppij_int, aligned_pp);
AlignPP(z_sz, ppij_int, aligned_pp);
// Debug: dump out aligned partial products.
// Later on yosys will clean up unused constants
// DebugDumpAlignPP(aligned_pp);
SigSpec s_vec;
SigSpec c_vec;
std::vector<std::vector<RTLIL::Cell *>> debug_csa_trees;
debug_csa_trees.resize(z_sz);
BuildCSATree(module, aligned_pp, s_vec, c_vec, debug_csa_trees);
SigSig wtree_sum = WallaceSum(z_sz, aligned_pp);
// Debug code: Dump out the csa trees
// DumpCSATrees(debug_csa_trees);
// Build the CPA to do the final accumulation.
BuildCPA(module, s_vec, c_vec, Z);
log_assert(wtree_sum.second[0] == State::S0);
if (mapped_cpa)
BuildCPA(module, wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z);
else
module->addAdd(NEW_ID, wtree_sum.first, {wtree_sum.second.extract_end(1), State::S0}, Z);
}
/*
Build Row 0 of decoders
*/
void BuildBoothUMultDecoderRow0(RTLIL::Module *module,
void BuildBoothMultDecoderRow0(RTLIL::Module *module,
SigSpec X, // multiplicand
SigSpec s_int, SigSpec sb_int, SigSpec one_int,
SigSpec two_int, SigSpec &ppij_vec)
SigSpec two_int, SigSpec &ppij_vec, bool is_signed)
{
(void)sb_int;
(void)module;
int x_sz = GetSize(X);
SigBit ppij;
@ -399,21 +430,32 @@ struct BoothPassWorker {
ppij_vec.append(Bur4d_n(stringf("row0_dec_%d", i), X[i], X[i - 1],
one_int[0], two_int[0], s_int[0]));
// The redundant bit. Duplicate decoding of last bit.
ppij_vec.append(Bur4d_msb("row0_dec_msb", X[x_sz - 1], two_int[0], s_int[0]));
if (!is_signed) {
ppij_vec.append(Bur4d_msb("row0_dec_msb", X.msb(), two_int[0], s_int[0]));
} else {
ppij_vec.append(Bur4d_n("row0_dec_msb", X.msb(), X.msb(),
one_int[0], two_int[0], s_int[0]));
}
// append the sign bits
ppij_vec.append(s_int[0]);
ppij_vec.append(s_int[0]);
ppij_vec.append(sb_int[0]);
if (is_signed) {
SigBit e = module->XorGate(NEW_ID, s_int[0], module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int[0], one_int[0])));
ppij_vec.append({module->NotGate(NEW_ID, e), e, e});
} else {
// append the sign bits
ppij_vec.append({module->NotGate(NEW_ID, s_int[0]), s_int[0], s_int[0]});
}
}
// Build a generic row of decoders.
void BuildBoothUMultDecoderRowN(RTLIL::Module *module,
void BuildBoothMultDecoderRowN(RTLIL::Module *module,
SigSpec X, // multiplicand
SigSpec one_int, SigSpec two_int, SigSpec s_int, SigSpec sb_int,
SigSpec &ppij_vec, int row_ix, bool no_sign, bool no_constant)
SigSpec &ppij_vec, int row_ix,
bool is_signed)
{
(void)module;
int x_sz = GetSize(X);
@ -426,15 +468,14 @@ struct BoothPassWorker {
ppij_vec.append(Bur4d_n(stringf("row_%d_dec_%d", row_ix, i), X[i], X[i - 1],
one_int, two_int, s_int));
// redundant bit
if (!is_signed) { // redundant bit
ppij_vec.append(Bur4d_msb("row_dec_red", X[x_sz - 1], two_int, s_int));
} else {
ppij_vec.append(Bur4d_n(stringf("row_%d_dec_msb", row_ix), X[x_sz - 1], X[x_sz - 1],
one_int, two_int, s_int));
}
// sign bit
if (!no_sign) // if no sign is false then make a sign bit
ppij_vec.append(sb_int);
// constant bit
if (!no_constant) // if non constant is false make a constant bit
ppij_vec.append(!is_signed ? sb_int[0] : module->XorGate(NEW_ID, sb_int, module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int, one_int))));
ppij_vec.append(State::S1);
}
@ -591,7 +632,7 @@ struct BoothPassWorker {
Pad out rows with zeros and left the opt pass clean them up.
*/
void AlignPP(int x_sz, int z_sz, std::vector<std::tuple<SigSpec, int, SigBit>> &ppij_int,
void AlignPP(int z_sz, std::vector<std::tuple<SigSpec, int, SigBit>> &ppij_int,
std::vector<SigSpec> &aligned_pp)
{
unsigned aligned_pp_ix = aligned_pp.size() - 1;
@ -611,12 +652,10 @@ struct BoothPassWorker {
// in first column of the last partial product
// which is at index corresponding to size of multiplicand
{
int prior_row_idx = get<1>(ppij_int[aligned_pp_ix - 1]);
SigBit prior_row_sign = get<2>(ppij_int[aligned_pp_ix - 1]);
//if (prior_row_sign) {
log_assert(aligned_pp_ix < aligned_pp.size());
log_assert(x_sz - 1 < (int)(aligned_pp[aligned_pp_ix].size()));
aligned_pp[aligned_pp_ix][x_sz - 1] = prior_row_sign;
//}
if (prior_row_idx < z_sz)
aligned_pp[aligned_pp_ix][prior_row_idx] = prior_row_sign;
}
for (int row_ix = aligned_pp_ix - 1; row_ix >= 0; row_ix--) {
@ -813,12 +852,12 @@ struct BoothPassWorker {
}
}
void BuildBoothUMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int,
SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, int &encoder_ix)
void BuildBoothMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int,
SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, int &encoder_ix, bool is_signed)
{
int y_sz = GetSize(Y);
for (int y_ix = 0; y_ix < y_sz;) {
for (int y_ix = 0; y_ix < (!is_signed ? y_sz : y_sz - 1);) {
std::string enc_name = stringf("bur_enc_%d", encoder_ix);
two_int.append(module->addWire(NEW_ID_SUFFIX(stringf("two_int_%d", encoder_ix)), 1));
@ -844,7 +883,7 @@ struct BoothPassWorker {
bool need_padded_cell = false;
if (y_ix > y_sz - 1) {
y0 = State::S0;
y0 = is_signed ? Y.msb() : State::S0;
need_padded_cell = false;
} else {
y0 = Y[y_ix];
@ -853,7 +892,7 @@ struct BoothPassWorker {
if (y_ix > y_sz - 1) {
need_padded_cell = false;
y1 = State::S0;
y1 = is_signed ? Y.msb() : State::S0;
} else {
y1 = Y[y_ix];
y_ix++;
@ -861,10 +900,10 @@ struct BoothPassWorker {
if (y_ix > y_sz - 1) {
need_padded_cell = false;
y2 = State::S0;
y2 = is_signed ? Y.msb() : State::S0;
} else {
if (y_ix == y_sz - 1)
need_padded_cell = true;
need_padded_cell = !is_signed;
else
need_padded_cell = false;
y2 = Y[y_ix];
@ -902,12 +941,15 @@ struct BoothPassWorker {
}
/*
Signed Multiplier
Low-power Multiplier
*/
void CreateBoothSMult(RTLIL::Module *module, SigSpec X, SigSpec Y, SigSpec Z)
void CreateBoothLowpowerMult(RTLIL::Module *module, SigSpec X, SigSpec Y, SigSpec Z, bool is_signed)
{ // product
int x_sz = X.size(), y_sz = Y.size(), z_sz = Z.size();
if (!is_signed)
log_error("Low-power Booth architecture is only supported on signed multipliers.\n");
unsigned enc_count = (y_sz / 2) + (((y_sz % 2) != 0) ? 1 : 0);
int dec_count = x_sz + 1;
@ -1009,219 +1051,89 @@ struct BoothPassWorker {
PPij[((encoder_ix - 1) * dec_count) + dec_count - 1], unused_op);
}
//
// instantiate the quadrant 1 cell. This is the upper right
// quadrant which can be realized using non-booth encoded logic.
//
SigBit pp0_o_int, pp1_o_int, nxj_o_int, q1_carry_out;
BuildBoothQ1("icb_booth_q1_",
negi_n_int[0], // negi
cori_n_int[0], // cori
X[0], X[1], Y[0], Y[1],
nxj_o_int, q1_carry_out, pp0_o_int, pp1_o_int);
module->connect(Z[0], pp0_o_int);
module->connect(Z[1], pp1_o_int);
module->connect(nxj[(0 * dec_count) + 2], nxj_o_int);
//
// sum up the partial products
//
int fa_el_ix = 0;
int fa_row_ix = 0;
// use 1 d arrays (2d cannot have variable sized indices)
SigSpec fa_sum_n(State::S0, fa_row_count * fa_count);
SigSpec fa_carry_n(State::S0, fa_row_count * fa_count);
std::vector<SigSpec> fa_sum;
std::vector<SigSpec> fa_carry;
for (fa_row_ix = 0; fa_row_ix < fa_row_count; fa_row_ix++) {
for (fa_el_ix = 0; fa_el_ix < fa_count; fa_el_ix++) {
fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix] =
module->addWire(NEW_ID_SUFFIX(stringf("fa_sum_n_%d_%d", fa_row_ix, fa_el_ix)), 1);
fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix] =
module->addWire(NEW_ID_SUFFIX(stringf("fa_carry_n_%d_%d", fa_row_ix, fa_el_ix)), 1);
}
fa_sum.push_back(module->addWire(NEW_ID_SUFFIX(stringf("fa_sum_%d", fa_row_ix)), fa_count));
fa_carry.push_back(module->addWire(NEW_ID_SUFFIX(stringf("fa_carry_%d", fa_row_ix)), fa_count));
}
// full adder creation
std::string bfa_name;
std::string exc_inv_name;
for (fa_row_ix = 0; fa_row_ix < fa_row_count; fa_row_ix++) {
for (fa_el_ix = 0; fa_el_ix < fa_count; fa_el_ix++) {
// base case: 1st row. Inputs from decoders
// Note in rest of tree inputs from prior addition and a decoder
if (fa_row_ix == 0) {
// beginning
// base case:
// first two cells: have B input hooked to 0.
if (fa_el_ix == 0) {
// quadrant 1: we hard code these using non-booth
fa_el_ix++;
}
// step case
else if (fa_el_ix >= 2 && fa_el_ix <= x_sz) {
// middle (2...x_sz cells)
module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_step_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ PPij[(0 * dec_count) + fa_el_ix],
/* B */ PPij[(1 * dec_count) + fa_el_ix - 2],
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
// base case: 1st row: Inputs from decoders
// 1st row exception: two localized inverters due to sign extension structure
SigBit d08_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv1"), PPij[(0 * dec_count) + dec_count - 1]);
SigBit d18_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv2"), PPij[(1 * dec_count) + dec_count - 1]);
BuildBitwiseFa(module, NEW_ID_SUFFIX("fa_row_0").str(),
/* A */ {State::S0, d08_inv, PPij[(0 * dec_count) + x_sz], PPij.extract((0 * dec_count) + 2, x_sz - 1)},
/* B */ {State::S1, d18_inv, PPij.extract((1 * dec_count), x_sz)},
/* C */ fa_carry[0].extract(1, x_sz + 2),
/* X */ fa_carry[0].extract(2, x_sz + 2),
/* Y */ fa_sum[0].extract(2, x_sz + 2)
);
}
// end 3 cells: x_sz+1.2.3
//
else {
// fa_el_ix = x_sz+1
module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_0_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ PPij[(0 * dec_count) + x_sz],
/* B */ PPij[(1 * dec_count) + fa_el_ix - 2],
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
// exception:invert ppi
fa_el_ix++;
SigBit d08_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_0_exc_inv1_%d_%d_L", fa_row_ix, fa_el_ix)),
PPij[(0 * dec_count) + dec_count - 1]);
SigBit d18_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_0_exc_inv2_%d_%d_L", fa_row_ix, fa_el_ix)),
PPij[(1 * dec_count) + dec_count - 1]);
module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_1_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ d08_inv,
/* B */ d18_inv,
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
// sign extension
fa_el_ix++;
module->addFa(NEW_ID_SUFFIX(stringf("bfa_0_se_2_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ State::S0,
/* B */ State::S1,
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
}
}
module->connect(fa_carry[0][1], q1_carry_out);
// step case: 2nd and rest of rows. (fa_row_ix == 1...n)
// special because these are driven by a decoder and prior fa.
else {
// beginning
if (fa_el_ix == 0) {
// first two cells: have B input hooked to 0.
// column is offset by row_ix*2
module->addFa(NEW_ID_SUFFIX(stringf("bfa_base_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + 2],
/* B */ State::S0,
/* C */ cori_n_int[fa_row_ix],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
fa_el_ix++;
module->addFa(NEW_ID_SUFFIX(stringf("bfa_base_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + 3], // from prior full adder row
/* B */ State::S0,
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
}
else if (fa_el_ix >= 2 && fa_el_ix <= x_sz + 1) {
// middle (2...x_sz+1 cells)
module->addFa(NEW_ID_SUFFIX(stringf("bfa_step_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ fa_sum_n[(fa_row_ix - 1) * fa_count + fa_el_ix + 2],
/* B */ PPij[(fa_row_ix + 1) * dec_count + fa_el_ix - 2],
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
}
else if (fa_el_ix > x_sz + 1) {
for (fa_row_ix = 1; fa_row_ix < fa_row_count; fa_row_ix++) {
// end two bits: sign extension
SigBit d_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_%d_L", fa_row_ix, fa_el_ix)),
SigBit d_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_L", fa_row_ix)),
PPij[((fa_row_ix + 1) * dec_count) + dec_count - 1]);
module->addFa(NEW_ID_SUFFIX(stringf("bfa_se_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ fa_carry_n[((fa_row_ix - 1) * fa_count) + fa_count - 1],
/* B */ d_inv,
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
);
fa_el_ix++;
BuildBitwiseFa(module, NEW_ID_SUFFIX(stringf("fa_row_%d", fa_row_ix)).str(),
/* A */ {State::S0, fa_carry[fa_row_ix - 1][fa_count - 1], fa_sum[fa_row_ix - 1].extract(2, x_sz + 2)},
/* B */ {State::S1, d_inv, PPij.extract((fa_row_ix + 1) * dec_count, x_sz), State::S0, State::S0},
// sign extension
module->addFa(NEW_ID_SUFFIX(stringf("bfa_se_%d_%d_L", fa_row_ix, fa_el_ix)),
/* A */ State::S0,
/* B */ State::S1,
/* C */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix - 1],
/* X */ fa_carry_n[(fa_row_ix * fa_count) + fa_el_ix],
/* Y */ fa_sum_n[(fa_row_ix * fa_count) + fa_el_ix]
/* C */ {fa_carry[fa_row_ix].extract(0, x_sz + 3), cori_n_int[fa_row_ix]},
/* X */ fa_carry[fa_row_ix],
/* Y */ fa_sum[fa_row_ix]
);
}
}
}
}
// instantiate the cpa
SigSpec cpa_carry;
if (z_sz > fa_row_count * 2)
cpa_carry = module->addWire(NEW_ID_SUFFIX("cpa_carry"), z_sz - fa_row_count * 2);
for (int cix = 0; cix < z_sz; cix++)
cpa_carry.append(module->addWire(NEW_ID_SUFFIX(stringf("cpa_carry_%d", cix)), 1));
for (int cpa_ix = 0; cpa_ix < z_sz; cpa_ix++) {
// The end case where we pass the last two summands
// from prior row directly to product output
// without using a cpa cell. This is always
// 0,1 index of prior fa row
if (cpa_ix <= fa_row_count * 2 - 1) {
for (int cpa_ix = 0; cpa_ix < fa_row_count * 2; cpa_ix += 2) {
int fa_row_ix = cpa_ix / 2;
module->connect(Z.extract(cpa_ix, 2), fa_sum[fa_row_ix].extract(0, 2));
}
module->addBufGate(NEW_ID_SUFFIX(stringf("pp_buf_%d_driven_by_fa_row_%d", cpa_ix, fa_row_ix)),
fa_sum_n[(fa_row_ix * fa_count) + 0], Z[cpa_ix]);
cpa_ix++;
module->addBufGate(NEW_ID_SUFFIX(stringf("pp_buf_%d_driven_by_fa_row_%d", cpa_ix, fa_row_ix)),
fa_sum_n[(fa_row_ix * fa_count) + 1], Z[cpa_ix]);
} else {
for (int cpa_ix = fa_row_count * 2; cpa_ix < z_sz; cpa_ix++) {
int offset = fa_row_count * 2;
bool base_case = cpa_ix - offset == 0 ? true : false;
std::string cpa_name = stringf("cpa_%d", cpa_ix - offset);
SigBit ci;
if (base_case)
ci = cori_n_int[enc_count - 1];
else
ci = cpa_carry[cpa_ix - offset - 1];
SigBit ci = (cpa_ix == offset) ? cori_n_int[enc_count - 1] : cpa_carry[cpa_ix - offset - 1];
SigBit op;
BuildHa(cpa_name, fa_sum_n[(fa_row_count - 1) * fa_count + cpa_ix - offset + 2], ci, op,
cpa_carry[cpa_ix - offset]);
BuildHa(cpa_name, fa_sum[fa_row_count - 1][cpa_ix - offset + 2], ci, op, cpa_carry[cpa_ix - offset]);
module->connect(Z[cpa_ix], op);
}
}
//
// instantiate the quadrant 1 cell. This is the upper right
// quadrant which can be realized using non-booth encoded logic.
//
std::string q1_name = "icb_booth_q1_";
SigBit pp0_o_int;
SigBit pp1_o_int;
SigBit nxj_o_int;
SigBit cor_o_int;
BuildBoothQ1(q1_name,
negi_n_int[0], // negi
cori_n_int[0], // cori
X[0], X[1], Y[0], Y[1],
nxj_o_int, cor_o_int, pp0_o_int, pp1_o_int);
module->connect(fa_sum_n[(0 * fa_count) + 0], pp0_o_int);
module->connect(fa_sum_n[(0 * fa_count) + 1], pp1_o_int);
module->connect(fa_carry_n[(0 * fa_count) + 1], cor_o_int);
module->connect(nxj[(0 * dec_count) + 2], nxj_o_int);
}
};
struct BoothPass : public Pass {
@ -1232,21 +1144,13 @@ struct BoothPass : public Pass {
log("\n");
log(" booth [selection]\n");
log("\n");
log("This pass replaces multiplier cells with an implementation based on the Booth\n");
log("algorithm. It operates on $mul cells whose width of operands is at least 4x4\n");
log("and whose width of result is at least 8. The detailed architecture is selected\n");
log("from two options based on the signedness of the operands to the $mul cell.\n");
log("This pass replaces multiplier cells with a radix-4 Booth-encoded implementation.\n");
log("It operates on $mul cells whose width of operands is at least 4x4 and whose\n");
log("width of result is at least 8.\n");
log("\n");
log("See the references below for the description of the architectures.\n");
log("\n");
log("Signed-multiplier architecture:\n");
log("Y. J. Chang, Y. C. Cheng, S. C. Liao and C. H. Hsiao, \"A Low Power Radix-4 Booth\n");
log("Multiplier With Pre-Encoded Mechanism,\" in IEEE Access, vol. 8, pp. 114842-114853,\n");
log("2020, doi: 10.1109/ACCESS.2020.3003684\n");
log("\n");
log("Unsigned-multiplier architecture:\n");
log("G. W. Bewick, \"Fast Multiplication: Algorithms and Implementations,\" PhD Thesis,\n");
log("Department of Electrical Engineering, Stanford University, 1994\n");
log(" -lowpower\n");
log(" use an alternative low-power architecture for the generated multiplier\n");
log(" (signed multipliers only)\n");
log("\n");
}
void execute(vector<string> args, RTLIL::Design *design) override
@ -1254,7 +1158,16 @@ struct BoothPass : public Pass {
log_header(design, "Executing BOOTH pass (map to Booth multipliers).\n");
size_t argidx;
bool mapped_cpa = false;
bool lowpower = false;
for (argidx = 1; argidx < args.size(); argidx++) {
if (args[argidx] == "-mapped_cpa")
// Have an undocumented option which helps with multiplier
// verification using specialized tools (AMulet2 in particular)
mapped_cpa = true;
else if (args[argidx] == "-lowpower")
lowpower = true;
else
break;
}
extra_args(args, argidx, design);
@ -1264,6 +1177,8 @@ struct BoothPass : public Pass {
for (auto mod : design->selected_modules()) {
if (!mod->has_processes_warn()) {
BoothPassWorker worker(mod);
worker.mapped_cpa = mapped_cpa;
worker.lowpower = lowpower;
worker.run();
total += worker.booth_counter;
}

View File

@ -362,6 +362,8 @@ struct SynthLatticePass : public ScriptPass
run("techmap -map +/mul2dsp.v -map +/lattice/dsp_map" + dsp_map + ".v -D DSP_A_MAXWIDTH=18 -D DSP_B_MAXWIDTH=18 -D DSP_A_MINWIDTH=2 -D DSP_B_MINWIDTH=2 -D DSP_NAME=$__MUL18X18", "(unless -nodsp)");
run("chtype -set $mul t:$__soft_mul", "(unless -nodsp)");
}
if (family == "xo3" || help_mode)
run("booth", "(only if '-family xo3')");
run("alumacc");
run("opt");
run("memory -nomap" + no_rw_check_opt);