cxxrtl: speed up bit repeats (sign extends, etc.).

On Minerva SoC SRAM, depending on the compiler, this change improves overall time by 4-7%.
2020-12-21 02:15:55 +00:00 · 2020-12-21 02:15:55 +00:00 · b9721bedf0
parent 40ca9d038b
commit b9721bedf0
2 changed files with 28 additions and 5 deletions
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@ -317,6 +317,14 @@ struct value : public expr_base<value<Bits>> {
 		return sext_cast<NewBits>()(*this);
 	}

+	// Replicates a 1-bit value Count times (bit_not() of a default value when set, the default value otherwise); far more efficient than the equivalent concatenation.
+	template<size_t Count>
+	CXXRTL_ALWAYS_INLINE
+	value<Bits * Count> repeat() const {
+		static_assert(Bits == 1, "repeat() is implemented only for 1-bit values");
+		return *this ? value<Bits * Count>().bit_not() : value<Bits * Count>();
+	}
+
 	// Operations with run-time parameters (offsets, amounts, etc).
 	//
 	// These operations are used for computations.
--- a/backends/cxxrtl/cxxrtl_backend.cc
+++ b/backends/cxxrtl/cxxrtl_backend.cc
@ -832,11 +832,26 @@ struct CxxrtlWorker {
 		} else if (sig.is_chunk()) {
 			return dump_sigchunk(sig.as_chunk(), is_lhs, for_debug);
 		} else {
-			dump_sigchunk(*sig.chunks().rbegin(), is_lhs, for_debug);
-			for (auto it = sig.chunks().rbegin() + 1; it != sig.chunks().rend(); ++it) {
-				f << ".concat(";
-				dump_sigchunk(*it, is_lhs, for_debug);
-				f << ")";
+			bool first = true;
+			auto chunks = sig.chunks();
+			for (auto it = chunks.rbegin(); it != chunks.rend(); it++) {
+				if (!first)
+					f << ".concat(";
+				bool is_complex = dump_sigchunk(*it, is_lhs, for_debug);
+				if (!is_lhs && it->width == 1) {
+					size_t repeat = 1;
+					while ((it + repeat) != chunks.rend() && *(it + repeat) == *it)
+						repeat++;
+					if (repeat > 1) {
+						if (is_complex)
+							f << ".val()";
+						f << ".repeat<" << repeat << ">()";
+					}
+					it += repeat - 1;
+				}
+				if (!first)
+					f << ")";
+				first = false;
 			}
 			return true;
 		}