From d20e971725a780458ccd18d08be7ecfb67884eaf Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 1 Dec 2019 01:51:16 +0000
Subject: [PATCH 01/10] write_cxxrtl: new backend.

This commit adds a basic implementation that isn't very performant
but implements most of the planned features.
---
 Makefile                     |    1 +
 backends/cxxrtl/Makefile.inc |    2 +
 backends/cxxrtl/cxxrtl.cc    |  904 ++++++++++++++++++++++++++++
 backends/cxxrtl/cxxrtl.h     | 1104 ++++++++++++++++++++++++++++++++++
 kernel/yosys.cc              |    2 +
 kernel/yosys.h               |    3 +
 6 files changed, 2016 insertions(+)
 create mode 100644 backends/cxxrtl/Makefile.inc
 create mode 100644 backends/cxxrtl/cxxrtl.cc
 create mode 100644 backends/cxxrtl/cxxrtl.h
diff --git a/Makefile b/Makefile
index 218863b32..d1c4a2030 100644
--- a/Makefile
+++ b/Makefile
@@ -546,6 +546,7 @@ $(eval $(call add_include_file,libs/json11/json11.hpp))
 $(eval $(call add_include_file,passes/fsm/fsmdata.h))
 $(eval $(call add_include_file,frontends/ast/ast.h))
 $(eval $(call add_include_file,backends/ilang/ilang_backend.h))
+$(eval $(call add_include_file,backends/cxxrtl/cxxrtl.h))
 
 OBJS += kernel/driver.o kernel/register.o kernel/rtlil.o kernel/log.o kernel/calc.o kernel/yosys.o
 OBJS += kernel/cellaigs.o kernel/celledges.o
diff --git a/backends/cxxrtl/Makefile.inc b/backends/cxxrtl/Makefile.inc
new file mode 100644
index 000000000..f93e65f85
--- /dev/null
+++ b/backends/cxxrtl/Makefile.inc
@@ -0,0 +1,2 @@
+
+OBJS += backends/cxxrtl/cxxrtl.o
diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
new file mode 100644
index 000000000..2dc7b3d36
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -0,0 +1,904 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2019  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/rtlil.h"
+#include "kernel/register.h"
+#include "kernel/sigtools.h"
+#include "kernel/celltypes.h"
+#include "kernel/log.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+struct CxxrtlWorker {
+	std::ostream &f;
+	std::string indent;
+	int temporary = 0;
+
+	dict<const RTLIL::Module*, SigMap> sigmaps;
+	pool<const RTLIL::Wire*> sync_wires;
+	dict<RTLIL::SigBit, RTLIL::SyncType> sync_types;
+	pool<const RTLIL::Memory*> writable_memories;
+
+	CxxrtlWorker(std::ostream &f) : f(f) {}
+
+	void inc_indent() {
+		indent.push_back('\t'); // one tab per nesting level of the generated code
+	}
+	void dec_indent() {
+		indent.resize(indent.length() - 1); // removes the tab appended by the matching inc_indent()
+	}
+
+	// RTLIL allows any characters in names other than whitespace. This presents an issue for generating C++ code
+	// because C++ identifiers may be only alphanumeric, cannot clash with C++ keywords, and cannot clash with cxxrtl
+	// identifiers. This issue can be solved with a name mangling scheme. We choose a name mangling scheme that results
+	// in readable identifiers, does not depend on an up-to-date list of C++ keywords, and is easy to apply. Its rules:
+	//  1. All generated identifiers start with a one-letter prefix followed by `_`.
+	//  1a. Generated identifiers for public names (beginning with `\`) start with `p_`.
+	//  1b. Generated identifiers for internal names (beginning with `$`) start with `i_`.
+	//  2. An underscore is escaped with another underscore, i.e. `__`.
+	//  3. Any other non-alnum character is escaped with underscores around its lowercase hex code, e.g. `@` as `_40_`.
+	std::string mangle_name(const RTLIL::IdString &name)
+	{
+		std::string mangled;
+		bool first = true;
+		for (char c : name.str()) {
+			if (first) {
+				// The leading sigil only selects the prefix; any sigil other than `\` or `$` trips the assertion.
+				first = false;
+				if (c == '\\')
+					mangled += "p_";
+				else if (c == '$')
+					mangled += "i_";
+				else
+					log_assert(false);
+			} else if (isalnum(static_cast<unsigned char>(c))) {
+				// <cctype> classifiers are undefined for negative `char` values, hence the cast.
+				mangled += c;
+			} else if (c == '_') {
+				mangled += "__";
+			} else {
+				char l = c & 0xf, h = (c >> 4) & 0xf;
+				mangled += '_';
+				mangled += (h < 10 ? '0' + h : 'a' + h - 10);
+				mangled += (l < 10 ? '0' + l : 'a' + l - 10);
+				mangled += '_';
+			}
+		}
+		return mangled;
+	}
+
+	std::string mangle_module_name(const RTLIL::IdString &name)
+	{
+		// Class namespace: the mangled name is used as-is, without an extra prefix.
+		return mangle_name(name);
+	}
+
+	std::string mangle_memory_name(const RTLIL::IdString &name)
+	{
+		// Class member namespace: the `memory_` prefix keeps memory members apart from wire members.
+		return "memory_" + mangle_name(name);
+	}
+
+	std::string mangle_wire_name(const RTLIL::IdString &name)
+	{
+		// Class member namespace: wires use the bare mangled name.
+		return mangle_name(name);
+	}
+
+	std::string mangle(const RTLIL::Module *module)
+	{
+		return mangle_module_name(module->name); // C++ identifier derived from the module's RTLIL name
+	}
+
+	std::string mangle(const RTLIL::Memory *memory)
+	{
+		return mangle_memory_name(memory->name); // C++ identifier derived from the memory's RTLIL name
+	}
+
+	std::string mangle(const RTLIL::Wire *wire)
+	{
+		return mangle_wire_name(wire->name); // C++ identifier derived from the wire's RTLIL name
+	}
+
+	std::string mangle(RTLIL::SigBit sigbit)
+	{
+		// Only bits that belong to a wire can be named; bits of multi-bit wires get an `_<offset>` suffix.
+		log_assert(sigbit.wire != NULL);
+		const std::string base = mangle(sigbit.wire);
+		return sigbit.wire->width == 1 ? base : base + "_" + std::to_string(sigbit.offset);
+	}
+
+	std::string fresh_temporary()
+	{
+		return "tmp_" + std::to_string(temporary++); // post-increment: every call yields the next unused number
+	}
+
+	void dump_attrs(const RTLIL::AttrObject *object)
+	{
+		// Emits one `// name: value` comment line per attribute; iterating by reference avoids copying each entry.
+		for (const auto &attr : object->attributes) {
+			f << indent << "// " << attr.first.str() << ": ";
+			if (attr.second.flags & RTLIL::CONST_FLAG_STRING)
+				f << attr.second.decode_string();
+			else
+				f << attr.second.as_int(/*is_signed=*/attr.second.flags & RTLIL::CONST_FLAG_SIGNED);
+			f << "\n";
+		}
+	}
+
+	void dump_const_init(const RTLIL::Const &data, int width, int offset = 0, bool fixed_width = false)
+	{
+		// Emits `width` bits of `data`, beginning at bit `offset`, as a braced list of 32-bit chunks.
+		const int CHUNK_SIZE = 32;
+		f << "{";
+		for (int remaining = width; remaining > 0; remaining -= CHUNK_SIZE, offset += CHUNK_SIZE) {
+			const bool is_last = remaining <= CHUNK_SIZE;
+			uint32_t chunk = data.extract(offset, is_last ? remaining : CHUNK_SIZE).as_int();
+			if (fixed_width)
+				f << stringf("0x%08xu", chunk);
+			else
+				f << stringf("%#xu", chunk);
+			if (!is_last)
+				f << ',';
+		}
+		f << "}";
+	}
+
+	void dump_const_init(const RTLIL::Const &data)
+	{
+		dump_const_init(data, data.size()); // the whole constant, starting at bit 0
+	}
+
+	void dump_const(const RTLIL::Const &data, int width, int offset = 0, bool fixed_width = false)
+	{
+		f << "value<" << width << ">"; // the type comes first, the braced chunk list follows as its initializer
+		dump_const_init(data, width, offset, fixed_width);
+	}
+
+	void dump_const(const RTLIL::Const &data)
+	{
+		dump_const(data, data.size()); // the whole constant, starting at bit 0
+	}
+
+	bool dump_sigchunk(const RTLIL::SigChunk &chunk, bool is_lhs)
+	{
+		// Returns true only when a `.slice<...>()` expression was emitted.
+		if (chunk.wire == NULL) {
+			dump_const(chunk.data, chunk.width, chunk.offset);
+			return false;
+		}
+		f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
+		if (chunk.offset == 0 && chunk.width == chunk.wire->width)
+			return false;
+		if (chunk.width == 1)
+			f << ".slice<" << chunk.offset << ">()";
+		else
+			f << ".slice<" << chunk.offset+chunk.width-1 << "," << chunk.offset << ">()";
+		return true;
+	}
+
+	bool dump_sigspec(const RTLIL::SigSpec &sig, bool is_lhs)
+	{
+		if (sig.empty()) {
+			f << "value<0>()";
+			return false;
+		}
+		if (sig.is_chunk())
+			return dump_sigchunk(sig.as_chunk(), is_lhs);
+		auto chunk_it = sig.chunks().rbegin(); // the last chunk is emitted first, the others are chained onto it
+		dump_sigchunk(*chunk_it, is_lhs);
+		while (++chunk_it != sig.chunks().rend()) {
+			f << ".concat(";
+			dump_sigchunk(*chunk_it, is_lhs);
+			f << ")";
+		}
+		return true; // a concatenation always counts as a complex expression
+	}
+
+	void dump_sigspec_lhs(const RTLIL::SigSpec &sig)
+	{
+		dump_sigspec(sig, /*is_lhs=*/true); // wires are addressed through `.next`; the result flag is not needed here
+	}
+
+	void dump_sigspec_rhs(const RTLIL::SigSpec &sig)
+	{
+		// Template argument deduction for `template<size_t Bits> ... value<Bits>` never applies implicit type
+		// conversions, so in contexts that rely on deduction the argument must already be a `value<N>`.
+		// dump_sigspec() reports whether it emitted a complex expression (a slice or a concatenation); only
+		// then is the explicit `.val()` conversion appended. Plain wires and constants are emitted without
+		// the conversion.
+		if (dump_sigspec(sig, /*is_lhs=*/false))
+			f << ".val()";
+	}
+
+	void dump_assign(const RTLIL::SigSig &sigsig)
+	{
+		f << indent;
+		dump_sigspec_lhs(sigsig.first); // assignment target
+		f << " = ";
+		dump_sigspec_rhs(sigsig.second); // assigned value
+		f << ";\n";
+	}
+
+	void dump_cell(const RTLIL::Cell *cell) // Emits the C++ statements that evaluate one RTLIL cell.
+	{
+		dump_attrs(cell);
+		f << indent << "// cell " << cell->name.str() << "\n";
+		// Unary cells: `Y = <op>_<a signedness><Y_WIDTH>(A)`
+		if (cell->type.in(
+		    ID($not), ID($logic_not), ID($reduce_and), ID($reduce_or), ID($reduce_xor), ID($reduce_xnor), ID($reduce_bool),
+		    ID($pos), ID($neg))) {
+			f << indent;
+			dump_sigspec_lhs(cell->getPort(ID(Y)));
+			f << " = " << cell->type.substr(1) << '_' <<
+			     (cell->getParam(ID(A_SIGNED)).as_bool() ? 's' : 'u') <<
+			     "<" << cell->getParam(ID(Y_WIDTH)).as_int() << ">(";
+			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ");\n";
+		// Binary cells: `Y = <op>_<a signedness><b signedness><Y_WIDTH>(A, B)`
+		} else if (cell->type.in(
+		    ID($and), ID($or), ID($xor), ID($xnor), ID($logic_and), ID($logic_or),
+		    ID($shl), ID($sshl), ID($shr), ID($sshr), ID($shift), ID($shiftx),
+		    ID($eq), ID($ne), ID($eqx), ID($nex), ID($gt), ID($ge), ID($lt), ID($le),
+		    ID($add), ID($sub), ID($mul), ID($div), ID($mod))) {
+			f << indent;
+			dump_sigspec_lhs(cell->getPort(ID(Y)));
+			f << " = " << cell->type.substr(1) << '_' <<
+			     (cell->getParam(ID(A_SIGNED)).as_bool() ? 's' : 'u') <<
+			     (cell->getParam(ID(B_SIGNED)).as_bool() ? 's' : 'u') <<
+			     "<" << cell->getParam(ID(Y_WIDTH)).as_int() << ">(";
+			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ", ";
+			dump_sigspec_rhs(cell->getPort(ID(B)));
+			f << ");\n";
+		// Muxes
+		} else if (cell->type == ID($mux)) {
+			f << indent;
+			dump_sigspec_lhs(cell->getPort(ID(Y)));
+			f << " = ";
+			dump_sigspec_rhs(cell->getPort(ID(S)));
+			f << " ? ";
+			dump_sigspec_rhs(cell->getPort(ID(B)));
+			f << " : ";
+			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ";\n";
+		// Parallel (one-hot) muxes
+		} else if (cell->type == ID($pmux)) {
+			int width = cell->getParam(ID(WIDTH)).as_int();
+			int s_width = cell->getParam(ID(S_WIDTH)).as_int();
+			bool first = true;
+			for (int part = 0; part < s_width; part++) {
+				f << (first ? indent : " else ");
+				first = false;
+				f << "if (";
+				dump_sigspec_rhs(cell->getPort(ID(S)).extract(part));
+				f << ") {\n";
+				inc_indent();
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Y)));
+					f << " = ";
+					dump_sigspec_rhs(cell->getPort(ID(B)).extract(part * width, width));
+					f << ";\n";
+				dec_indent();
+				f << indent << "}";
+			}
+			f << " else {\n";
+			inc_indent();
+				f << indent;
+				dump_sigspec_lhs(cell->getPort(ID(Y)));
+				f << " = ";
+				dump_sigspec_rhs(cell->getPort(ID(A)));
+				f << ";\n";
+			dec_indent();
+			f << indent << "}\n";
+		// Flip-flops
+		} else if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
+			if (cell->getPort(ID(CLK)).is_wire()) {
+				// Edge-sensitive logic
+				RTLIL::SigBit clk_bit = cell->getPort(ID(CLK))[0];
+				clk_bit = sigmaps[clk_bit.wire->module](clk_bit);
+				f << indent << "if (" << (cell->getParam(ID(CLK_POLARITY)).as_bool() ? "posedge_" : "negedge_")
+				            << mangle(clk_bit) << ") {\n";
+				inc_indent();
+					if (cell->type == ID($dffe)) {
+						f << indent << "if (";
+						dump_sigspec_rhs(cell->getPort(ID(EN)));
+						f << " == value<1> {" << cell->getParam(ID(EN_POLARITY)).as_bool() << "}) {\n";
+						inc_indent();
+					}
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Q)));
+					f << " = ";
+					dump_sigspec_rhs(cell->getPort(ID(D)));
+					f << ";\n";
+					if (cell->type == ID($dffe)) {
+						dec_indent();
+						f << indent << "}\n";
+					}
+				dec_indent();
+				f << indent << "}\n";
+			}
+			// Level-sensitive logic
+			if (cell->type == ID($adff)) {
+				f << indent << "if (";
+				dump_sigspec_rhs(cell->getPort(ID(ARST)));
+				f << " == value<1> {" << cell->getParam(ID(ARST_POLARITY)).as_bool() << "}) {\n";
+				inc_indent();
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Q)));
+					f << " = ";
+					dump_const(cell->getParam(ID(ARST_VALUE)));
+					f << ";\n";
+				dec_indent();
+				f << indent << "}\n";
+			} else if (cell->type == ID($dffsr)) {
+				f << indent << "if (";
+				dump_sigspec_rhs(cell->getPort(ID(CLR)));
+				f << " == value<1> {" << cell->getParam(ID(CLR_POLARITY)).as_bool() << "}) {\n";
+				inc_indent();
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Q)));
+					f << " = ";
+					dump_const(RTLIL::Const(RTLIL::S0, cell->getParam(ID(WIDTH)).as_int()));
+					f << ";\n";
+				dec_indent();
+				f << indent << "} else if (";
+				dump_sigspec_rhs(cell->getPort(ID(SET)));
+				f << " == value<1> {" << cell->getParam(ID(SET_POLARITY)).as_bool() << "}) {\n";
+				inc_indent();
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Q)));
+					f << " = ";
+					dump_const(RTLIL::Const(RTLIL::S1, cell->getParam(ID(WIDTH)).as_int()));
+					f << ";\n";
+				dec_indent();
+				f << indent << "}\n";
+			}
+		// Memory ports
+		} else if (cell->type.in(ID($memrd), ID($memwr))) {
+			if (cell->getParam(ID(CLK_ENABLE)).as_bool()) {
+				RTLIL::SigBit clk_bit = cell->getPort(ID(CLK))[0];
+				clk_bit = sigmaps[clk_bit.wire->module](clk_bit);
+				f << indent << "if (" << (cell->getParam(ID(CLK_POLARITY)).as_bool() ? "posedge_" : "negedge_")
+				            << mangle(clk_bit) << ") {\n";
+				inc_indent();
+			}
+			RTLIL::Memory *memory = cell->module->memories[cell->getParam(ID(MEMID)).decode_string()];
+			if (cell->type == ID($memrd)) {
+				if (!cell->getPort(ID(EN)).is_fully_ones()) {
+					f << indent << "if (";
+					dump_sigspec_rhs(cell->getPort(ID(EN)));
+					f << ") {\n";
+					inc_indent();
+				}
+				f << indent;
+				dump_sigspec_lhs(cell->getPort(ID(DATA)));
+				f << " = " << mangle(memory) << "[";
+				dump_sigspec_rhs(cell->getPort(ID(ADDR)));
+				if (writable_memories[memory]) {
+					// FIXME: the handling of transparent read ports is a bit naughty: normally, nothing on RHS should ever
+					// read from `next`, since this can result in evaluation order nondeterminism, as well as issues with
+					// latches. However, for now this is the right tradeoff to make, since it allows to simplify $memrd/$memwr
+					// codegen dramatically.
+					f << "]." << (cell->getParam(ID(TRANSPARENT)).as_bool() ? "next" : "curr") << ";\n";
+				} else {
+					f << "];\n";
+				}
+				if (!cell->getPort(ID(EN)).is_fully_ones()) {
+					dec_indent();
+					f << indent << "}\n";
+				}
+			} else /*if (cell->type == ID($memwr))*/ {
+				log_assert(writable_memories[memory]);
+				// FIXME: handle write port priority.
+				int width = cell->getParam(ID(WIDTH)).as_int();
+				std::string lhs_temp = fresh_temporary();
+				f << indent << "wire<" << width << "> &" << lhs_temp << " = " << mangle(memory) << "[";
+				dump_sigspec_rhs(cell->getPort(ID(ADDR)));
+				f << "];\n";
+				int start = 0; // first bit of the current run of identical enable bits
+				RTLIL::SigBit prev_en_bit = RTLIL::Sm;
+				for (int stop = 0; stop < width + 1; stop++) {
+					if (stop == width || (prev_en_bit != RTLIL::Sm && prev_en_bit != cell->getPort(ID(EN))[stop])) {
+						f << indent << "if (";
+						dump_sigspec_rhs(prev_en_bit);
+						f << ") {\n";
+						inc_indent();
+							f << indent << lhs_temp << ".next.slice<" << (stop - 1) << "," << start << ">() = ";
+							dump_sigspec_rhs(cell->getPort(ID(DATA)).extract(start, stop - start));
+							f << ";\n";
+						dec_indent();
+						f << indent << "}\n";
+						start = stop; // bit `stop` is the first bit of the next run, so it must not be skipped
+					}
+					if (stop != width)
+						prev_en_bit = cell->getPort(ID(EN))[stop];
+				}
+			}
+			if (cell->getParam(ID(CLK_ENABLE)).as_bool()) {
+				dec_indent();
+				f << indent << "}\n";
+			}
+		// Memory initializers
+		} else if (cell->type == ID($meminit)) {
+			// Handled elsewhere.
+		} else if (cell->type[0] == '$') {
+			log_cmd_error("Unsupported internal cell `%s'.\n", cell->type.c_str());
+		} else {
+			log_assert(false);
+		}
+	}
+
+	void dump_case_rule(const RTLIL::CaseRule *rule)
+	{
+		for (const auto &assignment : rule->actions) // the case's own assignments come first
+			dump_assign(assignment);
+		for (auto nested_switch : rule->switches) // then every switch nested inside the case
+			dump_switch_rule(nested_switch);
+	}
+
+	void dump_switch_rule(const RTLIL::SwitchRule *rule)
+	{
+		// The switch attributes are printed before the switch condition is captured in a temporary.
+		dump_attrs(rule);
+		std::string signal_temp = fresh_temporary();
+		f << indent << "const value<" << rule->signal.size() << "> &" << signal_temp << " = ";
+		dump_sigspec(rule->signal, /*is_lhs=*/false);
+		f << ";\n";
+
+		bool first = true;
+		for (auto case_ : rule->cases) {
+			// The case attributes (for nested cases) are printed before the if/else if/else statement.
+			dump_attrs(case_);
+			f << indent;
+			if (!first)
+				f << "} else ";
+			first = false;
+			if (!case_->compare.empty()) {
+				f << "if (";
+				bool first_compare = true;
+				for (auto &compare : case_->compare) {
+					if (!first_compare)
+						f << " || ";
+					first_compare = false;
+					if (compare.is_fully_def()) {
+						f << signal_temp << " == ";
+						dump_sigspec(compare, /*is_lhs=*/false);
+					} else if (compare.is_fully_const()) {
+						// Undefined bits act as wildcards: they are masked out of the comparison.
+						RTLIL::Const compare_mask, compare_value;
+						for (auto bit : compare.as_const()) {
+							switch (bit) {
+								case RTLIL::S0:
+								case RTLIL::S1:
+									compare_mask.bits.push_back(RTLIL::S1);
+									compare_value.bits.push_back(bit);
+									break;
+								case RTLIL::Sx:
+								case RTLIL::Sz:
+								case RTLIL::Sa:
+									compare_mask.bits.push_back(RTLIL::S0);
+									compare_value.bits.push_back(RTLIL::S0);
+									break;
+
+								default:
+									log_assert(false);
+							}
+						}
+						f << "and_uu<" << compare.size() << ">(" << signal_temp << ", ";
+						dump_const(compare_mask);
+						f << ") == ";
+						dump_const(compare_value);
+					} else {
+						log_assert(false);
+					}
+				}
+				f << ") ";
+			}
+			f << "{\n";
+			inc_indent();
+				dump_case_rule(case_);
+			dec_indent();
+		}
+		if (!first) // a switch without cases opened no block, so there is nothing to close
+			f << indent << "}\n";
+	}
+
+	void dump_process(const RTLIL::Process *proc) // Emits the root case, then one guarded block per sync rule.
+	{
+		dump_attrs(proc);
+		f << indent << "// process " << proc->name.str() << "\n";
+		// The case attributes (for root case) are always empty.
+		log_assert(proc->root_case.attributes.empty());
+		dump_case_rule(&proc->root_case);
+		for (auto sync : proc->syncs) {
+			RTLIL::SigBit sync_bit = sync->signal[0];
+			sync_bit = sigmaps[sync_bit.wire->module](sync_bit);
+
+			pool<std::string> events;
+			switch (sync->type) {
+				case RTLIL::STp:
+					events.insert("posedge_" + mangle(sync_bit));
+					break;
+				case RTLIL::STn:
+					events.insert("negedge_" + mangle(sync_bit));
+					break; // a negedge-only rule must not fall through and also test the posedge flag
+				case RTLIL::STe:
+					events.insert("posedge_" + mangle(sync_bit));
+					events.insert("negedge_" + mangle(sync_bit));
+					break;
+				case RTLIL::ST0:
+				case RTLIL::ST1:
+				case RTLIL::STa:
+				case RTLIL::STg:
+				case RTLIL::STi:
+					log_assert(false);
+			}
+			if (!events.empty()) {
+				f << indent << "if (";
+				bool first = true;
+				for (auto &event : events) {
+					if (!first)
+						f << " || ";
+					first = false;
+					f << event;
+				}
+				f << ") {\n";
+				inc_indent();
+					for (auto action : sync->actions)
+						dump_assign(action);
+				dec_indent();
+				f << indent << "}\n";
+			}
+		}
+	}
+
+	void dump_wire(const RTLIL::Wire *wire)
+	{
+		dump_attrs(wire);
+		f << indent << "wire<" << wire->width << "> " << mangle(wire);
+		if (wire->attributes.count(ID(init))) {
+			f << " ";
+			dump_const_init(wire->attributes.at(ID(init)));
+		}
+		f << ";\n";
+		// Wires used as edge triggers also get one flag per registered bit and edge direction.
+		if (sync_wires[wire])
+			for (const auto &sync_type : sync_types) {
+				if (sync_type.first.wire != wire)
+					continue;
+				if (sync_type.second != RTLIL::STn)
+					f << indent << "bool posedge_" << mangle(sync_type.first) << " = false;\n";
+				if (sync_type.second != RTLIL::STp)
+					f << indent << "bool negedge_" << mangle(sync_type.first) << " = false;\n";
+			}
+	}
+
+	void dump_memory(RTLIL::Module *module, const RTLIL::Memory *memory)
+	{
+		// Gather the $meminit cells whose MEMID names this memory.
+		vector<const RTLIL::Cell*> initializers;
+		for (auto cell : module->cells())
+			if (cell->type == ID($meminit) && cell->getParam(ID(MEMID)).decode_string() == memory->name.str())
+				initializers.push_back(cell);
+
+		// Highest PRIORITY first; cells of equal priority are ordered by ascending ADDR.
+		std::sort(initializers.begin(), initializers.end(), [](const RTLIL::Cell *lhs, const RTLIL::Cell *rhs) {
+			int lhs_addr = lhs->getPort(ID(ADDR)).as_int(), rhs_addr = rhs->getPort(ID(ADDR)).as_int();
+			int lhs_prio = lhs->getParam(ID(PRIORITY)).as_int(), rhs_prio = rhs->getParam(ID(PRIORITY)).as_int();
+			return lhs_prio != rhs_prio ? lhs_prio > rhs_prio : lhs_addr < rhs_addr;
+		});
+
+		dump_attrs(memory);
+		const char *access = writable_memories[memory] ? "rw" : "ro";
+		f << indent << "memory_" << access << "<" << memory->width << "> " << mangle(memory) << " { " << memory->size << "u";
+		if (initializers.empty()) {
+			f << " };\n";
+			return;
+		}
+		f << ",\n";
+		inc_indent();
+			for (auto cell : initializers) {
+				dump_attrs(cell);
+				RTLIL::Const data = cell->getPort(ID(DATA)).as_const();
+				size_t width = cell->getParam(ID(WIDTH)).as_int();
+				size_t words = cell->getParam(ID(WORDS)).as_int();
+				f << indent << "memory_" << access << "<" << memory->width << ">::init<" << words << "> { "
+				            << stringf("%#x", cell->getPort(ID(ADDR)).as_int()) << ", {";
+				inc_indent();
+					for (size_t n = 0; n < words; n++) {
+						if (n % 4 == 0)
+							f << "\n" << indent;
+						else
+							f << " ";
+						dump_const(data, width, n * width, /*fixed_width=*/true);
+						f << ",";
+					}
+				dec_indent();
+				f << "\n" << indent << "}},\n";
+			}
+		dec_indent();
+		f << indent << "};\n";
+	}
+
+	void dump_module(RTLIL::Module *module) // Emits the struct declaration, then eval() and commit().
+	{
+		dump_attrs(module);
+		f << "struct " << mangle(module) << " : public module {\n";
+		inc_indent();
+			for (auto wire : module->wires())
+				dump_wire(wire);
+			f << "\n";
+			for (auto memory : module->memories)
+				dump_memory(module, memory.second);
+			if (!module->memories.empty())
+				f << "\n";
+			f << indent << "void eval() override;\n";
+			f << indent << "bool commit() override;\n";
+		dec_indent();
+		f << "}; // struct " << mangle(module) << "\n";
+		f << "\n";
+
+		f << "void " << mangle(module) << "::eval() {\n";
+		inc_indent();
+			for (auto cell : module->cells())
+				dump_cell(cell);
+			f << indent << "// connections\n";
+			for (auto conn : module->connections())
+				dump_assign(conn);
+			for (auto proc : module->processes)
+				dump_process(proc.second);
+			for (auto sync_type : sync_types) { // edge flags are cleared once everything has been evaluated
+				if (sync_type.first.wire->module == module) {
+					if (sync_type.second != RTLIL::STn)
+						f << indent << "posedge_" << mangle(sync_type.first) << " = false;\n";
+					if (sync_type.second != RTLIL::STp)
+						f << indent << "negedge_" << mangle(sync_type.first) << " = false;\n";
+				}
+			}
+		dec_indent();
+		f << "}\n";
+
+		f << "\n";
+		f << "bool " << mangle(module) << "::commit() {\n";
+		inc_indent();
+			f << indent << "bool changed = false;\n";
+			for (auto wire : module->wires()) {
+				if (sync_wires[wire]) {
+					std::string wire_prev = mangle(wire) + "_prev";
+					std::string wire_curr = mangle(wire) + ".curr";
+					std::string wire_edge = mangle(wire) + "_edge";
+					f << indent << "value<" << wire->width << "> " << wire_prev << " = " << wire_curr << ";\n";
+					f << indent << "if (" << mangle(wire) << ".commit()) {\n";
+					inc_indent();
+						f << indent << "value<" << wire->width << "> " << wire_edge << " = "
+						            << wire_prev << ".bit_xor(" << wire_curr << ");\n";
+						for (auto sync_type : sync_types) {
+							if (sync_type.first.wire != wire)
+								continue;
+							if (sync_type.second != RTLIL::STn) {
+								f << indent << "if (" << wire_edge << ".slice<" << sync_type.first.offset << ">().val() && "
+								            << wire_curr << ".slice<" << sync_type.first.offset << ">().val())\n";
+								inc_indent();
+									f << indent << "posedge_" << mangle(sync_type.first) << " = true;\n";
+								dec_indent();
+							}
+							if (sync_type.second != RTLIL::STp) {
+								f << indent << "if (" << wire_edge << ".slice<" << sync_type.first.offset << ">().val() && "
+								            << "!" << wire_curr << ".slice<" << sync_type.first.offset << ">().val())\n";
+								inc_indent();
+									f << indent << "negedge_" << mangle(sync_type.first) << " = true;\n";
+								dec_indent();
+							}
+						}
+						f << indent << "changed = true;\n"; // once per wire, not once per sensitive bit
+					dec_indent();
+					f << indent << "}\n";
+				} else {
+					f << indent << "changed |= " << mangle(wire) << ".commit();\n";
+				}
+			}
+			for (auto memory : module->memories) {
+				if (!writable_memories[memory.second])
+					continue;
+				f << indent << "for (size_t i = 0; i < " << memory.second->size << "u; i++)\n";
+				inc_indent();
+					f << indent << "changed |= " << mangle(memory.second) << "[i].commit();\n";
+				dec_indent();
+			}
+			f << indent << "return changed;\n";
+		dec_indent();
+		f << "}\n";
+	}
+
+	void dump_design(RTLIL::Design *design)
+	{
+		// Prologue: pull in the support library and open the namespace holding all generated modules.
+		f << "#include <cxxrtl.h>\n";
+		f << "\n";
+		f << "using namespace cxxrtl_yosys;\n";
+		f << "\n";
+		f << "namespace cxxrtl_design {\n";
+		for (auto module : design->modules()) {
+			// Blackboxes and unselected modules produce no code.
+			if (module->get_blackbox_attribute() || !design->selected_module(module))
+				continue;
+
+			f << "\n";
+			dump_module(module);
+		}
+		// Epilogue: close the namespace.
+		f << "\n";
+		f << "} // namespace cxxrtl_design\n";
+	}
+
+	// Edge-type sync rules require us to emit edge detectors, which require coordination between
+	// eval and commit phases. To do this we need to collect them upfront.
+	//
+	// Note that the simulator commit phase operates at wire granularity but edge-type sync rules
+	// operate at wire bit granularity; it is possible to have code similar to:
+	//     wire [3:0] clocks;
+	//     always @(posedge clocks[0]) ...
+	// To handle this we track edge sensitivity both for wires and wire bits.
+	void register_edge_signal(SigMap &sigmap, RTLIL::SigSpec signal, RTLIL::SyncType type)
+	{
+		signal = sigmap(signal); // canonicalize first, so aliases of one signal share a single entry
+		log_assert(signal.is_wire() && signal.is_bit()); // NOTE(review): looks like only whole 1-bit wires pass — confirm
+		log_assert(type == RTLIL::STp || type == RTLIL::STn || type == RTLIL::STe);
+
+		RTLIL::SigBit sigbit = signal[0];
+		if (!sync_types.count(sigbit))
+			sync_types[sigbit] = type; // first registration: remember the requested edge
+		else if (sync_types[sigbit] != type)
+			sync_types[sigbit] = RTLIL::STe; // conflicting registrations: trigger on both edges
+		sync_wires.insert(signal.as_wire());
+	}
+
+	void analyze_design(RTLIL::Design *design) // Fills sigmaps, sync_wires/sync_types and writable_memories.
+	{
+		for (auto module : design->modules()) {
+			SigMap &sigmap = sigmaps[module];
+			sigmap.set(module);
+
+			for (auto cell : module->cells()) {
+				// Various DFF cells are treated like posedge/negedge processes, see above for details.
+				if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
+					if (cell->getPort(ID(CLK)).is_wire())
+						register_edge_signal(sigmap, cell->getPort(ID(CLK)),
+							cell->getParam(ID(CLK_POLARITY)).as_bool() ? RTLIL::STp : RTLIL::STn);
+					// The $adff and $dffsr cells are level-sensitive, not edge-sensitive (in spite of the fact that they
+					// are inferred from an edge-sensitive Verilog process) and do not correspond to an edge-type sync rule.
+				}
+				// Similar for memory port cells.
+				if (cell->type.in(ID($memrd), ID($memwr))) {
+					if (cell->getParam(ID(CLK_ENABLE)).as_bool()) {
+						if (cell->getPort(ID(CLK)).is_wire())
+							register_edge_signal(sigmap, cell->getPort(ID(CLK)),
+								cell->getParam(ID(CLK_POLARITY)).as_bool() ? RTLIL::STp : RTLIL::STn);
+					}
+				}
+				// Optimize access to read-only memories.
+				if (cell->type == ID($memwr))
+					writable_memories.insert(module->memories[cell->getParam(ID(MEMID)).decode_string()]);
+				// Handling of packed memories is delegated to the `memory_unpack` pass, so we can rely on the presence
+				// of RTLIL memory objects and $memrd/$memwr/$meminit cells.
+				if (cell->type.in(ID($mem)))
+					log_assert(false);
+			}
+
+			for (auto proc : module->processes)
+				for (auto sync : proc.second->syncs)
+					switch (sync->type) {
+						// Edge-type sync rules require pre-registration.
+						case RTLIL::STp:
+						case RTLIL::STn:
+						case RTLIL::STe:
+							register_edge_signal(sigmap, sync->signal, sync->type);
+							break;
+
+						// Level-type sync rules require no special handling.
+						case RTLIL::ST0:
+						case RTLIL::ST1:
+						case RTLIL::STa:
+							break;
+
+						// Handling of init-type sync rules is delegated to the `proc_init` pass, so we can use the wire
+						// attribute regardless of input.
+						case RTLIL::STi:
+							log_assert(false);
+							break; // never report the unrelated global clock error for an init rule
+						case RTLIL::STg:
+							log_cmd_error("Global clock is not supported.\n");
+					}
+		}
+	}
+
+	void check_design(RTLIL::Design *design, bool &has_sync_init, bool &has_packed_mem)
+	{
+		// Both flags are outputs; they are reset here and set when the matching construct is found.
+		has_sync_init = false;
+		has_packed_mem = false;
+
+		for (auto module : design->modules()) {
+			if (module->get_blackbox_attribute())
+				continue;
+
+			const bool partially_selected = !design->selected_whole_module(module) && design->selected_module(module);
+			if (partially_selected)
+				log_cmd_error("Can't handle partially selected module %s!\n", id2cstr(module->name));
+
+			for (auto proc : module->processes)
+				for (auto sync : proc.second->syncs)
+					has_sync_init |= (sync->type == RTLIL::STi);
+
+			for (auto cell : module->cells())
+				has_packed_mem |= (cell->type == ID($mem));
+		}
+	}
+
+	void prepare_design(RTLIL::Design *design)
+	{
+		bool needs_proc_init, needs_memory_unpack;
+		check_design(design, needs_proc_init, needs_memory_unpack);
+		const bool modified = needs_proc_init || needs_memory_unpack;
+		if (needs_proc_init)
+			Pass::call(design, "proc_init");
+		if (needs_memory_unpack)
+			Pass::call(design, "memory_unpack");
+		// A helper pass changed the design: check again, the offending constructs must now be gone.
+		if (modified)
+			check_design(design, needs_proc_init, needs_memory_unpack);
+		log_assert(!needs_proc_init && !needs_memory_unpack);
+		analyze_design(design);
+	}
+};
+
+struct CxxrtlBackend : public Backend {
+	CxxrtlBackend() : Backend("cxxrtl", "convert design to C++ RTL simulation") { }
+	void help() YS_OVERRIDE
+	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+		log("\n");
+		log("    write_cxxrtl [options] [filename]\n");
+		log("\n");
+		log("Write C++ code for simulating the design.\n");
+		log("\n");
+	}
+	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
+	{
+		log_header(design, "Executing CXXRTL backend.\n");
+
+		size_t argidx;
+		for (argidx = 1; argidx < args.size(); argidx++) // no option is parsed yet: the loop stops at the first argument
+		{
+			// if (args[argidx] == "-top" && argidx+1 < args.size()) {
+			// 	top_module_name = args[++argidx];
+			// 	continue;
+			// }
+			break;
+		}
+		extra_args(f, filename, args, argidx);
+
+		CxxrtlWorker worker(*f);
+		worker.prepare_design(design); // may run `proc_init` / `memory_unpack` on the design before analyzing it
+		worker.dump_design(design);
+	}
+} CxxrtlBackend;
+
+PRIVATE_NAMESPACE_END
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
new file mode 100644
index 000000000..d066530f2
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl.h
@@ -0,0 +1,1104 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2019  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+// This file is included by the designs generated with `write_cxxrtl`. It is not used in Yosys itself.
+
+#ifndef CXXRTL_H
+#define CXXRTL_H
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <tuple>
+#include <vector>
+#include <sstream>
+
+// The cxxrtl support library implements compile time specialized arbitrary width arithmetic, and provides
+// composite lvalues made out of bit slices and concatenations of lvalues. This allows the `write_cxxrtl` pass
+// to perform a straightforward translation of RTLIL structures to readable C++, relying on the C++ compiler
+// to unwrap the abstraction and generate efficient code.
+namespace cxxrtl {
+
+// All arbitrary-width values in cxxrtl are backed by arrays of unsigned integers called chunks. The chunk size
+// is the same regardless of the value width to simplify manipulating values via FFI interfaces, e.g. driving
+// and introspecting the simulation in Python.
+//
+// It is practical to use chunk sizes between 32 bits and platform register size because when arithmetic on
+// narrower integer types is legalized by the C++ compiler, it inserts code to clear the high bits of the register.
+// However, (a) most of our operations do not change those bits in the first place because of invariants that are
+// invisible to the compiler, (b) we often operate on non-power-of-2 values and have to clear the high bits anyway.
+// Therefore, using relatively wide chunks and clearing the high bits explicitly and only when we know they may be
+// clobbered results in simpler generated code.
+template<typename T>
+struct chunk_traits {
+	typedef T type; // underlying storage type of a single chunk
+	static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
+	              "chunk type must be an unsigned integral type");
+	static constexpr T mask = std::numeric_limits<T>::max(); // every bit of a chunk set
+	static constexpr size_t bits = std::numeric_limits<T>::digits; // chunk width in bits
+};
+
+template<class T>
+struct expr_base;
+
+template<size_t Bits>
+struct value : public expr_base<value<Bits>> {
+	static constexpr size_t bits = Bits;
+
+	using chunk = chunk_traits<uint32_t>;
+	static constexpr chunk::type msb_mask = (Bits % chunk::bits == 0) ? chunk::mask
+		: chunk::mask >> (chunk::bits - (Bits % chunk::bits));
+
+	static constexpr size_t chunks = (Bits + chunk::bits - 1) / chunk::bits;
+	chunk::type data[chunks] = {};
+
+	value() = default;
+	template<typename... Init>
+	explicit constexpr value(Init ...init) : data{init...} {}
+
+	// This allows using value<> as well as wire<> in memory initializers.
+	using init = value<Bits>;
+
+	value(const value<Bits> &) = default;
+	value(value<Bits> &&) = default;
+	value<Bits> &operator=(const value<Bits> &) = default;
+
+	// A (no-op) helper that forces the cast to value<>.
+	const value<Bits> &val() const { // identity; gives value<> the same val() accessor as slice_expr and concat_expr
+		return *this;
+	}
+
+	std::string str() const {
+		std::ostringstream out; // formatted by the operator<< overload for value<>
+		out << *this;
+		return out.str();
+	}
+
+	// Operations with compile-time parameters.
+	//
+	// These operations are used to implement slicing, concatenation, and blitting.
+	// The trunc, zext and sext operations add or remove most significant bits (i.e. on the left);
+	// the rtrunc and rzext operations add or remove least significant bits (i.e. on the right).
+	template<size_t NewBits>
+	value<NewBits> trunc() const {
+		static_assert(NewBits <= Bits, "trunc() may not increase width");
+		value<NewBits> narrowed;
+		for (size_t idx = narrowed.chunks; idx > 0; idx--) // keep only the low chunks
+			narrowed.data[idx - 1] = data[idx - 1];
+		narrowed.data[narrowed.chunks - 1] &= narrowed.msb_mask; // drop the bits at or above NewBits
+		return narrowed;
+	}
+
+	template<size_t NewBits>
+	value<NewBits> zext() const {
+		static_assert(NewBits >= Bits, "zext() may not decrease width");
+		value<NewBits> widened; // starts out all-zero, which provides the extension bits
+		for (size_t idx = chunks; idx > 0; idx--)
+			widened.data[idx - 1] = data[idx - 1];
+		return widened;
+	}
+
+	template<size_t NewBits>
+	value<NewBits> sext() const {
+		static_assert(NewBits >= Bits, "sext() may not decrease width");
+		value<NewBits> widened;
+		for (size_t idx = 0; idx != chunks; ++idx)
+			widened.data[idx] = data[idx];
+		if (!is_neg())
+			return widened; // non-negative: the zero fill already is the right extension
+		widened.data[chunks - 1] |= ~msb_mask; // set the unused high bits of our own top chunk
+		for (size_t idx = chunks; idx != widened.chunks; ++idx)
+			widened.data[idx] = chunk::mask;
+		widened.data[widened.chunks - 1] &= widened.msb_mask; // trim back to NewBits
+		return widened;
+	}
+
+	template<size_t NewBits>
+	value<NewBits> rtrunc() const { // Keeps the NewBits most significant bits, moved down to bit 0.
+		static_assert(NewBits <= Bits, "rtrunc() may not increase width");
+		value<NewBits> result;
+		constexpr size_t shift_chunks = (Bits - NewBits) / chunk::bits; // whole chunks removed on the right
+		constexpr size_t shift_bits   = (Bits - NewBits) % chunk::bits; // further bits removed within a chunk
+		chunk::type carry = 0;
+		if (shift_chunks + result.chunks < chunks) { // a source chunk above the result window exists; seed the carry from it
+			carry = (shift_bits == 0) ? 0
+				: data[shift_chunks + result.chunks] << (chunk::bits - shift_bits);
+		}
+		for (size_t n = result.chunks; n > 0; n--) { // walk from the most significant result chunk downwards
+			result.data[n - 1] = carry | (data[shift_chunks + n - 1] >> shift_bits);
+			carry = (shift_bits == 0) ? 0 // guard: shifting by the full chunk width would be undefined
+				: data[shift_chunks + n - 1] << (chunk::bits - shift_bits);
+		}
+		return result;
+	}
+
+	template<size_t NewBits>
+	value<NewBits> rzext() const { // Appends (NewBits - Bits) zero bits on the right, i.e. shifts left by a constant.
+		static_assert(NewBits >= Bits, "rzext() may not decrease width");
+		value<NewBits> result;
+		constexpr size_t shift_chunks = (NewBits - Bits) / chunk::bits; // whole zero chunks inserted on the right
+		constexpr size_t shift_bits   = (NewBits - Bits) % chunk::bits; // further zero bits inserted within a chunk
+		chunk::type carry = 0;
+		for (size_t n = 0; n < chunks; n++) {
+			result.data[shift_chunks + n] = (data[n] << shift_bits) | carry;
+			carry = (shift_bits == 0) ? 0 // guard: shifting by the full chunk width would be undefined
+				: data[n] >> (chunk::bits - shift_bits);
+		}
+		if (carry != 0) // bits shifted out of the last source chunk go into the top result chunk
+			result.data[result.chunks - 1] = carry;
+		return result;
+	}
+
+	// Bit blit operation, i.e. a partial read-modify-write.
+	template<size_t Stop, size_t Start>
+	value<Bits> blit(const value<Stop - Start + 1> &source) const { // Returns a copy with bits [Stop:Start] replaced by `source`.
+		static_assert(Stop >= Start, "blit() may not reverse bit order");
+		constexpr chunk::type start_mask = ~(chunk::mask << (Start % chunk::bits)); // keeps the bits below Start in its chunk
+		constexpr chunk::type stop_mask = (Stop % chunk::bits + 1 == chunk::bits) ? 0 // keeps the bits above Stop in its chunk
+			: (chunk::mask << (Stop % chunk::bits + 1));
+		value<Bits> masked = *this;
+		if (Start / chunk::bits == Stop / chunk::bits) { // the range lies within a single chunk
+			masked.data[Start / chunk::bits] &= stop_mask | start_mask;
+		} else {
+			masked.data[Start / chunk::bits] &= start_mask;
+			for (size_t n = Start / chunk::bits + 1; n < Stop / chunk::bits; n++) // chunks strictly inside the range are cleared entirely
+				masked.data[n] = 0;
+			masked.data[Stop / chunk::bits] &= stop_mask;
+		}
+		value<Bits> shifted = source // move `source` up so that its bit 0 lands on bit Start
+			.template rzext<Stop + 1>()
+			.template zext<Bits>();
+		return masked.bit_or(shifted);
+	}
+
+	// Helpers for selecting extending or truncating operation depending on whether the result is wider or narrower
+	// than the operand. In C++17 these can be replaced with `if constexpr`.
+	template<size_t NewBits, typename = void>
+	struct zext_cast { // primary template, used when NewBits >= Bits: zero-extend
+		value<NewBits> operator()(const value<Bits> &val) {
+			return val.template zext<NewBits>();
+		}
+	};
+
+	template<size_t NewBits>
+	struct zext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> { // selected when NewBits < Bits: truncate
+		value<NewBits> operator()(const value<Bits> &val) {
+			return val.template trunc<NewBits>();
+		}
+	};
+
+	template<size_t NewBits, typename = void>
+	struct sext_cast { // primary template, used when NewBits >= Bits: sign-extend
+		value<NewBits> operator()(const value<Bits> &val) {
+			return val.template sext<NewBits>();
+		}
+	};
+
+	template<size_t NewBits>
+	struct sext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> { // selected when NewBits < Bits: truncate
+		value<NewBits> operator()(const value<Bits> &val) {
+			return val.template trunc<NewBits>();
+		}
+	};
+
+	template<size_t NewBits>
+	value<NewBits> zcast() const { // Resizes to NewBits: zero-extends when growing, truncates when shrinking.
+		return zext_cast<NewBits>()(*this);
+	}
+
+	template<size_t NewBits>
+	value<NewBits> scast() const { // Resizes to NewBits: sign-extends when growing, truncates when shrinking.
+		return sext_cast<NewBits>()(*this);
+	}
+
+	// Operations with run-time parameters (offsets, amounts, etc).
+	//
+	// These operations are used for computations.
+	bool bit(size_t offset) const { // Tests the bit at `offset`, counted from the least significant bit.
+		return data[offset / chunk::bits] & (chunk::type(1) << (offset % chunk::bits)); // mask built in the chunk type, not in int
+	}
+
+	void set_bit(size_t offset, bool value = true) { // Writes `value` into the bit at `offset`.
+		size_t offset_chunks = offset / chunk::bits;
+		size_t offset_bits = offset % chunk::bits;
+		const chunk::type bit_mask = chunk::type(1) << offset_bits; // built in the chunk type; an int `1` breaks for bit 31 and wider chunks
+		data[offset_chunks] = (data[offset_chunks] & ~bit_mask) | (value ? bit_mask : chunk::type(0));
+	}
+
+	bool is_zero() const { // True only if no chunk has any bit set.
+		size_t idx = 0;
+		while (idx < chunks && data[idx] == 0)
+			idx++;
+		return idx == chunks;
+	}
+
+	explicit operator bool() const { // true if any bit is set; explicit, so it only applies in boolean contexts and casts
+		return !is_zero();
+	}
+
+	bool is_neg() const { // True if the most significant bit (bit Bits - 1) is set.
+		return data[chunks - 1] & (chunk::type(1) << ((Bits - 1) % chunk::bits)); // mask built in the chunk type, not in int
+	}
+
+	bool operator ==(const value<Bits> &other) const { // chunk-wise equality
+		size_t idx = 0;
+		while (idx < chunks && data[idx] == other.data[idx])
+			idx++;
+		return idx == chunks;
+	}
+
+	bool operator !=(const value<Bits> &other) const { // defined as the negation of operator ==
+		return !(*this == other);
+	}
+
+	value<Bits> bit_not() const {
+		value<Bits> inverted;
+		for (size_t idx = chunks; idx > 0; idx--)
+			inverted.data[idx - 1] = ~data[idx - 1];
+		inverted.data[chunks - 1] &= msb_mask; // inversion set the unused high bits; clear them again
+		return inverted;
+	}
+
+	value<Bits> bit_and(const value<Bits> &other) const {
+		value<Bits> combined = *this; // start from our chunks, then fold in the other operand
+		for (size_t idx = 0; idx != chunks; ++idx)
+			combined.data[idx] &= other.data[idx];
+		return combined;
+	}
+
+	value<Bits> bit_or(const value<Bits> &other) const {
+		value<Bits> combined = *this; // start from our chunks, then fold in the other operand
+		for (size_t idx = 0; idx != chunks; ++idx)
+			combined.data[idx] |= other.data[idx];
+		return combined;
+	}
+
+	value<Bits> bit_xor(const value<Bits> &other) const {
+		value<Bits> combined = *this; // start from our chunks, then fold in the other operand
+		for (size_t idx = 0; idx != chunks; ++idx)
+			combined.data[idx] ^= other.data[idx];
+		return combined;
+	}
+
+	template<size_t AmountBits>
+	value<Bits> shl(const value<AmountBits> &amount) const { // Left shift by a run-time amount; bits moved past Bits are discarded.
+		// Ensure our early return is correct by prohibiting values larger than 4 Gbit.
+		static_assert(Bits <= chunk::mask, "shl() of unreasonably large values is not supported");
+		// Detect shifts definitely larger than Bits early.
+		for (size_t n = 1; n < amount.chunks; n++)
+			if (amount.data[n] != 0)
+				return {};
+		// Past this point we can use the least significant chunk as the shift size.
+		size_t shift_chunks = amount.data[0] / chunk::bits;
+		size_t shift_bits   = amount.data[0] % chunk::bits;
+		if (shift_chunks >= chunks)
+			return {};
+		value<Bits> result;
+		chunk::type carry = 0;
+		for (size_t n = 0; n < chunks - shift_chunks; n++) {
+			result.data[shift_chunks + n] = (data[n] << shift_bits) | carry;
+			carry = (shift_bits == 0) ? 0 : data[n] >> (chunk::bits - shift_bits); // guard: a full-width shift would be undefined
+		}
+		result.data[chunks - 1] &= msb_mask; // bits shifted past Bits must not linger in the top chunk
+		return result;
+	}
+
+	template<size_t AmountBits, bool Signed = false>
+	value<Bits> shr(const value<AmountBits> &amount) const { // Right shift by a run-time amount; sign-filling if Signed, zero-filling otherwise.
+		// Ensure our early return is correct by prohibiting values larger than 4 Gbit.
+		static_assert(Bits <= chunk::mask, "shr() of unreasonably large values is not supported");
+		const bool fill_ones = Signed && is_neg(); // vacated bits are copies of the sign bit
+		const value<Bits> all_shifted_out = fill_ones ? value<Bits>().bit_not() : value<Bits>();
+		for (size_t n = 1; n < amount.chunks; n++) // detect shifts definitely larger than Bits early
+			if (amount.data[n] != 0)
+				return all_shifted_out;
+		if (amount.data[0] >= Bits) // past this point the least significant chunk is the shift size
+			return all_shifted_out;
+		size_t shift_chunks = amount.data[0] / chunk::bits;
+		size_t shift_bits   = amount.data[0] % chunk::bits;
+		value<Bits> result;
+		chunk::type carry = 0;
+		for (size_t n = 0; n < chunks - shift_chunks; n++) {
+			result.data[chunks - shift_chunks - 1 - n] = carry | (data[chunks - 1 - n] >> shift_bits);
+			carry = (shift_bits == 0) ? 0 : data[chunks - 1 - n] << (chunk::bits - shift_bits);
+		}
+		if (fill_ones && amount.data[0] != 0) {
+			size_t fill_start = Bits - amount.data[0]; // lowest vacated bit position; always below Bits here
+			for (size_t n = fill_start / chunk::bits + 1; n < chunks; n++)
+				result.data[n] = chunk::mask;
+			result.data[fill_start / chunk::bits] |= chunk::mask << (fill_start % chunk::bits);
+			result.data[chunks - 1] &= msb_mask; // keep the unused high bits of the top chunk clear
+		}
+		return result;
+	}
+
+	template<size_t AmountBits>
+	value<Bits> sshr(const value<AmountBits> &amount) const { // Forwards to shr() with Signed=true.
+		return shr<AmountBits, /*Signed=*/true>(amount);
+	}
+
+	size_t ctpop() const { // Population count: the number of set bits in the value.
+		size_t count = 0;
+		for (size_t n = 0; n < chunks; n++) {
+			// This loop implements the population count idiom as recognized by LLVM and GCC.
+			for (chunk::type x = data[n]; x != 0; count++)
+				x = x & (x - 1); // clears the lowest set bit, so the loop runs once per set bit
+		}
+		return count;
+	}
+
+	size_t ctlz() const { // Counts the zero bits above the most significant set bit; returns Bits if the value is zero.
+		constexpr size_t msb_chunk_bits = (Bits % chunk::bits != 0) ? Bits % chunk::bits : chunk::bits; // bits used in the top chunk
+		size_t count = 0;
+		for (size_t n = 0; n < chunks; n++) {
+			chunk::type x = data[chunks - 1 - n];
+			count += (n == 0 ? msb_chunk_bits : chunk::bits); // first assume this whole chunk is leading zeroes
+			if (x == 0)
+				continue;
+			for (; x != 0; count--) // a set bit was found: take back one count per bit at or below it...
+				x >>= 1;
+			break; // ...and stop, since lower chunks hold no leading zeroes
+		}
+		return count;
+	}
+
+	template<bool Invert, bool CarryIn>
+	std::pair<value<Bits>, bool /*CarryOut*/> alu(const value<Bits> &other) const { // Chunk-wise adder: this + (Invert ? ~other : other) + CarryIn.
+		value<Bits> result;
+		bool carry = CarryIn;
+		for (size_t n = 0; n < result.chunks; n++) {
+			result.data[n] = data[n] + (Invert ? ~other.data[n] : other.data[n]) + carry;
+			carry = (result.data[n] <  data[n]) || // the chunk sum wrapped around...
+			        (result.data[n] == data[n] && carry); // ...or addend plus carry-in amounted to exactly one full wrap
+		}
+		result.data[result.chunks - 1] &= result.msb_mask; // the returned carry is the carry out of the whole top chunk, taken before this mask
+		return {result, carry};
+	}
+
+	value<Bits> add(const value<Bits> &other) const { // this + other, wrapped to Bits; the carry out is discarded
+		return alu</*Invert=*/false, /*CarryIn=*/false>(other).first;
+	}
+
+	value<Bits> sub(const value<Bits> &other) const { // this - other, computed as this + ~other + 1 and wrapped to Bits
+		return alu</*Invert=*/true, /*CarryIn=*/true>(other).first;
+	}
+
+	value<Bits> neg() const { // two's complement negation, computed as 0 - this
+		return value<Bits> { 0u }.sub(*this);
+	}
+
+	bool ucmp(const value<Bits> &other) const {
+		// Computing a - b as a + ~b + 1 produces a carry out exactly when no borrow occurs.
+		const bool no_borrow = alu</*Invert=*/true, /*CarryIn=*/true>(other).second;
+		return !no_borrow; // a.ucmp(b) ≡ a u< b
+	}
+
+	bool scmp(const value<Bits> &other) const {
+		const value<Bits> diff = alu</*Invert=*/true, /*CarryIn=*/true>(other).first; // a - b, wrapped to Bits
+		const bool lhs_neg = is_neg(), rhs_neg = other.is_neg(), diff_neg = diff.is_neg();
+		// The subtraction overflows only if the operand signs differ and the result sign differs from a's.
+		const bool overflow = (lhs_neg != rhs_neg) && (lhs_neg != diff_neg);
+		return diff_neg != overflow; // a.scmp(b) ≡ a s< b
+	}
+};
+
+// Expression template for a slice, usable as lvalue or rvalue, and composable with other expression templates here.
+template<class T, size_t Stop, size_t Start>
+struct slice_expr : public expr_base<slice_expr<T, Stop, Start>> { // View of bits [Stop:Start] of `expr`.
+	static_assert(Stop >= Start, "slice_expr() may not reverse bit order");
+	static_assert(Start < T::bits && Stop < T::bits, "slice_expr() must be within bounds");
+	static constexpr size_t bits = Stop - Start + 1;
+
+	T &expr; // the sliced expression is held by reference, not copied
+
+	slice_expr(T &expr) : expr(expr) {}
+	slice_expr(const slice_expr<T, Stop, Start> &) = delete;
+
+	operator value<bits>() const { // read: shift the slice down to bit 0, then cut off everything above it
+		return static_cast<const value<T::bits> &>(expr)
+			.template rtrunc<T::bits - Start>()
+			.template trunc<bits>();
+	}
+
+	slice_expr<T, Stop, Start> &operator=(const value<bits> &rhs) {
+		// Generic partial assignment implemented using a read-modify-write operation on the sliced expression.
+		expr = static_cast<const value<T::bits> &>(expr)
+			.template blit<Stop, Start>(rhs);
+		return *this;
+	}
+
+	// A helper that forces the cast to value<>, which allows deduction to work.
+	value<bits> val() const {
+		return static_cast<const value<bits> &>(*this);
+	}
+};
+
+// Expression template for a concatenation, usable as lvalue or rvalue, and composable with other expression templates here.
+template<class T, class U>
+struct concat_expr : public expr_base<concat_expr<T, U>> { // View of `ms_expr` (upper bits) joined with `ls_expr` (lower bits).
+	static constexpr size_t bits = T::bits + U::bits;
+
+	T &ms_expr; // both halves are held by reference, not copied
+	U &ls_expr;
+
+	concat_expr(T &ms_expr, U &ls_expr) : ms_expr(ms_expr), ls_expr(ls_expr) {}
+	concat_expr(const concat_expr<T, U> &) = delete;
+
+	operator value<bits>() const { // read: move the upper half above the lower half, then OR the two together
+		value<bits> ms_shifted = static_cast<const value<T::bits> &>(ms_expr)
+			.template rzext<bits>();
+		value<bits> ls_extended = static_cast<const value<U::bits> &>(ls_expr)
+			.template zext<bits>();
+		return ms_shifted.bit_or(ls_extended);
+	}
+
+	concat_expr<T, U> &operator=(const value<bits> &rhs) { // write: split `rhs` and assign each half to its expression
+		ms_expr = rhs.template rtrunc<T::bits>();
+		ls_expr = rhs.template trunc<U::bits>();
+		return *this;
+	}
+
+	// A helper that forces the cast to value<>, which allows deduction to work.
+	value<bits> val() const {
+		return static_cast<const value<bits> &>(*this);
+	}
+};
+
+// Base class for expression templates, providing helper methods for operations that are valid on both rvalues and lvalues.
+//
+// Note that expression objects (slices and concatenations) constructed in this way should NEVER be captured because
+// they refer to temporaries that will, in general, only live until the end of the statement. For example, both of
+// these snippets perform use-after-free:
+//
+//    const auto &a = val.slice<7,0>().slice<1>();
+//    value<1> b = a;
+//
+//    auto &&c = val.slice<7,0>().slice<1>();
+//    c = value<1>{1u};
+//
+// An easy way to write code using slices and concatenations safely is to follow two simple rules:
+//   * Never explicitly name any type except `value<W>` or `const value<W> &`.
+//   * Never use a `const auto &` or `auto &&` in any such expression.
+// Then, any code that compiles will be well-defined.
+template<class T>
+struct expr_base { // CRTP base: T is the derived expression type, and `this` is cast back to it below.
+	template<size_t Stop, size_t Start = Stop>
+	slice_expr<const T, Stop, Start> slice() const { // read-only slice; Start defaults to Stop, giving a single bit
+		return {*static_cast<const T *>(this)};
+	}
+
+	template<size_t Stop, size_t Start = Stop>
+	slice_expr<T, Stop, Start> slice() { // assignable slice of a non-const expression
+		return {*static_cast<T *>(this)};
+	}
+
+	template<class U>
+	concat_expr<const T, typename std::remove_reference<const U>::type> concat(const U &other) const { // read-only; *this supplies the upper bits
+		return {*static_cast<const T *>(this), other};
+	}
+
+	template<class U>
+	concat_expr<T, typename std::remove_reference<U>::type> concat(U &&other) { // non-const overload; *this supplies the upper bits
+		return {*static_cast<T *>(this), other};
+	}
+};
+
+template<size_t Bits>
+std::ostream &operator<<(std::ostream &os, const value<Bits> &val) { // Prints <width>'<hex digits>, most significant chunk first.
+	auto old_flags = os.flags(std::ios::right); // formatting state is saved here and restored before returning
+	auto old_width = os.width(0);
+	auto old_fill  = os.fill('0');
+	os << val.bits << '\'' << std::hex;
+	for (size_t n = val.chunks - 1; n != (size_t)-1; n--) { // counts down through 0; the unsigned index then wraps to (size_t)-1
+		if (n == val.chunks - 1 && Bits % value<Bits>::chunk::bits != 0)
+			os.width((Bits % value<Bits>::chunk::bits + 3) / 4); // partial top chunk: only as many hex digits as it needs
+		else
+			os.width((value<Bits>::chunk::bits + 3) / 4);
+		os << val.data[n];
+	}
+	os.fill(old_fill);
+	os.width(old_width);
+	os.flags(old_flags);
+	return os;
+}
+
+template<size_t Bits>
+struct wire { // Holds a current value and a pending next value; commit() copies next into curr.
+	static constexpr size_t bits = Bits;
+
+	value<Bits> curr;
+	value<Bits> next;
+
+	wire() = default;
+	constexpr wire(const value<Bits> &init) : curr(init), next(init) {}
+	template<typename... Init>
+	explicit constexpr wire(Init ...init) : curr{init...}, next{init...} {}
+
+	wire(const wire<Bits> &) = delete;
+	wire(wire<Bits> &&) = default;
+	wire<Bits> &operator=(const wire<Bits> &) = delete;
+
+	// We want to avoid having operator=(wire<>) or operator=(value<>) that overwrites both curr and next,
+	// since this operation is almost always wrong. But we also need an operation like that for memory
+	// initialization. This is solved by adding a wrapper and making the use of operator= valid only when
+	// this wrapper is used.
+	struct init {
+		value<Bits> data;
+	};
+
+	wire<Bits> &operator=(const init &init) { // the only assignment provided: sets both curr and next
+		curr = next = init.data;
+		return *this;
+	}
+
+	bool commit() { // Copies next into curr; returns true if that changed curr.
+		if (curr != next) {
+			curr = next;
+			return true;
+		}
+		return false;
+	}
+};
+
+template<size_t Bits>
+std::ostream &operator<<(std::ostream &os, const wire<Bits> &val) {
+	// Only the current value is printed; the pending `next` value is not shown.
+	return os << val.curr;
+}
+
+template<class Elem>
+struct memory { // Fixed-depth array of Elem, indexable by a plain index or by a value<> address.
+	using StoredElem = typename std::remove_const<Elem>::type; // storage is non-const even if Elem is const
+	std::vector<StoredElem> data;
+
+	static constexpr size_t width = StoredElem::bits;
+	size_t depth() const {
+		return data.size();
+	}
+
+	memory() = delete;
+	explicit memory(size_t depth) : data(depth) {}
+
+	memory(const memory<Elem> &) = delete;
+	memory<Elem> &operator=(const memory<Elem> &) = delete;
+
+	// The only way to get the compiler to put the initializer in .rodata and do not copy it on stack is to stuff it
+	// into a plain array. You'd think an std::initializer_list would work here, but it doesn't, because you can't
+	// construct an initializer_list in a constexpr (or something) and so if you try to do that the whole thing is
+	// first copied on the stack (probably overflowing it) and then again into `data`.
+	template<size_t Size>
+	struct init {
+		size_t offset; // index of the first element that `data` initializes
+		typename Elem::init data[Size];
+	};
+
+	template<size_t... InitSize>
+	explicit memory(size_t depth, const init<InitSize> &...init) : data(depth) {
+		// FIXME: assert(init.size() <= depth);
+		data.resize(depth); // no effect on the size: `data` was already constructed with `depth` elements
+		// This utterly reprehensible construct is the most reasonable way to apply a function to every element
+		// of a parameter pack, if the elements all have different types and so cannot be cast to an initializer list.
+		auto _ = {std::move(std::begin(init.data), std::end(init.data), data.begin() + init.offset)...};
+	}
+
+	Elem &operator [](size_t index) {
+		// FIXME: assert(index < data.size());
+		return data[index]; // not bounds-checked (see FIXME above)
+	}
+
+	template<size_t AddrBits>
+	Elem &operator [](const value<AddrBits> &addr) {
+		static_assert(value<AddrBits>::chunks <= 1, "memory indexing with unreasonably large address is not supported");
+		return (*this)[addr.data[0]]; // the address fits in one chunk, so chunk 0 is the whole index
+	}
+};
+
+template<size_t Width>
+using memory_rw = memory<wire<Width>>;
+
+template<size_t Width>
+using memory_ro = memory<const value<Width>>;
+
+struct module { // Abstract base with eval()/commit() hooks and a step() loop built on them.
+	module() {}
+	virtual ~module() {}
+
+	module(const module &) = delete;
+	module &operator=(const module &) = delete;
+
+	virtual void eval() = 0;
+	virtual bool commit() = 0; // step() keeps iterating for as long as this returns true
+
+	size_t step() { // Runs eval() then commit() until commit() returns false; returns the number of eval() calls.
+		size_t deltas = 0;
+		do {
+			eval();
+			deltas++;
+		} while (commit());
+		return deltas;
+	}
+};
+
+} // namespace cxxrtl
+
+// Definitions of internal Yosys cells. Other than the functions in this namespace, cxxrtl is fully generic
+// and independent of Yosys implementation details.
+//
+// The `write_cxxrtl` pass translates internal cells (cells with names that start with `$`) to calls of these
+// functions. All of Yosys arithmetic and logical cells perform sign or zero extension on their operands,
+// whereas basic operations on arbitrary width values require operands to be of the same width. These functions
+// bridge the gap by performing the necessary casts. They are named similar to `cell_A[B]`, where A and B are `u`
+// if the corresponding operand is unsigned, and `s` if it is signed.
+namespace cxxrtl_yosys {
+
+using namespace cxxrtl;
+
+// std::max isn't constexpr until C++14 for no particular reason (it's an oversight), so we define our own.
+template<class T>
+constexpr T max(const T &a, const T &b) { // returns a copy of the larger argument; b when they compare equal
+	return a > b ? a : b;
+}
+
+// Logic operations
+template<size_t BitsY, size_t BitsA>
+value<BitsY> not_u(const value<BitsA> &a) { // bitwise NOT of A after zero-extending/truncating it to BitsY
+	return a.template zcast<BitsY>().bit_not();
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> not_s(const value<BitsA> &a) { // bitwise NOT of A after sign-extending/truncating it to BitsY
+	return a.template scast<BitsY>().bit_not();
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> logic_not_u(const value<BitsA> &a) { // Y = 1 if A is all zeroes, else 0
+	return value<BitsY> { a ? 0u : 1u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> logic_not_s(const value<BitsA> &a) { // Y = 1 if A is all zeroes, else 0; same body as logic_not_u
+	return value<BitsY> { a ? 0u : 1u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_and_u(const value<BitsA> &a) { // Y = 1 if every bit of A is set (i.e. ~A is zero), else 0
+	return value<BitsY> { a.bit_not().is_zero() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_and_s(const value<BitsA> &a) { // Y = 1 if every bit of A is set, else 0; same body as reduce_and_u
+	return value<BitsY> { a.bit_not().is_zero() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_or_u(const value<BitsA> &a) { // Y = 1 if any bit of A is set, else 0
+	return value<BitsY> { a ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_or_s(const value<BitsA> &a) { // Y = 1 if any bit of A is set, else 0; same body as reduce_or_u
+	return value<BitsY> { a ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_xor_u(const value<BitsA> &a) { // Y = 1 if an odd number of bits of A are set, else 0
+	return value<BitsY> { (a.ctpop() % 2) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_xor_s(const value<BitsA> &a) { // Y = 1 if an odd number of bits of A are set; same body as reduce_xor_u
+	return value<BitsY> { (a.ctpop() % 2) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_xnor_u(const value<BitsA> &a) { // Y = 1 if an even number of bits of A are set, else 0
+	return value<BitsY> { (a.ctpop() % 2) ? 0u : 1u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_xnor_s(const value<BitsA> &a) { // Y = 1 if an even number of bits of A are set; same body as reduce_xnor_u
+	return value<BitsY> { (a.ctpop() % 2) ? 0u : 1u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_bool_u(const value<BitsA> &a) { // Y = 1 if A is nonzero, else 0
+	return value<BitsY> { a ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA>
+value<BitsY> reduce_bool_s(const value<BitsA> &a) { // Y = 1 if A is nonzero, else 0; same body as reduce_bool_u
+	return value<BitsY> { a ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> and_uu(const value<BitsA> &a, const value<BitsB> &b) { // A & B, both zero-extended/truncated to BitsY first
+	return a.template zcast<BitsY>().bit_and(b.template zcast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> and_ss(const value<BitsA> &a, const value<BitsB> &b) { // A & B, both sign-extended/truncated to BitsY first
+	return a.template scast<BitsY>().bit_and(b.template scast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> or_uu(const value<BitsA> &a, const value<BitsB> &b) { // A | B, both zero-extended/truncated to BitsY first
+	return a.template zcast<BitsY>().bit_or(b.template zcast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> or_ss(const value<BitsA> &a, const value<BitsB> &b) { // A | B, both sign-extended/truncated to BitsY first
+	return a.template scast<BitsY>().bit_or(b.template scast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> xor_uu(const value<BitsA> &a, const value<BitsB> &b) { // A ^ B, both zero-extended/truncated to BitsY first
+	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> xor_ss(const value<BitsA> &a, const value<BitsB> &b) { // A ^ B, both sign-extended/truncated to BitsY first
+	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>());
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> xnor_uu(const value<BitsA> &a, const value<BitsB> &b) { // ~(A ^ B), both zero-extended/truncated to BitsY first
+	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>()).bit_not();
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> xnor_ss(const value<BitsA> &a, const value<BitsB> &b) { // ~(A ^ B), both sign-extended/truncated to BitsY first
+	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>()).bit_not();
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> logic_and_uu(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if both A and B are nonzero, else 0
+	return value<BitsY> { (bool(a) & bool(b)) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> logic_and_ss(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if both A and B are nonzero; same body as logic_and_uu
+	return value<BitsY> { (bool(a) & bool(b)) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> logic_or_uu(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A or B is nonzero, else 0
+	return value<BitsY> { (bool(a) | bool(b)) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> logic_or_ss(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A or B is nonzero; same body as logic_or_uu
+	return value<BitsY> { (bool(a) | bool(b)) ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shl_uu(const value<BitsA> &a, const value<BitsB> &b) { // zero-extend/truncate A to BitsY, then shift left by B
+	return a.template zcast<BitsY>().shl(b); // no `template` before shl: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shl_su(const value<BitsA> &a, const value<BitsB> &b) { // sign-extend/truncate A to BitsY, then shift left by B
+	return a.template scast<BitsY>().shl(b); // no `template` before shl: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> sshl_uu(const value<BitsA> &a, const value<BitsB> &b) { // same computation as shl_uu
+	return a.template zcast<BitsY>().shl(b); // no `template` before shl: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> sshl_su(const value<BitsA> &a, const value<BitsB> &b) { // same computation as shl_su
+	return a.template scast<BitsY>().shl(b); // no `template` before shl: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shr_uu(const value<BitsA> &a, const value<BitsB> &b) { // logical right shift of A by B, then zero-extend/truncate to BitsY
+	return a.shr(b).template zcast<BitsY>(); // no `template` before shr: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shr_su(const value<BitsA> &a, const value<BitsB> &b) { // logical right shift of A by B, then sign-extend/truncate to BitsY
+	return a.shr(b).template scast<BitsY>(); // no `template` before shr: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> sshr_uu(const value<BitsA> &a, const value<BitsB> &b) { // A is unsigned, so this is the same computation as shr_uu
+	return a.shr(b).template zcast<BitsY>(); // no `template` before shr: its argument is deduced, and the keyword requires a template-id
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> sshr_su(const value<BitsA> &a, const value<BitsB> &b) { // arithmetic right shift of signed A by B, then sign-extend/truncate to BitsY
+	return (a.is_neg() ? a.bit_not().shr(b).bit_not() : a.shr(b)).template scast<BitsY>(); // negative A: ~(~A >> B) fills the vacated bits with ones
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shift_uu(const value<BitsA> &a, const value<BitsB> &b) { // B is unsigned here, so this always shifts right; delegates to shr_uu
+	return shr_uu<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shift_su(const value<BitsA> &a, const value<BitsB> &b) { // B is unsigned here, so this always shifts right; delegates to shr_su
+	return shr_su<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shift_us(const value<BitsA> &a, const value<BitsB> &b) { // signed B: negative shifts left by -B (negated at BitsB + 1 bits), otherwise shifts right by B
+	return b.is_neg() ? shl_uu<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_uu<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shift_ss(const value<BitsA> &a, const value<BitsB> &b) { // signed B: negative shifts left by -B (negated at BitsB + 1 bits), otherwise shifts right by B
+	return b.is_neg() ? shl_su<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_su<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shiftx_uu(const value<BitsA> &a, const value<BitsB> &b) { // delegates to shift_uu
+	return shift_uu<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shiftx_su(const value<BitsA> &a, const value<BitsB> &b) { // delegates to shift_su
+	return shift_su<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shiftx_us(const value<BitsA> &a, const value<BitsB> &b) { // delegates to shift_us
+	return shift_us<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> shiftx_ss(const value<BitsA> &a, const value<BitsB> &b) { // delegates to shift_ss
+	return shift_ss<BitsY>(a, b);
+}
+
+// Comparison operations
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> eq_uu(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A == B, else 0
+	constexpr size_t BitsExt = max(BitsA, BitsB); // compare at the width of the wider operand
+	return value<BitsY>{ a.template zext<BitsExt>() == b.template zext<BitsExt>() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> eq_ss(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A == B as signed values, else 0
+	constexpr size_t BitsExt = max(BitsA, BitsB); // compare at the width of the wider operand
+	return value<BitsY>{ a.template sext<BitsExt>() == b.template sext<BitsExt>() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> ne_uu(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A != B, else 0
+	constexpr size_t BitsExt = max(BitsA, BitsB); // compare at the width of the wider operand
+	return value<BitsY>{ a.template zext<BitsExt>() != b.template zext<BitsExt>() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> ne_ss(const value<BitsA> &a, const value<BitsB> &b) { // Y = 1 if A != B as signed values, else 0
+	constexpr size_t BitsExt = max(BitsA, BitsB); // compare at the width of the wider operand
+	return value<BitsY>{ a.template sext<BitsExt>() != b.template sext<BitsExt>() ? 1u : 0u };
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> eqx_uu(const value<BitsA> &a, const value<BitsB> &b) { // delegates to eq_uu
+	return eq_uu<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> eqx_ss(const value<BitsA> &a, const value<BitsB> &b) { // delegates to eq_ss
+	return eq_ss<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> nex_uu(const value<BitsA> &a, const value<BitsB> &b) { // delegates to ne_uu
+	return ne_uu<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+value<BitsY> nex_ss(const value<BitsA> &a, const value<BitsB> &b) { // delegates to ne_ss
+	return ne_ss<BitsY>(a, b);
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Greater than" on zext-widened operands, expressed through ucmp with the operands swapped.
+value<BitsY> gt_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u }; // Tests b.ucmp(a); ucmp is defined outside this chunk (presumably unsigned "less than" -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Greater than" on sext-widened operands, expressed through scmp with the operands swapped.
+value<BitsY> gt_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u }; // Tests b.scmp(a); scmp is defined outside this chunk (presumably signed "less than" -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Greater or equal" on zext-widened operands: the negation of the lt_uu test.
+value<BitsY> ge_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { !a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u }; // 1u exactly when a.ucmp(b) is false.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Greater or equal" on sext-widened operands: the negation of the lt_ss test.
+value<BitsY> ge_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { !a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u }; // 1u exactly when a.scmp(b) is false.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Less than" on zext-widened operands, expressed directly through ucmp.
+value<BitsY> lt_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u }; // Tests a.ucmp(b); ucmp is defined outside this chunk (presumably unsigned "less than" -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Less than" on sext-widened operands, expressed directly through scmp.
+value<BitsY> lt_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u }; // Tests a.scmp(b); scmp is defined outside this chunk (presumably signed "less than" -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Less or equal" on zext-widened operands: the negation of the gt_uu test.
+value<BitsY> le_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { !b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u }; // 1u exactly when b.ucmp(a) is false.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // "Less or equal" on sext-widened operands: the negation of the gt_ss test.
+value<BitsY> le_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t BitsExt = max(BitsA, BitsB); // Common comparison width: the wider of the two operands.
+	return value<BitsY> { !b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u }; // 1u exactly when b.scmp(a) is false.
+}
+
+// Arithmetic operations
+template<size_t BitsY, size_t BitsA> // Unary plus, `u` variant: only resizes the operand to the result width.
+value<BitsY> pos_u(const value<BitsA> &a) {
+	return a.template zcast<BitsY>(); // zcast is defined outside this chunk (presumably zero-extend-or-truncate -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA> // Unary plus, `s` variant: only resizes the operand to the result width.
+value<BitsY> pos_s(const value<BitsA> &a) {
+	return a.template scast<BitsY>(); // scast is defined outside this chunk (presumably sign-extend-or-truncate -- confirm).
+}
+
+template<size_t BitsY, size_t BitsA> // Negation, `u` variant.
+value<BitsY> neg_u(const value<BitsA> &a) {
+	return a.template zcast<BitsY>().neg(); // Resize to BitsY with zcast first, then negate at the result width.
+}
+
+template<size_t BitsY, size_t BitsA> // Negation, `s` variant.
+value<BitsY> neg_s(const value<BitsA> &a) {
+	return a.template scast<BitsY>().neg(); // Resize to BitsY with scast first, then negate at the result width.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Addition, `uu` variant, carried out at the result width.
+value<BitsY> add_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	return a.template zcast<BitsY>().add(b.template zcast<BitsY>()); // Both operands are resized to BitsY with zcast before adding.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Addition, `ss` variant, carried out at the result width.
+value<BitsY> add_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	return a.template scast<BitsY>().add(b.template scast<BitsY>()); // Both operands are resized to BitsY with scast before adding.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Subtraction (a - b), `uu` variant, carried out at the result width.
+value<BitsY> sub_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	return a.template zcast<BitsY>().sub(b.template zcast<BitsY>()); // Both operands are resized to BitsY with zcast before subtracting.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Subtraction (a - b), `ss` variant, carried out at the result width.
+value<BitsY> sub_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	return a.template scast<BitsY>().sub(b.template scast<BitsY>()); // Both operands are resized to BitsY with scast before subtracting.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Shift-and-add multiplication: one partial product is added per set bit of b.
+value<BitsY> mul_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	value<BitsY> product; // Accumulator; relies on value's default constructor for its initial contents (presumably zero -- confirm).
+	value<BitsY> multiplicand = a.template zcast<BitsY>(); // a resized to the result width; shifted left as the scan advances.
+	const value<BitsB> &multiplier = b;
+	uint32_t multiplicand_shift = 0; // Bit positions walked since the multiplicand was last shifted.
+	for (size_t step = 0; step < BitsB; step++) { // Scans the bits of b from index 0 upwards.
+		if (multiplier.bit(step)) {
+			multiplicand = multiplicand.shl(value<32> { multiplicand_shift }); // Catch up with all pending positions in a single shift.
+			product = product.add(multiplicand);
+			multiplicand_shift = 0;
+		}
+		multiplicand_shift++;
+	}
+	return product;
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // `ss` multiplication built on mul_uu: multiply by b's magnitude, then fix the sign.
+value<BitsY> mul_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	value<BitsB + 1> ub = b.template sext<BitsB + 1>(); // One bit wider than b before the magnitude is taken.
+	if (ub.is_neg()) ub = ub.neg(); // ub now holds the negated value whenever b tested negative.
+	value<BitsY> y = mul_uu<BitsY>(a.template scast<BitsY>(), ub); // a is resized with scast to BitsY; only b's sign is handled separately.
+	return b.is_neg() ? y.neg() : y; // Negate the product when b was negative.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Shift-and-subtract division; returns {quotient, remainder}, each BitsY wide.
+std::pair<value<BitsY>, value<BitsY>> divmod_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	constexpr size_t Bits = max(BitsY, max(BitsA, BitsB)); // Working width: the largest of the result and operand widths.
+	value<Bits> quotient;
+	value<Bits> dividend = a.template zext<Bits>();
+	value<Bits> divisor = b.template zext<Bits>();
+	if (dividend.ucmp(divisor)) // Early exit when ucmp reports true (presumably dividend < divisor -- confirm): quotient 0, remainder is the dividend.
+		return {/*quotient=*/value<BitsY> { 0u }, /*remainder=*/dividend.template trunc<BitsY>()};
+	uint32_t divisor_shift = dividend.ctlz() - divisor.ctlz(); // NOTE(review): ctlz is defined outside this chunk; with a conventional count-leading-zeros this difference would be <= 0 on this path and wrap in uint32_t -- confirm ctlz's semantics.
+	divisor = divisor.shl(value<32> { divisor_shift }); // Pre-shift the divisor left; the loop shifts it back right one bit per step.
+	for (size_t step = 0; step <= divisor_shift; step++) { // divisor_shift + 1 iterations, each producing one quotient bit.
+		quotient = quotient.shl(value<1> { 1u });
+		if (!dividend.ucmp(divisor)) { // ucmp false here: subtract the shifted divisor and record a 1 bit.
+			dividend = dividend.sub(divisor);
+			quotient.set_bit(0, true);
+		}
+		divisor = divisor.shr(value<1> { 1u });
+	}
+	return {quotient.template trunc<BitsY>(), /*remainder=*/dividend.template trunc<BitsY>()}; // NOTE(review): no zero-divisor check is visible in this function -- confirm what b == 0 is expected to yield.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // `ss` division built on divmod_uu; returns {quotient, remainder}.
+std::pair<value<BitsY>, value<BitsY>> divmod_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	value<BitsA + 1> ua = a.template sext<BitsA + 1>(); // Each operand is widened by one bit before its magnitude is taken.
+	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
+	if (ua.is_neg()) ua = ua.neg();
+	if (ub.is_neg()) ub = ub.neg();
+	value<BitsY> y, r;
+	std::tie(y, r) = divmod_uu<BitsY>(ua, ub); // Divide after negating whichever operands tested negative.
+	if (a.is_neg() != b.is_neg()) y = y.neg(); // Quotient is negated when exactly one operand is negative.
+	if (a.is_neg()) r = r.neg(); // Remainder is negated when the dividend a is negative.
+	return {y, r};
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Quotient half of divmod_uu.
+value<BitsY> div_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	return divmod_uu<BitsY>(a, b).first; // .first is the quotient; the remainder is computed and discarded.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Quotient half of divmod_ss.
+value<BitsY> div_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	return divmod_ss<BitsY>(a, b).first; // .first is the quotient; the remainder is computed and discarded.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Remainder half of divmod_uu.
+value<BitsY> mod_uu(const value<BitsA> &a, const value<BitsB> &b) {
+	return divmod_uu<BitsY>(a, b).second; // .second is the remainder; the quotient is computed and discarded.
+}
+
+template<size_t BitsY, size_t BitsA, size_t BitsB> // Remainder half of divmod_ss.
+value<BitsY> mod_ss(const value<BitsA> &a, const value<BitsB> &b) {
+	return divmod_ss<BitsY>(a, b).second; // .second is the remainder; the quotient is computed and discarded.
+}
+
+} // namespace cxxrtl_yosys
+
+#endif
diff --git a/kernel/yosys.cc b/kernel/yosys.cc
index 380f7030b..cbf041f79 100644
--- a/kernel/yosys.cc
+++ b/kernel/yosys.cc
@@ -1034,6 +1034,8 @@ void run_backend(std::string filename, std::string command, RTLIL::Design *desig
 			command = "verilog";
 		else if (filename.size() > 3 && filename.compare(filename.size()-3, std::string::npos, ".il") == 0)
 			command = "ilang";
+		else if (filename.size() > 3 && filename.compare(filename.size()-3, std::string::npos, ".cc") == 0)
+			command = "cxxrtl";
 		else if (filename.size() > 4 && filename.compare(filename.size()-4, std::string::npos, ".aig") == 0)
 			command = "aiger";
 		else if (filename.size() > 5 && filename.compare(filename.size()-5, std::string::npos, ".blif") == 0)
diff --git a/kernel/yosys.h b/kernel/yosys.h
index 16e0aaf1c..6aed7c96a 100644
--- a/kernel/yosys.h
+++ b/kernel/yosys.h
@@ -207,6 +207,7 @@ namespace RTLIL {
 	struct SigSpec;
 	struct Wire;
 	struct Cell;
+	struct Memory;
 	struct Module;
 	struct Design;
 	struct Monitor;
@@ -229,6 +230,7 @@ using RTLIL::Design;
 namespace hashlib {
 	template<> struct hash_ops<RTLIL::Wire*> : hash_obj_ops {};
 	template<> struct hash_ops<RTLIL::Cell*> : hash_obj_ops {};
+	template<> struct hash_ops<RTLIL::Memory*> : hash_obj_ops {};
 	template<> struct hash_ops<RTLIL::Module*> : hash_obj_ops {};
 	template<> struct hash_ops<RTLIL::Design*> : hash_obj_ops {};
 	template<> struct hash_ops<RTLIL::Monitor*> : hash_obj_ops {};
@@ -236,6 +238,7 @@ namespace hashlib {
 
 	template<> struct hash_ops<const RTLIL::Wire*> : hash_obj_ops {};
 	template<> struct hash_ops<const RTLIL::Cell*> : hash_obj_ops {};
+	template<> struct hash_ops<const RTLIL::Memory*> : hash_obj_ops {};
 	template<> struct hash_ops<const RTLIL::Module*> : hash_obj_ops {};
 	template<> struct hash_ops<const RTLIL::Design*> : hash_obj_ops {};
 	template<> struct hash_ops<const RTLIL::Monitor*> : hash_obj_ops {};

From d6d727342112eb89451407bd1d9954b8279bd015 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Mon, 9 Dec 2019 19:05:52 +0000
Subject: [PATCH 02/10] write_cxxrtl: elide wires for results of comb cells
 used once.

This results in massive gains in performance, equally massive
reduction in compile time, and improved readability.
---
 backends/cxxrtl/cxxrtl.cc | 394 ++++++++++++++++++++++++++++++++++----
 1 file changed, 359 insertions(+), 35 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 2dc7b3d36..43d973ade 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -26,7 +26,175 @@
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
 
+static bool is_unary_cell(RTLIL::IdString type)
+{
+	return type.in(
+		ID($not), ID($logic_not), ID($reduce_and), ID($reduce_or), ID($reduce_xor), ID($reduce_xnor), ID($reduce_bool),
+		ID($pos), ID($neg)); // Cell types that dump_cell_elided() prints as a one-operand call on port A.
+}
+
+static bool is_binary_cell(RTLIL::IdString type)
+{
+	return type.in(
+		ID($and), ID($or), ID($xor), ID($xnor), ID($logic_and), ID($logic_or),
+		ID($shl), ID($sshl), ID($shr), ID($sshr), ID($shift), ID($shiftx),
+		ID($eq), ID($ne), ID($eqx), ID($nex), ID($gt), ID($ge), ID($lt), ID($le),
+		ID($add), ID($sub), ID($mul), ID($div), ID($mod)); // Cell types that dump_cell_elided() prints as a two-operand call on ports A and B.
+}
+
+static bool is_elidable_cell(RTLIL::IdString type)
+{
+	return is_unary_cell(type) || is_binary_cell(type) || type == ID($mux); // Exactly the cell types dump_cell_elided() can print as an expression.
+}
+
+static bool is_ff_cell(RTLIL::IdString type)
+{
+	return type.in(
+		ID($dff), ID($dffe), ID($adff), ID($dffsr)); // Flip-flop cell types; FlowGraph records no defs for their outputs.
+}
+
+struct FlowGraph { // Records which nodes define and which nodes use each wire, so that wires eligible for elision can be identified.
+	struct Node { // One connection, cell or process; add_node() fills in `type` and the matching payload field.
+		enum class Type {
+			CONNECT,
+			CELL,
+			PROCESS
+		};
+
+		Type type;
+		RTLIL::SigSig connect = {};
+		const RTLIL::Cell *cell = NULL;
+		const RTLIL::Process *process = NULL;
+	};
+
+	std::vector<Node*> nodes; // Owns the nodes: allocated with new in add_node(), freed in the destructor.
+	dict<const RTLIL::Wire*, pool<Node*, hash_ptr_ops>> wire_defs, wire_uses; // Per wire: the nodes that drive it / read it.
+	dict<const RTLIL::Wire*, bool> wire_def_elidable, wire_use_elidable; // Per wire: the verdicts that is_elidable() combines.
+
+	~FlowGraph()
+	{
+		for (auto node : nodes)
+			delete node;
+	}
+
+	void add_defs(Node *node, const RTLIL::SigSpec &sig, bool elidable) // Registers `node` as a definer of every wire that `sig` touches.
+	{
+		for (auto chunk : sig.chunks())
+			if (chunk.wire)
+				wire_defs[chunk.wire].insert(node);
+		// Only defs of an entire wire in the right order can be elided.
+		if (sig.is_wire())
+			wire_def_elidable[sig.as_wire()] = elidable; // Plain assignment: a later def of the same wire overwrites the verdict.
+	}
+
+	void add_uses(Node *node, const RTLIL::SigSpec &sig) // Registers `node` as a reader of every wire that `sig` touches.
+	{
+		for (auto chunk : sig.chunks())
+			if (chunk.wire) {
+				wire_uses[chunk.wire].insert(node);
+				// Only a single use of an entire wire in the right order can be elided.
+				// (But the use can include other chunks.)
+				if (!wire_use_elidable.count(chunk.wire))
+					wire_use_elidable[chunk.wire] = true; // First time this wire is seen in a use.
+				else
+					wire_use_elidable[chunk.wire] = false; // Seen again (by any node, including this one): no longer a single use.
+			}
+	}
+
+	bool is_elidable(const RTLIL::Wire *wire) const // True only if both a def verdict and a use verdict were recorded for `wire` and both are true.
+	{
+		if (wire_def_elidable.count(wire) && wire_use_elidable.count(wire))
+			return wire_def_elidable.at(wire) && wire_use_elidable.at(wire);
+		return false;
+	}
+
+	// Connections
+	void add_connect_defs_uses(Node *node, const RTLIL::SigSig &conn) // conn.first is recorded as the defined side, conn.second as the used side.
+	{
+		add_defs(node, conn.first, /*elidable=*/true);
+		add_uses(node, conn.second);
+	}
+
+	void add_node(const RTLIL::SigSig &conn)
+	{
+		Node *node = new Node;
+		node->type = Node::Type::CONNECT;
+		node->connect = conn;
+		nodes.push_back(node);
+		add_connect_defs_uses(node, conn);
+	}
+
+	// Cells
+	void add_cell_defs_uses(Node *node, const RTLIL::Cell *cell) // Output ports become defs (none for flip-flops); input ports become uses.
+	{
+		log_assert(cell->known());
+		for (auto conn : cell->connections()) {
+			if (cell->output(conn.first)) {
+				if (is_ff_cell(cell->type))
+					/* non-combinatorial outputs do not introduce defs */;
+				else if (is_elidable_cell(cell->type))
+					add_defs(node, conn.second, /*elidable=*/true);
+				else
+					add_defs(node, conn.second, /*elidable=*/false); // Any other cell type still defines the wire, but the def is not elidable.
+			}
+			if (cell->input(conn.first)) // Not an else-branch: evaluated whether or not the port was an output.
+				add_uses(node, conn.second);
+		}
+	}
+
+	void add_node(const RTLIL::Cell *cell)
+	{
+		Node *node = new Node;
+		node->type = Node::Type::CELL;
+		node->cell = cell;
+		nodes.push_back(node);
+		add_cell_defs_uses(node, cell);
+	}
+
+	// Processes
+	void add_case_defs_uses(Node *node, const RTLIL::CaseRule *case_) // Recursive walk of a case rule: its actions, then every nested switch and its cases.
+	{
+		for (auto &action : case_->actions) {
+			add_defs(node, action.first, /*elidable=*/false); // Defs made by processes are never elidable.
+			add_uses(node, action.second);
+		}
+		for (auto sub_switch : case_->switches) {
+			add_uses(node, sub_switch->signal); // The switched-on signal is read by the process.
+			for (auto sub_case : sub_switch->cases) {
+				for (auto &compare : sub_case->compare)
+					add_uses(node, compare); // So is every value it is compared against.
+				add_case_defs_uses(node, sub_case);
+			}
+		}
+	}
+
+	void add_process_defs_uses(Node *node, const RTLIL::Process *process) // Root case first, then the actions of every sync rule.
+	{
+		add_case_defs_uses(node, &process->root_case);
+		for (auto sync : process->syncs)
+			for (auto action : sync->actions) {
+				if (sync->type == RTLIL::STp || sync->type == RTLIL::STn || sync->type == RTLIL::STe) // These three sync types contribute uses only.
+				  /* sync actions do not introduce feedback */;
+				else
+					add_defs(node, action.first, /*elidable=*/false);
+				add_uses(node, action.second);
+			}
+	}
+
+	void add_node(const RTLIL::Process *process)
+	{
+		Node *node = new Node;
+		node->type = Node::Type::PROCESS;
+		node->process = process;
+		nodes.push_back(node);
+		add_process_defs_uses(node, process);
+	}
+};
+
 struct CxxrtlWorker {
+	bool elide_internal = false;
+	bool elide_public = false;
+
 	std::ostream &f;
 	std::string indent;
 	int temporary = 0;
@@ -35,6 +203,7 @@ struct CxxrtlWorker {
 	pool<const RTLIL::Wire*> sync_wires;
 	dict<RTLIL::SigBit, RTLIL::SyncType> sync_types;
 	pool<const RTLIL::Memory*> writable_memories;
+	dict<const RTLIL::Wire*, FlowGraph::Node> elided_wires;
 
 	CxxrtlWorker(std::ostream &f) : f(f) {}
 
@@ -183,7 +352,21 @@ struct CxxrtlWorker {
 			dump_const(chunk.data, chunk.width, chunk.offset);
 			return false;
 		} else {
-			f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
+			if (!is_lhs && elided_wires.count(chunk.wire)) {
+				const FlowGraph::Node &node = elided_wires[chunk.wire];
+				switch (node.type) {
+					case FlowGraph::Node::Type::CONNECT:
+						dump_connect_elided(node.connect);
+						break;
+					case FlowGraph::Node::Type::CELL:
+						dump_cell_elided(node.cell);
+						break;
+					default:
+						log_assert(false);
+				}
+			} else {
+				f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
+			}
 			if (chunk.width == chunk.wire->width && chunk.offset == 0)
 				return false;
 			else if (chunk.width == 1)
@@ -228,56 +411,134 @@ struct CxxrtlWorker {
 			f << ".val()";
 	}
 
-	void dump_assign(const RTLIL::SigSig &sigsig)
+	void collect_sigspec_rhs(const RTLIL::SigSpec &sig, std::vector<RTLIL::IdString> &cells) // For each chunk of `sig` on an elided wire, follows the wire's defining node and appends the names of the cells reached to `cells`.
 	{
+		for (auto chunk : sig.chunks()) {
+			if (!chunk.wire || !elided_wires.count(chunk.wire))
+				continue;
+
+			const FlowGraph::Node &node = elided_wires[chunk.wire];
+			switch (node.type) {
+				case FlowGraph::Node::Type::CONNECT:
+					collect_connect(node.connect, cells);
+					break;
+				case FlowGraph::Node::Type::CELL:
+					collect_cell(node.cell, cells);
+					break;
+				default:
+					log_assert(false);
+			}
+		}
+	}
+
+	void dump_connect_elided(const RTLIL::SigSig &conn) // Emits only the right-hand side of `conn`, via dump_sigspec_rhs; no indent, assignment or terminator.
+	{
+		dump_sigspec_rhs(conn.second);
+	}
+
+	bool is_connect_elided(const RTLIL::SigSig &conn) // True when the left-hand side passes SigSpec::is_wire() and that wire is in elided_wires.
+	{
+		return conn.first.is_wire() && elided_wires.count(conn.first.as_wire());
+	}
+
+	void collect_connect(const RTLIL::SigSig &conn, std::vector<RTLIL::IdString> &cells) // No-op unless `conn` is elided; otherwise collects from its right-hand side.
+	{
+		if (!is_connect_elided(conn))
+			return;
+
+		collect_sigspec_rhs(conn.second, cells);
+	}
+
+	void dump_connect(const RTLIL::SigSig &conn)
+	{
+		if (is_connect_elided(conn))
+			return;
+
 		f << indent;
-		dump_sigspec_lhs(sigsig.first);
+		dump_sigspec_lhs(conn.first);
 		f << " = ";
-		dump_sigspec_rhs(sigsig.second);
+		dump_connect_elided(conn);
 		f << ";\n";
 	}
 
-	void dump_cell(const RTLIL::Cell *cell)
+	void dump_cell_elided(const RTLIL::Cell *cell)
 	{
-		dump_attrs(cell);
-		f << indent << "// cell " << cell->name.str() << "\n";
 		// Unary cells
-		if (cell->type.in(
-		    ID($not), ID($logic_not), ID($reduce_and), ID($reduce_or), ID($reduce_xor), ID($reduce_xnor), ID($reduce_bool),
-		    ID($pos), ID($neg))) {
-			f << indent;
-			dump_sigspec_lhs(cell->getPort(ID(Y)));
-			f << " = " << cell->type.substr(1) << '_' <<
+		if (is_unary_cell(cell->type)) {
+			f << cell->type.substr(1) << '_' <<
 			     (cell->getParam(ID(A_SIGNED)).as_bool() ? 's' : 'u') <<
 			     "<" << cell->getParam(ID(Y_WIDTH)).as_int() << ">(";
 			dump_sigspec_rhs(cell->getPort(ID(A)));
-			f << ");\n";
+			f << ")";
 		// Binary cells
-		} else if (cell->type.in(
-		    ID($and), ID($or), ID($xor), ID($xnor), ID($logic_and), ID($logic_or),
-		    ID($shl), ID($sshl), ID($shr), ID($sshr), ID($shift), ID($shiftx),
-		    ID($eq), ID($ne), ID($eqx), ID($nex), ID($gt), ID($ge), ID($lt), ID($le),
-		    ID($add), ID($sub), ID($mul), ID($div), ID($mod))) {
-			f << indent;
-			dump_sigspec_lhs(cell->getPort(ID(Y)));
-			f << " = " << cell->type.substr(1) << '_' <<
+		} else if (is_binary_cell(cell->type)) {
+			f << cell->type.substr(1) << '_' <<
 			     (cell->getParam(ID(A_SIGNED)).as_bool() ? 's' : 'u') <<
 			     (cell->getParam(ID(B_SIGNED)).as_bool() ? 's' : 'u') <<
 			     "<" << cell->getParam(ID(Y_WIDTH)).as_int() << ">(";
 			dump_sigspec_rhs(cell->getPort(ID(A)));
 			f << ", ";
 			dump_sigspec_rhs(cell->getPort(ID(B)));
-			f << ");\n";
+			f << ")";
 		// Muxes
 		} else if (cell->type == ID($mux)) {
-			f << indent;
-			dump_sigspec_lhs(cell->getPort(ID(Y)));
-			f << " = ";
+			f << "(";
 			dump_sigspec_rhs(cell->getPort(ID(S)));
 			f << " ? ";
 			dump_sigspec_rhs(cell->getPort(ID(B)));
 			f << " : ";
 			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ")";
+		} else {
+			log_assert(false);
+		}
+	}
+
+	bool is_cell_elided(const RTLIL::Cell *cell) // True when the cell has a Y port that passes SigSpec::is_wire() and that wire is in elided_wires.
+	{
+		return cell->hasPort(ID(Y)) && cell->getPort(ID(Y)).is_wire() && elided_wires.count(cell->getPort(ID(Y)).as_wire());
+	}
+
+	void collect_cell(const RTLIL::Cell *cell, std::vector<RTLIL::IdString> &cells) // No-op unless `cell` is elided; otherwise appends its name and recurses into every port except Y.
+	{
+		if (!is_cell_elided(cell))
+			return;
+
+		cells.push_back(cell->name);
+		for (auto port : cell->connections())
+			if (port.first != ID(Y))
+				collect_sigspec_rhs(port.second, cells);
+	}
+
+	void dump_cell(const RTLIL::Cell *cell)
+	{
+		if (is_cell_elided(cell))
+			return;
+		if (cell->type == ID($meminit))
+			return; // Handled elsewhere.
+
+		std::vector<RTLIL::IdString> elided_cells;
+		if (is_elidable_cell(cell->type)) {
+			for (auto port : cell->connections())
+				if (port.first != ID(Y))
+					collect_sigspec_rhs(port.second, elided_cells);
+		}
+		if (elided_cells.empty()) {
+			dump_attrs(cell);
+			f << indent << "// cell " << cell->name.str() << "\n";
+		} else {
+			f << indent << "// cells";
+			for (auto elided_cell : elided_cells)
+				f << " " << elided_cell.str();
+			f << "\n";
+		}
+
+		// Elidable cells
+		if (is_elidable_cell(cell->type)) {
+			f << indent;
+			dump_sigspec_lhs(cell->getPort(ID(Y)));
+			f << " = ";
+			dump_cell_elided(cell);
 			f << ";\n";
 		// Parallel (one-hot) muxes
 		} else if (cell->type == ID($pmux)) {
@@ -309,7 +570,7 @@ struct CxxrtlWorker {
 			dec_indent();
 			f << indent << "}\n";
 		// Flip-flops
-		} else if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
+		} else if (is_ff_cell(cell->type)) {
 			if (cell->getPort(ID(CLK)).is_wire()) {
 				// Edge-sensitive logic
 				RTLIL::SigBit clk_bit = cell->getPort(ID(CLK))[0];
@@ -437,8 +698,6 @@ struct CxxrtlWorker {
 				f << indent << "}\n";
 			}
 		// Memory initializers
-		} else if (cell->type == ID($meminit)) {
-			// Handled elsewhere.
 		} else if (cell->type[0] == '$') {
 			log_cmd_error("Unsupported internal cell `%s'.\n", cell->type.c_str());
 		} else {
@@ -446,6 +705,15 @@ struct CxxrtlWorker {
 		}
 	}
 
+	void dump_assign(const RTLIL::SigSig &sigsig) // Emits one statement: indent, left-hand side of `sigsig`, " = ", right-hand side, ";" and a newline.
+	{
+		f << indent;
+		dump_sigspec_lhs(sigsig.first);
+		f << " = ";
+		dump_sigspec_rhs(sigsig.second);
+		f << ";\n";
+	}
+
 	void dump_case_rule(const RTLIL::CaseRule *rule)
 	{
 		for (auto action : rule->actions)
@@ -571,6 +839,9 @@ struct CxxrtlWorker {
 
 	void dump_wire(const RTLIL::Wire *wire)
 	{
+		if (elided_wires.count(wire))
+			return;
+
 		dump_attrs(wire);
 		f << indent << "wire<" << wire->width << "> " << mangle(wire);
 		if (wire->attributes.count(ID(init))) {
@@ -661,7 +932,7 @@ struct CxxrtlWorker {
 				dump_cell(cell);
 			f << indent << "// connections\n";
 			for (auto conn : module->connections())
-				dump_assign(conn);
+				dump_connect(conn);
 			for (auto proc : module->processes)
 				dump_process(proc.second);
 			for (auto sync_type : sync_types) {
@@ -680,6 +951,8 @@ struct CxxrtlWorker {
 		inc_indent();
 			f << indent << "bool changed = false;\n";
 			for (auto wire : module->wires()) {
+				if (elided_wires.count(wire))
+					continue;
 				if (sync_wires[wire]) {
 					std::string wire_prev = mangle(wire) + "_prev";
 					std::string wire_curr = mangle(wire) + ".curr";
@@ -773,10 +1046,16 @@ struct CxxrtlWorker {
 	void analyze_design(RTLIL::Design *design)
 	{
 		for (auto module : design->modules()) {
+			FlowGraph flow;
 			SigMap &sigmap = sigmaps[module];
 			sigmap.set(module);
 
+			for (auto conn : module->connections())
+				flow.add_node(conn);
+
 			for (auto cell : module->cells()) {
+				flow.add_node(cell);
+
 				// Various DFF cells are treated like posedge/negedge processes, see above for details.
 				if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
 					if (cell->getPort(ID(CLK)).is_wire())
@@ -802,7 +1081,9 @@ struct CxxrtlWorker {
 					log_assert(false);
 			}
 
-			for (auto proc : module->processes)
+			for (auto proc : module->processes) {
+				flow.add_node(proc.second);
+
 				for (auto sync : proc.second->syncs)
 					switch (sync->type) {
 						// Edge-type sync rules require pre-registration.
@@ -826,6 +1107,18 @@ struct CxxrtlWorker {
 						case RTLIL::STg:
 							log_cmd_error("Global clock is not supported.\n");
 					}
+			}
+
+			for (auto wire : module->wires()) {
+				if (!flow.is_elidable(wire)) continue;
+				if (wire->port_id != 0) continue;
+				if (wire->get_bool_attribute(ID(keep))) continue;
+				if (wire->name.begins_with("$") && !elide_internal) continue;
+				if (wire->name.begins_with("\\") && !elide_public) continue;
+				if (sync_wires[wire]) continue;
+				log_assert(flow.wire_defs[wire].size() == 1);
+				elided_wires[wire] = **flow.wire_defs[wire].begin();
+			}
 		}
 	}
 
@@ -879,23 +1172,54 @@ struct CxxrtlBackend : public Backend {
 		log("\n");
 		log("Write C++ code for simulating the design.\n");
 		log("\n");
+		// -O2 (and not -O1) is the default because wire elision results in dramatic (>10x) decrease in compile- and run-time,
+		// which is well worth the need to manually drop to -O1 or to mark interesting wires with (*keep*).
+		log("    -O <level>\n");
+		log("        set the optimization level. the default is -O2.\n");
+		log("\n");
+		log("    -O0\n");
+		log("        no optimization.\n");
+		log("\n");
+		log("    -O1\n");
+		log("        elide internal wires if possible.\n");
+		log("\n");
+		log("    -O2\n");
+		log("        like -O1, and elide public wires not marked (*keep*) if possible.\n");
+		log("\n");
 	}
 	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
 	{
+		int opt_level = 2;
+
 		log_header(design, "Executing CXXRTL backend.\n");
 
 		size_t argidx;
 		for (argidx = 1; argidx < args.size(); argidx++)
 		{
-			// if (args[argidx] == "-top" && argidx+1 < args.size()) {
-			// 	top_module_name = args[++argidx];
-			// 	continue;
-			// }
+			if (args[argidx] == "-O" && argidx+1 < args.size()) {
+				opt_level = std::stoi(args[++argidx]);
+				continue;
+			}
+			if (args[argidx].substr(0, 2) == "-O" && args[argidx].size() == 3 && isdigit(args[argidx][2])) {
+				opt_level = std::stoi(args[argidx].substr(2));
+				continue;
+			}
 			break;
 		}
 		extra_args(f, filename, args, argidx);
 
 		CxxrtlWorker worker(*f);
+		switch (opt_level) {
+			case 2:
+				worker.elide_public = true;
+			case 1:
+				worker.elide_internal = true;
+			case 0:
+				break;
+			default:
+				log_cmd_error("Invalid optimization level %d.\n", opt_level);
+		}
+
 		worker.prepare_design(design);
 		worker.dump_design(design);
 	}

From 5157691f0eca5c5312524483491309a7e07d9710 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Tue, 10 Dec 2019 20:09:24 +0000
Subject: [PATCH 03/10] write_cxxrtl: statically schedule comb logic and
 localize wires.

This results in further massive gains in performance, modest decrease
in compile time, and, for designs without feedback arcs, makes it
possible to run eval() once per clock edge in certain conditions.
---
 backends/cxxrtl/cxxrtl.cc | 432 ++++++++++++++++++++++++++++++++------
 backends/cxxrtl/cxxrtl.h  |   4 +
 2 files changed, 368 insertions(+), 68 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 43d973ade..94da61a2c 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -26,6 +26,150 @@
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
 
+// [[CITE]]
+// Peter Eades; Xuemin Lin; W. F. Smyth, "A Fast Effective Heuristic For The Feedback Arc Set Problem"
+// Information Processing Letters, Vol. 47, pp 319-323, 1993
+// https://pdfs.semanticscholar.org/c7ed/d9acce96ca357876540e19664eb9d976637f.pdf
+
+// A topological sort (on a cell/wire graph) is always possible in a fully flattened RTLIL design without
+// processes or logic loops where every wire has a single driver. Logic loops are illegal in RTLIL and wires
+// with multiple drivers can be split by the `splitnets` pass; however, interdependencies between processes
+// or module instances can create strongly connected components without introducing evaluation nondeterminism.
+// We wish to support designs with such benign SCCs (as well as designs with multiple drivers per wire), so
+// we sort the graph in a way that minimizes feedback arcs. If there are no feedback arcs in the sorted graph,
+// then a more efficient evaluation method is possible, since eval() will always immediately converge.
+template<class T> // T is the payload type; this struct stores T* only and never dereferences it.
+struct Scheduler {
+	struct Vertex { // Graph vertex that doubles as a node of a circular doubly-linked list (sources, sinks, or one bin).
+		T *data;
+		Vertex *prev, *next;
+		pool<Vertex*, hash_ptr_ops> preds, succs;
+
+		Vertex() : data(NULL), prev(this), next(this) {} // List head: no payload, linked to itself (an empty list).
+		Vertex(T *data) : data(data), prev(NULL), next(NULL) {} // Payload vertex: starts out on no list.
+
+		bool empty() const // Only valid on a list head (asserts data == NULL); true when the head points at itself.
+		{
+			log_assert(data == NULL);
+			if (next == this) {
+				log_assert(prev == next);
+				return true;
+			}
+			return false;
+		}
+
+		void link(Vertex *list) // Appends this (currently unlinked) vertex at the tail of `list`, i.e. just before the head.
+		{
+			log_assert(prev == NULL && next == NULL);
+			next = list;
+			prev = list->prev;
+			list->prev->next = this;
+			list->prev = this;
+		}
+
+		void unlink() // Removes this vertex from whichever list it is on and resets its links to NULL.
+		{
+			log_assert(prev->next == this && next->prev == this);
+			prev->next = next;
+			next->prev = prev;
+			next = prev = NULL;
+		}
+
+		int delta() const // Number of successors minus number of predecessors.
+		{
+			return succs.size() - preds.size();
+		}
+	};
+
+	std::vector<Vertex*> vertices; // Owns every payload vertex created by add().
+	Vertex *sources = new Vertex; // List head for vertices relink() classifies as sources.
+	Vertex *sinks = new Vertex; // List head for vertices relink() classifies as sinks.
+	dict<int, Vertex*> bins; // List heads for all other vertices, keyed by delta(); created on demand in relink().
+
+	~Scheduler()
+	{
+		delete sources;
+		delete sinks;
+		for (auto bin : bins)
+			delete bin.second;
+		for (auto vertex : vertices)
+			delete vertex;
+	}
+
+	Vertex *add(T *data) // Creates an unlinked vertex for `data`; the caller fills in preds/succs before schedule().
+	{
+		Vertex *vertex = new Vertex(data);
+		vertices.push_back(vertex);
+		return vertex;
+	}
+
+	void relink(Vertex *vertex) // Places an unlinked vertex on the list matching its current edges.
+	{
+		if (vertex->succs.empty()) // Checked first, so a vertex with no edges at all is treated as a sink.
+			vertex->link(sinks);
+		else if (vertex->preds.empty())
+			vertex->link(sources);
+		else {
+			int delta = vertex->delta();
+			if (!bins.count(delta))
+				bins[delta] = new Vertex;
+			vertex->link(bins[delta]);
+		}
+	}
+
+	Vertex *remove(Vertex *vertex) // Takes `vertex` out of the graph: unlinks it, drops its edges from every neighbour, and re-buckets each neighbour.
+	{
+		vertex->unlink();
+		for (auto pred : vertex->preds) {
+			if (pred == vertex) // Skip self-loops; `vertex` is already off its list.
+				continue;
+			log_assert(pred->succs[vertex]);
+			pred->unlink();
+			pred->succs.erase(vertex);
+			relink(pred);
+		}
+		for (auto succ : vertex->succs) {
+			if (succ == vertex) // Skip self-loops; `vertex` is already off its list.
+				continue;
+			log_assert(succ->preds[vertex]);
+			succ->unlink();
+			succ->preds.erase(vertex);
+			relink(succ);
+		}
+		vertex->preds.clear();
+		vertex->succs.clear();
+		return vertex;
+	}
+
+	std::vector<Vertex*> schedule() // Returns every vertex in scheduled order; all edges have been removed from the graph by the time it returns.
+	{
+		std::vector<Vertex*> s1, s2r; // s1 is built front to back; s2r collects sinks and is appended reversed at the end.
+		for (auto vertex : vertices)
+			relink(vertex);
+		bool bins_empty = false;
+		while (!(sinks->empty() && sources->empty() && bins_empty)) {
+			while (!sinks->empty())
+				s2r.push_back(remove(sinks->next));
+			while (!sources->empty())
+				s1.push_back(remove(sources->next));
+			// Choosing u in this implementation isn't O(1), but the paper handwaves which data structure they suggest
+			// using to get O(1) relinking *and* find-max-key ("it is clear"... no it isn't), so this code uses a very
+			// naive implementation of find-max-key.
+			bins_empty = true;
+			bins.template sort<std::greater<int>>(); // Orders the bins by key for the scan below; dict's sort/iteration order is defined outside this file -- confirm it yields the largest key first.
+			for (auto bin : bins) {
+				if (!bin.second->empty()) {
+					bins_empty = false;
+					s1.push_back(remove(bin.second->next)); // Take one vertex from the first non-empty bin, then go back to draining sinks and sources.
+					break;
+				}
+			}
+		}
+		s1.insert(s1.end(), s2r.rbegin(), s2r.rend()); // Final order: s1, followed by the sinks in reverse order of removal.
+		return s1;
+	}
+};
+
 static bool is_unary_cell(RTLIL::IdString type)
 {
 	return type.in(
@@ -115,13 +259,14 @@ struct FlowGraph {
 		add_uses(node, conn.second);
 	}
 
-	void add_node(const RTLIL::SigSig &conn)
+	Node *add_node(const RTLIL::SigSig &conn)
 	{
 		Node *node = new Node;
 		node->type = Node::Type::CONNECT;
 		node->connect = conn;
 		nodes.push_back(node);
 		add_connect_defs_uses(node, conn);
+		return node;
 	}
 
 	// Cells
@@ -130,7 +275,7 @@ struct FlowGraph {
 		log_assert(cell->known());
 		for (auto conn : cell->connections()) {
 			if (cell->output(conn.first)) {
-				if (is_ff_cell(cell->type))
+				if (is_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID(CLK_ENABLE)).as_bool()))
 					/* non-combinatorial outputs do not introduce defs */;
 				else if (is_elidable_cell(cell->type))
 					add_defs(node, conn.second, /*elidable=*/true);
@@ -142,13 +287,14 @@ struct FlowGraph {
 		}
 	}
 
-	void add_node(const RTLIL::Cell *cell)
+	Node *add_node(const RTLIL::Cell *cell)
 	{
 		Node *node = new Node;
 		node->type = Node::Type::CELL;
 		node->cell = cell;
 		nodes.push_back(node);
 		add_cell_defs_uses(node, cell);
+		return node;
 	}
 
 	// Processes
@@ -181,19 +327,23 @@ struct FlowGraph {
 			}
 	}
 
-	void add_node(const RTLIL::Process *process)
+	Node *add_node(const RTLIL::Process *process)
 	{
 		Node *node = new Node;
 		node->type = Node::Type::PROCESS;
 		node->process = process;
 		nodes.push_back(node);
 		add_process_defs_uses(node, process);
+		return node;
 	}
 };
 
 struct CxxrtlWorker {
 	bool elide_internal = false;
 	bool elide_public = false;
+	bool localize_internal = false;
+	bool localize_public = false;
+	bool run_splitnets = false;
 
 	std::ostream &f;
 	std::string indent;
@@ -203,7 +353,10 @@ struct CxxrtlWorker {
 	pool<const RTLIL::Wire*> sync_wires;
 	dict<RTLIL::SigBit, RTLIL::SyncType> sync_types;
 	pool<const RTLIL::Memory*> writable_memories;
+	dict<const RTLIL::Cell*, pool<const RTLIL::Cell*>> transparent_for;
 	dict<const RTLIL::Wire*, FlowGraph::Node> elided_wires;
+	dict<const RTLIL::Module*, std::vector<FlowGraph::Node>> schedule;
+	pool<const RTLIL::Wire*> localized_wires;
 
 	CxxrtlWorker(std::ostream &f) : f(f) {}
 
@@ -364,6 +517,8 @@ struct CxxrtlWorker {
 					default:
 						log_assert(false);
 				}
+			} else if (localized_wires[chunk.wire]) {
+				f << mangle(chunk.wire);
 			} else {
 				f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
 			}
@@ -454,6 +609,7 @@ struct CxxrtlWorker {
 		if (is_connect_elided(conn))
 			return;
 
+		f << indent << "// connection\n";
 		f << indent;
 		dump_sigspec_lhs(conn.first);
 		f << " = ";
@@ -649,17 +805,36 @@ struct CxxrtlWorker {
 					f << ") {\n";
 					inc_indent();
 				}
-				f << indent;
-				dump_sigspec_lhs(cell->getPort(ID(DATA)));
-				f << " = " << mangle(memory) << "[";
-				dump_sigspec_rhs(cell->getPort(ID(ADDR)));
 				if (writable_memories[memory]) {
-					// FIXME: the handling of transparent read ports is a bit naughty: normally, nothing on RHS should ever
-					// read from `next`, since this can result in evaluation order nondeterminism, as well as issues with
-					// latches. However, for now this is the right tradeoff to make, since it allows to simplify $memrd/$memwr
-					// codegen dramatically.
-					f << "]." << (cell->getParam(ID(TRANSPARENT)).as_bool() ? "next" : "curr") << ";\n";
+					std::string addr_temp = fresh_temporary();
+					f << indent << "const value<" << cell->getPort(ID(ADDR)).size() << "> &" << addr_temp << " = ";
+					dump_sigspec_rhs(cell->getPort(ID(ADDR)));
+					f << ";\n";
+					std::string lhs_temp = fresh_temporary();
+					f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
+					            << mangle(memory) << "[" << addr_temp << "].curr;\n";
+					for (auto memwr_cell : transparent_for[cell]) {
+						f << indent << "if (" << addr_temp << " == ";
+						dump_sigspec_rhs(memwr_cell->getPort(ID(ADDR)));
+						f << ") {\n";
+						inc_indent();
+							f << indent << lhs_temp << " = " << lhs_temp;
+							f << ".update(";
+							dump_sigspec_rhs(memwr_cell->getPort(ID(EN)));
+							f << ", ";
+							dump_sigspec_rhs(memwr_cell->getPort(ID(DATA)));
+							f << ");\n";
+						dec_indent();
+						f << indent << "}\n";
+					}
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(DATA)));
+					f << " = " << lhs_temp << ";\n";
 				} else {
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(DATA)));
+					f << " = " << mangle(memory) << "[";
+					dump_sigspec_rhs(cell->getPort(ID(ADDR)));
 					f << "];\n";
 				}
 				if (!cell->getPort(ID(EN)).is_fully_ones()) {
@@ -667,31 +842,17 @@ struct CxxrtlWorker {
 					f << indent << "}\n";
 				}
 			} else /*if (cell->type == ID($memwr))*/ {
+				// FIXME: handle write port priority, here and above in transparent $memrd cells
 				log_assert(writable_memories[memory]);
-				// FIXME: handle write port priority.
-				int width = cell->getParam(ID(WIDTH)).as_int();
 				std::string lhs_temp = fresh_temporary();
-				f << indent << "wire<" << width << "> &" << lhs_temp << " = " << mangle(memory) << "[";
+				f << indent << "wire<" << memory->width << "> &" << lhs_temp << " = " << mangle(memory) << "[";
 				dump_sigspec_rhs(cell->getPort(ID(ADDR)));
 				f << "];\n";
-				int start = 0;
-				RTLIL::SigBit prev_en_bit = RTLIL::Sm;
-				for (int stop = 0; stop < width + 1; stop++) {
-					if (stop == width || (prev_en_bit != RTLIL::Sm && prev_en_bit != cell->getPort(ID(EN))[stop])) {
-						f << indent << "if (";
-						dump_sigspec_rhs(prev_en_bit);
-						f << ") {\n";
-						inc_indent();
-							f << indent << lhs_temp << ".next.slice<" << (stop - 1) << "," << start << ">() = ";
-							dump_sigspec_rhs(cell->getPort(ID(DATA)).extract(start, stop - start));
-							f << ";\n";
-						dec_indent();
-						f << indent << "}\n";
-						start = stop + 1;
-					}
-					if (stop != width)
-						prev_en_bit = cell->getPort(ID(EN))[stop];
-				}
+				f << indent << lhs_temp << ".next = " << lhs_temp << ".curr.update(";
+				dump_sigspec_rhs(cell->getPort(ID(EN)));
+				f << ", ";
+				dump_sigspec_rhs(cell->getPort(ID(DATA)));
+				f << ");\n";
 			}
 			if (cell->getParam(ID(CLK_ENABLE)).as_bool()) {
 				dec_indent();
@@ -837,25 +998,36 @@ struct CxxrtlWorker {
 		}
 	}
 
-	void dump_wire(const RTLIL::Wire *wire)
+	void dump_wire(const RTLIL::Wire *wire, bool is_local)
 	{
 		if (elided_wires.count(wire))
 			return;
 
-		dump_attrs(wire);
-		f << indent << "wire<" << wire->width << "> " << mangle(wire);
-		if (wire->attributes.count(ID(init))) {
-			f << " ";
-			dump_const_init(wire->attributes.at(ID(init)));
-		}
-		f << ";\n";
-		if (sync_wires[wire]) {
-			for (auto sync_type : sync_types) {
-				if (sync_type.first.wire == wire) {
-					if (sync_type.second != RTLIL::STn)
-						f << indent << "bool posedge_" << mangle(sync_type.first) << " = false;\n";
-					if (sync_type.second != RTLIL::STp)
-						f << indent << "bool negedge_" << mangle(sync_type.first) << " = false;\n";
+		if (is_local) {
+			if (!localized_wires.count(wire))
+				return;
+
+			dump_attrs(wire);
+			f << indent << "value<" << wire->width << "> " << mangle(wire) << ";\n";
+		} else {
+			if (localized_wires.count(wire))
+				return;
+
+			dump_attrs(wire);
+			f << indent << "wire<" << wire->width << "> " << mangle(wire);
+			if (wire->attributes.count(ID(init))) {
+				f << " ";
+				dump_const_init(wire->attributes.at(ID(init)));
+			}
+			f << ";\n";
+			if (sync_wires[wire]) {
+				for (auto sync_type : sync_types) {
+					if (sync_type.first.wire == wire) {
+						if (sync_type.second != RTLIL::STn)
+							f << indent << "bool posedge_" << mangle(sync_type.first) << " = false;\n";
+						if (sync_type.second != RTLIL::STp)
+							f << indent << "bool negedge_" << mangle(sync_type.first) << " = false;\n";
+					}
 				}
 			}
 		}
@@ -914,7 +1086,7 @@ struct CxxrtlWorker {
 		f << "struct " << mangle(module) << " : public module {\n";
 		inc_indent();
 			for (auto wire : module->wires())
-				dump_wire(wire);
+				dump_wire(wire, /*is_local=*/false);
 			f << "\n";
 			for (auto memory : module->memories)
 				dump_memory(module, memory.second);
@@ -928,13 +1100,21 @@ struct CxxrtlWorker {
 
 		f << "void " << mangle(module) << "::eval() {\n";
 		inc_indent();
-			for (auto cell : module->cells())
-				dump_cell(cell);
-			f << indent << "// connections\n";
-			for (auto conn : module->connections())
-				dump_connect(conn);
-			for (auto proc : module->processes)
-				dump_process(proc.second);
+			for (auto wire : module->wires())
+				dump_wire(wire, /*is_local=*/true);
+			for (auto node : schedule[module]) {
+				switch (node.type) {
+					case FlowGraph::Node::Type::CONNECT:
+						dump_connect(node.connect);
+						break;
+					case FlowGraph::Node::Type::CELL:
+						dump_cell(node.cell);
+						break;
+					case FlowGraph::Node::Type::PROCESS:
+						dump_process(node.process);
+						break;
+				}
+			}
 			for (auto sync_type : sync_types) {
 				if (sync_type.first.wire->module == module) {
 					if (sync_type.second != RTLIL::STn)
@@ -951,7 +1131,7 @@ struct CxxrtlWorker {
 		inc_indent();
 			f << indent << "bool changed = false;\n";
 			for (auto wire : module->wires()) {
-				if (elided_wires.count(wire))
+				if (elided_wires.count(wire) || localized_wires.count(wire))
 					continue;
 				if (sync_wires[wire]) {
 					std::string wire_prev = mangle(wire) + "_prev";
@@ -1045,7 +1225,11 @@ struct CxxrtlWorker {
 
 	void analyze_design(RTLIL::Design *design)
 	{
+		bool has_feedback_arcs = false;
 		for (auto module : design->modules()) {
+			if (!design->selected_module(module))
+				continue;
+
 			FlowGraph flow;
 			SigMap &sigmap = sigmaps[module];
 			sigmap.set(module);
@@ -1053,8 +1237,11 @@ struct CxxrtlWorker {
 			for (auto conn : module->connections())
 				flow.add_node(conn);
 
+			dict<const RTLIL::Cell*, FlowGraph::Node*> memrw_cell_nodes;
+			dict<std::pair<RTLIL::SigBit, const RTLIL::Memory*>,
+			     pool<const RTLIL::Cell*>> memwr_per_domain;
 			for (auto cell : module->cells()) {
-				flow.add_node(cell);
+				FlowGraph::Node *node = flow.add_node(cell);
 
 				// Various DFF cells are treated like posedge/negedge processes, see above for details.
 				if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
@@ -1071,15 +1258,38 @@ struct CxxrtlWorker {
 							register_edge_signal(sigmap, cell->getPort(ID(CLK)),
 								cell->parameters[ID(CLK_POLARITY)].as_bool() ? RTLIL::STp : RTLIL::STn);
 					}
+					memrw_cell_nodes[cell] = node;
 				}
 				// Optimize access to read-only memories.
 				if (cell->type == ID($memwr))
 					writable_memories.insert(module->memories[cell->getParam(ID(MEMID)).decode_string()]);
+				// Collect groups of memory write ports in the same domain.
+				if (cell->type == ID($memwr) && cell->getParam(ID(CLK_ENABLE)).as_bool() && cell->getPort(ID(CLK)).is_wire()) {
+					RTLIL::SigBit clk_bit = sigmap(cell->getPort(ID(CLK)))[0];
+					const RTLIL::Memory *memory = module->memories[cell->getParam(ID(MEMID)).decode_string()];
+					memwr_per_domain[{clk_bit, memory}].insert(cell);
+				}
 				// Handling of packed memories is delegated to the `memory_unpack` pass, so we can rely on the presence
 				// of RTLIL memory objects and $memrd/$memwr/$meminit cells.
 				if (cell->type.in(ID($mem)))
 					log_assert(false);
 			}
+			for (auto cell : module->cells()) {
+				// Collect groups of memory write ports read by every transparent read port.
+				if (cell->type == ID($memrd) && cell->getParam(ID(CLK_ENABLE)).as_bool() && cell->getPort(ID(CLK)).is_wire() &&
+				    cell->getParam(ID(TRANSPARENT)).as_bool()) {
+					RTLIL::SigBit clk_bit = sigmap(cell->getPort(ID(CLK)))[0];
+					const RTLIL::Memory *memory = module->memories[cell->getParam(ID(MEMID)).decode_string()];
+					for (auto memwr_cell : memwr_per_domain[{clk_bit, memory}]) {
+						transparent_for[cell].insert(memwr_cell);
+						// Our implementation of transparent $memrd cells reads \EN, \ADDR and \DATA from every $memwr cell
+						// in the same domain, which isn't directly visible in the netlist. Add these uses explicitly.
+						flow.add_uses(memrw_cell_nodes[cell], memwr_cell->getPort(ID(EN)));
+						flow.add_uses(memrw_cell_nodes[cell], memwr_cell->getPort(ID(ADDR)));
+						flow.add_uses(memrw_cell_nodes[cell], memwr_cell->getPort(ID(DATA)));
+					}
+				}
+			}
 
 			for (auto proc : module->processes) {
 				flow.add_node(proc.second);
@@ -1119,6 +1329,69 @@ struct CxxrtlWorker {
 				log_assert(flow.wire_defs[wire].size() == 1);
 				elided_wires[wire] = **flow.wire_defs[wire].begin();
 			}
+
+			dict<FlowGraph::Node*, pool<const RTLIL::Wire*>, hash_ptr_ops> node_defs;
+			for (auto wire_def : flow.wire_defs)
+				for (auto node : wire_def.second)
+					node_defs[node].insert(wire_def.first);
+
+			Scheduler<FlowGraph::Node> scheduler;
+			dict<FlowGraph::Node*, Scheduler<FlowGraph::Node>::Vertex*, hash_ptr_ops> node_map;
+			for (auto node : flow.nodes)
+				node_map[node] = scheduler.add(node);
+			for (auto node_def : node_defs) {
+				auto vertex = node_map[node_def.first];
+				for (auto wire : node_def.second)
+					for (auto succ_node : flow.wire_uses[wire]) {
+						auto succ_vertex = node_map[succ_node];
+						vertex->succs.insert(succ_vertex);
+						succ_vertex->preds.insert(vertex);
+					}
+			}
+
+			auto eval_order = scheduler.schedule();
+			pool<FlowGraph::Node*, hash_ptr_ops> evaluated;
+			pool<const RTLIL::Wire*> feedback_wires;
+			for (auto vertex : eval_order) {
+				auto node = vertex->data;
+				schedule[module].push_back(*node);
+				// Any wire that is an output of node vo and input of node vi where vo is scheduled later than vi
+				// is a feedback wire. Feedback wires indicate apparent logic loops in the design, which may be
+				// caused by a true logic loop, but usually are a benign result of dependency tracking that works
+				// on wire, not bit, level. Nevertheless, feedback wires cannot be localized.
+				evaluated.insert(node);
+				for (auto wire : node_defs[node])
+					for (auto succ_node : flow.wire_uses[wire])
+						if (evaluated[succ_node]) {
+							feedback_wires.insert(wire);
+							// Feedback wires may never be elided because feedback requires state, but the point of elision
+							// (and localization) is to eliminate state.
+							elided_wires.erase(wire);
+						}
+			}
+
+			if (!feedback_wires.empty()) {
+				has_feedback_arcs = true;
+				log("Module `%s` contains feedback arcs through wires:\n", module->name.c_str());
+				for (auto wire : feedback_wires) {
+					log("  %s\n", wire->name.c_str());
+				}
+			}
+
+			for (auto wire : module->wires()) {
+				if (feedback_wires[wire]) continue;
+				if (wire->port_id != 0) continue;
+				if (wire->get_bool_attribute(ID(keep))) continue;
+				if (wire->name.begins_with("$") && !localize_internal) continue;
+				if (wire->name.begins_with("\\") && !localize_public) continue;
+				if (sync_wires[wire]) continue;
+				// Outputs of FF/$memrd cells and LHS of sync actions do not end up in defs.
+				if (flow.wire_defs[wire].size() != 1) continue;
+				localized_wires.insert(wire);
+			}
+		}
+		if (has_feedback_arcs) {
+			log("Feedback arcs require delta cycles during evaluation.\n");
 		}
 	}
 
@@ -1132,7 +1405,9 @@ struct CxxrtlWorker {
 
 			if (!design->selected_whole_module(module))
 				if (design->selected_module(module))
-					log_cmd_error("Can't handle partially selected module %s!\n", id2cstr(module->name));
+					log_cmd_error("Can't handle partially selected module `%s`!\n", id2cstr(module->name));
+			if (!design->selected_module(module))
+				continue;
 
 			for (auto proc : module->processes)
 				for (auto sync : proc.second->syncs)
@@ -1156,13 +1431,20 @@ struct CxxrtlWorker {
 		// Recheck the design if it was modified.
 		if (has_sync_init || has_packed_mem)
 			check_design(design, has_sync_init, has_packed_mem);
-
 		log_assert(!(has_sync_init || has_packed_mem));
+
+		if (run_splitnets) {
+			Pass::call(design, "splitnets -driver");
+			Pass::call(design, "opt_clean -purge");
+		}
+		log("\n");
 		analyze_design(design);
 	}
 };
 
 struct CxxrtlBackend : public Backend {
+	static const int DEFAULT_OPT_LEVEL = 5;
+
 	CxxrtlBackend() : Backend("cxxrtl", "convert design to C++ RTL simulation") { }
 	void help() YS_OVERRIDE
 	{
@@ -1172,10 +1454,10 @@ struct CxxrtlBackend : public Backend {
 		log("\n");
 		log("Write C++ code for simulating the design.\n");
 		log("\n");
-		// -O2 (and not -O1) is the default because wire elision results in dramatic (>10x) decrease in compile- and run-time,
-		// which is well worth the need to manually drop to -O1 or to mark interesting wires with (*keep*).
 		log("    -O <level>\n");
-		log("        set the optimization level. the default is -O2.\n");
+		log("        set the optimization level. the default is -O%d. higher optimization\n", DEFAULT_OPT_LEVEL);
+		log("        levels dramatically decrease compile and run time, and highest level\n");
+		log("        possible for a design should be used.\n");
 		log("\n");
 		log("    -O0\n");
 		log("        no optimization.\n");
@@ -1184,12 +1466,21 @@ struct CxxrtlBackend : public Backend {
 		log("        elide internal wires if possible.\n");
 		log("\n");
 		log("    -O2\n");
-		log("        like -O1, and elide public wires not marked (*keep*) if possible.\n");
+		log("        like -O1, and localize internal wires if possible.\n");
+		log("\n");
+		log("    -O3\n");
+		log("        like -O2, and elide public wires not marked (*keep*) if possible.\n");
+		log("\n");
+		log("    -O4\n");
+		log("        like -O3, and localize public wires not marked (*keep*) if possible.\n");
+		log("\n");
+		log("    -O5\n");
+		log("        like -O4, and run `splitnets -driver; opt_clean -purge` first.\n");
 		log("\n");
 	}
 	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
 	{
-		int opt_level = 2;
+		int opt_level = DEFAULT_OPT_LEVEL;
 
 		log_header(design, "Executing CXXRTL backend.\n");
 
@@ -1210,8 +1501,14 @@ struct CxxrtlBackend : public Backend {
 
 		CxxrtlWorker worker(*f);
 		switch (opt_level) {
-			case 2:
+			case 5:
+				worker.run_splitnets = true;
+			case 4:
+				worker.localize_public = true;
+			case 3:
 				worker.elide_public = true;
+			case 2:
+				worker.localize_internal = true;
 			case 1:
 				worker.elide_internal = true;
 			case 0:
@@ -1219,7 +1516,6 @@ struct CxxrtlBackend : public Backend {
 			default:
 				log_cmd_error("Invalid optimization level %d.\n", opt_level);
 		}
-
 		worker.prepare_design(design);
 		worker.dump_design(design);
 	}
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index d066530f2..a67591885 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -296,6 +296,10 @@ struct value : public expr_base<value<Bits>> {
 		return result;
 	}
 
+	value<Bits> update(const value<Bits> &mask, const value<Bits> &val) const {
+		return bit_and(mask.bit_not()).bit_or(val.bit_and(mask)); // (*this & ~mask) | (val & mask): bits set in `mask` come from `val`, the rest from *this
+	}
+
 	template<size_t AmountBits>
 	value<Bits> shl(const value<AmountBits> &amount) const {
 		// Ensure our early return is correct by prohibiting values larger than 4 Gbit.

From 3376dcf37c02f10552f84a9602b0d05c8f77ba3a Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sat, 4 Apr 2020 22:53:46 +0000
Subject: [PATCH 04/10] write_cxxrtl: avoid undefined behavior on out-of-bounds
 memory access.

After this commit, if NDEBUG is not defined, out-of-bounds accesses
cause assertion failures for reads and writes. If NDEBUG is defined,
out-of-bounds reads return zeroes, and out-of-bounds writes are
ignored.

This commit also adds support for memories that start with a non-zero
index (`Memory::start_offset` in RTLIL).
---
 backends/cxxrtl/cxxrtl.cc | 103 ++++++++++++++++++++++++--------------
 backends/cxxrtl/cxxrtl.h  |  21 +++++---
 2 files changed, 78 insertions(+), 46 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 94da61a2c..8a9e8348b 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -798,6 +798,10 @@ struct CxxrtlWorker {
 				inc_indent();
 			}
 			RTLIL::Memory *memory = cell->module->memories[cell->getParam(ID(MEMID)).decode_string()];
+			std::string valid_index_temp = fresh_temporary();
+			f << indent << "std::pair<bool, size_t> " << valid_index_temp << " = memory_index(";
+			dump_sigspec_rhs(cell->getPort(ID(ADDR)));
+			f << ", " << memory->start_offset << ", " << memory->size << ");\n";
 			if (cell->type == ID($memrd)) {
 				if (!cell->getPort(ID(EN)).is_fully_ones()) {
 					f << indent << "if (";
@@ -805,38 +809,54 @@ struct CxxrtlWorker {
 					f << ") {\n";
 					inc_indent();
 				}
-				if (writable_memories[memory]) {
-					std::string addr_temp = fresh_temporary();
-					f << indent << "const value<" << cell->getPort(ID(ADDR)).size() << "> &" << addr_temp << " = ";
-					dump_sigspec_rhs(cell->getPort(ID(ADDR)));
-					f << ";\n";
-					std::string lhs_temp = fresh_temporary();
-					f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
-					            << mangle(memory) << "[" << addr_temp << "].curr;\n";
-					for (auto memwr_cell : transparent_for[cell]) {
-						f << indent << "if (" << addr_temp << " == ";
-						dump_sigspec_rhs(memwr_cell->getPort(ID(ADDR)));
-						f << ") {\n";
-						inc_indent();
-							f << indent << lhs_temp << " = " << lhs_temp;
-							f << ".update(";
-							dump_sigspec_rhs(memwr_cell->getPort(ID(EN)));
-							f << ", ";
-							dump_sigspec_rhs(memwr_cell->getPort(ID(DATA)));
-							f << ");\n";
-						dec_indent();
-						f << indent << "}\n";
+				// The generated code has two bounds checks; one in an assertion, and another that guards the read.
+				// This is done so that the code does not invoke undefined behavior under any conditions, but nevertheless
+				// loudly crashes if an illegal condition is encountered. The assert may be turned off with -DNDEBUG not
+				// just for release builds, but also to make sure the simulator (which is presumably embedded in some
+				// larger program) will never crash the code that calls into it.
+				//
+				// If assertions are disabled, out of bounds reads are defined to return zero.
+				f << indent << "assert(" << valid_index_temp << ".first && \"out of bounds read\");\n";
+				f << indent << "if(" << valid_index_temp << ".first) {\n";
+				inc_indent();
+					if (writable_memories[memory]) {
+						std::string addr_temp = fresh_temporary();
+						f << indent << "const value<" << cell->getPort(ID(ADDR)).size() << "> &" << addr_temp << " = ";
+						dump_sigspec_rhs(cell->getPort(ID(ADDR)));
+						f << ";\n";
+						std::string lhs_temp = fresh_temporary();
+						f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
+						            << mangle(memory) << "[" << valid_index_temp << ".second].curr;\n";
+						for (auto memwr_cell : transparent_for[cell]) {
+							f << indent << "if (" << addr_temp << " == ";
+							dump_sigspec_rhs(memwr_cell->getPort(ID(ADDR)));
+							f << ") {\n";
+							inc_indent();
+								f << indent << lhs_temp << " = " << lhs_temp;
+								f << ".update(";
+								dump_sigspec_rhs(memwr_cell->getPort(ID(EN)));
+								f << ", ";
+								dump_sigspec_rhs(memwr_cell->getPort(ID(DATA)));
+								f << ");\n";
+							dec_indent();
+							f << indent << "}\n";
+						}
+						f << indent;
+						dump_sigspec_lhs(cell->getPort(ID(DATA)));
+						f << " = " << lhs_temp << ";\n";
+					} else {
+						f << indent;
+						dump_sigspec_lhs(cell->getPort(ID(DATA)));
+						f << " = " << mangle(memory) << "[" << valid_index_temp << ".second];\n";
 					}
+				dec_indent();
+				f << indent << "} else {\n";
+				inc_indent();
 					f << indent;
 					dump_sigspec_lhs(cell->getPort(ID(DATA)));
-					f << " = " << lhs_temp << ";\n";
-				} else {
-					f << indent;
-					dump_sigspec_lhs(cell->getPort(ID(DATA)));
-					f << " = " << mangle(memory) << "[";
-					dump_sigspec_rhs(cell->getPort(ID(ADDR)));
-					f << "];\n";
-				}
+					f << " = value<" << memory->width << "> {};\n";
+				dec_indent();
+				f << indent << "}\n";
 				if (!cell->getPort(ID(EN)).is_fully_ones()) {
 					dec_indent();
 					f << indent << "}\n";
@@ -844,15 +864,22 @@ struct CxxrtlWorker {
 			} else /*if (cell->type == ID($memwr))*/ {
 				// FIXME: handle write port priority, here and above in transparent $memrd cells
 				log_assert(writable_memories[memory]);
-				std::string lhs_temp = fresh_temporary();
-				f << indent << "wire<" << memory->width << "> &" << lhs_temp << " = " << mangle(memory) << "[";
-				dump_sigspec_rhs(cell->getPort(ID(ADDR)));
-				f << "];\n";
-				f << indent << lhs_temp << ".next = " << lhs_temp << ".curr.update(";
-				dump_sigspec_rhs(cell->getPort(ID(EN)));
-				f << ", ";
-				dump_sigspec_rhs(cell->getPort(ID(DATA)));
-				f << ");\n";
+				// See above for rationale of having both the assert and the condition.
+				//
+				// If assertions are disabled, out of bounds writes are defined to do nothing.
+				f << indent << "assert(" << valid_index_temp << ".first && \"out of bounds write\");\n";
+				f << indent << "if (" << valid_index_temp << ".first) {\n";
+				inc_indent();
+					std::string lhs_temp = fresh_temporary();
+					f << indent << "wire<" << memory->width << "> &" << lhs_temp << " = ";
+					f << mangle(memory) << "[" << valid_index_temp << ".second];\n";
+					f << indent << lhs_temp << ".next = " << lhs_temp << ".curr.update(";
+					dump_sigspec_rhs(cell->getPort(ID(EN)));
+					f << ", ";
+					dump_sigspec_rhs(cell->getPort(ID(DATA)));
+					f << ");\n";
+				dec_indent();
+				f << indent << "}\n";
 			}
 			if (cell->getParam(ID(CLK_ENABLE)).as_bool()) {
 				dec_indent();
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index a67591885..18e45e22c 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -23,6 +23,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <cassert>
 #include <limits>
 #include <type_traits>
 #include <tuple>
@@ -614,7 +615,6 @@ struct memory {
 
 	template<size_t... InitSize>
 	explicit memory(size_t depth, const init<InitSize> &...init) : data(depth) {
-		// FIXME: assert(init.size() <= depth);
 		data.resize(depth);
 		// This utterly reprehensible construct is the most reasonable way to apply a function to every element
 		// of a parameter pack, if the elements all have different types and so cannot be cast to an initializer list.
@@ -622,15 +622,9 @@ struct memory {
 	}
 
 	Elem &operator [](size_t index) {
-		// FIXME: assert(index < data.size());
+		assert(index < data.size());
 		return data[index];
 	}
-
-	template<size_t AddrBits>
-	Elem &operator [](const value<AddrBits> &addr) {
-		static_assert(value<AddrBits>::chunks <= 1, "memory indexing with unreasonably large address is not supported");
-		return (*this)[addr.data[0]];
-	}
 };
 
 template<size_t Width>
@@ -1103,6 +1097,17 @@ value<BitsY> mod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_ss<BitsY>(a, b).second;
 }
 
+// Memory helper: maps `addr` to {valid, index}; `valid` is true iff `addr` lies in [offset, offset + depth), and `index` is `addr - offset`.
+template<size_t BitsAddr>
+std::pair<bool, size_t> memory_index(const value<BitsAddr> &addr, size_t offset, size_t depth) {
+	static_assert(value<BitsAddr>::chunks <= 1, "memory address is too wide"); // only the first chunk of `addr` is read below
+	size_t offset_index = addr.data[0];
+
+	bool valid = (offset_index >= offset && offset_index < offset + depth);
+	size_t index = offset_index - offset; // unsigned subtraction wraps when offset_index < offset; use `index` only if `valid`
+	return std::make_pair(valid, index);
+}
+
 } // namespace cxxrtl_yosys
 
 #endif

From fb0270b75258fa58cbf0594873721c88964f69a5 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Fri, 3 Apr 2020 16:07:43 +0000
Subject: [PATCH 05/10] write_cxxrtl: add support for hierarchical designs.

Hierarchical design simulations are generally much slower, but this
comes with a major increase in flexibility:
 1. Since the `flatten` pass currently does not support flattening
    of designs with processes, this is the only way to simulate such
    designs with cxxrtl.
 2. Support for hierarchy paves way for simulation black boxes,
    which are necessary for e.g. replacing PHYs with C++ code that
    integrates with the host system.
---
 backends/cxxrtl/cxxrtl.cc | 125 ++++++++++++++++++++++++++++++++------
 1 file changed, 107 insertions(+), 18 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 8a9e8348b..710462e96 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -1,7 +1,7 @@
 /*
  *  yosys -- Yosys Open SYnthesis Suite
  *
- *  Copyright (C) 2019  whitequark <whitequark@whitequark.org>
+ *  Copyright (C) 2019-2020  whitequark <whitequark@whitequark.org>
  *
  *  Permission to use, copy, modify, and/or distribute this software for any
  *  purpose with or without fee is hereby granted, provided that the above
@@ -20,6 +20,7 @@
 #include "kernel/rtlil.h"
 #include "kernel/register.h"
 #include "kernel/sigtools.h"
+#include "kernel/utils.h"
 #include "kernel/celltypes.h"
 #include "kernel/log.h"
 
@@ -197,6 +198,11 @@ static bool is_ff_cell(RTLIL::IdString type)
 		ID($dff), ID($dffe), ID($adff), ID($dffsr));
 }
 
+static bool is_internal_cell(RTLIL::IdString type)
+{
+	return type[0] == '$' && !type.begins_with("$paramod\\"); // '$'-prefixed types are internal, except "$paramod\..." ones, which take the user-cell path
+}
+
 struct FlowGraph {
 	struct Node {
 		enum class Type {
@@ -279,8 +285,13 @@ struct FlowGraph {
 					/* non-combinatorial outputs do not introduce defs */;
 				else if (is_elidable_cell(cell->type))
 					add_defs(node, conn.second, /*elidable=*/true);
-				else
+				else if (is_internal_cell(cell->type))
 					add_defs(node, conn.second, /*elidable=*/false);
+				else {
+					// Unlike outputs of internal cells (which generate code that depends on the ability to set the output
+					// wire bits), outputs of user cells are normal wires, and the wires connected to them can be elided.
+					add_defs(node, conn.second, /*elidable=*/true);
+				}
 			}
 			if (cell->input(conn.first))
 				add_uses(node, conn.second);
@@ -354,6 +365,7 @@ struct CxxrtlWorker {
 	dict<RTLIL::SigBit, RTLIL::SyncType> sync_types;
 	pool<const RTLIL::Memory*> writable_memories;
 	dict<const RTLIL::Cell*, pool<const RTLIL::Cell*>> transparent_for;
+	dict<const RTLIL::Cell*, dict<RTLIL::Wire*, RTLIL::IdString>> cell_wire_defs;
 	dict<const RTLIL::Wire*, FlowGraph::Node> elided_wires;
 	dict<const RTLIL::Module*, std::vector<FlowGraph::Node>> schedule;
 	pool<const RTLIL::Wire*> localized_wires;
@@ -418,6 +430,12 @@ struct CxxrtlWorker {
 		return "memory_" + mangle_name(name);
 	}
 
+	std::string mangle_cell_name(const RTLIL::IdString &name)
+	{
+		// Class member namespace.
+		return "cell_" + mangle_name(name);
+	}
+
 	std::string mangle_wire_name(const RTLIL::IdString &name)
 	{
 		// Class member namespace.
@@ -434,6 +452,11 @@ struct CxxrtlWorker {
 		return mangle_memory_name(memory->name);
 	}
 
+	std::string mangle(const RTLIL::Cell *cell)
+	{
+		return mangle_cell_name(cell->name);
+	}
+
 	std::string mangle(const RTLIL::Wire *wire)
 	{
 		return mangle_wire_name(wire->name);
@@ -512,7 +535,11 @@ struct CxxrtlWorker {
 						dump_connect_elided(node.connect);
 						break;
 					case FlowGraph::Node::Type::CELL:
-						dump_cell_elided(node.cell);
+						if (is_elidable_cell(node.cell->type)) {
+							dump_cell_elided(node.cell);
+						} else {
+							f << mangle(node.cell) << "." << mangle_wire_name(cell_wire_defs[node.cell][chunk.wire]) << ".curr";
+						}
 						break;
 					default:
 						log_assert(false);
@@ -652,7 +679,8 @@ struct CxxrtlWorker {
 
 	bool is_cell_elided(const RTLIL::Cell *cell)
 	{
-		return cell->hasPort(ID(Y)) && cell->getPort(ID(Y)).is_wire() && elided_wires.count(cell->getPort(ID(Y)).as_wire());
+		return is_elidable_cell(cell->type) && cell->hasPort(ID(Y)) && cell->getPort(ID(Y)).is_wire() &&
+			elided_wires.count(cell->getPort(ID(Y)).as_wire());
 	}
 
 	void collect_cell(const RTLIL::Cell *cell, std::vector<RTLIL::IdString> &cells)
@@ -885,11 +913,31 @@ struct CxxrtlWorker {
 				dec_indent();
 				f << indent << "}\n";
 			}
-		// Memory initializers
-		} else if (cell->type[0] == '$') {
+		// Internal cells
+		} else if (is_internal_cell(cell->type)) {
 			log_cmd_error("Unsupported internal cell `%s'.\n", cell->type.c_str());
+		// User cells
 		} else {
-			log_assert(false);
+			log_assert(cell->known());
+			for (auto conn : cell->connections())
+				if (cell->input(conn.first)) {
+					f << indent << mangle(cell) << "." << mangle_wire_name(conn.first) << ".next = ";
+					dump_sigspec_rhs(conn.second);
+					f << ";\n";
+				}
+			f << indent << mangle(cell) << ".eval();\n";
+			for (auto conn : cell->connections()) {
+				if (conn.second.is_wire()) {
+					RTLIL::Wire *wire = conn.second.as_wire();
+					if (elided_wires.count(wire) && cell_wire_defs[cell].count(wire))
+						continue;
+				}
+				if (cell->output(conn.first)) {
+					f << indent;
+					dump_sigspec_lhs(conn.second);
+					f << " = " << mangle(cell) << "." << mangle_wire_name(conn.first) << ".curr;\n";
+				}
+			}
 		}
 	}
 
@@ -1115,9 +1163,21 @@ struct CxxrtlWorker {
 			for (auto wire : module->wires())
 				dump_wire(wire, /*is_local=*/false);
 			f << "\n";
-			for (auto memory : module->memories)
+			bool has_memories = false;
+			for (auto memory : module->memories) {
 				dump_memory(module, memory.second);
-			if (!module->memories.empty())
+				has_memories = true;
+			}
+			if (has_memories)
+				f << "\n";
+			bool has_cells = false;
+			for (auto cell : module->cells()) {
+				if (is_internal_cell(cell->type))
+					continue;
+				f << indent << mangle_module_name(cell->type) << " " << mangle(cell) << ";\n";
+				has_cells = true;
+			}
+			if (has_cells)
 				f << "\n";
 			f << indent << "void eval() override;\n";
 			f << indent << "bool commit() override;\n";
@@ -1152,8 +1212,8 @@ struct CxxrtlWorker {
 			}
 		dec_indent();
 		f << "}\n";
-
 		f << "\n";
+
 		f << "bool " << mangle(module) << "::commit() {\n";
 		inc_indent();
 			f << indent << "bool changed = false;\n";
@@ -1202,29 +1262,45 @@ struct CxxrtlWorker {
 					f << indent << "changed |= " << mangle(memory.second) << "[i].commit();\n";
 				dec_indent();
 			}
+			for (auto cell : module->cells()) {
+				if (is_internal_cell(cell->type))
+					continue;
+				f << indent << "changed |= " << mangle(cell) << ".commit();\n";
+			}
 			f << indent << "return changed;\n";
 		dec_indent();
 		f << "}\n";
+		f << "\n";
 	}
 
 	void dump_design(RTLIL::Design *design)
 	{
+		TopoSort<RTLIL::Module*> topo_design;
+		for (auto module : design->modules()) {
+			if (module->get_blackbox_attribute() || !design->selected_module(module))
+				continue;
+			topo_design.node(module);
+			for (auto cell : module->cells()) {
+				if (is_internal_cell(cell->type))
+					continue;
+				log_assert(design->has(cell->type));
+				topo_design.edge(design->module(cell->type), module);
+			}
+		}
+		bool no_loops = topo_design.sort(); // keep the sort itself outside log_assert() so it always runs
+		log_assert(no_loops);
+
 		f << "#include <cxxrtl.h>\n";
 		f << "\n";
 		f << "using namespace cxxrtl_yosys;\n";
 		f << "\n";
 		f << "namespace cxxrtl_design {\n";
-		for (auto module : design->modules()) {
-			if (module->get_blackbox_attribute())
-				continue;
-
+		f << "\n";
+		for (auto module : topo_design.sorted) {
-			if (!design->selected_module(module))
+			if (module->get_blackbox_attribute() || !design->selected_module(module))
 				continue;
-
-			f << "\n";
 			dump_module(module);
 		}
-		f << "\n";
 		f << "} // namespace cxxrtl_design\n";
 	}
 
@@ -1357,6 +1433,14 @@ struct CxxrtlWorker {
 				elided_wires[wire] = **flow.wire_defs[wire].begin();
 			}
 
+			// Elided wires that are outputs of internal cells are always connected to a well known port (Y).
+			// For user cells, there could be multiple of them, and we need a way to look up the port name
+			// knowing only the wire.
+			for (auto cell : module->cells())
+				for (auto conn : cell->connections())
+					if (conn.second.is_wire() && elided_wires.count(conn.second.as_wire()))
+						cell_wire_defs[cell][conn.second.as_wire()] = conn.first;
+
 			dict<FlowGraph::Node*, pool<const RTLIL::Wire*>, hash_ptr_ops> node_defs;
 			for (auto wire_def : flow.wire_defs)
 				for (auto node : wire_def.second)
@@ -1451,8 +1535,13 @@ struct CxxrtlWorker {
 	{
 		bool has_sync_init, has_packed_mem;
 		check_design(design, has_sync_init, has_packed_mem);
-		if (has_sync_init)
+		if (has_sync_init) {
+			// We're only interested in proc_init, but it depends on proc_prune and proc_clean, so call those
+			// in case they weren't already. (This allows `yosys foo.v -o foo.cc` to work.)
+			Pass::call(design, "proc_prune");
+			Pass::call(design, "proc_clean");
 			Pass::call(design, "proc_init");
+		}
 		if (has_packed_mem)
 			Pass::call(design, "memory_unpack");
 		// Recheck the design if it was modified.

From 01e6850bd3b9c884a0ea9785ff5ff1ffd59b82e2 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 5 Apr 2020 02:06:26 +0000
Subject: [PATCH 06/10] write_cxxrtl: improve writable memory handling.

This commit reduces space and time overhead for writable memories
to O(write port count) in both cases; implements handling for write
port priorities; and simplifies runtime representation of memories.
---
 backends/cxxrtl/cxxrtl.cc |  49 +++++++++---------
 backends/cxxrtl/cxxrtl.h  | 105 +++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 66 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 710462e96..6fd63548f 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -827,7 +827,7 @@ struct CxxrtlWorker {
 			}
 			RTLIL::Memory *memory = cell->module->memories[cell->getParam(ID(MEMID)).decode_string()];
 			std::string valid_index_temp = fresh_temporary();
-			f << indent << "std::pair<bool, size_t> " << valid_index_temp << " = memory_index(";
+			f << indent << "auto " << valid_index_temp << " = memory_index(";
 			dump_sigspec_rhs(cell->getPort(ID(ADDR)));
 			f << ", " << memory->start_offset << ", " << memory->size << ");\n";
 			if (cell->type == ID($memrd)) {
@@ -844,8 +844,8 @@ struct CxxrtlWorker {
 				// larger program) will never crash the code that calls into it.
 				//
 				// If assertions are disabled, out of bounds reads are defined to return zero.
-				f << indent << "assert(" << valid_index_temp << ".first && \"out of bounds read\");\n";
-				f << indent << "if(" << valid_index_temp << ".first) {\n";
+				f << indent << "assert(" << valid_index_temp << ".valid && \"out of bounds read\");\n";
+				f << indent << "if(" << valid_index_temp << ".valid) {\n";
 				inc_indent();
 					if (writable_memories[memory]) {
 						std::string addr_temp = fresh_temporary();
@@ -854,17 +854,22 @@ struct CxxrtlWorker {
 						f << ";\n";
 						std::string lhs_temp = fresh_temporary();
 						f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
-						            << mangle(memory) << "[" << valid_index_temp << ".second].curr;\n";
-						for (auto memwr_cell : transparent_for[cell]) {
+						            << mangle(memory) << "[" << valid_index_temp << ".index];\n";
+						std::vector<const RTLIL::Cell*> memwr_cells(transparent_for[cell].begin(), transparent_for[cell].end());
+						std::sort(memwr_cells.begin(), memwr_cells.end(),
+							[](const RTLIL::Cell *a, const RTLIL::Cell *b) {
+								return a->getParam(ID(PRIORITY)).as_int() < b->getParam(ID(PRIORITY)).as_int();
+							});
+						for (auto memwr_cell : memwr_cells) {
 							f << indent << "if (" << addr_temp << " == ";
 							dump_sigspec_rhs(memwr_cell->getPort(ID(ADDR)));
 							f << ") {\n";
 							inc_indent();
 								f << indent << lhs_temp << " = " << lhs_temp;
 								f << ".update(";
-								dump_sigspec_rhs(memwr_cell->getPort(ID(EN)));
-								f << ", ";
 								dump_sigspec_rhs(memwr_cell->getPort(ID(DATA)));
+								f << ", ";
+								dump_sigspec_rhs(memwr_cell->getPort(ID(EN)));
 								f << ");\n";
 							dec_indent();
 							f << indent << "}\n";
@@ -875,7 +880,7 @@ struct CxxrtlWorker {
 					} else {
 						f << indent;
 						dump_sigspec_lhs(cell->getPort(ID(DATA)));
-						f << " = " << mangle(memory) << "[" << valid_index_temp << ".second];\n";
+						f << " = " << mangle(memory) << "[" << valid_index_temp << ".index];\n";
 					}
 				dec_indent();
 				f << indent << "} else {\n";
@@ -890,22 +895,18 @@ struct CxxrtlWorker {
 					f << indent << "}\n";
 				}
 			} else /*if (cell->type == ID($memwr))*/ {
-				// FIXME: handle write port priority, here and above in transparent $memrd cells
 				log_assert(writable_memories[memory]);
 				// See above for rationale of having both the assert and the condition.
 				//
 				// If assertions are disabled, out of bounds writes are defined to do nothing.
-				f << indent << "assert(" << valid_index_temp << ".first && \"out of bounds write\");\n";
-				f << indent << "if (" << valid_index_temp << ".first) {\n";
+				f << indent << "assert(" << valid_index_temp << ".valid && \"out of bounds write\");\n";
+				f << indent << "if (" << valid_index_temp << ".valid) {\n";
 				inc_indent();
-					std::string lhs_temp = fresh_temporary();
-					f << indent << "wire<" << memory->width << "> &" << lhs_temp << " = ";
-					f << mangle(memory) << "[" << valid_index_temp << ".second];\n";
-					f << indent << lhs_temp << ".next = " << lhs_temp << ".curr.update(";
-					dump_sigspec_rhs(cell->getPort(ID(EN)));
-					f << ", ";
+					f << indent << mangle(memory) << ".update(" << valid_index_temp << ".index, ";
 					dump_sigspec_rhs(cell->getPort(ID(DATA)));
-					f << ");\n";
+					f << ", ";
+					dump_sigspec_rhs(cell->getPort(ID(EN)));
+					f << ", " << cell->getParam(ID(PRIORITY)).as_int() << ");\n";
 				dec_indent();
 				f << indent << "}\n";
 			}
@@ -1122,8 +1123,8 @@ struct CxxrtlWorker {
 		});
 
 		dump_attrs(memory);
-		f << indent << "memory_" << (writable_memories[memory] ? "rw" : "ro")
-		            << "<" << memory->width << "> " << mangle(memory)
+		f << indent << (writable_memories[memory] ? "" : "const ")
+		            << "memory<" << memory->width << "> " << mangle(memory)
 		            << " { " << memory->size << "u";
 		if (init_cells.empty()) {
 			f << " };\n";
@@ -1135,8 +1136,7 @@ struct CxxrtlWorker {
 					RTLIL::Const data = cell->getPort(ID(DATA)).as_const();
 					size_t width = cell->getParam(ID(WIDTH)).as_int();
 					size_t words = cell->getParam(ID(WORDS)).as_int();
-					f << indent << "memory_" << (writable_memories[memory] ? "rw" : "ro")
-					            << "<" << memory->width << ">::init<" << words << "> { "
+					f << indent << "memory<" << memory->width << ">::init<" << words << "> { "
 					            << stringf("%#x", cell->getPort(ID(ADDR)).as_int()) << ", {";
 					inc_indent();
 						for (size_t n = 0; n < words; n++) {
@@ -1257,10 +1257,7 @@ struct CxxrtlWorker {
 			for (auto memory : module->memories) {
 				if (!writable_memories[memory.second])
 					continue;
-				f << indent << "for (size_t i = 0; i < " << memory.second->size << "u; i++)\n";
-				inc_indent();
-					f << indent << "changed |= " << mangle(memory.second) << "[i].commit();\n";
-				dec_indent();
+				f << indent << "changed |= " << mangle(memory.second) << ".commit();\n";
 			}
 			for (auto cell : module->cells()) {
 				if (is_internal_cell(cell->type))
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index 18e45e22c..593c31c28 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -1,7 +1,7 @@
 /*
  *  yosys -- Yosys Open SYnthesis Suite
  *
- *  Copyright (C) 2019  whitequark <whitequark@whitequark.org>
+ *  Copyright (C) 2019-2020  whitequark <whitequark@whitequark.org>
  *
  *  Permission to use, copy, modify, and/or distribute this software for any
  *  purpose with or without fee is hereby granted.
@@ -28,6 +28,7 @@
 #include <type_traits>
 #include <tuple>
 #include <vector>
+#include <algorithm>
 #include <sstream>
 
 // The cxxrtl support library implements compile time specialized arbitrary width arithmetics, as well as provides
@@ -73,9 +74,6 @@ struct value : public expr_base<value<Bits>> {
 	template<typename... Init>
 	explicit constexpr value(Init ...init) : data{init...} {}
 
-	// This allows using value<> as well as wire<> in memory initializers.
-	using init = value<Bits>;
-
 	value(const value<Bits> &) = default;
 	value(value<Bits> &&) = default;
 	value<Bits> &operator=(const value<Bits> &) = default;
@@ -297,7 +295,7 @@ struct value : public expr_base<value<Bits>> {
 		return result;
 	}
 
-	value<Bits> update(const value<Bits> &mask, const value<Bits> &val) const {
+	value<Bits> update(const value<Bits> &val, const value<Bits> &mask) const {
 		return bit_and(mask.bit_not()).bit_or(val.bit_and(mask));
 	}
 
@@ -559,19 +557,6 @@ struct wire {
 	wire(wire<Bits> &&) = default;
 	wire<Bits> &operator=(const wire<Bits> &) = delete;
 
-	// We want to avoid having operator=(wire<>) or operator=(value<>) that overwrites both curr and next,
-	// since this operation is almost always wrong. But we also need an operation like that for memory
-	// initialization. This is solved by adding a wrapper and making the use of operator= valid only when
-	// this wrapper is used.
-	struct init {
-		value<Bits> data;
-	};
-
-	wire<Bits> &operator=(const init &init) {
-		curr = next = init.data;
-		return *this;
-	}
-
 	bool commit() {
 		if (curr != next) {
 			curr = next;
@@ -587,12 +572,10 @@ std::ostream &operator<<(std::ostream &os, const wire<Bits> &val) {
 	return os;
 }
 
-template<class Elem>
+template<size_t Width>
 struct memory {
-	using StoredElem = typename std::remove_const<Elem>::type;
-	std::vector<StoredElem> data;
+	std::vector<value<Width>> data;
 
-	static constexpr size_t width = StoredElem::bits;
 	size_t depth() const {
 		return data.size();
 	}
@@ -600,8 +583,8 @@ struct memory {
 	memory() = delete;
 	explicit memory(size_t depth) : data(depth) {}
 
-	memory(const memory<Elem> &) = delete;
-	memory<Elem> &operator=(const memory<Elem> &) = delete;
+	memory(const memory<Width> &) = delete;
+	memory<Width> &operator=(const memory<Width> &) = delete;
 
 	// The only way to get the compiler to put the initializer in .rodata and do not copy it on stack is to stuff it
 	// into a plain array. You'd think an std::initializer_list would work here, but it doesn't, because you can't
@@ -610,7 +593,7 @@ struct memory {
 	template<size_t Size>
 	struct init {
 		size_t offset;
-		typename Elem::init data[Size];
+		value<Width> data[Size];
 	};
 
 	template<size_t... InitSize>
@@ -621,18 +604,56 @@ struct memory {
 		auto _ = {std::move(std::begin(init.data), std::end(init.data), data.begin() + init.offset)...};
 	}
 
-	Elem &operator [](size_t index) {
+	value<Width> &operator [](size_t index) {
 		assert(index < data.size());
 		return data[index];
 	}
+
+	const value<Width> &operator [](size_t index) const {
+		assert(index < data.size());
+		return data[index];
+	}
+
+	// A simple way to make a writable memory would be to use an array of wires instead of an array of values.
+	// However, there are two significant downsides to this approach: first, it has large overhead (2× space
+	// overhead, and O(depth) time overhead during commit); second, it does not simplify handling write port
+	// priorities. Although in principle write ports could be ordered or conditionally enabled in generated
+	// code based on their priorities and selected addresses, the feedback arc set problem is computationally
+	// expensive, and the heuristic based algorithms are not easily modified to guarantee (rather than prefer)
+	// a particular write port evaluation order.
+	//
+	// The approach used here instead is to queue writes into a buffer during the eval phase, then perform
+	// the writes during the commit phase in the priority order. This approach has low overhead, with both space
+	// and time proportional to the number of write ports. Because virtually every memory in a practical design
+	// has at most two write ports, writes are simply appended to the queue and sorted once per commit.
+	struct write {
+		size_t index;
+		value<Width> val;
+		value<Width> mask;
+		int priority;
+	};
+	std::vector<write> write_queue;
+
+	void update(size_t index, const value<Width> &val, const value<Width> &mask, int priority = 0) {
+		assert(index < data.size());
+		write_queue.emplace_back(write { index, val, mask, priority });
+	}
+
+	bool commit() {
+		bool changed = false;
+		// Stable sort: writes that share a priority are applied in the order they were queued.
+		std::stable_sort(write_queue.begin(), write_queue.end(),
+			[](const write &a, const write &b) { return a.priority < b.priority; });
+		for (const write &entry : write_queue) {
+			value<Width> elem = data[entry.index].update(entry.val, entry.mask);
+			changed |= (data[entry.index] != elem);
+			data[entry.index] = elem;
+		}
+		write_queue.clear();
+		return changed;
+	}
 };
 
-template<size_t Width>
-using memory_rw = memory<wire<Width>>;
-
-template<size_t Width>
-using memory_ro = memory<const value<Width>>;
-
 struct module {
 	module() {}
 	virtual ~module() {}
@@ -1098,15 +1119,19 @@ value<BitsY> mod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 }
 
 // Memory helper
-template<size_t BitsAddr>
-std::pair<bool, size_t> memory_index(const value<BitsAddr> &addr, size_t offset, size_t depth) {
-	static_assert(value<BitsAddr>::chunks <= 1, "memory address is too wide");
-	size_t offset_index = addr.data[0];
+struct memory_index {
+	bool valid;
+	size_t index;
 
-	bool valid = (offset_index >= offset && offset_index < offset + depth);
-	size_t index = offset_index - offset;
-	return std::make_pair(valid, index);
-}
+	template<size_t BitsAddr>
+	memory_index(const value<BitsAddr> &addr, size_t offset, size_t depth) {
+		static_assert(value<BitsAddr>::chunks <= 1, "memory address is too wide");
+		size_t offset_index = addr.data[0];
+
+		valid = (offset_index >= offset && offset_index < offset + depth);
+		index = offset_index - offset;
+	}
+};
 
 } // namespace cxxrtl_yosys
 

From 9534b512770063baa48d862a375ec7a924766866 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 5 Apr 2020 07:46:42 +0000
Subject: [PATCH 07/10] write_cxxrtl: add support for $slice and $concat cells.

---
 backends/cxxrtl/cxxrtl.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 6fd63548f..e2c33c3d7 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -189,7 +189,8 @@ static bool is_binary_cell(RTLIL::IdString type)
 
 static bool is_elidable_cell(RTLIL::IdString type)
 {
-	return is_unary_cell(type) || is_binary_cell(type) || type == ID($mux);
+	return is_unary_cell(type) || is_binary_cell(type) || type.in(
+		ID($mux), ID($concat), ID($slice));
 }
 
 static bool is_ff_cell(RTLIL::IdString type)
@@ -672,6 +673,20 @@ struct CxxrtlWorker {
 			f << " : ";
 			dump_sigspec_rhs(cell->getPort(ID(A)));
 			f << ")";
+		// Concats
+		} else if (cell->type == ID($concat)) {
+			dump_sigspec_rhs(cell->getPort(ID(B)));
+			f << ".concat(";
+			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ").val()";
+		// Slices
+		} else if (cell->type == ID($slice)) {
+			dump_sigspec_rhs(cell->getPort(ID(A)));
+			f << ".slice<";
+			f << cell->getParam(ID(OFFSET)).as_int() + cell->getParam(ID(Y_WIDTH)).as_int() - 1;
+			f << ",";
+			f << cell->getParam(ID(OFFSET)).as_int();
+			f << ">().val()";
 		} else {
 			log_assert(false);
 		}

From 711df56ad0174f5f437bcdb1939bc0153e2d051e Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 5 Apr 2020 09:13:13 +0000
Subject: [PATCH 08/10] write_cxxrtl: add support for $sr cell.

Also, fix the semantics of SET/CLR inputs of the $dffsr cell, and
fix the scheduling of async FF cells to consider ARST/SET/CLR->Q
as a forward combinatorial arc.
---
 backends/cxxrtl/cxxrtl.cc | 62 ++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index e2c33c3d7..49b9e2ddf 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -193,10 +193,16 @@ static bool is_elidable_cell(RTLIL::IdString type)
 		ID($mux), ID($concat), ID($slice));
 }
 
-static bool is_ff_cell(RTLIL::IdString type)
+static bool is_sync_ff_cell(RTLIL::IdString type)
 {
 	return type.in(
-		ID($dff), ID($dffe), ID($adff), ID($dffsr));
+		ID($dff), ID($dffe));
+}
+
+static bool is_ff_cell(RTLIL::IdString type)
+{
+	return is_sync_ff_cell(type) || type.in(
+		ID($adff), ID($dffsr), ID($sr));
 }
 
 static bool is_internal_cell(RTLIL::IdString type)
@@ -282,7 +288,7 @@ struct FlowGraph {
 		log_assert(cell->known());
 		for (auto conn : cell->connections()) {
 			if (cell->output(conn.first)) {
-				if (is_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID(CLK_ENABLE)).as_bool()))
+				if (is_sync_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID(CLK_ENABLE)).as_bool()))
 					/* non-combinatorial outputs do not introduce defs */;
 				else if (is_elidable_cell(cell->type))
 					add_defs(node, conn.second, /*elidable=*/true);
@@ -770,7 +776,7 @@ struct CxxrtlWorker {
 			f << indent << "}\n";
 		// Flip-flops
 		} else if (is_ff_cell(cell->type)) {
-			if (cell->getPort(ID(CLK)).is_wire()) {
+			if (cell->hasPort(ID(CLK)) && cell->getPort(ID(CLK)).is_wire()) {
 				// Edge-sensitive logic
 				RTLIL::SigBit clk_bit = cell->getPort(ID(CLK))[0];
 				clk_bit = sigmaps[clk_bit.wire->module](clk_bit);
@@ -795,8 +801,8 @@ struct CxxrtlWorker {
 				dec_indent();
 				f << indent << "}\n";
 			}
-			// Level-sensitive logic
-			if (cell->type == ID($adff)) {
+			if (cell->hasPort(ID(ARST))) {
+				// Asynchronous reset (entire coarse cell at once)
 				f << indent << "if (";
 				dump_sigspec_rhs(cell->getPort(ID(ARST)));
 				f << " == value<1> {" << cell->getParam(ID(ARST_POLARITY)).as_bool() << "}) {\n";
@@ -808,28 +814,30 @@ struct CxxrtlWorker {
 					f << ";\n";
 				dec_indent();
 				f << indent << "}\n";
-			} else if (cell->type == ID($dffsr)) {
-				f << indent << "if (";
-				dump_sigspec_rhs(cell->getPort(ID(CLR)));
-				f << " == value<1> {" << cell->getParam(ID(CLR_POLARITY)).as_bool() << "}) {\n";
-				inc_indent();
-					f << indent;
-					dump_sigspec_lhs(cell->getPort(ID(Q)));
-					f << " = ";
-					dump_const(RTLIL::Const(RTLIL::S0, cell->getParam(ID(WIDTH)).as_int()));
-					f << ";\n";
-				dec_indent();
-				f << indent << "} else if (";
+			}
+			if (cell->hasPort(ID(SET))) {
+				// Asynchronous set (for individual bits)
+				f << indent;
+				dump_sigspec_lhs(cell->getPort(ID(Q)));
+				f << " = ";
+				dump_sigspec_lhs(cell->getPort(ID(Q)));
+				f << ".update(";
+				dump_const(RTLIL::Const(RTLIL::S1, cell->getParam(ID(WIDTH)).as_int()));
+				f << ", ";
 				dump_sigspec_rhs(cell->getPort(ID(SET)));
-				f << " == value<1> {" << cell->getParam(ID(SET_POLARITY)).as_bool() << "}) {\n";
-				inc_indent();
-					f << indent;
-					dump_sigspec_lhs(cell->getPort(ID(Q)));
-					f << " = ";
-					dump_const(RTLIL::Const(RTLIL::S1, cell->getParam(ID(WIDTH)).as_int()));
-					f << ";\n";
-				dec_indent();
-				f << indent << "}\n";
+				f << (cell->getParam(ID(SET_POLARITY)).as_bool() ? "" : ".bit_not()") << ");\n";
+			}
+			if (cell->hasPort(ID(CLR))) {
+				// Asynchronous clear (for individual bits; priority over set)
+				f << indent;
+				dump_sigspec_lhs(cell->getPort(ID(Q)));
+				f << " = ";
+				dump_sigspec_lhs(cell->getPort(ID(Q)));
+				f << ".update(";
+				dump_const(RTLIL::Const(RTLIL::S0, cell->getParam(ID(WIDTH)).as_int()));
+				f << ", ";
+				dump_sigspec_rhs(cell->getPort(ID(CLR)));
+				f << (cell->getParam(ID(CLR_POLARITY)).as_bool() ? "" : ".bit_not()") << ");\n";
 			}
 		// Memory ports
 		} else if (cell->type.in(ID($memrd), ID($memwr))) {

From 753e34007d64e1bf9d9d5fa19d6a39b328672d88 Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 5 Apr 2020 09:27:55 +0000
Subject: [PATCH 09/10] write_cxxrtl: add support for $dlatch and $dlatchsr
 cells.

Also, fix codegen for $dffe and $adff.
---
 backends/cxxrtl/cxxrtl.cc | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index 49b9e2ddf..d204364ca 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -202,7 +202,7 @@ static bool is_sync_ff_cell(RTLIL::IdString type)
 static bool is_ff_cell(RTLIL::IdString type)
 {
 	return is_sync_ff_cell(type) || type.in(
-		ID($adff), ID($dffsr), ID($sr));
+		ID($adff), ID($dffsr), ID($dlatch), ID($dlatchsr), ID($sr));
 }
 
 static bool is_internal_cell(RTLIL::IdString type)
@@ -786,7 +786,7 @@ struct CxxrtlWorker {
 					if (cell->type == ID($dffe)) {
 						f << indent << "if (";
 						dump_sigspec_rhs(cell->getPort(ID(EN)));
-						f << " == value<1> {" << cell->getParam(ID(EN_POLARITY)).as_bool() << "}) {\n";
+						f << " == value<1> {" << cell->getParam(ID(EN_POLARITY)).as_bool() << "u}) {\n";
 						inc_indent();
 					}
 					f << indent;
@@ -800,12 +800,25 @@ struct CxxrtlWorker {
 					}
 				dec_indent();
 				f << indent << "}\n";
+			} else if (cell->hasPort(ID(EN))) {
+				// Level-sensitive logic
+				f << indent << "if (";
+				dump_sigspec_rhs(cell->getPort(ID(EN)));
+				f << " == value<1> {" << cell->getParam(ID(EN_POLARITY)).as_bool() << "u}) {\n";
+				inc_indent();
+					f << indent;
+					dump_sigspec_lhs(cell->getPort(ID(Q)));
+					f << " = ";
+					dump_sigspec_rhs(cell->getPort(ID(D)));
+					f << ";\n";
+				dec_indent();
+				f << indent << "}\n";
 			}
 			if (cell->hasPort(ID(ARST))) {
 				// Asynchronous reset (entire coarse cell at once)
 				f << indent << "if (";
 				dump_sigspec_rhs(cell->getPort(ID(ARST)));
-				f << " == value<1> {" << cell->getParam(ID(ARST_POLARITY)).as_bool() << "}) {\n";
+				f << " == value<1> {" << cell->getParam(ID(ARST_POLARITY)).as_bool() << "u}) {\n";
 				inc_indent();
 					f << indent;
 					dump_sigspec_lhs(cell->getPort(ID(Q)));

From 4737f426ff20324316ab037cef719da4b5c520dd Mon Sep 17 00:00:00 2001
From: whitequark <whitequark@whitequark.org>
Date: Sun, 5 Apr 2020 10:03:23 +0000
Subject: [PATCH 10/10] write_cxxrtl: add basic documentation.

---
 backends/cxxrtl/cxxrtl.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc
index d204364ca..465882858 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl.cc
@@ -1601,7 +1601,22 @@ struct CxxrtlBackend : public Backend {
 		log("\n");
 		log("    write_cxxrtl [options] [filename]\n");
 		log("\n");
-		log("Write C++ code for simulating the design.\n");
+		log("Write C++ code for simulating the design. The generated code requires a driver;\n");
+		log("the following simple driver is provided as an example:\n");
+		log("\n");
+		log("    #include \"top.cc\"\n");
+		log("\n");
+		log("    int main() {\n");
+		log("      cxxrtl_design::p_top top;\n");
+		log("      while (1) {\n");
+		log("        top.p_clk.next = value<1> {1u};\n");
+		log("        top.step();\n");
+		log("        top.p_clk.next = value<1> {0u};\n");
+		log("        top.step();\n");
+		log("      }\n");
+		log("    }\n");
+		log("\n");
+		log("The following options are supported by this backend:\n");
 		log("\n");
 		log("    -O <level>\n");
 		log("        set the optimization level. the default is -O%d. higher optimization\n", DEFAULT_OPT_LEVEL);