fmt,cxxrtl: add `UNICHAR` format type.

This format type is used to print a Unicode character (code point) as
its UTF-8 serialization. To this end, two UTF-8 encoders (one for fmt,
one for cxxrtl) are added for rendering. When converted to a Verilog
format specifier, `UNICHAR` degrades to `%c` with the low 7 bits of
the code point, which has equivalent behavior for inputs not exceeding
ASCII. (SystemVerilog leaves source and display encodings completely
undefined.)
This commit is contained in:
Catherine 2024-03-28 07:55:46 +00:00 committed by Marcelina Kościelnicka
parent 1780e2eb1e
commit bf5a960668
3 changed files with 70 additions and 6 deletions

View File

@ -1013,13 +1013,14 @@ struct fmt_part {
LITERAL = 0,
INTEGER = 1,
STRING = 2,
VLOG_TIME = 3,
UNICHAR = 3,
VLOG_TIME = 4,
} type;
// LITERAL type
std::string str;
// INTEGER/STRING types
// INTEGER/STRING/UNICHAR types
// + value<Bits> val;
// INTEGER/STRING/VLOG_TIME types
@ -1073,6 +1074,25 @@ struct fmt_part {
break;
}
// UNICHAR: append the UTF-8 serialization of the code point read from `val` to `buf`.
// The sequence length (1 to 4 bytes) is selected purely by the thresholds below; the
// code point is not range-checked in this case.
case UNICHAR: {
uint32_t codepoint = val.template get<uint32_t>();
// Lead byte: a length marker (0xf0 / 0xe0 / 0xc0) OR-ed with the topmost payload
// bits. Code points below 0x80 are appended as a single unmodified byte.
if (codepoint >= 0x10000)
buf += (char)(0xf0 | (codepoint >> 18));
else if (codepoint >= 0x800)
buf += (char)(0xe0 | (codepoint >> 12));
else if (codepoint >= 0x80)
buf += (char)(0xc0 | (codepoint >> 6));
else
buf += (char)codepoint;
// Continuation bytes: each carries 6 payload bits under the 0x80 marker. These checks
// are independent (not else-if), so a larger code point passes through every
// applicable one and emits its continuation bytes from most to least significant.
if (codepoint >= 0x10000)
buf += (char)(0x80 | ((codepoint >> 12) & 0x3f));
if (codepoint >= 0x800)
buf += (char)(0x80 | ((codepoint >> 6) & 0x3f));
if (codepoint >= 0x80)
buf += (char)(0x80 | ((codepoint >> 0) & 0x3f));
break;
}
case INTEGER: {
size_t width = Bits;
if (base != 10) {

View File

@ -42,9 +42,9 @@ void Fmt::parse_rtlil(const RTLIL::Cell *cell) {
} else if (fmt.substr(i, 2) == "{{") {
part.str += '{';
++i;
} else if (fmt[i] == '}')
} else if (fmt[i] == '}') {
log_assert(false && "Unexpected '}' in format string");
else if (fmt[i] == '{') {
} else if (fmt[i] == '{') {
if (!part.str.empty()) {
part.type = FmtPart::LITERAL;
parts.push_back(part);
@ -74,6 +74,12 @@ void Fmt::parse_rtlil(const RTLIL::Cell *cell) {
part.sig = args.extract(0, arg_size);
args.remove(0, arg_size);
// 'U' selects the UNICHAR part type. No further specifier characters are parsed for it:
// consume the 'U' and jump to the `success` label.
if (fmt[i] == 'U') {
part.type = FmtPart::UNICHAR;
++i;
goto success;
}
if (fmt[i] == '>')
part.justify = FmtPart::RIGHT;
else if (fmt[i] == '<')
@ -156,6 +162,7 @@ void Fmt::parse_rtlil(const RTLIL::Cell *cell) {
log_assert(false && "Unexpected end in format substitution");
}
success:
if (fmt[i] != '}')
log_assert(false && "Expected '}' after format substitution");
@ -188,6 +195,11 @@ void Fmt::emit_rtlil(RTLIL::Cell *cell) const {
}
break;
// UNICHAR is written as the bare specifier "{U}"; the signal carrying the code point
// is asserted to be at most 32 bits wide.
case FmtPart::UNICHAR:
log_assert(part.sig.size() <= 32);
fmt += "{U}";
break;
case FmtPart::VLOG_TIME:
log_assert(part.sig.size() == 0);
YS_FALLTHROUGH
@ -568,6 +580,16 @@ std::vector<VerilogFmtArg> Fmt::emit_verilog() const
break;
}
// UNICHAR is emitted to Verilog as "%c" with an INTEGER argument built from
// the 7 bits of the signal extracted at offset 0, so only ASCII input is preserved.
case FmtPart::UNICHAR: {
VerilogFmtArg arg;
arg.type = VerilogFmtArg::INTEGER;
arg.sig = part.sig.extract(0, 7); // only ASCII
args.push_back(arg);
fmt.str += "%c";
break;
}
case FmtPart::VLOG_TIME: {
VerilogFmtArg arg;
arg.type = VerilogFmtArg::TIME;
@ -630,6 +652,7 @@ void Fmt::emit_cxxrtl(std::ostream &os, std::string indent, std::function<void(c
case FmtPart::LITERAL: os << "LITERAL"; break;
case FmtPart::INTEGER: os << "INTEGER"; break;
case FmtPart::STRING: os << "STRING"; break;
case FmtPart::UNICHAR: os << "UNICHAR"; break;
case FmtPart::VLOG_TIME: os << "VLOG_TIME"; break;
}
os << ", ";
@ -671,6 +694,26 @@ std::string Fmt::render() const
str += part.str;
break;
// UNICHAR: append the UTF-8 serialization of the part's code point to `str`.
// The sequence length (1 to 4 bytes) is selected purely by the thresholds below; the
// code point is not range-checked in this case.
case FmtPart::UNICHAR: {
// Convert the part's signal to a constant and take its integer value as the code point.
RTLIL::Const value = part.sig.as_const();
uint32_t codepoint = value.as_int();
// Lead byte: a length marker (0xf0 / 0xe0 / 0xc0) OR-ed with the topmost payload
// bits. Code points below 0x80 are appended as a single unmodified byte.
if (codepoint >= 0x10000)
str += (char)(0xf0 | (codepoint >> 18));
else if (codepoint >= 0x800)
str += (char)(0xe0 | (codepoint >> 12));
else if (codepoint >= 0x80)
str += (char)(0xc0 | (codepoint >> 6));
else
str += (char)codepoint;
// Continuation bytes: each carries 6 payload bits under the 0x80 marker. These checks
// are independent (not else-if), so a larger code point passes through every
// applicable one and emits its continuation bytes from most to least significant.
if (codepoint >= 0x10000)
str += (char)(0x80 | ((codepoint >> 12) & 0x3f));
if (codepoint >= 0x800)
str += (char)(0x80 | ((codepoint >> 6) & 0x3f));
if (codepoint >= 0x80)
str += (char)(0x80 | ((codepoint >> 0) & 0x3f));
break;
}
case FmtPart::INTEGER:
case FmtPart::STRING:
case FmtPart::VLOG_TIME: {

View File

@ -56,13 +56,14 @@ struct FmtPart {
LITERAL = 0,
INTEGER = 1,
STRING = 2,
VLOG_TIME = 3,
UNICHAR = 3,
VLOG_TIME = 4,
} type;
// LITERAL type
std::string str;
// INTEGER/STRING types
// INTEGER/STRING/UNICHAR types
RTLIL::SigSpec sig;
// INTEGER/STRING/VLOG_TIME types