autogenpb/protoReformat.go

845 lines
23 KiB
Go

// Copyright 2017-2025 WIT.COM Inc. All rights reserved.
// Use of this source code is governed by the GPL 3.0
package main
import (
"fmt"
"iter"
"os"
"regexp"
"strings"
sync "sync"
"go.wit.com/log"
)
// like 'goimports', but for .proto files
// allTheLines is the shared scanner over the lines of the file being parsed.
// doParse creates it and load consumes lines from it.
var allTheLines *LinesScanner
// allTheNewLines is not referenced by the code visible in this part of the file.
var allTheNewLines []string
// lastMessage is the most recently created FormatMsg (set by doParse and
// newDepth). doParse sets its PadAfter flag when a blank line is seen while
// no comments are pending.
var lastMessage *FormatMsg
/*
type EnumMessage struct {
msgPB *FormatMsg
all []Message
}
type StdMessage struct {
msgPB *FormatMsg
all []Message
}
type Message interface {
name() string
addMsg(Message)
}
*/
// protoReformatComments rewrites the comments of a .proto file in place.
//
// Every line of the file is passed through commentPreprocessor, the
// reassembled text is then passed through commentPreprocessorFull, and the
// result is written back to filename with saveFile.
//
// It returns the error from reading the file or from saveFile, or nil on
// success.
func protoReformatComments(filename string) error {
	data, err := os.ReadFile(filename)
	if err != nil {
		log.Info("file read failed", filename, err)
		return err
	}
	log.Info("filename", filename)

	// A strings.Builder avoids re-copying the whole output for every line.
	var sb strings.Builder
	for line := range makeLineIter(data) {
		sb.WriteString(commentPreprocessor(line))
		sb.WriteByte('\n')
	}

	// Propagate write failures instead of unconditionally reporting success.
	return saveFile(filename, commentPreprocessorFull(sb.String()))
}
// protoReformat is the entry point for formatting a .proto file in place.
//
// The file is read, split on newlines and handed to doParse; the lines
// produced by format() on the returned root are then joined (one newline after
// each) and written back to filename with saveFile. The read error or the
// saveFile error is returned.
func protoReformat(filename string) error {
	raw, err := os.ReadFile(filename)
	if err != nil {
		log.Info("file read failed", filename, err)
		return err
	}

	root := doParse(strings.Split(string(raw), "\n"))

	var out strings.Builder
	for _, formatted := range root.format() {
		out.WriteString(formatted)
		out.WriteByte('\n')
	}
	return saveFile(filename, out.String())
}
// doParse turns the raw lines of a .proto file into a tree of FormatMsg nodes
// and returns the root of that tree.
//
// It works in two phases over the package-level scanner allTheLines:
//
//  1. Preamble: every line up to (not including) the first line that starts
//     with "oneof ", "enum " or "message " is stored verbatim in the root's
//     Notes.
//  2. Blocks: each line starting with one of those keywords becomes a child of
//     the root via newMessage, carrying the lines accumulated since the
//     previous block as its comments. If the header line itself contains "}"
//     the block is marked IsEmpty and given a placeholder Footer; otherwise
//     its body is consumed by load. Any other line is appended to the pending
//     comments; a blank line seen while nothing is pending sets PadAfter on
//     lastMessage.
//
// The function resets the package-level lastMessage and allTheLines.
func doParse(lines []string) *FormatMsg {
	root := new(FormatMsg)
	lastMessage = root

	// TODO: read in start of the file
	allTheLines = newLinesScanner(lines)
	for allTheLines.Scan() {
		raw := allTheLines.NextRaw()
		if strings.HasPrefix(raw, "oneof ") ||
			strings.HasPrefix(raw, "enum ") ||
			strings.HasPrefix(raw, "message ") {
			break
		}
		root.Notes = append(root.Notes, raw)
	}

	// step back so the block loop sees the line that ended the preamble
	allTheLines.UnScan()

	var pending string
	for allTheLines.Scan() {
		raw := allTheLines.NextRaw()

		var kind FormatMsg_Type
		switch {
		case strings.HasPrefix(raw, "oneof "):
			kind = FormatMsg_ONEOF
		case strings.HasPrefix(raw, "enum "):
			kind = FormatMsg_ENUM
		case strings.HasPrefix(raw, "message "):
			log.Info("got to message", raw)
			kind = FormatMsg_MESSAGE
		default:
			// not a block header: remember it as a comment for the next block
			if pending == "" && strings.TrimSpace(raw) == "" {
				lastMessage.PadAfter = true
			}
			pending += fmt.Sprintln(raw)
			continue
		}

		child := root.newMessage(raw, pending, kind)
		pending = ""
		if strings.Contains(raw, "}") {
			// header and closing brace share one line: nothing to load
			child.IsEmpty = true
			child.Footer = "} // blah"
			continue
		}
		child.load()
	}
	return root
}
// saveFile writes data to filename, creating the file (mode 0644) if it does
// not exist and truncating it if it does. Surrounding whitespace is trimmed
// from data and a single trailing newline is written after it.
//
// It returns the error from opening, writing or closing the file; a failed
// write or close is reported rather than silently ignored.
func saveFile(filename string, data string) error {
	pf, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		log.Info("file open error. permissions?", filename, err)
		return err
	}
	if _, err := fmt.Fprintln(pf, strings.TrimSpace(data)); err != nil {
		// The write error is the one worth reporting; still release the handle.
		pf.Close()
		return err
	}
	// Close can surface deferred write errors, so its result is returned.
	return pf.Close()
}
// newDepth allocates a FormatMsg nested one level below fmtmsg.
//
// The new node copies the parent's MaxVarname and MaxVartype, stores the
// whitespace-trimmed header, and gets Depth = parent Depth + 1. As a side
// effect the package-level lastMessage is pointed at the new node.
func newDepth(fmtmsg *FormatMsg, header string) *FormatMsg {
	child := new(FormatMsg)
	lastMessage = child

	child.Depth = fmtmsg.Depth + 1
	child.Header = strings.TrimSpace(header)
	child.MaxVartype = fmtmsg.MaxVartype
	child.MaxVarname = fmtmsg.MaxVarname
	return child
}
// newMessage creates a child FormatMsg of the given type under msgPB, appends
// it to msgPB.Msgs and returns it.
//
// comments is trimmed of surrounding whitespace and split on newlines into the
// child's Notes; because of how strings.Split works, Notes holds a single
// empty string when there were no comments.
func (msgPB *FormatMsg) newMessage(header string, comments string, msgType FormatMsg_Type) *FormatMsg {
	child := newDepth(msgPB, header)
	child.Type = msgType
	child.Notes = strings.Split(strings.TrimSpace(comments), "\n")
	msgPB.Msgs = append(msgPB.Msgs, child)
	return child
}
// load parses the body of a message, enum, or oneof block.
//
// It consumes trimmed lines from the package-level scanner allTheLines until a
// line starting with "}" is found, which is stored as msg.Footer. Lines
// starting with "oneof ", "enum " or "message " create a nested child via
// newMessage and are loaded recursively; every other line is appended to
// msg.Lines. If the scanner runs out before a closing brace, load simply
// returns with Footer unset.
//
// A nested header that also contains "}" (a one-line block such as
// "message Bar {}") is marked IsEmpty and parsing of the current block
// continues with the next line. Returning at that point would abandon the rest
// of the enclosing block.
func (msg *FormatMsg) load() {
	for allTheLines.Scan() {
		line := allTheLines.Next()

		var kind FormatMsg_Type
		switch {
		case strings.HasPrefix(line, "oneof "):
			kind = FormatMsg_ONEOF
		case strings.HasPrefix(line, "enum "):
			kind = FormatMsg_ENUM
		case strings.HasPrefix(line, "message "):
			// message inception. search for the architect. don't forget your totem
			kind = FormatMsg_MESSAGE
		case strings.HasPrefix(line, "}"):
			msg.Footer = line
			return
		default:
			msg.Lines = append(msg.Lines, line)
			continue
		}

		child := msg.newMessage(line, "", kind)
		if strings.Contains(line, "}") {
			// one-line nested block: nothing to load, keep parsing this block
			child.IsEmpty = true
			continue
		}
		child.load()
	}
}
// tokenMsgVar splits a field definition line into its parts and returns, in
// order: the type, the name, the field id, and everything after the first ";".
//
// The text before the first ";" is split on whitespace; the last token is the
// id, the one before it is assumed to be "=" and dropped, the one before that
// is the name, and whatever remains is joined with single spaces as the type.
// For "string name = 1; // User's name" the result is "string", "name", "1"
// and " // User's name" (the trailer is returned untrimmed).
func tokenMsgVar(line string) (string, string, string, string) {
	decl, trailer, _ := strings.Cut(line, ";")

	tokens := strings.Fields(decl)
	tokens, id := slicesPop(tokens)
	tokens, _ = slicesPop(tokens) // the "=" sign
	tokens, varname := slicesPop(tokens)

	return strings.Join(tokens, " "), varname, id, trailer
}
// slicesPop returns parts without its last element, together with that last
// element. An empty input yields (nil, ""); a single-element input yields
// (nil, element). The returned slice shares its backing array with parts.
func slicesPop(parts []string) ([]string, string) {
	switch n := len(parts); n {
	case 0:
		return nil, ""
	case 1:
		return nil, parts[0]
	default:
		return parts[:n-1], parts[n-1]
	}
}
// makeLineIter creates a Go 1.24+ style iterator (iter.Seq) from a byte slice
// of file content. This allows for convenient line-by-line iteration using
// `for ... range`.
func makeLineIter(data []byte) iter.Seq[string] {
items := strings.Split(string(data), "\n")
// log.Println("Made All() Iter.Seq[] with length", len(items))
return func(yield func(string) bool) {
for _, v := range items {
if !yield(v) {
return
}
}
}
}
// setMaxSizes widens curmsg.MaxVartype and curmsg.MaxVarname so they are at
// least as long as the longest field type and field name found in
// curmsg.Lines. Lines without a ";" (blank lines, comments) are ignored.
// Existing values are only ever increased, never reduced.
func setMaxSizes(curmsg *FormatMsg) {
	for _, line := range curmsg.Lines {
		if !strings.Contains(line, ";") {
			// line is blank or just a comment
			continue
		}
		vartype, varname, _, _ := tokenMsgVar(line)
		if n := int64(len(vartype)); n > curmsg.MaxVartype {
			curmsg.MaxVartype = n
		}
		if n := int64(len(varname)); n > curmsg.MaxVarname {
			curmsg.MaxVarname = n
		}
	}
}
// padBase returns the indentation used for a block's own header and footer:
// eight spaces for every nesting level beyond the first (Depth-1 levels, and
// the empty string when Depth is 1 or less).
func (msg *FormatMsg) padBase() string {
	levels := int(msg.Depth) - 1
	if levels <= 0 {
		return ""
	}
	return strings.Repeat("        ", levels)
}
// pad returns the indentation used for lines inside a block body: eight
// spaces per nesting level (Depth levels, and the empty string when Depth is
// zero or negative).
func (msg *FormatMsg) pad() string {
	levels := int(msg.Depth)
	if levels <= 0 {
		return ""
	}
	return strings.Repeat("        ", levels)
}
// padding returns eight spaces for each nesting level from offset up to (but
// not including) Depth, i.e. Depth-offset levels. It returns the empty string
// when offset is greater than or equal to Depth.
func (msg *FormatMsg) padding(offset int) string {
	levels := int(msg.Depth) - offset
	if levels <= 0 {
		return ""
	}
	return strings.Repeat("        ", levels)
}
// formatEnum renders an enum block as output lines: the header at base
// indentation, each body line at block indentation, the footer, and finally
// the formatPadAfter line when PadAfter is set.
func formatEnum(curmsg *FormatMsg) []string {
	out := make([]string, 0, len(curmsg.Lines)+3)
	out = append(out, curmsg.formatLineBase(curmsg.Header, "enum header"))
	for i := range curmsg.Lines {
		out = append(out, curmsg.formatLine(curmsg.Lines[i], "enum"))
	}
	out = append(out, curmsg.formatFooter(curmsg.Footer, "enum footer"))
	if !curmsg.PadAfter {
		return out
	}
	return append(out, curmsg.formatPadAfter())
}
// formatOneof renders a oneof block as output lines: the header at base
// indentation, each body line through formatMsgLine (which aligns fields and
// comments), the footer, and finally the formatPadAfter line when PadAfter is
// set.
func formatOneof(curmsg *FormatMsg) []string {
	out := make([]string, 0, len(curmsg.Lines)+3)
	out = append(out, curmsg.formatLineBase(curmsg.Header, "oneof header"))
	for i := range curmsg.Lines {
		out = append(out, curmsg.formatMsgLine(curmsg.Lines[i], "oneof"))
	}
	out = append(out, curmsg.formatFooter(curmsg.Footer, "oneof footer"))
	if !curmsg.PadAfter {
		return out
	}
	return append(out, curmsg.formatPadAfter())
}
// formatStandardSizes gives all oneof and message children of parent the same
// MaxVartype and MaxVarname, so their fields line up with each other.
//
// It first runs setMaxSizes on each such child and records the largest values
// seen, then writes those maxima back into every oneof and message child.
// Enum children (and any other type) are left untouched. Only direct children
// are visited.
func (parent *FormatMsg) formatStandardSizes() {
	aligned := func(t FormatMsg_Type) bool {
		return t == FormatMsg_ONEOF || t == FormatMsg_MESSAGE
	}

	// pass 1: measure every aligned child and keep the widest type and name
	var widestType, widestName int64
	for _, child := range parent.Msgs {
		if !aligned(child.Type) {
			continue
		}
		setMaxSizes(child)
		widestType = max(widestType, child.MaxVartype)
		widestName = max(widestName, child.MaxVarname)
	}

	// pass 2: apply the shared widths
	for _, child := range parent.Msgs {
		if !aligned(child.Type) {
			continue
		}
		child.MaxVartype = widestType
		child.MaxVarname = widestName
	}
}
// format renders parent and its subtree as output lines. It first equalizes
// field widths among parent's children with formatStandardSizes, then
// dispatches on parent.Type: enums go to formatEnum, oneofs to formatOneof,
// and everything else (including the untyped root) to formatMessage.
func (parent *FormatMsg) format() []string {
	parent.formatStandardSizes()
	if parent.Type == FormatMsg_ENUM {
		return formatEnum(parent)
	}
	if parent.Type == FormatMsg_ONEOF {
		return formatOneof(parent)
	}
	return formatMessage(parent)
}
// formatFooter renders a block's closing line at base indentation. When line
// is empty it returns the empty string, or a marker comment if argv.Debug is
// set. The dbg argument is not used.
func (msg *FormatMsg) formatFooter(line string, dbg string) string {
	switch {
	case line != "":
		return msg.formatLineBase(line, "footer")
	case argv.Debug:
		return "// footer was empty"
	default:
		return ""
	}
}
// formatPadAfter returns the line emitted after a block whose PadAfter flag is
// set: an empty string normally, or an annotated empty line produced by
// formatLineBase when argv.Debug is set.
func (msg *FormatMsg) formatPadAfter() string {
	if !argv.Debug {
		return ""
	}
	return msg.formatLineBase("", "pad after")
}
// formatHeader renders the opening line of a block (e.g. "message Foo {").
//
// A blank line on a nested block (Depth != 0) yields an error marker comment.
// A line of three or fewer whitespace-separated words is emitted as msg.Header
// at base indentation. Otherwise the first three words are kept as the header
// proper and the remainder (typically a trailing comment) is pushed right by a
// width derived from MaxVarname, MaxVartype and the header length. With
// argv.Debug set, the depth is appended. The dbg argument is not used.
func (msg *FormatMsg) formatHeader(line string, dbg string) string {
	if line == "" && msg.Depth != 0 {
		return "// ERROR: header was blank"
	}

	words := strings.Fields(line)
	if len(words) <= 3 {
		return msg.formatLineBase(msg.Header, "header")
	}

	// hack to actually indent comments on the message line itself. you're welcome
	start := strings.Join(words[:3], " ")
	end := strings.Join(words[3:], " ")
	width := int(msg.MaxVarname) + int(msg.MaxVartype) + 16 - len(start)
	layout := "%s%s %" + fmt.Sprintf("%d", width) + "s %s"

	if argv.Debug {
		return fmt.Sprintf(layout+" // depth=%d", msg.padBase(), start, " ", end, msg.Depth)
	}
	return fmt.Sprintf(layout, msg.padBase(), start, " ", end)
}
// formatLineBase trims line and prefixes it with the base indentation
// (padBase). With argv.Debug set, the output is wrapped in markers and
// annotated with dbg and the depth.
func (msg *FormatMsg) formatLineBase(line string, dbg string) string {
	text := strings.TrimSpace(line)
	indent := msg.padBase()
	if !argv.Debug {
		return indent + text
	}
	return fmt.Sprintf("/*a*/%s/*b*/%s // %s depth=%d", indent, text, dbg, msg.Depth)
}
// formatLine trims line and prefixes it with the block-body indentation
// (pad). With argv.Debug set, the output is wrapped in markers and annotated
// with dbg and the depth.
func (msg *FormatMsg) formatLine(line string, dbg string) string {
	text := strings.TrimSpace(line)
	indent := msg.pad()
	if !argv.Debug {
		return indent + text
	}
	return fmt.Sprintf("/*a*/%s/*b*/%s // %s depth=%d", indent, text, dbg, msg.Depth)
}
// formatComment renders a standalone comment line inside a block body. The
// trimmed comment is shifted right by MaxVartype+MaxVarname+13 columns of
// padding (plus one separating space) and then prefixed with the body
// indentation. With argv.Debug set, markers and the depth are added.
func (msg *FormatMsg) formatComment(line string, dbg string) string {
	text := strings.TrimSpace(line)
	width := fmt.Sprintf("%d", msg.MaxVartype+msg.MaxVarname+13) // 21 is correct?
	comment := fmt.Sprintf("%"+width+"s %s", " ", text)          // todo: compute 50

	indent := msg.pad()
	if !argv.Debug {
		return indent + comment
	}
	return fmt.Sprintf("/*a*/%s/*b*/%s // %s depth=%d", indent, comment, dbg, msg.Depth)
}
// formatVarLine renders a field definition. The line is tokenized with
// tokenMsgVar; the type and name are left-justified to MaxVartype and
// MaxVarname columns, the id (with ";" appended) to three columns, and the
// trimmed trailer follows. Trailing spaces are removed and the body
// indentation is prepended. With argv.Debug set, markers, the depth and the
// two widths are added.
func (msg *FormatMsg) formatVarLine(line string, dbg string) string {
	vartype, varname, id, trailer := tokenMsgVar(strings.TrimSpace(line))

	layout := "%-" + fmt.Sprintf("%d", msg.MaxVartype) +
		"s %-" + fmt.Sprintf("%d", msg.MaxVarname) +
		"s = %-3s %s"
	formatted := fmt.Sprintf(layout, vartype, varname, id+";", strings.TrimSpace(trailer))
	formatted = strings.TrimRight(formatted, " ")

	indent := msg.pad()
	if !argv.Debug {
		return indent + formatted
	}
	return fmt.Sprintf("/*a*/%s/*b*/%s // %s depth=%d (%d,%d)", indent, formatted, dbg, msg.Depth, msg.MaxVartype, msg.MaxVarname)
}
// formatMsgLine formats one body line of a block. Blank lines come back empty
// (or as a marker comment when argv.Debug is set), lines starting with "//"
// go to formatComment, and everything else is treated as a field definition
// and goes to formatVarLine.
func (msg *FormatMsg) formatMsgLine(line string, dbg string) string {
	text := strings.TrimSpace(line)
	switch {
	case text == "" && argv.Debug:
		return "// empty line " + msg.Header + " empty line end"
	case text == "":
		return text
	case strings.HasPrefix(text, "//"):
		return msg.formatComment(text, "comment")
	default:
		return msg.formatVarLine(text, "var "+dbg)
	}
}
// trimLines drops leading and trailing blank lines (and the outer whitespace
// of the first and last remaining lines) by joining lines with "\n", trimming
// the result, and splitting it again. Interior blank lines are kept. The
// result always has at least one element; for empty input it is a single
// empty string.
func trimLines(lines []string) []string {
	joined := strings.Join(lines, "\n")
	return strings.Split(strings.TrimSpace(joined), "\n")
}
// formatMessage renders a message block (or the root) as output lines.
//
// An IsEmpty message produces a single "// isEmpty" line. Otherwise the output
// is, in order: the block's Notes (leading and trailing blank lines removed),
// the header, every child block (enum, oneof, nested message), the block's own
// body lines, the footer, and the formatPadAfter line when PadAfter is set.
//
// Body lines are trimmed as a whole first, and the trimmed, re-split lines are
// stored back into curmsg.Lines. Blank lines are kept, "//" lines go through
// formatComment, and the rest through formatVarLine.
func formatMessage(curmsg *FormatMsg) []string {
	if curmsg.IsEmpty {
		return []string{curmsg.formatLineBase("// isEmpty", "IsEmpty")}
	}

	var out []string

	// notes & comments that preceded the header
	notes := trimLines(curmsg.Notes)
	switch {
	case len(notes) == 1:
		// todo: track space in original file
		if notes[0] != "" {
			out = append(out, curmsg.formatLineBase(notes[0], "notes1"))
		}
	case len(notes) > 1:
		for _, note := range notes {
			out = append(out, curmsg.formatLineBase(note, "notes2"))
		}
	}

	out = append(out, curmsg.formatHeader(curmsg.Header, "header"))

	// nested blocks come before the block's own fields
	for _, child := range curmsg.Msgs {
		switch child.Type {
		case FormatMsg_ENUM:
			out = append(out, formatEnum(child)...)
		case FormatMsg_ONEOF:
			out = append(out, formatOneof(child)...)
		case FormatMsg_MESSAGE:
			out = append(out, child.format()...)
		}
	}

	// the block's own lines, skipped entirely when they are all blank
	if body := strings.TrimSpace(strings.Join(curmsg.Lines, "\n")); body != "" {
		curmsg.Lines = strings.Split(body, "\n")
		for _, raw := range curmsg.Lines {
			text := strings.TrimSpace(raw)
			switch {
			case text == "":
				if argv.Debug {
					out = append(out, "// empty line "+curmsg.Header+" empty line end")
				} else {
					out = append(out, text)
				}
			case strings.HasPrefix(text, "//"):
				out = append(out, curmsg.formatComment(text, "comment"))
			default:
				out = append(out, curmsg.formatVarLine(text, "var"))
			}
		}
	}

	out = append(out, curmsg.formatFooter(curmsg.Footer, "footer"))
	if curmsg.PadAfter {
		out = append(out, curmsg.formatPadAfter())
	}
	return out
}
// DEFINE THE Lines ITERATOR.

// newLinesScanner returns a LinesScanner positioned before the first element
// of things. The slice is used as-is, not copied.
func newLinesScanner(things []string) *LinesScanner {
	return &LinesScanner{things: things}
}

// LinesScanner is a cursor over a slice of strings that can move forward
// (Scan) and one step back (UnScan). index is the 1-based position of the
// current line; 0 means no line has been scanned yet. The embedded mutex
// guards index.
type LinesScanner struct {
	sync.Mutex
	things []string
	index  int
}

// Scan advances to the next line and reports whether there was one. Once the
// last line has been reached it returns false and leaves the position alone.
func (it *LinesScanner) Scan() bool {
	it.Lock()
	defer it.Unlock()
	if it.index >= len(it.things) {
		return false
	}
	it.index++
	return true
}

// UnScan steps back one line and reports whether it could. At the beginning
// it returns false and the position stays at zero.
func (it *LinesScanner) UnScan() bool {
	it.Lock()
	defer it.Unlock()
	if it.index < 1 {
		it.index = 0
		return false
	}
	it.index--
	return true
}

// current returns the line at the cursor. If there is no current line (Scan
// has not been called yet, or the position is out of range) it prints a
// diagnostic and returns the empty string instead of indexing out of bounds.
func (it *LinesScanner) current() string {
	it.Lock()
	defer it.Unlock()
	if it.index < 1 || it.index > len(it.things) {
		fmt.Println("Next() error in LinesScanner", it.index)
		return ""
	}
	return it.things[it.index-1]
}

// NextRaw returns the current line unmodified, or "" when there is no current
// line.
func (it *LinesScanner) NextRaw() string {
	return it.current()
}

// Next returns the current line with leading and trailing whitespace removed,
// or "" when there is no current line.
func (it *LinesScanner) Next() string {
	return strings.TrimSpace(it.current())
}

// END DEFINE THE ITERATOR
// inlineBlockCommentRE matches a single-line /* ... */ comment whose text
// contains no '*' character. It is compiled once at package scope because
// commentPreprocessor is called for every line of a file.
var inlineBlockCommentRE = regexp.MustCompile(`/\*([^*]+)\*/`)

// commentPreprocessor moves single-line C-style block comments (/* ... */) to
// the end of the line as "//" comments.
//
// Every matching block comment is removed from the line, at most one trailing
// space is trimmed, and each comment's trimmed text is appended as " // text"
// in the order it appeared. Whitespace left where a comment was removed is not
// collapsed, so "/* test */ reserved 4;" becomes " reserved 4; // test".
// Comments whose text contains '*' (for example "/** doc */") are not matched
// and are left in place.
func commentPreprocessor(line string) string {
	matches := inlineBlockCommentRE.FindAllStringSubmatch(line, -1)

	// Remove the block comments from the original line.
	line = inlineBlockCommentRE.ReplaceAllString(line, "")
	line = strings.TrimSuffix(line, " ")

	// Re-attach the comment texts at the end as line comments.
	for _, match := range matches {
		line += " // " + strings.TrimSpace(match[1])
	}
	return line
}
// commentPreprocessorFull replaces every C-style block comment (/* ... */) in
// full, including multi-line ones, with one "//" comment per original line.
//
// For each line of a matched comment the surrounding whitespace is trimmed,
// the opening "/*" (with any extra '*'), a leading "*" or a leading "*/" is
// removed, and the remainder is prefixed with "// ". The closing "*/" is also
// stripped when it shares its line with text, so "text */" becomes "// text"
// rather than "// text */". Lines left with no text become a bare "//".
// The replacement is done in place; text outside the comments is untouched.
func commentPreprocessorFull(full string) string {
	re := regexp.MustCompile(`(?s)/\*(.*?)\*/`)
	return re.ReplaceAllStringFunc(full, func(s string) string {
		log.Info("FOUND:\n", s)
		lines := strings.Split(s, "\n")
		cleaned := make([]string, 0, len(lines))
		for i, line := range lines {
			trimmed := strings.TrimSpace(line)
			if i == len(lines)-1 {
				// The match always ends with the closing marker; drop it so it
				// does not leak into the generated line comment.
				trimmed = strings.TrimSpace(strings.TrimSuffix(trimmed, "*/"))
			}
			switch {
			case strings.HasPrefix(trimmed, "/*"):
				trimmed = trimCommentPrefix(trimmed)
			case strings.HasPrefix(trimmed, "*/"):
				trimmed = strings.TrimPrefix(trimmed, "*/")
			case strings.HasPrefix(trimmed, "*"):
				trimmed = strings.TrimPrefix(trimmed, "*")
			}
			cleaned = append(cleaned, strings.TrimSpace("// "+trimmed))
		}
		s = strings.Join(cleaned, "\n")
		log.Info("NOW:\n", s)
		return s
	})
}
// trimCommentPrefix strips a leading block-comment marker from line and
// returns the remaining text with surrounding whitespace removed.
//
//   - a line that is exactly "*/" yields ""
//   - a leading "/*" followed by any number of extra '*' is removed
//   - otherwise a single leading "*" is removed
//   - any other line is returned trimmed but otherwise unchanged
//
// The "*/" case is checked first; if the leading-"*" rule ran before it, a
// bare closing marker would come back as "/".
func trimCommentPrefix(line string) string {
	trimmed := strings.TrimSpace(line)
	if trimmed == "*/" {
		return ""
	}
	if rest, ok := strings.CutPrefix(trimmed, "/*"); ok {
		return strings.TrimSpace(strings.TrimLeft(rest, "*"))
	}
	if rest, ok := strings.CutPrefix(trimmed, "*"); ok {
		return strings.TrimSpace(rest)
	}
	return trimmed
}