package main

import (
	"fmt"
	"strings"
	"time"

	"go.wit.com/lib/gui/shell"
	pb "go.wit.com/lib/protobuf/virtbuf"
	"go.wit.com/log"
	"google.golang.org/protobuf/types/known/timestamppb"
)

// pollHypervisor fetches the VM list from the hypervisor's HTTP endpoint
// (http://<hostname>:2520/vms) and updates the matching droplets.
//
// Each non-empty line of the response is split on whitespace; the first field
// is the VM state and the second is the VM name. Lines with state "OFF" are
// skipped. Every other VM is timestamped in h.lastDroplets, and VMs reported
// as "ON" that are known to findDroplet get their CurrentState, LastPoll and
// CurrentHypervisor updated.
//
// After the response is processed, every entry in h.lastDroplets that has not
// been refreshed within me.hyperPollDelay is downgraded to UNKNOWN, and to OFF
// once it has been stale for more than two minutes.
//
// If the fetch returns nil the function returns early without touching
// h.lastpoll or h.killcount.
func (h *HyperT) pollHypervisor() {
	url := "http://" + h.pb.Hostname + ":2520/vms"
	log.Log(POLL, "wget url =", url)
	s := shell.Wget(url)
	if s == nil {
		return
	}

	for _, line := range strings.Split(string(s.Bytes()), "\n") {
		// An empty line yields zero fields, so this also skips blank lines.
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		state, name := fields[0], fields[1]
		if state == "OFF" {
			// skip locally defined libvirt vms
			continue
		}
		h.lastDroplets[name] = time.Now()

		// try the protobuf
		d := findDroplet(name)
		if d == nil {
			// not sure what to do now
			log.Log(WARN, name, "is unknown on", h.pb.Hostname, "state =", state)
			log.Log(WARN, name, "this vm was probably started by hand using virtsh")
			log.Log(WARN, name, "todo: import vm from libvrit")
			continue
		}
		if state != "ON" {
			continue
		}

		log.Log(POLL, h.pb.Hostname, "STATE:", state, "HOST:", name, "rest:", fields[2:])
		log.Log(INFO, "ALREADY RECORDED", d.Hostname)

		// update the status to ON and set the LastPoll time to now
		d.CurrentState = pb.DropletState_ON
		d.LastPoll = timestamppb.New(time.Now())

		switch {
		case d.CurrentHypervisor == "":
			// The droplet was in the config file but this is the first time
			// it has shown up as running.
			if d.PreferredHypervisor == h.pb.Hostname {
				// It is running where the config file says it should be.
				log.Log(EVENT, "poll shows new droplet", d.Hostname, "(matches config hypervisor", h.pb.Hostname+")")
			} else {
				log.Log(EVENT, "poll shows new droplet", d.Hostname, "on", h.pb.Hostname, "(in config file without preferred hypervisor)")
			}
			d.CurrentHypervisor = h.pb.Hostname
		case d.CurrentHypervisor != h.pb.Hostname:
			// this means the droplet has moved
			log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname)
			// record the droplet migrated (or booted somewhere else?
			// recording this is a work in progress)
			me.cluster.DropletMoved(d, h.pb)
		}
	}

	// Age out droplets this hypervisor has stopped reporting.
	for name, t := range h.lastDroplets {
		dur := time.Since(t)
		if dur <= me.hyperPollDelay {
			continue
		}
		d := findDroplet(name)
		if d == nil {
			log.Info("droplet has probably powered down", name, "but findDroplet returned nil")
			// TODO: should delete this from h.lastDroplets
			continue
		}

		// NOTE(review): everything below here needs to be rethought. A droplet
		// already set to OFF is flipped back to UNKNOWN and then to OFF again
		// on every poll while its entry stays stale.
		if d.CurrentState != pb.DropletState_UNKNOWN {
			d.CurrentState = pb.DropletState_UNKNOWN
			log.Info("set state UNKNOWN here", name)
		}
		if dur > 2*time.Minute {
			// What this means is the droplet probably wasn't migrated or the
			// migrate failed. It might be safe to set the status to OFF here,
			// but not really: this check should move to a goroutine that is
			// not tied to a single hypervisor.
			log.Info("UNKNOWN state for more than 2 minutes (clearing out ?)", name, shell.FormatDuration(dur))
			d.CurrentState = pb.DropletState_OFF
		}
	}

	h.lastpoll = time.Now()
	h.killcount = 0 // poll worked. reset killcount
}

// uptimeCheck checks the state of the cluster and returns a health flag plus a
// summary string that is intended to be sent to an uptime monitor like Kuma.
//
// Only droplets whose StartState is ON are evaluated (all droplets count
// toward the total). The returned bool is false when any evaluated droplet is
// in a state other than ON/UNKNOWN, or is ON but has not been polled within
// me.missingDropletTimeout; in the latter case the droplet's CurrentState is
// also set to UNKNOWN. The string is "GOOD=true " or "GOOD=false " followed by
// the counters, the killcount (when non-zero), the stable/unstable duration
// and one line per missing droplet.
func uptimeCheck() (bool, string) {
	good := true
	var total, working, failed, unknown int
	var missing []*pb.Droplet
	var unknownList []string

	for _, d := range me.cluster.Droplets {
		total++
		if d.StartState != pb.DropletState_ON {
			continue
		}
		dur := time.Since(d.LastPoll.AsTime()) // time elapsed since the last poll
		hname := d.CurrentHypervisor

		switch d.CurrentState {
		case pb.DropletState_UNKNOWN:
			// The droplet is not counted as failed while its state is unknown.
			unknown++
			unknownList = append(unknownList, d.Hostname)
		case pb.DropletState_ON:
			if dur > me.missingDropletTimeout {
				log.Info("GOOD STATE MISSING", d.Hostname, hname, shell.FormatDuration(dur))
				good = false
				d.CurrentState = pb.DropletState_UNKNOWN
				failed++
			} else if shell.FormatDuration(dur) == "" {
				log.Info("DUR IS EMPTY", dur)
				missing = append(missing, d)
			} else {
				working++
			}
		default:
			log.Info("BAD STATE", d.StartState, d.Hostname, hname, "CurrentState =", d.CurrentState, shell.FormatDuration(dur))
			good = false
			failed++
			missing = append(missing, d)
		}
	}

	parts := []string{
		fmt.Sprintf("total = %d", total),
		fmt.Sprintf("working = %d", working),
	}
	if len(missing) > 0 {
		parts = append(parts, fmt.Sprintf("missing = %d", len(missing)))
	}
	if unknown > 0 {
		// Include the hostnames so the monitor shows which droplets are unknown.
		parts = append(parts, fmt.Sprintf("unknown = %d %v", unknown, unknownList))
	}
	if failed > 0 {
		parts = append(parts, fmt.Sprintf("failed = %d", failed))
	}
	summary := "(" + strings.Join(parts, " ") + ")"

	if me.killcount > 0 {
		summary += fmt.Sprintf("(killcount=%d)", me.killcount)
	}

	last := time.Since(me.unstable)
	s := strings.TrimSpace(shell.FormatDuration(last))
	if last > me.clusterStableDuration {
		// The last unstable event is older than the configured duration.
		summary += "(stable=" + s + ")"
	} else {
		summary += "(unstable=" + s + ")"
	}

	for _, d := range missing {
		summary += fmt.Sprint("\nmissing droplet: ", d.Hostname, " current state ", d.CurrentState)
	}

	if good {
		return good, "GOOD=true " + summary
	}
	// me.unstable = time.Now()
	return good, "GOOD=false " + summary
}