package main import ( "fmt" "strings" "time" "go.wit.com/lib/gui/shell" pb "go.wit.com/lib/protobuf/virtbuf" "go.wit.com/log" "google.golang.org/protobuf/types/known/timestamppb" ) func findHypervisorByName(name string) *HyperT { for _, h := range me.hypers { if h.pb.Hostname == name { return h } } return nil } func (h *HyperT) pollHypervisor() { url := "http://" + h.pb.Hostname + ":2520/vms" log.Log(POLL, "wget url =", url) s := shell.Wget(url) if s == nil { return } var bytesSplice []byte bytesSplice = s.Bytes() // fmt.Fprintln(w, string(bytesSplice)) for _, line := range strings.Split(string(bytesSplice), "\n") { if line == "" { continue } fields := strings.Fields(line) if len(fields) < 2 { log.Log(WARN, "unknown:", h.pb.Hostname, fields) continue } state := fields[0] name := fields[1] d := me.cluster.FindDropletByName(name) if d == nil { log.Log(WARN, name, "local defined domain") log.Log(WARN, name, "local Adding new entry with AddDropletLocal()") log.Log(WARN, name, "local Adding new entry with AddDropletLocal()") log.Log(WARN, name, "local Adding new entry with AddDropletLocal()") me.cluster.AddDropletLocal(name, h.pb.Hostname) continue } start := d.SprintHeader() h.lastDroplets[name] = time.Now() if state == "OFF" { if d.Current.Hypervisor == "" { d.Current.Hypervisor = h.pb.Hostname } if d.LocalOnly == "" { log.Log(WARN, start, "local domain is a duplicate (need to resolve this)") continue } log.Log(WARN, start, "local domain ready to import from hypervisor") continue } if state == "ON" { log.Log(POLL, start, "STATE:", state, "rest:", fields[2:]) // update the status to ON d.SetState(pb.DropletState_ON) // set the LastPoll time to now now := time.Now() d.Current.LastPoll = timestamppb.New(now) if d.Current.Hypervisor == "" { // this means the droplet was in the config file // but this is the first time it's shown up as running // this should mean a droplet is running where the config file says it probably should be running if d.PreferredHypervisor == h.pb.Hostname { log.Log(EVENT, start, "poll shows new droplet", d.Hostname, "(matches config hypervisor", h.pb.Hostname+")") d.Current.Hypervisor = h.pb.Hostname continue } log.Log(EVENT, start, "poll shows new droplet (in config file without preferred hypervisor)") d.Current.Hypervisor = h.pb.Hostname continue } // if this is blank, the droplet has probably never booted yet if d.Current.Hypervisor == "" { d.Current.Hypervisor = h.pb.Hostname continue } // this means the droplet has moved if d.Current.Hypervisor != h.pb.Hostname { log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname) // record the droplet migrated (or booted somewhere else? recording this is a work in progress) me.cluster.DropletMoved(d, h.pb) continue } d.Current.Hypervisor = h.pb.Hostname } } for name, t := range h.lastDroplets { dur := time.Since(t) if dur > me.hyperPollDelay { d := me.cluster.FindDropletByName(name) if d == nil { log.Info("droplet has probably powered down", name, "but findDroplet returned nil") // should delete this from h.lastDroplets continue } // everthing below here is dumb and needs to be rethought if d.Current.State != pb.DropletState_UNKNOWN { d.SetState(pb.DropletState_UNKNOWN) log.Info("set state UNKNOWN here", name) } if d.Current.State == pb.DropletState_UNKNOWN { if dur > time.Minute*2 { // what this means is the droplet probably wasn't migrated or the migrate failed // where should this be checked? the status needs to be changed to OFF s := pb.FormatDuration(dur) log.Info("UNKNOWN state for more than 2 minutes (clearing out ?)", name, s) // it might be safe to set the status to OFF here. not really. this poll needs // to be moved somewhere else. there needs to be a new goroutine not tied to the // hypervisor d.SetState(pb.DropletState_OFF) } } } } h.lastpoll = time.Now() h.killcount = 0 // poll worked. reset killcount } // check the state of the cluster and return a string // that is intended to be sent to an uptime monitor like Kuma func uptimeCheck() (bool, string) { var good bool = true var total int var working int var failed int var missing []*pb.Droplet var unknown int var unknownList []string loop := me.cluster.DropletsAll() // get the list of droplets for loop.Scan() { d := loop.Droplet() total += 1 if d.StartState != pb.DropletState_ON { continue } dur := time.Since(d.Current.LastPoll.AsTime()) // Calculate the elapsed time var hname string if d.Current.Hypervisor != "" { hname = d.Current.Hypervisor } switch d.Current.State { case pb.DropletState_UNKNOWN: // log.Info("SKIP. hostname has not been polled yet", d.Hostname, d.hname) unknown += 1 unknownList = append(unknownList, d.Hostname) case pb.DropletState_ON: if dur > me.missingDropletTimeout { log.Info("GOOD STATE MISSING", d.Hostname, hname, pb.FormatDuration(dur)) good = false d.SetState(pb.DropletState_UNKNOWN) failed += 1 continue } l := pb.FormatDuration(dur) if l == "" { log.Info("DUR IS EMPTY", dur) missing = append(missing, d) continue } working += 1 // log.Info("GOOD STATE ON", d.Hostname, d.hname, "dur =", l) case pb.DropletState_OFF: log.Info("OFF STATE", d.StartState, d.Hostname, hname, pb.FormatDuration(dur)) good = false failed += 1 // missing = append(missing, d) default: log.Info("WTF STATE", d.StartState, d.Hostname, hname, "Current.State =", d.Current.State, pb.FormatDuration(dur)) good = false failed += 1 missing = append(missing, d) } } var summary string = "(" summary += fmt.Sprintf("total = %d ", total) summary += fmt.Sprintf("working = %d ", working) if len(missing) > 0 { summary += fmt.Sprintf("missing = %d ", len(missing)) } if unknown > 0 { summary += fmt.Sprintf("unknown = %d ", unknown, unknownList) } if failed > 0 { summary += fmt.Sprintf("failed = %d ", failed) } summary = strings.TrimSpace(summary) summary += ")" if me.killcount > 0 { summary += "(killcount=" + fmt.Sprintf("%d", me.killcount) + ")" } last := time.Since(me.unstable) s := strings.TrimSpace(pb.FormatDuration(last)) if last > me.clusterStableDuration { // the cluster has not been stable for 10 seconds summary += "(stable=" + s + ")" } else { summary += "(unstable=" + s + ")" } for _, d := range missing { summary += fmt.Sprint("\nmissing droplet: ", d.Hostname, " current state ", d.Current.State) } if good { return good, "GOOD=true " + summary } // me.unstable = time.Now() return good, "GOOD=false " + summary }