virtigo/poll.go

244 lines
7.1 KiB
Go
Raw Normal View History

package main
import (
"fmt"
"strings"
"time"
"go.wit.com/lib/gui/shell"
pb "go.wit.com/lib/protobuf/virtbuf"
"go.wit.com/log"
"google.golang.org/protobuf/types/known/timestamppb"
)
// findHypervisorByName scans me.hypers for the entry whose protobuf
// Hostname equals name and returns it. It returns nil when no entry matches.
func findHypervisorByName(name string) *HyperT {
	for i := range me.hypers {
		if candidate := me.hypers[i]; candidate.pb.Hostname == name {
			return candidate
		}
	}
	return nil
}
// pollHypervisor fetches the VM list from this hypervisor
// (http://<hostname>:2520/vms) and reconciles it with the cluster's
// droplet records.
//
// Every non-empty line of the response must contain at least two
// whitespace-separated fields: the state ("ON" or "OFF") followed by the
// droplet name. Lines with any other state value are only recorded in
// h.lastDroplets and otherwise ignored.
//
// After the response is processed, every droplet in h.lastDroplets that has
// not been reported for longer than me.hyperPollDelay is aged out: first to
// UNKNOWN, and after two minutes to OFF.
//
// If the fetch returns nothing the function returns early and neither
// h.lastpoll nor h.killcount is updated.
func (h *HyperT) pollHypervisor() {
	url := "http://" + h.pb.Hostname + ":2520/vms"
	log.Log(POLL, "wget url =", url)
	s := shell.Wget(url)
	if s == nil {
		return
	}

	for _, line := range strings.Split(string(s.Bytes()), "\n") {
		if line == "" {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 2 {
			log.Log(WARN, "unknown:", h.pb.Hostname, fields)
			continue
		}
		state, name := fields[0], fields[1]

		d := me.cluster.FindDropletByName(name)
		if d == nil {
			// the hypervisor reports a domain the cluster does not know about
			log.Log(WARN, name, "local defined domain")
			// NOTE(review): the message is emitted three times, presumably so it
			// stands out in the log — confirm before collapsing
			log.Log(WARN, name, "local Adding new entry with AddDropletLocal()")
			log.Log(WARN, name, "local Adding new entry with AddDropletLocal()")
			log.Log(WARN, name, "local Adding new entry with AddDropletLocal()")
			me.cluster.AddDropletLocal(name, h.pb.Hostname)
			continue
		}

		start := d.SprintHeader()
		// remember when this hypervisor last reported the droplet
		h.lastDroplets[name] = time.Now()

		switch state {
		case "OFF":
			if d.Current.Hypervisor == "" {
				d.Current.Hypervisor = h.pb.Hostname
			}
			if d.LocalOnly == "" {
				log.Log(WARN, start, "local domain is a duplicate (need to resolve this)")
				continue
			}
			log.Log(WARN, start, "local domain ready to import from hypervisor")

		case "ON":
			log.Log(POLL, start, "STATE:", state, "rest:", fields[2:])

			// update the status to ON and set the LastPoll time to now
			d.SetState(pb.DropletState_ON)
			d.Current.LastPoll = timestamppb.New(time.Now())

			if d.Current.Hypervisor == "" {
				// the droplet was in the config file but this is the first
				// time it has shown up as running
				if d.PreferredHypervisor == h.pb.Hostname {
					// running where the config file says it should be
					log.Log(EVENT, start, "poll shows new droplet", d.Hostname,
						"(matches config hypervisor", h.pb.Hostname+")")
				} else {
					log.Log(EVENT, start, "poll shows new droplet (in config file without preferred hypervisor)")
				}
				d.Current.Hypervisor = h.pb.Hostname
				continue
			}

			// this means the droplet has moved
			if d.Current.Hypervisor != h.pb.Hostname {
				log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname)
				// record the droplet migrated (or booted somewhere else?
				// recording this is a work in progress)
				me.cluster.DropletMoved(d, h.pb)
				continue
			}
			// otherwise the droplet is already recorded on this hypervisor
		}
	}

	// these are the droplets that don't exist anymore on this hypervisor
	// this should mean you ran shutdown within domU
	for name, t := range h.lastDroplets {
		dur := time.Since(t)
		if dur <= me.hyperPollDelay {
			continue
		}

		d := me.cluster.FindDropletByName(name)
		if d == nil {
			// The nil check must come before any method call on d. The stale
			// entry is dropped so it is not reported again on every poll
			// (deleting during range is permitted for Go maps).
			log.Info("droplet has probably powered down", name, "but findDroplet returned nil")
			delete(h.lastDroplets, name)
			continue
		}
		header := d.SprintHeader()

		if d.Current.State == pb.DropletState_OFF {
			log.Info(header, "droplet timed out and is off. remove from h.lastDroplets[] slice")
			delete(h.lastDroplets, name)
			continue
		}

		// everything below here is dumb and needs to be rethought
		if d.Current.State != pb.DropletState_UNKNOWN {
			d.SetState(pb.DropletState_UNKNOWN)
			log.Info(header, "set state UNKNOWN here", name)
		}
		if d.Current.State == pb.DropletState_UNKNOWN && dur > time.Minute*2 {
			// what this means is the droplet probably wasn't migrated or the migrate failed
			// where should this be checked? the status needs to be changed to OFF
			s := pb.FormatDuration(dur)
			log.Info(header, "UNKNOWN state for more than 2 minutes (clearing out ?)", name, s)
			// it might be safe to set the status to OFF here. not really. this poll needs
			// to be moved somewhere else. there needs to be a new goroutine not tied to the
			// hypervisor
			d.SetState(pb.DropletState_OFF)
		}
	}

	h.lastpoll = time.Now()
	h.killcount = 0 // poll worked. reset killcount
}
// uptimeCheck inspects every droplet in the cluster and builds a one-line
// (plus optional detail lines) summary intended to be sent to an uptime
// monitor like Kuma.
//
// Only droplets whose StartState is ON are evaluated; all droplets count
// toward the total. The returned bool is false when any evaluated droplet
// is OFF, is in an unexpected state, or is ON but has not been polled
// within me.missingDropletTimeout (such a droplet is also switched to
// UNKNOWN). The returned string starts with "GOOD=true " or "GOOD=false "
// followed by the summary.
func uptimeCheck() (bool, string) {
	var (
		good        = true
		total       int
		working     int
		failed      int
		unknown     int
		unknownList []string
		missing     []*pb.Droplet
	)

	loop := me.cluster.DropletsAll() // get the list of droplets
	for loop.Scan() {
		d := loop.Droplet()
		total++
		if d.StartState != pb.DropletState_ON {
			continue
		}

		dur := time.Since(d.Current.LastPoll.AsTime()) // elapsed time since the last poll
		hname := d.Current.Hypervisor

		switch d.Current.State {
		case pb.DropletState_UNKNOWN:
			// not counted as a failure; only reported in the summary
			unknown++
			unknownList = append(unknownList, d.Hostname)
		case pb.DropletState_ON:
			if dur > me.missingDropletTimeout {
				log.Info("GOOD STATE MISSING", d.Hostname, hname, pb.FormatDuration(dur))
				good = false
				d.SetState(pb.DropletState_UNKNOWN)
				failed++
				continue
			}
			if l := pb.FormatDuration(dur); l == "" {
				log.Info("DUR IS EMPTY", dur)
				missing = append(missing, d)
				continue
			}
			working++
		case pb.DropletState_OFF:
			log.Info("OFF STATE", d.StartState, d.Hostname, hname, pb.FormatDuration(dur))
			good = false
			failed++
		default:
			log.Info("WTF STATE", d.StartState, d.Hostname, hname, "Current.State =", d.Current.State, pb.FormatDuration(dur))
			good = false
			failed++
			missing = append(missing, d)
		}
	}

	var b strings.Builder
	b.WriteString("(")
	fmt.Fprintf(&b, "total = %d ", total)
	fmt.Fprintf(&b, "working = %d ", working)
	if len(missing) > 0 {
		fmt.Fprintf(&b, "missing = %d ", len(missing))
	}
	if unknown > 0 {
		// %v prints the hostnames; without a verb for the list the output
		// contained a "%!(EXTRA ...)" formatting error
		fmt.Fprintf(&b, "unknown = %d %v ", unknown, unknownList)
	}
	if failed > 0 {
		fmt.Fprintf(&b, "failed = %d ", failed)
	}
	summary := strings.TrimSpace(b.String()) + ")"

	if me.killcount > 0 {
		summary += fmt.Sprintf("(killcount=%d)", me.killcount)
	}

	last := time.Since(me.unstable)
	s := strings.TrimSpace(pb.FormatDuration(last))
	if last > me.clusterStableDuration {
		// the time since me.unstable exceeds me.clusterStableDuration:
		// report the cluster as stable for that long
		summary += "(stable=" + s + ")"
	} else {
		summary += "(unstable=" + s + ")"
	}

	for _, d := range missing {
		summary += fmt.Sprint("\nmissing droplet: ", d.Hostname, " current state ", d.Current.State)
	}

	if good {
		return good, "GOOD=true " + summary
	}
	// me.unstable = time.Now()
	return good, "GOOD=false " + summary
}