2024-10-12 10:59:11 -05:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2024-10-13 01:13:19 -05:00
|
|
|
"fmt"
|
2024-10-12 10:59:11 -05:00
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"go.wit.com/lib/gui/shell"
|
2024-10-23 19:15:51 -05:00
|
|
|
pb "go.wit.com/lib/protobuf/virtbuf"
|
2024-10-12 10:59:11 -05:00
|
|
|
"go.wit.com/log"
|
2024-10-26 08:54:28 -05:00
|
|
|
"google.golang.org/protobuf/types/known/timestamppb"
|
2024-10-12 10:59:11 -05:00
|
|
|
)
|
|
|
|
|
2024-10-13 00:40:22 -05:00
|
|
|
func (h *HyperT) pollHypervisor() {
|
2024-10-22 17:27:24 -05:00
|
|
|
url := "http://" + h.pb.Hostname + ":2520/vms"
|
2024-10-12 12:45:43 -05:00
|
|
|
log.Log(POLL, "wget url =", url)
|
2024-10-12 10:59:11 -05:00
|
|
|
s := shell.Wget(url)
|
|
|
|
if s == nil {
|
|
|
|
return
|
|
|
|
}
|
2024-10-27 11:02:50 -05:00
|
|
|
|
2024-10-12 10:59:11 -05:00
|
|
|
var bytesSplice []byte
|
|
|
|
bytesSplice = s.Bytes()
|
|
|
|
// fmt.Fprintln(w, string(bytesSplice))
|
|
|
|
for _, line := range strings.Split(string(bytesSplice), "\n") {
|
|
|
|
if line == "" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
fields := strings.Fields(line)
|
|
|
|
if len(fields) < 2 {
|
2024-10-31 06:41:30 -05:00
|
|
|
log.Log(WARN, "locally defined:", h.pb.Hostname, fields)
|
2024-10-12 10:59:11 -05:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
state := fields[0]
|
|
|
|
name := fields[1]
|
2024-10-27 11:02:50 -05:00
|
|
|
if state == "OFF" {
|
2024-10-31 06:41:30 -05:00
|
|
|
log.Log(WARN, "locally defined:", h.pb.Hostname, fields)
|
2024-10-27 11:02:50 -05:00
|
|
|
// skip locally defined libvirt vms
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
h.lastDroplets[name] = time.Now()
|
|
|
|
// if _, ok := h.lastDroplets[name]; ok {
|
|
|
|
// h.lastDroplets[name] = time.Now()
|
|
|
|
// }
|
|
|
|
|
|
|
|
// try the protobuf
|
2024-10-31 06:41:30 -05:00
|
|
|
d := me.cluster.FindDropletByName(name)
|
2024-10-27 11:02:50 -05:00
|
|
|
if d == nil {
|
|
|
|
// not sure whawt now?
|
|
|
|
log.Log(WARN, name, "is unknown on", h.pb.Hostname, "state =", state)
|
|
|
|
log.Log(WARN, name, "this vm was probably started by hand using virtsh")
|
|
|
|
log.Log(WARN, name, "todo: import vm from libvrit")
|
|
|
|
continue
|
|
|
|
}
|
2024-10-12 10:59:11 -05:00
|
|
|
if state == "ON" {
|
2024-10-22 17:27:24 -05:00
|
|
|
log.Log(POLL, h.pb.Hostname, "STATE:", state, "HOST:", name, "rest:", fields[2:])
|
2024-10-26 08:54:28 -05:00
|
|
|
log.Log(INFO, "ALREADY RECORDED", d.Hostname)
|
2024-10-17 15:29:47 -05:00
|
|
|
|
2024-10-26 09:33:31 -05:00
|
|
|
// update the status to ON
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.State = pb.DropletState_ON
|
2024-10-17 15:29:47 -05:00
|
|
|
|
2024-10-26 09:33:31 -05:00
|
|
|
// set the LastPoll time to now
|
2024-10-26 08:54:28 -05:00
|
|
|
now := time.Now()
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.LastPoll = timestamppb.New(now)
|
2024-10-26 08:54:28 -05:00
|
|
|
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.Hypervisor == "" {
|
2024-10-17 15:29:47 -05:00
|
|
|
// this means the droplet was in the config file
|
|
|
|
// but this is the first time it's shown up as running
|
|
|
|
|
|
|
|
// this should mean a droplet is running where the config file says it probably should be running
|
2024-10-26 08:54:28 -05:00
|
|
|
if d.PreferredHypervisor == h.pb.Hostname {
|
2024-10-26 09:33:31 -05:00
|
|
|
log.Log(EVENT, "poll shows new droplet", d.Hostname, "(matches config hypervisor", h.pb.Hostname+")")
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.Hypervisor = h.pb.Hostname
|
2024-10-17 15:29:47 -05:00
|
|
|
continue
|
2024-10-12 10:59:11 -05:00
|
|
|
}
|
2024-10-17 15:29:47 -05:00
|
|
|
|
2024-10-26 09:33:31 -05:00
|
|
|
log.Log(EVENT, "poll shows new droplet", d.Hostname, "on", h.pb.Hostname, "(in config file without preferred hypervisor)")
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.Hypervisor = h.pb.Hostname
|
2024-10-22 19:19:22 -05:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-10-28 05:07:56 -05:00
|
|
|
// if this is blank, the droplet has probably never booted yet
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.Hypervisor == "" {
|
|
|
|
d.Current.Hypervisor = h.pb.Hostname
|
2024-10-28 05:07:56 -05:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-10-26 09:33:31 -05:00
|
|
|
// this means the droplet has moved
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.Hypervisor != h.pb.Hostname {
|
2024-10-26 08:54:28 -05:00
|
|
|
log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname)
|
2024-10-28 05:07:56 -05:00
|
|
|
// record the droplet migrated (or booted somewhere else? recording this is a work in progress)
|
|
|
|
me.cluster.DropletMoved(d, h.pb)
|
2024-10-22 19:19:22 -05:00
|
|
|
continue
|
2024-10-12 10:59:11 -05:00
|
|
|
}
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.Hypervisor = h.pb.Hostname
|
2024-10-12 10:59:11 -05:00
|
|
|
}
|
|
|
|
}
|
2024-10-27 11:02:50 -05:00
|
|
|
for name, t := range h.lastDroplets {
|
|
|
|
dur := time.Since(t)
|
|
|
|
if dur > me.hyperPollDelay {
|
2024-10-31 06:41:30 -05:00
|
|
|
d := me.cluster.FindDropletByName(name)
|
2024-10-28 08:06:14 -05:00
|
|
|
if d == nil {
|
|
|
|
log.Info("droplet has probably powered down", name, "but findDroplet returned nil")
|
|
|
|
// should delete this from h.lastDroplets
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// everthing below here is dumb and needs to be rethought
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.State != pb.DropletState_UNKNOWN {
|
|
|
|
d.Current.State = pb.DropletState_UNKNOWN
|
2024-10-28 08:06:14 -05:00
|
|
|
log.Info("set state UNKNOWN here", name)
|
|
|
|
}
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.State == pb.DropletState_UNKNOWN {
|
2024-10-28 08:06:14 -05:00
|
|
|
if dur > time.Minute*2 {
|
|
|
|
// what this means is the droplet probably wasn't migrated or the migrate failed
|
|
|
|
// where should this be checked? the status needs to be changed to OFF
|
2024-10-30 18:09:54 -05:00
|
|
|
s := shell.FormatDuration(dur)
|
|
|
|
log.Info("UNKNOWN state for more than 2 minutes (clearing out ?)", name, s)
|
2024-10-28 08:06:14 -05:00
|
|
|
|
|
|
|
// it might be safe to set the status to OFF here. not really. this poll needs
|
|
|
|
// to be moved somewhere else. there needs to be a new goroutine not tied to the
|
|
|
|
// hypervisor
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.State = pb.DropletState_OFF
|
2024-10-28 07:02:42 -05:00
|
|
|
}
|
2024-10-27 11:02:50 -05:00
|
|
|
}
|
2024-10-13 03:04:46 -05:00
|
|
|
}
|
|
|
|
}
|
2024-10-27 11:02:50 -05:00
|
|
|
h.lastpoll = time.Now()
|
|
|
|
h.killcount = 0 // poll worked. reset killcount
|
2024-10-13 03:04:46 -05:00
|
|
|
}
|
|
|
|
|
2024-10-13 01:13:19 -05:00
|
|
|
// check the state of the cluster and return a string
|
|
|
|
// that is intended to be sent to an uptime monitor like Kuma
|
2024-10-27 11:02:50 -05:00
|
|
|
func uptimeCheck() (bool, string) {
|
2024-10-13 01:13:19 -05:00
|
|
|
var good bool = true
|
2024-10-13 01:33:32 -05:00
|
|
|
var total int
|
2024-10-13 01:13:19 -05:00
|
|
|
var working int
|
|
|
|
var failed int
|
2024-10-27 02:29:45 -05:00
|
|
|
var missing []*pb.Droplet
|
2024-10-13 01:38:35 -05:00
|
|
|
var unknown int
|
2024-10-15 11:02:34 -05:00
|
|
|
var unknownList []string
|
2024-10-13 01:38:35 -05:00
|
|
|
|
2024-10-26 08:54:28 -05:00
|
|
|
for _, d := range me.cluster.Droplets {
|
2024-10-13 01:33:32 -05:00
|
|
|
total += 1
|
2024-10-26 08:54:28 -05:00
|
|
|
if d.StartState != pb.DropletState_ON {
|
2024-10-12 13:01:31 -05:00
|
|
|
continue
|
|
|
|
}
|
2024-10-31 06:41:30 -05:00
|
|
|
dur := time.Since(d.Current.LastPoll.AsTime()) // Calculate the elapsed time
|
|
|
|
if d.Current.State == pb.DropletState_UNKNOWN {
|
2024-10-26 08:54:28 -05:00
|
|
|
// log.Info("SKIP. hostname has not been polled yet", d.Hostname, d.hname)
|
2024-10-13 01:38:35 -05:00
|
|
|
unknown += 1
|
2024-10-26 08:54:28 -05:00
|
|
|
unknownList = append(unknownList, d.Hostname)
|
2024-10-13 00:40:22 -05:00
|
|
|
continue
|
|
|
|
}
|
2024-10-22 18:19:21 -05:00
|
|
|
var hname string
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.Hypervisor != "" {
|
|
|
|
hname = d.Current.Hypervisor
|
2024-10-22 18:19:21 -05:00
|
|
|
}
|
2024-10-31 06:41:30 -05:00
|
|
|
if d.Current.State != pb.DropletState_ON {
|
|
|
|
log.Info("BAD STATE", d.StartState, d.Hostname, hname, "Current.State =", d.Current.State, shell.FormatDuration(dur))
|
2024-10-12 13:01:31 -05:00
|
|
|
good = false
|
2024-10-13 01:33:32 -05:00
|
|
|
failed += 1
|
2024-10-27 02:29:45 -05:00
|
|
|
missing = append(missing, d)
|
2024-10-12 13:01:31 -05:00
|
|
|
} else {
|
2024-10-31 06:41:30 -05:00
|
|
|
dur := time.Since(d.Current.LastPoll.AsTime()) // Calculate the elapsed time
|
2024-10-27 07:06:12 -05:00
|
|
|
if dur > me.missingDropletTimeout {
|
2024-10-26 08:54:28 -05:00
|
|
|
log.Info("GOOD STATE MISSING", d.Hostname, hname, shell.FormatDuration(dur))
|
2024-10-12 13:01:31 -05:00
|
|
|
good = false
|
2024-10-31 06:41:30 -05:00
|
|
|
d.Current.State = pb.DropletState_UNKNOWN
|
2024-10-13 01:13:19 -05:00
|
|
|
failed += 1
|
|
|
|
continue
|
2024-10-12 13:01:31 -05:00
|
|
|
}
|
2024-10-13 00:57:29 -05:00
|
|
|
l := shell.FormatDuration(dur)
|
2024-10-13 00:40:22 -05:00
|
|
|
if l == "" {
|
|
|
|
log.Info("DUR IS EMPTY", dur)
|
2024-10-27 02:29:45 -05:00
|
|
|
missing = append(missing, d)
|
2024-10-13 01:13:19 -05:00
|
|
|
continue
|
2024-10-13 00:40:22 -05:00
|
|
|
}
|
2024-10-13 01:13:19 -05:00
|
|
|
working += 1
|
2024-10-26 08:54:28 -05:00
|
|
|
// log.Info("GOOD STATE ON", d.Hostname, d.hname, "dur =", l)
|
2024-10-12 13:01:31 -05:00
|
|
|
}
|
|
|
|
}
|
2024-10-13 01:13:19 -05:00
|
|
|
var summary string = "("
|
2024-10-13 01:33:32 -05:00
|
|
|
summary += fmt.Sprintf("total = %d ", total)
|
2024-10-13 01:38:35 -05:00
|
|
|
summary += fmt.Sprintf("working = %d ", working)
|
2024-10-27 02:29:45 -05:00
|
|
|
if len(missing) > 0 {
|
|
|
|
summary += fmt.Sprintf("missing = %d ", len(missing))
|
2024-10-13 01:38:35 -05:00
|
|
|
}
|
|
|
|
if unknown > 0 {
|
2024-10-15 11:02:34 -05:00
|
|
|
summary += fmt.Sprintf("unknown = %d ", unknown, unknownList)
|
2024-10-13 01:13:19 -05:00
|
|
|
}
|
|
|
|
if failed > 0 {
|
2024-10-13 01:33:32 -05:00
|
|
|
summary += fmt.Sprintf("failed = %d ", failed)
|
2024-10-13 01:13:19 -05:00
|
|
|
}
|
2024-10-13 01:33:32 -05:00
|
|
|
summary = strings.TrimSpace(summary)
|
2024-10-13 01:13:19 -05:00
|
|
|
summary += ")"
|
2024-10-13 03:49:54 -05:00
|
|
|
if me.killcount > 0 {
|
|
|
|
summary += "(killcount=" + fmt.Sprintf("%d", me.killcount) + ")"
|
|
|
|
}
|
2024-10-13 04:34:55 -05:00
|
|
|
last := time.Since(me.unstable)
|
2024-10-28 11:44:53 -05:00
|
|
|
s := strings.TrimSpace(shell.FormatDuration(last))
|
2024-10-27 02:29:45 -05:00
|
|
|
if last > me.clusterStableDuration {
|
2024-10-13 03:49:54 -05:00
|
|
|
// the cluster has not been stable for 10 seconds
|
2024-10-13 04:34:55 -05:00
|
|
|
summary += "(stable=" + s + ")"
|
2024-10-28 11:44:53 -05:00
|
|
|
} else {
|
|
|
|
summary += "(unstable=" + s + ")"
|
2024-10-13 03:49:54 -05:00
|
|
|
}
|
2024-10-27 02:29:45 -05:00
|
|
|
for _, d := range missing {
|
2024-10-31 06:41:30 -05:00
|
|
|
summary += fmt.Sprint("\nmissing droplet: ", d.Hostname, " current state ", d.Current.State)
|
2024-10-27 02:29:45 -05:00
|
|
|
}
|
2024-10-13 01:13:19 -05:00
|
|
|
if good {
|
|
|
|
return good, "GOOD=true " + summary
|
|
|
|
}
|
2024-10-28 07:02:42 -05:00
|
|
|
// me.unstable = time.Now()
|
2024-10-13 01:13:19 -05:00
|
|
|
return good, "GOOD=false " + summary
|
2024-10-12 13:01:31 -05:00
|
|
|
}
|