Compare commits

..

2 Commits

Author SHA1 Message Date
Ian Fijolek
e4e6618af1 Adds ability to run specified alerts on startup
This is helpful to determine if your alerts are valid before an actual failure
2024-04-03 11:50:49 -07:00
Ian Fijolek
6a2b44673e Upgrade prometheus client and protobuf 2024-04-03 11:28:01 -07:00
7 changed files with 142 additions and 262 deletions
+6 -6
View File
@@ -4,7 +4,7 @@ go 1.20
require (
git.iamthefij.com/iamthefij/slog v1.3.0
github.com/prometheus/client_golang v1.15.0
github.com/prometheus/client_golang v1.19.0
gopkg.in/yaml.v2 v2.4.0
)
@@ -14,10 +14,10 @@ require (
github.com/golang/protobuf v1.5.3 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.48.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rogpeppe/go-internal v1.10.0 // indirect
golang.org/x/sys v0.6.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
golang.org/x/sys v0.16.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
)
+14
View File
@@ -182,12 +182,16 @@ github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrb
github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y=
github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM=
github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk=
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4=
github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc=
@@ -195,6 +199,8 @@ github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+
github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA=
github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM=
github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc=
github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE=
github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
@@ -203,6 +209,8 @@ github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1
github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4=
github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI=
github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY=
github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
@@ -365,6 +373,8 @@ golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -504,6 +514,10 @@ google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqw
google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I=
google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-72
View File
@@ -1,72 +0,0 @@
package main
import (
"fmt"
"io"
"net/http"
"strings"
)
type HealthCheckHandler struct {
isMinitorHealthy bool
monitors []*Monitor
}
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
return &HealthCheckHandler{
false,
monitors,
}
}
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
hch.isMinitorHealthy = healthy
}
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
if hch.isMinitorHealthy {
return true, "OK"
} else {
return false, "UNHEALTHY"
}
}
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
downMonitors := []string{}
for _, monitor := range hch.monitors {
if !monitor.IsUp() {
downMonitors = append(downMonitors, monitor.Name)
}
}
if len(downMonitors) == 0 {
return true, "OK"
} else {
return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
}
}
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
var healthy bool
var body string
if monitors := r.URL.Query().Get("monitors"); monitors != "" {
healthy, body = hch.MonitorsHealthCheck()
} else {
healthy, body = hch.MinitorHealthCheck()
}
if healthy {
w.WriteHeader(http.StatusOK)
} else {
w.WriteHeader(http.StatusServiceUnavailable)
}
_, _ = io.WriteString(w, body)
}
func HandleHealthCheck() {
http.Handle("/metrics", HealthChecks)
}
-79
View File
@@ -1,79 +0,0 @@
package main
import (
"testing"
)
func TestNewHealthCheck(t *testing.T) {
monitors := []*Monitor{
{Name: "Test Monitor"},
}
hc := NewHealthCheckHandler(monitors)
monitors[0].alertCount++
if healthy, _ := hc.MinitorHealthCheck(); healthy {
t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
}
if healthy, _ := hc.MonitorsHealthCheck(); healthy {
t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
}
}
func TestMinitorHealthCheck(t *testing.T) {
monitors := []*Monitor{
{Name: "Test Monitor"},
}
hc := NewHealthCheckHandler(monitors)
t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
hc.MinitorHealthy(true)
healthy, body := hc.MinitorHealthCheck()
if !healthy {
t.Errorf("Expected healthy check")
}
if body != "OK" {
t.Errorf("Expected OK response")
}
})
t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
hc.MinitorHealthy(false)
healthy, body := hc.MinitorHealthCheck()
if healthy {
t.Errorf("Expected healthy check")
}
if body != "UNHEALTHY" {
t.Errorf("Expected UNHEALTHY response")
}
})
}
func TestMonitorsHealthCheck(t *testing.T) {
monitors := []*Monitor{
{Name: "Test Monitor"},
}
hc := NewHealthCheckHandler(monitors)
t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
healthy, body := hc.MonitorsHealthCheck()
if !healthy {
t.Errorf("Expected healthy check")
}
if body != "OK" {
t.Errorf("Expected OK response")
}
})
t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
monitors[0].alertCount++
healthy, body := hc.MonitorsHealthCheck()
if healthy {
t.Errorf("Expected healthy check")
}
if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
t.Errorf("Expected UNHEALTHY response")
}
})
}
+35 -38
View File
@@ -4,7 +4,7 @@ import (
"errors"
"flag"
"fmt"
"net/http"
"strings"
"time"
"git.iamthefij.com/iamthefij/slog"
@@ -17,10 +17,6 @@ var (
MetricsPort = 8080
// Metrics contains all active metrics
Metrics = NewMetrics()
// Self monitor rather than panicing
SelfMonitor = false
// HealthChecks contains health check values
HealthChecks *HealthCheckHandler = nil
// PyCompat enables support for legacy Python templates
PyCompat = false
@@ -56,13 +52,7 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
output,
)
if SelfMonitor {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
}
return err
} else {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
}
// Count alert metrics
@@ -80,8 +70,6 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
func checkMonitors(config *Config) error {
// TODO: Run this in goroutines and capture exceptions
healthy := true
for _, monitor := range config.Monitors {
if monitor.ShouldCheck() {
success, alertNotice := monitor.Check()
@@ -93,42 +81,54 @@ func checkMonitors(config *Config) error {
if alertNotice != nil {
err := sendAlerts(config, monitor, alertNotice)
// If there was an error in sending an alert, mark as unhealthy or bubble up
// If there was an error in sending an alert, exit early and bubble it up
if err != nil {
if SelfMonitor {
healthy = false
} else {
return err
}
}
}
}
return nil
}
if HealthChecks != nil {
HealthChecks.MinitorHealthy(healthy)
func sendFirstRunAlerts(config *Config, alertNames []string) error {
for _, alertName := range alertNames {
var err error
alert, ok := config.Alerts[alertName]
if !ok {
err = fmt.Errorf("unknown alert %s: %w", alertName, errUnknownAlert)
}
if err == nil {
_, err = alert.Send(AlertNotice{
AlertCount: 0,
FailureCount: 0,
IsUp: true,
LastSuccess: time.Now(),
MonitorName: fmt.Sprintf("First Run Alert Test: %s", alert.Name),
LastCheckOutput: "",
})
}
if err != nil {
return err
}
}
return nil
}
// ServeMetricsAndHealth starts the default http server
func ServeMetricsAndHealth() {
host := fmt.Sprintf(":%d", MetricsPort)
_ = http.ListenAndServe(host, nil)
}
func main() {
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
firstRunAlerts := flag.String("first-run-alerts", "", "List of alerts to run on startup. This can help determine unhealthy alerts early on. (default \"\")")
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
flag.Parse()
// Print version if flag is provided
@@ -145,19 +145,16 @@ func main() {
// Serve metrics exporter, if specified
if ExportMetrics {
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
HandleMetrics()
go ServeMetrics()
}
if SelfMonitor {
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
if *firstRunAlerts != "" {
alertNames := strings.Split(*firstRunAlerts, ",")
HealthChecks = NewHealthCheckHandler(config.Monitors)
err = sendFirstRunAlerts(&config, alertNames)
HandleHealthCheck()
}
if ExportMetrics || SelfMonitor {
go ServeMetricsAndHealth()
slog.OnErrPanicf(err, "Error running first run alerts")
}
// Start main loop
+61 -46
View File
@@ -7,7 +7,6 @@ func TestCheckMonitors(t *testing.T) {
config Config
expectErr bool
name string
selfMonitor bool
}{
{
config: Config{},
@@ -25,7 +24,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: false,
name: "Monitor success, no alerts",
selfMonitor: false,
},
{
config: Config{
@@ -39,7 +37,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: false,
name: "Monitor failure, no alerts",
selfMonitor: false,
},
{
config: Config{
@@ -53,7 +50,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: false,
name: "Monitor recovery, no alerts",
selfMonitor: false,
},
{
config: Config{
@@ -68,7 +64,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: true,
name: "Monitor failure, unknown alerts",
selfMonitor: false,
},
{
config: Config{
@@ -83,22 +78,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: true,
name: "Monitor recovery, unknown alerts",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Success",
Command: CommandOrShell{Command: []string{"true"}},
AlertUp: []string{"unknown"},
alertCount: 1,
},
},
},
expectErr: false,
name: "Monitor recovery, unknown alerts, with Health Check",
selfMonitor: true,
},
{
config: Config{
@@ -118,7 +97,6 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: false,
name: "Monitor failure, successful alert",
selfMonitor: false,
},
{
config: Config{
@@ -139,34 +117,10 @@ func TestCheckMonitors(t *testing.T) {
},
expectErr: true,
name: "Monitor failure, bad alert",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Failure",
Command: CommandOrShell{Command: []string{"false"}},
AlertDown: []string{"bad"},
AlertAfter: 1,
},
},
Alerts: map[string]*Alert{
"bad": {
Name: "bad",
Command: CommandOrShell{Command: []string{"false"}},
},
},
},
expectErr: false,
name: "Monitor failure, bad alert, with Health Check",
selfMonitor: true,
},
}
for _, c := range cases {
SelfMonitor = c.selfMonitor
err := c.config.Init()
if err != nil {
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
@@ -180,3 +134,64 @@ func TestCheckMonitors(t *testing.T) {
}
}
}
func TestFirstRunAlerts(t *testing.T) {
cases := []struct {
config Config
expectErr bool
firstRunAlerts []string
name string
}{
{
config: Config{},
expectErr: false,
firstRunAlerts: []string{},
name: "Empty",
},
{
config: Config{},
expectErr: true,
firstRunAlerts: []string{"missing"},
name: "Unknown",
},
{
config: Config{
Alerts: map[string]*Alert{
"good": {
Command: CommandOrShell{Command: []string{"true"}},
},
},
},
expectErr: false,
firstRunAlerts: []string{"good"},
name: "Successful alert",
},
{
config: Config{
Alerts: map[string]*Alert{
"bad": {
Name: "bad",
Command: CommandOrShell{Command: []string{"false"}},
},
},
},
expectErr: true,
firstRunAlerts: []string{"bad"},
name: "Failed alert",
},
}
for _, c := range cases {
err := c.config.Init()
if err != nil {
t.Errorf("sendFirstRunAlerts(%s): unexpected error reading config: %v", c.name, err)
}
err = sendFirstRunAlerts(&c.config, c.firstRunAlerts)
if err == nil && c.expectErr {
t.Errorf("sendFirstRunAlerts(%s): Expected error, the code did not error", c.name)
} else if err != nil && !c.expectErr {
t.Errorf("sendFirstRunAlerts(%s): Did not expect an error, but we got one anyway: %v", c.name, err)
}
}
}
+7 -2
View File
@@ -1,6 +1,7 @@
package main
import (
"fmt"
"net/http"
"github.com/prometheus/client_golang/prometheus"
@@ -106,7 +107,11 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
).Inc()
}
// HandleMetrics add Prometheus metrics handler to default http server
func HandleMetrics() {
// ServeMetrics starts an http server with a Prometheus metrics handler
func ServeMetrics() {
http.Handle("/metrics", promhttp.Handler())
host := fmt.Sprintf(":%d", MetricsPort)
_ = http.ListenAndServe(host, nil)
}