Compare commits

..

3 Commits

Author SHA1 Message Date
Ian Fijolek
0a36da79d6 Add health check and self reporting of health
This avoids panicking and instead provides an HTTP endpoint to report health
2024-04-03 11:23:26 -07:00
Ian Fijolek
01cca50532 Add tzdata
Allows setting container timezone using TZ env variable
2023-08-11 06:20:35 -07:00
Ian Fijolek
2789aa63e4 More loosely pins apk packages 2023-08-11 06:20:15 -07:00
8 changed files with 265 additions and 30 deletions
+1 -1
View File
@@ -5,7 +5,7 @@ RUN mkdir /app
WORKDIR /app/ WORKDIR /app/
# Add common checking tools # Add common checking tools
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9 RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
# Add minitor user for running as non-root # Add minitor user for running as non-root
RUN addgroup -S minitor && adduser -S minitor -G minitor RUN addgroup -S minitor && adduser -S minitor -G minitor
+1 -1
View File
@@ -22,7 +22,7 @@ WORKDIR /app/
COPY --from=builder /app/minitor . COPY --from=builder /app/minitor .
# Add common checking tools # Add common checking tools
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9 RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
# Add minitor user for running as non-root # Add minitor user for running as non-root
RUN addgroup -S minitor && adduser -S minitor -G minitor RUN addgroup -S minitor && adduser -S minitor -G minitor
+2
View File
@@ -46,6 +46,8 @@ docker run -v $PWD/config.yml:/app/config.yml iamthefij/minitor-go:latest
Images are provided for `amd64`, `arm`, and `arm64` architectures. Images are provided for `amd64`, `arm`, and `arm64` architectures.
Timezone configuration for the container is set by passing the `TZ` env variable, e.g. `TZ=America/Los_Angeles`.
## Configuring ## Configuring
In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file. In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
+72
View File
@@ -0,0 +1,72 @@
package main
import (
"fmt"
"io"
"net/http"
"strings"
)
// HealthCheckHandler is an http.Handler that reports either the health of
// Minitor itself or the health of the monitors it was created with.
type HealthCheckHandler struct {
// isMinitorHealthy holds the most recent value passed to MinitorHealthy.
isMinitorHealthy bool
// monitors is the set of monitors inspected by MonitorsHealthCheck.
monitors []*Monitor
}
// NewHealthCheckHandler returns a handler that reports on the given monitors.
// The handler starts out unhealthy and stays that way until MinitorHealthy(true)
// is called on it.
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
	handler := &HealthCheckHandler{
		isMinitorHealthy: false,
		monitors:         monitors,
	}

	return handler
}
// MinitorHealthy records whether Minitor itself should be reported as healthy.
// The stored value is what MinitorHealthCheck subsequently reports.
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
hch.isMinitorHealthy = healthy
}
// MinitorHealthCheck reports the health flag last stored via MinitorHealthy.
// It returns (true, "OK") when the flag is set and (false, "UNHEALTHY")
// otherwise.
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
	if !hch.isMinitorHealthy {
		return false, "UNHEALTHY"
	}

	return true, "OK"
}
// MonitorsHealthCheck inspects every monitor held by the handler and collects
// the names of those for which IsUp returns false. It returns (true, "OK")
// when none are down; otherwise it returns false together with a message
// listing the down monitors, comma separated, in slice order.
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
	failing := []string{}

	for _, m := range hch.monitors {
		if m.IsUp() {
			continue
		}

		failing = append(failing, m.Name)
	}

	if len(failing) > 0 {
		names := strings.Join(failing, ", ")

		return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", names)
	}

	return true, "OK"
}
// ServeHTTP implements http.Handler. When the request carries a non-empty
// "monitors" query parameter the response reflects MonitorsHealthCheck;
// otherwise it reflects MinitorHealthCheck. A healthy result is answered with
// 200 OK and an unhealthy one with 503 Service Unavailable; the check's
// message is written as the response body.
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	var (
		healthy bool
		body    string
	)

	if r.URL.Query().Get("monitors") == "" {
		healthy, body = hch.MinitorHealthCheck()
	} else {
		healthy, body = hch.MonitorsHealthCheck()
	}

	status := http.StatusServiceUnavailable
	if healthy {
		status = http.StatusOK
	}

	w.WriteHeader(status)

	// The write error is deliberately ignored: nothing useful can be done if
	// the client has gone away.
	_, _ = io.WriteString(w, body)
}
// HandleHealthCheck registers the global HealthChecks handler on the default
// http mux under "/health".
//
// The health endpoint must not share the "/metrics" pattern used by the
// Prometheus handler: http.Handle panics when the same pattern is registered
// twice, which would crash startup whenever both metrics exporting and
// self-monitoring are enabled.
func HandleHealthCheck() {
	http.Handle("/health", HealthChecks)
}
+79
View File
@@ -0,0 +1,79 @@
package main
import (
"testing"
)
// TestNewHealthCheck verifies the initial state of a freshly built handler and
// that it observes later changes made through the shared monitor pointers.
func TestNewHealthCheck(t *testing.T) {
	monitor := &Monitor{Name: "Test Monitor"}
	hc := NewHealthCheckHandler([]*Monitor{monitor})

	// Mutate the monitor after construction; the handler holds the same
	// pointer, so the monitors check below is expected to see it as down.
	monitor.alertCount++

	minitorHealthy, _ := hc.MinitorHealthCheck()
	if minitorHealthy {
		t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
	}

	monitorsHealthy, _ := hc.MonitorsHealthCheck()
	if monitorsHealthy {
		t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
	}
}
// TestMinitorHealthCheck verifies that MinitorHealthCheck mirrors the value
// most recently stored through MinitorHealthy, for both states.
func TestMinitorHealthCheck(t *testing.T) {
	monitors := []*Monitor{
		{Name: "Test Monitor"},
	}
	hc := NewHealthCheckHandler(monitors)

	t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
		hc.MinitorHealthy(true)

		healthy, body := hc.MinitorHealthCheck()
		if !healthy {
			t.Errorf("Expected healthy check")
		}

		if body != "OK" {
			t.Errorf("Expected OK response")
		}
	})

	t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
		hc.MinitorHealthy(false)

		healthy, body := hc.MinitorHealthCheck()
		if healthy {
			// This branch fires when the check wrongly reports healthy.
			t.Errorf("Expected unhealthy check")
		}

		if body != "UNHEALTHY" {
			t.Errorf("Expected UNHEALTHY response")
		}
	})
}
// TestMonitorsHealthCheck verifies that MonitorsHealthCheck reports OK for a
// fresh monitor and lists the monitor by name once it has an alert counted.
// The subtests share one handler and run in order: the second mutates the
// monitor that the first expects to be untouched.
func TestMonitorsHealthCheck(t *testing.T) {
	monitors := []*Monitor{
		{Name: "Test Monitor"},
	}
	hc := NewHealthCheckHandler(monitors)

	t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
		healthy, body := hc.MonitorsHealthCheck()
		if !healthy {
			t.Errorf("Expected healthy check")
		}

		if body != "OK" {
			t.Errorf("Expected OK response")
		}
	})

	t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
		// Fake an alert on the shared monitor pointer so the handler is
		// expected to see it as down.
		monitors[0].alertCount++

		healthy, body := hc.MonitorsHealthCheck()
		if healthy {
			// This branch fires when the check wrongly reports healthy.
			t.Errorf("Expected unhealthy check")
		}

		if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
			t.Errorf("Expected UNHEALTHY response")
		}
	})
}
+44 -3
View File
@@ -4,6 +4,7 @@ import (
"errors" "errors"
"flag" "flag"
"fmt" "fmt"
"net/http"
"time" "time"
"git.iamthefij.com/iamthefij/slog" "git.iamthefij.com/iamthefij/slog"
@@ -16,6 +17,10 @@ var (
MetricsPort = 8080 MetricsPort = 8080
// Metrics contains all active metrics // Metrics contains all active metrics
Metrics = NewMetrics() Metrics = NewMetrics()
// SelfMonitor enables self monitoring rather than panicking
SelfMonitor = false
// HealthChecks contains health check values
HealthChecks *HealthCheckHandler = nil
// PyCompat enables support for legacy Python templates // PyCompat enables support for legacy Python templates
PyCompat = false PyCompat = false
@@ -51,7 +56,13 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
output, output,
) )
if SelfMonitor {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
}
return err return err
} else {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
} }
// Count alert metrics // Count alert metrics
@@ -69,6 +80,8 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
func checkMonitors(config *Config) error { func checkMonitors(config *Config) error {
// TODO: Run this in goroutines and capture exceptions // TODO: Run this in goroutines and capture exceptions
healthy := true
for _, monitor := range config.Monitors { for _, monitor := range config.Monitors {
if monitor.ShouldCheck() { if monitor.ShouldCheck() {
success, alertNotice := monitor.Check() success, alertNotice := monitor.Check()
@@ -80,17 +93,32 @@ func checkMonitors(config *Config) error {
if alertNotice != nil { if alertNotice != nil {
err := sendAlerts(config, monitor, alertNotice) err := sendAlerts(config, monitor, alertNotice)
// If there was an error in sending an alert, exit early and bubble it up // If there was an error in sending an alert, mark as unhealthy or bubble up
if err != nil { if err != nil {
if SelfMonitor {
healthy = false
} else {
return err return err
} }
} }
} }
} }
}
if HealthChecks != nil {
HealthChecks.MinitorHealthy(healthy)
}
return nil return nil
} }
// ServeMetricsAndHealth serves the default http mux on MetricsPort, so every
// handler registered on that mux is exposed. The call blocks until the server
// stops; the error it returns is discarded.
func ServeMetricsAndHealth() {
	_ = http.ListenAndServe(fmt.Sprintf(":%d", MetricsPort), nil)
}
func main() { func main() {
showVersion := flag.Bool("version", false, "Display the version of minitor and exit") showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)") configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
@@ -98,7 +126,9 @@ func main() {
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)") flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)") flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)") flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)") flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
flag.Parse() flag.Parse()
// Print version if flag is provided // Print version if flag is provided
@@ -115,8 +145,19 @@ func main() {
// Serve metrics exporter, if specified // Serve metrics exporter, if specified
if ExportMetrics { if ExportMetrics {
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort) slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
HandleMetrics()
}
go ServeMetrics() if SelfMonitor {
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
HealthChecks = NewHealthCheckHandler(config.Monitors)
HandleHealthCheck()
}
if ExportMetrics || SelfMonitor {
go ServeMetricsAndHealth()
} }
// Start main loop // Start main loop
+46
View File
@@ -7,6 +7,7 @@ func TestCheckMonitors(t *testing.T) {
config Config config Config
expectErr bool expectErr bool
name string name string
selfMonitor bool
}{ }{
{ {
config: Config{}, config: Config{},
@@ -24,6 +25,7 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: false, expectErr: false,
name: "Monitor success, no alerts", name: "Monitor success, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -37,6 +39,7 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: false, expectErr: false,
name: "Monitor failure, no alerts", name: "Monitor failure, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -50,6 +53,7 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: false, expectErr: false,
name: "Monitor recovery, no alerts", name: "Monitor recovery, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -64,6 +68,7 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: true, expectErr: true,
name: "Monitor failure, unknown alerts", name: "Monitor failure, unknown alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -78,6 +83,22 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: true, expectErr: true,
name: "Monitor recovery, unknown alerts", name: "Monitor recovery, unknown alerts",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Success",
Command: CommandOrShell{Command: []string{"true"}},
AlertUp: []string{"unknown"},
alertCount: 1,
},
},
},
expectErr: false,
name: "Monitor recovery, unknown alerts, with Health Check",
selfMonitor: true,
}, },
{ {
config: Config{ config: Config{
@@ -97,6 +118,7 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: false, expectErr: false,
name: "Monitor failure, successful alert", name: "Monitor failure, successful alert",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -117,10 +139,34 @@ func TestCheckMonitors(t *testing.T) {
}, },
expectErr: true, expectErr: true,
name: "Monitor failure, bad alert", name: "Monitor failure, bad alert",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Failure",
Command: CommandOrShell{Command: []string{"false"}},
AlertDown: []string{"bad"},
AlertAfter: 1,
},
},
Alerts: map[string]*Alert{
"bad": {
Name: "bad",
Command: CommandOrShell{Command: []string{"false"}},
},
},
},
expectErr: false,
name: "Monitor failure, bad alert, with Health Check",
selfMonitor: true,
}, },
} }
for _, c := range cases { for _, c := range cases {
SelfMonitor = c.selfMonitor
err := c.config.Init() err := c.config.Init()
if err != nil { if err != nil {
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err) t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
+2 -7
View File
@@ -1,7 +1,6 @@
package main package main
import ( import (
"fmt"
"net/http" "net/http"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
@@ -107,11 +106,7 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
).Inc() ).Inc()
} }
// ServeMetrics starts an http server with a Prometheus metrics handler // HandleMetrics add Prometheus metrics handler to default http server
func ServeMetrics() { func HandleMetrics() {
http.Handle("/metrics", promhttp.Handler()) http.Handle("/metrics", promhttp.Handler())
host := fmt.Sprintf(":%d", MetricsPort)
_ = http.ListenAndServe(host, nil)
} }