Add health check and self reporting of health

This avoids panicking and instead provides an HTTP endpoint to report health
Add tzdata
2024-04-03 11:23:26 -07:00 · 2023-08-11 06:20:35 -07:00 · 2023-08-11 06:20:15 -07:00 · 2023-08-10 16:23:02 -04:00 · 2023-08-10 16:22:30 -04:00 · 2023-08-10 16:21:33 -04:00
17 changed files with 424 additions and 58 deletions
@@ -4,7 +4,7 @@ name: test

 steps:
  - name: test
-    image: golang:1.17
+    image: golang:1.20
    environment:
      VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
    commands:
@@ -30,7 +30,7 @@ trigger:

 steps:
  - name: build all binaries
-    image: golang:1.17
+    image: golang:1.20
    environment:
      VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
    commands:
@@ -1,11 +1,11 @@
 ARG REPO=library
-FROM ${REPO}/alpine:3.17
+FROM ${REPO}/alpine:3.18

 RUN mkdir /app
 WORKDIR /app/

 # Add common checking tools
-RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
+RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c

 # Add minitor user for running as non-root
 RUN addgroup -S minitor && adduser -S minitor -G minitor
@@ -1,5 +1,5 @@
 ARG REPO=library
-FROM golang:1.17 AS builder
+FROM golang:1.20 AS builder

 RUN mkdir /app
 WORKDIR /app
@@ -14,7 +14,7 @@ ARG VERSION=dev
 ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
 RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .

-FROM ${REPO}/alpine:3.17
+FROM ${REPO}/alpine:3.18
 RUN mkdir /app
 WORKDIR /app/

@@ -22,7 +22,7 @@ WORKDIR /app/
 COPY --from=builder /app/minitor .

 # Add common checking tools
-RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
+RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c

 # Add minitor user for running as non-root
 RUN addgroup -S minitor && adduser -S minitor -G minitor
@@ -46,6 +46,8 @@ docker run -v $PWD/config.yml:/app/config.yml iamthefij/minitor-go:latest

 Images are provided for `amd64`, `arm`, and `arm64` architectures.

+Timezone configuration for the container is set by passing the `TZ` env variable. Eg. `TZ=America/Los_Angeles`.
+
 ## Configuring

 In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
@@ -94,10 +96,28 @@ Also, when alerts are executed, they will be passed through Go's format function
 |`{{.AlertCount}}`|Number of times this monitor has alerted|
 |`{{.FailureCount}}`|The total number of sequential failed checks for this monitor|
 |`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout|
-|`{{.LastSuccess}}`|The ISO datetime of the last successful check|
+|`{{.LastSuccess}}`|The datetime of the last successful check as a go Time struct|
 |`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert|
 |`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template|

+To provide flexible formatting, the following non-standard functions are available in templates:
+
+|func|description|
+|---|---|
+|`ANSIC <Time>`|Formats provided time in ANSIC format|
+|`UnixDate <Time>`|Formats provided time in UnixDate format|
+|`RubyDate <Time>`|Formats provided time in RubyDate format|
+|`RFC822Z <Time>`|Formats provided time in RFC822Z format|
+|`RFC850 <Time>`|Formats provided time in RFC850 format|
+|`RFC1123 <Time>`|Formats provided time in RFC1123 format|
+|`RFC1123Z <Time>`|Formats provided time in RFC1123Z format|
+|`RFC3339 <Time>`|Formats provided time in RFC3339 format|
+|`RFC3339Nano <Time>`|Formats provided time in RFC3339Nano format|
+|`FormatTime <Time> <string template>`|Formats provided time according to provided template|
+|`InTZ <Time> <string timezone name>`|Converts provided time to parsed timezone from the provided name|
+
+For more information, check out the [Go documentation for the time package](https://pkg.go.dev/time@go1.20.7#pkg-constants).
+
 ### Metrics

 Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
@@ -158,7 +178,7 @@ minitor-go:
 check_interval: 1m30s
 ```

-The `-py-compat` flag has been removed. Any existing Python oriented configuration needs to be migrated to the new templates.
+For the time being, legacy configs for the Python version of Minitor should be compatible if you apply the `-py-compat` flag when running Minitor. Eventually, this flag will go away when later breaking changes are introduced.

 ## Future

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"os/exec"
+	"strings"
 	"text/template"
 	"time"

@@ -44,21 +45,70 @@ func (alert Alert) IsValid() bool {

 // BuildTemplates compiles command templates for the Alert
 func (alert *Alert) BuildTemplates() error {
+	// TODO: Remove legacy template support later after 1.0
+	legacy := strings.NewReplacer(
+		"{alert_count}", "{{.AlertCount}}",
+		"{alert_message}", "{{.MonitorName}} check has failed {{.FailureCount}} times",
+		"{failure_count}", "{{.FailureCount}}",
+		"{last_output}", "{{.LastCheckOutput}}",
+		"{last_success}", "{{.LastSuccess}}",
+		"{monitor_name}", "{{.MonitorName}}",
+	)
+
 	slog.Debugf("Building template for alert %s", alert.Name)

+	// Time format func factory
+	tff := func(formatString string) func(time.Time) string {
+		return func(t time.Time) string {
+			return t.Format(formatString)
+		}
+	}
+
+	// Create some functions for formatting datetimes in popular formats
+	timeFormatFuncs := template.FuncMap{
+		"ANSIC":       tff(time.ANSIC),
+		"UnixDate":    tff(time.UnixDate),
+		"RubyDate":    tff(time.RubyDate),
+		"RFC822Z":     tff(time.RFC822Z),
+		"RFC850":      tff(time.RFC850),
+		"RFC1123":     tff(time.RFC1123),
+		"RFC1123Z":    tff(time.RFC1123Z),
+		"RFC3339":     tff(time.RFC3339),
+		"RFC3339Nano": tff(time.RFC3339Nano),
+		"FormatTime": func(t time.Time, timeFormat string) string {
+			return t.Format(timeFormat)
+		},
+		"InTZ": func(t time.Time, tzName string) (time.Time, error) {
+			tz, err := time.LoadLocation(tzName)
+			if err != nil {
+				return t, fmt.Errorf("failed to convert time to specified tz: %w", err)
+			}
+
+			return t.In(tz), nil
+		},
+	}
+
 	switch {
 	case alert.commandTemplate == nil && alert.Command.Command != nil:
 		alert.commandTemplate = []*template.Template{}
 		for i, cmdPart := range alert.Command.Command {
+			if PyCompat {
+				cmdPart = legacy.Replace(cmdPart)
+			}
+
 			alert.commandTemplate = append(alert.commandTemplate, template.Must(
-				template.New(alert.Name+fmt.Sprint(i)).Parse(cmdPart),
+				template.New(alert.Name+fmt.Sprint(i)).Funcs(timeFormatFuncs).Parse(cmdPart),
 			))
 		}
 	case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
 		shellCmd := alert.Command.ShellCommand

+		if PyCompat {
+			shellCmd = legacy.Replace(shellCmd)
+		}
+
 		alert.commandShellTemplate = template.Must(
-			template.New(alert.Name).Parse(shellCmd),
+			template.New(alert.Name).Funcs(timeFormatFuncs).Parse(shellCmd),
 		)
 	default:
 		return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
@@ -118,7 +168,7 @@ func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {

 	if err != nil {
 		err = fmt.Errorf(
-			"Alert '%s' failed to send. Returned %v: %w",
+			"Alert %s failed to send. Returned %w: %w",
 			alert.Name,
 			err,
 			ErrAlertFailed,
@@ -70,6 +70,14 @@ func TestAlertSend(t *testing.T) {
 			"Command shell with bad template",
 			false,
 		},
+		{
+			Alert{Command: CommandOrShell{ShellCommand: "echo {alert_message}"}},
+			AlertNotice{MonitorName: "test", FailureCount: 1},
+			"test check has failed 1 times\n",
+			false,
+			"Command shell with legacy template",
+			true,
+		},
 		// Test default log alert down
 		{
 			*NewLogAlert(),
@@ -92,6 +100,8 @@ func TestAlertSend(t *testing.T) {

 	for _, c := range cases {
 		log.Printf("Testing case %s", c.name)
+		// Set PyCompat to value of compat flag
+		PyCompat = c.pyCompat

 		err := c.alert.BuildTemplates()
 		if err != nil {
@@ -111,6 +121,9 @@ func TestAlertSend(t *testing.T) {
 			log.Printf("Case failed: %s", c.name)
 		}

+		// Set PyCompat back to default value
+		PyCompat = false
+
 		log.Println("-----")
 	}
 }
@@ -13,11 +13,11 @@ var errInvalidConfig = errors.New("Invalid configuration")

 // Config type contains all provided user configuration
 type Config struct {
-	CheckInterval     time.Duration `yaml:"check_interval"`
-	DefaultAlertAfter int16         `yaml:"default_alert_after"`
-	DefaultAlertEvery *int16        `yaml:"default_alert_every"`
-	DefaultAlertDown  []string      `yaml:"default_alert_down"`
-	DefaultAlertUp    []string      `yaml:"default_alert_up"`
+	CheckInterval     SecondsOrDuration `yaml:"check_interval"`
+	DefaultAlertAfter int16             `yaml:"default_alert_after"`
+	DefaultAlertEvery *int16            `yaml:"default_alert_every"`
+	DefaultAlertDown  []string          `yaml:"default_alert_down"`
+	DefaultAlertUp    []string          `yaml:"default_alert_up"`
 	Monitors          []*Monitor
 	Alerts            map[string]*Alert
 }
@@ -56,6 +56,34 @@ func (cos *CommandOrShell) UnmarshalYAML(unmarshal func(interface{}) error) erro
 	return nil
 }

+// SecondsOrDuration wraps a duration value for parsing a duration or seconds from YAML.
+// A bare integer in the YAML is interpreted as a number of seconds; any other value is
+// handed to the YAML decoder to be unmarshalled as a time.Duration.
+// NOTE: This should be removed in favor of only parsing durations once compatibility is broken
+type SecondsOrDuration struct {
+	// value is the parsed interval, exposed through Value()
+	value time.Duration
+}
+
+// Value returns the wrapped interval as a time.Duration
+func (sod SecondsOrDuration) Value() time.Duration {
+	return sod.value
+}
+
+// UnmarshalYAML first tries to decode the YAML value as an integer number of seconds.
+// When that fails, the value is decoded directly as a time.Duration and any error from
+// that second attempt is returned to the caller.
+func (sod *SecondsOrDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	var wholeSeconds int64
+	if intErr := unmarshal(&wholeSeconds); intErr == nil {
+		// A plain integer was provided: treat it as seconds
+		sod.value = time.Duration(wholeSeconds) * time.Second
+
+		return nil
+	}
+
+	// Not an int, so fall back to duration parsing
+	return unmarshal(&sod.value)
+}
+
 // IsValid checks config validity and returns true if valid
 func (config Config) IsValid() (isValid bool) {
 	isValid = true
@@ -154,6 +182,18 @@ func LoadConfig(filePath string) (config Config, err error) {

 	slog.Debugf("Config values:\n%v\n", config)

+	// Add log alert if not present
+	if PyCompat {
+		// Initialize alerts list if not present
+		if config.Alerts == nil {
+			config.Alerts = map[string]*Alert{}
+		}
+
+		if _, ok := config.Alerts["log"]; !ok {
+			config.Alerts["log"] = NewLogAlert()
+		}
+	}
+
 	// Finish initializing configuration
 	if err = config.Init(); err != nil {
 		return
@@ -15,6 +15,7 @@ func TestLoadConfig(t *testing.T) {
 	}{
 		{"./test/valid-config.yml", false, "Valid config file", false},
 		{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
+		{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
 		{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
 		{"./test/does-not-exist", true, "Invalid config path", false},
 		{"./test/invalid-config-type.yml", true, "Invalid config type for key", false},
@@ -24,6 +25,8 @@ func TestLoadConfig(t *testing.T) {

 	for _, c := range cases {
 		log.Printf("Testing case %s", c.name)
+		// Set PyCompat based on compatibility mode
+		PyCompat = c.pyCompat
 		_, err := LoadConfig(c.configPath)
 		hasErr := (err != nil)

@@ -31,6 +34,9 @@ func TestLoadConfig(t *testing.T) {
 			t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
 			log.Printf("Case failed: %s", c.name)
 		}
+
+		// Set PyCompat to default value
+		PyCompat = false
 	}
 }

@@ -47,15 +53,15 @@ func TestIntervalParsing(t *testing.T) {
 	oneMinute := time.Minute

 	// validate top level interval seconds represented as an int
-	if config.CheckInterval != oneSecond {
+	if config.CheckInterval.Value() != oneSecond {
 		t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval)
 	}

-	if config.Monitors[0].CheckInterval != tenSeconds {
+	if config.Monitors[0].CheckInterval.Value() != tenSeconds {
 		t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
 	}

-	if config.Monitors[1].CheckInterval != oneMinute {
+	if config.Monitors[1].CheckInterval.Value() != oneMinute {
 		t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
 	}

@@ -1,6 +1,6 @@
 module git.iamthefij.com/iamthefij/minitor-go

-go 1.17
+go 1.20

 require (
 	git.iamthefij.com/iamthefij/slog v1.3.0
@@ -0,0 +1,72 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// HealthCheckHandler serves health information over HTTP, either for Minitor itself
+// or for the set of monitors it was constructed with.
+type HealthCheckHandler struct {
+	// isMinitorHealthy holds the value most recently passed to MinitorHealthy
+	isMinitorHealthy bool
+	// monitors are the monitors inspected by MonitorsHealthCheck
+	monitors         []*Monitor
+}
+
+// NewHealthCheckHandler returns a handler that inspects the provided monitors.
+// The handler starts out reporting Minitor as unhealthy until MinitorHealthy(true) is called.
+func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
+	handler := HealthCheckHandler{
+		isMinitorHealthy: false,
+		monitors:         monitors,
+	}
+
+	return &handler
+}
+
+// MinitorHealthy records whether Minitor itself should currently be reported as healthy
+func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
+	hch.isMinitorHealthy = healthy
+}
+
+// MinitorHealthCheck reports the health of Minitor itself. It returns true and "OK" when
+// the last value given to MinitorHealthy was true, otherwise false and "UNHEALTHY".
+func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
+	if !hch.isMinitorHealthy {
+		return false, "UNHEALTHY"
+	}
+
+	return true, "OK"
+}
+
+func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
+	downMonitors := []string{}
+
+	for _, monitor := range hch.monitors {
+		if !monitor.IsUp() {
+			downMonitors = append(downMonitors, monitor.Name)
+		}
+	}
+
+	if len(downMonitors) == 0 {
+		return true, "OK"
+	} else {
+		return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
+	}
+}
+
+// ServeHTTP answers a health check request. A non-empty "monitors" query parameter
+// selects the monitors health check; otherwise the health of Minitor itself is reported.
+// Healthy results are served with 200 and unhealthy results with 503, with the check
+// message as the response body.
+func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	var (
+		healthy bool
+		body    string
+	)
+
+	switch r.URL.Query().Get("monitors") {
+	case "":
+		healthy, body = hch.MinitorHealthCheck()
+	default:
+		healthy, body = hch.MonitorsHealthCheck()
+	}
+
+	status := http.StatusOK
+	if !healthy {
+		status = http.StatusServiceUnavailable
+	}
+
+	w.WriteHeader(status)
+
+	_, _ = io.WriteString(w, body)
+}
+
+// HandleHealthCheck adds the health check handler to the default http server.
+// It is registered on its own path rather than "/metrics": the Prometheus handler is
+// registered on "/metrics", and http.Handle panics when a pattern is registered twice,
+// which would crash Minitor whenever metrics and self-monitoring are both enabled.
+func HandleHealthCheck() {
+	http.Handle("/health", HealthChecks)
+}
@@ -0,0 +1,79 @@
+package main
+
+import (
+	"testing"
+)
+
+// TestNewHealthCheck verifies that a newly constructed handler reports Minitor as
+// unhealthy and that it observes changes made to a monitor through the shared pointer.
+func TestNewHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	// Mutate the monitor after the handler was built; the handler holds the same pointer
+	monitors[0].alertCount++
+
+	if healthy, _ := hc.MinitorHealthCheck(); healthy {
+		t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
+	}
+
+	if healthy, _ := hc.MonitorsHealthCheck(); healthy {
+		t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
+	}
+}
+
+// TestMinitorHealthCheck verifies that MinitorHealthCheck reflects the value most
+// recently passed to MinitorHealthy, for both the healthy and unhealthy states.
+func TestMinitorHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
+		hc.MinitorHealthy(true)
+		healthy, body := hc.MinitorHealthCheck()
+		if !healthy {
+			t.Errorf("Expected healthy check")
+		}
+		if body != "OK" {
+			t.Errorf("Expected OK response, got %q", body)
+		}
+	})
+
+	t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
+		hc.MinitorHealthy(false)
+		healthy, body := hc.MinitorHealthCheck()
+		if healthy {
+			// The failure here means the check was healthy when it should not be
+			t.Errorf("Expected unhealthy check")
+		}
+		if body != "UNHEALTHY" {
+			t.Errorf("Expected UNHEALTHY response, got %q", body)
+		}
+	})
+}
+
+// TestMonitorsHealthCheck verifies that MonitorsHealthCheck reports OK for an untouched
+// monitor and names the monitor in the response once its alertCount has been bumped.
+func TestMonitorsHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
+		healthy, body := hc.MonitorsHealthCheck()
+		if !healthy {
+			t.Errorf("Expected healthy check")
+		}
+		if body != "OK" {
+			t.Errorf("Expected OK response, got %q", body)
+		}
+	})
+
+	t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
+		// Fake an alert on the monitor so the handler sees it as down
+		monitors[0].alertCount++
+		healthy, body := hc.MonitorsHealthCheck()
+		if healthy {
+			// The failure here means the check was healthy when it should not be
+			t.Errorf("Expected unhealthy check")
+		}
+		if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
+			t.Errorf("Expected UNHEALTHY response, got %q", body)
+		}
+	})
+}
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
+	"net/http"
 	"time"

 	"git.iamthefij.com/iamthefij/slog"
@@ -16,6 +17,13 @@ var (
 	MetricsPort = 8080
 	// Metrics contains all active metrics
 	Metrics = NewMetrics()
+	// SelfMonitor enables self-monitoring rather than panicking when alerts fail
+	SelfMonitor = false
+	// HealthChecks contains health check values
+	HealthChecks *HealthCheckHandler = nil
+
+	// PyCompat enables support for legacy Python templates
+	PyCompat = false

 	// version of minitor being run
 	version = "dev"
@@ -48,7 +56,13 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
 					output,
 				)

+				if SelfMonitor {
+					Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
+				}
+
 				return err
+			} else {
+				Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
 			}

 			// Count alert metrics
@@ -66,6 +80,8 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro

 func checkMonitors(config *Config) error {
 	// TODO: Run this in goroutines and capture exceptions
+	healthy := true
+
 	for _, monitor := range config.Monitors {
 		if monitor.ShouldCheck() {
 			success, alertNotice := monitor.Check()
@@ -77,24 +93,42 @@ func checkMonitors(config *Config) error {

 			if alertNotice != nil {
 				err := sendAlerts(config, monitor, alertNotice)
-				// If there was an error in sending an alert, exit early and bubble it up
+				// If there was an error in sending an alert, mark as unhealthy or bubble up
 				if err != nil {
-					return err
+					if SelfMonitor {
+						healthy = false
+					} else {
+						return err
+					}
 				}
 			}
 		}
 	}

+	if HealthChecks != nil {
+		HealthChecks.MinitorHealthy(healthy)
+	}
+
 	return nil
 }

+// ServeMetricsAndHealth starts the default http server on MetricsPort, serving whatever
+// handlers have been registered on the default mux. The call blocks while serving.
+func ServeMetricsAndHealth() {
+	host := fmt.Sprintf(":%d", MetricsPort)
+
+	// NOTE(review): the error from ListenAndServe is discarded, so a failure to bind
+	// the port is silent — confirm this best-effort behavior is intended
+	_ = http.ListenAndServe(host, nil)
+}
+
 func main() {
 	showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
 	configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")

 	flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
 	flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
-	flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
+	flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
+	flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
+	flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
+
 	flag.Parse()

 	// Print version if flag is provided
@@ -111,8 +145,19 @@ func main() {
 	// Serve metrics exporter, if specified
 	if ExportMetrics {
 		slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
+		HandleMetrics()
+	}

-		go ServeMetrics()
+	if SelfMonitor {
+		slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
+
+		HealthChecks = NewHealthCheckHandler(config.Monitors)
+
+		HandleHealthCheck()
+	}
+
+	if ExportMetrics || SelfMonitor {
+		go ServeMetricsAndHealth()
 	}

 	// Start main loop
@@ -120,6 +165,6 @@ func main() {
 		err = checkMonitors(&config)
 		slog.OnErrPanicf(err, "Error checking monitors")

-		time.Sleep(config.CheckInterval)
+		time.Sleep(config.CheckInterval.Value())
 	}
 }
@@ -4,9 +4,10 @@ import "testing"

 func TestCheckMonitors(t *testing.T) {
 	cases := []struct {
-		config    Config
-		expectErr bool
-		name      string
+		config      Config
+		expectErr   bool
+		name        string
+		selfMonitor bool
 	}{
 		{
 			config:    Config{},
@@ -22,8 +23,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor success, no alerts",
+			expectErr:   false,
+			name:        "Monitor success, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -35,8 +37,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor failure, no alerts",
+			expectErr:   false,
+			name:        "Monitor failure, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -48,8 +51,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor recovery, no alerts",
+			expectErr:   false,
+			name:        "Monitor recovery, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -62,8 +66,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor failure, unknown alerts",
+			expectErr:   true,
+			name:        "Monitor failure, unknown alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -76,8 +81,24 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor recovery, unknown alerts",
+			expectErr:   true,
+			name:        "Monitor recovery, unknown alerts",
+			selfMonitor: false,
+		},
+		{
+			config: Config{
+				Monitors: []*Monitor{
+					{
+						Name:       "Success",
+						Command:    CommandOrShell{Command: []string{"true"}},
+						AlertUp:    []string{"unknown"},
+						alertCount: 1,
+					},
+				},
+			},
+			expectErr:   false,
+			name:        "Monitor recovery, unknown alerts, with Health Check",
+			selfMonitor: true,
 		},
 		{
 			config: Config{
@@ -95,8 +116,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor failure, successful alert",
+			expectErr:   false,
+			name:        "Monitor failure, successful alert",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -115,12 +137,36 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor failure, bad alert",
+			expectErr:   true,
+			name:        "Monitor failure, bad alert",
+			selfMonitor: false,
+		},
+		{
+			config: Config{
+				Monitors: []*Monitor{
+					{
+						Name:       "Failure",
+						Command:    CommandOrShell{Command: []string{"false"}},
+						AlertDown:  []string{"bad"},
+						AlertAfter: 1,
+					},
+				},
+				Alerts: map[string]*Alert{
+					"bad": {
+						Name:    "bad",
+						Command: CommandOrShell{Command: []string{"false"}},
+					},
+				},
+			},
+			expectErr:   false,
+			name:        "Monitor failure, bad alert, with Health Check",
+			selfMonitor: true,
 		},
 	}

 	for _, c := range cases {
+		SelfMonitor = c.selfMonitor
+
 		err := c.config.Init()
 		if err != nil {
 			t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
@@ -1,7 +1,6 @@
 package main

 import (
-	"fmt"
 	"net/http"

 	"github.com/prometheus/client_golang/prometheus"
@@ -107,11 +106,7 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
 	).Inc()
 }

-// ServeMetrics starts an http server with a Prometheus metrics handler
-func ServeMetrics() {
+// HandleMetrics add Prometheus metrics handler to default http server
+func HandleMetrics() {
 	http.Handle("/metrics", promhttp.Handler())
-
-	host := fmt.Sprintf(":%d", MetricsPort)
-
-	_ = http.ListenAndServe(host, nil)
 }
@@ -11,9 +11,9 @@ import (
 // Monitor represents a particular periodic check of a command
 type Monitor struct { //nolint:maligned
 	// Config values
-	AlertAfter    int16         `yaml:"alert_after"`
-	AlertEvery    *int16        `yaml:"alert_every"`
-	CheckInterval time.Duration `yaml:"check_interval"`
+	AlertAfter    int16             `yaml:"alert_after"`
+	AlertEvery    *int16            `yaml:"alert_every"`
+	CheckInterval SecondsOrDuration `yaml:"check_interval"`
 	Name          string
 	AlertDown     []string `yaml:"alert_down"`
 	AlertUp       []string `yaml:"alert_up"`
@@ -45,7 +45,7 @@ func (monitor Monitor) ShouldCheck() bool {

 	sinceLastCheck := time.Since(monitor.lastCheck)

-	return sinceLastCheck >= monitor.CheckInterval
+	return sinceLastCheck >= monitor.CheckInterval.Value()
 }

 // Check will run the command configured by the Monitor and return a status
@@ -45,9 +45,9 @@ func TestMonitorShouldCheck(t *testing.T) {
 		name     string
 	}{
 		{Monitor{}, true, "Empty"},
-		{Monitor{lastCheck: timeNow, CheckInterval: time.Second * 15}, false, "Just checked"},
-		{Monitor{lastCheck: timeTenSecAgo, CheckInterval: time.Second * 15}, false, "-10s"},
-		{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: time.Second * 15}, true, "-20s"},
+		{Monitor{lastCheck: timeNow, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "Just checked"},
+		{Monitor{lastCheck: timeTenSecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "-10s"},
+		{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, true, "-20s"},
 	}

 	for _, c := range cases {
@@ -1,5 +1,5 @@
 ---
-check_interval: 1s
+check_interval: 1

 monitors:
  - name: Command
Author	SHA1	Message	Date
Ian Fijolek	0a36da79d6	Add health check and self reporting of health This avoids panicing and instead provides an HTTP endpoint to report health	2024-04-03 11:23:26 -07:00
Ian Fijolek	01cca50532	Add tzdata Allows setting container timezone using TZ env variable	2023-08-11 06:20:35 -07:00
Ian Fijolek	2789aa63e4	More loosely pins apk packages	2023-08-11 06:20:15 -07:00
Ian Fijolek	37db4b2db0	Update error string when failing to send alert Wrap both originating errors	2023-08-10 16:23:02 -04:00
Ian Fijolek	41a1dbeceb	Add date format functions	2023-08-10 16:22:30 -04:00
Ian Fijolek	c02d64d674	Update go to 1.20	2023-08-10 16:21:33 -04:00
Ian Fijolek	46f4561bea	Update alpine and system package versions Bump to alpine 3.18	2023-06-14 16:52:04 -07:00
Ian Fijolek	a1e0e9698b	Add dig and nslookup	2023-05-05 14:07:53 -07:00