Compare commits

..

4 Commits

Author SHA1 Message Date
Ian Fijolek
cea16606ba Fix tests after removing legacy compatibility 2023-04-20 14:32:13 -07:00
Ian Fijolek
a03f430d0e Remove 'SecondsOrDuration' for check_interval
Now requires an explicit duration unit. Eg. 30s
2023-04-19 15:31:12 -07:00
Ian Fijolek
f3f7c215a7 Breaking: Remove python compat flag 2023-04-19 15:27:33 -07:00
Ian Fijolek
c75302bdb8 Add dig and update system package versions
Includes bump to alpine 3.17
2023-04-19 15:23:34 -07:00
17 changed files with 58 additions and 424 deletions
+2 -2
View File
@@ -4,7 +4,7 @@ name: test
steps: steps:
- name: test - name: test
image: golang:1.20 image: golang:1.17
environment: environment:
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}} VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
commands: commands:
@@ -30,7 +30,7 @@ trigger:
steps: steps:
- name: build all binaries - name: build all binaries
image: golang:1.20 image: golang:1.17
environment: environment:
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}} VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
commands: commands:
+2 -2
View File
@@ -1,11 +1,11 @@
ARG REPO=library ARG REPO=library
FROM ${REPO}/alpine:3.18 FROM ${REPO}/alpine:3.17
RUN mkdir /app RUN mkdir /app
WORKDIR /app/ WORKDIR /app/
# Add common checking tools # Add common checking tools
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
# Add minitor user for running as non-root # Add minitor user for running as non-root
RUN addgroup -S minitor && adduser -S minitor -G minitor RUN addgroup -S minitor && adduser -S minitor -G minitor
+3 -3
View File
@@ -1,5 +1,5 @@
ARG REPO=library ARG REPO=library
FROM golang:1.20 AS builder FROM golang:1.17 AS builder
RUN mkdir /app RUN mkdir /app
WORKDIR /app WORKDIR /app
@@ -14,7 +14,7 @@ ARG VERSION=dev
ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor . RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .
FROM ${REPO}/alpine:3.18 FROM ${REPO}/alpine:3.17
RUN mkdir /app RUN mkdir /app
WORKDIR /app/ WORKDIR /app/
@@ -22,7 +22,7 @@ WORKDIR /app/
COPY --from=builder /app/minitor . COPY --from=builder /app/minitor .
# Add common checking tools # Add common checking tools
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
# Add minitor user for running as non-root # Add minitor user for running as non-root
RUN addgroup -S minitor && adduser -S minitor -G minitor RUN addgroup -S minitor && adduser -S minitor -G minitor
+2 -22
View File
@@ -46,8 +46,6 @@ docker run -v $PWD/config.yml:/app/config.yml iamthefij/minitor-go:latest
Images are provided for `amd64`, `arm`, and `arm64` architectures. Images are provided for `amd64`, `arm`, and `arm64` architectures.
Timezone configuration for the container is set by passing the `TZ` env variable. Eg. `TZ=America/Los_Angeles`.
## Configuring ## Configuring
In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file. In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
@@ -96,28 +94,10 @@ Also, when alerts are executed, they will be passed through Go's format function
|`{{.AlertCount}}`|Number of times this monitor has alerted| |`{{.AlertCount}}`|Number of times this monitor has alerted|
|`{{.FailureCount}}`|The total number of sequential failed checks for this monitor| |`{{.FailureCount}}`|The total number of sequential failed checks for this monitor|
|`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout| |`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout|
|`{{.LastSuccess}}`|The datetime of the last successful check as a go Time struct| |`{{.LastSuccess}}`|The ISO datetime of the last successful check|
|`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert| |`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert|
|`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template| |`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template|
To provide flexible formatting, the following non-standard functions are available in templates:
|func|description|
|---|---|
|`ANSIC <Time>`|Formats provided time in ANSIC format|
|`UnixDate <Time>`|Formats provided time in UnixDate format|
|`RubyDate <Time>`|Formats provided time in RubyDate format|
|`RFC822Z <Time>`|Formats provided time in RFC822Z format|
|`RFC850 <Time>`|Formats provided time in RFC850 format|
|`RFC1123 <Time>`|Formats provided time in RFC1123 format|
|`RFC1123Z <Time>`|Formats provided time in RFC1123Z format|
|`RFC3339 <Time>`|Formats provided time in RFC3339 format|
|`RFC3339Nano <Time>`|Formats provided time in RFC3339Nano format|
|`FormatTime <Time> <string template>`|Formats provided time according to provided template|
|`InTZ <Time> <string timezone name>`|Converts provided time to parsed timezone from the provided name|
For more information, check out the [Go documentation for the time module](https://pkg.go.dev/time@go1.20.7#pkg-constants).
### Metrics ### Metrics
Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana. Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
@@ -178,7 +158,7 @@ minitor-go:
check_interval: 1m30s check_interval: 1m30s
``` ```
For the time being, legacy configs for the Python version of Minitor should be compatible if you apply the `-py-compat` flag when running Minitor. Eventually, this flag will go away when later breaking changes are introduced. The `-py-compat` flag has been removed. Any existing Python oriented configuration needs to be migrated to the new templates.
## Future ## Future
+3 -53
View File
@@ -5,7 +5,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"os/exec" "os/exec"
"strings"
"text/template" "text/template"
"time" "time"
@@ -45,70 +44,21 @@ func (alert Alert) IsValid() bool {
// BuildTemplates compiles command templates for the Alert // BuildTemplates compiles command templates for the Alert
func (alert *Alert) BuildTemplates() error { func (alert *Alert) BuildTemplates() error {
// TODO: Remove legacy template support later after 1.0
legacy := strings.NewReplacer(
"{alert_count}", "{{.AlertCount}}",
"{alert_message}", "{{.MonitorName}} check has failed {{.FailureCount}} times",
"{failure_count}", "{{.FailureCount}}",
"{last_output}", "{{.LastCheckOutput}}",
"{last_success}", "{{.LastSuccess}}",
"{monitor_name}", "{{.MonitorName}}",
)
slog.Debugf("Building template for alert %s", alert.Name) slog.Debugf("Building template for alert %s", alert.Name)
// Time format func factory
tff := func(formatString string) func(time.Time) string {
return func(t time.Time) string {
return t.Format(formatString)
}
}
// Create some functions for formatting datetimes in popular formats
timeFormatFuncs := template.FuncMap{
"ANSIC": tff(time.ANSIC),
"UnixDate": tff(time.UnixDate),
"RubyDate": tff(time.RubyDate),
"RFC822Z": tff(time.RFC822Z),
"RFC850": tff(time.RFC850),
"RFC1123": tff(time.RFC1123),
"RFC1123Z": tff(time.RFC1123Z),
"RFC3339": tff(time.RFC3339),
"RFC3339Nano": tff(time.RFC3339Nano),
"FormatTime": func(t time.Time, timeFormat string) string {
return t.Format(timeFormat)
},
"InTZ": func(t time.Time, tzName string) (time.Time, error) {
tz, err := time.LoadLocation(tzName)
if err != nil {
return t, fmt.Errorf("failed to convert time to specified tz: %w", err)
}
return t.In(tz), nil
},
}
switch { switch {
case alert.commandTemplate == nil && alert.Command.Command != nil: case alert.commandTemplate == nil && alert.Command.Command != nil:
alert.commandTemplate = []*template.Template{} alert.commandTemplate = []*template.Template{}
for i, cmdPart := range alert.Command.Command { for i, cmdPart := range alert.Command.Command {
if PyCompat {
cmdPart = legacy.Replace(cmdPart)
}
alert.commandTemplate = append(alert.commandTemplate, template.Must( alert.commandTemplate = append(alert.commandTemplate, template.Must(
template.New(alert.Name+fmt.Sprint(i)).Funcs(timeFormatFuncs).Parse(cmdPart), template.New(alert.Name+fmt.Sprint(i)).Parse(cmdPart),
)) ))
} }
case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "": case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
shellCmd := alert.Command.ShellCommand shellCmd := alert.Command.ShellCommand
if PyCompat {
shellCmd = legacy.Replace(shellCmd)
}
alert.commandShellTemplate = template.Must( alert.commandShellTemplate = template.Must(
template.New(alert.Name).Funcs(timeFormatFuncs).Parse(shellCmd), template.New(alert.Name).Parse(shellCmd),
) )
default: default:
return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate) return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
@@ -168,7 +118,7 @@ func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {
if err != nil { if err != nil {
err = fmt.Errorf( err = fmt.Errorf(
"Alert %s failed to send. Returned %w: %w", "Alert '%s' failed to send. Returned %v: %w",
alert.Name, alert.Name,
err, err,
ErrAlertFailed, ErrAlertFailed,
-13
View File
@@ -70,14 +70,6 @@ func TestAlertSend(t *testing.T) {
"Command shell with bad template", "Command shell with bad template",
false, false,
}, },
{
Alert{Command: CommandOrShell{ShellCommand: "echo {alert_message}"}},
AlertNotice{MonitorName: "test", FailureCount: 1},
"test check has failed 1 times\n",
false,
"Command shell with legacy template",
true,
},
// Test default log alert down // Test default log alert down
{ {
*NewLogAlert(), *NewLogAlert(),
@@ -100,8 +92,6 @@ func TestAlertSend(t *testing.T) {
for _, c := range cases { for _, c := range cases {
log.Printf("Testing case %s", c.name) log.Printf("Testing case %s", c.name)
// Set PyCompat to value of compat flag
PyCompat = c.pyCompat
err := c.alert.BuildTemplates() err := c.alert.BuildTemplates()
if err != nil { if err != nil {
@@ -121,9 +111,6 @@ func TestAlertSend(t *testing.T) {
log.Printf("Case failed: %s", c.name) log.Printf("Case failed: %s", c.name)
} }
// Set PyCompat back to default value
PyCompat = false
log.Println("-----") log.Println("-----")
} }
} }
+5 -45
View File
@@ -13,11 +13,11 @@ var errInvalidConfig = errors.New("Invalid configuration")
// Config type contains all provided user configuration // Config type contains all provided user configuration
type Config struct { type Config struct {
CheckInterval SecondsOrDuration `yaml:"check_interval"` CheckInterval time.Duration `yaml:"check_interval"`
DefaultAlertAfter int16 `yaml:"default_alert_after"` DefaultAlertAfter int16 `yaml:"default_alert_after"`
DefaultAlertEvery *int16 `yaml:"default_alert_every"` DefaultAlertEvery *int16 `yaml:"default_alert_every"`
DefaultAlertDown []string `yaml:"default_alert_down"` DefaultAlertDown []string `yaml:"default_alert_down"`
DefaultAlertUp []string `yaml:"default_alert_up"` DefaultAlertUp []string `yaml:"default_alert_up"`
Monitors []*Monitor Monitors []*Monitor
Alerts map[string]*Alert Alerts map[string]*Alert
} }
@@ -56,34 +56,6 @@ func (cos *CommandOrShell) UnmarshalYAML(unmarshal func(interface{}) error) erro
return nil return nil
} }
// SecondsOrDuration wraps a duration value for parsing a duration or seconds from YAML
// NOTE: This should be removed in favor of only parsing durations once compatibility is broken
type SecondsOrDuration struct {
value time.Duration
}
// Value returns a duration value
func (sod SecondsOrDuration) Value() time.Duration {
return sod.value
}
// UnmarshalYAML allows unmarshalling a duration value or seconds if an int was provided
func (sod *SecondsOrDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
var seconds int64
err := unmarshal(&seconds)
if err == nil {
sod.value = time.Second * time.Duration(seconds)
return nil
}
// Error indicates that we don't have an int
err = unmarshal(&sod.value)
return err
}
// IsValid checks config validity and returns true if valid // IsValid checks config validity and returns true if valid
func (config Config) IsValid() (isValid bool) { func (config Config) IsValid() (isValid bool) {
isValid = true isValid = true
@@ -182,18 +154,6 @@ func LoadConfig(filePath string) (config Config, err error) {
slog.Debugf("Config values:\n%v\n", config) slog.Debugf("Config values:\n%v\n", config)
// Add log alert if not present
if PyCompat {
// Initialize alerts list if not present
if config.Alerts == nil {
config.Alerts = map[string]*Alert{}
}
if _, ok := config.Alerts["log"]; !ok {
config.Alerts["log"] = NewLogAlert()
}
}
// Finish initializing configuration // Finish initializing configuration
if err = config.Init(); err != nil { if err = config.Init(); err != nil {
return return
+3 -9
View File
@@ -15,7 +15,6 @@ func TestLoadConfig(t *testing.T) {
}{ }{
{"./test/valid-config.yml", false, "Valid config file", false}, {"./test/valid-config.yml", false, "Valid config file", false},
{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false}, {"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false}, {"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
{"./test/does-not-exist", true, "Invalid config path", false}, {"./test/does-not-exist", true, "Invalid config path", false},
{"./test/invalid-config-type.yml", true, "Invalid config type for key", false}, {"./test/invalid-config-type.yml", true, "Invalid config type for key", false},
@@ -25,8 +24,6 @@ func TestLoadConfig(t *testing.T) {
for _, c := range cases { for _, c := range cases {
log.Printf("Testing case %s", c.name) log.Printf("Testing case %s", c.name)
// Set PyCompat based on compatibility mode
PyCompat = c.pyCompat
_, err := LoadConfig(c.configPath) _, err := LoadConfig(c.configPath)
hasErr := (err != nil) hasErr := (err != nil)
@@ -34,9 +31,6 @@ func TestLoadConfig(t *testing.T) {
t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err) t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
log.Printf("Case failed: %s", c.name) log.Printf("Case failed: %s", c.name)
} }
// Set PyCompat to default value
PyCompat = false
} }
} }
@@ -53,15 +47,15 @@ func TestIntervalParsing(t *testing.T) {
oneMinute := time.Minute oneMinute := time.Minute
// validate top level interval seconds represented as an int // validate top level interval seconds represented as an int
if config.CheckInterval.Value() != oneSecond { if config.CheckInterval != oneSecond {
t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval) t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval)
} }
if config.Monitors[0].CheckInterval.Value() != tenSeconds { if config.Monitors[0].CheckInterval != tenSeconds {
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval) t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
} }
if config.Monitors[1].CheckInterval.Value() != oneMinute { if config.Monitors[1].CheckInterval != oneMinute {
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval) t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
} }
+1 -1
View File
@@ -1,6 +1,6 @@
module git.iamthefij.com/iamthefij/minitor-go module git.iamthefij.com/iamthefij/minitor-go
go 1.20 go 1.17
require ( require (
git.iamthefij.com/iamthefij/slog v1.3.0 git.iamthefij.com/iamthefij/slog v1.3.0
-72
View File
@@ -1,72 +0,0 @@
package main
import (
"fmt"
"io"
"net/http"
"strings"
)
// HealthCheckHandler serves health check responses over HTTP. It tracks
// whether Minitor itself is healthy and which monitors to report on.
type HealthCheckHandler struct {
// isMinitorHealthy is the value last stored through MinitorHealthy.
isMinitorHealthy bool
// monitors are the monitors inspected by MonitorsHealthCheck.
monitors []*Monitor
}
// NewHealthCheckHandler builds a handler that reports on the given monitors.
// The returned handler starts out marked unhealthy; use MinitorHealthy to
// update that state.
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
	handler := HealthCheckHandler{
		isMinitorHealthy: false,
		monitors:         monitors,
	}

	return &handler
}
// MinitorHealthy records whether Minitor itself is currently healthy. The
// stored value is what MinitorHealthCheck reports afterwards.
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
hch.isMinitorHealthy = healthy
}
// MinitorHealthCheck reports the health of Minitor itself, as last recorded
// through MinitorHealthy. It returns the health flag together with the
// response body to serve: "OK" when healthy and "UNHEALTHY" otherwise.
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
	if !hch.isMinitorHealthy {
		return false, "UNHEALTHY"
	}

	return true, "OK"
}
// MonitorsHealthCheck reports whether every monitor held by the handler is up.
// It returns true and "OK" when no monitor is down; otherwise it returns false
// and a body listing the names of the monitors for which IsUp is false, in
// the order they appear in the handler, separated by ", ".
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
	var down []string

	for _, m := range hch.monitors {
		if m.IsUp() {
			continue
		}

		down = append(down, m.Name)
	}

	if len(down) > 0 {
		return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(down, ", "))
	}

	return true, "OK"
}
// ServeHTTP answers a health check request. When the request carries a
// non-empty "monitors" query parameter the state of the monitors is reported;
// otherwise the health of Minitor itself is reported. A healthy result is
// answered with 200 OK and an unhealthy one with 503 Service Unavailable; the
// textual result is written as the response body.
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	// Default to checking Minitor itself; switch to the monitors check on request.
	check := hch.MinitorHealthCheck
	if r.URL.Query().Get("monitors") != "" {
		check = hch.MonitorsHealthCheck
	}

	healthy, body := check()

	status := http.StatusServiceUnavailable
	if healthy {
		status = http.StatusOK
	}

	w.WriteHeader(status)

	// The write result is deliberately discarded, as in the original.
	_, _ = io.WriteString(w, body)
}
// HandleHealthCheck registers the package-level HealthChecks handler on the
// default http server under the "/health" path.
//
// The health handler must not be registered on "/metrics": that pattern is
// used for the Prometheus metrics handler, and registering a second handler
// for an already registered pattern on the default mux panics.
func HandleHealthCheck() {
	http.Handle("/health", HealthChecks)
}
-79
View File
@@ -1,79 +0,0 @@
package main
import (
"testing"
)
// TestNewHealthCheck checks the state of a freshly constructed handler: it
// must report Minitor as unhealthy, and it must observe changes made to the
// monitors through the shared pointers.
func TestNewHealthCheck(t *testing.T) {
	testMonitor := &Monitor{Name: "Test Monitor"}
	handler := NewHealthCheckHandler([]*Monitor{testMonitor})

	// Mutate the monitor after construction; the handler holds the same pointer.
	testMonitor.alertCount++

	minitorHealthy, _ := handler.MinitorHealthCheck()
	if minitorHealthy {
		t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
	}

	monitorsHealthy, _ := handler.MonitorsHealthCheck()
	if monitorsHealthy {
		t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
	}
}
// TestMinitorHealthCheck verifies that MinitorHealthCheck reflects the state
// most recently stored through MinitorHealthy, for both the healthy and the
// unhealthy case.
func TestMinitorHealthCheck(t *testing.T) {
	monitors := []*Monitor{
		{Name: "Test Monitor"},
	}
	hc := NewHealthCheckHandler(monitors)

	t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
		hc.MinitorHealthy(true)

		healthy, body := hc.MinitorHealthCheck()
		if !healthy {
			t.Errorf("Expected healthy check")
		}

		if body != "OK" {
			t.Errorf("Expected OK response, got %q", body)
		}
	})

	t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
		hc.MinitorHealthy(false)

		healthy, body := hc.MinitorHealthCheck()
		if healthy {
			// The failure message now matches the assertion being made.
			t.Errorf("Expected unhealthy check")
		}

		if body != "UNHEALTHY" {
			t.Errorf("Expected UNHEALTHY response, got %q", body)
		}
	})
}
// TestMonitorsHealthCheck verifies that MonitorsHealthCheck reports OK while
// all monitors are up and names the failing monitors once one is down.
func TestMonitorsHealthCheck(t *testing.T) {
	monitors := []*Monitor{
		{Name: "Test Monitor"},
	}
	hc := NewHealthCheckHandler(monitors)

	t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
		healthy, body := hc.MonitorsHealthCheck()
		if !healthy {
			t.Errorf("Expected healthy check")
		}

		if body != "OK" {
			t.Errorf("Expected OK response, got %q", body)
		}
	})

	t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
		// NOTE(review): presumably a non-zero alertCount makes IsUp return
		// false; Monitor is defined outside this file, so confirm there.
		monitors[0].alertCount++

		healthy, body := hc.MonitorsHealthCheck()
		if healthy {
			// The failure message now matches the assertion being made.
			t.Errorf("Expected unhealthy check")
		}

		if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
			t.Errorf("Expected UNHEALTHY response, got %q", body)
		}
	})
}
+5 -50
View File
@@ -4,7 +4,6 @@ import (
"errors" "errors"
"flag" "flag"
"fmt" "fmt"
"net/http"
"time" "time"
"git.iamthefij.com/iamthefij/slog" "git.iamthefij.com/iamthefij/slog"
@@ -17,13 +16,6 @@ var (
MetricsPort = 8080 MetricsPort = 8080
// Metrics contains all active metrics // Metrics contains all active metrics
Metrics = NewMetrics() Metrics = NewMetrics()
// Self monitor rather than panicking
SelfMonitor = false
// HealthChecks contains health check values
HealthChecks *HealthCheckHandler = nil
// PyCompat enables support for legacy Python templates
PyCompat = false
// version of minitor being run // version of minitor being run
version = "dev" version = "dev"
@@ -56,13 +48,7 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
output, output,
) )
if SelfMonitor {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
}
return err return err
} else {
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
} }
// Count alert metrics // Count alert metrics
@@ -80,8 +66,6 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
func checkMonitors(config *Config) error { func checkMonitors(config *Config) error {
// TODO: Run this in goroutines and capture exceptions // TODO: Run this in goroutines and capture exceptions
healthy := true
for _, monitor := range config.Monitors { for _, monitor := range config.Monitors {
if monitor.ShouldCheck() { if monitor.ShouldCheck() {
success, alertNotice := monitor.Check() success, alertNotice := monitor.Check()
@@ -93,42 +77,24 @@ func checkMonitors(config *Config) error {
if alertNotice != nil { if alertNotice != nil {
err := sendAlerts(config, monitor, alertNotice) err := sendAlerts(config, monitor, alertNotice)
// If there was an error in sending an alert, mark as unhealthy or bubble up // If there was an error in sending an alert, exit early and bubble it up
if err != nil { if err != nil {
if SelfMonitor { return err
healthy = false
} else {
return err
}
} }
} }
} }
} }
if HealthChecks != nil {
HealthChecks.MinitorHealthy(healthy)
}
return nil return nil
} }
// ServeMetricsAndHealth starts the default http server
func ServeMetricsAndHealth() {
host := fmt.Sprintf(":%d", MetricsPort)
_ = http.ListenAndServe(host, nil)
}
func main() { func main() {
showVersion := flag.Bool("version", false, "Display the version of minitor and exit") showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)") configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)") flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)") flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)") flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
flag.Parse() flag.Parse()
// Print version if flag is provided // Print version if flag is provided
@@ -145,19 +111,8 @@ func main() {
// Serve metrics exporter, if specified // Serve metrics exporter, if specified
if ExportMetrics { if ExportMetrics {
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort) slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
HandleMetrics()
}
if SelfMonitor { go ServeMetrics()
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
HealthChecks = NewHealthCheckHandler(config.Monitors)
HandleHealthCheck()
}
if ExportMetrics || SelfMonitor {
go ServeMetricsAndHealth()
} }
// Start main loop // Start main loop
@@ -165,6 +120,6 @@ func main() {
err = checkMonitors(&config) err = checkMonitors(&config)
slog.OnErrPanicf(err, "Error checking monitors") slog.OnErrPanicf(err, "Error checking monitors")
time.Sleep(config.CheckInterval.Value()) time.Sleep(config.CheckInterval)
} }
} }
+17 -63
View File
@@ -4,10 +4,9 @@ import "testing"
func TestCheckMonitors(t *testing.T) { func TestCheckMonitors(t *testing.T) {
cases := []struct { cases := []struct {
config Config config Config
expectErr bool expectErr bool
name string name string
selfMonitor bool
}{ }{
{ {
config: Config{}, config: Config{},
@@ -23,9 +22,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: false, expectErr: false,
name: "Monitor success, no alerts", name: "Monitor success, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -37,9 +35,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: false, expectErr: false,
name: "Monitor failure, no alerts", name: "Monitor failure, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -51,9 +48,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: false, expectErr: false,
name: "Monitor recovery, no alerts", name: "Monitor recovery, no alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -66,9 +62,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: true, expectErr: true,
name: "Monitor failure, unknown alerts", name: "Monitor failure, unknown alerts",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -81,24 +76,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: true, expectErr: true,
name: "Monitor recovery, unknown alerts", name: "Monitor recovery, unknown alerts",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Success",
Command: CommandOrShell{Command: []string{"true"}},
AlertUp: []string{"unknown"},
alertCount: 1,
},
},
},
expectErr: false,
name: "Monitor recovery, unknown alerts, with Health Check",
selfMonitor: true,
}, },
{ {
config: Config{ config: Config{
@@ -116,9 +95,8 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: false, expectErr: false,
name: "Monitor failure, successful alert", name: "Monitor failure, successful alert",
selfMonitor: false,
}, },
{ {
config: Config{ config: Config{
@@ -137,36 +115,12 @@ func TestCheckMonitors(t *testing.T) {
}, },
}, },
}, },
expectErr: true, expectErr: true,
name: "Monitor failure, bad alert", name: "Monitor failure, bad alert",
selfMonitor: false,
},
{
config: Config{
Monitors: []*Monitor{
{
Name: "Failure",
Command: CommandOrShell{Command: []string{"false"}},
AlertDown: []string{"bad"},
AlertAfter: 1,
},
},
Alerts: map[string]*Alert{
"bad": {
Name: "bad",
Command: CommandOrShell{Command: []string{"false"}},
},
},
},
expectErr: false,
name: "Monitor failure, bad alert, with Health Check",
selfMonitor: true,
}, },
} }
for _, c := range cases { for _, c := range cases {
SelfMonitor = c.selfMonitor
err := c.config.Init() err := c.config.Init()
if err != nil { if err != nil {
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err) t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
+7 -2
View File
@@ -1,6 +1,7 @@
package main package main
import ( import (
"fmt"
"net/http" "net/http"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
@@ -106,7 +107,11 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
).Inc() ).Inc()
} }
// HandleMetrics add Prometheus metrics handler to default http server // ServeMetrics starts an http server with a Prometheus metrics handler
func HandleMetrics() { func ServeMetrics() {
http.Handle("/metrics", promhttp.Handler()) http.Handle("/metrics", promhttp.Handler())
host := fmt.Sprintf(":%d", MetricsPort)
_ = http.ListenAndServe(host, nil)
} }
+4 -4
View File
@@ -11,9 +11,9 @@ import (
// Monitor represents a particular periodic check of a command // Monitor represents a particular periodic check of a command
type Monitor struct { //nolint:maligned type Monitor struct { //nolint:maligned
// Config values // Config values
AlertAfter int16 `yaml:"alert_after"` AlertAfter int16 `yaml:"alert_after"`
AlertEvery *int16 `yaml:"alert_every"` AlertEvery *int16 `yaml:"alert_every"`
CheckInterval SecondsOrDuration `yaml:"check_interval"` CheckInterval time.Duration `yaml:"check_interval"`
Name string Name string
AlertDown []string `yaml:"alert_down"` AlertDown []string `yaml:"alert_down"`
AlertUp []string `yaml:"alert_up"` AlertUp []string `yaml:"alert_up"`
@@ -45,7 +45,7 @@ func (monitor Monitor) ShouldCheck() bool {
sinceLastCheck := time.Since(monitor.lastCheck) sinceLastCheck := time.Since(monitor.lastCheck)
return sinceLastCheck >= monitor.CheckInterval.Value() return sinceLastCheck >= monitor.CheckInterval
} }
// Check will run the command configured by the Monitor and return a status // Check will run the command configured by the Monitor and return a status
+3 -3
View File
@@ -45,9 +45,9 @@ func TestMonitorShouldCheck(t *testing.T) {
name string name string
}{ }{
{Monitor{}, true, "Empty"}, {Monitor{}, true, "Empty"},
{Monitor{lastCheck: timeNow, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "Just checked"}, {Monitor{lastCheck: timeNow, CheckInterval: time.Second * 15}, false, "Just checked"},
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "-10s"}, {Monitor{lastCheck: timeTenSecAgo, CheckInterval: time.Second * 15}, false, "-10s"},
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, true, "-20s"}, {Monitor{lastCheck: timeTwentySecAgo, CheckInterval: time.Second * 15}, true, "-20s"},
} }
for _, c := range cases { for _, c := range cases {
+1 -1
View File
@@ -1,5 +1,5 @@
--- ---
check_interval: 1 check_interval: 1s
monitors: monitors:
- name: Command - name: Command