Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a36da79d6 | ||
|
|
01cca50532 | ||
|
|
2789aa63e4 | ||
|
|
37db4b2db0 | ||
|
|
41a1dbeceb | ||
|
|
c02d64d674 | ||
|
|
46f4561bea | ||
|
|
a1e0e9698b |
+2
-2
@@ -4,7 +4,7 @@ name: test
|
||||
|
||||
steps:
|
||||
- name: test
|
||||
image: golang:1.17
|
||||
image: golang:1.20
|
||||
environment:
|
||||
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
||||
commands:
|
||||
@@ -30,7 +30,7 @@ trigger:
|
||||
|
||||
steps:
|
||||
- name: build all binaries
|
||||
image: golang:1.17
|
||||
image: golang:1.20
|
||||
environment:
|
||||
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
||||
commands:
|
||||
|
||||
+2
-2
@@ -1,11 +1,11 @@
|
||||
ARG REPO=library
|
||||
FROM ${REPO}/alpine:3.17
|
||||
FROM ${REPO}/alpine:3.18
|
||||
|
||||
RUN mkdir /app
|
||||
WORKDIR /app/
|
||||
|
||||
# Add common checking tools
|
||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
|
||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
|
||||
|
||||
# Add minitor user for running as non-root
|
||||
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
ARG REPO=library
|
||||
FROM golang:1.17 AS builder
|
||||
FROM golang:1.20 AS builder
|
||||
|
||||
RUN mkdir /app
|
||||
WORKDIR /app
|
||||
@@ -14,7 +14,7 @@ ARG VERSION=dev
|
||||
ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
|
||||
RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .
|
||||
|
||||
FROM ${REPO}/alpine:3.17
|
||||
FROM ${REPO}/alpine:3.18
|
||||
RUN mkdir /app
|
||||
WORKDIR /app/
|
||||
|
||||
@@ -22,7 +22,7 @@ WORKDIR /app/
|
||||
COPY --from=builder /app/minitor .
|
||||
|
||||
# Add common checking tools
|
||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
|
||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
|
||||
|
||||
# Add minitor user for running as non-root
|
||||
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
||||
|
||||
@@ -46,6 +46,8 @@ docker run -v $PWD/config.yml:/app/config.yml iamthefij/minitor-go:latest
|
||||
|
||||
Images are provided for `amd64`, `arm`, and `arm64` architectures.
|
||||
|
||||
Timezone configuration for the container is set by passing the `TZ` environment variable, e.g. `TZ=America/Los_Angeles`.
|
||||
|
||||
## Configuring
|
||||
|
||||
In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
|
||||
@@ -94,10 +96,28 @@ Also, when alerts are executed, they will be passed through Go's format function
|
||||
|`{{.AlertCount}}`|Number of times this monitor has alerted|
|
||||
|`{{.FailureCount}}`|The total number of sequential failed checks for this monitor|
|
||||
|`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout|
|
||||
|`{{.LastSuccess}}`|The ISO datetime of the last successful check|
|
||||
|`{{.LastSuccess}}`|The datetime of the last successful check as a go Time struct|
|
||||
|`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert|
|
||||
|`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template|
|
||||
|
||||
To provide flexible formatting, the following non-standard functions are available in templates:
|
||||
|
||||
|func|description|
|
||||
|---|---|
|
||||
|`ANSIC <Time>`|Formats provided time in ANSIC format|
|
||||
|`UnixDate <Time>`|Formats provided time in UnixDate format|
|
||||
|`RubyDate <Time>`|Formats provided time in RubyDate format|
|
||||
|`RFC822Z <Time>`|Formats provided time in RFC822Z format|
|
||||
|`RFC850 <Time>`|Formats provided time in RFC850 format|
|
||||
|`RFC1123 <Time>`|Formats provided time in RFC1123 format|
|
||||
|`RFC1123Z <Time>`|Formats provided time in RFC1123Z format|
|
||||
|`RFC3339 <Time>`|Formats provided time in RFC3339 format|
|
||||
|`RFC3339Nano <Time>`|Formats provided time in RFC3339Nano format|
|
||||
|`FormatTime <Time> <string template>`|Formats provided time according to provided template|
|
||||
|`InTZ <Time> <string timezone name>`|Converts provided time to parsed timezone from the provided name|
|
||||
|
||||
For more information, check out the [Go documentation for the time package](https://pkg.go.dev/time@go1.20.7#pkg-constants).
|
||||
|
||||
### Metrics
|
||||
|
||||
Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
|
||||
@@ -158,7 +178,7 @@ minitor-go:
|
||||
check_interval: 1m30s
|
||||
```
|
||||
|
||||
The `-py-compat` flag has been removed. Any existing Python oriented configuration needs to be migrated to the new templates.
|
||||
For the time being, legacy configs for the Python version of Minitor should be compatible if you apply the `-py-compat` flag when running Minitor. Eventually, this flag will go away when later breaking changes are introduced.
|
||||
|
||||
## Future
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
@@ -44,21 +45,70 @@ func (alert Alert) IsValid() bool {
|
||||
|
||||
// BuildTemplates compiles command templates for the Alert
|
||||
func (alert *Alert) BuildTemplates() error {
|
||||
// TODO: Remove legacy template support later after 1.0
|
||||
legacy := strings.NewReplacer(
|
||||
"{alert_count}", "{{.AlertCount}}",
|
||||
"{alert_message}", "{{.MonitorName}} check has failed {{.FailureCount}} times",
|
||||
"{failure_count}", "{{.FailureCount}}",
|
||||
"{last_output}", "{{.LastCheckOutput}}",
|
||||
"{last_success}", "{{.LastSuccess}}",
|
||||
"{monitor_name}", "{{.MonitorName}}",
|
||||
)
|
||||
|
||||
slog.Debugf("Building template for alert %s", alert.Name)
|
||||
|
||||
// Time format func factory
|
||||
tff := func(formatString string) func(time.Time) string {
|
||||
return func(t time.Time) string {
|
||||
return t.Format(formatString)
|
||||
}
|
||||
}
|
||||
|
||||
// Create some functions for formatting datetimes in popular formats
|
||||
timeFormatFuncs := template.FuncMap{
|
||||
"ANSIC": tff(time.ANSIC),
|
||||
"UnixDate": tff(time.UnixDate),
|
||||
"RubyDate": tff(time.RubyDate),
|
||||
"RFC822Z": tff(time.RFC822Z),
|
||||
"RFC850": tff(time.RFC850),
|
||||
"RFC1123": tff(time.RFC1123),
|
||||
"RFC1123Z": tff(time.RFC1123Z),
|
||||
"RFC3339": tff(time.RFC3339),
|
||||
"RFC3339Nano": tff(time.RFC3339Nano),
|
||||
"FormatTime": func(t time.Time, timeFormat string) string {
|
||||
return t.Format(timeFormat)
|
||||
},
|
||||
"InTZ": func(t time.Time, tzName string) (time.Time, error) {
|
||||
tz, err := time.LoadLocation(tzName)
|
||||
if err != nil {
|
||||
return t, fmt.Errorf("failed to convert time to specified tz: %w", err)
|
||||
}
|
||||
|
||||
return t.In(tz), nil
|
||||
},
|
||||
}
|
||||
|
||||
switch {
|
||||
case alert.commandTemplate == nil && alert.Command.Command != nil:
|
||||
alert.commandTemplate = []*template.Template{}
|
||||
for i, cmdPart := range alert.Command.Command {
|
||||
if PyCompat {
|
||||
cmdPart = legacy.Replace(cmdPart)
|
||||
}
|
||||
|
||||
alert.commandTemplate = append(alert.commandTemplate, template.Must(
|
||||
template.New(alert.Name+fmt.Sprint(i)).Parse(cmdPart),
|
||||
template.New(alert.Name+fmt.Sprint(i)).Funcs(timeFormatFuncs).Parse(cmdPart),
|
||||
))
|
||||
}
|
||||
case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
|
||||
shellCmd := alert.Command.ShellCommand
|
||||
|
||||
if PyCompat {
|
||||
shellCmd = legacy.Replace(shellCmd)
|
||||
}
|
||||
|
||||
alert.commandShellTemplate = template.Must(
|
||||
template.New(alert.Name).Parse(shellCmd),
|
||||
template.New(alert.Name).Funcs(timeFormatFuncs).Parse(shellCmd),
|
||||
)
|
||||
default:
|
||||
return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
|
||||
@@ -118,7 +168,7 @@ func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {
|
||||
|
||||
if err != nil {
|
||||
err = fmt.Errorf(
|
||||
"Alert '%s' failed to send. Returned %v: %w",
|
||||
"Alert %s failed to send. Returned %w: %w",
|
||||
alert.Name,
|
||||
err,
|
||||
ErrAlertFailed,
|
||||
|
||||
@@ -70,6 +70,14 @@ func TestAlertSend(t *testing.T) {
|
||||
"Command shell with bad template",
|
||||
false,
|
||||
},
|
||||
{
|
||||
Alert{Command: CommandOrShell{ShellCommand: "echo {alert_message}"}},
|
||||
AlertNotice{MonitorName: "test", FailureCount: 1},
|
||||
"test check has failed 1 times\n",
|
||||
false,
|
||||
"Command shell with legacy template",
|
||||
true,
|
||||
},
|
||||
// Test default log alert down
|
||||
{
|
||||
*NewLogAlert(),
|
||||
@@ -92,6 +100,8 @@ func TestAlertSend(t *testing.T) {
|
||||
|
||||
for _, c := range cases {
|
||||
log.Printf("Testing case %s", c.name)
|
||||
// Set PyCompat to value of compat flag
|
||||
PyCompat = c.pyCompat
|
||||
|
||||
err := c.alert.BuildTemplates()
|
||||
if err != nil {
|
||||
@@ -111,6 +121,9 @@ func TestAlertSend(t *testing.T) {
|
||||
log.Printf("Case failed: %s", c.name)
|
||||
}
|
||||
|
||||
// Set PyCompat back to default value
|
||||
PyCompat = false
|
||||
|
||||
log.Println("-----")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ var errInvalidConfig = errors.New("Invalid configuration")
|
||||
|
||||
// Config type contains all provided user configuration
|
||||
type Config struct {
|
||||
CheckInterval time.Duration `yaml:"check_interval"`
|
||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
||||
DefaultAlertAfter int16 `yaml:"default_alert_after"`
|
||||
DefaultAlertEvery *int16 `yaml:"default_alert_every"`
|
||||
DefaultAlertDown []string `yaml:"default_alert_down"`
|
||||
@@ -56,6 +56,34 @@ func (cos *CommandOrShell) UnmarshalYAML(unmarshal func(interface{}) error) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
// SecondsOrDuration holds a time.Duration that can be read from YAML either
// as a bare integer (interpreted as seconds) or as whatever representation
// the YAML decoder accepts for a time.Duration.
// NOTE: This should be removed in favor of only parsing durations once compatibility is broken
type SecondsOrDuration struct {
	value time.Duration
}

// Value exposes the wrapped duration.
func (sod SecondsOrDuration) Value() time.Duration {
	return sod.value
}

// UnmarshalYAML first tries to decode the node as an int64 number of seconds.
// If that fails, the node is decoded directly into the wrapped duration and
// the result of that second attempt is returned.
func (sod *SecondsOrDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var rawSeconds int64

	if intErr := unmarshal(&rawSeconds); intErr != nil {
		// Not an integer: let the decoder fill in the duration itself.
		return unmarshal(&sod.value)
	}

	sod.value = time.Duration(rawSeconds) * time.Second

	return nil
}
|
||||
|
||||
// IsValid checks config validity and returns true if valid
|
||||
func (config Config) IsValid() (isValid bool) {
|
||||
isValid = true
|
||||
@@ -154,6 +182,18 @@ func LoadConfig(filePath string) (config Config, err error) {
|
||||
|
||||
slog.Debugf("Config values:\n%v\n", config)
|
||||
|
||||
// Add log alert if not present
|
||||
if PyCompat {
|
||||
// Initialize alerts list if not present
|
||||
if config.Alerts == nil {
|
||||
config.Alerts = map[string]*Alert{}
|
||||
}
|
||||
|
||||
if _, ok := config.Alerts["log"]; !ok {
|
||||
config.Alerts["log"] = NewLogAlert()
|
||||
}
|
||||
}
|
||||
|
||||
// Finish initializing configuration
|
||||
if err = config.Init(); err != nil {
|
||||
return
|
||||
|
||||
+9
-3
@@ -15,6 +15,7 @@ func TestLoadConfig(t *testing.T) {
|
||||
}{
|
||||
{"./test/valid-config.yml", false, "Valid config file", false},
|
||||
{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
|
||||
{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
|
||||
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
|
||||
{"./test/does-not-exist", true, "Invalid config path", false},
|
||||
{"./test/invalid-config-type.yml", true, "Invalid config type for key", false},
|
||||
@@ -24,6 +25,8 @@ func TestLoadConfig(t *testing.T) {
|
||||
|
||||
for _, c := range cases {
|
||||
log.Printf("Testing case %s", c.name)
|
||||
// Set PyCompat based on compatibility mode
|
||||
PyCompat = c.pyCompat
|
||||
_, err := LoadConfig(c.configPath)
|
||||
hasErr := (err != nil)
|
||||
|
||||
@@ -31,6 +34,9 @@ func TestLoadConfig(t *testing.T) {
|
||||
t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
|
||||
log.Printf("Case failed: %s", c.name)
|
||||
}
|
||||
|
||||
// Set PyCompat to default value
|
||||
PyCompat = false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,15 +53,15 @@ func TestIntervalParsing(t *testing.T) {
|
||||
oneMinute := time.Minute
|
||||
|
||||
// validate top level interval seconds represented as an int
|
||||
if config.CheckInterval != oneSecond {
|
||||
if config.CheckInterval.Value() != oneSecond {
|
||||
t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||
}
|
||||
|
||||
if config.Monitors[0].CheckInterval != tenSeconds {
|
||||
if config.Monitors[0].CheckInterval.Value() != tenSeconds {
|
||||
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||
}
|
||||
|
||||
if config.Monitors[1].CheckInterval != oneMinute {
|
||||
if config.Monitors[1].CheckInterval.Value() != oneMinute {
|
||||
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
module git.iamthefij.com/iamthefij/minitor-go
|
||||
|
||||
go 1.17
|
||||
go 1.20
|
||||
|
||||
require (
|
||||
git.iamthefij.com/iamthefij/slog v1.3.0
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type HealthCheckHandler struct {
|
||||
isMinitorHealthy bool
|
||||
monitors []*Monitor
|
||||
}
|
||||
|
||||
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
|
||||
return &HealthCheckHandler{
|
||||
false,
|
||||
monitors,
|
||||
}
|
||||
}
|
||||
|
||||
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
|
||||
hch.isMinitorHealthy = healthy
|
||||
}
|
||||
|
||||
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
|
||||
if hch.isMinitorHealthy {
|
||||
return true, "OK"
|
||||
} else {
|
||||
return false, "UNHEALTHY"
|
||||
}
|
||||
}
|
||||
|
||||
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
|
||||
downMonitors := []string{}
|
||||
|
||||
for _, monitor := range hch.monitors {
|
||||
if !monitor.IsUp() {
|
||||
downMonitors = append(downMonitors, monitor.Name)
|
||||
}
|
||||
}
|
||||
|
||||
if len(downMonitors) == 0 {
|
||||
return true, "OK"
|
||||
} else {
|
||||
return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
|
||||
}
|
||||
}
|
||||
|
||||
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
var healthy bool
|
||||
|
||||
var body string
|
||||
|
||||
if monitors := r.URL.Query().Get("monitors"); monitors != "" {
|
||||
healthy, body = hch.MonitorsHealthCheck()
|
||||
} else {
|
||||
healthy, body = hch.MinitorHealthCheck()
|
||||
}
|
||||
|
||||
if healthy {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
} else {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
}
|
||||
|
||||
_, _ = io.WriteString(w, body)
|
||||
}
|
||||
|
||||
func HandleHealthCheck() {
|
||||
http.Handle("/metrics", HealthChecks)
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewHealthCheck(t *testing.T) {
|
||||
monitors := []*Monitor{
|
||||
{Name: "Test Monitor"},
|
||||
}
|
||||
hc := NewHealthCheckHandler(monitors)
|
||||
|
||||
monitors[0].alertCount++
|
||||
|
||||
if healthy, _ := hc.MinitorHealthCheck(); healthy {
|
||||
t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
|
||||
}
|
||||
|
||||
if healthy, _ := hc.MonitorsHealthCheck(); healthy {
|
||||
t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinitorHealthCheck(t *testing.T) {
|
||||
monitors := []*Monitor{
|
||||
{Name: "Test Monitor"},
|
||||
}
|
||||
hc := NewHealthCheckHandler(monitors)
|
||||
|
||||
t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
|
||||
hc.MinitorHealthy(true)
|
||||
healthy, body := hc.MinitorHealthCheck()
|
||||
if !healthy {
|
||||
t.Errorf("Expected healthy check")
|
||||
}
|
||||
if body != "OK" {
|
||||
t.Errorf("Expected OK response")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
|
||||
hc.MinitorHealthy(false)
|
||||
healthy, body := hc.MinitorHealthCheck()
|
||||
if healthy {
|
||||
t.Errorf("Expected healthy check")
|
||||
}
|
||||
if body != "UNHEALTHY" {
|
||||
t.Errorf("Expected UNHEALTHY response")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestMonitorsHealthCheck(t *testing.T) {
|
||||
monitors := []*Monitor{
|
||||
{Name: "Test Monitor"},
|
||||
}
|
||||
hc := NewHealthCheckHandler(monitors)
|
||||
|
||||
t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
|
||||
healthy, body := hc.MonitorsHealthCheck()
|
||||
if !healthy {
|
||||
t.Errorf("Expected healthy check")
|
||||
}
|
||||
if body != "OK" {
|
||||
t.Errorf("Expected OK response")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
|
||||
monitors[0].alertCount++
|
||||
healthy, body := hc.MonitorsHealthCheck()
|
||||
if healthy {
|
||||
t.Errorf("Expected healthy check")
|
||||
}
|
||||
if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
|
||||
t.Errorf("Expected UNHEALTHY response")
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"git.iamthefij.com/iamthefij/slog"
|
||||
@@ -16,6 +17,13 @@ var (
|
||||
MetricsPort = 8080
|
||||
// Metrics contains all active metrics
|
||||
Metrics = NewMetrics()
|
||||
// Self monitor rather than panicking
|
||||
SelfMonitor = false
|
||||
// HealthChecks contains health check values
|
||||
HealthChecks *HealthCheckHandler = nil
|
||||
|
||||
// PyCompat enables support for legacy Python templates
|
||||
PyCompat = false
|
||||
|
||||
// version of minitor being run
|
||||
version = "dev"
|
||||
@@ -48,7 +56,13 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
||||
output,
|
||||
)
|
||||
|
||||
if SelfMonitor {
|
||||
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
|
||||
}
|
||||
|
||||
return err
|
||||
} else {
|
||||
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
|
||||
}
|
||||
|
||||
// Count alert metrics
|
||||
@@ -66,6 +80,8 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
||||
|
||||
func checkMonitors(config *Config) error {
|
||||
// TODO: Run this in goroutines and capture exceptions
|
||||
healthy := true
|
||||
|
||||
for _, monitor := range config.Monitors {
|
||||
if monitor.ShouldCheck() {
|
||||
success, alertNotice := monitor.Check()
|
||||
@@ -77,24 +93,42 @@ func checkMonitors(config *Config) error {
|
||||
|
||||
if alertNotice != nil {
|
||||
err := sendAlerts(config, monitor, alertNotice)
|
||||
// If there was an error in sending an alert, exit early and bubble it up
|
||||
// If there was an error in sending an alert, mark as unhealthy or bubble up
|
||||
if err != nil {
|
||||
if SelfMonitor {
|
||||
healthy = false
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if HealthChecks != nil {
|
||||
HealthChecks.MinitorHealthy(healthy)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ServeMetricsAndHealth starts the default http server
|
||||
func ServeMetricsAndHealth() {
|
||||
host := fmt.Sprintf(":%d", MetricsPort)
|
||||
|
||||
_ = http.ListenAndServe(host, nil)
|
||||
}
|
||||
|
||||
func main() {
|
||||
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
||||
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
||||
|
||||
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
||||
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
||||
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
|
||||
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
|
||||
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
|
||||
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
// Print version if flag is provided
|
||||
@@ -111,8 +145,19 @@ func main() {
|
||||
// Serve metrics exporter, if specified
|
||||
if ExportMetrics {
|
||||
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
||||
HandleMetrics()
|
||||
}
|
||||
|
||||
go ServeMetrics()
|
||||
if SelfMonitor {
|
||||
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
|
||||
|
||||
HealthChecks = NewHealthCheckHandler(config.Monitors)
|
||||
|
||||
HandleHealthCheck()
|
||||
}
|
||||
|
||||
if ExportMetrics || SelfMonitor {
|
||||
go ServeMetricsAndHealth()
|
||||
}
|
||||
|
||||
// Start main loop
|
||||
@@ -120,6 +165,6 @@ func main() {
|
||||
err = checkMonitors(&config)
|
||||
slog.OnErrPanicf(err, "Error checking monitors")
|
||||
|
||||
time.Sleep(config.CheckInterval)
|
||||
time.Sleep(config.CheckInterval.Value())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
config Config
|
||||
expectErr bool
|
||||
name string
|
||||
selfMonitor bool
|
||||
}{
|
||||
{
|
||||
config: Config{},
|
||||
@@ -24,6 +25,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor success, no alerts",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -37,6 +39,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor failure, no alerts",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -50,6 +53,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor recovery, no alerts",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -64,6 +68,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: true,
|
||||
name: "Monitor failure, unknown alerts",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -78,6 +83,22 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: true,
|
||||
name: "Monitor recovery, unknown alerts",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
Monitors: []*Monitor{
|
||||
{
|
||||
Name: "Success",
|
||||
Command: CommandOrShell{Command: []string{"true"}},
|
||||
AlertUp: []string{"unknown"},
|
||||
alertCount: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor recovery, unknown alerts, with Health Check",
|
||||
selfMonitor: true,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -97,6 +118,7 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor failure, successful alert",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
@@ -117,10 +139,34 @@ func TestCheckMonitors(t *testing.T) {
|
||||
},
|
||||
expectErr: true,
|
||||
name: "Monitor failure, bad alert",
|
||||
selfMonitor: false,
|
||||
},
|
||||
{
|
||||
config: Config{
|
||||
Monitors: []*Monitor{
|
||||
{
|
||||
Name: "Failure",
|
||||
Command: CommandOrShell{Command: []string{"false"}},
|
||||
AlertDown: []string{"bad"},
|
||||
AlertAfter: 1,
|
||||
},
|
||||
},
|
||||
Alerts: map[string]*Alert{
|
||||
"bad": {
|
||||
Name: "bad",
|
||||
Command: CommandOrShell{Command: []string{"false"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectErr: false,
|
||||
name: "Monitor failure, bad alert, with Health Check",
|
||||
selfMonitor: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
SelfMonitor = c.selfMonitor
|
||||
|
||||
err := c.config.Init()
|
||||
if err != nil {
|
||||
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
|
||||
|
||||
+2
-7
@@ -1,7 +1,6 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
@@ -107,11 +106,7 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
|
||||
).Inc()
|
||||
}
|
||||
|
||||
// ServeMetrics starts an http server with a Prometheus metrics handler
|
||||
func ServeMetrics() {
|
||||
// HandleMetrics add Prometheus metrics handler to default http server
|
||||
func HandleMetrics() {
|
||||
http.Handle("/metrics", promhttp.Handler())
|
||||
|
||||
host := fmt.Sprintf(":%d", MetricsPort)
|
||||
|
||||
_ = http.ListenAndServe(host, nil)
|
||||
}
|
||||
|
||||
+2
-2
@@ -13,7 +13,7 @@ type Monitor struct { //nolint:maligned
|
||||
// Config values
|
||||
AlertAfter int16 `yaml:"alert_after"`
|
||||
AlertEvery *int16 `yaml:"alert_every"`
|
||||
CheckInterval time.Duration `yaml:"check_interval"`
|
||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
||||
Name string
|
||||
AlertDown []string `yaml:"alert_down"`
|
||||
AlertUp []string `yaml:"alert_up"`
|
||||
@@ -45,7 +45,7 @@ func (monitor Monitor) ShouldCheck() bool {
|
||||
|
||||
sinceLastCheck := time.Since(monitor.lastCheck)
|
||||
|
||||
return sinceLastCheck >= monitor.CheckInterval
|
||||
return sinceLastCheck >= monitor.CheckInterval.Value()
|
||||
}
|
||||
|
||||
// Check will run the command configured by the Monitor and return a status
|
||||
|
||||
+3
-3
@@ -45,9 +45,9 @@ func TestMonitorShouldCheck(t *testing.T) {
|
||||
name string
|
||||
}{
|
||||
{Monitor{}, true, "Empty"},
|
||||
{Monitor{lastCheck: timeNow, CheckInterval: time.Second * 15}, false, "Just checked"},
|
||||
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: time.Second * 15}, false, "-10s"},
|
||||
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: time.Second * 15}, true, "-20s"},
|
||||
{Monitor{lastCheck: timeNow, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "Just checked"},
|
||||
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "-10s"},
|
||||
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, true, "-20s"},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
check_interval: 1s
|
||||
check_interval: 1
|
||||
|
||||
monitors:
|
||||
- name: Command
|
||||
|
||||
Reference in New Issue
Block a user